- 新增本地弹幕文件测试入口,支持直接对样本文件生成提纯结果 - 将本地统计、主题证据簇和语义事实提示接入斗鱼日报LLM材料 - 明确降低情绪刷屏权重,改为优先提取赛事、位置、英雄、对局和场外互动信息
1440 lines
60 KiB
Python
1440 lines
60 KiB
Python
# -*- coding: utf-8 -*-
|
||
import os
|
||
import re
|
||
from collections import Counter, defaultdict
|
||
from datetime import datetime
|
||
from typing import Any, Dict, List, Optional, Set
|
||
|
||
|
||
class DouyuDanmuSummaryHelper:
|
||
"""斗鱼弹幕场次抽取与压缩辅助器。"""
|
||
|
||
LINE_PATTERN = re.compile(
|
||
r"^\[(?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\]\s+"
|
||
r"(?P<nickname>.*?)\s+\(UID:\s*(?P<uid>[^,\)]+)(?P<profile>.*?)\):(?P<content>.*)$"
|
||
)
|
||
|
||
STOPWORDS: Set[str] = {
|
||
"哈哈", "哈哈哈", "hh", "hhh", "啊啊", "啊啊啊", "可以", "这个", "那个", "真的", "就是",
|
||
"一下", "主播", "兄弟", "老铁", "大家", "我们", "你们", "他们", "不是", "什么", "怎么",
|
||
"为啥", "有点", "感觉", "这里", "那里", "然后", "今天", "刚刚", "现在", "一个", "一下子",
|
||
}
|
||
|
||
SHORT_BURST_WORDS: Set[str] = {
|
||
"666", "6666", "牛", "牛逼", "稳", "寄", "杀", "帅", "好", "行", "绷", "哭", "乐",
|
||
"哈哈", "哈哈哈", "笑死", "卧槽", "wc", "awsl", "nb", "nbl", "c", "6",
|
||
}
|
||
# 这些词对“直播间气氛”有价值,但对“事实提炼”帮助有限。
|
||
# 后面在内容线索排序时会适当降权,避免把真正的赛事/英雄/剧情信息淹掉。
|
||
GENERIC_REACTION_TERMS: Set[str] = {
|
||
"哈哈", "哈哈哈", "哈哈哈哈", "哈哈哈哈哈", "哈哈哈哈哈哈",
|
||
"gg", "g", "888", "1", "啊", "啊?", "坏了", "好起来了", "翻了",
|
||
}
|
||
# 高频语义簇配置:
|
||
# 1. 不做中文分词,而是直接按“直播圈常见话题簇”收证据;
|
||
# 2. 每个簇都会保留计数、时间范围和原声样本;
|
||
# 3. 这样模型更容易抓到“今天到底发生了哪些具体事”,而不是只看到大量情绪词。
|
||
FACT_CLUSTER_CONFIGS: List[Dict[str, Any]] = [
|
||
{
|
||
"label": "赛事预告与报名动态",
|
||
"keywords": ["老头杯", "选人", "报名", "开赛", "比赛", "30号", "4月30", "4月30日"],
|
||
},
|
||
{
|
||
"label": "比赛位置与身份讨论",
|
||
"keywords": ["1号位", "5号位", "打1", "打5", "教练", "carry", "辅助"],
|
||
},
|
||
{
|
||
"label": "镜头与外形调侃",
|
||
"keywords": ["摄像头", "开摄像头", "光头", "秃头", "洗头", "面容", "露脸"],
|
||
},
|
||
{
|
||
"label": "团播人物与场外关系",
|
||
"keywords": ["糯糯", "瑶瑶", "冬瓜", "冬瓜强", "白队", "团播", "户外"],
|
||
},
|
||
{
|
||
"label": "关键对局与局势转折",
|
||
"keywords": ["奶绿", "muerta", "gg", "翻了", "拿下", "上高地", "守高地", "队友", "大炮", "萨尔", "炸弹人"],
|
||
},
|
||
]
|
||
HERO_ALIASES: Dict[str, List[str]] = {
|
||
"Muerta/奶绿": ["奶绿", "muerta"],
|
||
"德鲁伊/Lone Druid": ["德鲁伊", "lone druid", "熊德"],
|
||
"小小/Tiny": ["小小", "tiny"],
|
||
"帕克/Puck": ["帕克", "puck"],
|
||
"火猫/Ember Spirit": ["火猫", "ember"],
|
||
"敌法/Anti-Mage": ["敌法", "am", "anti-mage"],
|
||
"兽王/Beastmaster": ["兽王", "beastmaster"],
|
||
"萨尔/Disruptor": ["萨尔", "disruptor"],
|
||
"炸弹人/Techies": ["炸弹人", "techies"],
|
||
}
|
||
NOISE_PATTERNS = [
|
||
re.compile(r"本条弹幕.*机器人", re.I),
|
||
re.compile(r"请不要.*统计机器人数", re.I),
|
||
re.compile(r"原来直播间有.*非机器人用户", re.I),
|
||
re.compile(r"如果您的直播内容是以观看他人操作为主", re.I),
|
||
]
|
||
TEMPLATE_HINT_PATTERNS = [
|
||
re.compile(r"闭目不语任由"),
|
||
re.compile(r"你就忍心一辈"),
|
||
re.compile(r"刚刚偷看你直播被老板发现"),
|
||
re.compile(r"还好老板是蝙蝠侠"),
|
||
re.compile(r"原来直播间有这么多姐妹"),
|
||
re.compile(r"之前我一直不敢发弹幕"),
|
||
re.compile(r"强子也就是吃了直播的红利"),
|
||
re.compile(r"你声音好像强子"),
|
||
re.compile(r"怎么回事强子"),
|
||
re.compile(r"你是个人吗强"),
|
||
]
|
||
TEMPLATE_MIN_LENGTH = 14
|
||
TEMPLATE_MIN_REPEAT = 4
|
||
REPEAT_MIN_COUNT = 3
|
||
|
||
# 统一使用分钟级时间,是因为日报场景里秒级时间几乎不会增加理解价值,
|
||
# 但会显著拉长 LLM 输入;保留到 `HH:MM` 能兼顾时序感和压缩率。
|
||
COMPACT_TIME_FORMAT = "%H:%M"
|
||
|
||
@classmethod
|
||
def parse_danmu_line(cls, line: str) -> Optional[Dict[str, Any]]:
|
||
text = str(line or "").strip()
|
||
if not text:
|
||
return None
|
||
match = cls.LINE_PATTERN.match(text)
|
||
if not match:
|
||
return None
|
||
try:
|
||
ts = datetime.strptime(match.group("timestamp"), "%Y-%m-%d %H:%M:%S")
|
||
except Exception:
|
||
return None
|
||
return {
|
||
"timestamp": ts,
|
||
"timestamp_text": match.group("timestamp"),
|
||
"nickname": match.group("nickname").strip(),
|
||
"uid": str(match.group("uid")).strip(),
|
||
**cls._parse_profile_text(str(match.group("profile") or "")),
|
||
"content": match.group("content").strip(),
|
||
"raw": text,
|
||
}
|
||
|
||
@classmethod
|
||
def load_session_messages(cls, room_id: str, session: Dict[str, Any], base_dir: str = "temp") -> List[Dict[str, Any]]:
|
||
segments = cls._normalize_segments(session.get("segments", []) or [])
|
||
if not room_id or not segments:
|
||
return []
|
||
|
||
date_keys = sorted({segment["start"].strftime("%Y%m%d") for segment in segments} |
|
||
{segment["end"].strftime("%Y%m%d") for segment in segments})
|
||
collected: List[Dict[str, Any]] = []
|
||
for date_key in date_keys:
|
||
file_path = os.path.join(base_dir, "douyu_danmu", date_key, f"{room_id}_{date_key}.txt")
|
||
if not os.path.exists(file_path):
|
||
continue
|
||
try:
|
||
with open(file_path, "r", encoding="utf-8") as f:
|
||
for line in f:
|
||
parsed = cls.parse_danmu_line(line)
|
||
if not parsed:
|
||
continue
|
||
if cls._in_any_segment(parsed["timestamp"], segments):
|
||
collected.append(parsed)
|
||
except Exception:
|
||
continue
|
||
return collected
|
||
|
||
@classmethod
|
||
def load_day_messages(cls, room_id: str, date_key: str, base_dir: str = "temp") -> List[Dict[str, Any]]:
|
||
file_path = os.path.join(base_dir, "douyu_danmu", date_key, f"{room_id}_{date_key}.txt")
|
||
if not os.path.exists(file_path):
|
||
return []
|
||
collected: List[Dict[str, Any]] = []
|
||
with open(file_path, "r", encoding="utf-8") as f:
|
||
for line in f:
|
||
parsed = cls.parse_danmu_line(line)
|
||
if parsed:
|
||
collected.append(parsed)
|
||
return collected
|
||
|
||
@classmethod
|
||
def load_messages_from_file(cls, file_path: str) -> List[Dict[str, Any]]:
|
||
"""
|
||
从指定文本文件直接读取弹幕。
|
||
这个入口主要用于本地调试和样本回归:
|
||
1. 不依赖 Redis session;
|
||
2. 不要求文件落在 temp/douyu_danmu 目录;
|
||
3. 便于直接拿用户提供的测试样本跑提纯链路。
|
||
"""
|
||
path = str(file_path or "").strip()
|
||
if not path or not os.path.exists(path):
|
||
return []
|
||
collected: List[Dict[str, Any]] = []
|
||
with open(path, "r", encoding="utf-8", errors="ignore") as f:
|
||
for line in f:
|
||
parsed = cls.parse_danmu_line(line)
|
||
if parsed:
|
||
collected.append(parsed)
|
||
return collected
|
||
|
||
@classmethod
|
||
def build_summary_material(
|
||
cls,
|
||
room_id: str,
|
||
session: Dict[str, Any],
|
||
messages: List[Dict[str, Any]],
|
||
bucket_minutes: int = 5,
|
||
top_bucket_count: int = 10,
|
||
) -> Dict[str, Any]:
|
||
normalized = [item for item in messages if item and item.get("content")]
|
||
prepared = cls._prepare_messages(normalized)
|
||
organized_messages = prepared["organized_messages"]
|
||
unique_users = {str(item.get("uid") or "") for item in normalized if str(item.get("uid") or "").strip()}
|
||
organized_unique_users = {
|
||
str(item.get("uid") or "") for item in organized_messages if str(item.get("uid") or "").strip()
|
||
}
|
||
deduped_messages = cls._dedupe_consecutive_messages(organized_messages)
|
||
burst_terms = cls._build_burst_terms(organized_messages)
|
||
top_terms = cls._extract_top_terms(organized_messages, limit=30)
|
||
bucket_stats = cls._build_time_buckets(organized_messages, minutes=bucket_minutes)
|
||
peak_buckets = sorted(bucket_stats, key=lambda item: item.get("message_count", 0), reverse=True)[:top_bucket_count]
|
||
|
||
return {
|
||
"session_id": session.get("session_id", ""),
|
||
"room_id": room_id,
|
||
"anchor_day": session.get("anchor_day", ""),
|
||
"nickname": session.get("nickname", ""),
|
||
"room_name": session.get("room_name", ""),
|
||
"segments": cls._serialize_segments(session.get("segments", []) or []),
|
||
"message_count": len(normalized),
|
||
"noise_filtered_count": len(prepared["noise_messages"]),
|
||
"organized_message_count": len(organized_messages),
|
||
"deduped_message_count": len(deduped_messages),
|
||
"unique_user_count": len(unique_users),
|
||
"organized_unique_user_count": len(organized_unique_users),
|
||
"merged_templates": prepared["merged_templates"],
|
||
"top_terms": top_terms,
|
||
"burst_terms": burst_terms,
|
||
"time_buckets": bucket_stats,
|
||
"peak_buckets": peak_buckets,
|
||
"representative_messages": cls._pick_representative_messages(organized_messages, bucket_stats),
|
||
"operator_metrics": cls._build_operator_metrics(normalized, organized_messages),
|
||
}
|
||
|
||
@classmethod
|
||
def build_llm_payload(
|
||
cls,
|
||
room_id: str,
|
||
session: Dict[str, Any],
|
||
messages: List[Dict[str, Any]],
|
||
bucket_minutes: int = 5,
|
||
top_bucket_count: int = 8,
|
||
top_repeat_count: int = 24,
|
||
) -> Dict[str, Any]:
|
||
"""
|
||
面向 LLM 的高保真弹幕载荷。
|
||
规则:
|
||
1. 仅过滤平台/机器人类系统噪音。
|
||
2. 相同或高度模板化的内容做聚合,不直接删除。
|
||
3. 其他不同内容尽量保留,并按时段/热点组织给模型。
|
||
"""
|
||
normalized = [item for item in messages if item and item.get("content")]
|
||
prepared = cls._prepare_messages(normalized)
|
||
organized_messages = prepared["organized_messages"]
|
||
bucket_stats = cls._build_time_buckets(organized_messages, minutes=bucket_minutes)
|
||
peak_buckets = sorted(
|
||
bucket_stats,
|
||
key=lambda item: item.get("message_count", 0),
|
||
reverse=True,
|
||
)[:top_bucket_count]
|
||
|
||
unique_users = {str(item.get("uid") or "") for item in normalized if str(item.get("uid") or "").strip()}
|
||
organized_unique_users = {
|
||
str(item.get("uid") or "") for item in organized_messages if str(item.get("uid") or "").strip()
|
||
}
|
||
|
||
return {
|
||
"session_meta": {
|
||
"room_id": room_id,
|
||
"session_id": session.get("session_id", ""),
|
||
"anchor_day": session.get("anchor_day", ""),
|
||
"nickname": session.get("nickname", ""),
|
||
"room_name": session.get("room_name", ""),
|
||
"segments": cls._serialize_segments(session.get("segments", []) or []),
|
||
"message_count": len(normalized),
|
||
"noise_filtered_count": len(prepared["noise_messages"]),
|
||
"organized_message_count": len(organized_messages),
|
||
"unique_user_count": len(unique_users),
|
||
"organized_unique_user_count": len(organized_unique_users),
|
||
},
|
||
"operator_metrics": cls._build_operator_metrics(normalized, organized_messages),
|
||
"cleaning_rules": [
|
||
"仅过滤系统噪音、机器人探测、平台提示类弹幕。",
|
||
"明显重复的长模板文案按内容聚合,保留出现次数、人数、首末时间。",
|
||
"其他相同内容按重复短语归并,但不抹掉不同观点和不同句式。",
|
||
"高峰时段补充原始弹幕样本,方便 LLM 还原语境。",
|
||
],
|
||
"merged_templates": prepared["merged_templates"],
|
||
"repeated_messages": cls._build_repeated_messages(
|
||
organized_messages,
|
||
limit=top_repeat_count,
|
||
),
|
||
"top_terms": cls._extract_top_terms(organized_messages, limit=30),
|
||
"burst_terms": cls._build_burst_terms(organized_messages),
|
||
"peak_buckets": cls._simplify_peak_buckets(peak_buckets),
|
||
"representative_messages": cls._pick_representative_messages(organized_messages, bucket_stats),
|
||
"raw_window_samples": cls._build_raw_window_samples(peak_buckets, per_bucket_limit=8),
|
||
# 给日报类 LLM 再补一层“按时间推进的现场切片”。
|
||
# 这样模型除了看热点窗口,还能顺着时间线理解气氛如何起、如何变、最后怎么收,
|
||
# 对粉丝日报这类强调“节目效果”和“接梗链路”的文本尤其有帮助。
|
||
"chronological_samples": cls._build_chronological_samples(organized_messages, limit=20),
|
||
# 每个 session 单独给一个轻量摘要,避免多场直播合并后,
|
||
# 模型只看到全局热点而丢失“第一场在聊什么、第二场为什么突然转节奏”的信息。
|
||
"session_storyline": cls._build_session_storyline(
|
||
organized_messages,
|
||
bucket_stats,
|
||
top_terms_limit=8,
|
||
sample_limit=10,
|
||
),
|
||
# 这是专门给 LLM 准备的“压缩但保真”的材料层:
|
||
# 1. 用户画像从逐条弹幕里抽出来,避免昵称/牌子/等级在每条消息里重复出现;
|
||
# 2. 时间统一压到日期 + 时分,减少无意义的秒级噪音;
|
||
# 3. 主体按时间线块组织,更像“现场事件流”,比中文分词更适合日报生成。
|
||
"compact_prompt_assets": cls.build_compact_prompt_assets(
|
||
organized_messages,
|
||
bucket_minutes=bucket_minutes,
|
||
),
|
||
}
|
||
|
||
@classmethod
|
||
def build_compact_prompt_assets(
|
||
cls,
|
||
messages: List[Dict[str, Any]],
|
||
*,
|
||
bucket_minutes: int = 5,
|
||
speaker_limit: int = 80,
|
||
timeline_limit: int = 24,
|
||
samples_per_bucket: int = 6,
|
||
cue_limit: int = 18,
|
||
) -> Dict[str, Any]:
|
||
"""
|
||
生成专供 LLM 使用的压缩材料。
|
||
设计目标:
|
||
1. 不做中文分词,尽量保留“整句/整段弹幕”的原始信息密度;
|
||
2. 把重复出现的用户元信息抽到索引表,降低 token 浪费;
|
||
3. 把现场内容组织成时间线块,帮助模型理解节奏推进和集体起哄链路。
|
||
"""
|
||
ordered_messages = sorted(
|
||
[item for item in messages if item and str(item.get("content") or "").strip()],
|
||
key=lambda item: item.get("timestamp") or datetime.min,
|
||
)
|
||
if not ordered_messages:
|
||
return {
|
||
"speaker_index": [],
|
||
"timeline_digest": [],
|
||
"content_cues": [],
|
||
}
|
||
|
||
speaker_index, speaker_alias_map = cls._build_speaker_index(
|
||
ordered_messages,
|
||
limit=speaker_limit,
|
||
)
|
||
repeated_messages = cls._build_repeated_messages(ordered_messages, limit=cue_limit)
|
||
burst_terms = cls._build_burst_terms(ordered_messages)
|
||
return {
|
||
"speaker_index": speaker_index,
|
||
"timeline_digest": cls._build_timeline_digest(
|
||
ordered_messages,
|
||
speaker_alias_map,
|
||
bucket_minutes=bucket_minutes,
|
||
limit=timeline_limit,
|
||
samples_per_bucket=samples_per_bucket,
|
||
),
|
||
"content_cues": cls._build_content_cues(
|
||
ordered_messages,
|
||
repeated_messages=repeated_messages,
|
||
burst_terms=burst_terms,
|
||
limit=cue_limit,
|
||
),
|
||
# 再补一层“事实型提示”,专门抬高赛事、位置、英雄、镜头梗、关键对局等信息密度高的内容。
|
||
"semantic_fact_hints": cls._build_semantic_fact_hints(ordered_messages),
|
||
}
|
||
|
||
@staticmethod
|
||
def _parse_profile_text(profile_text: str) -> Dict[str, Any]:
|
||
text = str(profile_text or "").strip()
|
||
room_level = 0
|
||
fans_name = ""
|
||
fans_level = 0
|
||
noble_name = ""
|
||
|
||
room_match = re.search(r",\s*Lv\s*(\d+)", text, re.I)
|
||
if room_match:
|
||
try:
|
||
room_level = int(room_match.group(1))
|
||
except Exception:
|
||
room_level = 0
|
||
|
||
fans_match = re.search(r"/\s*([^/]+?)\s+Lv\s*(\d+)", text, re.I)
|
||
if fans_match:
|
||
fans_name = str(fans_match.group(1) or "").strip()
|
||
try:
|
||
fans_level = int(fans_match.group(2))
|
||
except Exception:
|
||
fans_level = 0
|
||
|
||
noble_match = re.search(r"/\s*(骑士|子爵|伯爵|公爵|国王|皇帝|游侠|超级皇帝|幻神)\b", text)
|
||
if noble_match:
|
||
noble_name = str(noble_match.group(1) or "").strip()
|
||
|
||
return {
|
||
"room_level": room_level,
|
||
"fans_name": fans_name,
|
||
"fans_level": fans_level,
|
||
"noble_name": noble_name,
|
||
"has_fans_badge": bool(fans_name),
|
||
"has_noble": bool(noble_name),
|
||
}
|
||
|
||
@classmethod
|
||
def _build_operator_metrics(cls, messages: List[Dict[str, Any]], organized_messages: List[Dict[str, Any]]) -> Dict[str, Any]:
|
||
user_profiles: Dict[str, Dict[str, Any]] = {}
|
||
user_message_count = Counter()
|
||
user_organized_count = Counter()
|
||
fans_badge_users: Set[str] = set()
|
||
noble_users: Set[str] = set()
|
||
high_room_level_users: Set[str] = set()
|
||
high_fans_level_users: Set[str] = set()
|
||
fans_badge_message_count = 0
|
||
noble_message_count = 0
|
||
room_level_histogram = Counter()
|
||
fans_level_histogram = Counter()
|
||
badge_user_counter = Counter()
|
||
badge_message_counter = Counter()
|
||
|
||
for item in messages:
|
||
uid = str(item.get("uid") or "").strip()
|
||
if not uid:
|
||
continue
|
||
user_message_count[uid] += 1
|
||
|
||
profile = user_profiles.setdefault(uid, {
|
||
"uid": uid,
|
||
"nickname": str(item.get("nickname") or "").strip(),
|
||
"room_level": 0,
|
||
"fans_name": "",
|
||
"fans_level": 0,
|
||
"noble_name": "",
|
||
})
|
||
room_level = int(item.get("room_level") or 0)
|
||
fans_level = int(item.get("fans_level") or 0)
|
||
fans_name = str(item.get("fans_name") or "").strip()
|
||
noble_name = str(item.get("noble_name") or "").strip()
|
||
|
||
if room_level > int(profile.get("room_level") or 0):
|
||
profile["room_level"] = room_level
|
||
if fans_level > int(profile.get("fans_level") or 0):
|
||
profile["fans_level"] = fans_level
|
||
if fans_name and not profile.get("fans_name"):
|
||
profile["fans_name"] = fans_name
|
||
if noble_name and not profile.get("noble_name"):
|
||
profile["noble_name"] = noble_name
|
||
if profile.get("nickname") == "" and str(item.get("nickname") or "").strip():
|
||
profile["nickname"] = str(item.get("nickname") or "").strip()
|
||
|
||
if fans_name:
|
||
fans_badge_users.add(uid)
|
||
fans_badge_message_count += 1
|
||
badge_user_counter[fans_name] += 0
|
||
badge_message_counter[fans_name] += 1
|
||
if noble_name:
|
||
noble_users.add(uid)
|
||
noble_message_count += 1
|
||
|
||
for uid, profile in user_profiles.items():
|
||
room_level = int(profile.get("room_level") or 0)
|
||
fans_level = int(profile.get("fans_level") or 0)
|
||
fans_name = str(profile.get("fans_name") or "").strip()
|
||
noble_name = str(profile.get("noble_name") or "").strip()
|
||
|
||
room_level_histogram[cls._level_bucket(room_level)] += 1
|
||
if fans_name:
|
||
badge_user_counter[fans_name] += 1
|
||
fans_level_histogram[cls._fans_level_bucket(fans_level)] += 1
|
||
if noble_name:
|
||
noble_users.add(uid)
|
||
if room_level >= 30:
|
||
high_room_level_users.add(uid)
|
||
if fans_level >= 10:
|
||
high_fans_level_users.add(uid)
|
||
|
||
for item in organized_messages:
|
||
uid = str(item.get("uid") or "").strip()
|
||
if uid:
|
||
user_organized_count[uid] += 1
|
||
|
||
active_users_5plus = sum(1 for uid, count in user_message_count.items() if count >= 5)
|
||
active_users_10plus = sum(1 for uid, count in user_message_count.items() if count >= 10)
|
||
|
||
top_active_users = []
|
||
for uid, count in user_message_count.most_common(12):
|
||
profile = user_profiles.get(uid, {})
|
||
top_active_users.append({
|
||
"uid": uid,
|
||
"nickname": str(profile.get("nickname") or ""),
|
||
"message_count": count,
|
||
"organized_message_count": int(user_organized_count.get(uid, 0) or 0),
|
||
"room_level": int(profile.get("room_level", 0) or 0),
|
||
"fans_name": str(profile.get("fans_name") or ""),
|
||
"fans_level": int(profile.get("fans_level", 0) or 0),
|
||
"noble_name": str(profile.get("noble_name") or ""),
|
||
})
|
||
|
||
top_badges = []
|
||
for badge_name, unique_user_count in badge_user_counter.most_common(10):
|
||
if not badge_name:
|
||
continue
|
||
top_badges.append({
|
||
"badge_name": badge_name,
|
||
"user_count": unique_user_count,
|
||
"message_count": int(badge_message_counter.get(badge_name, 0) or 0),
|
||
})
|
||
|
||
total_unique_users = max(len(user_profiles), 1)
|
||
return {
|
||
"active_unique_users": len(user_profiles),
|
||
"active_users_5plus": active_users_5plus,
|
||
"active_users_10plus": active_users_10plus,
|
||
"fans_badge_user_count": len(fans_badge_users),
|
||
"fans_badge_user_ratio": round(len(fans_badge_users) / total_unique_users, 4),
|
||
"fans_badge_message_count": fans_badge_message_count,
|
||
"high_room_level_user_count": len(high_room_level_users),
|
||
"high_room_level_threshold": 30,
|
||
"high_fans_level_user_count": len(high_fans_level_users),
|
||
"high_fans_level_threshold": 10,
|
||
"noble_user_count": len(noble_users),
|
||
"noble_message_count": noble_message_count,
|
||
"room_level_distribution": [
|
||
{"bucket": bucket, "user_count": count}
|
||
for bucket, count in sorted(room_level_histogram.items())
|
||
],
|
||
"fans_level_distribution": [
|
||
{"bucket": bucket, "user_count": count}
|
||
for bucket, count in sorted(fans_level_histogram.items())
|
||
],
|
||
"top_badges": top_badges,
|
||
"top_active_users": top_active_users,
|
||
}
|
||
|
||
@staticmethod
|
||
def _level_bucket(level: int) -> str:
|
||
if level >= 40:
|
||
return "40+"
|
||
if level >= 30:
|
||
return "30-39"
|
||
if level >= 20:
|
||
return "20-29"
|
||
if level >= 10:
|
||
return "10-19"
|
||
return "1-9"
|
||
|
||
@staticmethod
|
||
def _fans_level_bucket(level: int) -> str:
|
||
if level >= 20:
|
||
return "20+"
|
||
if level >= 15:
|
||
return "15-19"
|
||
if level >= 10:
|
||
return "10-14"
|
||
if level >= 5:
|
||
return "5-9"
|
||
return "1-4"
|
||
|
||
@classmethod
|
||
def infer_sessions_from_messages(
|
||
cls,
|
||
room_id: str,
|
||
messages: List[Dict[str, Any]],
|
||
*,
|
||
session_cutoff_hour: int = 6,
|
||
merge_gap_hours: int = 4,
|
||
min_session_messages: int = 50,
|
||
) -> List[Dict[str, Any]]:
|
||
if not messages:
|
||
return []
|
||
ordered = sorted(messages, key=lambda item: item.get("timestamp") or datetime.min)
|
||
sessions: List[Dict[str, Any]] = []
|
||
current_messages: List[Dict[str, Any]] = []
|
||
|
||
def flush_current():
|
||
if len(current_messages) < min_session_messages:
|
||
return
|
||
session = cls._build_inferred_session(
|
||
room_id,
|
||
current_messages,
|
||
session_cutoff_hour=session_cutoff_hour,
|
||
)
|
||
if session:
|
||
sessions.append(session)
|
||
|
||
prev_dt: Optional[datetime] = None
|
||
for item in ordered:
|
||
ts = item.get("timestamp")
|
||
if not isinstance(ts, datetime):
|
||
continue
|
||
if prev_dt is not None:
|
||
gap_seconds = (ts - prev_dt).total_seconds()
|
||
if gap_seconds > merge_gap_hours * 3600:
|
||
flush_current()
|
||
current_messages = []
|
||
current_messages.append(item)
|
||
prev_dt = ts
|
||
|
||
flush_current()
|
||
return sessions
|
||
|
||
@classmethod
|
||
def _build_inferred_session(
|
||
cls,
|
||
room_id: str,
|
||
messages: List[Dict[str, Any]],
|
||
*,
|
||
session_cutoff_hour: int = 6,
|
||
) -> Optional[Dict[str, Any]]:
|
||
if not messages:
|
||
return None
|
||
ordered = sorted(messages, key=lambda item: item.get("timestamp") or datetime.min)
|
||
start_dt = ordered[0]["timestamp"]
|
||
end_dt = ordered[-1]["timestamp"]
|
||
anchor_dt = start_dt
|
||
if start_dt.hour < session_cutoff_hour:
|
||
from datetime import timedelta
|
||
anchor_dt = start_dt - timedelta(days=1)
|
||
anchor_day = anchor_dt.strftime("%Y-%m-%d")
|
||
segments = []
|
||
seg_start = ordered[0]["timestamp"]
|
||
prev_dt = ordered[0]["timestamp"]
|
||
for item in ordered[1:]:
|
||
current_dt = item["timestamp"]
|
||
if (current_dt - prev_dt).total_seconds() > 30 * 60:
|
||
segments.append({
|
||
"start_time": seg_start.strftime("%Y-%m-%d %H:%M:%S"),
|
||
"end_time": prev_dt.strftime("%Y-%m-%d %H:%M:%S"),
|
||
})
|
||
seg_start = current_dt
|
||
prev_dt = current_dt
|
||
segments.append({
|
||
"start_time": seg_start.strftime("%Y-%m-%d %H:%M:%S"),
|
||
"end_time": end_dt.strftime("%Y-%m-%d %H:%M:%S"),
|
||
})
|
||
return {
|
||
"session_id": f"{room_id}_{anchor_day.replace('-', '')}_{start_dt.strftime('%H%M%S')}",
|
||
"room_id": room_id,
|
||
"anchor_day": anchor_day,
|
||
"nickname": "",
|
||
"room_name": "",
|
||
"segments": segments,
|
||
"is_live": False,
|
||
"source": "inferred_from_danmu",
|
||
}
|
||
|
||
@staticmethod
|
||
def _normalize_segments(segments: List[Dict[str, Any]]) -> List[Dict[str, datetime]]:
|
||
normalized = []
|
||
for item in segments:
|
||
try:
|
||
start_dt = datetime.strptime(str(item.get("start_time") or ""), "%Y-%m-%d %H:%M:%S")
|
||
end_dt = datetime.strptime(str(item.get("end_time") or ""), "%Y-%m-%d %H:%M:%S")
|
||
except Exception:
|
||
continue
|
||
if end_dt < start_dt:
|
||
continue
|
||
normalized.append({"start": start_dt, "end": end_dt})
|
||
return normalized
|
||
|
||
@staticmethod
|
||
def _serialize_segments(segments: List[Dict[str, Any]]) -> List[Dict[str, str]]:
|
||
result = []
|
||
for item in segments:
|
||
start_time = str(item.get("start_time") or "").strip()
|
||
end_time = str(item.get("end_time") or "").strip()
|
||
if start_time and end_time:
|
||
result.append({"start_time": start_time, "end_time": end_time})
|
||
return result
|
||
|
||
@staticmethod
|
||
def _in_any_segment(target_dt: datetime, segments: List[Dict[str, datetime]]) -> bool:
|
||
for segment in segments:
|
||
if segment["start"] <= target_dt <= segment["end"]:
|
||
return True
|
||
return False
|
||
|
||
@staticmethod
|
||
def _dedupe_consecutive_messages(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
||
result: List[Dict[str, Any]] = []
|
||
prev_key = None
|
||
repeat_count = 0
|
||
for item in messages:
|
||
current_key = (item.get("uid"), item.get("content"))
|
||
if current_key == prev_key:
|
||
repeat_count += 1
|
||
result[-1]["repeat_count"] = repeat_count
|
||
continue
|
||
copied = dict(item)
|
||
copied["repeat_count"] = 1
|
||
result.append(copied)
|
||
prev_key = current_key
|
||
repeat_count = 1
|
||
return result
|
||
|
||
@classmethod
|
||
def _filter_noise_messages(cls, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
||
result = []
|
||
for item in messages:
|
||
content = str(item.get("content") or "").strip()
|
||
if not content:
|
||
continue
|
||
if cls._is_noise_message(content):
|
||
continue
|
||
result.append(item)
|
||
return result
|
||
|
||
@classmethod
|
||
def _prepare_messages(cls, messages: List[Dict[str, Any]]) -> Dict[str, Any]:
|
||
noise_messages: List[Dict[str, Any]] = []
|
||
candidate_messages: List[Dict[str, Any]] = []
|
||
for item in messages:
|
||
content = str(item.get("content") or "").strip()
|
||
if not content:
|
||
continue
|
||
if cls._is_noise_message(content):
|
||
noise_messages.append(item)
|
||
continue
|
||
candidate_messages.append(item)
|
||
|
||
merged_templates, organized_messages = cls._merge_template_messages(candidate_messages)
|
||
return {
|
||
"noise_messages": noise_messages,
|
||
"merged_templates": merged_templates,
|
||
"organized_messages": organized_messages,
|
||
}
|
||
|
||
@classmethod
|
||
def _is_noise_message(cls, content: str) -> bool:
|
||
text = str(content or "").strip()
|
||
if not text:
|
||
return True
|
||
for pattern in cls.NOISE_PATTERNS:
|
||
if pattern.search(text):
|
||
return True
|
||
if len(text) >= 30 and len(set(text)) <= 6:
|
||
return True
|
||
return False
|
||
|
||
@classmethod
|
||
def _merge_template_messages(cls, messages: List[Dict[str, Any]]) -> (List[Dict[str, Any]], List[Dict[str, Any]]):
|
||
normalized_counter = Counter()
|
||
normalized_to_messages: Dict[str, List[Dict[str, Any]]] = defaultdict(list)
|
||
for item in messages:
|
||
normalized = cls._normalize_template_text(str(item.get("content") or ""))
|
||
if not normalized:
|
||
continue
|
||
normalized_counter[normalized] += 1
|
||
normalized_to_messages[normalized].append(item)
|
||
|
||
template_keys = {
|
||
key for key, count in normalized_counter.items()
|
||
if len(key) >= cls.TEMPLATE_MIN_LENGTH and count >= cls.TEMPLATE_MIN_REPEAT
|
||
}
|
||
for key in list(normalized_counter.keys()):
|
||
if any(pattern.search(key) for pattern in cls.TEMPLATE_HINT_PATTERNS):
|
||
template_keys.add(key)
|
||
|
||
merged_templates: List[Dict[str, Any]] = []
|
||
organized_messages: List[Dict[str, Any]] = []
|
||
for normalized, items in normalized_to_messages.items():
|
||
if normalized in template_keys:
|
||
first = items[0]
|
||
merged_templates.append({
|
||
"text": str(first.get("content") or "").strip()[:120],
|
||
"count": len(items),
|
||
"user_count": len({str(item.get('uid') or '') for item in items if str(item.get('uid') or '').strip()}),
|
||
"first_time": str(first.get("timestamp_text") or ""),
|
||
"last_time": str(items[-1].get("timestamp_text") or ""),
|
||
})
|
||
else:
|
||
organized_messages.extend(items)
|
||
|
||
merged_templates.sort(key=lambda item: item.get("count", 0), reverse=True)
|
||
organized_messages.sort(key=lambda item: item.get("timestamp") or datetime.min)
|
||
return merged_templates[:20], organized_messages
|
||
|
||
@classmethod
|
||
def _build_repeated_messages(cls, messages: List[Dict[str, Any]], limit: int = 24) -> List[Dict[str, Any]]:
|
||
grouped: Dict[str, List[Dict[str, Any]]] = defaultdict(list)
|
||
for item in messages:
|
||
content = str(item.get("content") or "").strip()
|
||
if not content:
|
||
continue
|
||
normalized = cls._normalize_template_text(content)
|
||
if not normalized:
|
||
continue
|
||
if cls._looks_like_pure_punctuation(content):
|
||
continue
|
||
grouped[normalized].append(item)
|
||
|
||
repeated_messages: List[Dict[str, Any]] = []
|
||
for items in grouped.values():
|
||
if len(items) < cls.REPEAT_MIN_COUNT:
|
||
continue
|
||
first = items[0]
|
||
repeated_messages.append({
|
||
"text": str(first.get("content") or "").strip()[:120],
|
||
"count": len(items),
|
||
"user_count": len({
|
||
str(item.get("uid") or "") for item in items if str(item.get("uid") or "").strip()
|
||
}),
|
||
"first_time": str(first.get("timestamp_text") or ""),
|
||
"last_time": str(items[-1].get("timestamp_text") or ""),
|
||
"first_date": cls._format_date(first.get("timestamp")),
|
||
"first_hm": cls._format_hm(first.get("timestamp")),
|
||
"last_date": cls._format_date(items[-1].get("timestamp")),
|
||
"last_hm": cls._format_hm(items[-1].get("timestamp")),
|
||
})
|
||
repeated_messages.sort(key=lambda item: item.get("count", 0), reverse=True)
|
||
return repeated_messages[:limit]
|
||
|
||
@classmethod
|
||
def _build_burst_terms(cls, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
||
counters: Dict[str, Dict[str, Any]] = {}
|
||
for item in messages:
|
||
content = str(item.get("content") or "").strip().lower()
|
||
if not content:
|
||
continue
|
||
if content not in cls.SHORT_BURST_WORDS and len(content) > 6:
|
||
continue
|
||
target = counters.setdefault(content, {"text": content, "count": 0, "users": set()})
|
||
target["count"] += 1
|
||
target["users"].add(str(item.get("uid") or ""))
|
||
result = []
|
||
for item in sorted(counters.values(), key=lambda entry: entry["count"], reverse=True)[:15]:
|
||
result.append({
|
||
"text": item["text"],
|
||
"count": item["count"],
|
||
"user_count": len([uid for uid in item["users"] if uid]),
|
||
})
|
||
return result
|
||
|
||
@classmethod
|
||
def _extract_top_terms(cls, messages: List[Dict[str, Any]], limit: int = 30) -> List[Dict[str, Any]]:
|
||
counter = Counter()
|
||
for item in messages:
|
||
for token in cls._tokenize(str(item.get("content") or "")):
|
||
counter[token] += 1
|
||
result = []
|
||
for term, count in counter.most_common(limit):
|
||
result.append({"term": term, "count": count})
|
||
return result
|
||
|
||
@classmethod
|
||
def _tokenize(cls, content: str) -> List[str]:
|
||
text = str(content or "").lower().strip()
|
||
if not text:
|
||
return []
|
||
chinese_terms = re.findall(r"[\u4e00-\u9fff]{2,6}", text)
|
||
alpha_terms = re.findall(r"[a-z0-9_\-]{3,20}", text)
|
||
tokens = []
|
||
for token in chinese_terms + alpha_terms:
|
||
normalized = token.strip().lower()
|
||
if not normalized or normalized in cls.STOPWORDS:
|
||
continue
|
||
if normalized.isdigit() and len(normalized) <= 2:
|
||
continue
|
||
tokens.append(normalized)
|
||
return tokens
|
||
|
||
@staticmethod
|
||
def _normalize_template_text(content: str) -> str:
|
||
text = str(content or "").strip().lower()
|
||
if not text:
|
||
return ""
|
||
text = re.sub(r"\s+", "", text)
|
||
text = re.sub(r"[`~!@#$%^&*()_\-+=\[\]{}\\|;:'\",.<>/?,。!?、…()【】《》“”‘’·]", "", text)
|
||
text = re.sub(r"(.)\1{4,}", r"\1\1\1", text)
|
||
return text
|
||
|
||
@classmethod
|
||
def _build_time_buckets(cls, messages: List[Dict[str, Any]], minutes: int = 5) -> List[Dict[str, Any]]:
|
||
buckets: Dict[str, List[Dict[str, Any]]] = defaultdict(list)
|
||
bucket_dt_map: Dict[str, datetime] = {}
|
||
for item in messages:
|
||
ts = item.get("timestamp")
|
||
if not isinstance(ts, datetime):
|
||
continue
|
||
bucket_minute = (ts.minute // minutes) * minutes
|
||
bucket_key = ts.replace(minute=bucket_minute, second=0)
|
||
bucket_text = bucket_key.strftime("%Y-%m-%d %H:%M:%S")
|
||
bucket_dt_map[bucket_text] = bucket_key
|
||
buckets[bucket_text].append(item)
|
||
|
||
results: List[Dict[str, Any]] = []
|
||
for bucket_start, items in sorted(buckets.items()):
|
||
bucket_dt = bucket_dt_map.get(bucket_start)
|
||
top_terms = cls._extract_top_terms(items, limit=8)
|
||
burst_terms = cls._build_burst_terms(items)[:5]
|
||
results.append({
|
||
"start_time": bucket_start,
|
||
"date": cls._format_date(bucket_dt),
|
||
"start_hm": cls._format_hm(bucket_dt),
|
||
"message_count": len(items),
|
||
"user_count": len({str(item.get("uid") or "") for item in items if str(item.get("uid") or "").strip()}),
|
||
"top_terms": top_terms,
|
||
"burst_terms": burst_terms,
|
||
"sample_messages": cls._pick_bucket_samples(items, limit=6),
|
||
})
|
||
return results
|
||
|
||
@staticmethod
|
||
def _pick_bucket_samples(items: List[Dict[str, Any]], limit: int = 6) -> List[Dict[str, str]]:
|
||
if not items:
|
||
return []
|
||
indexes = sorted({0, len(items) // 3, len(items) // 2, (len(items) * 2) // 3, len(items) - 1})
|
||
selected = []
|
||
seen = set()
|
||
for idx in indexes:
|
||
item = items[idx]
|
||
content = str(item.get("content") or "").strip()
|
||
if not content or content in seen:
|
||
continue
|
||
selected.append({
|
||
"time": str(item.get("timestamp_text") or ""),
|
||
"date": DouyuDanmuSummaryHelper._format_date(item.get("timestamp")),
|
||
"hm": DouyuDanmuSummaryHelper._format_hm(item.get("timestamp")),
|
||
"nickname": str(item.get("nickname") or ""),
|
||
"content": content[:80],
|
||
})
|
||
seen.add(content)
|
||
if len(selected) >= limit:
|
||
break
|
||
# 固定位置采样只能快速抓到“窗口骨架”,但在弹幕量大时通常不足以凑满 limit。
|
||
# 这里继续顺序补样本,把同一热点窗口里更多真实原声带给 LLM,
|
||
# 减少模型只看到 4-5 条孤立短句、难以还原现场氛围的问题。
|
||
if len(selected) < limit:
|
||
for item in items:
|
||
content = str(item.get("content") or "").strip()
|
||
if not content or content in seen:
|
||
continue
|
||
selected.append({
|
||
"time": str(item.get("timestamp_text") or ""),
|
||
"date": DouyuDanmuSummaryHelper._format_date(item.get("timestamp")),
|
||
"hm": DouyuDanmuSummaryHelper._format_hm(item.get("timestamp")),
|
||
"nickname": str(item.get("nickname") or ""),
|
||
"content": content[:80],
|
||
})
|
||
seen.add(content)
|
||
if len(selected) >= limit:
|
||
break
|
||
return selected
|
||
|
||
@classmethod
|
||
def _pick_representative_messages(cls, messages: List[Dict[str, Any]], buckets: List[Dict[str, Any]]) -> List[Dict[str, str]]:
|
||
selected: List[Dict[str, str]] = []
|
||
seen = set()
|
||
|
||
for bucket in sorted(buckets, key=lambda item: item.get("message_count", 0), reverse=True)[:6]:
|
||
for sample in bucket.get("sample_messages", []):
|
||
content = str(sample.get("content") or "").strip()
|
||
if not content or content in seen:
|
||
continue
|
||
selected.append(sample)
|
||
seen.add(content)
|
||
if len(selected) >= 18:
|
||
return selected
|
||
|
||
for item in messages:
|
||
content = str(item.get("content") or "").strip()
|
||
if not content or content in seen:
|
||
continue
|
||
selected.append({
|
||
"time": str(item.get("timestamp_text") or ""),
|
||
"date": cls._format_date(item.get("timestamp")),
|
||
"hm": cls._format_hm(item.get("timestamp")),
|
||
"nickname": str(item.get("nickname") or ""),
|
||
"content": content[:80],
|
||
})
|
||
seen.add(content)
|
||
if len(selected) >= 18:
|
||
break
|
||
return selected
|
||
|
||
@classmethod
|
||
def _build_raw_window_samples(
|
||
cls,
|
||
peak_buckets: List[Dict[str, Any]],
|
||
per_bucket_limit: int = 8,
|
||
) -> List[Dict[str, Any]]:
|
||
windows: List[Dict[str, Any]] = []
|
||
for bucket in peak_buckets:
|
||
samples = []
|
||
for sample in bucket.get("sample_messages", [])[:per_bucket_limit]:
|
||
content = str(sample.get("content") or "").strip()
|
||
if not content:
|
||
continue
|
||
samples.append({
|
||
"time": str(sample.get("time") or ""),
|
||
"date": str(sample.get("date") or ""),
|
||
"hm": str(sample.get("hm") or ""),
|
||
"nickname": str(sample.get("nickname") or ""),
|
||
"content": content,
|
||
})
|
||
if not samples:
|
||
continue
|
||
windows.append({
|
||
"start_time": str(bucket.get("start_time") or ""),
|
||
"message_count": int(bucket.get("message_count", 0) or 0),
|
||
"user_count": int(bucket.get("user_count", 0) or 0),
|
||
"samples": samples,
|
||
})
|
||
return windows
|
||
|
||
@classmethod
|
||
def _build_chronological_samples(
|
||
cls,
|
||
messages: List[Dict[str, Any]],
|
||
limit: int = 20,
|
||
) -> List[Dict[str, str]]:
|
||
"""
|
||
从整场弹幕里按时间均匀抽取样本。
|
||
设计目的:
|
||
1. 热点窗口只能解释“最炸的几分钟”,但日报还需要理解整体节奏;
|
||
2. 顺时序样本能帮助 LLM 看到开场铺垫、中段起哄、尾段收束;
|
||
3. 对粉丝日报来说,这比单纯词频更容易还原“今天到底经历了什么”。
|
||
"""
|
||
if not messages:
|
||
return []
|
||
|
||
indexes = {
|
||
int(round((len(messages) - 1) * idx / max(limit - 1, 1)))
|
||
for idx in range(min(limit, len(messages)))
|
||
}
|
||
selected: List[Dict[str, str]] = []
|
||
seen = set()
|
||
for idx in sorted(indexes):
|
||
item = messages[idx]
|
||
content = str(item.get("content") or "").strip()
|
||
if not content:
|
||
continue
|
||
normalized = cls._normalize_template_text(content)
|
||
if normalized and normalized in seen:
|
||
continue
|
||
if normalized:
|
||
seen.add(normalized)
|
||
selected.append({
|
||
"time": str(item.get("timestamp_text") or ""),
|
||
"date": cls._format_date(item.get("timestamp")),
|
||
"hm": cls._format_hm(item.get("timestamp")),
|
||
"nickname": str(item.get("nickname") or ""),
|
||
"content": content[:90],
|
||
})
|
||
if len(selected) >= limit:
|
||
break
|
||
return selected
|
||
|
||
@classmethod
|
||
def _build_session_storyline(
|
||
cls,
|
||
messages: List[Dict[str, Any]],
|
||
bucket_stats: List[Dict[str, Any]],
|
||
*,
|
||
top_terms_limit: int = 8,
|
||
sample_limit: int = 10,
|
||
) -> Dict[str, Any]:
|
||
"""
|
||
组装单场直播的轻量叙事骨架。
|
||
这里不追求大而全,而是给模型一个“这场直播从头到尾在发生什么”的概览,
|
||
让它在写日报时更容易把梗、情绪和时间顺序串起来。
|
||
"""
|
||
first_message = messages[0] if messages else {}
|
||
last_message = messages[-1] if messages else {}
|
||
hottest_bucket = max(
|
||
bucket_stats,
|
||
key=lambda item: int(item.get("message_count", 0) or 0),
|
||
default={},
|
||
)
|
||
return {
|
||
"start_time": str(first_message.get("timestamp_text") or ""),
|
||
"end_time": str(last_message.get("timestamp_text") or ""),
|
||
"top_terms": cls._extract_top_terms(messages, limit=top_terms_limit),
|
||
"burst_terms": cls._build_burst_terms(messages)[:6],
|
||
"chronological_samples": cls._build_chronological_samples(messages, limit=sample_limit),
|
||
"hottest_moment": {
|
||
"start_time": str(hottest_bucket.get("start_time") or ""),
|
||
"message_count": int(hottest_bucket.get("message_count", 0) or 0),
|
||
"user_count": int(hottest_bucket.get("user_count", 0) or 0),
|
||
"top_terms": hottest_bucket.get("top_terms", [])[:6],
|
||
"sample_messages": hottest_bucket.get("sample_messages", [])[:6],
|
||
},
|
||
}
|
||
|
||
@staticmethod
|
||
def _simplify_peak_buckets(buckets: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
||
simplified = []
|
||
for bucket in buckets:
|
||
simplified.append({
|
||
"start_time": str(bucket.get("start_time") or ""),
|
||
"message_count": int(bucket.get("message_count", 0) or 0),
|
||
"user_count": int(bucket.get("user_count", 0) or 0),
|
||
"top_terms": bucket.get("top_terms", [])[:6],
|
||
"burst_terms": bucket.get("burst_terms", [])[:5],
|
||
})
|
||
return simplified
|
||
|
||
@staticmethod
|
||
def _format_date(ts: Any) -> str:
|
||
if isinstance(ts, datetime):
|
||
return ts.strftime("%Y-%m-%d")
|
||
return ""
|
||
|
||
@classmethod
|
||
def _format_hm(cls, ts: Any) -> str:
|
||
if isinstance(ts, datetime):
|
||
return ts.strftime(cls.COMPACT_TIME_FORMAT)
|
||
return ""
|
||
|
||
@classmethod
|
||
def _build_speaker_index(
|
||
cls,
|
||
messages: List[Dict[str, Any]],
|
||
*,
|
||
limit: int = 80,
|
||
) -> (List[Dict[str, Any]], Dict[str, str]):
|
||
"""
|
||
构建用户索引表。
|
||
这样时间线块里只保留 `speaker_id`,把 UID/牌子/等级这些重复元信息折叠到索引里,
|
||
既节省 token,也能保留用户画像的完整性。
|
||
"""
|
||
profiles: Dict[str, Dict[str, Any]] = {}
|
||
counts = Counter()
|
||
for item in messages:
|
||
uid = str(item.get("uid") or "").strip()
|
||
if not uid:
|
||
continue
|
||
counts[uid] += 1
|
||
profile = profiles.setdefault(uid, {
|
||
"uid": uid,
|
||
"nickname": str(item.get("nickname") or "").strip(),
|
||
"room_level": 0,
|
||
"fans_name": "",
|
||
"fans_level": 0,
|
||
"noble_name": "",
|
||
})
|
||
if not profile["nickname"]:
|
||
profile["nickname"] = str(item.get("nickname") or "").strip()
|
||
profile["room_level"] = max(int(profile.get("room_level", 0) or 0), int(item.get("room_level", 0) or 0))
|
||
if int(item.get("fans_level", 0) or 0) > int(profile.get("fans_level", 0) or 0):
|
||
profile["fans_level"] = int(item.get("fans_level", 0) or 0)
|
||
if not str(profile.get("fans_name") or "").strip():
|
||
profile["fans_name"] = str(item.get("fans_name") or "").strip()
|
||
if not str(profile.get("noble_name") or "").strip():
|
||
profile["noble_name"] = str(item.get("noble_name") or "").strip()
|
||
|
||
speaker_index: List[Dict[str, Any]] = []
|
||
speaker_alias_map: Dict[str, str] = {}
|
||
for idx, (uid, count) in enumerate(counts.most_common(limit), start=1):
|
||
profile = profiles.get(uid, {})
|
||
speaker_id = f"U{idx:02d}"
|
||
speaker_alias_map[uid] = speaker_id
|
||
speaker_index.append({
|
||
"speaker_id": speaker_id,
|
||
"nickname": str(profile.get("nickname") or "").strip(),
|
||
# 只保留 UID 尾号,方便定位老观众/同名用户,又不会把整串 UUID 反复塞给模型。
|
||
"uid_tail": uid[-4:] if len(uid) >= 4 else uid,
|
||
"badge_name": str(profile.get("fans_name") or "").strip(),
|
||
"badge_level": int(profile.get("fans_level", 0) or 0),
|
||
"room_level": int(profile.get("room_level", 0) or 0),
|
||
"noble_name": str(profile.get("noble_name") or "").strip(),
|
||
"message_count": count,
|
||
})
|
||
return speaker_index, speaker_alias_map
|
||
|
||
@classmethod
|
||
def _build_timeline_digest(
|
||
cls,
|
||
messages: List[Dict[str, Any]],
|
||
speaker_alias_map: Dict[str, str],
|
||
*,
|
||
bucket_minutes: int = 5,
|
||
limit: int = 24,
|
||
samples_per_bucket: int = 6,
|
||
) -> List[Dict[str, Any]]:
|
||
"""
|
||
把弹幕按时间窗口压成事件块。
|
||
每个块里同时保留:
|
||
1. 热度数据;
|
||
2. 重复刷屏的整句线索;
|
||
3. 少量原声样本。
|
||
这比按词切碎更容易让模型理解“这一段到底发生了什么”。
|
||
"""
|
||
buckets: Dict[str, List[Dict[str, Any]]] = defaultdict(list)
|
||
bucket_dt_map: Dict[str, datetime] = {}
|
||
for item in messages:
|
||
ts = item.get("timestamp")
|
||
if not isinstance(ts, datetime):
|
||
continue
|
||
bucket_minute = (ts.minute // bucket_minutes) * bucket_minutes
|
||
bucket_dt = ts.replace(minute=bucket_minute, second=0)
|
||
bucket_key = bucket_dt.strftime("%Y-%m-%d %H:%M:%S")
|
||
bucket_dt_map[bucket_key] = bucket_dt
|
||
buckets[bucket_key].append(item)
|
||
|
||
timeline_blocks: List[Dict[str, Any]] = []
|
||
for bucket_key in sorted(buckets.keys()):
|
||
bucket_messages = buckets[bucket_key]
|
||
repeated = cls._build_repeated_messages(bucket_messages, limit=3)
|
||
samples: List[Dict[str, Any]] = []
|
||
seen_contents = set()
|
||
for item in bucket_messages:
|
||
content = str(item.get("content") or "").strip()
|
||
if not content:
|
||
continue
|
||
normalized = cls._normalize_template_text(content)
|
||
if normalized and normalized in seen_contents:
|
||
continue
|
||
if normalized:
|
||
seen_contents.add(normalized)
|
||
uid = str(item.get("uid") or "").strip()
|
||
samples.append({
|
||
"speaker_id": speaker_alias_map.get(uid, "U00"),
|
||
"hm": cls._format_hm(item.get("timestamp")),
|
||
"content": content[:90],
|
||
})
|
||
if len(samples) >= samples_per_bucket:
|
||
break
|
||
|
||
bucket_dt = bucket_dt_map.get(bucket_key)
|
||
timeline_blocks.append({
|
||
"date": cls._format_date(bucket_dt),
|
||
"start_hm": cls._format_hm(bucket_dt),
|
||
"message_count": len(bucket_messages),
|
||
"user_count": len({
|
||
str(item.get("uid") or "") for item in bucket_messages if str(item.get("uid") or "").strip()
|
||
}),
|
||
"repeated_cues": [
|
||
{
|
||
"text": str(item.get("text") or "").strip()[:80],
|
||
"count": int(item.get("count", 0) or 0),
|
||
"user_count": int(item.get("user_count", 0) or 0),
|
||
}
|
||
for item in repeated
|
||
if str(item.get("text") or "").strip()
|
||
],
|
||
"samples": samples,
|
||
})
|
||
return timeline_blocks[:limit]
|
||
|
||
@classmethod
|
||
def _build_content_cues(
|
||
cls,
|
||
messages: List[Dict[str, Any]],
|
||
*,
|
||
repeated_messages: List[Dict[str, Any]],
|
||
burst_terms: List[Dict[str, Any]],
|
||
limit: int = 18,
|
||
) -> List[Dict[str, Any]]:
|
||
"""
|
||
生成不依赖中文分词的高频内容线索。
|
||
规则:
|
||
1. 优先保留整句复读内容;
|
||
2. 短促情绪词单独保留为 burst;
|
||
3. 对短句高频原话做补充,不把中文切碎成词。
|
||
"""
|
||
cues: List[Dict[str, Any]] = []
|
||
seen = set()
|
||
|
||
def push(kind: str, text: str, count: int, user_count: int = 0) -> None:
|
||
value = str(text or "").strip()
|
||
if not value:
|
||
return
|
||
normalized = cls._normalize_template_text(value)
|
||
if not normalized or normalized in seen:
|
||
return
|
||
seen.add(normalized)
|
||
cues.append({
|
||
"kind": kind,
|
||
"text": value[:90],
|
||
"count": int(count or 0),
|
||
"user_count": int(user_count or 0),
|
||
})
|
||
|
||
for item in repeated_messages:
|
||
push(
|
||
"emotion" if cls._normalize_template_text(str(item.get("text") or "")) in cls.GENERIC_REACTION_TERMS else "repeat",
|
||
str(item.get("text") or ""),
|
||
int(item.get("count", 0) or 0),
|
||
int(item.get("user_count", 0) or 0),
|
||
)
|
||
|
||
short_message_counter = Counter()
|
||
short_message_users: Dict[str, Set[str]] = defaultdict(set)
|
||
short_message_text_map: Dict[str, str] = {}
|
||
for item in messages:
|
||
content = str(item.get("content") or "").strip()
|
||
if not content or len(content) > 16 or cls._looks_like_pure_punctuation(content):
|
||
continue
|
||
normalized = cls._normalize_template_text(content)
|
||
if not normalized:
|
||
continue
|
||
short_message_counter[normalized] += 1
|
||
short_message_users[normalized].add(str(item.get("uid") or "").strip())
|
||
short_message_text_map.setdefault(normalized, content)
|
||
for normalized, count in short_message_counter.most_common(limit):
|
||
if count < 2:
|
||
continue
|
||
push(
|
||
"emotion" if normalized in cls.GENERIC_REACTION_TERMS else "short_repeat",
|
||
short_message_text_map.get(normalized, normalized),
|
||
count,
|
||
len([uid for uid in short_message_users.get(normalized, set()) if uid]),
|
||
)
|
||
|
||
for item in burst_terms:
|
||
push(
|
||
"emotion" if cls._normalize_template_text(str(item.get("text") or "")) in cls.GENERIC_REACTION_TERMS else "burst",
|
||
str(item.get("text") or ""),
|
||
int(item.get("count", 0) or 0),
|
||
int(item.get("user_count", 0) or 0),
|
||
)
|
||
|
||
kind_priority = {
|
||
"repeat": 5,
|
||
"short_repeat": 4,
|
||
"burst": 3,
|
||
"emotion": 1,
|
||
}
|
||
cues.sort(
|
||
key=lambda item: (
|
||
int(kind_priority.get(str(item.get("kind") or ""), 0)),
|
||
int(item.get("count", 0) or 0),
|
||
int(item.get("user_count", 0) or 0),
|
||
),
|
||
reverse=True,
|
||
)
|
||
return cues[:limit]
|
||
|
||
@classmethod
|
||
def _build_semantic_fact_hints(cls, messages: List[Dict[str, Any]]) -> Dict[str, Any]:
|
||
"""
|
||
生成“事实型语义提示”。
|
||
这层不是做总结,而是把模型容易漏掉的高价值信息提前挂出来:
|
||
1. 赛事/选人/位置讨论;
|
||
2. 英雄与关键局;
|
||
3. 摄像头、团播人物等场外互动梗。
|
||
"""
|
||
return {
|
||
"topic_clusters": cls._build_fact_topic_clusters(messages),
|
||
"hero_mentions": cls._build_hero_mentions(messages),
|
||
}
|
||
|
||
@classmethod
|
||
def _build_fact_topic_clusters(cls, messages: List[Dict[str, Any]], limit: int = 8) -> List[Dict[str, Any]]:
|
||
clusters: List[Dict[str, Any]] = []
|
||
for config in cls.FACT_CLUSTER_CONFIGS:
|
||
matched_items: List[Dict[str, Any]] = []
|
||
keywords = [str(item).lower() for item in (config.get("keywords") or []) if str(item).strip()]
|
||
for item in messages:
|
||
content = str(item.get("content") or "").strip()
|
||
if not content:
|
||
continue
|
||
lowered = content.lower()
|
||
if any(keyword in lowered for keyword in keywords):
|
||
matched_items.append(item)
|
||
if not matched_items:
|
||
continue
|
||
sample_messages = []
|
||
seen = set()
|
||
for item in matched_items:
|
||
content = str(item.get("content") or "").strip()
|
||
normalized = cls._normalize_template_text(content)
|
||
if not normalized or normalized in seen:
|
||
continue
|
||
seen.add(normalized)
|
||
sample_messages.append({
|
||
"date": cls._format_date(item.get("timestamp")),
|
||
"hm": cls._format_hm(item.get("timestamp")),
|
||
"nickname": str(item.get("nickname") or "").strip(),
|
||
"content": content[:100],
|
||
})
|
||
if len(sample_messages) >= 5:
|
||
break
|
||
if not sample_messages:
|
||
continue
|
||
clusters.append({
|
||
"label": str(config.get("label") or "").strip(),
|
||
"match_count": len(matched_items),
|
||
"user_count": len({
|
||
str(item.get("uid") or "") for item in matched_items if str(item.get("uid") or "").strip()
|
||
}),
|
||
"first_hm": cls._format_hm(matched_items[0].get("timestamp")),
|
||
"last_hm": cls._format_hm(matched_items[-1].get("timestamp")),
|
||
"keywords": config.get("keywords", [])[:8],
|
||
"samples": sample_messages,
|
||
})
|
||
clusters.sort(
|
||
key=lambda item: (
|
||
int(item.get("match_count", 0) or 0),
|
||
int(item.get("user_count", 0) or 0),
|
||
),
|
||
reverse=True,
|
||
)
|
||
return clusters[:limit]
|
||
|
||
@classmethod
|
||
def _build_hero_mentions(cls, messages: List[Dict[str, Any]], limit: int = 6) -> List[Dict[str, Any]]:
|
||
hero_results: List[Dict[str, Any]] = []
|
||
for hero_name, aliases in cls.HERO_ALIASES.items():
|
||
matched_items: List[Dict[str, Any]] = []
|
||
alias_list = [str(alias).lower() for alias in aliases if str(alias).strip()]
|
||
for item in messages:
|
||
content = str(item.get("content") or "").strip()
|
||
if not content:
|
||
continue
|
||
lowered = content.lower()
|
||
if any(alias in lowered for alias in alias_list):
|
||
matched_items.append(item)
|
||
if not matched_items:
|
||
continue
|
||
samples = []
|
||
seen = set()
|
||
for item in matched_items:
|
||
content = str(item.get("content") or "").strip()
|
||
normalized = cls._normalize_template_text(content)
|
||
if not normalized or normalized in seen:
|
||
continue
|
||
seen.add(normalized)
|
||
samples.append({
|
||
"hm": cls._format_hm(item.get("timestamp")),
|
||
"nickname": str(item.get("nickname") or "").strip(),
|
||
"content": content[:100],
|
||
})
|
||
if len(samples) >= 4:
|
||
break
|
||
hero_results.append({
|
||
"hero": hero_name,
|
||
"mention_count": len(matched_items),
|
||
"user_count": len({
|
||
str(item.get("uid") or "") for item in matched_items if str(item.get("uid") or "").strip()
|
||
}),
|
||
"samples": samples,
|
||
})
|
||
hero_results.sort(
|
||
key=lambda item: (
|
||
int(item.get("mention_count", 0) or 0),
|
||
int(item.get("user_count", 0) or 0),
|
||
),
|
||
reverse=True,
|
||
)
|
||
return hero_results[:limit]
|
||
|
||
@staticmethod
|
||
def _looks_like_pure_punctuation(content: str) -> bool:
|
||
text = str(content or "").strip()
|
||
if not text:
|
||
return True
|
||
return re.fullmatch(r"[\W_]+", text, re.UNICODE) is not None
|