# -*- coding: utf-8 -*- import os import re from collections import Counter, defaultdict from datetime import datetime from typing import Any, Dict, List, Optional, Set class DouyuDanmuSummaryHelper: """斗鱼弹幕场次抽取与压缩辅助器。""" LINE_PATTERN = re.compile( r"^\[(?P\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\]\s+" r"(?P.*?)\s+\(UID:\s*(?P[^,\)]+)(?P.*?)\):(?P.*)$" ) STOPWORDS: Set[str] = { "哈哈", "哈哈哈", "hh", "hhh", "啊啊", "啊啊啊", "可以", "这个", "那个", "真的", "就是", "一下", "主播", "兄弟", "老铁", "大家", "我们", "你们", "他们", "不是", "什么", "怎么", "为啥", "有点", "感觉", "这里", "那里", "然后", "今天", "刚刚", "现在", "一个", "一下子", } SHORT_BURST_WORDS: Set[str] = { "666", "6666", "牛", "牛逼", "稳", "寄", "杀", "帅", "好", "行", "绷", "哭", "乐", "哈哈", "哈哈哈", "笑死", "卧槽", "wc", "awsl", "nb", "nbl", "c", "6", } # 这些词对“直播间气氛”有价值,但对“事实提炼”帮助有限。 # 后面在内容线索排序时会适当降权,避免把真正的赛事/英雄/剧情信息淹掉。 GENERIC_REACTION_TERMS: Set[str] = { "哈哈", "哈哈哈", "哈哈哈哈", "哈哈哈哈哈", "哈哈哈哈哈哈", "gg", "g", "888", "1", "啊", "啊?", "坏了", "好起来了", "翻了", } # 高频语义簇配置: # 1. 不做中文分词,而是直接按“直播圈常见话题簇”收证据; # 2. 每个簇都会保留计数、时间范围和原声样本; # 3. 这样模型更容易抓到“今天到底发生了哪些具体事”,而不是只看到大量情绪词。 FACT_CLUSTER_CONFIGS: List[Dict[str, Any]] = [ { "label": "赛事预告与报名动态", "keywords": ["老头杯", "选人", "报名", "开赛", "比赛", "30号", "4月30", "4月30日"], }, { "label": "比赛位置与身份讨论", "keywords": ["1号位", "5号位", "打1", "打5", "教练", "carry", "辅助"], }, { "label": "镜头与外形调侃", "keywords": ["摄像头", "开摄像头", "光头", "秃头", "洗头", "面容", "露脸"], }, { "label": "团播人物与场外关系", "keywords": ["糯糯", "瑶瑶", "冬瓜", "冬瓜强", "白队", "团播", "户外"], }, { "label": "关键对局与局势转折", "keywords": ["奶绿", "muerta", "gg", "翻了", "拿下", "上高地", "守高地", "队友", "大炮", "萨尔", "炸弹人"], }, ] HERO_ALIASES: Dict[str, List[str]] = { "Muerta/奶绿": ["奶绿", "muerta"], "德鲁伊/Lone Druid": ["德鲁伊", "lone druid", "熊德"], "小小/Tiny": ["小小", "tiny"], "帕克/Puck": ["帕克", "puck"], "火猫/Ember Spirit": ["火猫", "ember"], "敌法/Anti-Mage": ["敌法", "am", "anti-mage"], "兽王/Beastmaster": ["兽王", "beastmaster"], "萨尔/Disruptor": ["萨尔", "disruptor"], "炸弹人/Techies": ["炸弹人", "techies"], } NOISE_PATTERNS = [ re.compile(r"本条弹幕.*机器人", re.I), re.compile(r"请不要.*统计机器人数", re.I), re.compile(r"原来直播间有.*非机器人用户", re.I), re.compile(r"如果您的直播内容是以观看他人操作为主", re.I), ] TEMPLATE_HINT_PATTERNS = [ re.compile(r"闭目不语任由"), re.compile(r"你就忍心一辈"), re.compile(r"刚刚偷看你直播被老板发现"), re.compile(r"还好老板是蝙蝠侠"), re.compile(r"原来直播间有这么多姐妹"), re.compile(r"之前我一直不敢发弹幕"), re.compile(r"强子也就是吃了直播的红利"), re.compile(r"你声音好像强子"), re.compile(r"怎么回事强子"), re.compile(r"你是个人吗强"), ] TEMPLATE_MIN_LENGTH = 14 TEMPLATE_MIN_REPEAT = 4 REPEAT_MIN_COUNT = 3 # 统一使用分钟级时间,是因为日报场景里秒级时间几乎不会增加理解价值, # 但会显著拉长 LLM 输入;保留到 `HH:MM` 能兼顾时序感和压缩率。 COMPACT_TIME_FORMAT = "%H:%M" @classmethod def parse_danmu_line(cls, line: str) -> Optional[Dict[str, Any]]: text = str(line or "").strip() if not text: return None match = cls.LINE_PATTERN.match(text) if not match: return None try: ts = datetime.strptime(match.group("timestamp"), "%Y-%m-%d %H:%M:%S") except Exception: return None return { "timestamp": ts, "timestamp_text": match.group("timestamp"), "nickname": match.group("nickname").strip(), "uid": str(match.group("uid")).strip(), **cls._parse_profile_text(str(match.group("profile") or "")), "content": match.group("content").strip(), "raw": text, } @classmethod def load_session_messages(cls, room_id: str, session: Dict[str, Any], base_dir: str = "temp") -> List[Dict[str, Any]]: segments = cls._normalize_segments(session.get("segments", []) or []) if not room_id or not segments: return [] date_keys = sorted({segment["start"].strftime("%Y%m%d") for segment in segments} | {segment["end"].strftime("%Y%m%d") for segment in segments}) collected: List[Dict[str, Any]] = [] for date_key in date_keys: file_path = os.path.join(base_dir, "douyu_danmu", date_key, f"{room_id}_{date_key}.txt") if not os.path.exists(file_path): continue try: with open(file_path, "r", encoding="utf-8") as f: for line in f: parsed = cls.parse_danmu_line(line) if not parsed: continue if cls._in_any_segment(parsed["timestamp"], segments): collected.append(parsed) except Exception: continue return collected @classmethod def load_day_messages(cls, room_id: str, date_key: str, base_dir: str = "temp") -> List[Dict[str, Any]]: file_path = os.path.join(base_dir, "douyu_danmu", date_key, f"{room_id}_{date_key}.txt") if not os.path.exists(file_path): return [] collected: List[Dict[str, Any]] = [] with open(file_path, "r", encoding="utf-8") as f: for line in f: parsed = cls.parse_danmu_line(line) if parsed: collected.append(parsed) return collected @classmethod def load_messages_from_file(cls, file_path: str) -> List[Dict[str, Any]]: """ 从指定文本文件直接读取弹幕。 这个入口主要用于本地调试和样本回归: 1. 不依赖 Redis session; 2. 不要求文件落在 temp/douyu_danmu 目录; 3. 便于直接拿用户提供的测试样本跑提纯链路。 """ path = str(file_path or "").strip() if not path or not os.path.exists(path): return [] collected: List[Dict[str, Any]] = [] with open(path, "r", encoding="utf-8", errors="ignore") as f: for line in f: parsed = cls.parse_danmu_line(line) if parsed: collected.append(parsed) return collected @classmethod def build_summary_material( cls, room_id: str, session: Dict[str, Any], messages: List[Dict[str, Any]], bucket_minutes: int = 5, top_bucket_count: int = 10, ) -> Dict[str, Any]: normalized = [item for item in messages if item and item.get("content")] prepared = cls._prepare_messages(normalized) organized_messages = prepared["organized_messages"] unique_users = {str(item.get("uid") or "") for item in normalized if str(item.get("uid") or "").strip()} organized_unique_users = { str(item.get("uid") or "") for item in organized_messages if str(item.get("uid") or "").strip() } deduped_messages = cls._dedupe_consecutive_messages(organized_messages) burst_terms = cls._build_burst_terms(organized_messages) top_terms = cls._extract_top_terms(organized_messages, limit=30) bucket_stats = cls._build_time_buckets(organized_messages, minutes=bucket_minutes) peak_buckets = sorted(bucket_stats, key=lambda item: item.get("message_count", 0), reverse=True)[:top_bucket_count] return { "session_id": session.get("session_id", ""), "room_id": room_id, "anchor_day": session.get("anchor_day", ""), "nickname": session.get("nickname", ""), "room_name": session.get("room_name", ""), "segments": cls._serialize_segments(session.get("segments", []) or []), "message_count": len(normalized), "noise_filtered_count": len(prepared["noise_messages"]), "organized_message_count": len(organized_messages), "deduped_message_count": len(deduped_messages), "unique_user_count": len(unique_users), "organized_unique_user_count": len(organized_unique_users), "merged_templates": prepared["merged_templates"], "top_terms": top_terms, "burst_terms": burst_terms, "time_buckets": bucket_stats, "peak_buckets": peak_buckets, "representative_messages": cls._pick_representative_messages(organized_messages, bucket_stats), "operator_metrics": cls._build_operator_metrics(normalized, organized_messages), } @classmethod def build_llm_payload( cls, room_id: str, session: Dict[str, Any], messages: List[Dict[str, Any]], bucket_minutes: int = 5, top_bucket_count: int = 8, top_repeat_count: int = 24, ) -> Dict[str, Any]: """ 面向 LLM 的高保真弹幕载荷。 规则: 1. 仅过滤平台/机器人类系统噪音。 2. 相同或高度模板化的内容做聚合,不直接删除。 3. 其他不同内容尽量保留,并按时段/热点组织给模型。 """ normalized = [item for item in messages if item and item.get("content")] prepared = cls._prepare_messages(normalized) organized_messages = prepared["organized_messages"] bucket_stats = cls._build_time_buckets(organized_messages, minutes=bucket_minutes) peak_buckets = sorted( bucket_stats, key=lambda item: item.get("message_count", 0), reverse=True, )[:top_bucket_count] unique_users = {str(item.get("uid") or "") for item in normalized if str(item.get("uid") or "").strip()} organized_unique_users = { str(item.get("uid") or "") for item in organized_messages if str(item.get("uid") or "").strip() } return { "session_meta": { "room_id": room_id, "session_id": session.get("session_id", ""), "anchor_day": session.get("anchor_day", ""), "nickname": session.get("nickname", ""), "room_name": session.get("room_name", ""), "segments": cls._serialize_segments(session.get("segments", []) or []), "message_count": len(normalized), "noise_filtered_count": len(prepared["noise_messages"]), "organized_message_count": len(organized_messages), "unique_user_count": len(unique_users), "organized_unique_user_count": len(organized_unique_users), }, "operator_metrics": cls._build_operator_metrics(normalized, organized_messages), "cleaning_rules": [ "仅过滤系统噪音、机器人探测、平台提示类弹幕。", "明显重复的长模板文案按内容聚合,保留出现次数、人数、首末时间。", "其他相同内容按重复短语归并,但不抹掉不同观点和不同句式。", "高峰时段补充原始弹幕样本,方便 LLM 还原语境。", ], "merged_templates": prepared["merged_templates"], "repeated_messages": cls._build_repeated_messages( organized_messages, limit=top_repeat_count, ), "top_terms": cls._extract_top_terms(organized_messages, limit=30), "burst_terms": cls._build_burst_terms(organized_messages), "peak_buckets": cls._simplify_peak_buckets(peak_buckets), "representative_messages": cls._pick_representative_messages(organized_messages, bucket_stats), "raw_window_samples": cls._build_raw_window_samples(peak_buckets, per_bucket_limit=8), # 给日报类 LLM 再补一层“按时间推进的现场切片”。 # 这样模型除了看热点窗口,还能顺着时间线理解气氛如何起、如何变、最后怎么收, # 对粉丝日报这类强调“节目效果”和“接梗链路”的文本尤其有帮助。 "chronological_samples": cls._build_chronological_samples(organized_messages, limit=20), # 每个 session 单独给一个轻量摘要,避免多场直播合并后, # 模型只看到全局热点而丢失“第一场在聊什么、第二场为什么突然转节奏”的信息。 "session_storyline": cls._build_session_storyline( organized_messages, bucket_stats, top_terms_limit=8, sample_limit=10, ), # 这是专门给 LLM 准备的“压缩但保真”的材料层: # 1. 用户画像从逐条弹幕里抽出来,避免昵称/牌子/等级在每条消息里重复出现; # 2. 时间统一压到日期 + 时分,减少无意义的秒级噪音; # 3. 主体按时间线块组织,更像“现场事件流”,比中文分词更适合日报生成。 "compact_prompt_assets": cls.build_compact_prompt_assets( organized_messages, bucket_minutes=bucket_minutes, ), } @classmethod def build_compact_prompt_assets( cls, messages: List[Dict[str, Any]], *, bucket_minutes: int = 5, speaker_limit: int = 80, timeline_limit: int = 24, samples_per_bucket: int = 6, cue_limit: int = 18, ) -> Dict[str, Any]: """ 生成专供 LLM 使用的压缩材料。 设计目标: 1. 不做中文分词,尽量保留“整句/整段弹幕”的原始信息密度; 2. 把重复出现的用户元信息抽到索引表,降低 token 浪费; 3. 把现场内容组织成时间线块,帮助模型理解节奏推进和集体起哄链路。 """ ordered_messages = sorted( [item for item in messages if item and str(item.get("content") or "").strip()], key=lambda item: item.get("timestamp") or datetime.min, ) if not ordered_messages: return { "speaker_index": [], "timeline_digest": [], "content_cues": [], } speaker_index, speaker_alias_map = cls._build_speaker_index( ordered_messages, limit=speaker_limit, ) repeated_messages = cls._build_repeated_messages(ordered_messages, limit=cue_limit) burst_terms = cls._build_burst_terms(ordered_messages) return { "speaker_index": speaker_index, "timeline_digest": cls._build_timeline_digest( ordered_messages, speaker_alias_map, bucket_minutes=bucket_minutes, limit=timeline_limit, samples_per_bucket=samples_per_bucket, ), "content_cues": cls._build_content_cues( ordered_messages, repeated_messages=repeated_messages, burst_terms=burst_terms, limit=cue_limit, ), # 再补一层“事实型提示”,专门抬高赛事、位置、英雄、镜头梗、关键对局等信息密度高的内容。 "semantic_fact_hints": cls._build_semantic_fact_hints(ordered_messages), } @staticmethod def _parse_profile_text(profile_text: str) -> Dict[str, Any]: text = str(profile_text or "").strip() room_level = 0 fans_name = "" fans_level = 0 noble_name = "" room_match = re.search(r",\s*Lv\s*(\d+)", text, re.I) if room_match: try: room_level = int(room_match.group(1)) except Exception: room_level = 0 fans_match = re.search(r"/\s*([^/]+?)\s+Lv\s*(\d+)", text, re.I) if fans_match: fans_name = str(fans_match.group(1) or "").strip() try: fans_level = int(fans_match.group(2)) except Exception: fans_level = 0 noble_match = re.search(r"/\s*(骑士|子爵|伯爵|公爵|国王|皇帝|游侠|超级皇帝|幻神)\b", text) if noble_match: noble_name = str(noble_match.group(1) or "").strip() return { "room_level": room_level, "fans_name": fans_name, "fans_level": fans_level, "noble_name": noble_name, "has_fans_badge": bool(fans_name), "has_noble": bool(noble_name), } @classmethod def _build_operator_metrics(cls, messages: List[Dict[str, Any]], organized_messages: List[Dict[str, Any]]) -> Dict[str, Any]: user_profiles: Dict[str, Dict[str, Any]] = {} user_message_count = Counter() user_organized_count = Counter() fans_badge_users: Set[str] = set() noble_users: Set[str] = set() high_room_level_users: Set[str] = set() high_fans_level_users: Set[str] = set() fans_badge_message_count = 0 noble_message_count = 0 room_level_histogram = Counter() fans_level_histogram = Counter() badge_user_counter = Counter() badge_message_counter = Counter() for item in messages: uid = str(item.get("uid") or "").strip() if not uid: continue user_message_count[uid] += 1 profile = user_profiles.setdefault(uid, { "uid": uid, "nickname": str(item.get("nickname") or "").strip(), "room_level": 0, "fans_name": "", "fans_level": 0, "noble_name": "", }) room_level = int(item.get("room_level") or 0) fans_level = int(item.get("fans_level") or 0) fans_name = str(item.get("fans_name") or "").strip() noble_name = str(item.get("noble_name") or "").strip() if room_level > int(profile.get("room_level") or 0): profile["room_level"] = room_level if fans_level > int(profile.get("fans_level") or 0): profile["fans_level"] = fans_level if fans_name and not profile.get("fans_name"): profile["fans_name"] = fans_name if noble_name and not profile.get("noble_name"): profile["noble_name"] = noble_name if profile.get("nickname") == "" and str(item.get("nickname") or "").strip(): profile["nickname"] = str(item.get("nickname") or "").strip() if fans_name: fans_badge_users.add(uid) fans_badge_message_count += 1 badge_user_counter[fans_name] += 0 badge_message_counter[fans_name] += 1 if noble_name: noble_users.add(uid) noble_message_count += 1 for uid, profile in user_profiles.items(): room_level = int(profile.get("room_level") or 0) fans_level = int(profile.get("fans_level") or 0) fans_name = str(profile.get("fans_name") or "").strip() noble_name = str(profile.get("noble_name") or "").strip() room_level_histogram[cls._level_bucket(room_level)] += 1 if fans_name: badge_user_counter[fans_name] += 1 fans_level_histogram[cls._fans_level_bucket(fans_level)] += 1 if noble_name: noble_users.add(uid) if room_level >= 30: high_room_level_users.add(uid) if fans_level >= 10: high_fans_level_users.add(uid) for item in organized_messages: uid = str(item.get("uid") or "").strip() if uid: user_organized_count[uid] += 1 active_users_5plus = sum(1 for uid, count in user_message_count.items() if count >= 5) active_users_10plus = sum(1 for uid, count in user_message_count.items() if count >= 10) top_active_users = [] for uid, count in user_message_count.most_common(12): profile = user_profiles.get(uid, {}) top_active_users.append({ "uid": uid, "nickname": str(profile.get("nickname") or ""), "message_count": count, "organized_message_count": int(user_organized_count.get(uid, 0) or 0), "room_level": int(profile.get("room_level", 0) or 0), "fans_name": str(profile.get("fans_name") or ""), "fans_level": int(profile.get("fans_level", 0) or 0), "noble_name": str(profile.get("noble_name") or ""), }) top_badges = [] for badge_name, unique_user_count in badge_user_counter.most_common(10): if not badge_name: continue top_badges.append({ "badge_name": badge_name, "user_count": unique_user_count, "message_count": int(badge_message_counter.get(badge_name, 0) or 0), }) total_unique_users = max(len(user_profiles), 1) return { "active_unique_users": len(user_profiles), "active_users_5plus": active_users_5plus, "active_users_10plus": active_users_10plus, "fans_badge_user_count": len(fans_badge_users), "fans_badge_user_ratio": round(len(fans_badge_users) / total_unique_users, 4), "fans_badge_message_count": fans_badge_message_count, "high_room_level_user_count": len(high_room_level_users), "high_room_level_threshold": 30, "high_fans_level_user_count": len(high_fans_level_users), "high_fans_level_threshold": 10, "noble_user_count": len(noble_users), "noble_message_count": noble_message_count, "room_level_distribution": [ {"bucket": bucket, "user_count": count} for bucket, count in sorted(room_level_histogram.items()) ], "fans_level_distribution": [ {"bucket": bucket, "user_count": count} for bucket, count in sorted(fans_level_histogram.items()) ], "top_badges": top_badges, "top_active_users": top_active_users, } @staticmethod def _level_bucket(level: int) -> str: if level >= 40: return "40+" if level >= 30: return "30-39" if level >= 20: return "20-29" if level >= 10: return "10-19" return "1-9" @staticmethod def _fans_level_bucket(level: int) -> str: if level >= 20: return "20+" if level >= 15: return "15-19" if level >= 10: return "10-14" if level >= 5: return "5-9" return "1-4" @classmethod def infer_sessions_from_messages( cls, room_id: str, messages: List[Dict[str, Any]], *, session_cutoff_hour: int = 6, merge_gap_hours: int = 4, min_session_messages: int = 50, ) -> List[Dict[str, Any]]: if not messages: return [] ordered = sorted(messages, key=lambda item: item.get("timestamp") or datetime.min) sessions: List[Dict[str, Any]] = [] current_messages: List[Dict[str, Any]] = [] def flush_current(): if len(current_messages) < min_session_messages: return session = cls._build_inferred_session( room_id, current_messages, session_cutoff_hour=session_cutoff_hour, ) if session: sessions.append(session) prev_dt: Optional[datetime] = None for item in ordered: ts = item.get("timestamp") if not isinstance(ts, datetime): continue if prev_dt is not None: gap_seconds = (ts - prev_dt).total_seconds() if gap_seconds > merge_gap_hours * 3600: flush_current() current_messages = [] current_messages.append(item) prev_dt = ts flush_current() return sessions @classmethod def _build_inferred_session( cls, room_id: str, messages: List[Dict[str, Any]], *, session_cutoff_hour: int = 6, ) -> Optional[Dict[str, Any]]: if not messages: return None ordered = sorted(messages, key=lambda item: item.get("timestamp") or datetime.min) start_dt = ordered[0]["timestamp"] end_dt = ordered[-1]["timestamp"] anchor_dt = start_dt if start_dt.hour < session_cutoff_hour: from datetime import timedelta anchor_dt = start_dt - timedelta(days=1) anchor_day = anchor_dt.strftime("%Y-%m-%d") segments = [] seg_start = ordered[0]["timestamp"] prev_dt = ordered[0]["timestamp"] for item in ordered[1:]: current_dt = item["timestamp"] if (current_dt - prev_dt).total_seconds() > 30 * 60: segments.append({ "start_time": seg_start.strftime("%Y-%m-%d %H:%M:%S"), "end_time": prev_dt.strftime("%Y-%m-%d %H:%M:%S"), }) seg_start = current_dt prev_dt = current_dt segments.append({ "start_time": seg_start.strftime("%Y-%m-%d %H:%M:%S"), "end_time": end_dt.strftime("%Y-%m-%d %H:%M:%S"), }) return { "session_id": f"{room_id}_{anchor_day.replace('-', '')}_{start_dt.strftime('%H%M%S')}", "room_id": room_id, "anchor_day": anchor_day, "nickname": "", "room_name": "", "segments": segments, "is_live": False, "source": "inferred_from_danmu", } @staticmethod def _normalize_segments(segments: List[Dict[str, Any]]) -> List[Dict[str, datetime]]: normalized = [] for item in segments: try: start_dt = datetime.strptime(str(item.get("start_time") or ""), "%Y-%m-%d %H:%M:%S") end_dt = datetime.strptime(str(item.get("end_time") or ""), "%Y-%m-%d %H:%M:%S") except Exception: continue if end_dt < start_dt: continue normalized.append({"start": start_dt, "end": end_dt}) return normalized @staticmethod def _serialize_segments(segments: List[Dict[str, Any]]) -> List[Dict[str, str]]: result = [] for item in segments: start_time = str(item.get("start_time") or "").strip() end_time = str(item.get("end_time") or "").strip() if start_time and end_time: result.append({"start_time": start_time, "end_time": end_time}) return result @staticmethod def _in_any_segment(target_dt: datetime, segments: List[Dict[str, datetime]]) -> bool: for segment in segments: if segment["start"] <= target_dt <= segment["end"]: return True return False @staticmethod def _dedupe_consecutive_messages(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]: result: List[Dict[str, Any]] = [] prev_key = None repeat_count = 0 for item in messages: current_key = (item.get("uid"), item.get("content")) if current_key == prev_key: repeat_count += 1 result[-1]["repeat_count"] = repeat_count continue copied = dict(item) copied["repeat_count"] = 1 result.append(copied) prev_key = current_key repeat_count = 1 return result @classmethod def _filter_noise_messages(cls, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]: result = [] for item in messages: content = str(item.get("content") or "").strip() if not content: continue if cls._is_noise_message(content): continue result.append(item) return result @classmethod def _prepare_messages(cls, messages: List[Dict[str, Any]]) -> Dict[str, Any]: noise_messages: List[Dict[str, Any]] = [] candidate_messages: List[Dict[str, Any]] = [] for item in messages: content = str(item.get("content") or "").strip() if not content: continue if cls._is_noise_message(content): noise_messages.append(item) continue candidate_messages.append(item) merged_templates, organized_messages = cls._merge_template_messages(candidate_messages) return { "noise_messages": noise_messages, "merged_templates": merged_templates, "organized_messages": organized_messages, } @classmethod def _is_noise_message(cls, content: str) -> bool: text = str(content or "").strip() if not text: return True for pattern in cls.NOISE_PATTERNS: if pattern.search(text): return True if len(text) >= 30 and len(set(text)) <= 6: return True return False @classmethod def _merge_template_messages(cls, messages: List[Dict[str, Any]]) -> (List[Dict[str, Any]], List[Dict[str, Any]]): normalized_counter = Counter() normalized_to_messages: Dict[str, List[Dict[str, Any]]] = defaultdict(list) for item in messages: normalized = cls._normalize_template_text(str(item.get("content") or "")) if not normalized: continue normalized_counter[normalized] += 1 normalized_to_messages[normalized].append(item) template_keys = { key for key, count in normalized_counter.items() if len(key) >= cls.TEMPLATE_MIN_LENGTH and count >= cls.TEMPLATE_MIN_REPEAT } for key in list(normalized_counter.keys()): if any(pattern.search(key) for pattern in cls.TEMPLATE_HINT_PATTERNS): template_keys.add(key) merged_templates: List[Dict[str, Any]] = [] organized_messages: List[Dict[str, Any]] = [] for normalized, items in normalized_to_messages.items(): if normalized in template_keys: first = items[0] merged_templates.append({ "text": str(first.get("content") or "").strip()[:120], "count": len(items), "user_count": len({str(item.get('uid') or '') for item in items if str(item.get('uid') or '').strip()}), "first_time": str(first.get("timestamp_text") or ""), "last_time": str(items[-1].get("timestamp_text") or ""), }) else: organized_messages.extend(items) merged_templates.sort(key=lambda item: item.get("count", 0), reverse=True) organized_messages.sort(key=lambda item: item.get("timestamp") or datetime.min) return merged_templates[:20], organized_messages @classmethod def _build_repeated_messages(cls, messages: List[Dict[str, Any]], limit: int = 24) -> List[Dict[str, Any]]: grouped: Dict[str, List[Dict[str, Any]]] = defaultdict(list) for item in messages: content = str(item.get("content") or "").strip() if not content: continue normalized = cls._normalize_template_text(content) if not normalized: continue if cls._looks_like_pure_punctuation(content): continue grouped[normalized].append(item) repeated_messages: List[Dict[str, Any]] = [] for items in grouped.values(): if len(items) < cls.REPEAT_MIN_COUNT: continue first = items[0] repeated_messages.append({ "text": str(first.get("content") or "").strip()[:120], "count": len(items), "user_count": len({ str(item.get("uid") or "") for item in items if str(item.get("uid") or "").strip() }), "first_time": str(first.get("timestamp_text") or ""), "last_time": str(items[-1].get("timestamp_text") or ""), "first_date": cls._format_date(first.get("timestamp")), "first_hm": cls._format_hm(first.get("timestamp")), "last_date": cls._format_date(items[-1].get("timestamp")), "last_hm": cls._format_hm(items[-1].get("timestamp")), }) repeated_messages.sort(key=lambda item: item.get("count", 0), reverse=True) return repeated_messages[:limit] @classmethod def _build_burst_terms(cls, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]: counters: Dict[str, Dict[str, Any]] = {} for item in messages: content = str(item.get("content") or "").strip().lower() if not content: continue if content not in cls.SHORT_BURST_WORDS and len(content) > 6: continue target = counters.setdefault(content, {"text": content, "count": 0, "users": set()}) target["count"] += 1 target["users"].add(str(item.get("uid") or "")) result = [] for item in sorted(counters.values(), key=lambda entry: entry["count"], reverse=True)[:15]: result.append({ "text": item["text"], "count": item["count"], "user_count": len([uid for uid in item["users"] if uid]), }) return result @classmethod def _extract_top_terms(cls, messages: List[Dict[str, Any]], limit: int = 30) -> List[Dict[str, Any]]: counter = Counter() for item in messages: for token in cls._tokenize(str(item.get("content") or "")): counter[token] += 1 result = [] for term, count in counter.most_common(limit): result.append({"term": term, "count": count}) return result @classmethod def _tokenize(cls, content: str) -> List[str]: text = str(content or "").lower().strip() if not text: return [] chinese_terms = re.findall(r"[\u4e00-\u9fff]{2,6}", text) alpha_terms = re.findall(r"[a-z0-9_\-]{3,20}", text) tokens = [] for token in chinese_terms + alpha_terms: normalized = token.strip().lower() if not normalized or normalized in cls.STOPWORDS: continue if normalized.isdigit() and len(normalized) <= 2: continue tokens.append(normalized) return tokens @staticmethod def _normalize_template_text(content: str) -> str: text = str(content or "").strip().lower() if not text: return "" text = re.sub(r"\s+", "", text) text = re.sub(r"[`~!@#$%^&*()_\-+=\[\]{}\\|;:'\",.<>/?,。!?、…()【】《》“”‘’·]", "", text) text = re.sub(r"(.)\1{4,}", r"\1\1\1", text) return text @classmethod def _build_time_buckets(cls, messages: List[Dict[str, Any]], minutes: int = 5) -> List[Dict[str, Any]]: buckets: Dict[str, List[Dict[str, Any]]] = defaultdict(list) bucket_dt_map: Dict[str, datetime] = {} for item in messages: ts = item.get("timestamp") if not isinstance(ts, datetime): continue bucket_minute = (ts.minute // minutes) * minutes bucket_key = ts.replace(minute=bucket_minute, second=0) bucket_text = bucket_key.strftime("%Y-%m-%d %H:%M:%S") bucket_dt_map[bucket_text] = bucket_key buckets[bucket_text].append(item) results: List[Dict[str, Any]] = [] for bucket_start, items in sorted(buckets.items()): bucket_dt = bucket_dt_map.get(bucket_start) top_terms = cls._extract_top_terms(items, limit=8) burst_terms = cls._build_burst_terms(items)[:5] results.append({ "start_time": bucket_start, "date": cls._format_date(bucket_dt), "start_hm": cls._format_hm(bucket_dt), "message_count": len(items), "user_count": len({str(item.get("uid") or "") for item in items if str(item.get("uid") or "").strip()}), "top_terms": top_terms, "burst_terms": burst_terms, "sample_messages": cls._pick_bucket_samples(items, limit=6), }) return results @staticmethod def _pick_bucket_samples(items: List[Dict[str, Any]], limit: int = 6) -> List[Dict[str, str]]: if not items: return [] indexes = sorted({0, len(items) // 3, len(items) // 2, (len(items) * 2) // 3, len(items) - 1}) selected = [] seen = set() for idx in indexes: item = items[idx] content = str(item.get("content") or "").strip() if not content or content in seen: continue selected.append({ "time": str(item.get("timestamp_text") or ""), "date": DouyuDanmuSummaryHelper._format_date(item.get("timestamp")), "hm": DouyuDanmuSummaryHelper._format_hm(item.get("timestamp")), "nickname": str(item.get("nickname") or ""), "content": content[:80], }) seen.add(content) if len(selected) >= limit: break # 固定位置采样只能快速抓到“窗口骨架”,但在弹幕量大时通常不足以凑满 limit。 # 这里继续顺序补样本,把同一热点窗口里更多真实原声带给 LLM, # 减少模型只看到 4-5 条孤立短句、难以还原现场氛围的问题。 if len(selected) < limit: for item in items: content = str(item.get("content") or "").strip() if not content or content in seen: continue selected.append({ "time": str(item.get("timestamp_text") or ""), "date": DouyuDanmuSummaryHelper._format_date(item.get("timestamp")), "hm": DouyuDanmuSummaryHelper._format_hm(item.get("timestamp")), "nickname": str(item.get("nickname") or ""), "content": content[:80], }) seen.add(content) if len(selected) >= limit: break return selected @classmethod def _pick_representative_messages(cls, messages: List[Dict[str, Any]], buckets: List[Dict[str, Any]]) -> List[Dict[str, str]]: selected: List[Dict[str, str]] = [] seen = set() for bucket in sorted(buckets, key=lambda item: item.get("message_count", 0), reverse=True)[:6]: for sample in bucket.get("sample_messages", []): content = str(sample.get("content") or "").strip() if not content or content in seen: continue selected.append(sample) seen.add(content) if len(selected) >= 18: return selected for item in messages: content = str(item.get("content") or "").strip() if not content or content in seen: continue selected.append({ "time": str(item.get("timestamp_text") or ""), "date": cls._format_date(item.get("timestamp")), "hm": cls._format_hm(item.get("timestamp")), "nickname": str(item.get("nickname") or ""), "content": content[:80], }) seen.add(content) if len(selected) >= 18: break return selected @classmethod def _build_raw_window_samples( cls, peak_buckets: List[Dict[str, Any]], per_bucket_limit: int = 8, ) -> List[Dict[str, Any]]: windows: List[Dict[str, Any]] = [] for bucket in peak_buckets: samples = [] for sample in bucket.get("sample_messages", [])[:per_bucket_limit]: content = str(sample.get("content") or "").strip() if not content: continue samples.append({ "time": str(sample.get("time") or ""), "date": str(sample.get("date") or ""), "hm": str(sample.get("hm") or ""), "nickname": str(sample.get("nickname") or ""), "content": content, }) if not samples: continue windows.append({ "start_time": str(bucket.get("start_time") or ""), "message_count": int(bucket.get("message_count", 0) or 0), "user_count": int(bucket.get("user_count", 0) or 0), "samples": samples, }) return windows @classmethod def _build_chronological_samples( cls, messages: List[Dict[str, Any]], limit: int = 20, ) -> List[Dict[str, str]]: """ 从整场弹幕里按时间均匀抽取样本。 设计目的: 1. 热点窗口只能解释“最炸的几分钟”,但日报还需要理解整体节奏; 2. 顺时序样本能帮助 LLM 看到开场铺垫、中段起哄、尾段收束; 3. 对粉丝日报来说,这比单纯词频更容易还原“今天到底经历了什么”。 """ if not messages: return [] indexes = { int(round((len(messages) - 1) * idx / max(limit - 1, 1))) for idx in range(min(limit, len(messages))) } selected: List[Dict[str, str]] = [] seen = set() for idx in sorted(indexes): item = messages[idx] content = str(item.get("content") or "").strip() if not content: continue normalized = cls._normalize_template_text(content) if normalized and normalized in seen: continue if normalized: seen.add(normalized) selected.append({ "time": str(item.get("timestamp_text") or ""), "date": cls._format_date(item.get("timestamp")), "hm": cls._format_hm(item.get("timestamp")), "nickname": str(item.get("nickname") or ""), "content": content[:90], }) if len(selected) >= limit: break return selected @classmethod def _build_session_storyline( cls, messages: List[Dict[str, Any]], bucket_stats: List[Dict[str, Any]], *, top_terms_limit: int = 8, sample_limit: int = 10, ) -> Dict[str, Any]: """ 组装单场直播的轻量叙事骨架。 这里不追求大而全,而是给模型一个“这场直播从头到尾在发生什么”的概览, 让它在写日报时更容易把梗、情绪和时间顺序串起来。 """ first_message = messages[0] if messages else {} last_message = messages[-1] if messages else {} hottest_bucket = max( bucket_stats, key=lambda item: int(item.get("message_count", 0) or 0), default={}, ) return { "start_time": str(first_message.get("timestamp_text") or ""), "end_time": str(last_message.get("timestamp_text") or ""), "top_terms": cls._extract_top_terms(messages, limit=top_terms_limit), "burst_terms": cls._build_burst_terms(messages)[:6], "chronological_samples": cls._build_chronological_samples(messages, limit=sample_limit), "hottest_moment": { "start_time": str(hottest_bucket.get("start_time") or ""), "message_count": int(hottest_bucket.get("message_count", 0) or 0), "user_count": int(hottest_bucket.get("user_count", 0) or 0), "top_terms": hottest_bucket.get("top_terms", [])[:6], "sample_messages": hottest_bucket.get("sample_messages", [])[:6], }, } @staticmethod def _simplify_peak_buckets(buckets: List[Dict[str, Any]]) -> List[Dict[str, Any]]: simplified = [] for bucket in buckets: simplified.append({ "start_time": str(bucket.get("start_time") or ""), "message_count": int(bucket.get("message_count", 0) or 0), "user_count": int(bucket.get("user_count", 0) or 0), "top_terms": bucket.get("top_terms", [])[:6], "burst_terms": bucket.get("burst_terms", [])[:5], }) return simplified @staticmethod def _format_date(ts: Any) -> str: if isinstance(ts, datetime): return ts.strftime("%Y-%m-%d") return "" @classmethod def _format_hm(cls, ts: Any) -> str: if isinstance(ts, datetime): return ts.strftime(cls.COMPACT_TIME_FORMAT) return "" @classmethod def _build_speaker_index( cls, messages: List[Dict[str, Any]], *, limit: int = 80, ) -> (List[Dict[str, Any]], Dict[str, str]): """ 构建用户索引表。 这样时间线块里只保留 `speaker_id`,把 UID/牌子/等级这些重复元信息折叠到索引里, 既节省 token,也能保留用户画像的完整性。 """ profiles: Dict[str, Dict[str, Any]] = {} counts = Counter() for item in messages: uid = str(item.get("uid") or "").strip() if not uid: continue counts[uid] += 1 profile = profiles.setdefault(uid, { "uid": uid, "nickname": str(item.get("nickname") or "").strip(), "room_level": 0, "fans_name": "", "fans_level": 0, "noble_name": "", }) if not profile["nickname"]: profile["nickname"] = str(item.get("nickname") or "").strip() profile["room_level"] = max(int(profile.get("room_level", 0) or 0), int(item.get("room_level", 0) or 0)) if int(item.get("fans_level", 0) or 0) > int(profile.get("fans_level", 0) or 0): profile["fans_level"] = int(item.get("fans_level", 0) or 0) if not str(profile.get("fans_name") or "").strip(): profile["fans_name"] = str(item.get("fans_name") or "").strip() if not str(profile.get("noble_name") or "").strip(): profile["noble_name"] = str(item.get("noble_name") or "").strip() speaker_index: List[Dict[str, Any]] = [] speaker_alias_map: Dict[str, str] = {} for idx, (uid, count) in enumerate(counts.most_common(limit), start=1): profile = profiles.get(uid, {}) speaker_id = f"U{idx:02d}" speaker_alias_map[uid] = speaker_id speaker_index.append({ "speaker_id": speaker_id, "nickname": str(profile.get("nickname") or "").strip(), # 只保留 UID 尾号,方便定位老观众/同名用户,又不会把整串 UUID 反复塞给模型。 "uid_tail": uid[-4:] if len(uid) >= 4 else uid, "badge_name": str(profile.get("fans_name") or "").strip(), "badge_level": int(profile.get("fans_level", 0) or 0), "room_level": int(profile.get("room_level", 0) or 0), "noble_name": str(profile.get("noble_name") or "").strip(), "message_count": count, }) return speaker_index, speaker_alias_map @classmethod def _build_timeline_digest( cls, messages: List[Dict[str, Any]], speaker_alias_map: Dict[str, str], *, bucket_minutes: int = 5, limit: int = 24, samples_per_bucket: int = 6, ) -> List[Dict[str, Any]]: """ 把弹幕按时间窗口压成事件块。 每个块里同时保留: 1. 热度数据; 2. 重复刷屏的整句线索; 3. 少量原声样本。 这比按词切碎更容易让模型理解“这一段到底发生了什么”。 """ buckets: Dict[str, List[Dict[str, Any]]] = defaultdict(list) bucket_dt_map: Dict[str, datetime] = {} for item in messages: ts = item.get("timestamp") if not isinstance(ts, datetime): continue bucket_minute = (ts.minute // bucket_minutes) * bucket_minutes bucket_dt = ts.replace(minute=bucket_minute, second=0) bucket_key = bucket_dt.strftime("%Y-%m-%d %H:%M:%S") bucket_dt_map[bucket_key] = bucket_dt buckets[bucket_key].append(item) timeline_blocks: List[Dict[str, Any]] = [] for bucket_key in sorted(buckets.keys()): bucket_messages = buckets[bucket_key] repeated = cls._build_repeated_messages(bucket_messages, limit=3) samples: List[Dict[str, Any]] = [] seen_contents = set() for item in bucket_messages: content = str(item.get("content") or "").strip() if not content: continue normalized = cls._normalize_template_text(content) if normalized and normalized in seen_contents: continue if normalized: seen_contents.add(normalized) uid = str(item.get("uid") or "").strip() samples.append({ "speaker_id": speaker_alias_map.get(uid, "U00"), "hm": cls._format_hm(item.get("timestamp")), "content": content[:90], }) if len(samples) >= samples_per_bucket: break bucket_dt = bucket_dt_map.get(bucket_key) timeline_blocks.append({ "date": cls._format_date(bucket_dt), "start_hm": cls._format_hm(bucket_dt), "message_count": len(bucket_messages), "user_count": len({ str(item.get("uid") or "") for item in bucket_messages if str(item.get("uid") or "").strip() }), "repeated_cues": [ { "text": str(item.get("text") or "").strip()[:80], "count": int(item.get("count", 0) or 0), "user_count": int(item.get("user_count", 0) or 0), } for item in repeated if str(item.get("text") or "").strip() ], "samples": samples, }) return timeline_blocks[:limit] @classmethod def _build_content_cues( cls, messages: List[Dict[str, Any]], *, repeated_messages: List[Dict[str, Any]], burst_terms: List[Dict[str, Any]], limit: int = 18, ) -> List[Dict[str, Any]]: """ 生成不依赖中文分词的高频内容线索。 规则: 1. 优先保留整句复读内容; 2. 短促情绪词单独保留为 burst; 3. 对短句高频原话做补充,不把中文切碎成词。 """ cues: List[Dict[str, Any]] = [] seen = set() def push(kind: str, text: str, count: int, user_count: int = 0) -> None: value = str(text or "").strip() if not value: return normalized = cls._normalize_template_text(value) if not normalized or normalized in seen: return seen.add(normalized) cues.append({ "kind": kind, "text": value[:90], "count": int(count or 0), "user_count": int(user_count or 0), }) for item in repeated_messages: push( "emotion" if cls._normalize_template_text(str(item.get("text") or "")) in cls.GENERIC_REACTION_TERMS else "repeat", str(item.get("text") or ""), int(item.get("count", 0) or 0), int(item.get("user_count", 0) or 0), ) short_message_counter = Counter() short_message_users: Dict[str, Set[str]] = defaultdict(set) short_message_text_map: Dict[str, str] = {} for item in messages: content = str(item.get("content") or "").strip() if not content or len(content) > 16 or cls._looks_like_pure_punctuation(content): continue normalized = cls._normalize_template_text(content) if not normalized: continue short_message_counter[normalized] += 1 short_message_users[normalized].add(str(item.get("uid") or "").strip()) short_message_text_map.setdefault(normalized, content) for normalized, count in short_message_counter.most_common(limit): if count < 2: continue push( "emotion" if normalized in cls.GENERIC_REACTION_TERMS else "short_repeat", short_message_text_map.get(normalized, normalized), count, len([uid for uid in short_message_users.get(normalized, set()) if uid]), ) for item in burst_terms: push( "emotion" if cls._normalize_template_text(str(item.get("text") or "")) in cls.GENERIC_REACTION_TERMS else "burst", str(item.get("text") or ""), int(item.get("count", 0) or 0), int(item.get("user_count", 0) or 0), ) kind_priority = { "repeat": 5, "short_repeat": 4, "burst": 3, "emotion": 1, } cues.sort( key=lambda item: ( int(kind_priority.get(str(item.get("kind") or ""), 0)), int(item.get("count", 0) or 0), int(item.get("user_count", 0) or 0), ), reverse=True, ) return cues[:limit] @classmethod def _build_semantic_fact_hints(cls, messages: List[Dict[str, Any]]) -> Dict[str, Any]: """ 生成“事实型语义提示”。 这层不是做总结,而是把模型容易漏掉的高价值信息提前挂出来: 1. 赛事/选人/位置讨论; 2. 英雄与关键局; 3. 摄像头、团播人物等场外互动梗。 """ return { "topic_clusters": cls._build_fact_topic_clusters(messages), "hero_mentions": cls._build_hero_mentions(messages), } @classmethod def _build_fact_topic_clusters(cls, messages: List[Dict[str, Any]], limit: int = 8) -> List[Dict[str, Any]]: clusters: List[Dict[str, Any]] = [] for config in cls.FACT_CLUSTER_CONFIGS: matched_items: List[Dict[str, Any]] = [] keywords = [str(item).lower() for item in (config.get("keywords") or []) if str(item).strip()] for item in messages: content = str(item.get("content") or "").strip() if not content: continue lowered = content.lower() if any(keyword in lowered for keyword in keywords): matched_items.append(item) if not matched_items: continue sample_messages = [] seen = set() for item in matched_items: content = str(item.get("content") or "").strip() normalized = cls._normalize_template_text(content) if not normalized or normalized in seen: continue seen.add(normalized) sample_messages.append({ "date": cls._format_date(item.get("timestamp")), "hm": cls._format_hm(item.get("timestamp")), "nickname": str(item.get("nickname") or "").strip(), "content": content[:100], }) if len(sample_messages) >= 5: break if not sample_messages: continue clusters.append({ "label": str(config.get("label") or "").strip(), "match_count": len(matched_items), "user_count": len({ str(item.get("uid") or "") for item in matched_items if str(item.get("uid") or "").strip() }), "first_hm": cls._format_hm(matched_items[0].get("timestamp")), "last_hm": cls._format_hm(matched_items[-1].get("timestamp")), "keywords": config.get("keywords", [])[:8], "samples": sample_messages, }) clusters.sort( key=lambda item: ( int(item.get("match_count", 0) or 0), int(item.get("user_count", 0) or 0), ), reverse=True, ) return clusters[:limit] @classmethod def _build_hero_mentions(cls, messages: List[Dict[str, Any]], limit: int = 6) -> List[Dict[str, Any]]: hero_results: List[Dict[str, Any]] = [] for hero_name, aliases in cls.HERO_ALIASES.items(): matched_items: List[Dict[str, Any]] = [] alias_list = [str(alias).lower() for alias in aliases if str(alias).strip()] for item in messages: content = str(item.get("content") or "").strip() if not content: continue lowered = content.lower() if any(alias in lowered for alias in alias_list): matched_items.append(item) if not matched_items: continue samples = [] seen = set() for item in matched_items: content = str(item.get("content") or "").strip() normalized = cls._normalize_template_text(content) if not normalized or normalized in seen: continue seen.add(normalized) samples.append({ "hm": cls._format_hm(item.get("timestamp")), "nickname": str(item.get("nickname") or "").strip(), "content": content[:100], }) if len(samples) >= 4: break hero_results.append({ "hero": hero_name, "mention_count": len(matched_items), "user_count": len({ str(item.get("uid") or "") for item in matched_items if str(item.get("uid") or "").strip() }), "samples": samples, }) hero_results.sort( key=lambda item: ( int(item.get("mention_count", 0) or 0), int(item.get("user_count", 0) or 0), ), reverse=True, ) return hero_results[:limit] @staticmethod def _looks_like_pure_punctuation(content: str) -> bool: text = str(content or "").strip() if not text: return True return re.fullmatch(r"[\W_]+", text, re.UNICODE) is not None