File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -164,14 +164,13 @@ class DouyuDanmuSummaryHelper:
|
||||
面向 LLM 的高保真弹幕载荷。
|
||||
规则:
|
||||
1. 仅过滤平台/机器人类系统噪音。
|
||||
2. 只对“同内容重复弹幕”做合并,不再做偏激进的模板化折叠。
|
||||
3. 其他不同内容尽量保留,让模型看到更接近当晚现场的讨论全貌。
|
||||
2. 相同或高度模板化的内容做聚合,不直接删除。
|
||||
3. 其他不同内容尽量保留,并按时段/热点组织给模型。
|
||||
"""
|
||||
normalized = [item for item in messages if item and item.get("content")]
|
||||
prepared = cls._prepare_messages_for_llm(normalized)
|
||||
source_messages = prepared["llm_source_messages"]
|
||||
prepared = cls._prepare_messages(normalized)
|
||||
organized_messages = prepared["organized_messages"]
|
||||
bucket_stats = cls._build_time_buckets(source_messages, minutes=bucket_minutes)
|
||||
bucket_stats = cls._build_time_buckets(organized_messages, minutes=bucket_minutes)
|
||||
peak_buckets = sorted(
|
||||
bucket_stats,
|
||||
key=lambda item: item.get("message_count", 0),
|
||||
@@ -200,30 +199,31 @@ class DouyuDanmuSummaryHelper:
|
||||
"operator_metrics": cls._build_operator_metrics(normalized, organized_messages),
|
||||
"cleaning_rules": [
|
||||
"仅过滤系统噪音、机器人探测、平台提示类弹幕。",
|
||||
"只合并同内容重复弹幕,保留出现次数、人数、首末时间。",
|
||||
"不同句式、不同观点、不同刀圈讨论尽量原样保留,不再做模板化压缩。",
|
||||
"热点时段、顺时序样本和原声片段共同保留,方便 LLM 还原完整语境。",
|
||||
"明显重复的长模板文案按内容聚合,保留出现次数、人数、首末时间。",
|
||||
"其他相同内容按重复短语归并,但不抹掉不同观点和不同句式。",
|
||||
"高峰时段补充原始弹幕样本,方便 LLM 还原语境。",
|
||||
],
|
||||
# 字段名继续沿用 merged_templates / repeated_messages,
|
||||
# 目的是兼容下游模板和主流程,实际语义已经切换成“同内容重复弹幕聚合结果”。
|
||||
"merged_templates": prepared["duplicate_groups"][:top_repeat_count],
|
||||
"repeated_messages": prepared["duplicate_groups"][:top_repeat_count],
|
||||
"top_terms": cls._extract_top_terms(source_messages, limit=40),
|
||||
"burst_terms": cls._build_burst_terms(source_messages),
|
||||
"merged_templates": prepared["merged_templates"],
|
||||
"repeated_messages": cls._build_repeated_messages(
|
||||
organized_messages,
|
||||
limit=top_repeat_count,
|
||||
),
|
||||
"top_terms": cls._extract_top_terms(organized_messages, limit=30),
|
||||
"burst_terms": cls._build_burst_terms(organized_messages),
|
||||
"peak_buckets": cls._simplify_peak_buckets(peak_buckets),
|
||||
"representative_messages": cls._pick_representative_messages(organized_messages, bucket_stats),
|
||||
"raw_window_samples": cls._build_raw_window_samples(peak_buckets, per_bucket_limit=12),
|
||||
"raw_window_samples": cls._build_raw_window_samples(peak_buckets, per_bucket_limit=8),
|
||||
# 给日报类 LLM 再补一层“按时间推进的现场切片”。
|
||||
# 这样模型除了看热点窗口,还能顺着时间线理解气氛如何起、如何变、最后怎么收,
|
||||
# 对粉丝日报这类强调“节目效果”和“接梗链路”的文本尤其有帮助。
|
||||
"chronological_samples": cls._build_chronological_samples(organized_messages, limit=28),
|
||||
"chronological_samples": cls._build_chronological_samples(organized_messages, limit=20),
|
||||
# 每个 session 单独给一个轻量摘要,避免多场直播合并后,
|
||||
# 模型只看到全局热点而丢失“第一场在聊什么、第二场为什么突然转节奏”的信息。
|
||||
"session_storyline": cls._build_session_storyline(
|
||||
source_messages,
|
||||
organized_messages,
|
||||
bucket_stats,
|
||||
top_terms_limit=12,
|
||||
sample_limit=14,
|
||||
top_terms_limit=8,
|
||||
sample_limit=10,
|
||||
),
|
||||
}
|
||||
|
||||
@@ -586,36 +586,6 @@ class DouyuDanmuSummaryHelper:
|
||||
"organized_messages": organized_messages,
|
||||
}
|
||||
|
||||
@classmethod
def _prepare_messages_for_llm(cls, messages: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Conservative cleaning pass for the daily-report / fan-report LLM pipeline.

    Design principles:
    1. Only messages flagged by ``cls._is_noise_message`` are filtered out;
       nothing is folded away merely because it looks like a template.
    2. Only messages grouped together by ``cls._merge_exact_duplicate_messages``
       are merged, so a flood of identical lines cannot crowd out the corpus.
    3. Every other message is kept as-is.

    Args:
        messages: Message dicts; each is read through its ``"content"`` key.

    Returns:
        A dict with four keys:
        ``noise_messages`` (items flagged as noise),
        ``llm_source_messages`` (every non-empty, non-noise item, in input order),
        ``organized_messages`` and ``duplicate_groups`` (both as returned by
        ``cls._merge_exact_duplicate_messages``).
    """
    kept: List[Dict[str, Any]] = []
    dropped: List[Dict[str, Any]] = []
    for message in messages:
        text = str(message.get("content") or "").strip()
        if not text:
            # Blank content belongs to neither bucket.
            continue
        target = dropped if cls._is_noise_message(text) else kept
        target.append(message)

    groups, deduplicated = cls._merge_exact_duplicate_messages(kept)
    return {
        "noise_messages": dropped,
        # Every non-noise original message; per the original author's note this
        # list feeds term-frequency, peak-window and discussion-heat statistics.
        "llm_source_messages": kept,
        # One entry per distinct content, used as the voice samples fed to the model.
        "organized_messages": deduplicated,
        "duplicate_groups": groups,
    }
|
||||
|
||||
@classmethod
|
||||
def _is_noise_message(cls, content: str) -> bool:
|
||||
text = str(content or "").strip()
|
||||
@@ -666,52 +636,6 @@ class DouyuDanmuSummaryHelper:
|
||||
organized_messages.sort(key=lambda item: item.get("timestamp") or datetime.min)
|
||||
return merged_templates[:20], organized_messages
|
||||
|
||||
@classmethod
|
||||
def _merge_exact_duplicate_messages(cls, messages: List[Dict[str, Any]]) -> (List[Dict[str, Any]], List[Dict[str, Any]]):
|
||||
"""
|
||||
只合并“同内容重复弹幕”。
|
||||
注意这里的归一化非常保守:
|
||||
1. 只处理大小写、空白和零宽字符;
|
||||
2. 不再去掉标点,不再做模板化抽象;
|
||||
3. 目的是尽量保留不同句式、不同观点和不同刀圈细节。
|
||||
"""
|
||||
grouped_messages: Dict[str, List[Dict[str, Any]]] = defaultdict(list)
|
||||
ordered_keys: List[str] = []
|
||||
for item in messages:
|
||||
normalized = cls._normalize_duplicate_text(str(item.get("content") or ""))
|
||||
if not normalized:
|
||||
continue
|
||||
if normalized not in grouped_messages:
|
||||
ordered_keys.append(normalized)
|
||||
grouped_messages[normalized].append(item)
|
||||
|
||||
duplicate_groups: List[Dict[str, Any]] = []
|
||||
organized_messages: List[Dict[str, Any]] = []
|
||||
for normalized in ordered_keys:
|
||||
items = grouped_messages[normalized]
|
||||
first = items[0]
|
||||
copied = dict(first)
|
||||
copied["repeat_count"] = len(items)
|
||||
copied["repeat_user_count"] = len({
|
||||
str(item.get("uid") or "") for item in items if str(item.get("uid") or "").strip()
|
||||
})
|
||||
organized_messages.append(copied)
|
||||
if len(items) < 2:
|
||||
continue
|
||||
duplicate_groups.append({
|
||||
"text": str(first.get("content") or "").strip()[:120],
|
||||
"count": len(items),
|
||||
"user_count": len({
|
||||
str(item.get("uid") or "") for item in items if str(item.get("uid") or "").strip()
|
||||
}),
|
||||
"first_time": str(first.get("timestamp_text") or ""),
|
||||
"last_time": str(items[-1].get("timestamp_text") or ""),
|
||||
})
|
||||
|
||||
duplicate_groups.sort(key=lambda item: item.get("count", 0), reverse=True)
|
||||
organized_messages.sort(key=lambda item: item.get("timestamp") or datetime.min)
|
||||
return duplicate_groups[:80], organized_messages
|
||||
|
||||
@classmethod
|
||||
def _build_repeated_messages(cls, messages: List[Dict[str, Any]], limit: int = 24) -> List[Dict[str, Any]]:
|
||||
grouped: Dict[str, List[Dict[str, Any]]] = defaultdict(list)
|
||||
@@ -743,22 +667,6 @@ class DouyuDanmuSummaryHelper:
|
||||
repeated_messages.sort(key=lambda item: item.get("count", 0), reverse=True)
|
||||
return repeated_messages[:limit]
|
||||
|
||||
@staticmethod
|
||||
def _normalize_duplicate_text(content: str) -> str:
|
||||
"""
|
||||
LLM 清洗链路的“同内容”判定保持保守。
|
||||
只做最小化标准化,避免把本来不同的表达误并成一类:
|
||||
1. 转小写;
|
||||
2. 去零宽字符;
|
||||
3. 折叠空白。
|
||||
"""
|
||||
text = str(content or "").strip().lower()
|
||||
if not text:
|
||||
return ""
|
||||
text = text.replace("\u200b", "").replace("\ufeff", "")
|
||||
text = re.sub(r"\s+", " ", text)
|
||||
return text
|
||||
|
||||
@classmethod
|
||||
def _build_burst_terms(cls, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
||||
counters: Dict[str, Dict[str, Any]] = {}
|
||||
|
||||
@@ -514,12 +514,12 @@ class DouyuRedisManager:
|
||||
class DouyuPlugin(MessagePluginInterface):
|
||||
# 报告缓存版本号:
|
||||
# 1. 版本升级后会自动让历史缓存失效,避免继续复用旧文本/旧图片;
|
||||
# 2. 本次将版本提升到 11:
|
||||
# - 日报给 LLM 的弹幕清洗策略改成“只去系统噪音 + 只合并同内容重复”;
|
||||
# - 不再对不同表达做强模板压缩,尽量保留现场讨论细节;
|
||||
# 2. 本次将版本提升到 10:
|
||||
# - 新增粉丝日报定时任务链路;
|
||||
# - LLM 输入材料再补充顺时序现场切片与场次故事线;
|
||||
# - 同时让新日报结果自动避开旧缓存污染;
|
||||
# 因此需要刷新旧缓存,确保新版结果真实命中新链路。
|
||||
_DAILY_REPORT_CACHE_VERSION = 11
|
||||
_DAILY_REPORT_CACHE_VERSION = 10
|
||||
FEATURE_KEY = "DOUYU_MONITOR"
|
||||
FEATURE_DESCRIPTION = "🎮 斗鱼开播提醒 [订阅斗鱼 房间号, 取消订阅斗鱼 房间号]"
|
||||
|
||||
@@ -2304,13 +2304,11 @@ class DouyuPlugin(MessagePluginInterface):
|
||||
"3. 另起一行写标题:`【弹幕名场面】`,下面写 4-6 条 bullet,尽量保留弹幕原话风格,像现场回放。\n"
|
||||
"4. 另起一行写标题:`【梗王榜】`,下面写 3 条 bullet,把今天最刷屏、最有共识的梗排出来。\n"
|
||||
"5. 另起一行写标题:`【收尾播报】`,下面只写 1 句收尾,轻松一点,像群里发图后的总结句。\n"
|
||||
"6. 如果材料里出现 Dota/刀圈比赛话题,比如位置分工、老头杯、阵容、选人、比赛评价、解说讨论,必须明确写出来,不能只泛化成“气氛很好”“节目效果很足”。\n"
|
||||
"7. 可以夸张一点、调皮一点,但不要低俗,不要攻击主播,不要使用“建议、策略、转化、数据表现”等运营词。\n\n"
|
||||
"6. 可以夸张一点、调皮一点,但不要低俗,不要攻击主播,不要使用“建议、策略、转化、数据表现”等运营词。\n\n"
|
||||
f"主播:{meta.get('nickname') or meta.get('room_name') or meta.get('room_id')}\n"
|
||||
f"日期:{meta.get('anchor_day', '')}\n"
|
||||
f"{room_context_prompt}"
|
||||
"下面是已经提纯给 LLM 的现场材料。当前清洗策略只合并完全相同的重复弹幕,不同表达会尽量保留。\n"
|
||||
"请优先抓原声弹幕、热点窗口、顺时序讨论推进,以及其中出现的刀圈比赛话题,不要写成空泛热闹总结。\n"
|
||||
"下面是已经提纯给 LLM 的现场材料,请优先抓原声弹幕、热点窗口和集体起哄片段,少写空泛概括。\n"
|
||||
f"材料:\n{json.dumps(prompt_material, ensure_ascii=False, indent=2)}"
|
||||
)
|
||||
return system_prompt, user_prompt
|
||||
|
||||
Reference in New Issue
Block a user