优化斗鱼日报LLM输入提纯并清理think输出

- 为斗鱼日报链路统一接入think和推理内容清洗
- 将提交给LLM的材料改为更聚焦现场弹幕的提纯结构
- 提高热点窗口原声样本量,避免窗口样本长期不足
- 刷新日报缓存版本,确保新提示词和新材料立即生效

This commit is contained in:
liuwei
2026-04-27 13:20:30 +08:00
parent 0253e705e5
commit a830089b10
3 changed files with 220 additions and 12 deletions

View File

@@ -759,6 +759,22 @@ class DouyuDanmuSummaryHelper:
seen.add(content)
if len(selected) >= limit:
break
# 固定位置采样只能快速抓到“窗口骨架”,但在弹幕量大时通常不足以凑满 limit。
# 这里继续顺序补样本,把同一热点窗口里更多真实原声带给 LLM
# 减少模型只看到 4-5 条孤立短句、难以还原现场氛围的问题。
if len(selected) < limit:
for item in items:
content = str(item.get("content") or "").strip()
if not content or content in seen:
continue
selected.append({
"time": str(item.get("timestamp_text") or ""),
"nickname": str(item.get("nickname") or ""),
"content": content[:80],
})
seen.add(content)
if len(selected) >= limit:
break
return selected
@classmethod

File diff suppressed because one or more lines are too long

View File

@@ -30,6 +30,7 @@ from utils.decorator.plugin_decorators import plugin_stats_decorator
from utils.decorator.points_decorator import plugin_points_cost
from utils.markdown_to_image import convert_md_str_to_image, html_to_image
from utils.robot_cmd.robot_command import Feature, PermissionStatus, GroupBotManager
from utils.string_utils import remove_reasoning_content
from wechat_ipad import WechatAPIClient
from wechat_ipad.models.appmsg_xml import DOUYU_MESSAGE_XML
@@ -513,9 +514,11 @@ class DouyuRedisManager:
class DouyuPlugin(MessagePluginInterface):
# 报告缓存版本号:
# 1. 版本升级后会自动让历史缓存失效,避免继续复用旧文本/旧图片;
# 2. 本次将版本提升到 8,背景画像链路删掉了部分过程控制和提示噪音,
# 需要刷新旧日报缓存,确保新版 prompt 使用新的精简上下文。
_DAILY_REPORT_CACHE_VERSION = 8
# 2. 本次将版本提升到 9:
# - LLM 输入材料从“整份大 payload”改成“提纯后的现场材料”;
# - 同时统一清洗 <think> / reasoning 输出;
# 因此需要刷新旧缓存,确保新版结果真实命中新链路。
_DAILY_REPORT_CACHE_VERSION = 9
FEATURE_KEY = "DOUYU_MONITOR"
FEATURE_DESCRIPTION = "🎮 斗鱼开播提醒 [订阅斗鱼 房间号, 取消订阅斗鱼 房间号]"
@@ -764,7 +767,7 @@ class DouyuPlugin(MessagePluginInterface):
1. 模型把 JSON 包在 ```json 代码块里;
2. 模型前后补了少量解释文字。
"""
raw = str(text or "").strip()
raw = remove_reasoning_content(str(text or "").strip())
if not raw:
return None
if raw.startswith("```"):
@@ -2172,6 +2175,7 @@ class DouyuPlugin(MessagePluginInterface):
def _build_daily_report_prompt(self, payload: Dict[str, Any]) -> Tuple[str, str]:
meta = payload.get("report_meta", {}) or {}
room_context_prompt = self._build_room_context_prompt_block(payload)
prompt_material = self._build_llm_prompt_material(payload, include_operator=True)
system_prompt = (
"你是斗鱼直播日报助手。请基于给定的结构化弹幕材料,输出一份适合发群的中文日报。"
"要求简洁、自然、信息密度高,不要编造,不要使用代码块。"
@@ -2186,13 +2190,15 @@ class DouyuPlugin(MessagePluginInterface):
"5. 单独列出 2-3 个热点时段。\n"
"6. 整体控制在 600 字以内。\n\n"
f"{room_context_prompt}"
f"材料如下:\n{json.dumps(payload, ensure_ascii=False, indent=2)}"
"下面是已经提纯给 LLM 的材料,优先依据现场弹幕片段、热点窗口和共识梗来写,不要被大段统计信息带偏。\n"
f"材料如下:\n{json.dumps(prompt_material, ensure_ascii=False, indent=2)}"
)
return system_prompt, user_prompt
def _build_danmu_summary_prompt(self, payload: Dict[str, Any]) -> Tuple[str, str]:
meta = payload.get("report_meta", {}) or {}
room_context_prompt = self._build_room_context_prompt_block(payload)
prompt_material = self._build_llm_prompt_material(payload, include_operator=False)
system_prompt = (
"你是直播弹幕总结助手。请只根据给定材料,总结这场直播的弹幕内容与氛围。"
"不要输出运营数据,不要编造,不要写空话套话。"
@@ -2209,7 +2215,8 @@ class DouyuPlugin(MessagePluginInterface):
f"主播:{meta.get('nickname') or meta.get('room_name') or meta.get('room_id')}\n"
f"日期:{meta.get('anchor_day', '')}\n"
f"{room_context_prompt}"
f"材料:\n{json.dumps(payload, ensure_ascii=False, indent=2)}"
"下面是已经提纯给 LLM 的现场材料,请优先使用原声弹幕、热点窗口和复读梗,不要写成词频复述。\n"
f"材料:\n{json.dumps(prompt_material, ensure_ascii=False, indent=2)}"
)
return system_prompt, user_prompt
@@ -2222,6 +2229,7 @@ class DouyuPlugin(MessagePluginInterface):
"""
meta = payload.get("report_meta", {}) or {}
room_context_prompt = self._build_room_context_prompt_block(payload)
prompt_material = self._build_llm_prompt_material(payload, include_operator=False)
system_prompt = (
"你是斗鱼直播间的粉丝向整活日报编辑。"
"请只根据提供的真实弹幕材料,输出一份开心、欢乐、带一点恶搞气质的中文总结。"
@@ -2239,10 +2247,173 @@ class DouyuPlugin(MessagePluginInterface):
f"主播:{meta.get('nickname') or meta.get('room_name') or meta.get('room_id')}\n"
f"日期:{meta.get('anchor_day', '')}\n"
f"{room_context_prompt}"
f"材料:\n{json.dumps(payload, ensure_ascii=False, indent=2)}"
"下面是已经提纯给 LLM 的现场材料,请优先抓原声弹幕、热点窗口和集体起哄片段,少写空泛概括。\n"
f"材料:\n{json.dumps(prompt_material, ensure_ascii=False, indent=2)}"
)
return system_prompt, user_prompt
def _build_llm_prompt_material(
    self,
    payload: Dict[str, Any],
    *,
    include_operator: bool = False,
) -> Dict[str, Any]:
    """
    Build the condensed ("purified") material dict that is serialized into LLM prompts.

    Design goals:
    1. Put the danmu content that carries a live feel first, so that a large
       statistics-heavy JSON does not dominate the model's attention.
    2. Keep enough memes, hot windows and raw samples for the model to write
       something that reads like a replay of the stream.
    3. Only ship a trimmed set of operator metrics when the caller asks for it,
       instead of attaching them to every task.

    The builder is best-effort: a section that is not a dict, a list entry that
    is not a dict, or a counter that cannot be converted to ``int`` is replaced
    by an empty/zero value instead of raising, so one malformed record cannot
    abort the whole report.

    Args:
        payload: Report payload. Keys read here: ``report_meta``,
            ``room_context``, ``sessions``, ``representative_messages``,
            ``raw_window_samples``, ``merged_templates``,
            ``repeated_messages``, ``top_terms``, ``burst_terms``,
            ``peak_buckets`` and (optionally) ``operator_metrics``.
        include_operator: When True, an ``operator_focus`` section built from
            ``payload["operator_metrics"]`` is appended.

    Returns:
        A new dict with the keys ``report_meta``, ``room_context``,
        ``session_overview``, ``high_frequency_topics``,
        ``live_scene_material`` and, if requested, ``operator_focus``
        (in that insertion order).
    """

    def to_text(value: Any) -> str:
        # None and other falsy values collapse to "", everything else is stringified.
        return str(value or "").strip()

    def to_int(value: Any) -> int:
        # Counters are informational only; an unparsable value degrades to 0.
        try:
            return int(value or 0)
        except (TypeError, ValueError, OverflowError):
            return 0

    def section(key: str) -> Dict[str, Any]:
        value = payload.get(key)
        return value if isinstance(value, dict) else {}

    def records(value: Any, limit: int) -> List[Dict[str, Any]]:
        # Slice first, then drop non-dict entries, so the cap applies to the
        # leading `limit` raw entries exactly as before.
        if not isinstance(value, (list, tuple)):
            return []
        return [item for item in value[:limit] if isinstance(item, dict)]

    def template_rows(key: str) -> List[Dict[str, Any]]:
        # Shared row shape of merged_templates / repeated_messages.
        return [
            {
                "text": to_text(item.get("text"))[:80],
                "count": to_int(item.get("count")),
                "user_count": to_int(item.get("user_count")),
            }
            for item in records(payload.get(key), 12)
            if to_text(item.get("text"))
        ]

    def message_rows(value: Any, limit: int) -> List[Dict[str, str]]:
        # Shared row shape of representative messages / per-window samples.
        return [
            {
                "time": to_text(item.get("time")),
                "nickname": to_text(item.get("nickname")),
                "content": to_text(item.get("content"))[:90],
            }
            for item in records(value, limit)
            if to_text(item.get("content"))
        ]

    meta = section("report_meta")
    room_context = section("room_context")

    peak_buckets = payload.get("peak_buckets")
    if not isinstance(peak_buckets, (list, tuple)):
        peak_buckets = []

    raw_window_samples = payload.get("raw_window_samples")
    if not isinstance(raw_window_samples, (list, tuple)):
        raw_window_samples = []

    hot_window_samples: List[Dict[str, Any]] = []
    for index, window in enumerate(raw_window_samples[:8]):
        if not isinstance(window, dict):
            continue
        # NOTE(review): windows and peak buckets are paired purely by position;
        # this assumes both lists are produced in the same order — confirm
        # against the payload builder.
        bucket = peak_buckets[index] if index < len(peak_buckets) else None
        bucket_terms = bucket.get("top_terms") if isinstance(bucket, dict) else None
        hot_window_samples.append({
            "start_time": to_text(window.get("start_time")),
            "message_count": to_int(window.get("message_count")),
            "user_count": to_int(window.get("user_count")),
            "top_terms": [
                to_text(term.get("term"))
                for term in records(bucket_terms, 5)
                if to_text(term.get("term"))
            ],
            "samples": message_rows(window.get("samples"), 8),
        })

    material: Dict[str, Any] = {
        "report_meta": {
            "room_id": to_text(meta.get("room_id")),
            "anchor_day": to_text(meta.get("anchor_day")),
            "nickname": to_text(meta.get("nickname")),
            "room_name": to_text(meta.get("room_name")),
            "session_count": to_int(meta.get("session_count")),
            "message_count": to_int(meta.get("message_count")),
            "unique_user_count": to_int(meta.get("unique_user_count")),
        },
        "room_context": {
            "domain": to_text(room_context.get("domain")),
            "inferred_domains": self._normalize_text_list(room_context.get("inferred_domains"))[:6],
            "identity_summary": to_text(room_context.get("identity_summary")),
            "career_background": to_text(room_context.get("career_background")),
            "related_people": self._normalize_text_list(room_context.get("related_people"))[:10],
            "storyline_keywords": self._normalize_text_list(room_context.get("storyline_keywords"))[:10],
            "style_hints": self._normalize_text_list(room_context.get("style_hints"))[:6],
        },
        "session_overview": [
            {
                "session_id": to_text(item.get("session_id")),
                # Passed through untouched; only a missing/falsy value becomes [].
                "segments": item.get("segments", []) or [],
                "message_count": to_int(item.get("message_count")),
                "organized_message_count": to_int(item.get("organized_message_count")),
            }
            for item in records(payload.get("sessions"), 4)
        ],
        "high_frequency_topics": {
            "top_terms": [
                {"term": to_text(item.get("term")), "count": to_int(item.get("count"))}
                for item in records(payload.get("top_terms"), 16)
                if to_text(item.get("term"))
            ],
            "burst_terms": [
                {"text": to_text(item.get("text")), "count": to_int(item.get("count"))}
                for item in records(payload.get("burst_terms"), 12)
                if to_text(item.get("text"))
            ],
            "merged_templates": template_rows("merged_templates"),
            "repeated_messages": template_rows("repeated_messages"),
        },
        # The live material is grouped into its own section so the model reads
        # the raw voices before the statistical summary.
        "live_scene_material": {
            "representative_messages": message_rows(payload.get("representative_messages"), 24),
            "hot_window_samples": hot_window_samples,
        },
    }

    if include_operator:
        operator = section("operator_metrics")
        material["operator_focus"] = {
            "fans_badge_user_count": to_int(operator.get("fans_badge_user_count")),
            "high_room_level_user_count": to_int(operator.get("high_room_level_user_count")),
            "high_fans_level_user_count": to_int(operator.get("high_fans_level_user_count")),
            "active_users_5plus": to_int(operator.get("active_users_5plus")),
            "active_users_10plus": to_int(operator.get("active_users_10plus")),
            "top_badges": [
                {
                    "badge_name": to_text(item.get("badge_name")),
                    "user_count": to_int(item.get("user_count")),
                    "message_count": to_int(item.get("message_count")),
                }
                for item in records(operator.get("top_badges"), 6)
                if to_text(item.get("badge_name"))
            ],
            "top_active_users": [
                {
                    # Fall back to the uid when no nickname is recorded.
                    "nickname": to_text(item.get("nickname") or item.get("uid")),
                    "message_count": to_int(item.get("message_count")),
                    "fans_name": to_text(item.get("fans_name")),
                    "fans_level": to_int(item.get("fans_level")),
                    "room_level": to_int(item.get("room_level")),
                }
                for item in records(operator.get("top_active_users"), 8)
            ],
        }

    return material
@staticmethod
def _clean_daily_report_llm_text(text: str) -> str:
    """
    Strip model "thinking" output from daily-report style LLM text.

    The removal itself is delegated to ``remove_reasoning_content``. Per the
    original notes it is meant to cover <think> / <thinking> / <reasoning>
    tags as well as "thought process" / "analysis process" style paragraphs
    that some models emit (the helper is not visible here — confirm there).
    Afterwards every run of three or more newlines is squeezed down to a
    single blank line, so image templates do not render large empty gaps.

    Args:
        text: Raw LLM output; ``None`` or any other falsy value is treated as "".

    Returns:
        The cleaned text with surrounding whitespace removed (may be empty).
    """
    source_text = str(text or "").strip()
    without_reasoning = remove_reasoning_content(source_text)
    squeezed = re.sub(r"\n{3,}", "\n\n", without_reasoning)
    return squeezed.strip()
def _build_funny_scene_lines(self, payload: Dict[str, Any], limit: int = 5) -> List[str]:
"""
组装“弹幕名场面”兜底素材。
@@ -2980,7 +3151,9 @@ class DouyuPlugin(MessagePluginInterface):
tag=f"douyu_danmu_summary_{(payload.get('report_meta', {}) or {}).get('room_id', '')}",
)
if result:
return result.strip()
cleaned = self._clean_daily_report_llm_text(result)
if cleaned:
return cleaned
logger.warning(
f"斗鱼弹幕总结 LLM 生成失败: model={self._daily_report_llm_client.model}, "
f"last_error={self._daily_report_llm_client.last_error}"
@@ -3003,10 +3176,11 @@ class DouyuPlugin(MessagePluginInterface):
tag=f"douyu_fans_daily_report_{(payload.get('report_meta', {}) or {}).get('room_id', '')}",
)
if result:
text = result.strip()
text = self._clean_daily_report_llm_text(result)
if len(text) > self._daily_report_max_length:
return text[: self._daily_report_max_length - 20].rstrip() + "\n...(已截断)"
return text
if text:
return text
logger.warning(
f"斗鱼粉丝日报 LLM 生成失败: model={self._daily_report_llm_client.model}, "
f"last_error={self._daily_report_llm_client.last_error}"
@@ -3135,10 +3309,11 @@ class DouyuPlugin(MessagePluginInterface):
tag=f"douyu_daily_report_{(payload.get('report_meta', {}) or {}).get('room_id', '')}",
)
if result:
text = result.strip()
text = self._clean_daily_report_llm_text(result)
if len(text) > self._daily_report_max_length:
return text[: self._daily_report_max_length - 20].rstrip() + "\n...(已截断)"
return text
if text:
return text
logger.warning(
f"斗鱼每日报告 LLM 生成失败: model={self._daily_report_llm_client.model}, "
f"last_error={self._daily_report_llm_client.last_error}"