优化斗鱼日报LLM输入提纯并清理think输出

- 为斗鱼日报链路统一接入think和推理内容清洗
- 将提交给LLM的材料改为更聚焦现场弹幕的提纯结构
- 提高热点窗口原声样本量,避免窗口样本长期不足
- 刷新日报缓存版本,确保新提示词和新材料立即生效
This commit is contained in:
@@ -759,6 +759,22 @@ class DouyuDanmuSummaryHelper:
|
||||
seen.add(content)
|
||||
if len(selected) >= limit:
|
||||
break
|
||||
# 固定位置采样只能快速抓到“窗口骨架”,但在弹幕量大时通常不足以凑满 limit。
|
||||
# 这里继续顺序补样本,把同一热点窗口里更多真实原声带给 LLM,
|
||||
# 减少模型只看到 4-5 条孤立短句、难以还原现场氛围的问题。
|
||||
if len(selected) < limit:
|
||||
for item in items:
|
||||
content = str(item.get("content") or "").strip()
|
||||
if not content or content in seen:
|
||||
continue
|
||||
selected.append({
|
||||
"time": str(item.get("timestamp_text") or ""),
|
||||
"nickname": str(item.get("nickname") or ""),
|
||||
"content": content[:80],
|
||||
})
|
||||
seen.add(content)
|
||||
if len(selected) >= limit:
|
||||
break
|
||||
return selected
|
||||
|
||||
@classmethod
|
||||
|
||||
17
plugins/douyu/danmu_test.txt
Normal file
17
plugins/douyu/danmu_test.txt
Normal file
File diff suppressed because one or more lines are too long
@@ -30,6 +30,7 @@ from utils.decorator.plugin_decorators import plugin_stats_decorator
|
||||
from utils.decorator.points_decorator import plugin_points_cost
|
||||
from utils.markdown_to_image import convert_md_str_to_image, html_to_image
|
||||
from utils.robot_cmd.robot_command import Feature, PermissionStatus, GroupBotManager
|
||||
from utils.string_utils import remove_reasoning_content
|
||||
from wechat_ipad import WechatAPIClient
|
||||
from wechat_ipad.models.appmsg_xml import DOUYU_MESSAGE_XML
|
||||
|
||||
@@ -513,9 +514,11 @@ class DouyuRedisManager:
|
||||
class DouyuPlugin(MessagePluginInterface):
|
||||
# 报告缓存版本号:
|
||||
# 1. 版本升级后会自动让历史缓存失效,避免继续复用旧文本/旧图片;
|
||||
# 2. 本次将版本提升到 8,背景画像链路删掉了部分过程控制和提示噪音,
|
||||
# 需要刷新旧日报缓存,确保新版 prompt 使用新的精简上下文。
|
||||
_DAILY_REPORT_CACHE_VERSION = 8
|
||||
# 2. 本次将版本提升到 9:
|
||||
# - LLM 输入材料从“整份大 payload”改成“提纯后的现场材料”;
|
||||
# - 同时统一清洗 <think> / reasoning 输出;
|
||||
# 因此需要刷新旧缓存,确保新版结果真实命中新链路。
|
||||
_DAILY_REPORT_CACHE_VERSION = 9
|
||||
FEATURE_KEY = "DOUYU_MONITOR"
|
||||
FEATURE_DESCRIPTION = "🎮 斗鱼开播提醒 [订阅斗鱼 房间号, 取消订阅斗鱼 房间号]"
|
||||
|
||||
@@ -764,7 +767,7 @@ class DouyuPlugin(MessagePluginInterface):
|
||||
1. 模型把 JSON 包在 ```json 代码块里;
|
||||
2. 模型前后补了少量解释文字。
|
||||
"""
|
||||
raw = str(text or "").strip()
|
||||
raw = remove_reasoning_content(str(text or "").strip())
|
||||
if not raw:
|
||||
return None
|
||||
if raw.startswith("```"):
|
||||
@@ -2172,6 +2175,7 @@ class DouyuPlugin(MessagePluginInterface):
|
||||
def _build_daily_report_prompt(self, payload: Dict[str, Any]) -> Tuple[str, str]:
|
||||
meta = payload.get("report_meta", {}) or {}
|
||||
room_context_prompt = self._build_room_context_prompt_block(payload)
|
||||
prompt_material = self._build_llm_prompt_material(payload, include_operator=True)
|
||||
system_prompt = (
|
||||
"你是斗鱼直播日报助手。请基于给定的结构化弹幕材料,输出一份适合发群的中文日报。"
|
||||
"要求简洁、自然、信息密度高,不要编造,不要使用代码块。"
|
||||
@@ -2186,13 +2190,15 @@ class DouyuPlugin(MessagePluginInterface):
|
||||
"5. 单独列出 2-3 个热点时段。\n"
|
||||
"6. 整体控制在 600 字以内。\n\n"
|
||||
f"{room_context_prompt}"
|
||||
f"材料如下:\n{json.dumps(payload, ensure_ascii=False, indent=2)}"
|
||||
"下面是已经提纯给 LLM 的材料,优先依据现场弹幕片段、热点窗口和共识梗来写,不要被大段统计信息带偏。\n"
|
||||
f"材料如下:\n{json.dumps(prompt_material, ensure_ascii=False, indent=2)}"
|
||||
)
|
||||
return system_prompt, user_prompt
|
||||
|
||||
def _build_danmu_summary_prompt(self, payload: Dict[str, Any]) -> Tuple[str, str]:
|
||||
meta = payload.get("report_meta", {}) or {}
|
||||
room_context_prompt = self._build_room_context_prompt_block(payload)
|
||||
prompt_material = self._build_llm_prompt_material(payload, include_operator=False)
|
||||
system_prompt = (
|
||||
"你是直播弹幕总结助手。请只根据给定材料,总结这场直播的弹幕内容与氛围。"
|
||||
"不要输出运营数据,不要编造,不要写空话套话。"
|
||||
@@ -2209,7 +2215,8 @@ class DouyuPlugin(MessagePluginInterface):
|
||||
f"主播:{meta.get('nickname') or meta.get('room_name') or meta.get('room_id')}\n"
|
||||
f"日期:{meta.get('anchor_day', '')}\n"
|
||||
f"{room_context_prompt}"
|
||||
f"材料:\n{json.dumps(payload, ensure_ascii=False, indent=2)}"
|
||||
"下面是已经提纯给 LLM 的现场材料,请优先使用原声弹幕、热点窗口和复读梗,不要写成词频复述。\n"
|
||||
f"材料:\n{json.dumps(prompt_material, ensure_ascii=False, indent=2)}"
|
||||
)
|
||||
return system_prompt, user_prompt
|
||||
|
||||
@@ -2222,6 +2229,7 @@ class DouyuPlugin(MessagePluginInterface):
|
||||
"""
|
||||
meta = payload.get("report_meta", {}) or {}
|
||||
room_context_prompt = self._build_room_context_prompt_block(payload)
|
||||
prompt_material = self._build_llm_prompt_material(payload, include_operator=False)
|
||||
system_prompt = (
|
||||
"你是斗鱼直播间的粉丝向整活日报编辑。"
|
||||
"请只根据提供的真实弹幕材料,输出一份开心、欢乐、带一点恶搞气质的中文总结。"
|
||||
@@ -2239,10 +2247,173 @@ class DouyuPlugin(MessagePluginInterface):
|
||||
f"主播:{meta.get('nickname') or meta.get('room_name') or meta.get('room_id')}\n"
|
||||
f"日期:{meta.get('anchor_day', '')}\n"
|
||||
f"{room_context_prompt}"
|
||||
f"材料:\n{json.dumps(payload, ensure_ascii=False, indent=2)}"
|
||||
"下面是已经提纯给 LLM 的现场材料,请优先抓原声弹幕、热点窗口和集体起哄片段,少写空泛概括。\n"
|
||||
f"材料:\n{json.dumps(prompt_material, ensure_ascii=False, indent=2)}"
|
||||
)
|
||||
return system_prompt, user_prompt
|
||||
|
||||
def _build_llm_prompt_material(
    self,
    payload: Dict[str, Any],
    *,
    include_operator: bool = False,
) -> Dict[str, Any]:
    """
    Build the distilled material dict that the report prompts serialize for the LLM.

    Design goals:
    1. Put chat content with real "on-site" feel up front so bulky statistics
       do not dominate the model's attention.
    2. Keep enough memes, hot windows and verbatim samples for the model to
       write something that reads like a replay of the stream.
    3. Include a trimmed operator-metrics section only when the caller asks
       for it, instead of handing it to every task.

    Malformed input is tolerated rather than raised on: a section that is not
    a list is treated as empty, list entries that are not dicts are skipped,
    and counts that cannot be converted to int become 0. One bad field
    therefore cannot abort the whole prompt build.

    Args:
        payload: Report payload. The keys read here are "report_meta",
            "room_context", "sessions", "representative_messages",
            "raw_window_samples", "merged_templates", "repeated_messages",
            "top_terms", "burst_terms", "peak_buckets" and (optionally)
            "operator_metrics".
        include_operator: When True, add an "operator_focus" section built
            from payload["operator_metrics"].

    Returns:
        A dict with the keys "report_meta", "room_context",
        "session_overview", "high_frequency_topics", "live_scene_material"
        (in that order), plus "operator_focus" when requested.
    """

    def as_text(value: Any) -> str:
        # Falsy values (None, "", 0) collapse to an empty string.
        return str(value or "").strip()

    def as_int(value: Any) -> int:
        # Same conversion as int(value or 0), but unusable values become 0.
        try:
            return int(value or 0)
        except (TypeError, ValueError, OverflowError):
            return 0

    def as_dict(value: Any) -> Dict[str, Any]:
        return value if isinstance(value, dict) else {}

    def as_list(value: Any) -> List[Any]:
        return list(value) if isinstance(value, (list, tuple)) else []

    def dict_rows(value: Any, limit: int) -> List[Dict[str, Any]]:
        # Cap first, then drop malformed rows, so the cap applies to the
        # leading entries of the raw list.
        return [row for row in as_list(value)[:limit] if isinstance(row, dict)]

    def counted_rows(
        value: Any,
        limit: int,
        key: str,
        *,
        max_len: int = 0,
        with_users: bool = False,
    ) -> List[Dict[str, Any]]:
        # Shape "<key> + count (+ user_count)" rows, skipping rows with no text.
        rows: List[Dict[str, Any]] = []
        for row in dict_rows(value, limit):
            text = as_text(row.get(key))
            if not text:
                continue
            entry: Dict[str, Any] = {
                key: text[:max_len] if max_len else text,
                "count": as_int(row.get("count")),
            }
            if with_users:
                entry["user_count"] = as_int(row.get("user_count"))
            rows.append(entry)
        return rows

    def chat_rows(value: Any, limit: int) -> List[Dict[str, Any]]:
        # Shape verbatim chat samples; content is capped at 90 characters.
        return [
            {
                "time": as_text(row.get("time")),
                "nickname": as_text(row.get("nickname")),
                "content": as_text(row.get("content"))[:90],
            }
            for row in dict_rows(value, limit)
            if as_text(row.get("content"))
        ]

    meta = as_dict(payload.get("report_meta"))
    room_context = as_dict(payload.get("room_context"))
    peak_buckets = as_list(payload.get("peak_buckets"))

    hot_windows: List[Dict[str, Any]] = []
    # The window at position i is paired with peak_buckets[i].
    # NOTE(review): this assumes both lists share the same ordering - confirm
    # against the code that builds the payload.
    for index, window in enumerate(as_list(payload.get("raw_window_samples"))[:8]):
        if not isinstance(window, dict):
            continue
        bucket = as_dict(peak_buckets[index]) if index < len(peak_buckets) else {}
        hot_windows.append(
            {
                "start_time": as_text(window.get("start_time")),
                "message_count": as_int(window.get("message_count")),
                "user_count": as_int(window.get("user_count")),
                "top_terms": [
                    as_text(term.get("term"))
                    for term in dict_rows(bucket.get("top_terms"), 5)
                    if as_text(term.get("term"))
                ],
                "samples": chat_rows(window.get("samples"), 8),
            }
        )

    material: Dict[str, Any] = {
        "report_meta": {
            "room_id": as_text(meta.get("room_id")),
            "anchor_day": as_text(meta.get("anchor_day")),
            "nickname": as_text(meta.get("nickname")),
            "room_name": as_text(meta.get("room_name")),
            "session_count": as_int(meta.get("session_count")),
            "message_count": as_int(meta.get("message_count")),
            "unique_user_count": as_int(meta.get("unique_user_count")),
        },
        "room_context": {
            "domain": as_text(room_context.get("domain")),
            "inferred_domains": self._normalize_text_list(room_context.get("inferred_domains"))[:6],
            "identity_summary": as_text(room_context.get("identity_summary")),
            "career_background": as_text(room_context.get("career_background")),
            "related_people": self._normalize_text_list(room_context.get("related_people"))[:10],
            "storyline_keywords": self._normalize_text_list(room_context.get("storyline_keywords"))[:10],
            "style_hints": self._normalize_text_list(room_context.get("style_hints"))[:6],
        },
        "session_overview": [
            {
                "session_id": as_text(session.get("session_id")),
                "segments": session.get("segments", []) or [],
                "message_count": as_int(session.get("message_count")),
                "organized_message_count": as_int(session.get("organized_message_count")),
            }
            for session in dict_rows(payload.get("sessions"), 4)
        ],
        "high_frequency_topics": {
            "top_terms": counted_rows(payload.get("top_terms"), 16, "term"),
            "burst_terms": counted_rows(payload.get("burst_terms"), 12, "text"),
            "merged_templates": counted_rows(
                payload.get("merged_templates"), 12, "text", max_len=80, with_users=True
            ),
            "repeated_messages": counted_rows(
                payload.get("repeated_messages"), 12, "text", max_len=80, with_users=True
            ),
        },
        # The live material gets its own section so the model can read the
        # verbatim chat separately from the statistical summary.
        "live_scene_material": {
            "representative_messages": chat_rows(payload.get("representative_messages"), 24),
            "hot_window_samples": hot_windows,
        },
    }

    if include_operator:
        operator = as_dict(payload.get("operator_metrics"))
        material["operator_focus"] = {
            "fans_badge_user_count": as_int(operator.get("fans_badge_user_count")),
            "high_room_level_user_count": as_int(operator.get("high_room_level_user_count")),
            "high_fans_level_user_count": as_int(operator.get("high_fans_level_user_count")),
            "active_users_5plus": as_int(operator.get("active_users_5plus")),
            "active_users_10plus": as_int(operator.get("active_users_10plus")),
            "top_badges": [
                {
                    "badge_name": as_text(badge.get("badge_name")),
                    "user_count": as_int(badge.get("user_count")),
                    "message_count": as_int(badge.get("message_count")),
                }
                for badge in dict_rows(operator.get("top_badges"), 6)
                if as_text(badge.get("badge_name"))
            ],
            "top_active_users": [
                {
                    # Fall back to the uid when no nickname was recorded.
                    "nickname": as_text(user.get("nickname") or user.get("uid")),
                    "message_count": as_int(user.get("message_count")),
                    "fans_name": as_text(user.get("fans_name")),
                    "fans_level": as_int(user.get("fans_level")),
                    "room_level": as_int(user.get("room_level")),
                }
                for user in dict_rows(operator.get("top_active_users"), 8)
            ],
        }

    return material
|
||||
|
||||
@staticmethod
def _clean_daily_report_llm_text(text: str) -> str:
    """
    Strip model "thinking" output from a report-style LLM reply and tidy blank lines.

    Steps:
    1. Stringify the input (None / empty becomes ""), trim it, and pass it
       through remove_reasoning_content.
       NOTE(review): that helper is imported from utils.string_utils and is
       not visible here; it presumably removes <think> / <thinking> /
       <reasoning> blocks and "thinking process"-style paragraphs - confirm.
    2. Collapse every run of three or more line breaks into one empty line,
       so the rendered image template does not show large blank areas.
       Lines holding only spaces or tabs count as blank, and "\r\n" line
       endings are recognised.

    Args:
        text: Raw LLM output; may be None or empty.

    Returns:
        The cleaned text, stripped of leading and trailing whitespace.
    """
    cleaned = remove_reasoning_content(str(text or "").strip())
    # Guard against a cleaner that returns None, which would make re.sub raise.
    cleaned = str(cleaned or "")
    # "[ \t]*\r?\n" consumes trailing blanks plus one line break. Three or
    # more in a row means at least two blank lines, which become exactly one.
    cleaned = re.sub(r"(?:[ \t]*\r?\n){3,}", "\n\n", cleaned)
    return cleaned.strip()
|
||||
|
||||
def _build_funny_scene_lines(self, payload: Dict[str, Any], limit: int = 5) -> List[str]:
|
||||
"""
|
||||
组装“弹幕名场面”兜底素材。
|
||||
@@ -2980,7 +3151,9 @@ class DouyuPlugin(MessagePluginInterface):
|
||||
tag=f"douyu_danmu_summary_{(payload.get('report_meta', {}) or {}).get('room_id', '')}",
|
||||
)
|
||||
if result:
|
||||
return result.strip()
|
||||
cleaned = self._clean_daily_report_llm_text(result)
|
||||
if cleaned:
|
||||
return cleaned
|
||||
logger.warning(
|
||||
f"斗鱼弹幕总结 LLM 生成失败: model={self._daily_report_llm_client.model}, "
|
||||
f"last_error={self._daily_report_llm_client.last_error}"
|
||||
@@ -3003,10 +3176,11 @@ class DouyuPlugin(MessagePluginInterface):
|
||||
tag=f"douyu_fans_daily_report_{(payload.get('report_meta', {}) or {}).get('room_id', '')}",
|
||||
)
|
||||
if result:
|
||||
text = result.strip()
|
||||
text = self._clean_daily_report_llm_text(result)
|
||||
if len(text) > self._daily_report_max_length:
|
||||
return text[: self._daily_report_max_length - 20].rstrip() + "\n...(已截断)"
|
||||
return text
|
||||
if text:
|
||||
return text
|
||||
logger.warning(
|
||||
f"斗鱼粉丝日报 LLM 生成失败: model={self._daily_report_llm_client.model}, "
|
||||
f"last_error={self._daily_report_llm_client.last_error}"
|
||||
@@ -3135,10 +3309,11 @@ class DouyuPlugin(MessagePluginInterface):
|
||||
tag=f"douyu_daily_report_{(payload.get('report_meta', {}) or {}).get('room_id', '')}",
|
||||
)
|
||||
if result:
|
||||
text = result.strip()
|
||||
text = self._clean_daily_report_llm_text(result)
|
||||
if len(text) > self._daily_report_max_length:
|
||||
return text[: self._daily_report_max_length - 20].rstrip() + "\n...(已截断)"
|
||||
return text
|
||||
if text:
|
||||
return text
|
||||
logger.warning(
|
||||
f"斗鱼每日报告 LLM 生成失败: model={self._daily_report_llm_client.model}, "
|
||||
f"last_error={self._daily_report_llm_client.last_error}"
|
||||
|
||||
Reference in New Issue
Block a user