优化斗鱼日报LLM输入提纯并清理think输出

- 为斗鱼日报链路统一接入think和推理内容清洗
- 将提交给LLM的材料改为更聚焦现场弹幕的提纯结构
- 提高热点窗口原声样本量,避免窗口样本长期不足
- 刷新日报缓存版本,确保新提示词和新材料立即生效
This commit is contained in:
@@ -759,6 +759,22 @@ class DouyuDanmuSummaryHelper:
|
|||||||
seen.add(content)
|
seen.add(content)
|
||||||
if len(selected) >= limit:
|
if len(selected) >= limit:
|
||||||
break
|
break
|
||||||
|
# 固定位置采样只能快速抓到“窗口骨架”,但在弹幕量大时通常不足以凑满 limit。
|
||||||
|
# 这里继续顺序补样本,把同一热点窗口里更多真实原声带给 LLM,
|
||||||
|
# 减少模型只看到 4-5 条孤立短句、难以还原现场氛围的问题。
|
||||||
|
if len(selected) < limit:
|
||||||
|
for item in items:
|
||||||
|
content = str(item.get("content") or "").strip()
|
||||||
|
if not content or content in seen:
|
||||||
|
continue
|
||||||
|
selected.append({
|
||||||
|
"time": str(item.get("timestamp_text") or ""),
|
||||||
|
"nickname": str(item.get("nickname") or ""),
|
||||||
|
"content": content[:80],
|
||||||
|
})
|
||||||
|
seen.add(content)
|
||||||
|
if len(selected) >= limit:
|
||||||
|
break
|
||||||
return selected
|
return selected
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
|
|||||||
17
plugins/douyu/danmu_test.txt
Normal file
17
plugins/douyu/danmu_test.txt
Normal file
File diff suppressed because one or more lines are too long
@@ -30,6 +30,7 @@ from utils.decorator.plugin_decorators import plugin_stats_decorator
|
|||||||
from utils.decorator.points_decorator import plugin_points_cost
|
from utils.decorator.points_decorator import plugin_points_cost
|
||||||
from utils.markdown_to_image import convert_md_str_to_image, html_to_image
|
from utils.markdown_to_image import convert_md_str_to_image, html_to_image
|
||||||
from utils.robot_cmd.robot_command import Feature, PermissionStatus, GroupBotManager
|
from utils.robot_cmd.robot_command import Feature, PermissionStatus, GroupBotManager
|
||||||
|
from utils.string_utils import remove_reasoning_content
|
||||||
from wechat_ipad import WechatAPIClient
|
from wechat_ipad import WechatAPIClient
|
||||||
from wechat_ipad.models.appmsg_xml import DOUYU_MESSAGE_XML
|
from wechat_ipad.models.appmsg_xml import DOUYU_MESSAGE_XML
|
||||||
|
|
||||||
@@ -513,9 +514,11 @@ class DouyuRedisManager:
|
|||||||
class DouyuPlugin(MessagePluginInterface):
|
class DouyuPlugin(MessagePluginInterface):
|
||||||
# 报告缓存版本号:
|
# 报告缓存版本号:
|
||||||
# 1. 版本升级后会自动让历史缓存失效,避免继续复用旧文本/旧图片;
|
# 1. 版本升级后会自动让历史缓存失效,避免继续复用旧文本/旧图片;
|
||||||
# 2. 本次将版本提升到 8,背景画像链路删掉了部分过程控制和提示噪音,
|
# 2. 本次将版本提升到 9:
|
||||||
# 需要刷新旧日报缓存,确保新版 prompt 使用新的精简上下文。
|
# - LLM 输入材料从“整份大 payload”改成“提纯后的现场材料”;
|
||||||
_DAILY_REPORT_CACHE_VERSION = 8
|
# - 同时统一清洗 <think> / reasoning 输出;
|
||||||
|
# 因此需要刷新旧缓存,确保新版结果真实命中新链路。
|
||||||
|
_DAILY_REPORT_CACHE_VERSION = 9
|
||||||
FEATURE_KEY = "DOUYU_MONITOR"
|
FEATURE_KEY = "DOUYU_MONITOR"
|
||||||
FEATURE_DESCRIPTION = "🎮 斗鱼开播提醒 [订阅斗鱼 房间号, 取消订阅斗鱼 房间号]"
|
FEATURE_DESCRIPTION = "🎮 斗鱼开播提醒 [订阅斗鱼 房间号, 取消订阅斗鱼 房间号]"
|
||||||
|
|
||||||
@@ -764,7 +767,7 @@ class DouyuPlugin(MessagePluginInterface):
|
|||||||
1. 模型把 JSON 包在 ```json 代码块里;
|
1. 模型把 JSON 包在 ```json 代码块里;
|
||||||
2. 模型前后补了少量解释文字。
|
2. 模型前后补了少量解释文字。
|
||||||
"""
|
"""
|
||||||
raw = str(text or "").strip()
|
raw = remove_reasoning_content(str(text or "").strip())
|
||||||
if not raw:
|
if not raw:
|
||||||
return None
|
return None
|
||||||
if raw.startswith("```"):
|
if raw.startswith("```"):
|
||||||
@@ -2172,6 +2175,7 @@ class DouyuPlugin(MessagePluginInterface):
|
|||||||
def _build_daily_report_prompt(self, payload: Dict[str, Any]) -> Tuple[str, str]:
|
def _build_daily_report_prompt(self, payload: Dict[str, Any]) -> Tuple[str, str]:
|
||||||
meta = payload.get("report_meta", {}) or {}
|
meta = payload.get("report_meta", {}) or {}
|
||||||
room_context_prompt = self._build_room_context_prompt_block(payload)
|
room_context_prompt = self._build_room_context_prompt_block(payload)
|
||||||
|
prompt_material = self._build_llm_prompt_material(payload, include_operator=True)
|
||||||
system_prompt = (
|
system_prompt = (
|
||||||
"你是斗鱼直播日报助手。请基于给定的结构化弹幕材料,输出一份适合发群的中文日报。"
|
"你是斗鱼直播日报助手。请基于给定的结构化弹幕材料,输出一份适合发群的中文日报。"
|
||||||
"要求简洁、自然、信息密度高,不要编造,不要使用代码块。"
|
"要求简洁、自然、信息密度高,不要编造,不要使用代码块。"
|
||||||
@@ -2186,13 +2190,15 @@ class DouyuPlugin(MessagePluginInterface):
|
|||||||
"5. 单独列出 2-3 个热点时段。\n"
|
"5. 单独列出 2-3 个热点时段。\n"
|
||||||
"6. 整体控制在 600 字以内。\n\n"
|
"6. 整体控制在 600 字以内。\n\n"
|
||||||
f"{room_context_prompt}"
|
f"{room_context_prompt}"
|
||||||
f"材料如下:\n{json.dumps(payload, ensure_ascii=False, indent=2)}"
|
"下面是已经提纯给 LLM 的材料,优先依据现场弹幕片段、热点窗口和共识梗来写,不要被大段统计信息带偏。\n"
|
||||||
|
f"材料如下:\n{json.dumps(prompt_material, ensure_ascii=False, indent=2)}"
|
||||||
)
|
)
|
||||||
return system_prompt, user_prompt
|
return system_prompt, user_prompt
|
||||||
|
|
||||||
def _build_danmu_summary_prompt(self, payload: Dict[str, Any]) -> Tuple[str, str]:
|
def _build_danmu_summary_prompt(self, payload: Dict[str, Any]) -> Tuple[str, str]:
|
||||||
meta = payload.get("report_meta", {}) or {}
|
meta = payload.get("report_meta", {}) or {}
|
||||||
room_context_prompt = self._build_room_context_prompt_block(payload)
|
room_context_prompt = self._build_room_context_prompt_block(payload)
|
||||||
|
prompt_material = self._build_llm_prompt_material(payload, include_operator=False)
|
||||||
system_prompt = (
|
system_prompt = (
|
||||||
"你是直播弹幕总结助手。请只根据给定材料,总结这场直播的弹幕内容与氛围。"
|
"你是直播弹幕总结助手。请只根据给定材料,总结这场直播的弹幕内容与氛围。"
|
||||||
"不要输出运营数据,不要编造,不要写空话套话。"
|
"不要输出运营数据,不要编造,不要写空话套话。"
|
||||||
@@ -2209,7 +2215,8 @@ class DouyuPlugin(MessagePluginInterface):
|
|||||||
f"主播:{meta.get('nickname') or meta.get('room_name') or meta.get('room_id')}\n"
|
f"主播:{meta.get('nickname') or meta.get('room_name') or meta.get('room_id')}\n"
|
||||||
f"日期:{meta.get('anchor_day', '')}\n"
|
f"日期:{meta.get('anchor_day', '')}\n"
|
||||||
f"{room_context_prompt}"
|
f"{room_context_prompt}"
|
||||||
f"材料:\n{json.dumps(payload, ensure_ascii=False, indent=2)}"
|
"下面是已经提纯给 LLM 的现场材料,请优先使用原声弹幕、热点窗口和复读梗,不要写成词频复述。\n"
|
||||||
|
f"材料:\n{json.dumps(prompt_material, ensure_ascii=False, indent=2)}"
|
||||||
)
|
)
|
||||||
return system_prompt, user_prompt
|
return system_prompt, user_prompt
|
||||||
|
|
||||||
@@ -2222,6 +2229,7 @@ class DouyuPlugin(MessagePluginInterface):
|
|||||||
"""
|
"""
|
||||||
meta = payload.get("report_meta", {}) or {}
|
meta = payload.get("report_meta", {}) or {}
|
||||||
room_context_prompt = self._build_room_context_prompt_block(payload)
|
room_context_prompt = self._build_room_context_prompt_block(payload)
|
||||||
|
prompt_material = self._build_llm_prompt_material(payload, include_operator=False)
|
||||||
system_prompt = (
|
system_prompt = (
|
||||||
"你是斗鱼直播间的粉丝向整活日报编辑。"
|
"你是斗鱼直播间的粉丝向整活日报编辑。"
|
||||||
"请只根据提供的真实弹幕材料,输出一份开心、欢乐、带一点恶搞气质的中文总结。"
|
"请只根据提供的真实弹幕材料,输出一份开心、欢乐、带一点恶搞气质的中文总结。"
|
||||||
@@ -2239,10 +2247,173 @@ class DouyuPlugin(MessagePluginInterface):
|
|||||||
f"主播:{meta.get('nickname') or meta.get('room_name') or meta.get('room_id')}\n"
|
f"主播:{meta.get('nickname') or meta.get('room_name') or meta.get('room_id')}\n"
|
||||||
f"日期:{meta.get('anchor_day', '')}\n"
|
f"日期:{meta.get('anchor_day', '')}\n"
|
||||||
f"{room_context_prompt}"
|
f"{room_context_prompt}"
|
||||||
f"材料:\n{json.dumps(payload, ensure_ascii=False, indent=2)}"
|
"下面是已经提纯给 LLM 的现场材料,请优先抓原声弹幕、热点窗口和集体起哄片段,少写空泛概括。\n"
|
||||||
|
f"材料:\n{json.dumps(prompt_material, ensure_ascii=False, indent=2)}"
|
||||||
)
|
)
|
||||||
return system_prompt, user_prompt
|
return system_prompt, user_prompt
|
||||||
|
|
||||||
|
def _build_llm_prompt_material(
    self,
    payload: Dict[str, Any],
    *,
    include_operator: bool = False,
) -> Dict[str, Any]:
    """
    Build the distilled material dict that the daily-report prompts serialize for the LLM.

    Design goals:
    1. Put the danmu content that carries the "live" feeling up front, so the
       large statistical JSON distracts the model less.
    2. Keep enough memes, hot time windows and verbatim samples for the model
       to write something that reads like a replay of the stream.
    3. Keep operator metrics (trimmed) only when the caller asks for them,
       instead of handing them to every task.

    Args:
        payload: Report payload. The keys read here are ``report_meta``,
            ``room_context``, ``sessions``, ``representative_messages``,
            ``raw_window_samples``, ``merged_templates``, ``repeated_messages``,
            ``top_terms``, ``burst_terms``, ``peak_buckets`` and, when
            ``include_operator`` is true, ``operator_metrics``.
        include_operator: When true, append an ``operator_focus`` section.

    Returns:
        A dict with the sections ``report_meta``, ``room_context``,
        ``session_overview``, ``high_frequency_topics`` and
        ``live_scene_material`` (in that order), plus ``operator_focus``
        when requested.
    """

    def text_of(source: Dict[str, Any], key: str) -> str:
        # Missing or falsy values collapse to an empty string.
        return str(source.get(key) or "").strip()

    def count_of(source: Dict[str, Any], key: str) -> int:
        # Missing or falsy values collapse to 0.
        return int(source.get(key, 0) or 0)

    def quoted_messages(rows: Any, cap: int) -> Any:
        # Verbatim danmu lines: skip empty content, clip each line to 90 chars.
        return [
            {
                "time": text_of(row, "time"),
                "nickname": text_of(row, "nickname"),
                "content": text_of(row, "content")[:90],
            }
            for row in rows[:cap]
            if text_of(row, "content")
        ]

    def counted_phrases(rows: Any) -> Any:
        # Template / repeated-message rows: at most 12 inputs, text clipped to 80 chars.
        return [
            {
                "text": text_of(row, "text")[:80],
                "count": count_of(row, "count"),
                "user_count": count_of(row, "user_count"),
            }
            for row in rows[:12]
            if text_of(row, "text")
        ]

    meta = payload.get("report_meta", {}) or {}
    room_context = payload.get("room_context", {}) or {}
    peak_buckets = payload.get("peak_buckets", []) or []

    material: Dict[str, Any] = {}

    material["report_meta"] = {
        "room_id": text_of(meta, "room_id"),
        "anchor_day": text_of(meta, "anchor_day"),
        "nickname": text_of(meta, "nickname"),
        "room_name": text_of(meta, "room_name"),
        "session_count": count_of(meta, "session_count"),
        "message_count": count_of(meta, "message_count"),
        "unique_user_count": count_of(meta, "unique_user_count"),
    }

    material["room_context"] = {
        "domain": text_of(room_context, "domain"),
        "inferred_domains": self._normalize_text_list(room_context.get("inferred_domains"))[:6],
        "identity_summary": text_of(room_context, "identity_summary"),
        "career_background": text_of(room_context, "career_background"),
        "related_people": self._normalize_text_list(room_context.get("related_people"))[:10],
        "storyline_keywords": self._normalize_text_list(room_context.get("storyline_keywords"))[:10],
        "style_hints": self._normalize_text_list(room_context.get("style_hints"))[:6],
    }

    session_overview = []
    for session in (payload.get("sessions", []) or [])[:4]:
        session_overview.append(
            {
                "session_id": text_of(session, "session_id"),
                "segments": session.get("segments", []) or [],
                "message_count": count_of(session, "message_count"),
                "organized_message_count": count_of(session, "organized_message_count"),
            }
        )
    material["session_overview"] = session_overview

    material["high_frequency_topics"] = {
        "top_terms": [
            {"term": text_of(entry, "term"), "count": count_of(entry, "count")}
            for entry in (payload.get("top_terms", []) or [])[:16]
            if text_of(entry, "term")
        ],
        "burst_terms": [
            {"text": text_of(entry, "text"), "count": count_of(entry, "count")}
            for entry in (payload.get("burst_terms", []) or [])[:12]
            if text_of(entry, "text")
        ],
        "merged_templates": counted_phrases(payload.get("merged_templates", []) or []),
        "repeated_messages": counted_phrases(payload.get("repeated_messages", []) or []),
    }

    # The "live scene" material is kept as its own section so the model can
    # read the verbatim danmu first and the statistical digest afterwards.
    representative = quoted_messages(payload.get("representative_messages", []) or [], 24)

    hot_windows = []
    for position, window in enumerate((payload.get("raw_window_samples", []) or [])[:8]):
        entry: Dict[str, Any] = {
            "start_time": text_of(window, "start_time"),
            "message_count": count_of(window, "message_count"),
            "user_count": count_of(window, "user_count"),
        }
        # Windows are paired with peak buckets purely by list position;
        # a window with no bucket at its index gets no terms.
        if position < len(peak_buckets):
            bucket_terms = (peak_buckets[position].get("top_terms", []) or [])[:5]
            entry["top_terms"] = [
                text_of(term, "term") for term in bucket_terms if text_of(term, "term")
            ]
        else:
            entry["top_terms"] = []
        entry["samples"] = quoted_messages(window.get("samples", []) or [], 8)
        hot_windows.append(entry)

    material["live_scene_material"] = {
        "representative_messages": representative,
        "hot_window_samples": hot_windows,
    }

    if not include_operator:
        return material

    operator = payload.get("operator_metrics", {}) or {}
    material["operator_focus"] = {
        "fans_badge_user_count": count_of(operator, "fans_badge_user_count"),
        "high_room_level_user_count": count_of(operator, "high_room_level_user_count"),
        "high_fans_level_user_count": count_of(operator, "high_fans_level_user_count"),
        "active_users_5plus": count_of(operator, "active_users_5plus"),
        "active_users_10plus": count_of(operator, "active_users_10plus"),
        "top_badges": [
            {
                "badge_name": text_of(badge, "badge_name"),
                "user_count": count_of(badge, "user_count"),
                "message_count": count_of(badge, "message_count"),
            }
            for badge in (operator.get("top_badges", []) or [])[:6]
            if text_of(badge, "badge_name")
        ],
        "top_active_users": [
            {
                # Fall back to the uid when the user has no nickname.
                "nickname": str(user.get("nickname") or user.get("uid") or "").strip(),
                "message_count": count_of(user, "message_count"),
                "fans_name": text_of(user, "fans_name"),
                "fans_level": count_of(user, "fans_level"),
                "room_level": count_of(user, "room_level"),
            }
            for user in (operator.get("top_active_users", []) or [])[:8]
        ],
    }
    return material
|
||||||
|
|
||||||
|
@staticmethod
def _clean_daily_report_llm_text(text: str) -> str:
    """
    Remove model "thinking" output from daily-report LLM text and tidy blank lines.

    Steps:
    1. Coerce ``text`` to a string (``None`` / falsy becomes ``""``) and strip it.
    2. Pass it through ``remove_reasoning_content``, which is expected to drop
       ``<think>`` / ``<thinking>`` / ``<reasoning>`` blocks and similar
       "thought process" sections (its exact coverage is defined in
       ``utils.string_utils`` and is not visible here).
    3. Collapse runs of blank lines down to a single blank line, so the image
       template does not render large empty areas.

    Args:
        text: Raw text returned by the LLM.

    Returns:
        The cleaned text; an empty string when nothing remains.
    """
    cleaned = remove_reasoning_content(str(text or "").strip())
    # A visually blank line may still hold spaces/tabs or end in CRLF. Matching
    # only consecutive "\n" characters would leave those gaps in place, so each
    # repetition accepts optional horizontal whitespace and an optional "\r".
    cleaned = re.sub(r"(?:[ \t]*\r?\n){3,}", "\n\n", cleaned).strip()
    return cleaned
|
||||||
|
|
||||||
def _build_funny_scene_lines(self, payload: Dict[str, Any], limit: int = 5) -> List[str]:
|
def _build_funny_scene_lines(self, payload: Dict[str, Any], limit: int = 5) -> List[str]:
|
||||||
"""
|
"""
|
||||||
组装“弹幕名场面”兜底素材。
|
组装“弹幕名场面”兜底素材。
|
||||||
@@ -2980,7 +3151,9 @@ class DouyuPlugin(MessagePluginInterface):
|
|||||||
tag=f"douyu_danmu_summary_{(payload.get('report_meta', {}) or {}).get('room_id', '')}",
|
tag=f"douyu_danmu_summary_{(payload.get('report_meta', {}) or {}).get('room_id', '')}",
|
||||||
)
|
)
|
||||||
if result:
|
if result:
|
||||||
return result.strip()
|
cleaned = self._clean_daily_report_llm_text(result)
|
||||||
|
if cleaned:
|
||||||
|
return cleaned
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"斗鱼弹幕总结 LLM 生成失败: model={self._daily_report_llm_client.model}, "
|
f"斗鱼弹幕总结 LLM 生成失败: model={self._daily_report_llm_client.model}, "
|
||||||
f"last_error={self._daily_report_llm_client.last_error}"
|
f"last_error={self._daily_report_llm_client.last_error}"
|
||||||
@@ -3003,10 +3176,11 @@ class DouyuPlugin(MessagePluginInterface):
|
|||||||
tag=f"douyu_fans_daily_report_{(payload.get('report_meta', {}) or {}).get('room_id', '')}",
|
tag=f"douyu_fans_daily_report_{(payload.get('report_meta', {}) or {}).get('room_id', '')}",
|
||||||
)
|
)
|
||||||
if result:
|
if result:
|
||||||
text = result.strip()
|
text = self._clean_daily_report_llm_text(result)
|
||||||
if len(text) > self._daily_report_max_length:
|
if len(text) > self._daily_report_max_length:
|
||||||
return text[: self._daily_report_max_length - 20].rstrip() + "\n...(已截断)"
|
return text[: self._daily_report_max_length - 20].rstrip() + "\n...(已截断)"
|
||||||
return text
|
if text:
|
||||||
|
return text
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"斗鱼粉丝日报 LLM 生成失败: model={self._daily_report_llm_client.model}, "
|
f"斗鱼粉丝日报 LLM 生成失败: model={self._daily_report_llm_client.model}, "
|
||||||
f"last_error={self._daily_report_llm_client.last_error}"
|
f"last_error={self._daily_report_llm_client.last_error}"
|
||||||
@@ -3135,10 +3309,11 @@ class DouyuPlugin(MessagePluginInterface):
|
|||||||
tag=f"douyu_daily_report_{(payload.get('report_meta', {}) or {}).get('room_id', '')}",
|
tag=f"douyu_daily_report_{(payload.get('report_meta', {}) or {}).get('room_id', '')}",
|
||||||
)
|
)
|
||||||
if result:
|
if result:
|
||||||
text = result.strip()
|
text = self._clean_daily_report_llm_text(result)
|
||||||
if len(text) > self._daily_report_max_length:
|
if len(text) > self._daily_report_max_length:
|
||||||
return text[: self._daily_report_max_length - 20].rstrip() + "\n...(已截断)"
|
return text[: self._daily_report_max_length - 20].rstrip() + "\n...(已截断)"
|
||||||
return text
|
if text:
|
||||||
|
return text
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"斗鱼每日报告 LLM 生成失败: model={self._daily_report_llm_client.model}, "
|
f"斗鱼每日报告 LLM 生成失败: model={self._daily_report_llm_client.model}, "
|
||||||
f"last_error={self._daily_report_llm_client.last_error}"
|
f"last_error={self._daily_report_llm_client.last_error}"
|
||||||
|
|||||||
Reference in New Issue
Block a user