From 12a5d89c760418ce75d1fdea5e9fb605e704498e Mon Sep 17 00:00:00 2001
From: liuwei <liuwei@wdtrgf.com.cn>
Date: Wed, 29 Apr 2026 14:02:15 +0800
Subject: [PATCH] =?UTF-8?q?=E6=9C=AC=E5=9C=B0=E6=B8=85=E6=B4=97=E5=BC=B9?=
 =?UTF-8?q?=E5=B9=95TXT=E5=B9=B6=E5=8E=8B=E7=BC=A9=E9=87=8D=E5=A4=8D?=
 =?UTF-8?q?=E5=88=B7=E5=B1=8F=E5=86=85=E5=AE=B9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 plugins/douyu/danmu_summary.py | 43 ++++++++++++++++++++++++++++++++--
 plugins/douyu/main.py          | 26 ++++++++++++++++++--
 2 files changed, 65 insertions(+), 4 deletions(-)

diff --git a/plugins/douyu/danmu_summary.py b/plugins/douyu/danmu_summary.py
index 774bce9..e9ffbcd 100644
--- a/plugins/douyu/danmu_summary.py
+++ b/plugins/douyu/danmu_summary.py
@@ -981,10 +981,49 @@ class DouyuDanmuSummaryHelper:
             time_text = str(item.get("timestamp_text") or "").strip()
             nickname = str(item.get("nickname") or "").strip() or "观众"
             repeat_count = int(item.get("repeat_count", 1) or 1)
-            repeat_suffix = f" [重复{repeat_count}次]" if repeat_count > 1 else ""
-            lines.append(f"[{time_text}] {nickname}：{content}{repeat_suffix}")
+            normalized_content = cls._format_llm_transcript_content(content, repeat_count)
+            # 本地清洗给 LLM 的 txt 时，统一移除 UID。
+            # UID 对日报写作没有帮助，还会占 token、污染阅读流。
+            lines.append(f"[{time_text}] {nickname}：{normalized_content}")
         return lines
 
+    @classmethod
+    def _format_llm_transcript_content(cls, content: str, repeat_count: int) -> str:
+        """Render one danmu body for the LLM transcript, folding in its repeat count.
+
+        A message seen once is returned stripped and otherwise untouched. A repeated
+        message accepted by ``_should_compact_burst_text`` becomes ``text*N``; any
+        other repeated message keeps its wording and gains one trailing
+        ' [重复N次]' marker.
+        """
+        body = str(content or "").strip()
+        times = int(repeat_count or 1)
+        if times <= 1:
+            return body
+        # Short spam collapses to `text*N`; real discussion keeps a readable suffix.
+        marker = f"*{times}" if cls._should_compact_burst_text(body) else f" [重复{times}次]"
+        return body + marker
+
+    @classmethod
+    def _should_compact_burst_text(cls, content: str) -> bool:
+        """Return True when ``content`` is short spam worth writing as ``text*N``.
+
+        Deliberately conservative. Only three shapes qualify: an entry of
+        ``cls.SHORT_BURST_WORDS`` (compared after strip + lower-casing), a run made
+        only of ``? ？ ! ！ 。 . ~ ～``, or at most 8 characters drawn from at most 3
+        distinct ones with at least one repeat (a lone character also counts).
+        Empty or whitespace-only input never qualifies.
+        """
+        text = str(content or "").strip().lower()
+        if not text:
+            return False
+        if text in cls.SHORT_BURST_WORDS or re.fullmatch(r"[?？!！。\.~～]+", text):
+            return True
+        distinct = len(set(text))
+        # `distinct <= 3` alone accepts every 1-3 character message (e.g. "好的"),
+        # so also require a repeated character unless the text is a single one.
+        return len(text) <= 8 and distinct <= 3 and (len(text) == 1 or distinct < len(text))
+
     @classmethod
     def _build_chronological_samples(
         cls,
diff --git a/plugins/douyu/main.py b/plugins/douyu/main.py
index 1e38bf4..0d1f1b1 100644
--- a/plugins/douyu/main.py
+++ b/plugins/douyu/main.py
@@ -2251,6 +2251,18 @@ class DouyuPlugin(MessagePluginInterface):
         artifact_path = os.path.join(artifact_dir, f"{room_id}_{anchor_day.replace('-', '')}_daily_report_payload.json")
         with open(artifact_path, "w", encoding="utf-8") as f:
             json.dump(payload, f, ensure_ascii=False, indent=2)
+        # 额外落一份“专门给 LLM 看”的本地清洗 txt。
+        # 它和原始弹幕文件的区别在于：
+        # 1. 已经过滤系统噪音；
+        # 2. 已移除 UID；
+        # 3. 已把短刷屏压缩成 `哈哈哈*120` 这类更省上下文的写法。
+        cleaned_transcript_path = os.path.join(
+            artifact_dir,
+            f"{room_id}_{anchor_day.replace('-', '')}_llm_transcript.txt",
+        )
+        with open(cleaned_transcript_path, "w", encoding="utf-8") as f:
+            f.write(str(payload.get("raw_danmu_transcript") or "").strip())
+        payload["cleaned_transcript_file"] = os.path.abspath(cleaned_transcript_path)
         return payload
 
     def _build_daily_report_prompt(self, payload: Dict[str, Any]) -> Tuple[str, str]:
@@ -2564,6 +2576,7 @@ class DouyuPlugin(MessagePluginInterface):
 
         effective_lines = raw_lines[:max_lines]
         lines = ["【按时间顺序整理的原始弹幕全文（已过滤系统噪音，仅合并完全相同重复内容）】"]
+        lines.append("说明：文本已移除 UID；短刷屏弹幕会压缩成“哈哈哈*120”这类格式。")
         if len(effective_lines) < len(raw_lines):
             lines.append(f"以下仅展开前 {len(effective_lines)} 行，剩余内容因长度限制未继续拼接。")
         lines.extend(effective_lines)
@@ -3003,14 +3016,23 @@ class DouyuPlugin(MessagePluginInterface):
     def _build_dify_daily_report_files(self, payload: Dict[str, Any], user_id: str) -> List[Dict[str, Any]]:
         """
         组装斗鱼日报要上传给 Dify 的原始文件列表。
-        当前优先上传当天命中的原始弹幕 txt，让工作流里的 sys.files
-        真正拿到“源文件级材料”，而不是只有摘要 JSON。
+        当前优先上传“本地清洗后的 LLM 专用 txt”，让工作流里的 sys.files
+        拿到的是更适合总结任务的材料，而不是带 UID / 平台噪音的原始源文件。
         """
         if not self._daily_report_llm_client or self._daily_report_llm_client.provider != "dify":
             return []
 
         uploaded_files: List[Dict[str, Any]] = []
+        upload_candidates: List[str] = []
+        cleaned_transcript_file = os.path.abspath(str(payload.get("cleaned_transcript_file") or "").strip())
+        if cleaned_transcript_file:
+            upload_candidates.append(cleaned_transcript_file)
         for file_path in (payload.get("source_danmu_files", []) or [])[:2]:
+            normalized_source_path = os.path.abspath(str(file_path or "").strip())
+            if normalized_source_path and normalized_source_path not in upload_candidates:
+                upload_candidates.append(normalized_source_path)
+
+        for file_path in upload_candidates[:3]:
             normalized_path = os.path.abspath(str(file_path or "").strip())
             if not normalized_path or not os.path.exists(normalized_path) or not os.path.isfile(normalized_path):
                 continue