From 850fd5e7d38c7f15da058e42cfe21feb5aef788a Mon Sep 17 00:00:00 2001
From: liuwei <liuwei@wdtrgf.com.cn>
Date: Wed, 29 Apr 2026 14:15:26 +0800
Subject: [PATCH] =?UTF-8?q?Revert=20"=E6=94=AF=E6=8C=81=E5=90=91LLM?=
 =?UTF-8?q?=E4=B8=8A=E4=BC=A0=E5=8E=9F=E5=A7=8B=E5=BC=B9=E5=B9=95TXT?=
 =?UTF-8?q?=E5=B9=B6=E8=A1=A5=E5=85=85=E5=85=A8=E6=96=87=E8=BE=93=E5=85=A5?=
 =?UTF-8?q?"?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reverts commit 86c3dea1d219a4dc9908efadcb87f027b75e9acc.
---
 plugins/douyu/danmu_summary.py |  47 ---------------
 plugins/douyu/main.py          | 106 +--------------------------------
 2 files changed, 2 insertions(+), 151 deletions(-)

diff --git a/plugins/douyu/danmu_summary.py b/plugins/douyu/danmu_summary.py
index 774bce9..cf79602 100644
--- a/plugins/douyu/danmu_summary.py
+++ b/plugins/douyu/danmu_summary.py
@@ -93,28 +93,6 @@ class DouyuDanmuSummaryHelper:
                 continue
         return collected
 
-    @classmethod
-    def collect_session_source_files(cls, room_id: str, session: Dict[str, Any], base_dir: str = "temp") -> List[str]:
-        """
-        收集某个 session 实际对应到的原始弹幕 txt 文件路径。
-        这里不读取文件内容，只返回“这场直播跨到了哪些日期文件”，
-        方便上层在需要时直接把原始 txt 上传给 LLM。
-        """
-        segments = cls._normalize_segments(session.get("segments", []) or [])
-        if not room_id or not segments:
-            return []
-
-        file_paths: List[str] = []
-        date_keys = sorted(
-            {segment["start"].strftime("%Y%m%d") for segment in segments}
-            | {segment["end"].strftime("%Y%m%d") for segment in segments}
-        )
-        for date_key in date_keys:
-            file_path = os.path.join(base_dir, "douyu_danmu", date_key, f"{room_id}_{date_key}.txt")
-            if os.path.exists(file_path):
-                file_paths.append(file_path)
-        return file_paths
-
     @classmethod
     def load_day_messages(cls, room_id: str, date_key: str, base_dir: str = "temp") -> List[Dict[str, Any]]:
         file_path = os.path.join(base_dir, "douyu_danmu", date_key, f"{room_id}_{date_key}.txt")
@@ -235,10 +213,6 @@ class DouyuDanmuSummaryHelper:
             "peak_buckets": cls._simplify_peak_buckets(peak_buckets),
             "representative_messages": cls._pick_representative_messages(organized_messages, bucket_stats),
             "raw_window_samples": cls._build_raw_window_samples(peak_buckets, per_bucket_limit=12),
-            # 把去噪后、且只合并了“完全相同重复弹幕”的原始弹幕全文也保留下来。
-            # 这样上层如果希望直接把整场弹幕塞给 LLM，而不是只喂摘要样本，
-            # 就不需要再重新读文件和重复清洗。
-            "raw_transcript_lines": cls._build_raw_transcript_lines(organized_messages),
             # 给日报类 LLM 再补一层“按时间推进的现场切片”。
             # 这样模型除了看热点窗口，还能顺着时间线理解气氛如何起、如何变、最后怎么收，
             # 对粉丝日报这类强调“节目效果”和“接梗链路”的文本尤其有帮助。
@@ -964,27 +938,6 @@ class DouyuDanmuSummaryHelper:
             })
         return windows
 
-    @classmethod
-    def _build_raw_transcript_lines(cls, messages: List[Dict[str, Any]]) -> List[str]:
-        """
-        生成可直接给 LLM 使用的顺时序弹幕全文。
-        规则：
-        1. 输入消息已经过“系统噪音过滤 + 完全相同重复合并”；
-        2. 不再进一步摘要，尽量保留现场原话；
-        3. 对重复合并过的消息补上次数信息，帮助模型感知刷屏强度。
-        """
-        lines: List[str] = []
-        for item in messages:
-            content = str(item.get("content") or "").strip()
-            if not content:
-                continue
-            time_text = str(item.get("timestamp_text") or "").strip()
-            nickname = str(item.get("nickname") or "").strip() or "观众"
-            repeat_count = int(item.get("repeat_count", 1) or 1)
-            repeat_suffix = f" [重复{repeat_count}次]" if repeat_count > 1 else ""
-            lines.append(f"[{time_text}] {nickname}：{content}{repeat_suffix}")
-        return lines
-
     @classmethod
     def _build_chronological_samples(
         cls,
diff --git a/plugins/douyu/main.py b/plugins/douyu/main.py
index 1e38bf4..2f318d1 100644
--- a/plugins/douyu/main.py
+++ b/plugins/douyu/main.py
@@ -519,7 +519,7 @@ class DouyuPlugin(MessagePluginInterface):
     #    - 不再对不同表达做强模板压缩，尽量保留现场讨论细节；
     #    - 同时让新日报结果自动避开旧缓存污染；
     #    因此需要刷新旧缓存，确保新版结果真实命中新链路。
-    _DAILY_REPORT_CACHE_VERSION = 12
+    _DAILY_REPORT_CACHE_VERSION = 11
     FEATURE_KEY = "DOUYU_MONITOR"
     FEATURE_DESCRIPTION = "🎮 斗鱼开播提醒 [订阅斗鱼 房间号, 取消订阅斗鱼 房间号]"
 
@@ -2054,10 +2054,8 @@ class DouyuPlugin(MessagePluginInterface):
         raw_window_samples: List[Dict[str, Any]] = []
         chronological_samples: List[Dict[str, Any]] = []
         session_storylines: List[Dict[str, Any]] = []
-        raw_transcript_lines: List[str] = []
         top_terms_counter = Counter()
         burst_terms_counter = Counter()
-        source_danmu_files: List[str] = []
         operator_totals = {
             "fans_badge_user_count": 0,
             "fans_badge_message_count": 0,
@@ -2103,10 +2101,6 @@ class DouyuPlugin(MessagePluginInterface):
                 raw_window_samples.append(dict(item))
             for item in payload.get("chronological_samples", []) or []:
                 chronological_samples.append(dict(item))
-            for line in payload.get("raw_transcript_lines", []) or []:
-                line_text = str(line or "").strip()
-                if line_text:
-                    raw_transcript_lines.append(line_text)
             storyline = payload.get("session_storyline") or {}
             if storyline:
                 session_storylines.append(dict(storyline))
@@ -2161,13 +2155,6 @@ class DouyuPlugin(MessagePluginInterface):
                 uid = str(session_message.get("uid") or "").strip()
                 if uid:
                     total_unique_users.add(uid)
-            # 顺手把本场真正落盘的原始弹幕 txt 也记录下来。
-            # 这样后面如果走 Dify 文件上传链路，就可以直接把“当天原文件”交给 LLM，
-            # 而不是再二次拼装一个近似文本。
-            for file_path in DouyuDanmuSummaryHelper.collect_session_source_files(room_id, session):
-                normalized_path = os.path.abspath(file_path)
-                if normalized_path and normalized_path not in source_danmu_files:
-                    source_danmu_files.append(normalized_path)
 
         if not session_payloads:
             return None
@@ -2240,13 +2227,6 @@ class DouyuPlugin(MessagePluginInterface):
             "chronological_samples": chronological_samples[:40],
             # 多场直播时保留每一场的轻量故事线，让粉丝日报更容易写出真正的“回放感”。
             "session_storylines": session_storylines[:6],
-            # 整天范围内的原始弹幕全文。
-            # 这里保留的是“过滤系统噪音 + 合并完全相同重复内容”后的顺时序文本，
-            # 目的不是做摘要，而是尽量把现场讨论原貌交给 LLM。
-            "raw_danmu_transcript": "\n".join(raw_transcript_lines),
-            "raw_danmu_transcript_lines": raw_transcript_lines,
-            # 保存实际原始 txt 路径，供 Dify 模式上传文件时直接复用。
-            "source_danmu_files": source_danmu_files[:8],
         }
         artifact_path = os.path.join(artifact_dir, f"{room_id}_{anchor_day.replace('-', '')}_daily_report_payload.json")
         with open(artifact_path, "w", encoding="utf-8") as f:
@@ -2311,7 +2291,6 @@ class DouyuPlugin(MessagePluginInterface):
         meta = payload.get("report_meta", {}) or {}
         room_context_prompt = self._build_room_context_prompt_block(payload)
         prompt_material = self._build_llm_prompt_material(payload, include_operator=False)
-        raw_transcript_block = self._build_raw_transcript_prompt_block(payload, max_lines=12000)
         system_prompt = (
             "你是斗鱼直播间的粉丝向整活日报编辑。"
             "请只根据提供的真实弹幕材料，输出一份开心、欢乐、带一点恶搞气质的中文总结。"
@@ -2332,9 +2311,7 @@ class DouyuPlugin(MessagePluginInterface):
             f"{room_context_prompt}"
             "下面是已经提纯给 LLM 的现场材料。当前清洗策略只合并完全相同的重复弹幕，不同表达会尽量保留。\n"
             "请优先抓原声弹幕、热点窗口、顺时序讨论推进，以及其中出现的刀圈比赛话题，不要写成空泛热闹总结。\n"
-            "如果下面附带了原始弹幕全文，请把原始弹幕全文视为最高优先级证据，前面的摘要材料只是导航和补充。\n"
-            f"材料：\n{json.dumps(prompt_material, ensure_ascii=False, indent=2)}\n\n"
-            f"{raw_transcript_block}"
+            f"材料：\n{json.dumps(prompt_material, ensure_ascii=False, indent=2)}"
         )
         return system_prompt, user_prompt
 
@@ -2363,7 +2340,6 @@ class DouyuPlugin(MessagePluginInterface):
         peak_buckets = payload.get("peak_buckets", []) or []
         chronological_samples = payload.get("chronological_samples", []) or []
         session_storylines = payload.get("session_storylines", []) or []
-        raw_transcript_lines = payload.get("raw_danmu_transcript_lines", []) or []
 
         material: Dict[str, Any] = {
             "report_meta": {
@@ -2508,13 +2484,6 @@ class DouyuPlugin(MessagePluginInterface):
                     for item in session_storylines[:4]
                 ],
             },
-            # 保留一小段原始弹幕预览，主要用于调试和工作流里快速核对输入是否足够“像现场”。
-            # 真正的大全文会单独拼进 prompt，避免这里的结构化 JSON 膨胀得过于夸张。
-            "raw_transcript_preview": [
-                str(line or "").strip()
-                for line in raw_transcript_lines[:120]
-                if str(line or "").strip()
-            ],
         }
 
         if include_operator:
@@ -2548,27 +2517,6 @@ class DouyuPlugin(MessagePluginInterface):
 
         return material
 
-    def _build_raw_transcript_prompt_block(self, payload: Dict[str, Any], max_lines: int = 12000) -> str:
-        """
-        把原始弹幕全文整理成 prompt 可直接拼接的文本块。
-        这里优先满足“尽量把原始弹幕整体给 LLM 看”的目标，
-        同时保留一个最大行数阈值，避免极端超长场次直接把请求撑爆。
-        """
-        raw_lines = [
-            str(line or "").strip()
-            for line in (payload.get("raw_danmu_transcript_lines", []) or [])
-            if str(line or "").strip()
-        ]
-        if not raw_lines:
-            return "【按时间顺序整理的原始弹幕全文】\n（暂无可用原始弹幕全文）"
-
-        effective_lines = raw_lines[:max_lines]
-        lines = ["【按时间顺序整理的原始弹幕全文（已过滤系统噪音，仅合并完全相同重复内容）】"]
-        if len(effective_lines) < len(raw_lines):
-            lines.append(f"以下仅展开前 {len(effective_lines)} 行，剩余内容因长度限制未继续拼接。")
-        lines.extend(effective_lines)
-        return "\n".join(lines)
-
     @staticmethod
     def _clean_daily_report_llm_text(text: str) -> str:
         """
@@ -2992,59 +2940,11 @@ class DouyuPlugin(MessagePluginInterface):
             # 控制输出长度：Dify 该变量在部分工作流中配置为 paragraph(字符串)类型，
             # 因此这里统一传字符串，避免出现 “max_length must be a string” 的 400 校验错误。
             "max_length": str(int(self._daily_report_max_length or 1800)),
-            # 即使工作流节点没有消费 sys.files，也尽量把整段原始弹幕文本塞进 inputs，
-            # 这样工作流内仍可通过变量直接引用全文，不必只依赖 query 里的摘要部分。
-            "raw_danmu_transcript": str(payload.get("raw_danmu_transcript") or ""),
         }
         if self._daily_report_include_structured_inputs:
             inputs["report_payload_json"] = json.dumps(payload, ensure_ascii=False)
         return inputs
 
-    def _build_dify_daily_report_files(self, payload: Dict[str, Any], user_id: str) -> List[Dict[str, Any]]:
-        """
-        组装斗鱼日报要上传给 Dify 的原始文件列表。
-        当前优先上传当天命中的原始弹幕 txt，让工作流里的 sys.files
-        真正拿到“源文件级材料”，而不是只有摘要 JSON。
-        """
-        if not self._daily_report_llm_client or self._daily_report_llm_client.provider != "dify":
-            return []
-
-        uploaded_files: List[Dict[str, Any]] = []
-        for file_path in (payload.get("source_danmu_files", []) or [])[:2]:
-            normalized_path = os.path.abspath(str(file_path or "").strip())
-            if not normalized_path or not os.path.exists(normalized_path) or not os.path.isfile(normalized_path):
-                continue
-            try:
-                with open(normalized_path, "rb") as file_obj:
-                    file_bytes = file_obj.read()
-            except Exception as exc:
-                logger.warning(f"斗鱼日报原始弹幕文件读取失败: path={normalized_path}, error={exc}")
-                continue
-            if not file_bytes:
-                continue
-
-            upload_result = self._daily_report_llm_client.upload_dify_file(
-                user=user_id,
-                file_bytes=file_bytes,
-                filename=os.path.basename(normalized_path),
-                mime_type="text/plain",
-            )
-            upload_file_id = str((upload_result or {}).get("id") or "").strip()
-            if not upload_file_id:
-                logger.warning(
-                    "斗鱼日报原始弹幕文件上传 Dify 失败: "
-                    f"path={normalized_path}, last_error={self._daily_report_llm_client.last_error}"
-                )
-                continue
-
-            file_ref = self._daily_report_llm_client.build_dify_file_ref(
-                file_type="document",
-                upload_file_id=upload_file_id,
-            )
-            if file_ref:
-                uploaded_files.append(file_ref)
-        return uploaded_files
-
     def _build_room_background_profile_seed(self, payload: Dict[str, Any]) -> Dict[str, Any]:
         """
         从日报载荷里抽取一份“适合给背景画像模型看的精简材料”。
@@ -3336,7 +3236,6 @@ class DouyuPlugin(MessagePluginInterface):
         room_id = str(meta.get("room_id") or "").strip()
         user_id = f"douyu_daily_report_{room_id or 'unknown'}"
         if self._daily_report_llm_client.provider == "dify":
-            files = self._build_dify_daily_report_files(payload, user_id)
             inputs = self._build_dify_daily_report_inputs(
                 task_type=task_type,
                 system_prompt=system_prompt,
@@ -3348,7 +3247,6 @@ class DouyuPlugin(MessagePluginInterface):
                 user=user_id,
                 inputs=inputs,
                 tag=tag,
-                files=files,
             )
             return str((result or {}).get("text", "") or "").strip()
         return self._daily_report_llm_client.chat(