From fd61f17448bccba2d8d61df3a8906803f2ed4fc9 Mon Sep 17 00:00:00 2001 From: liuwei Date: Wed, 29 Apr 2026 14:15:26 +0800 Subject: [PATCH] =?UTF-8?q?Revert=20"=E6=9C=AC=E5=9C=B0=E6=B8=85=E6=B4=97?= =?UTF-8?q?=E5=BC=B9=E5=B9=95TXT=E5=B9=B6=E5=8E=8B=E7=BC=A9=E9=87=8D?= =?UTF-8?q?=E5=A4=8D=E5=88=B7=E5=B1=8F=E5=86=85=E5=AE=B9"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 12a5d89c760418ce75d1fdea5e9fb605e704498e. --- plugins/douyu/danmu_summary.py | 43 ++-------------------------------- plugins/douyu/main.py | 26 ++------------------ 2 files changed, 4 insertions(+), 65 deletions(-) diff --git a/plugins/douyu/danmu_summary.py b/plugins/douyu/danmu_summary.py index e9ffbcd..774bce9 100644 --- a/plugins/douyu/danmu_summary.py +++ b/plugins/douyu/danmu_summary.py @@ -981,49 +981,10 @@ class DouyuDanmuSummaryHelper: time_text = str(item.get("timestamp_text") or "").strip() nickname = str(item.get("nickname") or "").strip() or "观众" repeat_count = int(item.get("repeat_count", 1) or 1) - normalized_content = cls._format_llm_transcript_content(content, repeat_count) - # 本地清洗给 LLM 的 txt 时,统一移除 UID。 - # UID 对日报写作没有帮助,还会占 token、污染阅读流。 - lines.append(f"[{time_text}] {nickname}:{normalized_content}") + repeat_suffix = f" [重复{repeat_count}次]" if repeat_count > 1 else "" + lines.append(f"[{time_text}] {nickname}:{content}{repeat_suffix}") return lines - @classmethod - def _format_llm_transcript_content(cls, content: str, repeat_count: int) -> str: - """ - 规范化给 LLM 的弹幕正文显示形式。 - 目标: - 1. 像“哈哈哈”“666”“?”这类典型短刷屏,直接压成 `哈哈哈*120`; - 2. 正常讨论内容仍保留原句,只在后面标一次重复次数; - 3. 既减小文本体积,又尽量不牺牲讨论语义。 - """ - text = str(content or "").strip() - count = int(repeat_count or 1) - if count <= 1: - return text - if cls._should_compact_burst_text(text): - return f"{text}*{count}" - return f"{text} [重复{count}次]" - - @classmethod - def _should_compact_burst_text(cls, content: str) -> bool: - """ - 判断某条弹幕是否属于“适合压缩成 xN”的短刷屏文本。 - 这里故意保持保守,只压缩: - 1. 已知短 burst 词; - 2. 纯问号/感叹号/句号等情绪符号; - 3. 很短、且由同类字符重复组成的刷屏文本。 - """ - text = str(content or "").strip().lower() - if not text: - return False - if text in cls.SHORT_BURST_WORDS: - return True - if re.fullmatch(r"[??!!。\.~~]+", text): - return True - if len(text) <= 8 and len(set(text)) <= 3: - return True - return False - @classmethod def _build_chronological_samples( cls, diff --git a/plugins/douyu/main.py b/plugins/douyu/main.py index 0d1f1b1..1e38bf4 100644 --- a/plugins/douyu/main.py +++ b/plugins/douyu/main.py @@ -2251,18 +2251,6 @@ class DouyuPlugin(MessagePluginInterface): artifact_path = os.path.join(artifact_dir, f"{room_id}_{anchor_day.replace('-', '')}_daily_report_payload.json") with open(artifact_path, "w", encoding="utf-8") as f: json.dump(payload, f, ensure_ascii=False, indent=2) - # 额外落一份“专门给 LLM 看”的本地清洗 txt。 - # 它和原始弹幕文件的区别在于: - # 1. 已经过滤系统噪音; - # 2. 已移除 UID; - # 3. 已把短刷屏压缩成 `哈哈哈*120` 这类更省上下文的写法。 - cleaned_transcript_path = os.path.join( - artifact_dir, - f"{room_id}_{anchor_day.replace('-', '')}_llm_transcript.txt", - ) - with open(cleaned_transcript_path, "w", encoding="utf-8") as f: - f.write(str(payload.get("raw_danmu_transcript") or "").strip()) - payload["cleaned_transcript_file"] = os.path.abspath(cleaned_transcript_path) return payload def _build_daily_report_prompt(self, payload: Dict[str, Any]) -> Tuple[str, str]: @@ -2576,7 +2564,6 @@ class DouyuPlugin(MessagePluginInterface): effective_lines = raw_lines[:max_lines] lines = ["【按时间顺序整理的原始弹幕全文(已过滤系统噪音,仅合并完全相同重复内容)】"] - lines.append("说明:文本已移除 UID;短刷屏弹幕会压缩成“哈哈哈*120”这类格式。") if len(effective_lines) < len(raw_lines): lines.append(f"以下仅展开前 {len(effective_lines)} 行,剩余内容因长度限制未继续拼接。") lines.extend(effective_lines) @@ -3016,23 +3003,14 @@ class DouyuPlugin(MessagePluginInterface): def _build_dify_daily_report_files(self, payload: Dict[str, Any], user_id: str) -> List[Dict[str, Any]]: """ 组装斗鱼日报要上传给 Dify 的原始文件列表。 - 当前优先上传“本地清洗后的 LLM 专用 txt”,让工作流里的 sys.files - 拿到的是更适合总结任务的材料,而不是带 UID / 平台噪音的原始源文件。 + 当前优先上传当天命中的原始弹幕 txt,让工作流里的 sys.files + 真正拿到“源文件级材料”,而不是只有摘要 JSON。 """ if not self._daily_report_llm_client or self._daily_report_llm_client.provider != "dify": return [] uploaded_files: List[Dict[str, Any]] = [] - upload_candidates: List[str] = [] - cleaned_transcript_file = os.path.abspath(str(payload.get("cleaned_transcript_file") or "").strip()) - if cleaned_transcript_file: - upload_candidates.append(cleaned_transcript_file) for file_path in (payload.get("source_danmu_files", []) or [])[:2]: - normalized_source_path = os.path.abspath(str(file_path or "").strip()) - if normalized_source_path and normalized_source_path not in upload_candidates: - upload_candidates.append(normalized_source_path) - - for file_path in upload_candidates[:3]: normalized_path = os.path.abspath(str(file_path or "").strip()) if not normalized_path or not os.path.exists(normalized_path) or not os.path.isfile(normalized_path): continue