本地清洗弹幕TXT并压缩重复刷屏内容
This commit is contained in:
@@ -981,10 +981,49 @@ class DouyuDanmuSummaryHelper:
|
|||||||
time_text = str(item.get("timestamp_text") or "").strip()
|
time_text = str(item.get("timestamp_text") or "").strip()
|
||||||
nickname = str(item.get("nickname") or "").strip() or "观众"
|
nickname = str(item.get("nickname") or "").strip() or "观众"
|
||||||
repeat_count = int(item.get("repeat_count", 1) or 1)
|
repeat_count = int(item.get("repeat_count", 1) or 1)
|
||||||
repeat_suffix = f" [重复{repeat_count}次]" if repeat_count > 1 else ""
|
normalized_content = cls._format_llm_transcript_content(content, repeat_count)
|
||||||
lines.append(f"[{time_text}] {nickname}:{content}{repeat_suffix}")
|
# 本地清洗给 LLM 的 txt 时,统一移除 UID。
|
||||||
|
# UID 对日报写作没有帮助,还会占 token、污染阅读流。
|
||||||
|
lines.append(f"[{time_text}] {nickname}:{normalized_content}")
|
||||||
return lines
|
return lines
|
||||||
|
|
||||||
|
@classmethod
def _format_llm_transcript_content(cls, content: str, repeat_count: int) -> str:
    """
    Render one danmu message body in the form shown to the LLM.

    Goals:
    1. Typical short spam such as "哈哈哈", "666" or "?" is squeezed
       into the compact form `哈哈哈*120`.
    2. Ordinary discussion keeps its original sentence and only gets a
       single trailing repeat marker.
    3. The transcript gets smaller while the meaning of the discussion
       is preserved as far as possible.

    Args:
        content: Message text; falsy values are treated as an empty string.
        repeat_count: Number of occurrences; falsy values are treated as 1.

    Returns:
        The stripped text alone when it occurred at most once, otherwise
        the text with either the compact `*N` suffix or the repeat marker.
    """
    body = str(content or "").strip()
    times = int(repeat_count or 1)
    if times > 1:
        # Only repeated messages need a marker; which marker depends on
        # whether the text is classified as short burst spam.
        is_burst = cls._should_compact_burst_text(body)
        return f"{body}*{times}" if is_burst else f"{body} [重复{times}次]"
    return body
|
||||||
|
|
||||||
|
@classmethod
def _should_compact_burst_text(cls, content: str) -> bool:
    """
    Decide whether a danmu message is short spam suited to the compact `text*N` form.

    The check is deliberately conservative. After stripping and lower-casing,
    a message qualifies only when it is one of:
    1. an entry of ``cls.SHORT_BURST_WORDS``;
    2. made up solely of emotive punctuation (question marks, exclamation
       marks, periods, tildes) in ASCII or full-width form;
    3. at most 8 characters long, built from at most 3 distinct characters,
       and either a lone character or containing at least one repeated
       character (e.g. "666", "哈哈哈哈").

    Args:
        content: Message text; falsy values are treated as an empty string.

    Returns:
        True when the message should be compacted, False otherwise.
    """
    text = str(content or "").strip().lower()
    if not text:
        return False
    if text in cls.SHORT_BURST_WORDS:
        return True
    # Non-ASCII marks are written as \u escapes so the pattern cannot be
    # silently collapsed to ASCII duplicates by Unicode normalisation:
    # U+3002 ideographic full stop, U+FF1F/U+FF01/U+FF5E full-width ? ! ~.
    if re.fullmatch(r"[?!.~\u3002\uff1f\uff01\uff5e]+", text):
        return True
    distinct_chars = len(set(text))
    if len(text) <= 8 and distinct_chars <= 3:
        # A short phrase whose characters are all different (e.g. a normal
        # two- or three-character word) is discussion, not repeated-character
        # spam, so it keeps the sentence-style repeat marker instead.
        return len(text) == 1 or distinct_chars < len(text)
    return False
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def _build_chronological_samples(
|
def _build_chronological_samples(
|
||||||
cls,
|
cls,
|
||||||
|
|||||||
@@ -2251,6 +2251,18 @@ class DouyuPlugin(MessagePluginInterface):
|
|||||||
artifact_path = os.path.join(artifact_dir, f"{room_id}_{anchor_day.replace('-', '')}_daily_report_payload.json")
|
artifact_path = os.path.join(artifact_dir, f"{room_id}_{anchor_day.replace('-', '')}_daily_report_payload.json")
|
||||||
with open(artifact_path, "w", encoding="utf-8") as f:
|
with open(artifact_path, "w", encoding="utf-8") as f:
|
||||||
json.dump(payload, f, ensure_ascii=False, indent=2)
|
json.dump(payload, f, ensure_ascii=False, indent=2)
|
||||||
|
# 额外落一份“专门给 LLM 看”的本地清洗 txt。
|
||||||
|
# 它和原始弹幕文件的区别在于:
|
||||||
|
# 1. 已经过滤系统噪音;
|
||||||
|
# 2. 已移除 UID;
|
||||||
|
# 3. 已把短刷屏压缩成 `哈哈哈*120` 这类更省上下文的写法。
|
||||||
|
cleaned_transcript_path = os.path.join(
|
||||||
|
artifact_dir,
|
||||||
|
f"{room_id}_{anchor_day.replace('-', '')}_llm_transcript.txt",
|
||||||
|
)
|
||||||
|
with open(cleaned_transcript_path, "w", encoding="utf-8") as f:
|
||||||
|
f.write(str(payload.get("raw_danmu_transcript") or "").strip())
|
||||||
|
payload["cleaned_transcript_file"] = os.path.abspath(cleaned_transcript_path)
|
||||||
return payload
|
return payload
|
||||||
|
|
||||||
def _build_daily_report_prompt(self, payload: Dict[str, Any]) -> Tuple[str, str]:
|
def _build_daily_report_prompt(self, payload: Dict[str, Any]) -> Tuple[str, str]:
|
||||||
@@ -2564,6 +2576,7 @@ class DouyuPlugin(MessagePluginInterface):
|
|||||||
|
|
||||||
effective_lines = raw_lines[:max_lines]
|
effective_lines = raw_lines[:max_lines]
|
||||||
lines = ["【按时间顺序整理的原始弹幕全文(已过滤系统噪音,仅合并完全相同重复内容)】"]
|
lines = ["【按时间顺序整理的原始弹幕全文(已过滤系统噪音,仅合并完全相同重复内容)】"]
|
||||||
|
lines.append("说明:文本已移除 UID;短刷屏弹幕会压缩成“哈哈哈*120”这类格式。")
|
||||||
if len(effective_lines) < len(raw_lines):
|
if len(effective_lines) < len(raw_lines):
|
||||||
lines.append(f"以下仅展开前 {len(effective_lines)} 行,剩余内容因长度限制未继续拼接。")
|
lines.append(f"以下仅展开前 {len(effective_lines)} 行,剩余内容因长度限制未继续拼接。")
|
||||||
lines.extend(effective_lines)
|
lines.extend(effective_lines)
|
||||||
@@ -3003,14 +3016,23 @@ class DouyuPlugin(MessagePluginInterface):
|
|||||||
def _build_dify_daily_report_files(self, payload: Dict[str, Any], user_id: str) -> List[Dict[str, Any]]:
|
def _build_dify_daily_report_files(self, payload: Dict[str, Any], user_id: str) -> List[Dict[str, Any]]:
|
||||||
"""
|
"""
|
||||||
组装斗鱼日报要上传给 Dify 的原始文件列表。
|
组装斗鱼日报要上传给 Dify 的原始文件列表。
|
||||||
当前优先上传当天命中的原始弹幕 txt,让工作流里的 sys.files
|
当前优先上传“本地清洗后的 LLM 专用 txt”,让工作流里的 sys.files
|
||||||
真正拿到“源文件级材料”,而不是只有摘要 JSON。
|
拿到的是更适合总结任务的材料,而不是带 UID / 平台噪音的原始源文件。
|
||||||
"""
|
"""
|
||||||
if not self._daily_report_llm_client or self._daily_report_llm_client.provider != "dify":
|
if not self._daily_report_llm_client or self._daily_report_llm_client.provider != "dify":
|
||||||
return []
|
return []
|
||||||
|
|
||||||
uploaded_files: List[Dict[str, Any]] = []
|
uploaded_files: List[Dict[str, Any]] = []
|
||||||
|
upload_candidates: List[str] = []
|
||||||
|
cleaned_transcript_file = os.path.abspath(str(payload.get("cleaned_transcript_file") or "").strip())
|
||||||
|
if cleaned_transcript_file:
|
||||||
|
upload_candidates.append(cleaned_transcript_file)
|
||||||
for file_path in (payload.get("source_danmu_files", []) or [])[:2]:
|
for file_path in (payload.get("source_danmu_files", []) or [])[:2]:
|
||||||
|
normalized_source_path = os.path.abspath(str(file_path or "").strip())
|
||||||
|
if normalized_source_path and normalized_source_path not in upload_candidates:
|
||||||
|
upload_candidates.append(normalized_source_path)
|
||||||
|
|
||||||
|
for file_path in upload_candidates[:3]:
|
||||||
normalized_path = os.path.abspath(str(file_path or "").strip())
|
normalized_path = os.path.abspath(str(file_path or "").strip())
|
||||||
if not normalized_path or not os.path.exists(normalized_path) or not os.path.isfile(normalized_path):
|
if not normalized_path or not os.path.exists(normalized_path) or not os.path.isfile(normalized_path):
|
||||||
continue
|
continue
|
||||||
|
|||||||
Reference in New Issue
Block a user