@@ -93,28 +93,6 @@ class DouyuDanmuSummaryHelper:
|
||||
continue
|
||||
return collected
|
||||
|
||||
@classmethod
def collect_session_source_files(cls, room_id: str, session: Dict[str, Any], base_dir: str = "temp") -> List[str]:
    """
    Collect the raw danmu txt files that a session actually maps onto.

    No file content is read here; the result only answers "which per-day
    files does this live session span", so that callers can upload the raw
    txt files to an LLM directly when they need to.

    Args:
        room_id: Room identifier used in the file name. An empty value
            yields an empty result.
        session: Session dict; its "segments" entry is passed through
            ``cls._normalize_segments``. Each normalized segment is read
            via ``segment["start"]`` / ``segment["end"]``.
        base_dir: Root directory that contains the "douyu_danmu" folder.

    Returns:
        Paths of the form
        ``<base_dir>/douyu_danmu/<YYYYMMDD>/<room_id>_<YYYYMMDD>.txt`` that
        exist on disk, ordered by ascending date key.
    """
    # Local import so this method does not depend on the file-level imports.
    from datetime import timedelta

    segments = cls._normalize_segments(session.get("segments", []) or [])
    if not room_id or not segments:
        return []

    one_day = timedelta(days=1)
    date_keys = set()
    for segment in segments:
        # NOTE(review): assumes "start"/"end" are date/datetime objects
        # (they must support strftime and timedelta addition) - confirm
        # against _normalize_segments.
        start_key = segment["start"].strftime("%Y%m%d")
        end_key = segment["end"].strftime("%Y%m%d")
        date_keys.add(start_key)
        date_keys.add(end_key)

        # A segment may cross more than two calendar days; include every
        # day strictly between start and end so no per-day file is missed.
        # YYYYMMDD keys compare correctly as plain strings.
        cursor = segment["start"] + one_day
        cursor_key = cursor.strftime("%Y%m%d")
        while cursor_key < end_key:
            date_keys.add(cursor_key)
            cursor = cursor + one_day
            cursor_key = cursor.strftime("%Y%m%d")

    file_paths: List[str] = []
    for date_key in sorted(date_keys):
        file_path = os.path.join(base_dir, "douyu_danmu", date_key, f"{room_id}_{date_key}.txt")
        # Only report files that are really present on disk.
        if os.path.exists(file_path):
            file_paths.append(file_path)
    return file_paths
|
||||
|
||||
@classmethod
|
||||
def load_day_messages(cls, room_id: str, date_key: str, base_dir: str = "temp") -> List[Dict[str, Any]]:
|
||||
file_path = os.path.join(base_dir, "douyu_danmu", date_key, f"{room_id}_{date_key}.txt")
|
||||
@@ -235,10 +213,6 @@ class DouyuDanmuSummaryHelper:
|
||||
"peak_buckets": cls._simplify_peak_buckets(peak_buckets),
|
||||
"representative_messages": cls._pick_representative_messages(organized_messages, bucket_stats),
|
||||
"raw_window_samples": cls._build_raw_window_samples(peak_buckets, per_bucket_limit=12),
|
||||
# 把去噪后、且只合并了“完全相同重复弹幕”的原始弹幕全文也保留下来。
|
||||
# 这样上层如果希望直接把整场弹幕塞给 LLM,而不是只喂摘要样本,
|
||||
# 就不需要再重新读文件和重复清洗。
|
||||
"raw_transcript_lines": cls._build_raw_transcript_lines(organized_messages),
|
||||
# 给日报类 LLM 再补一层“按时间推进的现场切片”。
|
||||
# 这样模型除了看热点窗口,还能顺着时间线理解气氛如何起、如何变、最后怎么收,
|
||||
# 对粉丝日报这类强调“节目效果”和“接梗链路”的文本尤其有帮助。
|
||||
@@ -964,27 +938,6 @@ class DouyuDanmuSummaryHelper:
|
||||
})
|
||||
return windows
|
||||
|
||||
@classmethod
|
||||
def _build_raw_transcript_lines(cls, messages: List[Dict[str, Any]]) -> List[str]:
|
||||
"""
|
||||
生成可直接给 LLM 使用的顺时序弹幕全文。
|
||||
规则:
|
||||
1. 输入消息已经过“系统噪音过滤 + 完全相同重复合并”;
|
||||
2. 不再进一步摘要,尽量保留现场原话;
|
||||
3. 对重复合并过的消息补上次数信息,帮助模型感知刷屏强度。
|
||||
"""
|
||||
lines: List[str] = []
|
||||
for item in messages:
|
||||
content = str(item.get("content") or "").strip()
|
||||
if not content:
|
||||
continue
|
||||
time_text = str(item.get("timestamp_text") or "").strip()
|
||||
nickname = str(item.get("nickname") or "").strip() or "观众"
|
||||
repeat_count = int(item.get("repeat_count", 1) or 1)
|
||||
repeat_suffix = f" [重复{repeat_count}次]" if repeat_count > 1 else ""
|
||||
lines.append(f"[{time_text}] {nickname}:{content}{repeat_suffix}")
|
||||
return lines
|
||||
|
||||
@classmethod
|
||||
def _build_chronological_samples(
|
||||
cls,
|
||||
|
||||
Reference in New Issue
Block a user