重构斗鱼粉丝日报信息提纯链路
- 新增本地弹幕文件测试入口,支持直接对样本文件生成提纯结果;
- 将本地统计、主题证据簇和语义事实提示接入斗鱼日报 LLM 材料;
- 明确降低情绪刷屏权重,改为优先提取赛事、位置、英雄、对局和场外互动信息。
This commit is contained in:
@@ -24,6 +24,49 @@ class DouyuDanmuSummaryHelper:
|
||||
"666", "6666", "牛", "牛逼", "稳", "寄", "杀", "帅", "好", "行", "绷", "哭", "乐",
|
||||
"哈哈", "哈哈哈", "笑死", "卧槽", "wc", "awsl", "nb", "nbl", "c", "6",
|
||||
}
|
||||
# These terms are useful for reading the "mood of the live room" but add
# little to fact extraction. Cue ranking down-weights them later so that real
# match / hero / storyline information is not drowned out.
GENERIC_REACTION_TERMS: Set[str] = {
    # Laughter of varying length.
    "哈哈",
    "哈哈哈",
    "哈哈哈哈",
    "哈哈哈哈哈",
    "哈哈哈哈哈哈",
    # Short filler reactions.
    "gg",
    "g",
    "888",
    "1",
    "啊",
    "啊?",
    "坏了",
    "好起来了",
    "翻了",
}
|
||||
# High-frequency semantic cluster configuration:
#   1. No Chinese word segmentation is performed; evidence is collected
#      directly per "topic cluster commonly seen in live-stream chat".
#   2. Every cluster keeps its count, time range and original sample messages.
#   3. This makes it easier for the model to catch "what concretely happened
#      today" instead of only seeing a flood of emotion words.
FACT_CLUSTER_CONFIGS: List[Dict[str, Any]] = [
    {
        "label": "赛事预告与报名动态",
        "keywords": [
            "老头杯",
            "选人",
            "报名",
            "开赛",
            "比赛",
            "30号",
            "4月30",
            "4月30日",
        ],
    },
    {
        "label": "比赛位置与身份讨论",
        "keywords": [
            "1号位",
            "5号位",
            "打1",
            "打5",
            "教练",
            "carry",
            "辅助",
        ],
    },
    {
        "label": "镜头与外形调侃",
        "keywords": [
            "摄像头",
            "开摄像头",
            "光头",
            "秃头",
            "洗头",
            "面容",
            "露脸",
        ],
    },
    {
        "label": "团播人物与场外关系",
        "keywords": [
            "糯糯",
            "瑶瑶",
            "冬瓜",
            "冬瓜强",
            "白队",
            "团播",
            "户外",
        ],
    },
    {
        "label": "关键对局与局势转折",
        "keywords": [
            "奶绿",
            "muerta",
            "gg",
            "翻了",
            "拿下",
            "上高地",
            "守高地",
            "队友",
            "大炮",
            "萨尔",
            "炸弹人",
        ],
    },
]
|
||||
# Display name -> lowercase-comparable aliases used to spot hero mentions in
# chat. Keys are emitted verbatim as the "hero" field of the mention summary.
HERO_ALIASES: Dict[str, List[str]] = {
    "Muerta/奶绿": [
        "奶绿",
        "muerta",
    ],
    "德鲁伊/Lone Druid": [
        "德鲁伊",
        "lone druid",
        "熊德",
    ],
    "小小/Tiny": [
        "小小",
        "tiny",
    ],
    "帕克/Puck": [
        "帕克",
        "puck",
    ],
    "火猫/Ember Spirit": [
        "火猫",
        "ember",
    ],
    "敌法/Anti-Mage": [
        "敌法",
        "am",
        "anti-mage",
    ],
    "兽王/Beastmaster": [
        "兽王",
        "beastmaster",
    ],
    "萨尔/Disruptor": [
        "萨尔",
        "disruptor",
    ],
    "炸弹人/Techies": [
        "炸弹人",
        "techies",
    ],
}
|
||||
NOISE_PATTERNS = [
|
||||
re.compile(r"本条弹幕.*机器人", re.I),
|
||||
re.compile(r"请不要.*统计机器人数", re.I),
|
||||
@@ -110,6 +153,26 @@ class DouyuDanmuSummaryHelper:
|
||||
collected.append(parsed)
|
||||
return collected
|
||||
|
||||
@classmethod
def load_messages_from_file(cls, file_path: str) -> List[Dict[str, Any]]:
    """Load danmu messages straight from a text file.

    This entry point exists mainly for local debugging and sample regression:

    1. it does not depend on a Redis session;
    2. it does not require the file to live under temp/douyu_danmu;
    3. it makes it easy to run the refinement pipeline on a user-provided
       sample file.

    Args:
        file_path: Path of the text file to read. ``None``, blank strings and
            paths that are not an existing regular file yield an empty list.

    Returns:
        The truthy results of ``cls.parse_danmu_line`` for each line, in file
        order. Lines for which the parser returns a falsy value are skipped.
    """
    path = str(file_path or "").strip()
    # isfile() rather than exists(): a directory path passes exists() and
    # would then make open() raise instead of returning the empty result.
    if not path or not os.path.isfile(path):
        return []

    # errors="ignore" keeps the original best-effort decoding: undecodable
    # bytes are dropped rather than aborting the whole load.
    with open(path, "r", encoding="utf-8", errors="ignore") as handle:
        parsed_lines = (cls.parse_danmu_line(line) for line in handle)
        return [parsed for parsed in parsed_lines if parsed]
|
||||
|
||||
@classmethod
|
||||
def build_summary_material(
|
||||
cls,
|
||||
@@ -289,6 +352,8 @@ class DouyuDanmuSummaryHelper:
|
||||
burst_terms=burst_terms,
|
||||
limit=cue_limit,
|
||||
),
|
||||
# 再补一层“事实型提示”,专门抬高赛事、位置、英雄、镜头梗、关键对局等信息密度高的内容。
|
||||
"semantic_fact_hints": cls._build_semantic_fact_hints(ordered_messages),
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
@@ -1199,7 +1264,7 @@ class DouyuDanmuSummaryHelper:
|
||||
|
||||
for item in repeated_messages:
|
||||
push(
|
||||
"repeat",
|
||||
"emotion" if cls._normalize_template_text(str(item.get("text") or "")) in cls.GENERIC_REACTION_TERMS else "repeat",
|
||||
str(item.get("text") or ""),
|
||||
int(item.get("count", 0) or 0),
|
||||
int(item.get("user_count", 0) or 0),
|
||||
@@ -1222,7 +1287,7 @@ class DouyuDanmuSummaryHelper:
|
||||
if count < 2:
|
||||
continue
|
||||
push(
|
||||
"short_repeat",
|
||||
"emotion" if normalized in cls.GENERIC_REACTION_TERMS else "short_repeat",
|
||||
short_message_text_map.get(normalized, normalized),
|
||||
count,
|
||||
len([uid for uid in short_message_users.get(normalized, set()) if uid]),
|
||||
@@ -1230,15 +1295,142 @@ class DouyuDanmuSummaryHelper:
|
||||
|
||||
for item in burst_terms:
|
||||
push(
|
||||
"burst",
|
||||
"emotion" if cls._normalize_template_text(str(item.get("text") or "")) in cls.GENERIC_REACTION_TERMS else "burst",
|
||||
str(item.get("text") or ""),
|
||||
int(item.get("count", 0) or 0),
|
||||
int(item.get("user_count", 0) or 0),
|
||||
)
|
||||
|
||||
cues.sort(key=lambda item: (int(item.get("count", 0) or 0), int(item.get("user_count", 0) or 0)), reverse=True)
|
||||
kind_priority = {
|
||||
"repeat": 5,
|
||||
"short_repeat": 4,
|
||||
"burst": 3,
|
||||
"emotion": 1,
|
||||
}
|
||||
cues.sort(
|
||||
key=lambda item: (
|
||||
int(kind_priority.get(str(item.get("kind") or ""), 0)),
|
||||
int(item.get("count", 0) or 0),
|
||||
int(item.get("user_count", 0) or 0),
|
||||
),
|
||||
reverse=True,
|
||||
)
|
||||
return cues[:limit]
|
||||
|
||||
@classmethod
def _build_semantic_fact_hints(cls, messages: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Assemble the "fact-oriented semantic hints" section.

    This layer does not summarize anything; it surfaces high-value
    information the model tends to miss:

    1. tournament / draft / position discussion;
    2. heroes and key games;
    3. off-stream interaction memes such as the webcam or group-stream people.

    Args:
        messages: Parsed danmu messages, forwarded unchanged to both builders.

    Returns:
        A dict with ``topic_clusters`` (from ``_build_fact_topic_clusters``)
        and ``hero_mentions`` (from ``_build_hero_mentions``).
    """
    topic_clusters = cls._build_fact_topic_clusters(messages)
    hero_mentions = cls._build_hero_mentions(messages)
    return {
        "topic_clusters": topic_clusters,
        "hero_mentions": hero_mentions,
    }
|
||||
|
||||
@classmethod
def _build_fact_topic_clusters(cls, messages: List[Dict[str, Any]], limit: int = 8) -> List[Dict[str, Any]]:
    """Group messages into the topic clusters declared in FACT_CLUSTER_CONFIGS.

    A message belongs to a cluster when its lower-cased content contains any
    of the cluster's lower-cased keywords as a substring.

    Args:
        messages: Parsed danmu messages; ``content``, ``timestamp``,
            ``nickname`` and ``uid`` are read from each item.
        limit: Maximum number of clusters to return.

    Returns:
        Up to ``limit`` cluster dicts sorted by ``match_count`` then
        ``user_count`` (descending). Each has ``label``, ``match_count``,
        ``user_count``, ``first_hm`` / ``last_hm`` (taken from the first and
        last matched message in input order), the first 8 configured
        ``keywords`` and up to 5 de-duplicated ``samples``. Clusters with no
        match, or whose matches all normalize to an empty text, are omitted.
    """
    # Strip / lower-case every message once instead of once per cluster
    # config; messages with empty content can never match and are dropped.
    prepared = []
    for item in messages:
        content = str(item.get("content") or "").strip()
        if content:
            prepared.append((item, content, content.lower()))

    clusters: List[Dict[str, Any]] = []
    for config in cls.FACT_CLUSTER_CONFIGS:
        raw_keywords = config.get("keywords") or []
        keywords = [str(keyword).lower() for keyword in raw_keywords if str(keyword).strip()]
        matched = [
            (item, content)
            for item, content, lowered in prepared
            if any(keyword in lowered for keyword in keywords)
        ]
        if not matched:
            continue

        # Keep at most 5 samples, de-duplicated on the normalized text so
        # repeated spam of one line does not fill every slot.
        samples: List[Dict[str, Any]] = []
        seen = set()
        for item, content in matched:
            normalized = cls._normalize_template_text(content)
            if not normalized or normalized in seen:
                continue
            seen.add(normalized)
            timestamp = item.get("timestamp")
            samples.append({
                "date": cls._format_date(timestamp),
                "hm": cls._format_hm(timestamp),
                "nickname": str(item.get("nickname") or "").strip(),
                "content": content[:100],
            })
            if len(samples) >= 5:
                break
        if not samples:
            continue

        # Distinct non-blank uids among all matched messages.
        user_ids = {
            str(item.get("uid") or "")
            for item, _ in matched
            if str(item.get("uid") or "").strip()
        }
        clusters.append({
            "label": str(config.get("label") or "").strip(),
            "match_count": len(matched),
            "user_count": len(user_ids),
            "first_hm": cls._format_hm(matched[0][0].get("timestamp")),
            "last_hm": cls._format_hm(matched[-1][0].get("timestamp")),
            "keywords": raw_keywords[:8],
            "samples": samples,
        })

    # Stable sort: clusters with equal counts keep their configuration order.
    clusters.sort(
        key=lambda cluster: (cluster["match_count"], cluster["user_count"]),
        reverse=True,
    )
    return clusters[:limit]
|
||||
|
||||
@classmethod
def _build_hero_mentions(cls, messages: List[Dict[str, Any]], limit: int = 6) -> List[Dict[str, Any]]:
    """Summarize which heroes from HERO_ALIASES are mentioned in the chat.

    Matching is case-insensitive. An alias whose first or last character is
    an ASCII letter or digit must not be adjacent to another ASCII letter or
    digit on that side, so short Latin aliases such as "am", "ember" or
    "tiny" no longer hit inside unrelated words ("game", "remember",
    "destiny"). Aliases with CJK edges are still matched as plain substrings.

    Args:
        messages: Parsed danmu messages; ``content``, ``timestamp``,
            ``nickname`` and ``uid`` are read from each item.
        limit: Maximum number of heroes to return.

    Returns:
        Up to ``limit`` dicts sorted by ``mention_count`` then ``user_count``
        (descending). Each has ``hero`` (the HERO_ALIASES key),
        ``mention_count`` (number of matching messages), ``user_count``
        (distinct non-blank uids) and up to 4 de-duplicated ``samples``.
        A hero with matches is reported even when ``samples`` ends up empty.
    """
    ascii_word_chars = "abcdefghijklmnopqrstuvwxyz0123456789"

    def compile_alias(alias: str):
        # Guard only the edges that are ASCII alphanumerics; the text being
        # searched is already lower-cased, so [a-z0-9] is sufficient.
        pattern = re.escape(alias)
        if alias[0] in ascii_word_chars:
            pattern = "(?<![a-z0-9])" + pattern
        if alias[-1] in ascii_word_chars:
            pattern = pattern + "(?![a-z0-9])"
        return re.compile(pattern)

    # Strip / lower-case every message once instead of once per hero.
    prepared = []
    for item in messages:
        content = str(item.get("content") or "").strip()
        if content:
            prepared.append((item, content, content.lower()))

    hero_results: List[Dict[str, Any]] = []
    for hero_name, aliases in cls.HERO_ALIASES.items():
        patterns = [
            compile_alias(str(alias).lower())
            for alias in aliases
            if str(alias).strip()
        ]
        matched = [
            (item, content)
            for item, content, lowered in prepared
            if any(pattern.search(lowered) for pattern in patterns)
        ]
        if not matched:
            continue

        # Keep at most 4 samples, de-duplicated on the normalized text.
        samples: List[Dict[str, Any]] = []
        seen = set()
        for item, content in matched:
            normalized = cls._normalize_template_text(content)
            if not normalized or normalized in seen:
                continue
            seen.add(normalized)
            samples.append({
                "hm": cls._format_hm(item.get("timestamp")),
                "nickname": str(item.get("nickname") or "").strip(),
                "content": content[:100],
            })
            if len(samples) >= 4:
                break

        user_ids = {
            str(item.get("uid") or "")
            for item, _ in matched
            if str(item.get("uid") or "").strip()
        }
        hero_results.append({
            "hero": hero_name,
            "mention_count": len(matched),
            "user_count": len(user_ids),
            "samples": samples,
        })

    # Stable sort: heroes with equal counts keep their HERO_ALIASES order.
    hero_results.sort(
        key=lambda hero: (hero["mention_count"], hero["user_count"]),
        reverse=True,
    )
    return hero_results[:limit]
|
||||
|
||||
@staticmethod
|
||||
def _looks_like_pure_punctuation(content: str) -> bool:
|
||||
text = str(content or "").strip()
|
||||
|
||||
Reference in New Issue
Block a user