# abot/plugins/douyu/local_test_runner.py

# -*- coding: utf-8 -*-
"""
斗鱼弹幕本地测试脚本。

用途：
1. 直接读取用户提供的本地弹幕文本样本；
2. 跑一遍“本地提纯 + 证据簇提炼”链路；
3. 将结果输出到 temp/douyu_materials，方便人工查看；
4. 不依赖 Redis、Dify、直播 session。
"""

import importlib.util
import json
import os
import sys
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List


def _load_helper():
    """
    Load ``DouyuDanmuSummaryHelper`` from the ``danmu_summary.py`` file next to this script.

    The module is executed directly from its file path under the private module
    name ``douyu_danmu_summary_local`` rather than through a package import.

    Returns:
        The ``DouyuDanmuSummaryHelper`` attribute of the freshly executed module.

    Raises:
        ImportError: If no import spec or loader can be created for the file.
        Exception: Anything raised while executing the module (for example a
            missing file or a failing import inside it) propagates unchanged.
    """
    module_path = Path(__file__).resolve().parent / "danmu_summary.py"
    spec = importlib.util.spec_from_file_location("douyu_danmu_summary_local", module_path)
    # Explicit check instead of ``assert``: assertions are stripped under
    # ``python -O``, and a ``None`` spec would otherwise surface as an opaque
    # AttributeError inside ``module_from_spec``.
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot create an import spec for {module_path}")
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module.DouyuDanmuSummaryHelper


def _load_report_template_module():
    """
    Load ``report_template.py`` (next to this script) directly from its file path.

    Loading by path means the local preview does not need the plugin to be
    fully initialised and does not depend on Redis or other runtime objects.

    Side effects:
        The directory two levels above this script (treated as the project
        root) is inserted at the front of ``sys.path`` if it is not already
        listed, so that project-level imports made by ``report_template.py``
        (e.g. ``utils``) can be resolved.

    Returns:
        The executed module object, registered under the private name
        ``douyu_report_template_local`` (it is not added to ``sys.modules``).

    Raises:
        ImportError: If no import spec or loader can be created for the file.
        Exception: Anything raised while executing the module propagates unchanged.
    """
    current_dir = Path(__file__).resolve().parent
    project_root_str = str(current_dir.parent.parent)
    # Make the project root importable before the template module runs its own imports.
    if project_root_str not in sys.path:
        sys.path.insert(0, project_root_str)

    module_path = current_dir / "report_template.py"
    spec = importlib.util.spec_from_file_location("douyu_report_template_local", module_path)
    # Explicit check instead of ``assert``: assertions are stripped under
    # ``python -O``, and a ``None`` spec would otherwise fail later with an
    # unrelated-looking AttributeError.
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot create an import spec for {module_path}")
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module


def _build_session(room_id: str, anchor_day: str, messages: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Build a minimal session record covering a batch of locally loaded messages.

    Args:
        room_id: Room identifier; used as the prefix of ``session_id``.
        anchor_day: Day string such as ``2026-04-28``; dashes are removed when
            it is embedded in ``session_id``.
        messages: Message dicts. A ``timestamp`` entry, when present, must
            expose ``strftime`` (the original code assumes ``datetime``).

    Returns:
        A dict with ``session_id``, ``room_id``, ``anchor_day``, empty
        ``nickname`` / ``room_name`` and ``segments``:

        * no messages -> ``session_id`` ends with ``_empty`` and ``segments`` is ``[]``;
        * otherwise   -> ``session_id`` ends with ``_local_test`` and ``segments``
          holds one entry spanning the earliest to the latest timestamp
          (or stays empty when no message carries a timestamp).
    """
    time_format = "%Y-%m-%d %H:%M:%S"
    suffix = "local_test" if messages else "empty"
    session: Dict[str, Any] = {
        "session_id": f"{room_id}_{anchor_day.replace('-', '')}_{suffix}",
        "room_id": room_id,
        "anchor_day": anchor_day,
        "nickname": "",
        "room_name": "",
        "segments": [],
    }

    # Only messages that really carry a timestamp can bound the segment.
    # Previously a message with a missing/None timestamp was sorted to the
    # front (via datetime.min) and then crashed the strftime call below.
    stamps = [stamp for stamp in (message.get("timestamp") for message in messages) if stamp]
    if stamps:
        session["segments"].append({
            "start_time": min(stamps).strftime(time_format),
            "end_time": max(stamps).strftime(time_format),
        })
    return session


def _build_preview_template_payload(local_result: Dict[str, Any]) -> Dict[str, Any]:
    """
    Reshape a local test result into the structure the fans daily report template reads.

    The preview path and the real template share the same field names, which
    keeps troubleshooting straightforward.

    Every container lookup tolerates both a missing key and an explicit
    ``None`` value (the stats lists previously raised ``TypeError`` on ``None``).

    Args:
        local_result: Result dict with the optional keys ``session_meta``,
            ``local_stats_preview``, ``topic_evidence_clusters``,
            ``hero_mentions``, ``content_cues``, ``timeline_digest`` and
            ``representative_messages``.

    Returns:
        A payload dict with ``report_meta``, ``local_stats``,
        ``topic_evidence_clusters``, ``compact_scene_material``,
        ``representative_messages``, ``repeated_messages``, ``burst_terms``,
        ``peak_buckets`` and ``top_terms``. Text fields are stripped strings,
        count fields are ints, and each list is truncated to a fixed maximum.
    """

    def text_of(record: Dict[str, Any], key: str) -> str:
        # Missing / None / falsy values collapse to an empty string.
        return str(record.get(key) or "").strip()

    def int_of(record: Dict[str, Any], key: str) -> int:
        # Missing / None / falsy values collapse to 0.
        return int(record.get(key) or 0)

    def rows_of(container: Dict[str, Any], key: str) -> List[Any]:
        # Missing key and explicit None both yield an empty list.
        return container.get(key) or []

    session_meta = local_result.get("session_meta") or {}
    stats_preview = local_result.get("local_stats_preview") or {}
    topic_clusters = rows_of(local_result, "topic_evidence_clusters")
    hero_mentions = rows_of(local_result, "hero_mentions")
    content_cues = rows_of(local_result, "content_cues")
    timeline_digest = rows_of(local_result, "timeline_digest")
    representative_messages = rows_of(local_result, "representative_messages")

    message_count = int_of(session_meta, "message_count")
    unique_user_count = int_of(session_meta, "unique_user_count")
    peak_buckets = rows_of(stats_preview, "peak_buckets")[:6]

    # Normalised once; the shorter "local_stats" view below gets its own dict
    # copies so the two lists never share mutable rows.
    repeated_messages = [
        {
            "text": text_of(row, "text"),
            "count": int_of(row, "count"),
            "user_count": int_of(row, "user_count"),
        }
        for row in rows_of(stats_preview, "top_repeated_messages")[:12]
    ]

    # Filter by kind first, then cap, so up to 8 emotion cues are kept.
    emotion_bursts = [
        {"text": text_of(cue, "text"), "count": int_of(cue, "count")}
        for cue in content_cues
        if text_of(cue, "kind") == "emotion"
    ][:8]

    return {
        "report_meta": {
            "room_id": text_of(session_meta, "room_id"),
            "anchor_day": text_of(session_meta, "anchor_day"),
            "nickname": text_of(session_meta, "nickname"),
            "room_name": text_of(session_meta, "room_name"),
            "session_count": 1,
            "message_count": message_count,
            "unique_user_count": unique_user_count,
        },
        "local_stats": {
            "message_count": message_count,
            "unique_user_count": unique_user_count,
            "top_emotion_bursts": emotion_bursts,
            "top_repeated_messages": [dict(row) for row in repeated_messages[:8]],
            "peak_windows": [
                {
                    "start_time": text_of(bucket, "start_time"),
                    "message_count": int_of(bucket, "message_count"),
                    "user_count": int_of(bucket, "user_count"),
                }
                for bucket in peak_buckets
            ],
        },
        "topic_evidence_clusters": [
            {
                "label": text_of(cluster, "label"),
                # "match_count" wins whenever the key exists; "count" is only
                # the fallback for clusters that lack it.
                "count": int(cluster.get("match_count", cluster.get("count", 0)) or 0),
                "user_count": int_of(cluster, "user_count"),
                # Stripping "-" leaves a single time when only one end is known.
                "time_range": f"{text_of(cluster, 'first_hm')}-{text_of(cluster, 'last_hm')}".strip("-"),
                "keywords": rows_of(cluster, "keywords"),
                "samples": rows_of(cluster, "samples"),
            }
            for cluster in topic_clusters[:6]
        ],
        "compact_scene_material": {
            "semantic_fact_hints": {
                "hero_mentions": hero_mentions[:6],
            },
            "content_cues": content_cues[:18],
            "timeline_digest": timeline_digest[:20],
        },
        "representative_messages": representative_messages[:12],
        "repeated_messages": repeated_messages,
        "burst_terms": [
            {"text": text_of(term, "text"), "count": int_of(term, "count")}
            for term in rows_of(stats_preview, "top_burst_terms")[:12]
        ],
        "peak_buckets": peak_buckets,
        # Up to two non-blank keywords from each of the first four clusters;
        # no real frequency is available here, so count is always 0.
        "top_terms": [
            {"term": str(keyword).strip(), "count": 0}
            for cluster in topic_clusters[:4]
            for keyword in rows_of(cluster, "keywords")[:2]
            if str(keyword).strip()
        ],
    }


def _build_preview_report_text(payload: Dict[str, Any]) -> str:
    """
    Assemble a stable sample report text for the local template preview.

    No real LLM is involved: the already-refined payload is poured into a fixed
    set of sections so the template can be checked for complete rendering of
    the key information.

    Nested lookups tolerate ``None`` at every level (the chained ``.get`` calls
    used before raised ``AttributeError`` when ``compact_scene_material`` or
    ``semantic_fact_hints`` was ``None``).

    Args:
        payload: Template payload; the keys read are ``report_meta``,
            ``topic_evidence_clusters``, ``compact_scene_material``,
            ``repeated_messages``, ``burst_terms``, ``peak_buckets`` and
            ``representative_messages``.

    Returns:
        The report as newline-joined lines. Section headers are always present;
        bullet lines appear only when the data they need is available.
    """

    def text_of(record: Dict[str, Any], key: str) -> str:
        return str(record.get(key) or "").strip()

    def int_of(record: Dict[str, Any], key: str) -> int:
        return int(record.get(key) or 0)

    def first_sample(record: Dict[str, Any], limit: int) -> str:
        # Content of the first sample, trimmed and cut to ``limit`` characters.
        samples = record.get("samples") or []
        if not samples:
            return ""
        return str(samples[0].get("content") or "").strip()[:limit]

    meta = payload.get("report_meta") or {}
    topic_clusters = payload.get("topic_evidence_clusters") or []
    scene_material = payload.get("compact_scene_material") or {}
    fact_hints = scene_material.get("semantic_fact_hints") or {}
    hero_mentions = fact_hints.get("hero_mentions") or []
    repeated_messages = payload.get("repeated_messages") or []
    burst_terms = payload.get("burst_terms") or []
    peak_buckets = payload.get("peak_buckets") or []
    representative_messages = payload.get("representative_messages") or []
    anchor_day = text_of(meta, "anchor_day")

    lines = [
        f"{anchor_day} 这场直播的弹幕不只是热闹，核心信息也很密：赛事、位置、英雄、团播人物和摄像头梗都有人追着聊。",
        "【今日重点信息】",
    ]
    for cluster in topic_clusters[:5]:
        label = text_of(cluster, "label")
        time_range = text_of(cluster, "time_range")
        count = int_of(cluster, "count")
        sample_text = first_sample(cluster, 42)
        # A highlight needs both a label and a quotable sample.
        if label and sample_text:
            lines.append(f"- {label}从 {time_range or '全场'} 一直有人聊，相关弹幕约 {count} 条，代表说法是「{sample_text}」。")

    lines.append("【核心讨论话题】")
    for cluster in topic_clusters[:4]:
        label = text_of(cluster, "label")
        keywords = [
            str(keyword).strip()
            for keyword in (cluster.get("keywords") or [])[:5]
            if str(keyword).strip()
        ]
        if label and keywords:
            joined_keywords = "、".join(keywords)
            lines.append(f"- 大家围着 {label} 打转，关键词主要是 {joined_keywords}。")

    lines.append("【英雄与对局焦点】")
    for mention in hero_mentions[:4]:
        hero_name = text_of(mention, "hero")
        mention_count = int_of(mention, "mention_count")
        sample_text = first_sample(mention, 36)
        if hero_name and sample_text:
            lines.append(f"- {hero_name}被点名 {mention_count} 次，弹幕现场直接聊到「{sample_text}」。")

    lines.append("【今日笑点】")
    if peak_buckets:
        top_bucket = peak_buckets[0]
        # [-8:-3] keeps the HH:MM part of a "... HH:MM:SS" timestamp string.
        peak_hm = str(top_bucket.get("start_time") or "")[-8:-3]
        lines.append(f"- {peak_hm} 前后是最热窗口，弹幕量直接冲到 {int_of(top_bucket, 'message_count')} 条。")
    if repeated_messages:
        top_repeat = repeated_messages[0]
        lines.append(f"- 复读冠军是「{text_of(top_repeat, 'text')[:24]}」，一天被刷了 {int_of(top_repeat, 'count')} 次。")
    if burst_terms:
        top_burst = burst_terms[0]
        lines.append(f"- 情绪词「{text_of(top_burst, 'text')}」集中爆了 {int_of(top_burst, 'count')} 次。")

    lines.append("【弹幕名场面】")
    for message in representative_messages[:5]:
        nickname = text_of(message, "nickname") or "观众"
        content = text_of(message, "content")
        if content:
            lines.append(f"- {nickname}：{content[:44]}")

    lines.append("【梗王榜】")
    for repeat in repeated_messages[:3]:
        lines.append(f"- {text_of(repeat, 'text')[:28]}｜复读 {int_of(repeat, 'count')} 次")

    lines.append("【收尾播报】")
    lines.append("- 本地预览版已经把有效信息和乐子一起塞进同一张图里了。")
    return "\n".join(lines)


def run_local_test(file_path: str) -> str:
    """
    Run the local refinement pipeline on one danmu sample file and dump the result as JSON.

    The file stem must look like ``<room_id>_<YYYYMMDD>[anything]``; the room id
    and the anchor day are derived from it. The name is validated before any
    heavy work so a badly named file fails fast with a clear message instead of
    an unpacking error or a garbage ``anchor_day``.

    Args:
        file_path: Path to the local danmu text sample.

    Returns:
        Path (as ``str``) of the written
        ``temp/douyu_materials/<stem>_local_test_result.json`` file, located
        under the current working directory.

    Raises:
        ValueError: If the file stem does not follow ``<room_id>_<YYYYMMDD>``
            or the eight digits are not a real calendar date.
    """
    source_path = Path(file_path)
    file_name = source_path.stem
    room_id, separator, date_key = file_name.partition("_")
    date_digits = date_key[:8]
    if not separator or len(date_digits) != 8 or not (date_digits.isascii() and date_digits.isdigit()):
        raise ValueError(
            f"Sample file name must look like '<room_id>_<YYYYMMDD>', got: {file_name!r}"
        )
    try:
        # Used purely as a calendar check (rejects e.g. month 13).
        datetime.strptime(date_digits, "%Y%m%d")
    except ValueError as exc:
        raise ValueError(f"Invalid date {date_digits!r} in sample file name {file_name!r}") from exc
    anchor_day = f"{date_digits[:4]}-{date_digits[4:6]}-{date_digits[6:8]}"

    helper = _load_helper()
    resolved_path = str(source_path.resolve())
    messages = helper.load_messages_from_file(resolved_path)
    session = _build_session(room_id, anchor_day, messages)
    payload = helper.build_llm_payload(room_id, session, messages)

    compact = payload.get("compact_prompt_assets") or {}
    fact_hints = compact.get("semantic_fact_hints") or {}
    # Each list is capped so the JSON dump stays small enough to read by hand.
    result = {
        "file_path": resolved_path,
        "message_count": len(messages),
        "session_meta": payload.get("session_meta") or {},
        "local_stats_preview": {
            "top_repeated_messages": (payload.get("repeated_messages") or [])[:10],
            "top_burst_terms": (payload.get("burst_terms") or [])[:10],
            "peak_buckets": (payload.get("peak_buckets") or [])[:6],
        },
        "topic_evidence_clusters": (fact_hints.get("topic_clusters") or [])[:8],
        "hero_mentions": (fact_hints.get("hero_mentions") or [])[:8],
        "content_cues": (compact.get("content_cues") or [])[:16],
        "timeline_digest": (compact.get("timeline_digest") or [])[:12],
        "representative_messages": (payload.get("representative_messages") or [])[:18],
    }

    output_dir = Path.cwd() / "temp" / "douyu_materials"
    output_dir.mkdir(parents=True, exist_ok=True)
    output_path = output_dir / f"{file_name}_local_test_result.json"
    output_path.write_text(json.dumps(result, ensure_ascii=False, indent=2), encoding="utf-8")
    return str(output_path)


def render_fans_preview_from_file(file_path: str) -> str:
    """
    Produce the fans daily report HTML preview straight from a local danmu file.

    Having one command that goes from raw sample to rendered HTML makes it quick
    to check the final look after touching the refinement logic or the template.

    Steps: run the local test (which writes a JSON result), read that JSON back,
    convert it to the template payload, build the sample report text, render the
    HTML with ``report_template.render_fans_daily_report_html`` and write it to
    ``temp/douyu_materials/<stem>_fans_template_preview.html`` under the current
    working directory.

    Args:
        file_path: Path to the local danmu text sample.

    Returns:
        Path (as ``str``) of the written HTML preview file.
    """
    # Round-trip through the JSON file written by the local test run.
    result_json = Path(run_local_test(file_path)).read_text(encoding="utf-8")
    template_payload = _build_preview_template_payload(json.loads(result_json))
    preview_text = _build_preview_report_text(template_payload)

    template_module = _load_report_template_module()
    rendered_html = template_module.render_fans_daily_report_html(
        payload=template_payload,
        fans_report_text=preview_text,
    )

    target_dir = Path.cwd().joinpath("temp", "douyu_materials")
    target_dir.mkdir(parents=True, exist_ok=True)
    stem = Path(file_path).stem
    target_file = target_dir.joinpath(f"{stem}_fans_template_preview.html")
    target_file.write_text(rendered_html, encoding="utf-8")
    return str(target_file)


if __name__ == "__main__":
    # Locate the bundled samples relative to this script rather than through
    # backslash-separated, cwd-relative strings: those only worked on Windows
    # and only when launched from the project root. On POSIX a backslash is an
    # ordinary file-name character, so the whole string became one file name.
    sample_dir = Path(__file__).resolve().parent / "danmu_test"
    sample_files = [
        sample_dir / "52876_20260428.txt",
        sample_dir / "52876_20260429.txt",
    ]
    for sample in sample_files:
        sample_path = str(sample)
        # NOTE: render_fans_preview_from_file() runs run_local_test() again
        # internally, so the JSON result is produced twice per sample; the
        # explicit call is kept so its output path can be printed.
        result_path = run_local_test(sample_path)
        preview_path = render_fans_preview_from_file(sample_path)
        print(result_path)
        print(preview_path)