将@关系批处理业务迁移到 value_rank 插件

- 从 MessageStorageDB 移除@抽取与社交图写入逻辑，消息层仅保留归档职责
- 从系统级任务移除 process_pending_mentions，取消 message_to_db 中对应入口
- 在 value_rank 插件新增定时动作 value_rank_mentions_extract（每10分钟）
- 在插件内实现窗口化批处理（默认10~20分钟前）、@提取、幂等写入明细/边表/日汇总及 unique_interactors 回填
- 新增插件侧可配置参数 mention_batch_size / mention_window_start_minutes / mention_window_end_minutes
2026-04-21 14:10:25 +08:00
parent d60d496bc3
commit d64d11a384
5 changed files with 359 additions and 460 deletions
--- a/plugins/value_rank/config.toml
+++ b/plugins/value_rank/config.toml
@@ -28,3 +28,8 @@ base_score_scale = 1000
 # 排行默认展示数量
 default_rank_limit = 10
 max_rank_limit = 50
+
+# @关系批处理（插件定时任务）参数
+mention_batch_size = 200
+mention_window_start_minutes = 20
+mention_window_end_minutes = 10
--- a/plugins/value_rank/main.py
+++ b/plugins/value_rank/main.py
@@ -1,6 +1,8 @@
 # -*- coding: utf-8 -*-
 import json
 import math
+import re
+import xml.etree.ElementTree as ET
 from datetime import datetime, timedelta
 from typing import Any, Dict, List, Optional, Tuple

@@ -268,6 +270,145 @@ class ValueRankDB(BaseDBOperator):
            result[user_id] = float(row.get("score") or 0.0)
        return result

+    def get_pending_mention_extract_messages_for_group(
+        self,
+        group_id: str,
+        limit: int,
+        window_start_minutes: int,
+        window_end_minutes: int,
+    ) -> List[Dict[str, Any]]:
+        """按群读取待处理@抽取消息。"""
+        sql = """
+        SELECT message_id, group_id, sender, message_xml, timestamp
+        FROM messages
+        WHERE group_id = %s
+          AND (mentioned_user_ids IS NULL OR mentioned_user_ids = '')
+          AND message_xml IS NOT NULL
+          AND message_xml <> ''
+          AND timestamp >= DATE_SUB(NOW(), INTERVAL %s MINUTE)
+          AND timestamp < DATE_SUB(NOW(), INTERVAL %s MINUTE)
+        ORDER BY timestamp ASC
+        LIMIT %s
+        """
+        return self.execute_query(sql, (group_id, window_start_minutes, window_end_minutes, limit)) or []
+
+    def update_message_mentioned_user_ids(
+        self,
+        message_id: str,
+        group_id: str,
+        sender_id: str,
+        mentioned_user_ids_json: str,
+    ) -> bool:
+        """回填消息表的 mentioned_user_ids 字段。"""
+        return self.execute_update(
+            """
+            UPDATE messages
+            SET mentioned_user_ids = %s
+            WHERE message_id = %s
+              AND group_id = %s
+              AND sender = %s
+            """,
+            (mentioned_user_ids_json, message_id, group_id, sender_id),
+        )
+
+    def get_existing_mentions(self, message_id: str, group_id: str, sender_id: str) -> List[str]:
+        """查询某条消息已经入库的@关系，避免重复累加。"""
+        rows = self.execute_query(
+            """
+            SELECT mentioned_user_id
+            FROM t_message_mentions
+            WHERE message_id = %s
+              AND group_id = %s
+              AND sender_id = %s
+            """,
+            (message_id, group_id, sender_id),
+        ) or []
+        return [str(r.get("mentioned_user_id") or "").strip() for r in rows if str(r.get("mentioned_user_id") or "").strip()]
+
+    def insert_message_mentions(self, rows: List[Tuple[Any, ...]]) -> bool:
+        """批量写入@明细。"""
+        if not rows:
+            return True
+        return self.execute_batch(
+            """
+            INSERT IGNORE INTO t_message_mentions
+            (message_id, group_id, sender_id, mentioned_user_id, stat_date, msg_time)
+            VALUES (%s, %s, %s, %s, %s, %s)
+            """,
+            rows,
+        )
+
+    def upsert_social_edges_daily(self, rows: List[Tuple[Any, ...]]) -> bool:
+        """批量累加社交边。"""
+        if not rows:
+            return True
+        return self.execute_batch(
+            """
+            INSERT INTO t_social_edges_daily
+            (stat_date, group_id, from_user_id, to_user_id, mention_count, interaction_score)
+            VALUES (%s, %s, %s, %s, %s, %s)
+            ON DUPLICATE KEY UPDATE
+                mention_count = mention_count + VALUES(mention_count),
+                interaction_score = interaction_score + VALUES(interaction_score),
+                update_time = CURRENT_TIMESTAMP
+            """,
+            rows,
+        )
+
+    def upsert_social_daily_row(self, row: Tuple[Any, ...]) -> bool:
+        """写入或更新单个用户的社交日汇总。"""
+        return self.execute_update(
+            """
+            INSERT INTO t_value_rank_social_daily
+            (stat_date, group_id, user_id, mentioned_count, mention_others_count, unique_interactors, interaction_score)
+            VALUES (%s, %s, %s, %s, %s, %s, %s)
+            ON DUPLICATE KEY UPDATE
+                mentioned_count = mentioned_count + VALUES(mentioned_count),
+                mention_others_count = mention_others_count + VALUES(mention_others_count),
+                interaction_score = interaction_score + VALUES(interaction_score),
+                update_time = CURRENT_TIMESTAMP
+            """,
+            row,
+        )
+
+    def refresh_unique_interactors(self, stat_date: str, group_id: str, user_ids: List[str]) -> None:
+        """回填去重互动人数。"""
+        deduped = []
+        seen = set()
+        for uid in user_ids:
+            normalized = str(uid or "").strip()
+            if not normalized or normalized in seen:
+                continue
+            seen.add(normalized)
+            deduped.append(normalized)
+
+        for uid in deduped:
+            row = self.execute_query(
+                """
+                SELECT COUNT(DISTINCT partner_id) AS partner_count
+                FROM (
+                    SELECT mentioned_user_id AS partner_id
+                    FROM t_message_mentions
+                    WHERE stat_date = %s AND group_id = %s AND sender_id = %s
+                    UNION
+                    SELECT sender_id AS partner_id
+                    FROM t_message_mentions
+                    WHERE stat_date = %s AND group_id = %s AND mentioned_user_id = %s
+                ) t
+                """,
+                (stat_date, group_id, uid, stat_date, group_id, uid),
+                fetch_one=True,
+            ) or {}
+            partner_count = int(row.get("partner_count") or 0)
+            self.execute_update(
+                """
+                UPDATE t_value_rank_social_daily
+                SET unique_interactors = %s, update_time = CURRENT_TIMESTAMP
+                WHERE stat_date = %s AND group_id = %s AND user_id = %s
+                """,
+                (partner_count, stat_date, group_id, uid),
+            )
+

 class ValueRankPlugin(MessagePluginInterface):
    """群成员身价排行插件。
@@ -337,6 +478,9 @@ class ValueRankPlugin(MessagePluginInterface):

        self.default_rank_limit = 10
        self.max_rank_limit = 50
+        self.mention_batch_size = 200
+        self.mention_window_start_minutes = 20
+        self.mention_window_end_minutes = 10

    def initialize(self, context: Dict[str, Any]) -> bool:
        """初始化插件与配置。"""
@@ -361,6 +505,9 @@ class ValueRankPlugin(MessagePluginInterface):

        self.default_rank_limit = int(cfg.get("default_rank_limit", self.default_rank_limit))
        self.max_rank_limit = int(cfg.get("max_rank_limit", self.max_rank_limit))
+        self.mention_batch_size = int(cfg.get("mention_batch_size", self.mention_batch_size))
+        self.mention_window_start_minutes = int(cfg.get("mention_window_start_minutes", self.mention_window_start_minutes))
+        self.mention_window_end_minutes = int(cfg.get("mention_window_end_minutes", self.mention_window_end_minutes))

        # 权重归一化：避免配置误差导致总权重不为 1。
        weight_sum = self.points_weight + self.message_weight + self.active_days_weight + self.social_weight
@@ -470,6 +617,17 @@ class ValueRankPlugin(MessagePluginInterface):
                "payload": {},
                "default_enabled": True,
            },
+            {
+                "action_key": "value_rank_mentions_extract",
+                "name": "@关系批处理",
+                "description": "每10分钟批量抽取10-20分钟前的@关系并更新社交图",
+                "trigger_type": "every_seconds",
+                "trigger_config": {"seconds": 600},
+                "target_scope": "all_enabled_groups",
+                "target_config": {},
+                "payload": {},
+                "default_enabled": True,
+            },
            {
                "action_key": "value_rank_weekly_report_push",
                "name": "身价周报推送",
@@ -486,7 +644,7 @@ class ValueRankPlugin(MessagePluginInterface):

    async def run_scheduled_action(self, action_key: str, context: Dict[str, Any]) -> Dict[str, Any]:
        """执行调度动作。"""
-        if action_key not in {"value_rank_daily_recompute", "value_rank_weekly_report_push"}:
+        if action_key not in {"value_rank_daily_recompute", "value_rank_weekly_report_push", "value_rank_mentions_extract"}:
            return {"success": False, "summary": f"不支持动作: {action_key}", "detail": {}}

        target_groups = [str(g).strip() for g in (context.get("target_groups") or []) if str(g).strip()]
@@ -504,6 +662,35 @@ class ValueRankPlugin(MessagePluginInterface):
        stat_date = datetime.now().strftime("%Y-%m-%d")
        bot = context.get("bot") or getattr(self, "bot", None)

+        # @抽取任务不依赖 bot。
+        if action_key == "value_rank_mentions_extract":
+            total_stats = {"total": 0, "processed": 0, "with_mentions": 0, "failed": 0}
+            for gid in target_groups:
+                try:
+                    stats = self._process_pending_mentions_for_group(gid)
+                    total_stats["total"] += int(stats.get("total", 0))
+                    total_stats["processed"] += int(stats.get("processed", 0))
+                    total_stats["with_mentions"] += int(stats.get("with_mentions", 0))
+                    total_stats["failed"] += int(stats.get("failed", 0))
+                    success_groups.append(gid)
+                except Exception as e:
+                    failed_groups[gid] = str(e)
+
+            return {
+                "success": len(failed_groups) == 0,
+                "summary": (
+                    f"@关系批处理完成: 读取{total_stats['total']}条, "
+                    f"处理{total_stats['processed']}条, 含@{total_stats['with_mentions']}条, "
+                    f"失败{total_stats['failed']}条, 异常群{len(failed_groups)}个"
+                ),
+                "detail": {
+                    "window": f"[NOW-{self.mention_window_start_minutes}m, NOW-{self.mention_window_end_minutes}m)",
+                    "batch_size": self.mention_batch_size,
+                    "stats": total_stats,
+                    "failed_groups": failed_groups,
+                },
+            }
+
        # 周报任务先确保当日快照存在，再执行推送，避免“有报表命令但无数据”。
        if action_key == "value_rank_weekly_report_push" and not bot:
            return {"success": False, "summary": "周报推送失败：bot 未注入", "detail": {}}
@@ -864,6 +1051,172 @@ class ValueRankPlugin(MessagePluginInterface):
        lines.append("提示：分数由积分/发言/活跃/社交影响力综合计算。")
        return "\n".join(lines)

+    def _process_pending_mentions_for_group(self, group_id: str) -> Dict[str, int]:
+        """处理单群待抽取@消息（插件内定时业务）。"""
+        if not self.db:
+            return {"total": 0, "processed": 0, "with_mentions": 0, "failed": 0}
+
+        started_at = datetime.now()
+        window_start = max(int(self.mention_window_start_minutes), 1)
+        window_end = max(int(self.mention_window_end_minutes), 0)
+        if window_start <= window_end:
+            window_start = window_end + 10
+            self.LOG.warning(
+                f"[{self.name}] @窗口参数异常已修正: group={group_id}, "
+                f"window_start={self.mention_window_start_minutes}, "
+                f"window_end={self.mention_window_end_minutes}, fixed=[{window_start},{window_end}]"
+            )
+
+        rows = self.db.get_pending_mention_extract_messages_for_group(
+            group_id=group_id,
+            limit=self.mention_batch_size,
+            window_start_minutes=window_start,
+            window_end_minutes=window_end,
+        )
+        if not rows:
+            return {"total": 0, "processed": 0, "with_mentions": 0, "failed": 0}
+
+        processed, with_mentions, failed = 0, 0, 0
+        fail_samples: List[str] = []
+
+        for idx, row in enumerate(rows, start=1):
+            try:
+                message_id = str(row.get("message_id") or "").strip()
+                sender_id = str(row.get("sender") or "").strip()
+                raw_xml = str(row.get("message_xml") or "")
+                msg_time = self._safe_parse_message_time(row.get("timestamp"))
+
+                mentioned_ids = self._extract_mentioned_user_ids(raw_xml)
+                mentioned_ids_json = json.dumps(mentioned_ids, ensure_ascii=False)
+                self.db.update_message_mentioned_user_ids(
+                    message_id=message_id,
+                    group_id=group_id,
+                    sender_id=sender_id,
+                    mentioned_user_ids_json=mentioned_ids_json,
+                )
+
+                self._persist_mention_graph_data(
+                    group_id=group_id,
+                    sender_id=sender_id,
+                    message_id=message_id,
+                    mentioned_user_ids=mentioned_ids,
+                    msg_time=msg_time,
+                )
+
+                processed += 1
+                if mentioned_ids:
+                    with_mentions += 1
+                if idx <= 2:
+                    self.LOG.debug(
+                        f"[{self.name}] @抽取样本: group={group_id}, msg={message_id}, "
+                        f"sender={sender_id}, mentioned_count={len(mentioned_ids)}"
+                    )
+            except Exception as e:
+                failed += 1
+                if len(fail_samples) < 5:
+                    fail_samples.append(str(row.get("message_id") or ""))
+                self.LOG.error(f"[{self.name}] @抽取失败: group={group_id}, message_id={row.get('message_id')}, error={e}")
+
+        elapsed_ms = int((datetime.now() - started_at).total_seconds() * 1000)
+        stats = {"total": len(rows), "processed": processed, "with_mentions": with_mentions, "failed": failed}
+        self.LOG.info(
+            f"[{self.name}] @批处理完成: group={group_id}, total={stats['total']}, processed={processed}, "
+            f"with_mentions={with_mentions}, failed={failed}, cost={elapsed_ms}ms, fail_samples={fail_samples}"
+        )
+        return stats
+
+    @staticmethod
+    def _safe_parse_message_time(value: Any) -> datetime:
+        """安全解析消息时间，失败时回退到当前时间。"""
+        if isinstance(value, datetime):
+            return value
+        text = str(value or "").strip()
+        if not text:
+            return datetime.now()
+        try:
+            return datetime.strptime(text, "%Y-%m-%d %H:%M:%S")
+        except Exception:
+            return datetime.now()
+
+    @staticmethod
+    def _extract_mentioned_user_ids(raw_xml: str) -> List[str]:
+        """从消息 XML 提取@用户ID清单。"""
+        raw_xml = str(raw_xml or "")
+        if not raw_xml:
+            return []
+
+        at_user_list_text = ""
+        try:
+            root = ET.fromstring(raw_xml)
+            node = root.find(".//atuserlist")
+            if node is not None and node.text:
+                at_user_list_text = str(node.text).strip()
+        except Exception:
+            match = re.search(r"<atuserlist><!\[CDATA\[(.*?)\]\]></atuserlist>", raw_xml, flags=re.IGNORECASE | re.DOTALL)
+            if match:
+                at_user_list_text = str(match.group(1) or "").strip()
+
+        if not at_user_list_text:
+            return []
+
+        raw_ids = re.split(r"[,\s;]+", at_user_list_text)
+        seen = set()
+        result: List[str] = []
+        for uid in raw_ids:
+            normalized = str(uid or "").strip()
+            if not normalized or normalized in seen:
+                continue
+            seen.add(normalized)
+            result.append(normalized)
+        return result
+
+    def _persist_mention_graph_data(
+        self,
+        group_id: str,
+        sender_id: str,
+        message_id: str,
+        mentioned_user_ids: List[str],
+        msg_time: datetime,
+    ) -> None:
+        """落盘社交图增量数据（明细 + 边 + 个人日汇总）。"""
+        if not self.db or not group_id or not sender_id or not message_id:
+            return
+
+        invalid_mentions = {"notify@all", "all", "@all"}
+        clean_ids: List[str] = []
+        seen = set()
+        for uid in mentioned_user_ids:
+            normalized = str(uid or "").strip()
+            if (not normalized or normalized in invalid_mentions or normalized == sender_id or normalized in seen):
+                continue
+            seen.add(normalized)
+            clean_ids.append(normalized)
+
+        if not clean_ids:
+            return
+
+        existing = set(self.db.get_existing_mentions(message_id, group_id, sender_id))
+        new_ids = [uid for uid in clean_ids if uid not in existing]
+        if not new_ids:
+            return
+
+        stat_date = msg_time.strftime("%Y-%m-%d")
+        msg_time_text = msg_time.strftime("%Y-%m-%d %H:%M:%S")
+
+        mention_rows = [(message_id, group_id, sender_id, uid, stat_date, msg_time_text) for uid in new_ids]
+        self.db.insert_message_mentions(mention_rows)
+
+        edge_rows = [(stat_date, group_id, sender_id, uid, 1, 1.0) for uid in new_ids]
+        self.db.upsert_social_edges_daily(edge_rows)
+
+        # 发起方：主动@次数 + 互动分
+        self.db.upsert_social_daily_row((stat_date, group_id, sender_id, 0, len(new_ids), 0, float(len(new_ids))))
+        # 接收方：被@次数 + 互动分
+        for uid in new_ids:
+            self.db.upsert_social_daily_row((stat_date, group_id, uid, 1, 0, 0, 1.0))
+
+        self.db.refresh_unique_interactors(stat_date, group_id, [sender_id, *new_ids])
+
    def _build_explain_text(self) -> str:
        """输出算法说明文本。"""
        return (