为自动回复增加群画像数据库快照缓存

2026-04-24 16:21:00 +08:00
parent 8a813df4a3
commit fa9dc44bbe
3 changed files with 236 additions and 1 deletions
--- a/db/group_profile_snapshot_db.py
+++ b/db/group_profile_snapshot_db.py
@@ -0,0 +1,109 @@
 # -*- coding: utf-8 -*-
 import json
 from datetime import datetime
 from typing import Dict, Optional
 from db.base import BaseDBOperator
 from db.connection import DBConnectionManager
 class GroupProfileSnapshotDBOperator(BaseDBOperator):
    """Database access for cached group-profile snapshots.

    Each chat room owns at most one row in ``t_group_profile_snapshot``
    (enforced by the unique key on ``chatroom_id``); saving again for the
    same room overwrites every other column of that row.
    """

    def __init__(self, db_manager: DBConnectionManager):
        """Attach to ``db_manager`` and create the snapshot table if missing."""
        super().__init__(db_manager)
        self._create_tables()

    def _create_tables(self):
        """Create ``t_group_profile_snapshot`` when it does not exist yet.

        A failing DDL statement is logged and swallowed, so constructing the
        operator does not raise because of it.
        """
        try:
            self.execute_update("""
            CREATE TABLE IF NOT EXISTS t_group_profile_snapshot (
                id INT AUTO_INCREMENT PRIMARY KEY,
                chatroom_id VARCHAR(64) NOT NULL COMMENT '群聊ID',
                group_name VARCHAR(128) DEFAULT '' COMMENT '群名称',
                profile_json LONGTEXT COMMENT '群画像快照JSON',
                source_summary_latest_at DATETIME NULL COMMENT '构建时参考的最近群总结更新时间',
                source_message_latest_at DATETIME NULL COMMENT '构建时参考的最近群消息时间',
                source_summary_count INT NOT NULL DEFAULT 0 COMMENT '构建时参考的群总结条数',
                source_message_sample_count INT NOT NULL DEFAULT 0 COMMENT '构建时参考的消息样本数',
                last_generated_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '最后一次生成时间',
                create_time DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
                update_time DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间',
                UNIQUE KEY idx_group_profile_snapshot (chatroom_id),
                KEY idx_group_profile_generated_at (last_generated_at)
            ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='群画像快照表';
            """)
        except Exception as e:
            self.LOG.error(f"创建群画像快照表失败: {e}")

    def get_snapshot(self, chatroom_id: str) -> Optional[Dict]:
        """Fetch the snapshot row stored for ``chatroom_id``.

        Returns:
            The row after ``_deserialize_row`` has decoded it, the falsy query
            result when no row matches, or ``None`` when the query raises
            (the error is logged).
        """
        try:
            sql = """
            SELECT *
            FROM t_group_profile_snapshot
            WHERE chatroom_id = %s
            LIMIT 1
            """
            row = self.execute_query(sql, (chatroom_id,), fetch_one=True)
            return self._deserialize_row(row)
        except Exception as e:
            self.LOG.error(f"获取群画像快照失败: {e}")
            return None

    def save_snapshot(self, snapshot: Dict) -> bool:
        """Insert or overwrite the snapshot row of one chat room.

        Args:
            snapshot: Mapping with ``chatroom_id`` (required), ``group_name``,
                ``profile`` (JSON-serializable), ``source_summary_latest_at``,
                ``source_message_latest_at``, ``source_summary_count``,
                ``source_message_sample_count`` and optionally
                ``last_generated_at``.

        Returns:
            The result of ``execute_update``; ``False`` when ``chatroom_id`` is
            empty or when building/executing the statement raises.
        """
        try:
            chatroom_id = snapshot.get("chatroom_id") or ""
            if not chatroom_id:
                # Without a room id every caller would upsert into the same
                # '' row of the unique key, so refuse instead of writing.
                self.LOG.error("保存群画像快照失败: chatroom_id 为空")
                return False
            # ``dict.get(key, default)`` only applies the default when the key
            # is absent; an explicit None/"" must also fall back to "now"
            # because the column is declared NOT NULL.
            generated_at = (
                snapshot.get("last_generated_at")
                or datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            )
            data = {
                "chatroom_id": chatroom_id,
                "group_name": snapshot.get("group_name", ""),
                # A missing/None profile is stored as an empty JSON object
                # rather than the JSON literal ``null``.
                "profile_json": json.dumps(snapshot.get("profile") or {}, ensure_ascii=False),
                "source_summary_latest_at": snapshot.get("source_summary_latest_at"),
                "source_message_latest_at": snapshot.get("source_message_latest_at"),
                "source_summary_count": int(snapshot.get("source_summary_count", 0) or 0),
                "source_message_sample_count": int(snapshot.get("source_message_sample_count", 0) or 0),
                "last_generated_at": generated_at,
            }
            fields = ", ".join(data)
            placeholders = ", ".join(["%s"] * len(data))
            # chatroom_id is the unique key that triggers the update branch,
            # so it is the only column not rewritten on conflict.
            update_clause = ", ".join(
                f"{key}=VALUES({key})" for key in data if key != "chatroom_id"
            )
            sql = f"""
            INSERT INTO t_group_profile_snapshot ({fields})
            VALUES ({placeholders})
            ON DUPLICATE KEY UPDATE {update_clause}
            """
            return self.execute_update(sql, tuple(data.values()))
        except Exception as e:
            self.LOG.error(f"保存群画像快照失败: {e}")
            return False

    @staticmethod
    def _deserialize_row(row: Optional[Dict]) -> Optional[Dict]:
        """Decode a raw table row in place and return it.

        ``profile_json`` is parsed into a dict (empty when the column is
        blank, unparsable, or holds a JSON value that is not an object), the
        same dict is exposed under ``profile``, and datetime columns are
        rendered as ``YYYY-MM-DD HH:MM:SS`` strings. A falsy ``row`` is
        returned unchanged.
        """
        if not row:
            return row
        profile: Dict = {}
        raw_profile = row.get("profile_json")
        if raw_profile:
            try:
                decoded = json.loads(raw_profile)
            except (TypeError, ValueError, RecursionError):
                decoded = None
            # Only a JSON object is a usable profile; lists, strings, numbers
            # or ``null`` are treated like a missing payload.
            if isinstance(decoded, dict):
                profile = decoded
        row["profile_json"] = profile
        for key in (
            "source_summary_latest_at",
            "source_message_latest_at",
            "last_generated_at",
            "create_time",
            "update_time",
        ):
            value = row.get(key)
            if isinstance(value, datetime):
                row[key] = value.strftime("%Y-%m-%d %H:%M:%S")
        row["profile"] = profile
        return row
--- a/plugins/ai_auto_response/config.toml
+++ b/plugins/ai_auto_response/config.toml
@@ -195,8 +195,10 @@ debug = true
 # 1. 这里读取最近 5 份群摘要，再聚合成稳定主题/近期重点/未决问题；
 # 2. 自动回复消费时优先走这些结构字段，减少 markdown 大段文本的理解损耗；
 # 3. summary_item_limit 控制每类字段带给模型的条数，避免群背景过重；
 # 4. cache_ttl_sec 让群画像结果落库复用，在短时间内且源数据没变时直接读快照，避免每条消息重复聚合。
 summary_history_limit = 5
 summary_item_limit = 4
 cache_ttl_sec = 600
 [group_profiles.default]
 mode = "social"
--- a/plugins/ai_auto_response/memory/group_memory_profile.py
+++ b/plugins/ai_auto_response/memory/group_memory_profile.py
@@ -3,8 +3,10 @@ from __future__ import annotations
 import json
 import re
 from collections import Counter
 from datetime import datetime
 from typing import Any, Dict, List, Optional
 from db.group_profile_snapshot_db import GroupProfileSnapshotDBOperator
 from db.message_storage import MessageStorageDB
 from db.message_summary_db import MessageSummaryDBOperator
@@ -29,14 +31,31 @@ class GroupMemoryService:
        self.config = config or {}
        self.message_db = MessageStorageDB(db_manager)
        self.summary_db = MessageSummaryDBOperator(db_manager)
        self.snapshot_db = GroupProfileSnapshotDBOperator(db_manager)
        # 群聊自动回复不应该只盯着“昨天那一篇总结”：
        # 1. 日摘要天然是日维度，如果只读最新一条，很容易把短期偶发波动误当成长期背景；
        # 2. 这里改成读取最近几份摘要，再做轻量聚合，能让群长期画像更稳定；
        # 3. 同时保留条数上限，避免群摘要本身反过来把 prompt 挤爆。
        self.summary_history_limit = max(int(self.config.get("summary_history_limit", 5) or 5), 1)
        self.summary_item_limit = max(int(self.config.get("summary_item_limit", 4) or 4), 1)
        # 群画像快照缓存：
        # 1. 自动回复是高频路径，群画像如果每条消息都重新聚合，会重复扫群总结和近期消息；
        # 2. 这里引入数据库快照，只要在 TTL 内且源数据没有变化，就直接复用；
        # 3. 这样快照既能跨进程/重启保留，又能把每条消息的聚合成本压下来。
        self.cache_ttl_sec = max(int(self.config.get("cache_ttl_sec", 600) or 600), 0)
    def build_group_memory_profile(self, room_id: str, group_name: str = "") -> Dict:
        source_summary_latest_at = self._get_latest_summary_time(room_id)
        source_message_latest_at = self._get_latest_group_message_time(room_id)
        cached_profile = self._load_cached_profile_if_fresh(
            room_id=room_id,
            group_name=group_name,
            source_summary_latest_at=source_summary_latest_at,
            source_message_latest_at=source_message_latest_at,
        )
        if cached_profile:
            return cached_profile
        recent_messages = self.message_db.get_messages_for_summary(
            room_id, hours_ago=48, min_messages=20, max_hours=168, max_results=300
        ) or []
@@ -100,7 +119,7 @@ class GroupMemoryService:
            serious_hits=serious_hits,
            short_message_ratio=(short_message_count / message_count) if message_count else 0.0,
        )
-        return {
+        profile = {
            "room_id": room_id,
            "group_name": group_name,
            "inferred_domain": inferred_domain,
@@ -111,7 +130,19 @@ class GroupMemoryService:
            "structured_summary": structured_summary,
            "summary_source_count": len(summary_records),
            "summary_timeline": structured_summary.get("timeline", []) or [],
            "cache_status": "rebuilt",
            "last_generated_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        }
        self._save_profile_snapshot(
            room_id=room_id,
            group_name=group_name,
            profile=profile,
            source_summary_latest_at=source_summary_latest_at,
            source_message_latest_at=source_message_latest_at,
            source_summary_count=len(summary_records),
            source_message_sample_count=len(recent_messages),
        )
        return profile
    @staticmethod
    def _count_hits(text: str, keywords: List[str]) -> int:
@@ -181,6 +212,99 @@ class GroupMemoryService:
            records.append(normalized)
        return records
    def _load_cached_profile_if_fresh(
        self,
        *,
        room_id: str,
        group_name: str,
        source_summary_latest_at: str,
        source_message_latest_at: str,
    ) -> Optional[Dict]:
        if self.cache_ttl_sec <= 0:
            return None
        snapshot = self.snapshot_db.get_snapshot(room_id)
        if not snapshot:
            return None
        # 快照新鲜度判断分两层：
        # 1. 先看快照是否还在 TTL 内，避免长期无限复用旧画像；
        # 2. 再看“最近群总结更新时间 / 最近群消息时间”是否和上次构建时一致，
        #    只有源数据没变，才允许直接复用。
        last_generated_at = self._parse_dt(snapshot.get("last_generated_at"))
        if not last_generated_at:
            return None
        age_sec = max((datetime.now() - last_generated_at).total_seconds(), 0.0)
        if age_sec > float(self.cache_ttl_sec):
            return None
        if str(snapshot.get("source_summary_latest_at", "") or "") != str(source_summary_latest_at or ""):
            return None
        if str(snapshot.get("source_message_latest_at", "") or "") != str(source_message_latest_at or ""):
            return None
        profile = dict(snapshot.get("profile", {}) or {})
        if not profile:
            return None
        profile["group_name"] = group_name or profile.get("group_name", "")
        profile["cache_status"] = "hit"
        profile["last_generated_at"] = str(snapshot.get("last_generated_at", "") or "")
        return profile
    def _save_profile_snapshot(
        self,
        *,
        room_id: str,
        group_name: str,
        profile: Dict,
        source_summary_latest_at: str,
        source_message_latest_at: str,
        source_summary_count: int,
        source_message_sample_count: int,
    ) -> bool:
        return self.snapshot_db.save_snapshot({
            "chatroom_id": room_id,
            "group_name": group_name,
            "profile": profile,
            "source_summary_latest_at": source_summary_latest_at or None,
            "source_message_latest_at": source_message_latest_at or None,
            "source_summary_count": source_summary_count,
            "source_message_sample_count": source_message_sample_count,
            "last_generated_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        })
    def _get_latest_summary_time(self, room_id: str) -> str:
        sql = """
        SELECT COALESCE(MAX(update_time), MAX(last_generated_at), MAX(period_end)) AS latest_time
        FROM t_message_summary
        WHERE chatroom_id = %s
          AND summary_type IN ('daily', 'manual')
        """
        row = self.summary_db.execute_query(sql, (room_id,), fetch_one=True) or {}
        return self._normalize_dt(row.get("latest_time"))
    def _get_latest_group_message_time(self, room_id: str) -> str:
        latest_message = self.message_db.get_group_last_message(room_id) or {}
        return self._normalize_dt(latest_message.get("timestamp"))
    @staticmethod
    def _normalize_dt(value: Any) -> str:
        if isinstance(value, datetime):
            return value.strftime("%Y-%m-%d %H:%M:%S")
        text = str(value or "").strip()
        return text
    @staticmethod
    def _parse_dt(value: Any) -> Optional[datetime]:
        text = str(value or "").strip()
        if not text:
            return None
        for fmt, size in (("%Y-%m-%d %H:%M:%S", 19), ("%Y-%m-%d", 10)):
            try:
                return datetime.strptime(text[:size], fmt)
            except Exception:
                continue
        return None
    def _build_structured_summary_digest(self, records: List[Dict]) -> Dict:
        if not records:
            return {