feat: 重构成员画像为日周月分层沉淀链路并增强后台摘要能力
本次提交围绕成员画像插件进行了较大升级,核心目标是把原来偏单次、偏近期的成员交互摘要,升级为可随时间沉淀的分层画像能力。

主要功能变更如下:

1. 新增成员分层摘要存储表 t_member_digest,并提供对应的数据库操作层,支持按成员、按群、按摘要类型(daily/weekly/monthly)持久化周期性摘要结果。
2. 在 member_context 插件内新增 MemberDigestService,把画像生成拆分为日摘要、周摘要、月摘要三级处理流程,再由最终画像服务消费这些分层摘要,减少直接反复处理大量原始消息带来的成本和失真。
3. 新增提示词构建模块,分别为日级观察、周级归纳、月级归纳以及最终画像整理提供独立提示词,强调中性、克制、避免敏感推断,并将长期特征与近期状态明确分层。
4. 重写成员最终画像生成逻辑,优先基于日/周/月摘要融合出长期特征、习惯模式、长期回复偏好、近期状态等信息,再用 AI 对分层摘要做最终整理,避免仅依赖近 30 天消息得出偏短期结论。
5. 保留并增强长期画像融合逻辑,通过打分、衰减和重复证据累积,使长期特征随着时间逐步稳定,而不会被单次刷新完全覆盖。
6. 在消息存储层补充成员按时间增量获取、按活跃日期统计、按天取消息等查询方法,为后续分层摘要生成提供数据支撑。
7. 扩展 member_context 插件配置,增加日级摘要消息上限、日摘要最小消息数、单次回填的日摘要数量上限、最终画像使用的日/周/月摘要数量等参数,便于在准确性和系统负载之间做平衡。
8. 后台成员摘要详情页新增长期沟通倾向、长期特征、习惯模式、长期回复偏好、近期状态、历史样本数、分层摘要数量等展示字段,方便观察画像沉淀程度。
9. 优化后台查看成员摘要接口逻辑:首次打开如果还没有摘要,不再同步阻塞生成,而是返回未就绪状态,配合后台手动异步刷新,降低页面卡顿和接口阻塞风险。
10. 增强刷新日志,单成员和群级刷新会输出当前刷新模式以及日/周/月摘要数量,便于排查画像构建进度。
11. 调整当前日、当前周、当前月摘要的重算逻辑,确保新增日摘要写入后,本周和本月摘要不会长期停留在旧版本。

本次提交后,成员画像能力从“基于近期样本的单层摘要”升级为“基于时间沉淀的分层画像管线”,为后续把画像稳定接入 AI 自动回复上下文打下基础,同时尽量保持现有群权限控制和后台异步刷新方式不变。
This commit is contained in:
157
db/member_digest_db.py
Normal file
157
db/member_digest_db.py
Normal file
@@ -0,0 +1,157 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import json
|
||||
from datetime import datetime
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
from db.base import BaseDBOperator
|
||||
from db.connection import DBConnectionManager
|
||||
|
||||
|
||||
class MemberDigestDBOperator(BaseDBOperator):
    """Database operations for layered (daily/weekly/monthly) member digests.

    Rows live in ``t_member_digest`` and are unique per
    ``(chatroom_id, wxid, digest_type, period_key)``. Every method here
    catches its own exceptions, logs them through ``self.LOG`` and returns a
    neutral value (``False``, ``None`` or ``[]``) instead of raising.
    """

    def __init__(self, db_manager: DBConnectionManager):
        """Bind the connection manager and make sure the digest table exists."""
        super().__init__(db_manager)
        self._create_tables()

    def _create_tables(self):
        """Create ``t_member_digest`` if it does not exist yet.

        A failing DDL statement is logged and swallowed, so building the
        operator does not raise because of it.
        """
        try:
            self.execute_update("""
                CREATE TABLE IF NOT EXISTS t_member_digest (
                    id INT AUTO_INCREMENT PRIMARY KEY,
                    chatroom_id VARCHAR(64) NOT NULL COMMENT '群聊ID',
                    wxid VARCHAR(64) NOT NULL COMMENT '成员微信ID',
                    digest_type VARCHAR(16) NOT NULL COMMENT '摘要类型 daily|weekly|monthly',
                    period_key VARCHAR(32) NOT NULL COMMENT '周期主键',
                    period_start DATETIME NULL COMMENT '周期开始时间',
                    period_end DATETIME NULL COMMENT '周期结束时间',
                    display_name VARCHAR(128) COMMENT '成员展示名',
                    source_count INT DEFAULT 0 COMMENT '源数据条数',
                    summary_text TEXT COMMENT '摘要说明',
                    structured_json LONGTEXT COMMENT '结构化摘要JSON',
                    meta_json LONGTEXT COMMENT '附加元数据JSON',
                    last_generated_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '最后生成时间',
                    create_time DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
                    update_time DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间',
                    UNIQUE KEY idx_member_digest (chatroom_id, wxid, digest_type, period_key),
                    KEY idx_digest_lookup (chatroom_id, wxid, digest_type, period_end)
                ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='成员分层摘要表';
            """)
        except Exception as e:
            self.LOG.error(f"创建成员分层摘要表失败: {e}")

    def save_digest(self, digest: Dict) -> bool:
        """Insert one digest row, or update it when its unique key already exists.

        Args:
            digest: Mapping read with ``.get``. Keys used: ``chatroom_id``,
                ``wxid``, ``digest_type``, ``period_key``, ``period_start``,
                ``period_end``, ``display_name``, ``source_count``,
                ``summary_text``, ``structured``, ``meta`` and
                ``last_generated_at``. ``structured`` and ``meta`` are stored
                as JSON text; a missing or ``None`` value is stored as ``{}``.
                A missing or empty ``last_generated_at`` defaults to the
                current local time.

        Returns:
            Whatever ``execute_update`` returns for the upsert, or ``False``
            when building or executing the statement raises.
        """
        try:
            structured = digest.get("structured")
            meta = digest.get("meta")
            generated_at = digest.get("last_generated_at")
            data = {
                "chatroom_id": digest.get("chatroom_id", ""),
                "wxid": digest.get("wxid", ""),
                "digest_type": digest.get("digest_type", ""),
                "period_key": digest.get("period_key", ""),
                "period_start": digest.get("period_start"),
                "period_end": digest.get("period_end"),
                "display_name": digest.get("display_name", ""),
                "source_count": digest.get("source_count", 0),
                "summary_text": digest.get("summary_text", ""),
                # An explicit None must not be persisted as the JSON text "null".
                "structured_json": json.dumps({} if structured is None else structured, ensure_ascii=False),
                "meta_json": json.dumps({} if meta is None else meta, ensure_ascii=False),
                # The column is NOT NULL, so an explicit None falls back to "now".
                "last_generated_at": generated_at or datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            }
            key_columns = ("chatroom_id", "wxid", "digest_type", "period_key")
            # Column names come from the fixed dict above, never from caller
            # input, so interpolating them into the statement is safe.
            fields = ", ".join(data)
            placeholders = ", ".join(["%s"] * len(data))
            # NOTE(review): VALUES() in this clause is deprecated on recent
            # MySQL releases in favour of row aliases; kept so the statement
            # still runs on older servers. Confirm the target server version.
            update_clause = ", ".join(
                f"{key}=VALUES({key})" for key in data if key not in key_columns
            )
            sql = f"""
                INSERT INTO t_member_digest ({fields})
                VALUES ({placeholders})
                ON DUPLICATE KEY UPDATE {update_clause}
            """
            return self.execute_update(sql, tuple(data.values()))
        except Exception as e:
            self.LOG.error(f"保存成员分层摘要失败: {e}")
            return False

    def get_digest(self, chatroom_id: str, wxid: str, digest_type: str, period_key: str) -> Optional[Dict]:
        """Fetch the digest stored under one exact unique key.

        Returns:
            The row after ``_deserialize_row``, the falsy value the query
            returned when nothing matched, or ``None`` when the query raises.
        """
        try:
            sql = """
                SELECT *
                FROM t_member_digest
                WHERE chatroom_id = %s AND wxid = %s AND digest_type = %s AND period_key = %s
                LIMIT 1
            """
            row = self.execute_query(sql, (chatroom_id, wxid, digest_type, period_key), fetch_one=True)
            return self._deserialize_row(row)
        except Exception as e:
            self.LOG.error(f"获取成员分层摘要失败: {e}")
            return None

    def list_digests(self, chatroom_id: str, wxid: str, digest_type: str, limit: int = 20) -> List[Dict]:
        """List a member's digests of one type, newest period first.

        Args:
            limit: Maximum number of rows. A value of zero or less yields an
                empty list without querying the database.

        Returns:
            Deserialized rows ordered by ``period_end`` then ``period_key``,
            both descending; ``[]`` when nothing matches or the query raises.
        """
        try:
            # A negative LIMIT is rejected by MySQL; answer locally instead.
            if limit <= 0:
                return []
            sql = """
                SELECT *
                FROM t_member_digest
                WHERE chatroom_id = %s AND wxid = %s AND digest_type = %s
                ORDER BY period_end DESC, period_key DESC
                LIMIT %s
            """
            rows = self.execute_query(sql, (chatroom_id, wxid, digest_type, limit)) or []
            return [self._deserialize_row(row) for row in rows]
        except Exception as e:
            self.LOG.error(f"获取成员分层摘要列表失败: {e}")
            return []

    def list_digest_keys(self, chatroom_id: str, wxid: str, digest_type: str) -> List[str]:
        """Return the non-empty ``period_key`` values stored for a member and type.

        Returns:
            Keys as strings in the order the query returned them; ``[]`` when
            nothing matches or the query raises.
        """
        try:
            sql = """
                SELECT period_key
                FROM t_member_digest
                WHERE chatroom_id = %s AND wxid = %s AND digest_type = %s
            """
            rows = self.execute_query(sql, (chatroom_id, wxid, digest_type)) or []
            return [str(row.get("period_key")) for row in rows if row.get("period_key")]
        except Exception as e:
            self.LOG.error(f"获取成员摘要key失败: {e}")
            return []

    def list_period_digests(self, chatroom_id: str, wxid: str, digest_type: str,
                            period_keys: List[str]) -> List[Dict]:
        """Fetch the digests whose ``period_key`` is in ``period_keys``.

        Returns:
            Deserialized rows ordered by ``period_end`` then ``period_key``,
            both ascending; ``[]`` when ``period_keys`` is empty, nothing
            matches, or the query raises.
        """
        try:
            if not period_keys:
                return []
            # One placeholder per key keeps the IN (...) list parameterized.
            placeholders = ", ".join(["%s"] * len(period_keys))
            sql = f"""
                SELECT *
                FROM t_member_digest
                WHERE chatroom_id = %s AND wxid = %s AND digest_type = %s AND period_key IN ({placeholders})
                ORDER BY period_end ASC, period_key ASC
            """
            params = (chatroom_id, wxid, digest_type, *period_keys)
            rows = self.execute_query(sql, params) or []
            return [self._deserialize_row(row) for row in rows]
        except Exception as e:
            self.LOG.error(f"批量获取成员分层摘要失败: {e}")
            return []

    @staticmethod
    def _deserialize_row(row: Optional[Dict]) -> Optional[Dict]:
        """Decode the JSON columns and stringify the datetime columns of a row.

        The row is modified in place and returned. ``structured_json`` and
        ``meta_json`` are replaced by their decoded values (``{}`` when empty,
        undecodable, or JSON ``null``) and mirrored under ``structured`` and
        ``meta``. ``datetime`` values in ``period_start``, ``period_end`` and
        ``last_generated_at`` become ``YYYY-MM-DD HH:MM:SS`` strings.

        A falsy ``row`` is returned unchanged.
        """
        if not row:
            return row

        for key in ("structured_json", "meta_json"):
            value = row.get(key)
            if isinstance(value, dict):
                # Already decoded (e.g. the row was deserialized before).
                continue
            decoded = {}
            if value:
                try:
                    decoded = json.loads(value)
                except (TypeError, ValueError, RecursionError):
                    decoded = {}
            # JSON "null" decodes to None; callers expect a container.
            row[key] = {} if decoded is None else decoded

        for key in ("period_start", "period_end", "last_generated_at"):
            value = row.get(key)
            if isinstance(value, datetime):
                row[key] = value.strftime("%Y-%m-%d %H:%M:%S")

        row["structured"] = row.get("structured_json", {})
        row["meta"] = row.get("meta_json", {})
        return row
|
||||
@@ -60,6 +60,68 @@ class MessageStorageDB(BaseDBOperator):
|
||||
results = self.execute_query(sql, (days, group_id, wxid, limit)) or []
|
||||
return list(reversed(results))
|
||||
|
||||
def get_member_messages_since(self, group_id: str, wxid: str, since_time, limit: int = 200) -> List[Dict]:
    """Return one member's messages sent strictly after ``since_time``.

    Only rows with ``message_type`` 1 or 49, a content length of 2 to 500
    characters and content that does not start with ``/`` are selected.

    Args:
        group_id: Value matched against the ``group_id`` column.
        wxid: Value matched against the ``sender`` column.
        since_time: Exclusive lower bound for ``timestamp``. A ``datetime``
            is formatted as ``YYYY-MM-DD HH:MM:SS``; any other value is
            passed to the query unchanged.
        limit: Maximum number of rows. A value of zero or less yields an
            empty list without querying the database.

    Returns:
        Rows with ``timestamp``, ``sender``, ``content`` and
        ``message_type``, oldest first; ``[]`` when the query returns
        nothing.
    """
    # A negative LIMIT is rejected by MySQL; answer locally instead.
    if limit <= 0:
        return []
    if isinstance(since_time, datetime):
        since_time = since_time.strftime("%Y-%m-%d %H:%M:%S")
    # '%%' is presumably the driver-level escape for a literal '%'.
    sql = """
        SELECT timestamp, sender, content, message_type
        FROM messages
        WHERE timestamp > %s
          AND group_id = %s
          AND sender = %s
          AND message_type IN (1, 49)
          AND CHAR_LENGTH(content) BETWEEN 2 AND 500
          AND content NOT LIKE '/%%'
        ORDER BY timestamp ASC
        LIMIT %s
    """
    return self.execute_query(sql, (since_time, group_id, wxid, limit)) or []
|
||||
|
||||
def get_member_active_dates(self, group_id: str, wxid: str, days: int = 365) -> List[Dict]:
    """Return per-day activity of one member over the last ``days`` days.

    Only rows with ``message_type`` 1 or 49, a content length of 2 to 500
    characters and content that does not start with ``/`` are counted.

    Args:
        group_id: Value matched against the ``group_id`` column.
        wxid: Value matched against the ``sender`` column.
        days: Size of the look-back window, counted back from the
            database's ``NOW()``.

    Returns:
        One dict per calendar day, ordered by date ascending, with
        ``message_date``, ``msg_count``, ``first_message_time`` and
        ``last_message_time``. Non-empty date/time values are converted to
        strings; ``[]`` when the query returns nothing.
    """
    # '%%' is presumably the driver-level escape for a literal '%'.
    sql = """
        SELECT
            DATE(timestamp) AS message_date,
            COUNT(*) AS msg_count,
            MIN(timestamp) AS first_message_time,
            MAX(timestamp) AS last_message_time
        FROM messages
        WHERE timestamp >= DATE_SUB(NOW(), INTERVAL %s DAY)
          AND group_id = %s
          AND sender = %s
          AND message_type IN (1, 49)
          AND CHAR_LENGTH(content) BETWEEN 2 AND 500
          AND content NOT LIKE '/%%'
        GROUP BY DATE(timestamp)
        ORDER BY message_date ASC
    """
    rows = self.execute_query(sql, (days, group_id, wxid)) or []
    for row in rows:
        for key in ("message_date", "first_message_time", "last_message_time"):
            value = row.get(key)
            if isinstance(value, datetime):
                # The day column keeps only the date part.
                fmt = "%Y-%m-%d" if key == "message_date" else "%Y-%m-%d %H:%M:%S"
                row[key] = value.strftime(fmt)
            elif value:
                # Any other non-empty value (e.g. a plain date) is stringified.
                row[key] = str(value)
    return rows
|
||||
|
||||
def get_member_messages_on_date(self, group_id: str, wxid: str, target_date: str, limit: int = 120) -> List[Dict]:
    """Return one member's messages whose ``DATE(timestamp)`` equals ``target_date``.

    Only rows with ``message_type`` 1 or 49, a content length of 2 to 500
    characters and content that does not start with ``/`` are selected.

    Args:
        group_id: Value matched against the ``group_id`` column.
        wxid: Value matched against the ``sender`` column.
        target_date: Day to match, passed to the query unchanged
            (presumably ``YYYY-MM-DD`` -- confirm against callers).
        limit: Maximum number of rows. A value of zero or less yields an
            empty list without querying the database.

    Returns:
        Rows with ``timestamp``, ``sender``, ``content`` and
        ``message_type``, oldest first; ``[]`` when the query returns
        nothing.
    """
    # A negative LIMIT is rejected by MySQL; answer locally instead.
    if limit <= 0:
        return []
    # '%%' is presumably the driver-level escape for a literal '%'.
    sql = """
        SELECT timestamp, sender, content, message_type
        FROM messages
        WHERE DATE(timestamp) = %s
          AND group_id = %s
          AND sender = %s
          AND message_type IN (1, 49)
          AND CHAR_LENGTH(content) BETWEEN 2 AND 500
          AND content NOT LIKE '/%%'
        ORDER BY timestamp ASC
        LIMIT %s
    """
    return self.execute_query(sql, (target_date, group_id, wxid, limit)) or []
|
||||
|
||||
def get_message_count_by_date(self, date: str) -> List[Dict]:
|
||||
"""获取指定日期的消息统计"""
|
||||
sql = """
|
||||
|
||||
Reference in New Issue
Block a user