完善 value_rank 社交图设计并落地 @ 结构化存储
- messages 表新增 mentioned_user_ids 字段设计,使用 JSON 数组字符串存储被@用户清单 - 新增社交图相关表设计:t_message_mentions、t_social_edges_daily、t_value_rank_social_daily - 增加迁移脚本 20260421_add_mentions_and_social_graph_tables.sql,支持现网平滑升级 - 改造 MessageStorageDB 入库流程:解析 msg_source.atuserlist 并写入 mentioned_user_ids - 更新 value_rank README:补充社交图数据链路、可产出图表及实现说明
This commit is contained in:
@@ -2,6 +2,8 @@
|
||||
|
||||
from datetime import datetime
|
||||
import json
|
||||
import re
|
||||
import xml.etree.ElementTree as ET
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
from db.base import BaseDBOperator
|
||||
@@ -39,15 +41,17 @@ class MessageStorageDB(BaseDBOperator):
|
||||
|
||||
# 尽可能保存完整原始负载:优先使用对象自带序列化能力,其次兜底到 __dict__。
|
||||
raw_payload = self._serialize_raw_payload(msg)
|
||||
# 在入库阶段结构化提取被@清单,避免后续统计每次都回扫原始包。
|
||||
mentioned_user_ids_json = self._extract_mentioned_user_ids(msg)
|
||||
|
||||
sql_with_raw_payload = """
|
||||
INSERT INTO messages (
|
||||
group_id, timestamp, sender, content, message_type,
|
||||
attachment_url, message_id, message_xml, raw_payload, message_thumb
|
||||
attachment_url, message_id, message_xml, raw_payload, mentioned_user_ids, message_thumb
|
||||
)
|
||||
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
|
||||
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
|
||||
"""
|
||||
params_with_raw_payload = (*base_params[:8], raw_payload, base_params[8])
|
||||
params_with_raw_payload = (*base_params[:8], raw_payload, mentioned_user_ids_json, base_params[8])
|
||||
if self.execute_update(sql_with_raw_payload, params_with_raw_payload):
|
||||
return True
|
||||
|
||||
@@ -87,6 +91,48 @@ class MessageStorageDB(BaseDBOperator):
|
||||
# 最后的保底策略:即使序列化失败,也确保字段有可追溯文本,避免丢失原始上下文。
|
||||
return str(msg)
|
||||
|
||||
def _extract_mentioned_user_ids(self, msg: WxMessage) -> str:
|
||||
"""从消息中提取被@用户ID列表,并返回 JSON 数组字符串。
|
||||
|
||||
解析策略:
|
||||
1. 优先从 `msg.msg_source` 的 XML 里读取 `atuserlist` 节点;
|
||||
2. 若 XML 解析失败,则退化为正则提取 `atuserlist` 文本;
|
||||
3. 去重并过滤空值,保证输出稳定。
|
||||
|
||||
返回值示例:`["wxid_a","wxid_b"]`
|
||||
"""
|
||||
raw_xml = str(getattr(msg, "msg_source", "") or "")
|
||||
if not raw_xml:
|
||||
return "[]"
|
||||
|
||||
at_user_list_text = ""
|
||||
try:
|
||||
root = ET.fromstring(raw_xml)
|
||||
node = root.find(".//atuserlist")
|
||||
if node is not None and node.text:
|
||||
at_user_list_text = str(node.text).strip()
|
||||
except Exception:
|
||||
# 兼容异常格式 XML,采用正则兜底,确保尽量不丢数据。
|
||||
match = re.search(r"<atuserlist><!\[CDATA\[(.*?)\]\]></atuserlist>", raw_xml, flags=re.IGNORECASE | re.DOTALL)
|
||||
if match:
|
||||
at_user_list_text = str(match.group(1) or "").strip()
|
||||
|
||||
if not at_user_list_text:
|
||||
return "[]"
|
||||
|
||||
# 微信 atuserlist 常见分隔符为 ',',但实际环境可能混入 ';' 或空白,这里统一兼容。
|
||||
raw_ids = re.split(r"[,\s;]+", at_user_list_text)
|
||||
seen = set()
|
||||
result = []
|
||||
for uid in raw_ids:
|
||||
normalized_uid = str(uid or "").strip()
|
||||
if not normalized_uid or normalized_uid in seen:
|
||||
continue
|
||||
seen.add(normalized_uid)
|
||||
result.append(normalized_uid)
|
||||
|
||||
return json.dumps(result, ensure_ascii=False)
|
||||
|
||||
def get_recent_messages(self, group_id: str, hours_ago: int = 8, min_content_length: int = 6) -> List[Dict]:
|
||||
"""获取最近的消息"""
|
||||
sql = """
|
||||
|
||||
Reference in New Issue
Block a user