持久化表情中文语义资产

- 新增 t_emoji_assets 表及迁移脚本,持久化保存表情发送参数、中文语义与预览图路径
- 在消息归档与媒体补偿流程中自动回填表情资产,实现收到表情即落语义、补图后回填预览
- 后台表情库与自动回复优先读取持久化表情资产,仅在空表场景下小范围回补历史数据

This commit is contained in:
liuwei
2026-04-27 11:52:31 +08:00
parent 623ca505d4
commit 62e6f67836
7 changed files with 334 additions and 172 deletions

View File

@@ -9,6 +9,11 @@ from urllib.parse import quote
from flask import Blueprint, render_template, jsonify, request, current_app, redirect, send_file
from .auth import login_required
from loguru import logger
from utils.wechat.emoji_semantic_parser import (
dedupe_emoji_semantic_candidates as shared_dedupe_emoji_semantic_candidates,
extract_emoji_meta as shared_extract_emoji_meta,
extract_emoji_semantic_info as shared_extract_emoji_semantic_info,
)
# 创建联系人管理蓝图
contacts_bp = Blueprint('contacts', __name__, url_prefix='/contacts')
@@ -189,36 +194,8 @@ def _compact_media_caption(content: str, fallback: str) -> str:
def _extract_emoji_meta(attachment_url: str, image_path: str):
# Return an (md5, total_length) pair for an emoji described by the XML in attachment_url.
# image_path is accepted but is not read anywhere in the lines visible here.
# NOTE(review): the hunk header above reports 36 old lines vs 8 new lines, so this span
# looks like a diff rendered without +/- markers: the inline XML/regex parsing is
# presumably the removed body and the final shared_extract_emoji_meta() call the
# added one - confirm against the actual file.
text = _safe_text(attachment_url)
md5 = ""
total_length = 0
# Only accept parameters found in the XML; do not fall back to the file name or file size, to avoid parameter pollution.
if not text.startswith("<"):
return "", 0
try:
root = ET.fromstring(text)
emoji_node = root.find(".//emoji")
if emoji_node is None:
return "", 0
md5 = _safe_text(emoji_node.attrib.get("md5", "")).strip().lower()
# Use the first length attribute whose value consists only of digits.
for key in ("totallen", "total_len", "totalLen", "len"):
value = _safe_text(emoji_node.attrib.get(key, "")).strip()
if value.isdigit():
total_length = int(value)
break
except Exception:
# The XML path raised: fall back to regex extraction from the raw text.
md5_match = _EMOJI_MD5_RE.search(text)
if md5_match:
md5 = md5_match.group(1).lower()
len_match = _EMOJI_TOTALLEN_RE.search(text)
if len_match:
try:
total_length = int(len_match.group(1))
except Exception:
total_length = 0
return md5, total_length
# Keep the original function signature and switch the internals to the shared parsing module, so the admin backend and auto-reply do not drift apart semantically.
return shared_extract_emoji_meta(attachment_url)
def _read_protobuf_varint(payload: bytes, offset: int):
@@ -341,18 +318,7 @@ def _is_emoji_semantic_candidate(value: str):
def _dedupe_emoji_semantic_candidates(values):
"""Deduplicate candidate semantic texts, keeping first-seen order."""
# NOTE(review): the hunk header above reports 18 old lines vs 7 new lines, so this span
# looks like a diff rendered without +/- markers: the loop is presumably the removed
# body and the final shared_dedupe_emoji_semantic_candidates() call the added one -
# confirm against the actual file.
seen = set()
results = []
# `values or []` lets a None/empty argument yield an empty result.
for item in values or []:
text = _sanitize_emoji_semantic_text(item)
if not _is_emoji_semantic_candidate(text):
continue
# Compare on the lower-cased text, so entries differing only by case count as duplicates.
key = text.lower()
if key in seen:
continue
seen.add(key)
results.append(text)
return results
return shared_dedupe_emoji_semantic_candidates(values)
def _maybe_decode_base64_payload(value: str):
@@ -411,52 +377,9 @@ def _decode_emoji_semantic_value(value: str):
def _extract_emoji_semantic_info(attachment_url: str):
"""Extract 'readable semantics' from the emoji XML.
Notes:
1. The emoji library currently holds little more than md5/len, which is inconvenient for an AI to use directly later on;
2. The desc, attachedtext and emojiattr fields are parsed first as potential semantic fields;
3. The primary semantic text, the alias list and the source are returned so that both the admin display and auto-reply matching can reuse them.
Everything goes through the shared parsing module so that admin display, persisted backfill and auto-reply all use the same semantic rules.
"""
# NOTE(review): the hunk header above reports 52 old lines vs 9 new lines, so this span
# looks like a diff rendered without +/- markers: the inline parsing is presumably the
# removed body and the final shared_extract_emoji_semantic_info() call the added one -
# confirm against the actual file.
text = _safe_text(attachment_url).strip()
# Anything that does not start with "<" is not treated as XML; return the empty result shape.
if not text.startswith("<"):
return {
"semantic_text": "",
"semantic_aliases": [],
"semantic_source": "",
}
field_values = []
try:
root = ET.fromstring(text)
emoji_node = root.find(".//emoji")
if emoji_node is not None:
for field_name in ("desc", "attachedtext", "emojiattr"):
field_values.append((field_name, _safe_text(emoji_node.attrib.get(field_name, "")).strip()))
except Exception:
# The XML path raised: pull the same three attributes out of the raw text with a regex.
for field_name in ("desc", "attachedtext", "emojiattr"):
match = re.search(rf'{field_name}\s*=\s*[\"\']([^\"\']+)[\"\']', text, re.IGNORECASE)
field_values.append((field_name, _safe_text(match.group(1) if match else "").strip()))
aliases = []
sources = []
for field_name, field_value in field_values:
decoded_candidates = _decode_emoji_semantic_value(field_value)
if not decoded_candidates:
continue
# Record only the fields that actually produced candidates.
sources.append(field_name)
aliases.extend(decoded_candidates)
semantic_aliases = _dedupe_emoji_semantic_candidates(aliases)
semantic_text = ""
if semantic_aliases:
# Prefer the first candidate containing a CJK character so that intuitive Chinese meanings come first; otherwise use the first alias.
chinese_first = [item for item in semantic_aliases if re.search(r"[\u4e00-\u9fff]", item)]
semantic_text = chinese_first[0] if chinese_first else semantic_aliases[0]
return {
"semantic_text": semantic_text,
"semantic_aliases": semantic_aliases,
"semantic_source": ",".join(sources),
}
return shared_extract_emoji_semantic_info(attachment_url)
def _parse_positive_int(value):
@@ -484,6 +407,24 @@ def _get_emoji_asset_by_md5(message_storage, md5: str):
if not message_storage or not md5:
return None
# 优先读取持久化表情资产:
# 1. 这张表已经做过语义和参数收敛,命中速度更快;
# 2. 若拿不到,再回退到原始 messages 表反查,兼容老数据和初始化阶段;
# 3. 这样后台发送、表情库展示、自动回复三条链路都共享统一资产源。
emoji_asset_db = getattr(message_storage, "emoji_asset_db", None)
if emoji_asset_db and hasattr(emoji_asset_db, "get_persisted_emoji_asset_by_md5"):
asset = emoji_asset_db.get_persisted_emoji_asset_by_md5(md5)
if asset:
return {
"attachment_url": "",
"image_path": _safe_text(asset.get("preview_url")).strip(),
"message_id": _safe_text(asset.get("sample_message_id")).strip(),
"group_id": _safe_text(asset.get("sample_group_id")).strip(),
"sender": _safe_text(asset.get("sample_sender")).strip(),
"md5": _safe_text(asset.get("md5")).strip(),
"total_length": asset.get("total_length"),
}
if hasattr(message_storage, "get_emoji_asset_by_md5"):
return message_storage.get_emoji_asset_by_md5(md5)
@@ -515,10 +456,13 @@ def _resolve_emoji_send_meta(message_storage, md5: str, total_length: int):
if not asset:
return normalized_md5, 0
resolved_md5, resolved_total_length = _extract_emoji_meta(
_safe_text(asset.get("attachment_url")),
_safe_text(asset.get("image_path"))
)
resolved_total_length = _parse_positive_int(asset.get("total_length"))
resolved_md5 = _safe_text(asset.get("md5")).strip().lower()
if not resolved_md5 or resolved_total_length <= 0:
resolved_md5, resolved_total_length = _extract_emoji_meta(
_safe_text(asset.get("attachment_url")),
_safe_text(asset.get("image_path"))
)
if resolved_md5 and resolved_md5 != normalized_md5:
# 历史数据如果出现大小写或异常值,以前端传入的 md5 为准,避免串表情。
logger.warning(f"表情参数回填命中 md5 不一致request_md5={normalized_md5}, record_md5={resolved_md5}")
@@ -1023,61 +967,16 @@ def api_emoji_library():
try:
server = current_app.dashboard_server
limit = min(max(int(request.args.get("limit", 200)), 1), 500)
message_storage = getattr(server, "message_storage", None)
if not message_storage:
return jsonify({"success": False, "message": "消息存储未初始化"}), 503
emoji_asset_db = getattr(server, "emoji_asset_db", None)
if emoji_asset_db is None:
return jsonify({"success": False, "message": "表情资产库未初始化"}), 503
if hasattr(message_storage, "get_recent_emoji_assets"):
records = message_storage.get_recent_emoji_assets(limit=limit)
elif hasattr(message_storage, "message_db") and hasattr(message_storage.message_db, "get_recent_emoji_assets"):
records = message_storage.message_db.get_recent_emoji_assets(limit=limit)
else:
logger.error("当前 message_storage 不支持 get_recent_emoji_assets")
return jsonify({"success": False, "message": "当前消息存储版本不支持表情库"}), 500
dedup = {}
for item in records:
attachment_url = _safe_text(item.get("attachment_url"))
image_path = _safe_text(item.get("image_path")).strip()
md5, total_length = _extract_emoji_meta(attachment_url, image_path)
if not md5 or total_length <= 0:
continue
semantic_info = _extract_emoji_semantic_info(attachment_url)
# 同一个 md5 可能在多条历史里反复出现:
# 1. 有的记录有预览图但没有语义;
# 2. 有的记录有语义但图片还没落盘;
# 3. 因此这里按 md5 聚合,尽量把“发送参数 + 预览图 + 中文语义”拼成一条完整资产。
target = dedup.setdefault(md5, {
"md5": md5,
"total_length": total_length,
"preview_url": "",
"timestamp": _safe_text(item.get("timestamp")),
"group_id": _safe_text(item.get("group_id")),
"message_id": _safe_text(item.get("message_id")),
"semantic_text": "",
"semantic_aliases": [],
"semantic_source": "",
})
if not target.get("preview_url") and image_path:
target["preview_url"] = image_path
if not target.get("total_length") and total_length > 0:
target["total_length"] = total_length
target_aliases = target.get("semantic_aliases") or []
merged_aliases = _dedupe_emoji_semantic_candidates(target_aliases + (semantic_info.get("semantic_aliases") or []))
target["semantic_aliases"] = merged_aliases
if not target.get("semantic_text") and semantic_info.get("semantic_text"):
target["semantic_text"] = semantic_info.get("semantic_text")
if not target.get("semantic_source") and semantic_info.get("semantic_source"):
target["semantic_source"] = semantic_info.get("semantic_source")
# 只有带预览图的表情才回给前端弹窗:
# 1. 目前弹窗主要承担“人工挑选并发送”的作用,没有缩略图会很难用;
# 2. 语义可以从其他重复记录补过来,但最终仍要求至少有一条落盘图片;
# 3. 后续若要纯语义离线匹配,可再单独开放无预览的内部接口。
emojis = [item for item in dedup.values() if item.get("preview_url")]
emojis = emoji_asset_db.list_emoji_assets(limit=limit, require_preview=True)
if not emojis:
# 只有当持久化表还是空的时候,才对最近一小批历史消息做一次兜底回填。
# 正常运行时,新表情会在“消息归档 + 媒体补偿”阶段自动写入资产表,不需要每次接口都回扫历史。
emoji_asset_db.sync_recent_emoji_assets(limit=min(max(limit, 50), 120))
emojis = emoji_asset_db.list_emoji_assets(limit=limit, require_preview=True)
return jsonify({
"success": True,
"data": {