打通自动回复与表情语义库联动\n\n- 新增表情语义解析与表情资产查询模块,支持从历史表情中提取可读中文语义\n- 为 ai_auto_response 增加短回复表情匹配器,命中语义时优先发送表情并支持失败回退文本\n- 调整自动回复提示词与配置项,强化短情绪回复场景的表情替换能力
This commit is contained in:
45
db/emoji_asset_db.py
Normal file
45
db/emoji_asset_db.py
Normal file
@@ -0,0 +1,45 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
from typing import Dict, List, Optional
|
||||||
|
|
||||||
|
from db.base import BaseDBOperator
|
||||||
|
from db.connection import DBConnectionManager
|
||||||
|
|
||||||
|
|
||||||
|
class EmojiAssetDB(BaseDBOperator):
    """Read-only queries for raw emoji records stored in the ``messages`` table.

    Notes:
        1. The queries live in their own class so that the auto-reply plugin
           does not need to depend on the admin blueprint to reach the emoji
           library.
        2. Only raw message rows are returned; semantic parsing and match
           scoring are not done here.
        3. Any caller (admin pages, auto-reply, other plugins) can reuse this
           class as the single emoji data source.
    """

    # Shared SELECT/WHERE prefix: rows whose message_type is one of the types
    # this module treats as emoji messages and that carry a non-empty
    # attachment_url payload.
    _EMOJI_SELECT_SQL = """
        SELECT message_id, group_id, sender, timestamp, message_type, attachment_url, image_path
        FROM messages
        WHERE message_type IN ('47', '1048625', '1090519089')
          AND attachment_url IS NOT NULL
          AND attachment_url <> ''
    """

    # An md5 digest is hexadecimal; anything else must not reach the LIKE pattern.
    _HEX_DIGITS = frozenset("0123456789abcdefABCDEF")

    def __init__(self, db_manager: DBConnectionManager):
        super().__init__(db_manager)

    def get_recent_emoji_assets(self, limit: int = 500) -> List[Dict]:
        """Return the most recent emoji message rows, newest first.

        Args:
            limit: Maximum number of rows to fetch.

        Returns:
            The rows returned by ``execute_query``, or an empty list when the
            query yields a falsy result.
        """
        sql = self._EMOJI_SELECT_SQL + """
            ORDER BY timestamp DESC
            LIMIT %s
        """
        return self.execute_query(sql, (limit,)) or []

    def get_emoji_asset_by_md5(self, md5: str) -> Optional[Dict]:
        """Return the newest emoji row whose payload contains ``md5="<md5>"``.

        Args:
            md5: Hexadecimal digest of the emoji.

        Returns:
            The result of ``execute_query(..., fetch_one=True)``, or ``None``
            when ``md5`` is empty or not purely hexadecimal.
        """
        digest = "" if md5 is None else str(md5).strip()
        # The digest is embedded in a LIKE pattern. Rejecting non-hex input
        # keeps the LIKE wildcards ``%`` and ``_`` out of the pattern, and
        # stops an empty value from matching rows with an empty md5 attribute.
        if not digest or not set(digest) <= self._HEX_DIGITS:
            return None

        sql = self._EMOJI_SELECT_SQL + """
              AND attachment_url LIKE %s
            ORDER BY timestamp DESC
            LIMIT 1
        """
        return self.execute_query(sql, (f'%md5="{digest}"%',), fetch_one=True)
|
||||||
@@ -71,6 +71,20 @@ qa_with_context_total_limit = 30
|
|||||||
default_char_limit = 30
|
default_char_limit = 30
|
||||||
default_total_limit = 30
|
default_total_limit = 30
|
||||||
|
|
||||||
|
[emoji_reply]
|
||||||
|
# 自动回复和表情库的衔接策略:
|
||||||
|
# 1. 模型仍然只输出自然文本,本地只在“极短情绪回复”场景里尝试换成表情;
|
||||||
|
# 2. 这样不用把 md5 暴露给模型,也更方便后续继续扩展同义词和人工校准;
|
||||||
|
# 3. 第一版只做保守替换,避免把正常答疑文本误发成表情。
|
||||||
|
enable = true
|
||||||
|
asset_scan_limit = 800
|
||||||
|
cache_ttl_sec = 300
|
||||||
|
max_reply_chars = 8
|
||||||
|
max_alias_chars = 16
|
||||||
|
min_match_score = 75
|
||||||
|
min_semantic_length = 1
|
||||||
|
require_single_chunk = true
|
||||||
|
|
||||||
[prompt_compact]
|
[prompt_compact]
|
||||||
# 这里改成“常驻轻背景 + 相关增强”后,群长期摘要和成员轻画像都会稳定带给模型:
|
# 这里改成“常驻轻背景 + 相关增强”后,群长期摘要和成员轻画像都会稳定带给模型:
|
||||||
# 1. group_profile 放宽,让群长期摘要不会总被前面的模式/知识域说明挤掉;
|
# 1. group_profile 放宽,让群长期摘要不会总被前面的模式/知识域说明挤掉;
|
||||||
|
|||||||
160
plugins/ai_auto_response/core/emoji_reply.py
Normal file
160
plugins/ai_auto_response/core/emoji_reply.py
Normal file
@@ -0,0 +1,160 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import time
|
||||||
|
from typing import Any, Dict, List, Optional
|
||||||
|
|
||||||
|
from db.emoji_asset_db import EmojiAssetDB
|
||||||
|
from utils.wechat.emoji_semantic_parser import (
|
||||||
|
dedupe_emoji_semantic_candidates,
|
||||||
|
extract_emoji_meta,
|
||||||
|
extract_emoji_semantic_info,
|
||||||
|
normalize_emoji_match_text,
|
||||||
|
safe_text,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class EmojiReplySelector:
    """Decide whether a short auto-reply text should be sent as an emoji.

    Design goals:
        1. The reply model keeps producing natural text; substitution is only
           attempted locally, for very short replies.
        2. Matching relies solely on the semantic aliases parsed from stored
           emoji records, so the model never needs to know an md5.
        3. When nothing matches, ``match_reply_to_emoji`` returns ``None`` so
           the caller can keep sending the original text.
    """

    # Sentence-final modal particles ignored during approximate comparison.
    _MODAL_SUFFIX_CHARS = "啊呀啦呢嘛吧哇诶欸"
    # Highest score ``_score_alias_match`` can return (exact equality).
    _EXACT_MATCH_SCORE = 100

    def __init__(self, db_manager, config: Optional[Dict[str, Any]] = None):
        """Read tuning options from ``config`` and prepare an empty asset cache.

        Args:
            db_manager: Connection manager handed to ``EmojiAssetDB``. When it
                is ``None`` the selector is disabled and never queries.
            config: Option mapping; missing or falsy values fall back to the
                defaults below and are clamped to a minimum.
        """
        self.db_manager = db_manager
        self.config = config or {}
        self.enabled = bool(self.config.get("enable", True)) and db_manager is not None
        self.asset_limit = self._int_option("asset_scan_limit", 800, 50)
        self.cache_ttl_sec = self._int_option("cache_ttl_sec", 300, 30)
        self.max_reply_chars = self._int_option("max_reply_chars", 8, 1)
        self.max_alias_chars = self._int_option("max_alias_chars", 16, 1)
        self.min_match_score = self._int_option("min_match_score", 75, 1)
        self.min_semantic_length = self._int_option("min_semantic_length", 1, 1)
        self.require_single_chunk = bool(self.config.get("require_single_chunk", True))
        self.asset_db = EmojiAssetDB(db_manager) if db_manager is not None else None
        self._cache_assets: List[Dict[str, Any]] = []
        self._cache_expires_at = 0.0

    def _int_option(self, key: str, default: int, minimum: int) -> int:
        """Return config[key] as an int, using ``default`` when falsy, floored at ``minimum``."""
        return max(int(self.config.get(key, default) or default), minimum)

    def match_reply_to_emoji(
        self,
        reply_text: str,
        reply_chunks: Optional[List[str]] = None,
    ) -> Optional[Dict[str, Any]]:
        """Pick the emoji asset that best matches the final reply text.

        Only very short replies are considered, so normal answers are not
        turned into emojis. The returned dict carries ``md5`` and
        ``total_length`` so the caller can send the emoji directly.

        Args:
            reply_text: The final reply text.
            reply_chunks: The chunks the reply would be sent as. When
                ``require_single_chunk`` is set, more than one non-blank chunk
                disables substitution.

        Returns:
            A dict with ``md5``, ``total_length``, ``semantic_text``,
            ``semantic_aliases`` and ``match_score``, or ``None`` when the
            selector is disabled, the reply is not eligible, or the best score
            is below ``min_match_score``.
        """
        if not self.enabled:
            return None

        chunks = [chunk for chunk in (reply_chunks or []) if safe_text(chunk).strip()]
        if self.require_single_chunk and len(chunks) > 1:
            return None

        raw_text = safe_text(reply_text).strip()
        if not raw_text or len(raw_text) > self.max_reply_chars:
            return None

        normalized = normalize_emoji_match_text(raw_text)
        if not normalized or len(normalized) < self.min_semantic_length:
            return None

        best_asset = None
        best_score = -1
        for asset in self._load_assets():
            for alias in asset.get("semantic_aliases") or []:
                score = self._score_alias_match(normalized, alias)
                if score > best_score:
                    best_score = score
                    best_asset = asset
            # Only a strictly higher score replaces the current best, and
            # nothing scores above an exact match, so the scan can stop here.
            if best_score >= self._EXACT_MATCH_SCORE:
                break

        if not best_asset or best_score < self.min_match_score:
            return None
        return {
            "md5": best_asset.get("md5", ""),
            "total_length": int(best_asset.get("total_length") or 0),
            "semantic_text": best_asset.get("semantic_text", ""),
            "semantic_aliases": best_asset.get("semantic_aliases", []) or [],
            "match_score": best_score,
        }

    def _load_assets(self) -> List[Dict[str, Any]]:
        """Load the emoji assets usable for auto-reply, cached for ``cache_ttl_sec``.

        Rows are grouped by md5. Rows without a usable md5/total_length, or
        without any alias of at most ``max_alias_chars`` characters, are
        skipped.
        """
        if not self.enabled or self.asset_db is None:
            return []

        now = time.time()
        # Honour the TTL even when the cached list is empty; otherwise an
        # emoji library with no usable entries would be re-scanned on every
        # single reply.
        if now < self._cache_expires_at:
            return self._cache_assets

        rows = self.asset_db.get_recent_emoji_assets(limit=self.asset_limit)
        assets: Dict[str, Dict[str, Any]] = {}
        for row in rows:
            attachment_url = safe_text(row.get("attachment_url"))
            md5, total_length = extract_emoji_meta(attachment_url)
            if not md5 or total_length <= 0:
                continue

            semantic_info = extract_emoji_semantic_info(attachment_url)
            semantic_aliases = [
                alias
                for alias in (semantic_info.get("semantic_aliases") or [])
                if len(alias) <= self.max_alias_chars
            ]
            if not semantic_aliases:
                continue

            target = assets.setdefault(md5, {
                "md5": md5,
                "total_length": total_length,
                "semantic_text": "",
                "semantic_aliases": [],
            })
            if not target.get("total_length") and total_length > 0:
                target["total_length"] = total_length
            if not target.get("semantic_text") and semantic_info.get("semantic_text"):
                target["semantic_text"] = semantic_info.get("semantic_text")
            # Merge aliases from every row that shares this md5.
            target["semantic_aliases"] = dedupe_emoji_semantic_candidates(
                list(target.get("semantic_aliases") or []) + semantic_aliases
            )

        self._cache_assets = [asset for asset in assets.values() if asset.get("semantic_aliases")]
        self._cache_expires_at = now + self.cache_ttl_sec
        return self._cache_assets

    def _score_alias_match(self, normalized_reply: str, alias: str) -> int:
        """Score how well an already-normalized reply matches one emoji alias.

        Scores, highest first:
            * 100 - reply equals the normalized alias.
            * 96  - reply with trailing modal particles removed equals the alias.
            * 94  - reply and alias are equal once both lose trailing particles.
            * 83-92 - one string contains the other; 82 plus the shorter
              length, with the bonus capped at 10.
            * 0   - no relation, or either side is empty.
        """
        normalized_alias = normalize_emoji_match_text(alias)
        if not normalized_reply or not normalized_alias:
            return 0

        if normalized_reply == normalized_alias:
            return self._EXACT_MATCH_SCORE

        stripped_reply = self._strip_modal_suffix(normalized_reply)
        stripped_alias = self._strip_modal_suffix(normalized_alias)
        if stripped_reply and stripped_reply == normalized_alias:
            return 96
        if stripped_reply and stripped_reply == stripped_alias:
            return 94

        if normalized_reply in normalized_alias or normalized_alias in normalized_reply:
            overlap = min(len(normalized_reply), len(normalized_alias))
            return 82 + min(overlap, 10)

        return 0

    @staticmethod
    def _strip_modal_suffix(text: str) -> str:
        """Remove every trailing modal particle to reduce colloquial noise."""
        return safe_text(text).rstrip(EmojiReplySelector._MODAL_SUFFIX_CHARS)
|
||||||
@@ -28,6 +28,7 @@ def build_user_prompt(context: Dict, memory_hints: Dict) -> str:
|
|||||||
"规则优先级:当前发言可验证信息 > 群场景约束 > 人设措辞润色。",
|
"规则优先级:当前发言可验证信息 > 群场景约束 > 人设措辞润色。",
|
||||||
"如果是明确问题,先给结论;只给第一层答案,不主动展开第二层解释。",
|
"如果是明确问题,先给结论;只给第一层答案,不主动展开第二层解释。",
|
||||||
length_rule,
|
length_rule,
|
||||||
|
"如果最自然的回复只是短情绪词或短语气词,比如“哈哈”“哇”“害”“难道”,就只回那个短词,不要为了凑完整句硬补解释。",
|
||||||
"能少说就少说,优先像群友随口接一句,不要写成说明文。",
|
"能少说就少说,优先像群友随口接一句,不要写成说明文。",
|
||||||
"回复总长度尽量控制在30字内;确实需要补充时最多2句且总长度不超过55字。",
|
"回复总长度尽量控制在30字内;确实需要补充时最多2句且总长度不超过55字。",
|
||||||
"禁止大段铺垫、总结腔、条目化回答。",
|
"禁止大段铺垫、总结腔、条目化回答。",
|
||||||
|
|||||||
@@ -38,6 +38,7 @@ from .memory.social_memory import SocialMemoryService
|
|||||||
from .profile.group_profile import GroupProfileResolver
|
from .profile.group_profile import GroupProfileResolver
|
||||||
from .context.conversation_hints import build_conversation_hints
|
from .context.conversation_hints import build_conversation_hints
|
||||||
from .core.decision_flow import DecisionFlow
|
from .core.decision_flow import DecisionFlow
|
||||||
|
from .core.emoji_reply import EmojiReplySelector
|
||||||
from .core.triggers import TriggerRouter
|
from .core.triggers import TriggerRouter
|
||||||
from .core.llm_result_parser import LLMResultParser
|
from .core.llm_result_parser import LLMResultParser
|
||||||
from .core.reply_formatter import finalize_reply, preview_text
|
from .core.reply_formatter import finalize_reply, preview_text
|
||||||
@@ -101,6 +102,7 @@ class AIAutoResponsePlugin(MessagePluginInterface):
|
|||||||
self.queue_maxsize = 200
|
self.queue_maxsize = 200
|
||||||
self.queue_workers: List[asyncio.Task] = []
|
self.queue_workers: List[asyncio.Task] = []
|
||||||
self.reply_limits: Dict[str, Any] = {}
|
self.reply_limits: Dict[str, Any] = {}
|
||||||
|
self.emoji_reply_config: Dict[str, Any] = {}
|
||||||
self.prompt_compact_config: Dict[str, Any] = {}
|
self.prompt_compact_config: Dict[str, Any] = {}
|
||||||
self.message_expire_sec = 0.0
|
self.message_expire_sec = 0.0
|
||||||
self.room_message_seq_counter = 0
|
self.room_message_seq_counter = 0
|
||||||
@@ -142,8 +144,10 @@ class AIAutoResponsePlugin(MessagePluginInterface):
|
|||||||
self.mode_config = self._config.get("mode", {}) or {}
|
self.mode_config = self._config.get("mode", {}) or {}
|
||||||
self.cooldown_config = self._config.get("cooldown", {}) or {}
|
self.cooldown_config = self._config.get("cooldown", {}) or {}
|
||||||
self.reply_limits = self._config.get("reply", {}) or {}
|
self.reply_limits = self._config.get("reply", {}) or {}
|
||||||
|
self.emoji_reply_config = self._config.get("emoji_reply", {}) or {}
|
||||||
self.prompt_compact_config = self._config.get("prompt_compact", {}) or {}
|
self.prompt_compact_config = self._config.get("prompt_compact", {}) or {}
|
||||||
self.cooldown = CooldownManager(self.cooldown_config)
|
self.cooldown = CooldownManager(self.cooldown_config)
|
||||||
|
self.emoji_reply_selector = EmojiReplySelector(self.db_manager, self.emoji_reply_config)
|
||||||
self.image_config = self._config.get("image", {}) or {}
|
self.image_config = self._config.get("image", {}) or {}
|
||||||
self.spam_config = self._config.get("spam_guard", {}) or {}
|
self.spam_config = self._config.get("spam_guard", {}) or {}
|
||||||
runtime_config = self._config.get("runtime", {}) or {}
|
runtime_config = self._config.get("runtime", {}) or {}
|
||||||
@@ -681,8 +685,37 @@ class AIAutoResponsePlugin(MessagePluginInterface):
|
|||||||
)
|
)
|
||||||
return False, "duplicate_reply"
|
return False, "duplicate_reply"
|
||||||
|
|
||||||
for chunk in reply_chunks:
|
# 这里让“自动回复文本”先经过一次本地表情匹配:
|
||||||
await bot.send_text_message(room_id, chunk, sender)
|
# 1. 模型仍然只负责输出自然语言,不需要知道 md5;
|
||||||
|
# 2. 只有命中中文语义库且回复足够短时,才会切换成表情发送;
|
||||||
|
# 3. 若表情发送失败,立刻回退到原始文本,避免因为表情链路影响主回复成功率。
|
||||||
|
sent_as_emoji = False
|
||||||
|
emoji_asset = self.emoji_reply_selector.match_reply_to_emoji(final_response_text, reply_chunks)
|
||||||
|
if emoji_asset and emoji_asset.get("md5") and int(emoji_asset.get("total_length") or 0) > 0:
|
||||||
|
try:
|
||||||
|
await bot.send_emoji_message(
|
||||||
|
room_id,
|
||||||
|
str(emoji_asset.get("md5")),
|
||||||
|
int(emoji_asset.get("total_length") or 0),
|
||||||
|
)
|
||||||
|
sent_as_emoji = True
|
||||||
|
except Exception as emoji_error:
|
||||||
|
self._log_event(
|
||||||
|
"emoji_fallback",
|
||||||
|
room_id=room_id,
|
||||||
|
sender=sender,
|
||||||
|
trigger_type=trigger.trigger_type,
|
||||||
|
reply_mode=reply_mode,
|
||||||
|
topic=selected_topic,
|
||||||
|
response_preview=preview_text(final_response_text),
|
||||||
|
emoji_semantic=emoji_asset.get("semantic_text", ""),
|
||||||
|
emoji_match_score=emoji_asset.get("match_score", 0),
|
||||||
|
error=str(emoji_error),
|
||||||
|
)
|
||||||
|
|
||||||
|
if not sent_as_emoji:
|
||||||
|
for chunk in reply_chunks:
|
||||||
|
await bot.send_text_message(room_id, chunk, sender)
|
||||||
self.cooldown.note_reply(room_id)
|
self.cooldown.note_reply(room_id)
|
||||||
self.flow_manager.note_bot_reply(room_id)
|
self.flow_manager.note_bot_reply(room_id)
|
||||||
self.memory_store.note_bot_reply(room_id, sender, selected_topic)
|
self.memory_store.note_bot_reply(room_id, sender, selected_topic)
|
||||||
@@ -698,6 +731,9 @@ class AIAutoResponsePlugin(MessagePluginInterface):
|
|||||||
response_preview=preview_text(final_response_text),
|
response_preview=preview_text(final_response_text),
|
||||||
response_len=len(final_response_text),
|
response_len=len(final_response_text),
|
||||||
chunk_count=len(reply_chunks),
|
chunk_count=len(reply_chunks),
|
||||||
|
sent_as_emoji=yn(sent_as_emoji),
|
||||||
|
emoji_semantic=(emoji_asset or {}).get("semantic_text", ""),
|
||||||
|
emoji_match_score=(emoji_asset or {}).get("match_score", 0),
|
||||||
)
|
)
|
||||||
return False, "replied"
|
return False, "replied"
|
||||||
finally:
|
finally:
|
||||||
|
|||||||
282
utils/wechat/emoji_semantic_parser.py
Normal file
282
utils/wechat/emoji_semantic_parser.py
Normal file
@@ -0,0 +1,282 @@
|
|||||||
|
import base64
|
||||||
|
import re
|
||||||
|
import xml.etree.ElementTree as ET
|
||||||
|
from typing import Dict, List, Tuple
|
||||||
|
|
||||||
|
|
||||||
|
# Notes:
# 1. The semantic fields of WeChat emoji messages are not stable: sometimes
#    they are plain text, sometimes base64-wrapped protobuf.
# 2. "Send parameter parsing" and "semantic text extraction" are gathered here
#    as standalone helpers so the admin emoji library and the AI auto-reply
#    can share them.
# 3. The module contains pure parsing logic only, with no Flask or DB
#    dependency, so it can be reused anywhere.

# Fallback attribute matchers used when the payload is not well-formed XML.
# The leading \b keeps them from matching inside a longer attribute name
# (for example an attribute that merely ends in "md5" or "len").
_EMOJI_MD5_RE = re.compile(r'\bmd5\s*=\s*[\"\']([0-9a-fA-F]{16,64})[\"\']', re.IGNORECASE)
_EMOJI_TOTALLEN_RE = re.compile(r'\b(?:totallen|total_len|len)\s*=\s*[\"\'](\d+)[\"\']', re.IGNORECASE)
# Text made only of base64 alphabet characters.
_EMOJI_BASE64_RE = re.compile(r"^[A-Za-z0-9+/=]+$")
# Locale identifiers; these are keys of the language wrapper, not meanings.
_EMOJI_LOCALE_KEYS = {"zh_cn", "zh_tw", "zh_hk", "default", "en", "ja", "ko"}
# Tokens that are never accepted as a semantic text: the locale keys plus
# media format names.
_EMOJI_SEMANTIC_STOPWORDS = _EMOJI_LOCALE_KEYS | {
    "opus",
    "gif",
    "png",
    "jpg",
    "jpeg",
    "webp",
}
|
||||||
|
|
||||||
|
|
||||||
|
def safe_text(value) -> str:
    """Return ``str(value)``, mapping ``None`` to the empty string."""
    if value is None:
        return ""
    return str(value)
|
||||||
|
|
||||||
|
|
||||||
|
def extract_emoji_meta(attachment_url: str) -> Tuple[str, int]:
    """Extract the ``md5`` and total length needed to send an emoji.

    Args:
        attachment_url: The stored emoji payload; expected to be XML text.

    Returns:
        ``(md5, total_length)`` with ``md5`` lower-cased. ``("", 0)`` is
        returned when the text does not start with ``<`` or the parsed XML
        has no ``emoji`` element; individual values default to ``""`` / ``0``
        when missing.
    """
    text = safe_text(attachment_url).strip()
    if not text.startswith("<"):
        return "", 0

    md5 = ""
    total_length = 0
    try:
        root = ET.fromstring(text)
        # Element.iter() yields the root itself as well as its descendants,
        # so a payload whose top-level element *is* <emoji> is handled too
        # (find(".//emoji") only looks below the root).
        emoji_node = next(root.iter("emoji"), None)
        if emoji_node is None:
            return "", 0
        md5 = safe_text(emoji_node.attrib.get("md5", "")).strip().lower()
        for key in ("totallen", "total_len", "totalLen", "len"):
            value = safe_text(emoji_node.attrib.get(key, "")).strip()
            if value.isdigit():
                total_length = int(value)
                break
    except Exception:
        # Deliberately broad: a malformed stored payload must never break the
        # caller, so fall back to best-effort regex extraction instead.
        md5_match = _EMOJI_MD5_RE.search(text)
        if md5_match:
            md5 = md5_match.group(1).lower()
        len_match = _EMOJI_TOTALLEN_RE.search(text)
        if len_match:
            try:
                total_length = int(len_match.group(1))
            except Exception:
                total_length = 0
    return md5, total_length
|
||||||
|
|
||||||
|
|
||||||
|
def _read_protobuf_varint(payload: bytes, offset: int):
    """Decode one protobuf varint from ``payload`` starting at ``offset``.

    Returns:
        ``(value, next_offset)`` where ``next_offset`` is the index just past
        the last byte consumed.

    Raises:
        ValueError: The payload ends before the varint terminates, or the
            varint is longer than ten bytes.
    """
    value = 0
    cursor = offset
    # Each byte contributes 7 bits; shifts 0, 7, ..., 63 allow ten bytes.
    for shift in range(0, 64, 7):
        if cursor >= len(payload):
            break
        byte = payload[cursor]
        cursor += 1
        value |= (byte & 0x7F) << shift
        if (byte & 0x80) == 0:
            # A cleared high bit marks the final byte.
            return value, cursor
    raise ValueError("protobuf varint 读取失败")
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_protobuf_strings(payload: bytes, depth: int = 0) -> List[str]:
    """Collect UTF-8 texts from the length-delimited fields of a protobuf blob.

    The payload is walked field by field without a schema. Varint and
    fixed-width fields are skipped. Each non-empty length-delimited field is
    recorded when it decodes as UTF-8, and is additionally parsed as a nested
    message while ``depth`` is below 2. Parsing stops quietly at the first
    malformed or unsupported field.

    Args:
        payload: Raw bytes to scan.
        depth: Current nesting level; callers normally leave it at 0.

    Returns:
        Decoded texts in encounter order, each followed by any texts found
        inside it.
    """
    texts: List[str] = []
    if not payload:
        return texts

    fixed_widths = {1: 8, 5: 4}  # wire type -> byte width of the value
    total = len(payload)
    pos = 0
    while pos < total:
        try:
            tag, pos = _read_protobuf_varint(payload, pos)
        except Exception:
            break
        if tag <= 0:
            break

        wire_type = tag & 0x07
        if wire_type in fixed_widths:
            pos += fixed_widths[wire_type]
            continue
        if wire_type not in (0, 2):
            # Unsupported wire type: give up on the rest of the payload.
            break

        # Wire type 0 carries a varint value; wire type 2 a varint length.
        try:
            number, pos = _read_protobuf_varint(payload, pos)
        except Exception:
            break
        if wire_type == 0:
            continue

        end = pos + number
        if number < 0 or end > total:
            break
        segment = payload[pos:end]
        pos = end
        if not segment:
            continue

        try:
            as_text = segment.decode("utf-8")
        except Exception:
            as_text = ""
        if as_text:
            texts.append(as_text)

        # desc is commonly a nested language-pack structure; two levels of
        # recursion are enough for most historical data.
        if depth < 2:
            texts.extend(_extract_protobuf_strings(segment, depth + 1))
    return texts
|
||||||
|
|
||||||
|
|
||||||
|
def sanitize_emoji_semantic_text(value: str) -> str:
    """Drop non-printable characters and collapse whitespace runs to one space."""
    printable_only = "".join(filter(str.isprintable, safe_text(value)))
    collapsed = re.sub(r"\s+", " ", printable_only.strip())
    return collapsed.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def is_emoji_semantic_candidate(value: str) -> bool:
    """Tell whether a text looks like a human-readable emoji meaning.

    After sanitizing, the text is rejected when it is empty, is or contains a
    locale key, is a stop word, starts with ``com.tencent.`` or ``finder:``,
    looks like a hex digest, looks like base64 (8+ characters), or exceeds
    40 characters. What remains must contain a CJK ideograph or an ASCII
    letter.
    """
    text = sanitize_emoji_semantic_text(value)
    if not text:
        return False

    lowered = text.lower()
    # NOTE(review): the substring test also rejects ordinary words containing
    # "en"/"ja"/"ko"; this looks intentional, to drop flattened locale
    # wrappers, but confirm before relying on English aliases.
    contains_locale_key = any(locale_key in lowered for locale_key in _EMOJI_LOCALE_KEYS)
    rejected = (
        lowered in _EMOJI_LOCALE_KEYS
        or lowered in _EMOJI_SEMANTIC_STOPWORDS
        or contains_locale_key
        or lowered.startswith(("com.tencent.", "finder:"))
        or re.fullmatch(r"[0-9a-f]{16,64}", lowered) is not None
        or (len(text) >= 8 and _EMOJI_BASE64_RE.fullmatch(text) is not None)
        or len(text) > 40
    )
    if rejected:
        return False
    return re.search(r"[\u4e00-\u9fffA-Za-z]", text) is not None
|
||||||
|
|
||||||
|
|
||||||
|
def dedupe_emoji_semantic_candidates(values: List[str]) -> List[str]:
    """Sanitize, filter and de-duplicate candidate texts, keeping first-seen order.

    Duplicates are detected case-insensitively; the first spelling wins.
    Texts rejected by ``is_emoji_semantic_candidate`` are dropped.
    """
    # dict preserves insertion order, so it doubles as an ordered set keyed
    # by the lower-cased text.
    first_seen: Dict[str, str] = {}
    for raw in values or []:
        cleaned = sanitize_emoji_semantic_text(raw)
        if is_emoji_semantic_candidate(cleaned):
            first_seen.setdefault(cleaned.lower(), cleaned)
    return list(first_seen.values())
|
||||||
|
|
||||||
|
|
||||||
|
def _maybe_decode_base64_payload(value: str) -> bytes:
    """Best-effort base64 decode of a field value.

    Whitespace is removed first. Values shorter than 4 characters, or
    containing characters outside the base64 alphabet, yield ``b""``, as
    does any decoding error. Missing ``=`` padding is added before decoding.
    """
    compact = re.sub(r"\s+", "", safe_text(value))
    looks_like_base64 = len(compact) >= 4 and _EMOJI_BASE64_RE.fullmatch(compact) is not None
    if not looks_like_base64:
        return b""

    padding = "=" * (-len(compact) % 4)
    try:
        return base64.b64decode(compact + padding, validate=False)
    except Exception:
        return b""
|
||||||
|
|
||||||
|
|
||||||
|
def decode_emoji_semantic_value(value: str) -> List[str]:
    """Turn one emoji semantic field into a list of candidate meaning texts.

    The raw value itself is kept when it already looks readable. When the
    value also decodes as base64, texts found by scanning the bytes as
    protobuf are added. The whole decoded payload is used as plain UTF-8 only
    if nothing usable has been found by then.
    """
    raw_text = safe_text(value).strip()
    if not raw_text:
        return []

    candidates: List[str] = [raw_text] if is_emoji_semantic_candidate(raw_text) else []

    decoded_bytes = _maybe_decode_base64_payload(raw_text)
    if not decoded_bytes:
        return dedupe_emoji_semantic_candidates(candidates)

    candidates.extend(_extract_protobuf_strings(decoded_bytes))
    usable = dedupe_emoji_semantic_candidates(candidates)
    if usable:
        return usable

    # Some fields are "base64 of plain text" rather than protobuf. The whole
    # payload is decoded as UTF-8 only when the protobuf path produced
    # nothing, so language-pack wrappers are not glued into a dirty value.
    try:
        decoded_text = decoded_bytes.decode("utf-8")
    except Exception:
        decoded_text = ""
    if decoded_text:
        candidates.append(decoded_text)
    return dedupe_emoji_semantic_candidates(candidates)
|
||||||
|
|
||||||
|
|
||||||
|
def extract_emoji_semantic_info(attachment_url: str) -> Dict[str, object]:
    """Extract the main meaning, alias list and source fields of an emoji.

    The ``desc``, ``attachedtext`` and ``emojiattr`` attributes of the
    ``emoji`` element are decoded in that order and their candidates merged.

    Args:
        attachment_url: The stored emoji payload; expected to be XML text.

    Returns:
        A dict with:
            * ``semantic_text``: the first alias containing a CJK ideograph,
              otherwise the first alias, otherwise ``""``.
            * ``semantic_aliases``: de-duplicated candidate texts.
            * ``semantic_source``: comma-joined names of the fields that
              produced at least one candidate.
    """
    text = safe_text(attachment_url).strip()
    if not text.startswith("<"):
        return {
            "semantic_text": "",
            "semantic_aliases": [],
            "semantic_source": "",
        }

    field_names = ("desc", "attachedtext", "emojiattr")
    field_values = []
    try:
        root = ET.fromstring(text)
        # Element.iter() yields the root itself as well as its descendants,
        # so a payload whose top-level element *is* <emoji> is handled too.
        emoji_node = next(root.iter("emoji"), None)
        if emoji_node is not None:
            for field_name in field_names:
                field_values.append((field_name, safe_text(emoji_node.attrib.get(field_name, "")).strip()))
    except Exception:
        # Deliberately broad: malformed payloads fall back to regex matching.
        field_values = []
        for field_name in field_names:
            # \b stops the name from matching inside a longer attribute name.
            # The backreference makes the value end at the same quote
            # character that opened it, so the other quote kind may appear
            # inside the value.
            match = re.search(
                rf'\b{field_name}\s*=\s*(["\'])(.*?)\1',
                text,
                re.IGNORECASE | re.DOTALL,
            )
            field_values.append((field_name, safe_text(match.group(2) if match else "").strip()))

    aliases: List[str] = []
    sources: List[str] = []
    for field_name, field_value in field_values:
        decoded_candidates = decode_emoji_semantic_value(field_value)
        if not decoded_candidates:
            continue
        aliases.extend(decoded_candidates)
        sources.append(field_name)

    semantic_aliases = dedupe_emoji_semantic_candidates(aliases)
    semantic_text = ""
    if semantic_aliases:
        # Prefer an alias containing Chinese characters, since it can be used
        # directly for display and matching.
        chinese_first = [item for item in semantic_aliases if re.search(r"[\u4e00-\u9fff]", item)]
        semantic_text = chinese_first[0] if chinese_first else semantic_aliases[0]

    return {
        "semantic_text": semantic_text,
        "semantic_aliases": semantic_aliases,
        "semantic_source": ",".join(sources),
    }
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_emoji_match_text(value: str) -> str:
    """Normalize a reply text or emoji alias for local matching.

    Notes:
        1. The text is sanitized and lower-cased, then the listed punctuation
           characters and all whitespace are removed, so variants that differ
           only in spacing or punctuation compare equal.
        2. Normalization is intentionally light: no tokenizing and no
           semantic expansion.
        3. Callers apply their own length and score thresholds on top.
    """
    lowered = sanitize_emoji_semantic_text(value).lower()
    without_punctuation = re.sub(r"[,。!?、;:,.!?\-~~`'\"“”‘’()()\[\]【】<>《》/\\|_]+", "", lowered)
    without_spaces = re.sub(r"\s+", "", without_punctuation)
    return without_spaces.strip()
|
||||||
Reference in New Issue
Block a user