From 623ca505d4f893c56e8ab7d97ab8bd2b581f3bd8 Mon Sep 17 00:00:00 2001 From: liuwei Date: Mon, 27 Apr 2026 11:40:44 +0800 Subject: [PATCH] =?UTF-8?q?=E6=89=93=E9=80=9A=E8=87=AA=E5=8A=A8=E5=9B=9E?= =?UTF-8?q?=E5=A4=8D=E4=B8=8E=E8=A1=A8=E6=83=85=E8=AF=AD=E4=B9=89=E5=BA=93?= =?UTF-8?q?=E8=81=94=E5=8A=A8\n\n-=20=E6=96=B0=E5=A2=9E=E8=A1=A8=E6=83=85?= =?UTF-8?q?=E8=AF=AD=E4=B9=89=E8=A7=A3=E6=9E=90=E4=B8=8E=E8=A1=A8=E6=83=85?= =?UTF-8?q?=E8=B5=84=E4=BA=A7=E6=9F=A5=E8=AF=A2=E6=A8=A1=E5=9D=97=EF=BC=8C?= =?UTF-8?q?=E6=94=AF=E6=8C=81=E4=BB=8E=E5=8E=86=E5=8F=B2=E8=A1=A8=E6=83=85?= =?UTF-8?q?=E4=B8=AD=E6=8F=90=E5=8F=96=E5=8F=AF=E8=AF=BB=E4=B8=AD=E6=96=87?= =?UTF-8?q?=E8=AF=AD=E4=B9=89\n-=20=E4=B8=BA=20ai=5Fauto=5Fresponse=20?= =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E7=9F=AD=E5=9B=9E=E5=A4=8D=E8=A1=A8=E6=83=85?= =?UTF-8?q?=E5=8C=B9=E9=85=8D=E5=99=A8=EF=BC=8C=E5=91=BD=E4=B8=AD=E8=AF=AD?= =?UTF-8?q?=E4=B9=89=E6=97=B6=E4=BC=98=E5=85=88=E5=8F=91=E9=80=81=E8=A1=A8?= =?UTF-8?q?=E6=83=85=E5=B9=B6=E6=94=AF=E6=8C=81=E5=A4=B1=E8=B4=A5=E5=9B=9E?= =?UTF-8?q?=E9=80=80=E6=96=87=E6=9C=AC\n-=20=E8=B0=83=E6=95=B4=E8=87=AA?= =?UTF-8?q?=E5=8A=A8=E5=9B=9E=E5=A4=8D=E6=8F=90=E7=A4=BA=E8=AF=8D=E4=B8=8E?= =?UTF-8?q?=E9=85=8D=E7=BD=AE=E9=A1=B9=EF=BC=8C=E5=BC=BA=E5=8C=96=E7=9F=AD?= =?UTF-8?q?=E6=83=85=E7=BB=AA=E5=9B=9E=E5=A4=8D=E5=9C=BA=E6=99=AF=E7=9A=84?= =?UTF-8?q?=E8=A1=A8=E6=83=85=E6=9B=BF=E6=8D=A2=E8=83=BD=E5=8A=9B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- db/emoji_asset_db.py | 45 +++ plugins/ai_auto_response/config.toml | 14 + plugins/ai_auto_response/core/emoji_reply.py | 160 ++++++++++ .../ai_auto_response/core/prompt_builder.py | 1 + plugins/ai_auto_response/main.py | 40 ++- utils/wechat/emoji_semantic_parser.py | 282 ++++++++++++++++++ 6 files changed, 540 insertions(+), 2 deletions(-) create mode 100644 db/emoji_asset_db.py create mode 100644 plugins/ai_auto_response/core/emoji_reply.py create mode 100644 utils/wechat/emoji_semantic_parser.py diff 
class EmojiAssetDB(BaseDBOperator):
    """Read-only queries over emoji (sticker) records in the ``messages`` table.

    Notes:
        1. Kept as a standalone query class so the auto-reply plugin can read
           the emoji library without depending on the admin blueprint.
        2. Only raw emoji rows are returned; semantic parsing and match
           scoring are done by the callers.
        3. Any consumer (admin pages, auto-reply, other plugins) can share
           this single data source.
    """

    # Characters accepted in an md5 lookup key. Anything else (notably the
    # SQL LIKE wildcards ``%`` and ``_``) is rejected before querying.
    _HEX_DIGITS = frozenset("0123456789abcdefABCDEF")

    def __init__(self, db_manager: DBConnectionManager):
        super().__init__(db_manager)

    def get_recent_emoji_assets(self, limit: int = 500) -> List[Dict]:
        """Return the most recent emoji message rows, newest first.

        Args:
            limit: Maximum number of rows to fetch.

        Returns:
            The rows produced by ``execute_query``, or an empty list when the
            query yields a falsy result.
        """
        sql = """
            SELECT message_id, group_id, sender, timestamp, message_type, attachment_url, image_path
            FROM messages
            WHERE message_type IN ('47', '1048625', '1090519089')
              AND attachment_url IS NOT NULL
              AND attachment_url <> ''
            ORDER BY timestamp DESC
            LIMIT %s
        """
        return self.execute_query(sql, (limit,)) or []

    def get_emoji_asset_by_md5(self, md5: str) -> Optional[Dict]:
        """Return the newest emoji row whose XML payload carries ``md5``.

        Args:
            md5: Hex digest of the emoji (16-64 hex characters).

        Returns:
            The matching row, or ``None`` when ``md5`` is not a plausible hex
            digest or no row matches.
        """
        candidate = "" if md5 is None else str(md5).strip()
        # The value is embedded in a LIKE pattern: an empty string would match
        # rows with md5="" and a stray % or _ would act as a wildcard, so only
        # plain hex digests are allowed through.
        if not 16 <= len(candidate) <= 64:
            return None
        if not self._HEX_DIGITS.issuperset(candidate):
            return None

        sql = """
            SELECT message_id, group_id, sender, timestamp, message_type, attachment_url, image_path
            FROM messages
            WHERE message_type IN ('47', '1048625', '1090519089')
              AND attachment_url IS NOT NULL
              AND attachment_url <> ''
              AND attachment_url LIKE %s
            ORDER BY timestamp DESC
            LIMIT 1
        """
        return self.execute_query(sql, (f'%md5="{candidate}"%',), fetch_one=True)
class EmojiReplySelector:
    """Decide whether a short auto-reply text should be sent as an emoji.

    Design goals:
        1. The model still produces natural text; only very short emotional
           replies are considered for replacement by an emoji.
        2. Selection relies purely on the semantic text already stored with
           the emoji records, so the model never needs to know an md5.
        3. Any failure here (no match, database error) yields "no emoji", so
           the caller can always fall back to sending the text.
    """

    # Sentence-final modal particles ignored when comparing reply and alias.
    _MODAL_SUFFIXES = frozenset({"啊", "呀", "啦", "呢", "嘛", "吧", "哇", "诶", "欸"})
    # After a failed database load, wait this long before trying again.
    _FAILURE_RETRY_SEC = 30

    def __init__(self, db_manager, config: Optional[Dict[str, Any]] = None):
        """Read thresholds from ``config`` and prepare an empty asset cache.

        Args:
            db_manager: Connection manager handed to ``EmojiAssetDB``; when it
                is ``None`` the selector is disabled.
            config: The ``[emoji_reply]`` configuration table. Missing,
                falsy or non-numeric values fall back to the defaults.
        """
        self.db_manager = db_manager
        self.config = config or {}
        self.enabled = bool(self.config.get("enable", True)) and db_manager is not None
        self.asset_limit = self._read_int("asset_scan_limit", 800, 50)
        self.cache_ttl_sec = self._read_int("cache_ttl_sec", 300, 30)
        self.max_reply_chars = self._read_int("max_reply_chars", 8, 1)
        self.max_alias_chars = self._read_int("max_alias_chars", 16, 1)
        self.min_match_score = self._read_int("min_match_score", 75, 1)
        self.min_semantic_length = self._read_int("min_semantic_length", 1, 1)
        self.require_single_chunk = bool(self.config.get("require_single_chunk", True))
        self.asset_db = EmojiAssetDB(db_manager) if db_manager is not None else None
        self._cache_assets: List[Dict[str, Any]] = []
        self._cache_expires_at = 0.0

    def _read_int(self, key: str, default: int, minimum: int) -> int:
        """Return ``config[key]`` as an int clamped to ``minimum``.

        A missing, falsy or non-numeric value yields ``default`` instead of
        raising, so a typo in the config file cannot break plugin start-up.
        """
        try:
            value = int(self.config.get(key, default) or default)
        except (TypeError, ValueError):
            value = default
        return max(value, minimum)

    def match_reply_to_emoji(
        self,
        reply_text: str,
        reply_chunks: Optional[List[str]] = None,
    ) -> Optional[Dict[str, Any]]:
        """Pick the emoji asset that best matches the final reply text.

        Only short single-sentence replies are considered. Scores come from
        ``_score_alias_match``; the first asset reaching the highest score
        wins, and a result is returned only if that score reaches
        ``min_match_score``.

        Args:
            reply_text: The final reply text.
            reply_chunks: The reply split into send chunks, if any.

        Returns:
            A dict with ``md5``, ``total_length``, ``semantic_text``,
            ``semantic_aliases`` and ``match_score``, or ``None`` when the
            reply should stay plain text.
        """
        if not self.enabled:
            return None

        chunks = [chunk for chunk in (reply_chunks or []) if safe_text(chunk).strip()]
        if self.require_single_chunk and len(chunks) > 1:
            return None

        raw_text = safe_text(reply_text).strip()
        if not raw_text or len(raw_text) > self.max_reply_chars:
            return None

        normalized = normalize_emoji_match_text(raw_text)
        if not normalized or len(normalized) < self.min_semantic_length:
            return None

        best_asset = None
        best_score = -1
        for asset in self._load_assets():
            for alias in asset.get("semantic_aliases", []) or []:
                score = self._score_alias_match(normalized, alias)
                if score > best_score:
                    best_score = score
                    best_asset = asset
            # 100 is the maximum score and ties keep the earlier asset, so
            # nothing later can replace an exact match.
            if best_score >= 100:
                break

        if not best_asset or best_score < self.min_match_score:
            return None
        return {
            "md5": best_asset.get("md5", ""),
            "total_length": int(best_asset.get("total_length") or 0),
            "semantic_text": best_asset.get("semantic_text", ""),
            "semantic_aliases": best_asset.get("semantic_aliases", []) or [],
            "match_score": best_score,
        }

    def _load_assets(self) -> List[Dict[str, Any]]:
        """Load emoji assets usable for auto-reply, cached for ``cache_ttl_sec``.

        Rows are grouped by md5; aliases longer than ``max_alias_chars`` are
        dropped and assets without any alias are discarded.

        Returns:
            The cached asset list. It may be empty, and an empty result is
            cached like any other so an empty library is not re-queried on
            every reply. If the database call fails, the previous cache (or
            an empty list) is returned.
        """
        if not self.enabled or self.asset_db is None:
            return []

        now = time.time()
        if now < self._cache_expires_at:
            return self._cache_assets

        try:
            rows = self.asset_db.get_recent_emoji_assets(limit=self.asset_limit) or []
        except Exception:
            # Emoji replacement is optional decoration: a database problem
            # must not stop the caller from sending the text reply.
            import logging

            logging.getLogger(__name__).exception("failed to load emoji assets for auto reply")
            self._cache_expires_at = now + min(self.cache_ttl_sec, self._FAILURE_RETRY_SEC)
            return self._cache_assets

        assets: Dict[str, Dict[str, Any]] = {}
        for row in rows:
            attachment_url = safe_text(row.get("attachment_url"))
            md5, total_length = extract_emoji_meta(attachment_url)
            if not md5 or total_length <= 0:
                continue

            semantic_info = extract_emoji_semantic_info(attachment_url)
            semantic_aliases = [
                alias
                for alias in (semantic_info.get("semantic_aliases") or [])
                if len(alias) <= self.max_alias_chars
            ]
            if not semantic_aliases:
                continue

            target = assets.setdefault(md5, {
                "md5": md5,
                "total_length": total_length,
                "semantic_text": "",
                "semantic_aliases": [],
            })
            if not target.get("total_length") and total_length > 0:
                target["total_length"] = total_length
            if not target.get("semantic_text") and semantic_info.get("semantic_text"):
                target["semantic_text"] = semantic_info.get("semantic_text")
            target["semantic_aliases"] = dedupe_emoji_semantic_candidates(
                list(target.get("semantic_aliases") or []) + semantic_aliases
            )

        self._cache_assets = [asset for asset in assets.values() if asset.get("semantic_aliases")]
        self._cache_expires_at = now + self.cache_ttl_sec
        return self._cache_assets

    def _score_alias_match(self, normalized_reply: str, alias: str) -> int:
        """Score how well a normalized reply matches one emoji alias.

        Scoring:
            * 100 - reply equals the normalized alias.
            * 96  - reply without trailing modal particles equals the alias.
            * 94  - reply and alias are equal once both lose their trailing
              modal particles.
            * 82 + min(overlap, 10) - one contains the other, where overlap
              is the length of the shorter string.
            * 0   - anything else, or an empty side.
        """
        normalized_alias = normalize_emoji_match_text(alias)
        if not normalized_reply or not normalized_alias:
            return 0

        if normalized_reply == normalized_alias:
            return 100

        stripped_reply = self._strip_modal_suffix(normalized_reply)
        stripped_alias = self._strip_modal_suffix(normalized_alias)
        if stripped_reply and stripped_reply == normalized_alias:
            return 96
        if stripped_reply and stripped_reply == stripped_alias:
            return 94

        if normalized_reply in normalized_alias or normalized_alias in normalized_reply:
            overlap = min(len(normalized_reply), len(normalized_alias))
            return 82 + min(overlap, 10)

        return 0

    @staticmethod
    def _strip_modal_suffix(text: str) -> str:
        """Drop trailing modal particles to reduce colloquial noise."""
        normalized = safe_text(text)
        while normalized and normalized[-1] in EmojiReplySelector._MODAL_SUFFIXES:
            normalized = normalized[:-1]
        return normalized
"规则优先级:当前发言可验证信息 > 群场景约束 > 人设措辞润色。", "如果是明确问题,先给结论;只给第一层答案,不主动展开第二层解释。", length_rule, + "如果最自然的回复只是短情绪词或短语气词,比如“哈哈”“哇”“害”“难道”,就只回那个短词,不要为了凑完整句硬补解释。", "能少说就少说,优先像群友随口接一句,不要写成说明文。", "回复总长度尽量控制在30字内;确实需要补充时最多2句且总长度不超过55字。", "禁止大段铺垫、总结腔、条目化回答。", diff --git a/plugins/ai_auto_response/main.py b/plugins/ai_auto_response/main.py index 73142e5..1805c14 100644 --- a/plugins/ai_auto_response/main.py +++ b/plugins/ai_auto_response/main.py @@ -38,6 +38,7 @@ from .memory.social_memory import SocialMemoryService from .profile.group_profile import GroupProfileResolver from .context.conversation_hints import build_conversation_hints from .core.decision_flow import DecisionFlow +from .core.emoji_reply import EmojiReplySelector from .core.triggers import TriggerRouter from .core.llm_result_parser import LLMResultParser from .core.reply_formatter import finalize_reply, preview_text @@ -101,6 +102,7 @@ class AIAutoResponsePlugin(MessagePluginInterface): self.queue_maxsize = 200 self.queue_workers: List[asyncio.Task] = [] self.reply_limits: Dict[str, Any] = {} + self.emoji_reply_config: Dict[str, Any] = {} self.prompt_compact_config: Dict[str, Any] = {} self.message_expire_sec = 0.0 self.room_message_seq_counter = 0 @@ -142,8 +144,10 @@ class AIAutoResponsePlugin(MessagePluginInterface): self.mode_config = self._config.get("mode", {}) or {} self.cooldown_config = self._config.get("cooldown", {}) or {} self.reply_limits = self._config.get("reply", {}) or {} + self.emoji_reply_config = self._config.get("emoji_reply", {}) or {} self.prompt_compact_config = self._config.get("prompt_compact", {}) or {} self.cooldown = CooldownManager(self.cooldown_config) + self.emoji_reply_selector = EmojiReplySelector(self.db_manager, self.emoji_reply_config) self.image_config = self._config.get("image", {}) or {} self.spam_config = self._config.get("spam_guard", {}) or {} runtime_config = self._config.get("runtime", {}) or {} @@ -681,8 +685,37 @@ class AIAutoResponsePlugin(MessagePluginInterface): ) 
return False, "duplicate_reply" - for chunk in reply_chunks: - await bot.send_text_message(room_id, chunk, sender) + # 这里让“自动回复文本”先经过一次本地表情匹配: + # 1. 模型仍然只负责输出自然语言,不需要知道 md5; + # 2. 只有命中中文语义库且回复足够短时,才会切换成表情发送; + # 3. 若表情发送失败,立刻回退到原始文本,避免因为表情链路影响主回复成功率。 + sent_as_emoji = False + emoji_asset = self.emoji_reply_selector.match_reply_to_emoji(final_response_text, reply_chunks) + if emoji_asset and emoji_asset.get("md5") and int(emoji_asset.get("total_length") or 0) > 0: + try: + await bot.send_emoji_message( + room_id, + str(emoji_asset.get("md5")), + int(emoji_asset.get("total_length") or 0), + ) + sent_as_emoji = True + except Exception as emoji_error: + self._log_event( + "emoji_fallback", + room_id=room_id, + sender=sender, + trigger_type=trigger.trigger_type, + reply_mode=reply_mode, + topic=selected_topic, + response_preview=preview_text(final_response_text), + emoji_semantic=emoji_asset.get("semantic_text", ""), + emoji_match_score=emoji_asset.get("match_score", 0), + error=str(emoji_error), + ) + + if not sent_as_emoji: + for chunk in reply_chunks: + await bot.send_text_message(room_id, chunk, sender) self.cooldown.note_reply(room_id) self.flow_manager.note_bot_reply(room_id) self.memory_store.note_bot_reply(room_id, sender, selected_topic) @@ -698,6 +731,9 @@ class AIAutoResponsePlugin(MessagePluginInterface): response_preview=preview_text(final_response_text), response_len=len(final_response_text), chunk_count=len(reply_chunks), + sent_as_emoji=yn(sent_as_emoji), + emoji_semantic=(emoji_asset or {}).get("semantic_text", ""), + emoji_match_score=(emoji_asset or {}).get("match_score", 0), ) return False, "replied" finally: diff --git a/utils/wechat/emoji_semantic_parser.py b/utils/wechat/emoji_semantic_parser.py new file mode 100644 index 0000000..1496880 --- /dev/null +++ b/utils/wechat/emoji_semantic_parser.py @@ -0,0 +1,282 @@ +import base64 +import re +import xml.etree.ElementTree as ET +from typing import Dict, List, Tuple + + +# 说明: +# 1. 
# Notes:
# 1. The semantic fields of emoji messages are not stable: a value may be
#    plain text or base64-encoded protobuf.
# 2. "Send parameter parsing" and "semantic text extraction" are gathered in
#    this one utility so that different consumers can share them.
# 3. The module contains pure parsing logic only (no Flask / DB access).

# The look-behind keeps a longer attribute name that merely ends in the wanted
# one (such as ``androidmd5`` or ``androidlen``) from being picked up.
_EMOJI_MD5_RE = re.compile(
    r'(?<![0-9A-Za-z_])md5\s*=\s*["\']([0-9a-fA-F]{16,64})["\']',
    re.IGNORECASE,
)
_EMOJI_TOTALLEN_RE = re.compile(
    r'(?<![0-9A-Za-z_])(?:totallen|total_len|len)\s*=\s*["\'](\d+)["\']',
    re.IGNORECASE,
)
_EMOJI_BASE64_RE = re.compile(r"^[A-Za-z0-9+/=]+$")
_EMOJI_LOCALE_KEYS = {"zh_cn", "zh_tw", "zh_hk", "default", "en", "ja", "ko"}
# Locale keys long enough to be rejected wherever they occur inside a text.
# The two-letter codes are excluded: as substrings they would reject ordinary
# words ("en" is inside "Friend", "ko" inside "OKOK").
_EMOJI_LOCALE_SUBSTRING_KEYS = tuple(sorted(key for key in _EMOJI_LOCALE_KEYS if len(key) > 2))
_EMOJI_SEMANTIC_STOPWORDS = {
    "default",
    "zh_cn",
    "zh_tw",
    "zh_hk",
    "en",
    "ja",
    "ko",
    "opus",
    "gif",
    "png",
    "jpg",
    "jpeg",
    "webp",
}
_EMOJI_SEMANTIC_FIELDS = ("desc", "attachedtext", "emojiattr")
_PLAIN_WHITESPACE = " \t\r\n"


def safe_text(value) -> str:
    """Convert ``value`` to ``str``, mapping ``None`` to an empty string."""
    return "" if value is None else str(value)


def _find_emoji_node(root):
    """Return the ``<emoji>`` element: the root itself or its first descendant."""
    if root.tag == "emoji":
        return root
    return root.find(".//emoji")


def _extract_emoji_meta_by_regex(text: str) -> Tuple[str, int]:
    """Regex fallback for payloads that are not well-formed XML."""
    md5_match = _EMOJI_MD5_RE.search(text)
    md5 = md5_match.group(1).lower() if md5_match else ""
    total_length = 0
    len_match = _EMOJI_TOTALLEN_RE.search(text)
    if len_match:
        try:
            total_length = int(len_match.group(1))
        except ValueError:
            total_length = 0
    return md5, total_length


def extract_emoji_meta(attachment_url: str) -> Tuple[str, int]:
    """Extract the md5 and total length needed to send an emoji.

    Args:
        attachment_url: The emoji XML payload.

    Returns:
        ``(md5, total_length)`` with the md5 lower-cased. ``("", 0)`` is
        returned when the text does not start with ``<`` or the parsed XML
        has no ``<emoji>`` element. Text that fails to parse as XML is
        scanned with regular expressions instead.
    """
    text = safe_text(attachment_url).strip()
    if not text.startswith("<"):
        return "", 0

    try:
        root = ET.fromstring(text)
    except Exception:
        # Stored payloads are arbitrary historical data; any parser failure
        # degrades to the regex scan rather than propagating.
        return _extract_emoji_meta_by_regex(text)

    emoji_node = _find_emoji_node(root)
    if emoji_node is None:
        return "", 0

    md5 = safe_text(emoji_node.attrib.get("md5", "")).strip().lower()
    total_length = 0
    for key in ("totallen", "total_len", "totalLen", "len"):
        value = safe_text(emoji_node.attrib.get(key, "")).strip()
        # isdigit() alone also accepts characters such as superscripts that
        # int() rejects, hence the ASCII guard.
        if value.isascii() and value.isdigit():
            total_length = int(value)
            break
    return md5, total_length


def _read_protobuf_varint(payload: bytes, offset: int):
    """Read one protobuf varint starting at ``offset``.

    Returns:
        ``(value, next_offset)``.

    Raises:
        ValueError: If the payload ends, or more than ten bytes are consumed,
            before a byte without the continuation bit is found.
    """
    result = 0
    shift = 0
    index = offset
    while index < len(payload) and shift <= 63:
        current = payload[index]
        index += 1
        result |= (current & 0x7F) << shift
        if not (current & 0x80):
            return result, index
        shift += 7
    raise ValueError("protobuf varint 读取失败")


def _has_control_chars(text: str) -> bool:
    """Report whether ``text`` contains non-printable, non-whitespace characters."""
    return any(not ch.isprintable() and ch not in _PLAIN_WHITESPACE for ch in text)


def _extract_protobuf_strings(payload: bytes, depth: int = 0) -> List[str]:
    """Recursively collect UTF-8 text from length-delimited protobuf fields.

    Parsing is best effort: it stops quietly at the first byte sequence that
    does not look like a valid field. A chunk that decodes as UTF-8 but
    contains control characters is treated as a nested message: it is not
    reported as text, only descended into, so framing bytes are never glued
    onto the real text.
    """
    if not payload:
        return []

    results: List[str] = []
    index = 0
    while index < len(payload):
        try:
            tag, index = _read_protobuf_varint(payload, index)
        except Exception:
            break
        if tag <= 0:
            break

        wire_type = tag & 0x07
        if wire_type == 0:  # varint: read and discard
            try:
                _, index = _read_protobuf_varint(payload, index)
            except Exception:
                break
            continue
        if wire_type == 1:  # 64-bit fixed
            index += 8
            continue
        if wire_type == 5:  # 32-bit fixed
            index += 4
            continue
        if wire_type != 2:  # only length-delimited fields can hold text
            break

        try:
            length, index = _read_protobuf_varint(payload, index)
        except Exception:
            break
        if length < 0 or index + length > len(payload):
            break

        chunk = payload[index:index + length]
        index += length
        if not chunk:
            continue

        try:
            decoded = chunk.decode("utf-8")
        except Exception:
            decoded = ""
        if decoded and not _has_control_chars(decoded):
            results.append(decoded)

        # Descend at most two levels below the top-level message.
        if depth < 2:
            results.extend(_extract_protobuf_strings(chunk, depth + 1))
    return results


def sanitize_emoji_semantic_text(value: str) -> str:
    """Remove non-printable characters and collapse runs of whitespace."""
    text = "".join(ch for ch in safe_text(value) if ch.isprintable()).strip()
    text = re.sub(r"\s+", " ", text)
    return text.strip()


def is_emoji_semantic_candidate(value: str) -> bool:
    """Decide whether ``value`` looks like human-readable emoji semantics.

    Rejected: empty text, locale keys and stop words (exact match), text
    containing one of the longer locale keys, ``com.tencent.`` / ``finder:``
    identifiers, hex digests, base64-looking strings of eight or more
    characters, text longer than 40 characters, and text without any CJK or
    Latin letter.
    """
    text = sanitize_emoji_semantic_text(value)
    if not text:
        return False

    lowered = text.lower()
    if lowered in _EMOJI_LOCALE_KEYS or lowered in _EMOJI_SEMANTIC_STOPWORDS:
        return False
    if any(locale_key in lowered for locale_key in _EMOJI_LOCALE_SUBSTRING_KEYS):
        return False
    if lowered.startswith("com.tencent.") or lowered.startswith("finder:"):
        return False
    if re.fullmatch(r"[0-9a-f]{16,64}", lowered):
        return False
    if len(text) >= 8 and _EMOJI_BASE64_RE.fullmatch(text):
        return False
    if len(text) > 40:
        return False
    return bool(re.search(r"[\u4e00-\u9fffA-Za-z]", text))


def dedupe_emoji_semantic_candidates(values: List[str]) -> List[str]:
    """Sanitize, filter and de-duplicate candidates, keeping first-seen order.

    Duplicates are detected case-insensitively; the first spelling is kept.
    """
    seen = set()
    results: List[str] = []
    for item in values or []:
        text = sanitize_emoji_semantic_text(item)
        if not is_emoji_semantic_candidate(text):
            continue
        key = text.lower()
        if key in seen:
            continue
        seen.add(key)
        results.append(text)
    return results


def _maybe_decode_base64_payload(value: str) -> bytes:
    """Decode ``value`` as base64, returning ``b""`` when that is not possible."""
    normalized = re.sub(r"\s+", "", safe_text(value))
    if len(normalized) < 4 or not _EMOJI_BASE64_RE.fullmatch(normalized):
        return b""
    normalized += "=" * (-len(normalized) % 4)  # restore stripped padding
    try:
        return base64.b64decode(normalized, validate=False)
    except Exception:
        return b""


def decode_emoji_semantic_value(value: str) -> List[str]:
    """Turn one semantic field value into a list of candidate texts.

    The raw value is considered first, then the strings found by reading the
    base64-decoded bytes as protobuf. Only when neither produced a usable
    candidate is the decoded payload tried as plain UTF-8 text.
    """
    raw_text = safe_text(value).strip()
    if not raw_text:
        return []

    candidates: List[str] = []
    if is_emoji_semantic_candidate(raw_text):
        candidates.append(raw_text)

    decoded_bytes = _maybe_decode_base64_payload(raw_text)
    if not decoded_bytes:
        return dedupe_emoji_semantic_candidates(candidates)

    candidates.extend(_extract_protobuf_strings(decoded_bytes))

    # Some values are base64 of plain text rather than protobuf. The whole
    # payload is only used when nothing else was found, and never when it
    # still carries control bytes (then it is protobuf framing, not text).
    if not dedupe_emoji_semantic_candidates(candidates):
        try:
            decoded_text = decoded_bytes.decode("utf-8")
        except Exception:
            decoded_text = ""
        if decoded_text and not _has_control_chars(decoded_text):
            candidates.append(decoded_text)
    return dedupe_emoji_semantic_candidates(candidates)


def extract_emoji_semantic_info(attachment_url: str) -> Dict[str, object]:
    """Extract the main semantic text, its aliases and their source fields.

    Args:
        attachment_url: The emoji XML payload.

    Returns:
        A dict with ``semantic_text`` (the first alias containing a CJK
        character, else the first alias, else ``""``), ``semantic_aliases``
        (de-duplicated list) and ``semantic_source`` (comma-joined names of
        the attributes that produced candidates).
    """
    text = safe_text(attachment_url).strip()
    if not text.startswith("<"):
        return {
            "semantic_text": "",
            "semantic_aliases": [],
            "semantic_source": "",
        }

    field_values = []
    try:
        root = ET.fromstring(text)
    except Exception:
        # Not well-formed XML: read the attributes with regular expressions.
        for field_name in _EMOJI_SEMANTIC_FIELDS:
            match = re.search(
                rf'(?<![0-9A-Za-z_]){field_name}\s*=\s*[\"\']([^\"\']+)[\"\']',
                text,
                re.IGNORECASE,
            )
            field_values.append((field_name, safe_text(match.group(1) if match else "").strip()))
    else:
        emoji_node = _find_emoji_node(root)
        if emoji_node is not None:
            for field_name in _EMOJI_SEMANTIC_FIELDS:
                field_values.append((field_name, safe_text(emoji_node.attrib.get(field_name, "")).strip()))

    aliases: List[str] = []
    sources: List[str] = []
    for field_name, field_value in field_values:
        decoded_candidates = decode_emoji_semantic_value(field_value)
        if not decoded_candidates:
            continue
        aliases.extend(decoded_candidates)
        sources.append(field_name)

    semantic_aliases = dedupe_emoji_semantic_candidates(aliases)
    semantic_text = ""
    if semantic_aliases:
        # Prefer an alias with Chinese characters as the display/match text.
        chinese_first = [item for item in semantic_aliases if re.search(r"[\u4e00-\u9fff]", item)]
        semantic_text = chinese_first[0] if chinese_first else semantic_aliases[0]

    return {
        "semantic_text": semantic_text,
        "semantic_aliases": semantic_aliases,
        "semantic_source": ",".join(sources),
    }


def normalize_emoji_match_text(value: str) -> str:
    """Normalize reply text and emoji semantics for local matching.

    The text is sanitized, lower-cased, and stripped of whitespace and the
    punctuation listed in the pattern below. No tokenization or synonym
    expansion is performed.
    """
    text = sanitize_emoji_semantic_text(value).lower()
    text = re.sub(r"[,。!?、;:,.!?\-~~`'\"“”‘’()()\[\]【】<>《》/\\|_]+", "", text)
    text = re.sub(r"\s+", "", text)
    return text.strip()