变更项: 1. 扩展引用发送者解析字段,新增 fromusr/fromnickname/sourceusername/sourcedisplayname 等兼容项。 2. 增加引用质量门控:发送者、标题、正文均缺失时直接丢弃 quote_context,避免污染 LLM。 3. 构建引用补充时不再输出“被引用发送者:未知成员”等低价值字段。 4. 增加兜底策略:仅剩引用类型且无正文标题时不输出引用补充。
127 lines
4.2 KiB
Python
127 lines
4.2 KiB
Python
from __future__ import annotations
|
||
|
||
import html
|
||
import xml.etree.ElementTree as ET
|
||
from typing import Any, Callable, Dict
|
||
|
||
from wechat_ipad.models.message import MessageType
|
||
|
||
|
||
def parse_quote_context(full_msg: Any, room_id: str, get_sender_name: Callable[[str, str], str]) -> Dict[str, str]:
    """Extract quote (reply-to) context from a message's XML payload.

    Args:
        full_msg: Message object; only ``full_msg.content.xml_content`` is read.
        room_id: Room identifier, forwarded to ``get_sender_name``.
        get_sender_name: Callback ``(room_id, user_id) -> display name`` used
            when the XML carries only a user id for the quoted sender.

    Returns:
        An empty dict when the message has no content or XML, the XML does not
        parse, there is no ``appmsg`` of type ``57``, there is no ``refermsg``
        child, or the quote is judged low-signal. Otherwise a dict with the
        keys ``title``, ``quote_sender_name``, ``quote_type_label``,
        ``quote_body`` and ``raw_ref_content``.
    """
    if not full_msg or not getattr(full_msg, "content", None):
        return {}

    xml_content = getattr(full_msg.content, "xml_content", "") or ""
    if not xml_content:
        return {}

    # NOTE(review): xml_content presumably comes straight from the incoming
    # message; the stdlib parser is not hardened against maliciously
    # constructed XML -- confirm whether this input is trusted.
    try:
        root = ET.fromstring(xml_content)
    except ET.ParseError:
        return {}

    # Only appmsg type "57" is treated as a quote message here.
    appmsg = root.find(".//appmsg")
    if appmsg is None or appmsg.findtext("type", "").strip() != "57":
        return {}

    refer = appmsg.find("refermsg")
    if refer is None:
        return {}

    title = html.unescape(appmsg.findtext("title", "") or "").strip()
    quote_sender_name = _extract_quote_sender_name(refer, room_id, get_sender_name)

    # A non-numeric <type> must not crash parsing; treat it like a missing
    # type (0), which downstream maps to the generic "unknown type" label.
    try:
        ref_type = int(refer.findtext("type", "0") or 0)
    except ValueError:
        ref_type = 0

    ref_content = html.unescape(refer.findtext("content", "") or "").strip()
    quote_type_label = quote_type_label_for(ref_type)
    quote_body = build_quote_body(ref_type, ref_content, title)

    # Noise reduction: a quote without any useful payload is not passed to
    # downstream context, so it cannot pollute the LLM's judgement.
    if _is_low_signal_quote(quote_sender_name, quote_body, title, quote_type_label):
        return {}

    return {
        "title": title,
        "quote_sender_name": quote_sender_name,
        "quote_type_label": quote_type_label,
        "quote_body": quote_body,
        "raw_ref_content": ref_content,
    }
|
||
|
||
|
||
def quote_type_label_for(ref_type: int) -> str:
    """Return the display label for a quoted message's numeric type code.

    Type codes matching one of the listed ``MessageType`` members get a fixed
    label; any other code yields a generic label embedding the raw number.
    """
    known_labels = {
        MessageType.TEXT.value: "引用文本",
        MessageType.IMAGE.value: "引用图片",
        MessageType.VIDEO.value: "引用视频",
        MessageType.APP.value: "引用应用消息",
        MessageType.EMOTICON.value: "引用表情",
    }
    if ref_type in known_labels:
        return known_labels[ref_type]
    # Unrecognised code: keep the number visible so it can be diagnosed.
    return f"引用消息[{ref_type}]"
|
||
|
||
|
||
def build_quote_body(ref_type: int, ref_content: str, title: str) -> str:
    """Build the short body text describing the quoted message.

    - Text quotes: the quoted content, cut to 220 characters and stripped.
    - Image quotes: a description assembled from the title (when present) and
      a fixed "an image was quoted" note; never empty.
    - Any other type: the title if there is one, otherwise the quoted
      content, cut to 220 characters and stripped.
    """
    image_note = "被引用的是一张图片"

    if ref_type == MessageType.TEXT.value:
        snippet_source = ref_content
    elif ref_type == MessageType.IMAGE.value:
        # Keep only the fragments whose underlying field is actually present.
        fragments = [
            text
            for present, text in (
                (title, f"当前追问文案:{title}"),
                (ref_content, image_note),
            )
            if present
        ]
        return ";".join(fragments) if fragments else image_note
    else:
        snippet_source = title if title else ref_content

    return snippet_source[:220].strip()
|
||
|
||
|
||
def _extract_quote_sender_name(
    refer: ET.Element,
    room_id: str,
    get_sender_name: Callable[[str, str], str],
) -> str:
    """Work out a display name for the sender of the quoted message.

    Lookup order:
      1. A name stored directly in ``refer`` (display-name style tags).
      2. A user id stored in ``refer``, resolved through ``get_sender_name``;
         the raw id is used when the callback returns nothing usable.
      3. The placeholder "未知成员" when neither kind of tag has a value.
    """
    # Tag variants differ between clients, so several spellings are probed.
    name_tags = ("displayname", "fromnickname", "sourcedisplayname", "source_displayname")
    id_tags = ("chatusr", "fromusr", "sourceusername", "source_username")

    embedded_name = _first_non_empty(refer, *name_tags)
    if embedded_name:
        return embedded_name

    sender_id = _first_non_empty(refer, *id_tags)
    if not sender_id:
        return "未知成员"

    looked_up = (get_sender_name(room_id, sender_id) or "").strip()
    return looked_up if looked_up else sender_id
|
||
|
||
|
||
def _first_non_empty(root: ET.Element, *tags: str) -> str:
|
||
for tag in tags:
|
||
value = html.unescape(root.findtext(tag, "") or "").strip()
|
||
if value:
|
||
return value
|
||
return ""
|
||
|
||
|
||
def _is_low_signal_quote(quote_sender_name: str, quote_body: str, title: str, quote_type_label: str) -> bool:
|
||
sender = (quote_sender_name or "").strip()
|
||
body = (quote_body or "").strip()
|
||
title_text = (title or "").strip()
|
||
type_label = (quote_type_label or "").strip()
|
||
|
||
has_sender = sender and sender != "未知成员"
|
||
has_body = bool(body)
|
||
has_title = bool(title_text)
|
||
# 引用消息[数字] 代表类型未知,若同时没有发送者/正文/标题,则直接丢弃
|
||
unknown_type = type_label.startswith("引用消息[")
|
||
if unknown_type and (not has_sender) and (not has_body) and (not has_title):
|
||
return True
|
||
# 普通场景:三者都缺失也丢弃
|
||
return (not has_sender) and (not has_body) and (not has_title)
|