优化引用上下文质量并修复无效引用噪声

变更项:

1. 扩展引用发送者解析字段,新增 fromusr/fromnickname/sourceusername/sourcedisplayname 等兼容项。

2. 增加引用质量门控:发送者、标题、正文均缺失时直接丢弃 quote_context,避免污染 LLM。

3. 构建引用补充时不再输出“被引用发送者:未知成员”等低价值字段。

4. 增加兜底策略:仅剩引用类型且无正文标题时不输出引用补充。
This commit is contained in:
liuwei
2026-04-16 11:12:16 +08:00
parent b4b3fa92e0
commit a68d6d5e6c
2 changed files with 67 additions and 7 deletions

View File

@@ -401,18 +401,22 @@ class ContextBuilder:
if not quote_context:
return ""
quote_type = quote_context.get("quote_type_label", "引用消息")
quote_sender = quote_context.get("quote_sender_name", "") or "未知成员"
quote_sender = (quote_context.get("quote_sender_name", "") or "").strip()
quote_body = quote_context.get("quote_body", "") or ""
title = quote_context.get("title", "") or ""
lines = [
f"用户这次是在引用消息后发言。",
f"引用类型:{quote_type}",
f"被引用发送者:{quote_sender}",
f"被引用发送者:{quote_sender}" if quote_sender and quote_sender != "未知成员" else "",
f"图片附件:已附带原图" if quote_context.get("has_image_attachment") else "",
f"引用标题:{title}" if title else "",
f"被引用内容:{quote_body}" if quote_body else "",
]
return "\n".join([line for line in lines if line])
payload = [line for line in lines if line]
# 兜底:如果最终只剩“引用类型”,没有可用内容,就不输出引用补充
if len(payload) <= 2 and not quote_body and not title:
return ""
return "\n".join(payload)
@staticmethod
def _build_image_prompt(image_context: Dict) -> str:

View File

@@ -27,14 +27,16 @@ def parse_quote_context(full_msg: Any, room_id: str, get_sender_name: Callable[[
return {}
title = html.unescape(appmsg.findtext("title", "") or "").strip()
quote_sender_name = html.unescape(refer.findtext("displayname", "") or "").strip()
if not quote_sender_name:
quote_sender = html.unescape(refer.findtext("chatusr", "") or "").strip()
quote_sender_name = get_sender_name(room_id, quote_sender) if quote_sender else "未知成员"
quote_sender_name = _extract_quote_sender_name(refer, room_id, get_sender_name)
ref_type = int(refer.findtext("type", "0") or 0)
ref_content = html.unescape(refer.findtext("content", "") or "").strip()
quote_type_label = quote_type_label_for(ref_type)
quote_body = build_quote_body(ref_type, ref_content, title)
# 降噪:引用信息没有有效载荷时,不喂给下游上下文,避免污染 LLM 判断
if _is_low_signal_quote(quote_sender_name, quote_body, title, quote_type_label):
return {}
return {
"title": title,
"quote_sender_name": quote_sender_name,
@@ -68,3 +70,57 @@ def build_quote_body(ref_type: int, ref_content: str, title: str) -> str:
if title:
return title[:220].strip()
return ref_content[:220].strip()
def _extract_quote_sender_name(
    refer: ET.Element,
    room_id: str,
    get_sender_name: Callable[[str, str], str],
) -> str:
    """Work out a display name for the sender of the quoted message.

    Lookup order:

    1. A name stored directly on ``refer`` (``displayname`` first, then the
       alternative tags ``fromnickname`` / ``sourcedisplayname`` /
       ``source_displayname``).
    2. A sender identifier on ``refer`` (``chatusr``, ``fromusr``,
       ``sourceusername``, ``source_username``), resolved through
       ``get_sender_name(room_id, identifier)``. If the resolver returns
       nothing usable, the raw identifier is returned instead.
    3. The placeholder ``"未知成员"`` when neither is present.

    Args:
        refer: XML element describing the quoted message.
        room_id: Passed through unchanged as the first resolver argument.
        get_sender_name: Callback mapping ``(room_id, identifier)`` to a name.

    Returns:
        The resolved name, the raw identifier, or the placeholder string.
    """
    # displayname/chatusr are the usual fields; the remaining tags are
    # fallbacks for clients that use fromusr/fromnickname/source* instead.
    name_tags = (
        "displayname",
        "fromnickname",
        "sourcedisplayname",
        "source_displayname",
    )
    id_tags = (
        "chatusr",
        "fromusr",
        "sourceusername",
        "source_username",
    )

    display_name = _first_non_empty(refer, *name_tags)
    if display_name:
        return display_name

    sender_id = _first_non_empty(refer, *id_tags)
    if not sender_id:
        return "未知成员"

    # The resolver may hand back None or blank text; fall back to the raw id.
    looked_up = (get_sender_name(room_id, sender_id) or "").strip()
    return looked_up if looked_up else sender_id
def _first_non_empty(root: ET.Element, *tags: str) -> str:
for tag in tags:
value = html.unescape(root.findtext(tag, "") or "").strip()
if value:
return value
return ""
def _is_low_signal_quote(quote_sender_name: str, quote_body: str, title: str, quote_type_label: str) -> bool:
sender = (quote_sender_name or "").strip()
body = (quote_body or "").strip()
title_text = (title or "").strip()
type_label = (quote_type_label or "").strip()
has_sender = sender and sender != "未知成员"
has_body = bool(body)
has_title = bool(title_text)
# 引用消息[数字] 代表类型未知,若同时没有发送者/正文/标题,则直接丢弃
unknown_type = type_label.startswith("引用消息[")
if unknown_type and (not has_sender) and (not has_body) and (not has_title):
return True
# 普通场景:三者都缺失也丢弃
return (not has_sender) and (not has_body) and (not has_title)