message_summary模板模式改为JSON优先解析并同步Dify提示词

This commit is contained in:
liuwei
2026-04-23 11:24:37 +08:00
parent e87eeba256
commit 4f0b3c041e
3 changed files with 219 additions and 81 deletions

View File

@@ -579,6 +579,180 @@ class MessageSummaryPlugin(MessagePluginInterface):
return text
@classmethod
def _extract_json_object_from_text(cls, raw_text: str) -> Optional[Dict[str, Any]]:
    """Extract a JSON object from free-form text (primarily for template mode).

    Strategies are tried in order and the first one that yields a ``dict`` wins:

    1. The whole (stripped) text is a JSON object.
    2. A Markdown fenced block (three backticks, optionally tagged ``json``)
       contains a JSON object. Every fenced block is tried in document order,
       so a leading non-JSON fence no longer hides a later JSON fence.
    3. A JSON object starts at the first ``{`` in the text. Text following the
       object is ignored, so trailing commentary that happens to contain
       ``}`` does not break extraction.

    Args:
        raw_text: Raw text, typically an LLM reply. ``None`` and other falsy
            values are treated as an empty string.

    Returns:
        The parsed ``dict``, or ``None`` when no JSON object could be
        extracted. Arrays, strings and other JSON values are never returned,
        and parse failures never raise.
    """
    text = str(raw_text or "").strip()
    if not text:
        return None

    def load_dict(candidate: str) -> Optional[Dict[str, Any]]:
        # Parse failures are expected for non-JSON replies and must not
        # interrupt the caller. json.JSONDecodeError is a ValueError;
        # RecursionError covers pathologically deep nesting.
        try:
            parsed = json.loads(candidate)
        except (ValueError, RecursionError):
            return None
        return parsed if isinstance(parsed, dict) else None

    # Case 1: the whole text is a JSON object.
    if text.startswith("{") and text.endswith("}"):
        found = load_dict(text)
        if found is not None:
            return found

    # Case 2: fenced code blocks, tried in order of appearance.
    for fenced_match in re.finditer(r"```(?:json)?\s*([\s\S]*?)\s*```", text, flags=re.IGNORECASE):
        found = load_dict(str(fenced_match.group(1) or "").strip())
        if found is not None:
            return found

    # Case 3: JSON embedded in surrounding prose. Decode starting at the first
    # "{" and let the decoder find where the object ends, instead of guessing
    # the end from the last "}" in the text.
    left = text.find("{")
    if left < 0:
        return None
    try:
        parsed, _end = json.JSONDecoder().raw_decode(text[left:])
    except (ValueError, RecursionError):
        return None
    return parsed if isinstance(parsed, dict) else None
@classmethod
def _normalize_json_text_list(cls, value: Any, limit: int = 6, item_max_len: int = 120) -> List[str]:
    """Normalize a JSON field into a bounded list of non-empty strings.

    Accepted inputs:

    * ``str``: treated as a single-item list.
    * ``list``: each item is converted as follows: strings are used as-is,
      dicts contribute the first truthy value among their ``text`` /
      ``title`` / ``value`` keys, and any other item is stringified (falsy
      items become empty and are dropped).
    * anything else: yields an empty list.

    Every candidate is passed through ``cls._strip_markdown_inline`` (defined
    elsewhere in the class; presumably removes inline Markdown markup) and
    stripped of surrounding whitespace. Empty results are skipped.

    Args:
        value: Raw field value taken from the parsed JSON payload.
        limit: Maximum number of items to return. A non-positive limit yields
            an empty list for every input type.
        item_max_len: Each item is truncated to this many characters.

    Returns:
        At most ``limit`` cleaned strings, in their original order.
    """
    texts: List[str] = []
    # Honour a non-positive limit up front. The list branch below appends
    # before it checks the limit, so without this guard it would return one
    # item while the string branch returned none.
    if limit <= 0:
        return texts

    if isinstance(value, str):
        candidate = cls._strip_markdown_inline(value).strip()
        if candidate:
            texts.append(candidate[:item_max_len])
        return texts

    if not isinstance(value, list):
        return texts

    for item in value:
        if isinstance(item, str):
            raw = item
        elif isinstance(item, dict):
            raw = str(item.get("text") or item.get("title") or item.get("value") or "")
        else:
            raw = str(item or "")
        candidate = cls._strip_markdown_inline(raw).strip()
        if not candidate:
            continue
        texts.append(candidate[:item_max_len])
        if len(texts) >= limit:
            break
    return texts
@classmethod
def _extract_template_json_data(cls, summary_text: str) -> Optional[Dict[str, Any]]:
    """Build template layout data from a JSON-formatted summary.

    The summary is parsed with ``cls._extract_json_object_from_text``. Field
    lookup is deliberately lenient: several alternative key names are accepted
    for most fields so that small prompt changes do not break rendering.

    Args:
        summary_text: Raw summary text that may contain a JSON object.

    Returns:
        ``None`` when no JSON object is found, or when the object carries no
        usable content (no topic cards, no named-module items, no lead and no
        title), so the caller can fall back to another parser. Otherwise a
        dict with these keys:

        * ``document_title``: cleaned title, possibly empty.
        * ``lead``: lead paragraph. Falls back to the first overview point of
          the first topic, then to a fixed placeholder.
        * ``fallback_text``: explicit fallback text, else the lead, else the
          placeholder.
        * ``sections``: one ``{"title", "items"}`` entry per topic card, where
          items are tagged ``bullet`` / ``paragraph`` / ``quote``.
        * ``topic_cards``: at most five topic card dicts.
        * ``named_modules``: lists for ``shared_resources``, ``marketplace``,
          ``unresolved_pool``, ``core_points`` and ``top_contributors``.
    """
    payload = cls._extract_json_object_from_text(summary_text)
    if not payload:
        return None

    def clean_text(source: Dict[str, Any], *keys: str) -> str:
        # First truthy value among `keys`, stringified and cleaned.
        raw: Any = ""
        for key in keys:
            raw = source.get(key)
            if raw:
                break
        return cls._strip_markdown_inline(str(raw or "")).strip()

    title = clean_text(payload, "title", "document_title", "doc_title")
    lead = clean_text(payload, "lead", "summary_lead", "overview")
    fallback_text = clean_text(payload, "fallback_text", "raw_summary")

    # Topic cards.
    topic_cards: List[Dict[str, Any]] = []
    topics = payload.get("topics")
    if isinstance(topics, list):
        for raw_topic in topics:
            if not isinstance(raw_topic, dict):
                continue
            topic_title = cls._clean_topic_title(
                str(raw_topic.get("title") or raw_topic.get("name") or "")
            )
            overview_points = cls._normalize_json_text_list(
                raw_topic.get("overview_points") or raw_topic.get("key_points") or raw_topic.get("highlights"),
                limit=3,
                item_max_len=120,
            )
            analysis_points = cls._normalize_json_text_list(
                raw_topic.get("analysis_points") or raw_topic.get("analysis"),
                limit=2,
                item_max_len=120,
            )
            quote_text = clean_text(raw_topic, "quote_text", "quote")
            time_range = clean_text(raw_topic, "time_range", "time")

            raw_participants = raw_topic.get("participants") or raw_topic.get("participant_count") or ""
            if isinstance(raw_participants, list):
                # Join list values so the card never shows a Python list
                # repr such as "['a', 'b']".
                raw_participants = "、".join(str(person).strip() for person in raw_participants if person)
            participants = cls._strip_markdown_inline(str(raw_participants)).strip()

            # Skip topics that carry no content at all. Otherwise an empty
            # object would become a blank placeholder card and make a
            # content-free payload look valid.
            if not (
                topic_title
                or overview_points
                or analysis_points
                or quote_text
                or time_range
                or participants
            ):
                continue

            topic_cards.append(
                {
                    "title": (topic_title or "未命名话题")[:42],
                    "time_range": time_range[:58],
                    "participants": participants[:42],
                    "overview_points": overview_points,
                    "analysis_points": analysis_points,
                    "quote_text": quote_text[:120],
                }
            )
            if len(topic_cards) >= 5:
                break

    # Named modules.
    named_modules = {
        "shared_resources": cls._normalize_json_text_list(payload.get("shared_resources"), limit=6, item_max_len=110),
        "marketplace": cls._normalize_json_text_list(payload.get("marketplace"), limit=6, item_max_len=110),
        "unresolved_pool": cls._normalize_json_text_list(payload.get("unresolved_pool"), limit=4, item_max_len=110),
        "core_points": cls._normalize_json_text_list(
            payload.get("core_knowledge_points") or payload.get("core_points"), limit=4, item_max_len=110
        ),
        "top_contributors": cls._normalize_json_text_list(payload.get("top_contributors"), limit=3, item_max_len=18),
    }

    # Mirror the topic cards as `sections` so logic that consumes the
    # section structure can be reused on JSON input.
    sections: List[Dict[str, Any]] = []
    for topic in topic_cards:
        items: List[Dict[str, str]] = []
        items.extend({"kind": "bullet", "text": line} for line in topic.get("overview_points", []))
        items.extend({"kind": "paragraph", "text": line} for line in topic.get("analysis_points", []))
        if topic.get("quote_text"):
            items.append({"kind": "quote", "text": topic["quote_text"]})
        sections.append({"title": topic.get("title", "未命名话题"), "items": items})

    # Nothing usable was found: report "no JSON layout" to the caller.
    if not topic_cards and not any(named_modules.values()) and not lead and not title:
        return None

    if not lead and topic_cards:
        lead = (topic_cards[0].get("overview_points") or [""])[0]
    if not fallback_text:
        fallback_text = lead or "暂无总结内容。"

    return {
        "document_title": title,
        "lead": lead or "暂无总结内容。",
        "fallback_text": fallback_text,
        "sections": sections,
        "topic_cards": topic_cards,
        "named_modules": named_modules,
    }
@classmethod
def _build_summary_layout_data(cls, summary_text: str) -> Dict[str, Any]:
"""把 LLM 总结文本重排为模板可直接消费的结构化数据。
@@ -1186,17 +1360,27 @@ class MessageSummaryPlugin(MessagePluginInterface):
# 1. 不再把 LLM 原文直接转 HTML 内嵌到模板;
# 2. 先结构化解析文本,再由模板按组件渲染,稳定控制最终排版。
renderer = HtmlTemplateRenderer()
layout_data = self._build_summary_layout_data(summary_text)
# 解析策略:
# 1. template 模式优先吃 JSON(稳定、可控、低漂移);
# 2. JSON 不可用时再回退 Markdown 结构解析,保持兼容。
json_layout_data = self._extract_template_json_data(summary_text)
layout_data = json_layout_data or self._build_summary_layout_data(summary_text)
metrics_data = self._build_summary_template_metrics(
message_stats=message_stats,
layout_data=layout_data,
metadata=metadata,
)
sections = layout_data.get("sections", []) or []
topic_cards = self._build_topic_cards_from_sections(sections, limit=5)
if json_layout_data and json_layout_data.get("topic_cards"):
topic_cards = json_layout_data.get("topic_cards", [])[:5]
else:
topic_cards = self._build_topic_cards_from_sections(sections, limit=5)
topic_titles = [card.get("title", "") for card in topic_cards]
auxiliary_sections = self._build_auxiliary_sections(sections, topic_titles)
named_modules = self._build_template_named_modules(sections)
if json_layout_data and isinstance(json_layout_data.get("named_modules"), dict):
named_modules = json_layout_data.get("named_modules", {})
else:
named_modules = self._build_template_named_modules(sections)
resource_hub_items = self._build_resource_hub_items(named_modules.get("shared_resources", []))
# 说明:
# 1. 这里注入“本地字体 CSS”到模板,避免依赖 Google Fonts 等外网资源;