message_summary模板模式改为JSON优先解析并同步Dify提示词

2026-04-23 11:24:37 +08:00
parent e87eeba256
commit 4f0b3c041e
3 changed files with 219 additions and 81 deletions
--- a/plugins/message_summary/main.py
+++ b/plugins/message_summary/main.py
@@ -579,6 +579,180 @@ class MessageSummaryPlugin(MessagePluginInterface):

        return text

+    @classmethod
+    def _extract_json_object_from_text(cls, raw_text: str) -> Optional[Dict[str, Any]]:
+        """Return the first JSON object (dict) found in ``raw_text``, or ``None``.
+
+        Tries, in order: the whole text as JSON, each ``` fenced block, then an
+        object embedded in surrounding prose. Only dicts are returned so arrays
+        or scalars never reach template rendering; failures yield ``None`` (no
+        exception), letting the caller fall back to Markdown parsing.
+        """
+        text = str(raw_text or "").strip()
+        if not text:
+            return None
+
+        def _loads_dict(candidate: str) -> Optional[Dict[str, Any]]:
+            try:
+                parsed = json.loads(candidate)
+            except (ValueError, RecursionError):
+                return None
+            return parsed if isinstance(parsed, dict) else None
+
+        # Case 1: the whole text is a JSON object.
+        if text.startswith("{") and text.endswith("}"):
+            whole = _loads_dict(text)
+            if whole is not None:
+                return whole
+
+        # Case 2: fenced code blocks; try each one, not only the first.
+        for fenced in re.finditer(r"```(?:json)?\s*([\s\S]*?)\s*```", text, flags=re.IGNORECASE):
+            block = _loads_dict(fenced.group(1))
+            if block is not None:
+                return block
+
+        # Case 3: JSON in prose. Decode from each "{" (which can only yield an
+        # object) so stray braces cannot spoil the candidate; after a failure,
+        # resume past the error so parts of a truncated object are not returned.
+        decoder = json.JSONDecoder()
+        start = text.find("{")
+        while start >= 0:
+            try:
+                return decoder.raw_decode(text, start)[0]
+            except (ValueError, RecursionError) as exc:
+                start = text.find("{", max(getattr(exc, "pos", start), start + 1))
+        return None
+
+    @classmethod
+    def _normalize_json_text_list(cls, value: Any, limit: int = 6, item_max_len: int = 120) -> List[str]:
+        """Normalize a JSON field value into a list of strings."""
+        # Design notes:
+        # 1. Tolerates messy input: a bare string, a list, or a list mixing strings, dicts and other values;
+        # 2. Every entry goes through _strip_markdown_inline so Markdown styling noise stays out of the template;
+        # 3. Item length and item count are capped so a single long entry cannot break the card layout.
+        texts: List[str] = []
+        if isinstance(value, str):
+            candidate = cls._strip_markdown_inline(value).strip()
+            if candidate:
+                texts.append(candidate[:item_max_len])
+            return texts[:limit]
+        if not isinstance(value, list):
+            return texts  # neither str nor list: nothing usable, return the empty list
+
+        for item in value:
+            if isinstance(item, str):
+                candidate = cls._strip_markdown_inline(item).strip()
+            elif isinstance(item, dict):  # dict entry: first truthy of "text", "title", "value"
+                candidate = cls._strip_markdown_inline(
+                    str(item.get("text") or item.get("title") or item.get("value") or "")
+                ).strip()
+            else:  # any other type is stringified; falsy values become ""
+                candidate = cls._strip_markdown_inline(str(item or "")).strip()
+            if not candidate:
+                continue
+            texts.append(candidate[:item_max_len])
+            if len(texts) >= limit:
+                break
+        return texts
+
+    @classmethod
+    def _extract_template_json_data(cls, summary_text: str) -> Optional[Dict[str, Any]]:
+        """Extract the JSON structure used by template mode; return None when no usable JSON is found."""
+        # Notes:
+        # 1. Lets the LLM emit JSON in a fixed schema, which renders far more stably than re-parsing Markdown;
+        # 2. Field names are matched leniently (several aliases per field) so small prompt tweaks do not break production;
+        # 3. A result is returned only when valid JSON with at least one key field is found, to avoid false positives.
+        payload = cls._extract_json_object_from_text(summary_text)
+        if not payload:  # None or an empty dict
+            return None
+
+        title = cls._strip_markdown_inline(
+            str(payload.get("title") or payload.get("document_title") or payload.get("doc_title") or "")
+        ).strip()
+        lead = cls._strip_markdown_inline(
+            str(payload.get("lead") or payload.get("summary_lead") or payload.get("overview") or "")
+        ).strip()
+        fallback_text = cls._strip_markdown_inline(
+            str(payload.get("fallback_text") or payload.get("raw_summary") or "")
+        ).strip()
+
+        # Parse topic cards.
+        topic_cards: List[Dict[str, Any]] = []
+        topics = payload.get("topics")
+        if isinstance(topics, list):
+            for raw_topic in topics:
+                if not isinstance(raw_topic, dict):
+                    continue
+                topic_title = cls._clean_topic_title(str(raw_topic.get("title") or raw_topic.get("name") or ""))
+                if not topic_title:
+                    topic_title = "未命名话题"
+                overview_points = cls._normalize_json_text_list(
+                    raw_topic.get("overview_points") or raw_topic.get("key_points") or raw_topic.get("highlights"),
+                    limit=3,
+                    item_max_len=120,
+                )
+                analysis_points = cls._normalize_json_text_list(
+                    raw_topic.get("analysis_points") or raw_topic.get("analysis"),
+                    limit=2,
+                    item_max_len=120,
+                )
+                quote_text = cls._strip_markdown_inline(str(raw_topic.get("quote_text") or raw_topic.get("quote") or "")).strip()
+                time_range = cls._strip_markdown_inline(str(raw_topic.get("time_range") or raw_topic.get("time") or "")).strip()
+                participants = cls._strip_markdown_inline(
+                    str(raw_topic.get("participants") or raw_topic.get("participant_count") or "")
+                ).strip()
+
+                topic_cards.append(
+                    {
+                        "title": topic_title[:42],
+                        "time_range": time_range[:58],
+                        "participants": participants[:42],
+                        "overview_points": overview_points,
+                        "analysis_points": analysis_points,
+                        "quote_text": quote_text[:120],
+                    }
+                )
+                if len(topic_cards) >= 5:  # keep at most five topic cards
+                    break
+
+        # Parse named modules.
+        named_modules = {
+            "shared_resources": cls._normalize_json_text_list(payload.get("shared_resources"), limit=6, item_max_len=110),
+            "marketplace": cls._normalize_json_text_list(payload.get("marketplace"), limit=6, item_max_len=110),
+            "unresolved_pool": cls._normalize_json_text_list(payload.get("unresolved_pool"), limit=4, item_max_len=110),
+            "core_points": cls._normalize_json_text_list(payload.get("core_knowledge_points") or payload.get("core_points"), limit=4, item_max_len=110),
+            "top_contributors": cls._normalize_json_text_list(payload.get("top_contributors"), limit=3, item_max_len=18),
+        }
+
+        # Build sections so the existing metrics-extraction logic can be reused.
+        sections: List[Dict[str, Any]] = []
+        for topic in topic_cards:
+            items: List[Dict[str, str]] = []
+            for line in topic.get("overview_points", []):
+                items.append({"kind": "bullet", "text": line})
+            for line in topic.get("analysis_points", []):
+                items.append({"kind": "paragraph", "text": line})
+            if topic.get("quote_text"):
+                items.append({"kind": "quote", "text": topic["quote_text"]})
+            sections.append({"title": topic.get("title", "未命名话题"), "items": items})
+
+        if not topic_cards and not any(named_modules.values()) and not lead and not title:
+            return None  # nothing renderable was extracted; caller falls back to Markdown parsing
+
+        if not lead and topic_cards:  # no lead given: borrow the first topic's first overview point
+            lead = (topic_cards[0].get("overview_points") or [""])[0]
+        if not fallback_text:
+            fallback_text = lead or "暂无总结内容。"
+
+        return {
+            "document_title": title,
+            "lead": lead or "暂无总结内容。",
+            "fallback_text": fallback_text,
+            "sections": sections,
+            "topic_cards": topic_cards,
+            "named_modules": named_modules,
+        }
+
    @classmethod
    def _build_summary_layout_data(cls, summary_text: str) -> Dict[str, Any]:
        """把 LLM 总结文本重排为模板可直接消费的结构化数据。
@@ -1186,17 +1360,27 @@ class MessageSummaryPlugin(MessagePluginInterface):
        # 1. 不再把 LLM 原文直接转 HTML 内嵌到模板；
        # 2. 先结构化解析文本，再由模板按组件渲染，稳定控制最终排版。
        renderer = HtmlTemplateRenderer()
-        layout_data = self._build_summary_layout_data(summary_text)
+        # 解析策略：
+        # 1. template 模式优先吃 JSON（稳定、可控、低漂移）；
+        # 2. JSON 不可用时再回退 Markdown 结构解析，保持兼容。
+        json_layout_data = self._extract_template_json_data(summary_text)
+        layout_data = json_layout_data or self._build_summary_layout_data(summary_text)
        metrics_data = self._build_summary_template_metrics(
            message_stats=message_stats,
            layout_data=layout_data,
            metadata=metadata,
        )
        sections = layout_data.get("sections", []) or []
-        topic_cards = self._build_topic_cards_from_sections(sections, limit=5)
+        if json_layout_data and json_layout_data.get("topic_cards"):
+            topic_cards = json_layout_data.get("topic_cards", [])[:5]
+        else:
+            topic_cards = self._build_topic_cards_from_sections(sections, limit=5)
        topic_titles = [card.get("title", "") for card in topic_cards]
        auxiliary_sections = self._build_auxiliary_sections(sections, topic_titles)
-        named_modules = self._build_template_named_modules(sections)
+        if json_layout_data and isinstance(json_layout_data.get("named_modules"), dict):
+            named_modules = json_layout_data.get("named_modules", {})
+        else:
+            named_modules = self._build_template_named_modules(sections)
        resource_hub_items = self._build_resource_hub_items(named_modules.get("shared_resources", []))
        # 说明：
        # 1. 这里注入“本地字体 CSS”到模板，避免依赖 Google Fonts 等外网资源；
--- a/plugins/message_summary/群总结AI
+++ b/plugins/message_summary/群总结AI
@@ -322,87 +322,41 @@ workflow:
        - id: template_system_prompt
          role: system
          text: |
-            你是一名「微信群总结结构化编辑官」，你的输出将用于 Gemini 风格总结卡片渲染。
+            你是一名「微信群总结结构化编辑官」，输出将直接用于总结卡片模板渲染。

-            目标：
-            1. 让内容结构尽量贴合 gemini-code 模板模块；
-            2. 信息密度高，但保持短句、可扫描；
-            3. 不做空泛抒情，不要写冗长大段落。
+            核心要求：
+            1. 只输出 JSON 对象，不要输出 Markdown，不要输出解释文本；
+            2. 不要使用 ```json 代码块包裹；
+            3. 必须恰好输出 5 个话题（不足时从聊天中归并补齐，超出时合并相近话题）；
+            4. 不要翻译昵称，不要改写 @昵称；
+            5. 字段缺失时用空字符串或空数组，禁止省略关键字段。

-            必须遵守：
-            1. 必须覆盖至少 5 个话题（缺少时也要从聊天里归并凑足 5 个主题）；
-            2. 不要翻译用户昵称，不要改写 @昵称；
-            3. 输出必须是纯 Markdown，不要 JSON，不要 ``` 代码块；
-            4. 每个话题都要包含：
-               - 时段
-               - 参与人数
-               - 核心观点回顾（2-3条）
-               - 客观分析（1-2条）
-               - 亮点瞬间（1条）
-            5. 每条 bullet 尽量不超过 40 字；
-            6. 结论用“可执行建议”表达，不要空话。
+            输出 JSON 结构示例（字段名必须一致，字段值按说明替换为实际内容）：
+            {
+              "title": "字符串，整篇标题",
+              "lead": "字符串，2-3句导语",
+              "topics": [
+                {
+                  "title": "话题标题",
+                  "time_range": "时段，如 09:20-10:10",
+                  "participants": "参与人数，如 18人",
+                  "overview_points": ["核心观点1", "核心观点2", "核心观点3"],
+                  "analysis_points": ["客观分析1", "客观分析2"],
+                  "quote_text": "亮点金句或高光总结"
+                }
+              ],
+              "shared_resources": ["资源项1", "资源项2"],
+              "marketplace": ["交易项1", "交易项2"],
+              "unresolved_pool": ["待解问题1", "待解问题2"],
+              "core_knowledge_points": ["知识点1", "知识点2"],
+              "top_contributors": ["昵称A", "昵称B", "昵称C"]
+            }

-            输出格式请严格按以下骨架：
-
-            # 🌟「[群名] - [最新日期] 总结」🌟
-
-            ## ⚡ 一分钟速览
-            - 今日消息数：[总数]
-            - 最热时段：[时段]
-
-            ### 🏆 核心话题与结论
-            1. [话题1简述]：[一句结论]
-            2. [话题2简述]：[一句结论]
-            3. [话题3简述]：[一句结论]
-            4. [话题4简述]：[一句结论]
-            5. [话题5简述]：[一句结论]
-
-            ### 💰 交易/资源快报
-            - [@用户A] [卖货/资源]
-            - [@用户B] [卖货/资源]
-
-            ### 📌 今日总结
-            [1-2句结论 + 后续建议]
-
-            ## 🌌 话题详情
-            ### 1️⃣ 【[话题1]】
-            - **时段**：[开始]-[结束]
-            - **参与人数**：[人数]
-            #### 🔍 核心观点回顾
-            - [观点1]
-            - [观点2]
-            #### 🧩 客观分析
-            - [共识/分歧/价值]
-            #### 🔥 亮点瞬间
-            > [金句或高光总结]
-
-            （继续话题2~5，保持同结构）
-
-            ## 🔗 Shared Resources
-            - [仓库/文档/工具链接 + 一句说明]
-            - [仓库/文档/工具链接 + 一句说明]
-
-            ## 🛒 Marketplace
-            - [出/求] [标的]：[价格或状态]
-            - [出/求] [标的]：[价格或状态]
-
-            ## ❓ Unresolved Pool
-            - [待解问题1]
-            - [待解问题2]
-
-            ## 🧠 Core Knowledge Points
-            - [关键配置/经验1]
-            - [关键配置/经验2]
-
-            ## 🎖️ 今日荣誉榜
-            ### 🏆 群聊 MVP：[@用户N]
-            - 理由1
-            - 理由2
-
-            ## 👥 Top Contributors
-            - [昵称A]
-            - [昵称B]
-            - [昵称C]
+            生成约束：
+            1. topics 数组长度必须为 5（不足请归并补齐）；
+            2. overview_points 每个话题 2-3 条，analysis_points 每个话题 1-2 条；
+            3. 每条文本尽量 <= 40 字，短句可扫描；
+            4. 结论要具体，不要空泛。
        - id: template_user_prompt
          role: user
          text: '{{#1775526517808.query#}}'
--- a/temp/md2image/summary_demo_render_latest.png
+++ b/temp/md2image/summary_demo_render_latest.png