完善系统健康面板并接入AI运行态观测

2026-04-30 15:12:47 +08:00
parent 83910b287b
commit 4ddab01b8d
4 changed files with 206 additions and 9 deletions
--- a/admin/dashboard/blueprints/system.py
+++ b/admin/dashboard/blueprints/system.py
@@ -15,6 +15,7 @@ import toml
 from utils.markdown_to_image import get_md2img_health_snapshot, warmup_md2img_browser_sync
 from utils.ai.llm_registry import LLMRegistry
 from base.plugin_common.plugin_interface import PluginStatus
 from utils.ai.unified_llm import UnifiedLLMClient
 # 创建系统信息蓝图
 system_bp = Blueprint('system', __name__)
@@ -403,6 +404,33 @@ def api_system_health_summary():
        # 错误数量直接复用现有统计库，避免为了首页卡片再单独写一套 SQL。
        _, recent_error_count = server.stats_db.get_error_logs(days=1, page=1, limit=1)
        # 基础设施健康：
        # 1. MySQL 用最轻量的 SELECT 1 做可用性探测；
        # 2. Redis 用 PING 验证连接池当前是否可拿到可用连接；
        # 3. 即使探测失败也只反馈到看板，不影响主接口整体返回。
        mysql_status = "healthy"
        mysql_summary = "连接正常"
        try:
            mysql_conn = server.db_manager.get_mysql_connection()
            try:
                with mysql_conn.cursor() as cursor:
                    cursor.execute("SELECT 1")
                    cursor.fetchone()
            finally:
                mysql_conn.close()
        except Exception as mysql_error:
            mysql_status = "danger"
            mysql_summary = f"MySQL 探测失败: {mysql_error}"
        redis_status = "healthy"
        redis_summary = "连接正常"
        try:
            redis_conn = server.db_manager.get_redis_connection()
            redis_conn.ping()
        except Exception as redis_error:
            redis_status = "danger"
            redis_summary = f"Redis 探测失败: {redis_error}"
        # md2img 健康快照已经有现成实现，这里只做聚合，不主动预热运行时。
        md2img_snapshot = get_md2img_health_snapshot(ensure_runtime=False) or {}
        browser_ready = bool(
@@ -452,6 +480,28 @@ def api_system_health_summary():
            md2img_status = "danger"
            md2img_summary = "运行时未就绪，相关转图能力可能不可用"
        # AI 运行态：
        # 1. 统一从 UnifiedLLMClient 最近调用窗口读取，避免各插件单独维护监控数据；
        # 2. 若当前窗口还没有调用记录，就明确返回“暂无调用”，避免误判成异常。
        ai_runtime = UnifiedLLMClient.get_runtime_snapshot()
        ai_total_calls = int(ai_runtime.get("total_calls") or 0)
        ai_failed_calls = int(ai_runtime.get("failed_calls") or 0)
        if ai_total_calls <= 0:
            ai_status = "warning"
            ai_summary = "最近窗口内暂无统一 LLM 调用记录"
        elif ai_failed_calls > 0:
            ai_status = "warning"
            ai_summary = (
                f"最近 {ai_total_calls} 次调用中失败 {ai_failed_calls} 次，"
                f"平均耗时 {ai_runtime.get('avg_latency_ms', 0)}ms"
            )
        else:
            ai_status = "healthy"
            ai_summary = (
                f"最近 {ai_total_calls} 次调用全部成功，"
                f"平均耗时 {ai_runtime.get('avg_latency_ms', 0)}ms"
            )
        return jsonify({
            "success": True,
            "data": {
@@ -473,6 +523,27 @@ def api_system_health_summary():
                    "recent_24h_count": recent_error_count,
                    "summary": error_summary,
                },
                "infrastructure": {
                    "status": "healthy" if mysql_status == "healthy" and redis_status == "healthy" else "danger",
                    "summary": (
                        "MySQL / Redis 均正常"
                        if mysql_status == "healthy" and redis_status == "healthy"
                        else "存在基础设施连接异常"
                    ),
                    "mysql": {
                        "status": mysql_status,
                        "summary": mysql_summary,
                    },
                    "redis": {
                        "status": redis_status,
                        "summary": redis_summary,
                    },
                },
                "ai_runtime": {
                    "status": ai_status,
                    "summary": ai_summary,
                    **ai_runtime,
                },
                "md2img": {
                    "status": md2img_status,
                    "healthy": md2img_healthy,
--- a/admin/dashboard/templates/index.html
+++ b/admin/dashboard/templates/index.html
@@ -355,6 +355,26 @@
                        recent_24h_count: 0,
                        summary: '加载中...'
                    },
                    infrastructure: {
                        status: 'warning',
                        summary: '加载中...',
                        mysql: {
                            status: 'warning',
                            summary: '加载中...'
                        },
                        redis: {
                            status: 'warning',
                            summary: '加载中...'
                        }
                    },
                    ai_runtime: {
                        status: 'warning',
                        total_calls: 0,
                        failed_calls: 0,
                        avg_latency_ms: 0,
                        summary: '加载中...',
                        last_call: {}
                    },
                    md2img: {
                        status: 'warning',
                        healthy: false,
@@ -401,6 +421,8 @@
                const robot = this.healthSummary.robot || {};
                const plugins = this.healthSummary.plugins || {};
                const errors = this.healthSummary.errors || {};
                const infrastructure = this.healthSummary.infrastructure || {};
                const aiRuntime = this.healthSummary.ai_runtime || {};
                const md2img = this.healthSummary.md2img || {};
                return [
                    {
@@ -427,6 +449,22 @@
                        summary: errors.summary || '暂无状态',
                        extra: '统计窗口：近 24 小时'
                    },
                    {
                        key: 'infrastructure',
                        title: '基础设施',
                        status: infrastructure.status || 'warning',
                        value: infrastructure.status === 'healthy' ? '正常' : '异常',
                        summary: infrastructure.summary || '暂无状态',
                        extra: `MySQL：${((infrastructure.mysql || {}).status === 'healthy') ? '正常' : '异常'} / Redis：${((infrastructure.redis || {}).status === 'healthy') ? '正常' : '异常'}`
                    },
                    {
                        key: 'ai_runtime',
                        title: 'AI 运行态',
                        status: aiRuntime.status || 'warning',
                        value: `${aiRuntime.avg_latency_ms || 0} ms`,
                        summary: aiRuntime.summary || '暂无状态',
                        extra: `最近调用 ${aiRuntime.total_calls || 0} 次，失败 ${aiRuntime.failed_calls || 0} 次`
                    },
                    {
                        key: 'md2img',
                        title: 'Markdown 转图',
@@ -978,7 +1016,7 @@
    .health-grid {
        display: grid;
-        grid-template-columns: repeat(4, minmax(0, 1fr));
+        grid-template-columns: repeat(3, minmax(0, 1fr));
        gap: 16px;
        margin-top: 18px;
    }
--- a/docs/工程优化与Feature清单.md
+++ b/docs/工程优化与Feature清单.md
@@ -18,6 +18,7 @@
 - 已将插件调用统计改为主链路直接埋点，降低维护复杂度
 - 已在消息主链路接入 `trace_id`，用于串联消息处理、插件统计与异常日志
 - 已在后台首页补充“系统健康快照”，可集中查看机器人连接、插件运行、近 24 小时异常与 md2img 运行状态
 - 已补充 MySQL / Redis 连接探测与统一 LLM 最近调用快照，基础设施与 AI 运行态可直接在首页查看
 ## 2. 项目现状判断
@@ -314,6 +315,7 @@
 当前进展：
 - 第一阶段已完成：首页已增加系统健康快照，可快速查看核心运行状态
 - 第二阶段已完成：已补充基础设施连通性与 AI 最近调用耗时/成功率快照
 - 后续可继续补充更细粒度的吞吐、延迟、存储连接与 AI 调用链指标
 建议内容：
--- a/utils/ai/unified_llm.py
+++ b/utils/ai/unified_llm.py
@@ -5,6 +5,8 @@ import binascii
 import json
 import mimetypes
 import time
 from collections import deque
 from threading import Lock
 from typing import Any, Dict, List, Optional, Tuple
 from urllib.parse import urlparse
@@ -18,6 +20,13 @@ from utils.ai.llm_registry import LLMRegistry
 class UnifiedLLMClient:
    """统一的 LLM 调用客户端，兼容 OpenAI-compatible 与 Dify。"""
    # Runtime observability snapshot:
    # 1. Only a short window of the most recent calls is kept (bounded deque),
    #    so the buffer cannot grow without limit;
    # 2. It lives on the unified client class, so every plugin that reuses this
    #    client is covered without extra work;
    # 3. Entries are lightweight health indicators for the ops dashboard, not
    #    business-level detail.
    _runtime_metrics = deque(maxlen=50)
    # Held while appending to, or copying from, _runtime_metrics.
    _runtime_lock = Lock()
    def __init__(self, config: Optional[Dict[str, Any]] = None):
        self.LOG = logger
        self.raw_config = config or {}
@@ -41,6 +50,62 @@ class UnifiedLLMClient:
        self.default_system_prompt = str(self.config.get("system_prompt", "")).strip()
        self.last_error = ""
    @classmethod
    def _record_runtime_metric(
        cls,
        *,
        provider: str,
        backend: str,
        scene: str,
        model: str,
        success: bool,
        latency_ms: float,
        error: str = "",
    ) -> None:
        """记录最近一次 LLM 调用结果，供后台健康面板聚合展示。"""
        with cls._runtime_lock:
            cls._runtime_metrics.append({
                "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
                "provider": str(provider or "").strip(),
                "backend": str(backend or "").strip(),
                "scene": str(scene or "").strip(),
                "model": str(model or "").strip(),
                "success": bool(success),
                "latency_ms": round(float(latency_ms or 0.0), 2),
                "error": str(error or "").strip()[:300],
            })
    @classmethod
    def get_runtime_snapshot(cls) -> Dict[str, Any]:
        """返回最近调用窗口的聚合快照，供后台可观测性接口直接复用。"""
        with cls._runtime_lock:
            rows = list(cls._runtime_metrics)
        total_calls = len(rows)
        success_calls = sum(1 for item in rows if item.get("success"))
        failed_calls = total_calls - success_calls
        avg_latency_ms = round(
            sum(float(item.get("latency_ms") or 0.0) for item in rows) / total_calls,
            2
        ) if total_calls else 0.0
        last_call = rows[-1] if rows else {}
        last_error = ""
        for item in reversed(rows):
            if not item.get("success") and item.get("error"):
                last_error = str(item.get("error") or "").strip()
                break
        return {
            "window_size": cls._runtime_metrics.maxlen,
            "total_calls": total_calls,
            "success_calls": success_calls,
            "failed_calls": failed_calls,
            "success_rate": round((success_calls / total_calls) * 100, 2) if total_calls else 0.0,
            "avg_latency_ms": avg_latency_ms,
            "last_call": last_call,
            "last_error": last_error,
        }
    def is_available(self) -> bool:
        if not self.enabled:
            return False
@@ -168,29 +233,50 @@ class UnifiedLLMClient:
        image_urls: Optional[List[str]] = None,
        files: Optional[List[Dict[str, Any]]] = None,
    ) -> Optional[Dict[str, Any]]:
        started_at = time.monotonic()
        self.last_error = ""
        result: Optional[Dict[str, Any]] = None
        if not self.is_available():
            self.last_error = "client_unavailable"
-            return None
+        elif self.provider == "dify":
-
+            result = self._generate_dify(
        if self.provider == "dify":
            return self._generate_dify(
                prompt=prompt,
                user=user,
                inputs=inputs or {},
                tag=tag,
                files=files or [],
            )
-        if self.provider == "openai_compatible":
+        elif self.provider == "openai_compatible":
-            return self._generate_openai(
+            result = self._generate_openai(
                system_prompt=system_prompt,
                user_prompt=user_prompt or prompt,
                user=user,
                image_urls=image_urls or [],
            )
        else:
            self.last_error = f"unsupported_provider:{self.provider}"
-        self.last_error = f"unsupported_provider:{self.provider}"
+        # 统一在出口记录运行时快照，避免每种 provider 都重复埋点逻辑。
-        return None
+        usage = (result or {}).get("usage", {}) if isinstance(result, dict) else {}
        latency_ms = 0.0
        if isinstance(usage, dict) and usage.get("latency") not in (None, ""):
            try:
                latency_ms = float(usage.get("latency")) * 1000
            except Exception:
                latency_ms = 0.0
        if latency_ms <= 0:
            latency_ms = (time.monotonic() - started_at) * 1000
        self._record_runtime_metric(
            provider=self.provider,
            backend=str(self.config.get("backend", "") or ""),
            scene=str(self.config.get("scene", "") or ""),
            model=self.model or str(self.mode or ""),
            success=bool(result and result.get("text")),
            latency_ms=latency_ms,
            error=self.last_error,
        )
        return result
    def _generate_openai(
        self,