diff --git a/admin/dashboard/blueprints/system.py b/admin/dashboard/blueprints/system.py index 235c59d..37db1ff 100644 --- a/admin/dashboard/blueprints/system.py +++ b/admin/dashboard/blueprints/system.py @@ -15,6 +15,7 @@ import toml from utils.markdown_to_image import get_md2img_health_snapshot, warmup_md2img_browser_sync from utils.ai.llm_registry import LLMRegistry from base.plugin_common.plugin_interface import PluginStatus +from utils.ai.unified_llm import UnifiedLLMClient # 创建系统信息蓝图 system_bp = Blueprint('system', __name__) @@ -403,6 +404,33 @@ def api_system_health_summary(): # 错误数量直接复用现有统计库,避免为了首页卡片再单独写一套 SQL。 _, recent_error_count = server.stats_db.get_error_logs(days=1, page=1, limit=1) + # 基础设施健康: + # 1. MySQL 用最轻量的 SELECT 1 做可用性探测; + # 2. Redis 用 PING 验证连接池当前是否可拿到可用连接; + # 3. 即使探测失败也只反馈到看板,不影响主接口整体返回。 + mysql_status = "healthy" + mysql_summary = "连接正常" + try: + mysql_conn = server.db_manager.get_mysql_connection() + try: + with mysql_conn.cursor() as cursor: + cursor.execute("SELECT 1") + cursor.fetchone() + finally: + mysql_conn.close() + except Exception as mysql_error: + mysql_status = "danger" + mysql_summary = f"MySQL 探测失败: {mysql_error}" + + redis_status = "healthy" + redis_summary = "连接正常" + try: + redis_conn = server.db_manager.get_redis_connection() + redis_conn.ping() + except Exception as redis_error: + redis_status = "danger" + redis_summary = f"Redis 探测失败: {redis_error}" + # md2img 健康快照已经有现成实现,这里只做聚合,不主动预热运行时。 md2img_snapshot = get_md2img_health_snapshot(ensure_runtime=False) or {} browser_ready = bool( @@ -452,6 +480,28 @@ def api_system_health_summary(): md2img_status = "danger" md2img_summary = "运行时未就绪,相关转图能力可能不可用" + # AI 运行态: + # 1. 统一从 UnifiedLLMClient 最近调用窗口读取,避免各插件单独维护监控数据; + # 2. 
若当前窗口还没有调用记录,就明确返回“暂无调用”,避免误判成异常。 + ai_runtime = UnifiedLLMClient.get_runtime_snapshot() + ai_total_calls = int(ai_runtime.get("total_calls") or 0) + ai_failed_calls = int(ai_runtime.get("failed_calls") or 0) + if ai_total_calls <= 0: + ai_status = "warning" + ai_summary = "最近窗口内暂无统一 LLM 调用记录" + elif ai_failed_calls > 0: + ai_status = "warning" + ai_summary = ( + f"最近 {ai_total_calls} 次调用中失败 {ai_failed_calls} 次," + f"平均耗时 {ai_runtime.get('avg_latency_ms', 0)}ms" + ) + else: + ai_status = "healthy" + ai_summary = ( + f"最近 {ai_total_calls} 次调用全部成功," + f"平均耗时 {ai_runtime.get('avg_latency_ms', 0)}ms" + ) + return jsonify({ "success": True, "data": { @@ -473,6 +523,27 @@ def api_system_health_summary(): "recent_24h_count": recent_error_count, "summary": error_summary, }, + "infrastructure": { + "status": "healthy" if mysql_status == "healthy" and redis_status == "healthy" else "danger", + "summary": ( + "MySQL / Redis 均正常" + if mysql_status == "healthy" and redis_status == "healthy" + else "存在基础设施连接异常" + ), + "mysql": { + "status": mysql_status, + "summary": mysql_summary, + }, + "redis": { + "status": redis_status, + "summary": redis_summary, + }, + }, + "ai_runtime": { + "status": ai_status, + "summary": ai_summary, + **ai_runtime, + }, "md2img": { "status": md2img_status, "healthy": md2img_healthy, diff --git a/admin/dashboard/templates/index.html b/admin/dashboard/templates/index.html index 23f6758..d14657d 100644 --- a/admin/dashboard/templates/index.html +++ b/admin/dashboard/templates/index.html @@ -355,6 +355,26 @@ recent_24h_count: 0, summary: '加载中...' }, + infrastructure: { + status: 'warning', + summary: '加载中...', + mysql: { + status: 'warning', + summary: '加载中...' + }, + redis: { + status: 'warning', + summary: '加载中...' 
+ } + }, + ai_runtime: { + status: 'warning', + total_calls: 0, + failed_calls: 0, + avg_latency_ms: 0, + summary: '加载中...', + last_call: {} + }, md2img: { status: 'warning', healthy: false, @@ -401,6 +421,8 @@ const robot = this.healthSummary.robot || {}; const plugins = this.healthSummary.plugins || {}; const errors = this.healthSummary.errors || {}; + const infrastructure = this.healthSummary.infrastructure || {}; + const aiRuntime = this.healthSummary.ai_runtime || {}; const md2img = this.healthSummary.md2img || {}; return [ { @@ -427,6 +449,22 @@ summary: errors.summary || '暂无状态', extra: '统计窗口:近 24 小时' }, + { + key: 'infrastructure', + title: '基础设施', + status: infrastructure.status || 'warning', + value: infrastructure.status === 'healthy' ? '正常' : '异常', + summary: infrastructure.summary || '暂无状态', + extra: `MySQL:${((infrastructure.mysql || {}).status === 'healthy') ? '正常' : '异常'} / Redis:${((infrastructure.redis || {}).status === 'healthy') ? '正常' : '异常'}` + }, + { + key: 'ai_runtime', + title: 'AI 运行态', + status: aiRuntime.status || 'warning', + value: `${aiRuntime.avg_latency_ms || 0} ms`, + summary: aiRuntime.summary || '暂无状态', + extra: `最近调用 ${aiRuntime.total_calls || 0} 次,失败 ${aiRuntime.failed_calls || 0} 次` + }, { key: 'md2img', title: 'Markdown 转图', @@ -978,7 +1016,7 @@ .health-grid { display: grid; - grid-template-columns: repeat(4, minmax(0, 1fr)); + grid-template-columns: repeat(3, minmax(0, 1fr)); gap: 16px; margin-top: 18px; } diff --git a/docs/工程优化与Feature清单.md b/docs/工程优化与Feature清单.md index ca38830..ba5f105 100644 --- a/docs/工程优化与Feature清单.md +++ b/docs/工程优化与Feature清单.md @@ -18,6 +18,7 @@ - 已将插件调用统计改为主链路直接埋点,降低维护复杂度 - 已在消息主链路接入 `trace_id`,用于串联消息处理、插件统计与异常日志 - 已在后台首页补充“系统健康快照”,可集中查看机器人连接、插件运行、近 24 小时异常与 md2img 运行状态 +- 已补充 MySQL / Redis 连接探测与统一 LLM 最近调用快照,基础设施与 AI 运行态可直接在首页查看 ## 2. 
# Runtime-observability helpers of UnifiedLLMClient (class-level methods).
# Both operate on the class attributes ``_runtime_metrics`` (a bounded deque)
# and ``_runtime_lock`` (a threading.Lock guarding that deque).

@classmethod
def _record_runtime_metric(
    cls,
    *,
    provider: str,
    backend: str,
    scene: str,
    model: str,
    success: bool,
    latency_ms: float,
    error: str = "",
) -> None:
    """Append the outcome of one LLM call to the shared rolling window.

    Args:
        provider: Provider identifier of the client that made the call.
        backend: Backend name taken from the client configuration.
        scene: Scene name taken from the client configuration.
        model: Model (or mode) name used for the call.
        success: Whether the call produced a usable result.
        latency_ms: Call latency in milliseconds; stored rounded to 2 digits.
        error: Error marker of a failed call; stored truncated to 300 chars.
    """
    # Build the row before taking the lock so the critical section is only
    # the deque append itself.
    entry = {
        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
        "provider": str(provider or "").strip(),
        "backend": str(backend or "").strip(),
        "scene": str(scene or "").strip(),
        "model": str(model or "").strip(),
        "success": bool(success),
        "latency_ms": round(float(latency_ms or 0.0), 2),
        "error": str(error or "").strip()[:300],
    }
    with cls._runtime_lock:
        # The deque is bounded, so the oldest row is dropped automatically
        # once the window is full.
        cls._runtime_metrics.append(entry)


@classmethod
def get_runtime_snapshot(cls) -> Dict[str, Any]:
    """Return an aggregated snapshot of the recent-call window.

    Returns:
        A dict with ``window_size``, ``total_calls``, ``success_calls``,
        ``failed_calls``, ``success_rate`` (percent), ``avg_latency_ms``,
        ``last_call`` (most recent row, ``{}`` when the window is empty) and
        ``last_error`` (error text of the most recent failed row that carries
        one, else ``""``).

    The returned rows are copies: mutating the snapshot never alters the
    metrics stored in the shared window.
    """
    with cls._runtime_lock:
        # Copy every row while holding the lock; handing out the stored dicts
        # themselves would let a caller edit the live window through
        # ``snapshot["last_call"]``.
        rows = [dict(item) for item in cls._runtime_metrics]
        window_size = cls._runtime_metrics.maxlen

    total_calls = len(rows)
    success_calls = sum(1 for item in rows if item.get("success"))
    failed_calls = total_calls - success_calls

    if total_calls:
        latency_sum = sum(float(item.get("latency_ms") or 0.0) for item in rows)
        avg_latency_ms = round(latency_sum / total_calls, 2)
        success_rate = round((success_calls / total_calls) * 100, 2)
    else:
        # An empty window reports zeros instead of dividing by zero.
        avg_latency_ms = 0.0
        success_rate = 0.0

    last_call = rows[-1] if rows else {}

    # Walk newest-to-oldest and stop at the first failed row with an error.
    last_error = next(
        (
            str(item.get("error") or "").strip()
            for item in reversed(rows)
            if not item.get("success") and item.get("error")
        ),
        "",
    )

    return {
        "window_size": window_size,
        "total_calls": total_calls,
        "success_calls": success_calls,
        "failed_calls": failed_calls,
        "success_rate": success_rate,
        "avg_latency_ms": avg_latency_ms,
        "last_call": last_call,
        "last_error": last_error,
    }
0.0 + if latency_ms <= 0: + latency_ms = (time.monotonic() - started_at) * 1000 + + self._record_runtime_metric( + provider=self.provider, + backend=str(self.config.get("backend", "") or ""), + scene=str(self.config.get("scene", "") or ""), + model=self.model or str(self.mode or ""), + success=bool(result and result.get("text")), + latency_ms=latency_ms, + error=self.last_error, + ) + return result def _generate_openai( self,