完善系统健康面板并接入AI运行态观测

This commit is contained in:
liuwei
2026-04-30 15:12:47 +08:00
parent 83910b287b
commit 4ddab01b8d
4 changed files with 206 additions and 9 deletions

View File

@@ -15,6 +15,7 @@ import toml
from utils.markdown_to_image import get_md2img_health_snapshot, warmup_md2img_browser_sync from utils.markdown_to_image import get_md2img_health_snapshot, warmup_md2img_browser_sync
from utils.ai.llm_registry import LLMRegistry from utils.ai.llm_registry import LLMRegistry
from base.plugin_common.plugin_interface import PluginStatus from base.plugin_common.plugin_interface import PluginStatus
from utils.ai.unified_llm import UnifiedLLMClient
# 创建系统信息蓝图 # 创建系统信息蓝图
system_bp = Blueprint('system', __name__) system_bp = Blueprint('system', __name__)
@@ -403,6 +404,33 @@ def api_system_health_summary():
# 错误数量直接复用现有统计库,避免为了首页卡片再单独写一套 SQL。 # 错误数量直接复用现有统计库,避免为了首页卡片再单独写一套 SQL。
_, recent_error_count = server.stats_db.get_error_logs(days=1, page=1, limit=1) _, recent_error_count = server.stats_db.get_error_logs(days=1, page=1, limit=1)
# 基础设施健康:
# 1. MySQL 用最轻量的 SELECT 1 做可用性探测;
# 2. Redis 用 PING 验证连接池当前是否可拿到可用连接;
# 3. 即使探测失败也只反馈到看板,不影响主接口整体返回。
mysql_status = "healthy"
mysql_summary = "连接正常"
try:
mysql_conn = server.db_manager.get_mysql_connection()
try:
with mysql_conn.cursor() as cursor:
cursor.execute("SELECT 1")
cursor.fetchone()
finally:
mysql_conn.close()
except Exception as mysql_error:
mysql_status = "danger"
mysql_summary = f"MySQL 探测失败: {mysql_error}"
redis_status = "healthy"
redis_summary = "连接正常"
try:
redis_conn = server.db_manager.get_redis_connection()
redis_conn.ping()
except Exception as redis_error:
redis_status = "danger"
redis_summary = f"Redis 探测失败: {redis_error}"
# md2img 健康快照已经有现成实现,这里只做聚合,不主动预热运行时。 # md2img 健康快照已经有现成实现,这里只做聚合,不主动预热运行时。
md2img_snapshot = get_md2img_health_snapshot(ensure_runtime=False) or {} md2img_snapshot = get_md2img_health_snapshot(ensure_runtime=False) or {}
browser_ready = bool( browser_ready = bool(
@@ -452,6 +480,28 @@ def api_system_health_summary():
md2img_status = "danger" md2img_status = "danger"
md2img_summary = "运行时未就绪,相关转图能力可能不可用" md2img_summary = "运行时未就绪,相关转图能力可能不可用"
# AI 运行态:
# 1. 统一从 UnifiedLLMClient 最近调用窗口读取,避免各插件单独维护监控数据;
# 2. 若当前窗口还没有调用记录,则以 warning 状态提示“暂无统一 LLM 调用记录”,而不是当作 danger 级异常。
ai_runtime = UnifiedLLMClient.get_runtime_snapshot()
ai_total_calls = int(ai_runtime.get("total_calls") or 0)
ai_failed_calls = int(ai_runtime.get("failed_calls") or 0)
if ai_total_calls <= 0:
ai_status = "warning"
ai_summary = "最近窗口内暂无统一 LLM 调用记录"
elif ai_failed_calls > 0:
ai_status = "warning"
ai_summary = (
f"最近 {ai_total_calls} 次调用中失败 {ai_failed_calls} 次,"
f"平均耗时 {ai_runtime.get('avg_latency_ms', 0)}ms"
)
else:
ai_status = "healthy"
ai_summary = (
f"最近 {ai_total_calls} 次调用全部成功,"
f"平均耗时 {ai_runtime.get('avg_latency_ms', 0)}ms"
)
return jsonify({ return jsonify({
"success": True, "success": True,
"data": { "data": {
@@ -473,6 +523,27 @@ def api_system_health_summary():
"recent_24h_count": recent_error_count, "recent_24h_count": recent_error_count,
"summary": error_summary, "summary": error_summary,
}, },
"infrastructure": {
"status": "healthy" if mysql_status == "healthy" and redis_status == "healthy" else "danger",
"summary": (
"MySQL / Redis 均正常"
if mysql_status == "healthy" and redis_status == "healthy"
else "存在基础设施连接异常"
),
"mysql": {
"status": mysql_status,
"summary": mysql_summary,
},
"redis": {
"status": redis_status,
"summary": redis_summary,
},
},
"ai_runtime": {
"status": ai_status,
"summary": ai_summary,
**ai_runtime,
},
"md2img": { "md2img": {
"status": md2img_status, "status": md2img_status,
"healthy": md2img_healthy, "healthy": md2img_healthy,

View File

@@ -355,6 +355,26 @@
recent_24h_count: 0, recent_24h_count: 0,
summary: '加载中...' summary: '加载中...'
}, },
infrastructure: {
status: 'warning',
summary: '加载中...',
mysql: {
status: 'warning',
summary: '加载中...'
},
redis: {
status: 'warning',
summary: '加载中...'
}
},
ai_runtime: {
status: 'warning',
total_calls: 0,
failed_calls: 0,
avg_latency_ms: 0,
summary: '加载中...',
last_call: {}
},
md2img: { md2img: {
status: 'warning', status: 'warning',
healthy: false, healthy: false,
@@ -401,6 +421,8 @@
const robot = this.healthSummary.robot || {}; const robot = this.healthSummary.robot || {};
const plugins = this.healthSummary.plugins || {}; const plugins = this.healthSummary.plugins || {};
const errors = this.healthSummary.errors || {}; const errors = this.healthSummary.errors || {};
const infrastructure = this.healthSummary.infrastructure || {};
const aiRuntime = this.healthSummary.ai_runtime || {};
const md2img = this.healthSummary.md2img || {}; const md2img = this.healthSummary.md2img || {};
return [ return [
{ {
@@ -427,6 +449,22 @@
summary: errors.summary || '暂无状态', summary: errors.summary || '暂无状态',
extra: '统计窗口:近 24 小时' extra: '统计窗口:近 24 小时'
}, },
{
key: 'infrastructure',
title: '基础设施',
status: infrastructure.status || 'warning',
value: infrastructure.status === 'healthy' ? '正常' : '异常',
summary: infrastructure.summary || '暂无状态',
extra: `MySQL${((infrastructure.mysql || {}).status === 'healthy') ? '正常' : '异常'} / Redis${((infrastructure.redis || {}).status === 'healthy') ? '正常' : '异常'}`
},
{
key: 'ai_runtime',
title: 'AI 运行态',
status: aiRuntime.status || 'warning',
value: `${aiRuntime.avg_latency_ms || 0} ms`,
summary: aiRuntime.summary || '暂无状态',
extra: `最近调用 ${aiRuntime.total_calls || 0} 次,失败 ${aiRuntime.failed_calls || 0}`
},
{ {
key: 'md2img', key: 'md2img',
title: 'Markdown 转图', title: 'Markdown 转图',
@@ -978,7 +1016,7 @@
.health-grid { .health-grid {
display: grid; display: grid;
grid-template-columns: repeat(4, minmax(0, 1fr)); grid-template-columns: repeat(3, minmax(0, 1fr));
gap: 16px; gap: 16px;
margin-top: 18px; margin-top: 18px;
} }

View File

@@ -18,6 +18,7 @@
- 已将插件调用统计改为主链路直接埋点,降低维护复杂度 - 已将插件调用统计改为主链路直接埋点,降低维护复杂度
- 已在消息主链路接入 `trace_id`,用于串联消息处理、插件统计与异常日志 - 已在消息主链路接入 `trace_id`,用于串联消息处理、插件统计与异常日志
- 已在后台首页补充“系统健康快照”,可集中查看机器人连接、插件运行、近 24 小时异常与 md2img 运行状态 - 已在后台首页补充“系统健康快照”,可集中查看机器人连接、插件运行、近 24 小时异常与 md2img 运行状态
- 已补充 MySQL / Redis 连接探测与统一 LLM 最近调用快照,基础设施与 AI 运行态可直接在首页查看
## 2. 项目现状判断 ## 2. 项目现状判断
@@ -314,6 +315,7 @@
当前进展: 当前进展:
- 第一阶段已完成:首页已增加系统健康快照,可快速查看核心运行状态 - 第一阶段已完成:首页已增加系统健康快照,可快速查看核心运行状态
- 第二阶段已完成:已补充基础设施连通性与 AI 最近调用耗时/成功率快照
- 后续可继续补充更细粒度的吞吐、延迟、存储连接与 AI 调用链指标 - 后续可继续补充更细粒度的吞吐、延迟、存储连接与 AI 调用链指标
建议内容: 建议内容:

View File

@@ -5,6 +5,8 @@ import binascii
import json import json
import mimetypes import mimetypes
import time import time
from collections import deque
from threading import Lock
from typing import Any, Dict, List, Optional, Tuple from typing import Any, Dict, List, Optional, Tuple
from urllib.parse import urlparse from urllib.parse import urlparse
@@ -18,6 +20,13 @@ from utils.ai.llm_registry import LLMRegistry
class UnifiedLLMClient: class UnifiedLLMClient:
"""统一的 LLM 调用客户端,兼容 OpenAI-compatible 与 Dify。""" """统一的 LLM 调用客户端,兼容 OpenAI-compatible 与 Dify。"""
# 运行时观测快照:
# 1. 只保留最近一小段调用窗口,避免无限增长;
# 2. 放在统一客户端层,所有复用该客户端的插件天然受益;
# 3. 这里存的不是业务明细,而是运维看板需要的轻量健康指标。
_runtime_metrics = deque(maxlen=50)
_runtime_lock = Lock()
def __init__(self, config: Optional[Dict[str, Any]] = None): def __init__(self, config: Optional[Dict[str, Any]] = None):
self.LOG = logger self.LOG = logger
self.raw_config = config or {} self.raw_config = config or {}
@@ -41,6 +50,62 @@ class UnifiedLLMClient:
self.default_system_prompt = str(self.config.get("system_prompt", "")).strip() self.default_system_prompt = str(self.config.get("system_prompt", "")).strip()
self.last_error = "" self.last_error = ""
@classmethod
def _record_runtime_metric(
    cls,
    *,
    provider: str,
    backend: str,
    scene: str,
    model: str,
    success: bool,
    latency_ms: float,
    error: str = "",
) -> None:
    """Append the outcome of one LLM call to the shared recent-call window.

    The entry is stored in the class-level ``_runtime_metrics`` deque while
    holding ``_runtime_lock``. Text fields are coerced to stripped strings
    (falsy values become ``""``), ``latency_ms`` is rounded to two decimals,
    and the error text is truncated to 300 characters.

    Args:
        provider: Provider identifier of the client that made the call.
        backend: Backend label taken from the client configuration.
        scene: Scene label taken from the client configuration.
        model: Model (or mode) name used for the call.
        success: Whether the call is considered successful.
        latency_ms: Call latency in milliseconds.
        error: Error text associated with the call, if any.
    """

    def _text(value: Any) -> str:
        # Normalise None / empty / non-string values into a trimmed string.
        return str(value or "").strip()

    with cls._runtime_lock:
        entry = {
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
            "provider": _text(provider),
            "backend": _text(backend),
            "scene": _text(scene),
            "model": _text(model),
            "success": bool(success),
            "latency_ms": round(float(latency_ms or 0.0), 2),
        }
        # Strip first, then cap the length so stored errors stay small.
        entry["error"] = _text(error)[:300]
        cls._runtime_metrics.append(entry)
@classmethod
def get_runtime_snapshot(cls) -> Dict[str, Any]:
    """Return an aggregated snapshot of the recent LLM call window.

    The window is copied out of ``_runtime_metrics`` while holding
    ``_runtime_lock``; all aggregation happens on that copy afterwards.

    Returns:
        A dict with these keys:

        - ``window_size``: ``maxlen`` of the metrics deque.
        - ``total_calls`` / ``success_calls`` / ``failed_calls``: counts
          over the current window.
        - ``success_rate``: percentage of successful calls (two decimals),
          ``0.0`` when the window is empty.
        - ``avg_latency_ms``: mean of the recorded ``latency_ms`` values
          (two decimals), ``0.0`` when the window is empty.
        - ``last_call``: a copy of the most recent entry, ``{}`` if none.
        - ``last_error``: error text of the most recent failed entry that
          has a non-empty error, ``""`` if none.
    """
    with cls._runtime_lock:
        rows = list(cls._runtime_metrics)

    total_calls = len(rows)
    success_calls = sum(1 for item in rows if item.get("success"))
    failed_calls = total_calls - success_calls

    if total_calls:
        latency_sum = sum(float(item.get("latency_ms") or 0.0) for item in rows)
        avg_latency_ms = round(latency_sum / total_calls, 2)
        success_rate = round((success_calls / total_calls) * 100, 2)
    else:
        avg_latency_ms = 0.0
        success_rate = 0.0

    # Hand out a copy: the stored entry lives in the shared deque, so a
    # caller mutating the snapshot must not alter the recorded metrics.
    last_call = dict(rows[-1]) if rows else {}

    # Walk from newest to oldest and take the first failure with an error.
    last_error = next(
        (
            str(item.get("error") or "").strip()
            for item in reversed(rows)
            if not item.get("success") and item.get("error")
        ),
        "",
    )

    return {
        "window_size": cls._runtime_metrics.maxlen,
        "total_calls": total_calls,
        "success_calls": success_calls,
        "failed_calls": failed_calls,
        "success_rate": success_rate,
        "avg_latency_ms": avg_latency_ms,
        "last_call": last_call,
        "last_error": last_error,
    }
def is_available(self) -> bool: def is_available(self) -> bool:
if not self.enabled: if not self.enabled:
return False return False
@@ -168,29 +233,50 @@ class UnifiedLLMClient:
image_urls: Optional[List[str]] = None, image_urls: Optional[List[str]] = None,
files: Optional[List[Dict[str, Any]]] = None, files: Optional[List[Dict[str, Any]]] = None,
) -> Optional[Dict[str, Any]]: ) -> Optional[Dict[str, Any]]:
started_at = time.monotonic()
self.last_error = "" self.last_error = ""
result: Optional[Dict[str, Any]] = None
if not self.is_available(): if not self.is_available():
self.last_error = "client_unavailable" self.last_error = "client_unavailable"
return None elif self.provider == "dify":
result = self._generate_dify(
if self.provider == "dify":
return self._generate_dify(
prompt=prompt, prompt=prompt,
user=user, user=user,
inputs=inputs or {}, inputs=inputs or {},
tag=tag, tag=tag,
files=files or [], files=files or [],
) )
if self.provider == "openai_compatible": elif self.provider == "openai_compatible":
return self._generate_openai( result = self._generate_openai(
system_prompt=system_prompt, system_prompt=system_prompt,
user_prompt=user_prompt or prompt, user_prompt=user_prompt or prompt,
user=user, user=user,
image_urls=image_urls or [], image_urls=image_urls or [],
) )
else:
self.last_error = f"unsupported_provider:{self.provider}"
self.last_error = f"unsupported_provider:{self.provider}" # 统一在出口记录运行时快照,避免每种 provider 都重复埋点逻辑。
return None usage = (result or {}).get("usage", {}) if isinstance(result, dict) else {}
latency_ms = 0.0
if isinstance(usage, dict) and usage.get("latency") not in (None, ""):
try:
latency_ms = float(usage.get("latency")) * 1000
except Exception:
latency_ms = 0.0
if latency_ms <= 0:
latency_ms = (time.monotonic() - started_at) * 1000
self._record_runtime_metric(
provider=self.provider,
backend=str(self.config.get("backend", "") or ""),
scene=str(self.config.get("scene", "") or ""),
model=self.model or str(self.mode or ""),
success=bool(result and result.get("text")),
latency_ms=latency_ms,
error=self.last_error,
)
return result
def _generate_openai( def _generate_openai(
self, self,