完善系统健康面板并接入AI运行态观测
This commit is contained in:
@@ -15,6 +15,7 @@ import toml
|
|||||||
from utils.markdown_to_image import get_md2img_health_snapshot, warmup_md2img_browser_sync
|
from utils.markdown_to_image import get_md2img_health_snapshot, warmup_md2img_browser_sync
|
||||||
from utils.ai.llm_registry import LLMRegistry
|
from utils.ai.llm_registry import LLMRegistry
|
||||||
from base.plugin_common.plugin_interface import PluginStatus
|
from base.plugin_common.plugin_interface import PluginStatus
|
||||||
|
from utils.ai.unified_llm import UnifiedLLMClient
|
||||||
|
|
||||||
# 创建系统信息蓝图
|
# 创建系统信息蓝图
|
||||||
system_bp = Blueprint('system', __name__)
|
system_bp = Blueprint('system', __name__)
|
||||||
@@ -403,6 +404,33 @@ def api_system_health_summary():
|
|||||||
# 错误数量直接复用现有统计库,避免为了首页卡片再单独写一套 SQL。
|
# 错误数量直接复用现有统计库,避免为了首页卡片再单独写一套 SQL。
|
||||||
_, recent_error_count = server.stats_db.get_error_logs(days=1, page=1, limit=1)
|
_, recent_error_count = server.stats_db.get_error_logs(days=1, page=1, limit=1)
|
||||||
|
|
||||||
|
# 基础设施健康:
|
||||||
|
# 1. MySQL 用最轻量的 SELECT 1 做可用性探测;
|
||||||
|
# 2. Redis 用 PING 验证连接池当前是否可拿到可用连接;
|
||||||
|
# 3. 即使探测失败也只反馈到看板,不影响主接口整体返回。
|
||||||
|
mysql_status = "healthy"
|
||||||
|
mysql_summary = "连接正常"
|
||||||
|
try:
|
||||||
|
mysql_conn = server.db_manager.get_mysql_connection()
|
||||||
|
try:
|
||||||
|
with mysql_conn.cursor() as cursor:
|
||||||
|
cursor.execute("SELECT 1")
|
||||||
|
cursor.fetchone()
|
||||||
|
finally:
|
||||||
|
mysql_conn.close()
|
||||||
|
except Exception as mysql_error:
|
||||||
|
mysql_status = "danger"
|
||||||
|
mysql_summary = f"MySQL 探测失败: {mysql_error}"
|
||||||
|
|
||||||
|
redis_status = "healthy"
|
||||||
|
redis_summary = "连接正常"
|
||||||
|
try:
|
||||||
|
redis_conn = server.db_manager.get_redis_connection()
|
||||||
|
redis_conn.ping()
|
||||||
|
except Exception as redis_error:
|
||||||
|
redis_status = "danger"
|
||||||
|
redis_summary = f"Redis 探测失败: {redis_error}"
|
||||||
|
|
||||||
# md2img 健康快照已经有现成实现,这里只做聚合,不主动预热运行时。
|
# md2img 健康快照已经有现成实现,这里只做聚合,不主动预热运行时。
|
||||||
md2img_snapshot = get_md2img_health_snapshot(ensure_runtime=False) or {}
|
md2img_snapshot = get_md2img_health_snapshot(ensure_runtime=False) or {}
|
||||||
browser_ready = bool(
|
browser_ready = bool(
|
||||||
@@ -452,6 +480,28 @@ def api_system_health_summary():
|
|||||||
md2img_status = "danger"
|
md2img_status = "danger"
|
||||||
md2img_summary = "运行时未就绪,相关转图能力可能不可用"
|
md2img_summary = "运行时未就绪,相关转图能力可能不可用"
|
||||||
|
|
||||||
|
# AI 运行态:
|
||||||
|
# 1. 统一从 UnifiedLLMClient 最近调用窗口读取,避免各插件单独维护监控数据;
|
||||||
|
# 2. 若当前窗口还没有调用记录,就明确返回“暂无调用”,避免误判成异常。
|
||||||
|
ai_runtime = UnifiedLLMClient.get_runtime_snapshot()
|
||||||
|
ai_total_calls = int(ai_runtime.get("total_calls") or 0)
|
||||||
|
ai_failed_calls = int(ai_runtime.get("failed_calls") or 0)
|
||||||
|
if ai_total_calls <= 0:
|
||||||
|
ai_status = "warning"
|
||||||
|
ai_summary = "最近窗口内暂无统一 LLM 调用记录"
|
||||||
|
elif ai_failed_calls > 0:
|
||||||
|
ai_status = "warning"
|
||||||
|
ai_summary = (
|
||||||
|
f"最近 {ai_total_calls} 次调用中失败 {ai_failed_calls} 次,"
|
||||||
|
f"平均耗时 {ai_runtime.get('avg_latency_ms', 0)}ms"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
ai_status = "healthy"
|
||||||
|
ai_summary = (
|
||||||
|
f"最近 {ai_total_calls} 次调用全部成功,"
|
||||||
|
f"平均耗时 {ai_runtime.get('avg_latency_ms', 0)}ms"
|
||||||
|
)
|
||||||
|
|
||||||
return jsonify({
|
return jsonify({
|
||||||
"success": True,
|
"success": True,
|
||||||
"data": {
|
"data": {
|
||||||
@@ -473,6 +523,27 @@ def api_system_health_summary():
|
|||||||
"recent_24h_count": recent_error_count,
|
"recent_24h_count": recent_error_count,
|
||||||
"summary": error_summary,
|
"summary": error_summary,
|
||||||
},
|
},
|
||||||
|
"infrastructure": {
|
||||||
|
"status": "healthy" if mysql_status == "healthy" and redis_status == "healthy" else "danger",
|
||||||
|
"summary": (
|
||||||
|
"MySQL / Redis 均正常"
|
||||||
|
if mysql_status == "healthy" and redis_status == "healthy"
|
||||||
|
else "存在基础设施连接异常"
|
||||||
|
),
|
||||||
|
"mysql": {
|
||||||
|
"status": mysql_status,
|
||||||
|
"summary": mysql_summary,
|
||||||
|
},
|
||||||
|
"redis": {
|
||||||
|
"status": redis_status,
|
||||||
|
"summary": redis_summary,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
"ai_runtime": {
|
||||||
|
"status": ai_status,
|
||||||
|
"summary": ai_summary,
|
||||||
|
**ai_runtime,
|
||||||
|
},
|
||||||
"md2img": {
|
"md2img": {
|
||||||
"status": md2img_status,
|
"status": md2img_status,
|
||||||
"healthy": md2img_healthy,
|
"healthy": md2img_healthy,
|
||||||
|
|||||||
@@ -355,6 +355,26 @@
|
|||||||
recent_24h_count: 0,
|
recent_24h_count: 0,
|
||||||
summary: '加载中...'
|
summary: '加载中...'
|
||||||
},
|
},
|
||||||
|
infrastructure: {
|
||||||
|
status: 'warning',
|
||||||
|
summary: '加载中...',
|
||||||
|
mysql: {
|
||||||
|
status: 'warning',
|
||||||
|
summary: '加载中...'
|
||||||
|
},
|
||||||
|
redis: {
|
||||||
|
status: 'warning',
|
||||||
|
summary: '加载中...'
|
||||||
|
}
|
||||||
|
},
|
||||||
|
ai_runtime: {
|
||||||
|
status: 'warning',
|
||||||
|
total_calls: 0,
|
||||||
|
failed_calls: 0,
|
||||||
|
avg_latency_ms: 0,
|
||||||
|
summary: '加载中...',
|
||||||
|
last_call: {}
|
||||||
|
},
|
||||||
md2img: {
|
md2img: {
|
||||||
status: 'warning',
|
status: 'warning',
|
||||||
healthy: false,
|
healthy: false,
|
||||||
@@ -401,6 +421,8 @@
|
|||||||
const robot = this.healthSummary.robot || {};
|
const robot = this.healthSummary.robot || {};
|
||||||
const plugins = this.healthSummary.plugins || {};
|
const plugins = this.healthSummary.plugins || {};
|
||||||
const errors = this.healthSummary.errors || {};
|
const errors = this.healthSummary.errors || {};
|
||||||
|
const infrastructure = this.healthSummary.infrastructure || {};
|
||||||
|
const aiRuntime = this.healthSummary.ai_runtime || {};
|
||||||
const md2img = this.healthSummary.md2img || {};
|
const md2img = this.healthSummary.md2img || {};
|
||||||
return [
|
return [
|
||||||
{
|
{
|
||||||
@@ -427,6 +449,22 @@
|
|||||||
summary: errors.summary || '暂无状态',
|
summary: errors.summary || '暂无状态',
|
||||||
extra: '统计窗口:近 24 小时'
|
extra: '统计窗口:近 24 小时'
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
key: 'infrastructure',
|
||||||
|
title: '基础设施',
|
||||||
|
status: infrastructure.status || 'warning',
|
||||||
|
value: infrastructure.status === 'healthy' ? '正常' : '异常',
|
||||||
|
summary: infrastructure.summary || '暂无状态',
|
||||||
|
extra: `MySQL:${((infrastructure.mysql || {}).status === 'healthy') ? '正常' : '异常'} / Redis:${((infrastructure.redis || {}).status === 'healthy') ? '正常' : '异常'}`
|
||||||
|
},
|
||||||
|
{
|
||||||
|
key: 'ai_runtime',
|
||||||
|
title: 'AI 运行态',
|
||||||
|
status: aiRuntime.status || 'warning',
|
||||||
|
value: `${aiRuntime.avg_latency_ms || 0} ms`,
|
||||||
|
summary: aiRuntime.summary || '暂无状态',
|
||||||
|
extra: `最近调用 ${aiRuntime.total_calls || 0} 次,失败 ${aiRuntime.failed_calls || 0} 次`
|
||||||
|
},
|
||||||
{
|
{
|
||||||
key: 'md2img',
|
key: 'md2img',
|
||||||
title: 'Markdown 转图',
|
title: 'Markdown 转图',
|
||||||
@@ -978,7 +1016,7 @@
|
|||||||
|
|
||||||
.health-grid {
|
.health-grid {
|
||||||
display: grid;
|
display: grid;
|
||||||
grid-template-columns: repeat(4, minmax(0, 1fr));
|
grid-template-columns: repeat(3, minmax(0, 1fr));
|
||||||
gap: 16px;
|
gap: 16px;
|
||||||
margin-top: 18px;
|
margin-top: 18px;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -18,6 +18,7 @@
|
|||||||
- 已将插件调用统计改为主链路直接埋点,降低维护复杂度
|
- 已将插件调用统计改为主链路直接埋点,降低维护复杂度
|
||||||
- 已在消息主链路接入 `trace_id`,用于串联消息处理、插件统计与异常日志
|
- 已在消息主链路接入 `trace_id`,用于串联消息处理、插件统计与异常日志
|
||||||
- 已在后台首页补充“系统健康快照”,可集中查看机器人连接、插件运行、近 24 小时异常与 md2img 运行状态
|
- 已在后台首页补充“系统健康快照”,可集中查看机器人连接、插件运行、近 24 小时异常与 md2img 运行状态
|
||||||
|
- 已补充 MySQL / Redis 连接探测与统一 LLM 最近调用快照,基础设施与 AI 运行态可直接在首页查看
|
||||||
|
|
||||||
## 2. 项目现状判断
|
## 2. 项目现状判断
|
||||||
|
|
||||||
@@ -314,6 +315,7 @@
|
|||||||
当前进展:
|
当前进展:
|
||||||
|
|
||||||
- 第一阶段已完成:首页已增加系统健康快照,可快速查看核心运行状态
|
- 第一阶段已完成:首页已增加系统健康快照,可快速查看核心运行状态
|
||||||
|
- 第二阶段已完成:已补充基础设施连通性与 AI 最近调用耗时/成功率快照
|
||||||
- 后续可继续补充更细粒度的吞吐、延迟、存储连接与 AI 调用链指标
|
- 后续可继续补充更细粒度的吞吐、延迟、存储连接与 AI 调用链指标
|
||||||
|
|
||||||
建议内容:
|
建议内容:
|
||||||
|
|||||||
@@ -5,6 +5,8 @@ import binascii
|
|||||||
import json
|
import json
|
||||||
import mimetypes
|
import mimetypes
|
||||||
import time
|
import time
|
||||||
|
from collections import deque
|
||||||
|
from threading import Lock
|
||||||
from typing import Any, Dict, List, Optional, Tuple
|
from typing import Any, Dict, List, Optional, Tuple
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
@@ -18,6 +20,13 @@ from utils.ai.llm_registry import LLMRegistry
|
|||||||
class UnifiedLLMClient:
|
class UnifiedLLMClient:
|
||||||
"""统一的 LLM 调用客户端,兼容 OpenAI-compatible 与 Dify。"""
|
"""统一的 LLM 调用客户端,兼容 OpenAI-compatible 与 Dify。"""
|
||||||
|
|
||||||
|
# 运行时观测快照:
|
||||||
|
# 1. 只保留最近一小段调用窗口,避免无限增长;
|
||||||
|
# 2. 放在统一客户端层,所有复用该客户端的插件天然受益;
|
||||||
|
# 3. 这里存的不是业务明细,而是运维看板需要的轻量健康指标。
|
||||||
|
_runtime_metrics = deque(maxlen=50)
|
||||||
|
_runtime_lock = Lock()
|
||||||
|
|
||||||
def __init__(self, config: Optional[Dict[str, Any]] = None):
|
def __init__(self, config: Optional[Dict[str, Any]] = None):
|
||||||
self.LOG = logger
|
self.LOG = logger
|
||||||
self.raw_config = config or {}
|
self.raw_config = config or {}
|
||||||
@@ -41,6 +50,62 @@ class UnifiedLLMClient:
|
|||||||
self.default_system_prompt = str(self.config.get("system_prompt", "")).strip()
|
self.default_system_prompt = str(self.config.get("system_prompt", "")).strip()
|
||||||
self.last_error = ""
|
self.last_error = ""
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def _record_runtime_metric(
|
||||||
|
cls,
|
||||||
|
*,
|
||||||
|
provider: str,
|
||||||
|
backend: str,
|
||||||
|
scene: str,
|
||||||
|
model: str,
|
||||||
|
success: bool,
|
||||||
|
latency_ms: float,
|
||||||
|
error: str = "",
|
||||||
|
) -> None:
|
||||||
|
"""记录最近一次 LLM 调用结果,供后台健康面板聚合展示。"""
|
||||||
|
with cls._runtime_lock:
|
||||||
|
cls._runtime_metrics.append({
|
||||||
|
"timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
|
||||||
|
"provider": str(provider or "").strip(),
|
||||||
|
"backend": str(backend or "").strip(),
|
||||||
|
"scene": str(scene or "").strip(),
|
||||||
|
"model": str(model or "").strip(),
|
||||||
|
"success": bool(success),
|
||||||
|
"latency_ms": round(float(latency_ms or 0.0), 2),
|
||||||
|
"error": str(error or "").strip()[:300],
|
||||||
|
})
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def get_runtime_snapshot(cls) -> Dict[str, Any]:
|
||||||
|
"""返回最近调用窗口的聚合快照,供后台可观测性接口直接复用。"""
|
||||||
|
with cls._runtime_lock:
|
||||||
|
rows = list(cls._runtime_metrics)
|
||||||
|
|
||||||
|
total_calls = len(rows)
|
||||||
|
success_calls = sum(1 for item in rows if item.get("success"))
|
||||||
|
failed_calls = total_calls - success_calls
|
||||||
|
avg_latency_ms = round(
|
||||||
|
sum(float(item.get("latency_ms") or 0.0) for item in rows) / total_calls,
|
||||||
|
2
|
||||||
|
) if total_calls else 0.0
|
||||||
|
last_call = rows[-1] if rows else {}
|
||||||
|
last_error = ""
|
||||||
|
for item in reversed(rows):
|
||||||
|
if not item.get("success") and item.get("error"):
|
||||||
|
last_error = str(item.get("error") or "").strip()
|
||||||
|
break
|
||||||
|
|
||||||
|
return {
|
||||||
|
"window_size": cls._runtime_metrics.maxlen,
|
||||||
|
"total_calls": total_calls,
|
||||||
|
"success_calls": success_calls,
|
||||||
|
"failed_calls": failed_calls,
|
||||||
|
"success_rate": round((success_calls / total_calls) * 100, 2) if total_calls else 0.0,
|
||||||
|
"avg_latency_ms": avg_latency_ms,
|
||||||
|
"last_call": last_call,
|
||||||
|
"last_error": last_error,
|
||||||
|
}
|
||||||
|
|
||||||
def is_available(self) -> bool:
|
def is_available(self) -> bool:
|
||||||
if not self.enabled:
|
if not self.enabled:
|
||||||
return False
|
return False
|
||||||
@@ -168,29 +233,50 @@ class UnifiedLLMClient:
|
|||||||
image_urls: Optional[List[str]] = None,
|
image_urls: Optional[List[str]] = None,
|
||||||
files: Optional[List[Dict[str, Any]]] = None,
|
files: Optional[List[Dict[str, Any]]] = None,
|
||||||
) -> Optional[Dict[str, Any]]:
|
) -> Optional[Dict[str, Any]]:
|
||||||
|
started_at = time.monotonic()
|
||||||
self.last_error = ""
|
self.last_error = ""
|
||||||
|
result: Optional[Dict[str, Any]] = None
|
||||||
if not self.is_available():
|
if not self.is_available():
|
||||||
self.last_error = "client_unavailable"
|
self.last_error = "client_unavailable"
|
||||||
return None
|
elif self.provider == "dify":
|
||||||
|
result = self._generate_dify(
|
||||||
if self.provider == "dify":
|
|
||||||
return self._generate_dify(
|
|
||||||
prompt=prompt,
|
prompt=prompt,
|
||||||
user=user,
|
user=user,
|
||||||
inputs=inputs or {},
|
inputs=inputs or {},
|
||||||
tag=tag,
|
tag=tag,
|
||||||
files=files or [],
|
files=files or [],
|
||||||
)
|
)
|
||||||
if self.provider == "openai_compatible":
|
elif self.provider == "openai_compatible":
|
||||||
return self._generate_openai(
|
result = self._generate_openai(
|
||||||
system_prompt=system_prompt,
|
system_prompt=system_prompt,
|
||||||
user_prompt=user_prompt or prompt,
|
user_prompt=user_prompt or prompt,
|
||||||
user=user,
|
user=user,
|
||||||
image_urls=image_urls or [],
|
image_urls=image_urls or [],
|
||||||
)
|
)
|
||||||
|
else:
|
||||||
self.last_error = f"unsupported_provider:{self.provider}"
|
self.last_error = f"unsupported_provider:{self.provider}"
|
||||||
return None
|
|
||||||
|
# 统一在出口记录运行时快照,避免每种 provider 都重复埋点逻辑。
|
||||||
|
usage = (result or {}).get("usage", {}) if isinstance(result, dict) else {}
|
||||||
|
latency_ms = 0.0
|
||||||
|
if isinstance(usage, dict) and usage.get("latency") not in (None, ""):
|
||||||
|
try:
|
||||||
|
latency_ms = float(usage.get("latency")) * 1000
|
||||||
|
except Exception:
|
||||||
|
latency_ms = 0.0
|
||||||
|
if latency_ms <= 0:
|
||||||
|
latency_ms = (time.monotonic() - started_at) * 1000
|
||||||
|
|
||||||
|
self._record_runtime_metric(
|
||||||
|
provider=self.provider,
|
||||||
|
backend=str(self.config.get("backend", "") or ""),
|
||||||
|
scene=str(self.config.get("scene", "") or ""),
|
||||||
|
model=self.model or str(self.mode or ""),
|
||||||
|
success=bool(result and result.get("text")),
|
||||||
|
latency_ms=latency_ms,
|
||||||
|
error=self.last_error,
|
||||||
|
)
|
||||||
|
return result
|
||||||
|
|
||||||
def _generate_openai(
|
def _generate_openai(
|
||||||
self,
|
self,
|
||||||
|
|||||||
Reference in New Issue
Block a user