Revert "增强首页LLM运行态与任务调度监控卡片"

This reverts commit 5487142fe1.
This commit is contained in:
Liu
2026-05-01 12:45:39 +08:00
parent d1c2aa06f0
commit 9b9059a6d9
2 changed files with 69 additions and 448 deletions

View File

@@ -16,7 +16,6 @@ from utils.markdown_to_image import get_md2img_health_snapshot, warmup_md2img_br
from utils.ai.llm_registry import LLMRegistry
from base.plugin_common.plugin_interface import PluginStatus
from utils.ai.unified_llm import UnifiedLLMClient
from utils.decorator.async_job import async_job
# 创建系统信息蓝图
system_bp = Blueprint('system', __name__)
@@ -240,269 +239,6 @@ def _extract_redis_runtime_snapshot(db_manager) -> dict:
return snapshot
def _parse_snapshot_datetime(value: str | None) -> datetime | None:
"""把首页摘要里常用的时间字符串安全转换为 datetime。"""
text = str(value or "").strip()
if not text:
return None
try:
return datetime.strptime(text, "%Y-%m-%d %H:%M:%S")
except ValueError:
return None
def _count_enabled_runtime_items(items) -> int:
    """Count the entries that are considered enabled.

    Compatibility notes:
    1. In the newer catalog model, providers/backends/scenes may be a dict.
    2. Some fallback paths of the admin pages may hand over a list instead.
    3. Legacy configs carry no ``enabled`` field, so such an entry counts
       simply by existing.

    Args:
        items: A dict (its values are inspected) or a list of entries. Any
            other type yields 0.

    Returns:
        The number of dict entries whose ``enabled`` key is missing or truthy.
    """
    if isinstance(items, dict):
        candidates = items.values()
    elif isinstance(items, list):
        candidates = items
    else:
        return 0
    # A missing "enabled" key defaults to True, which covers legacy entries.
    return sum(
        1
        for entry in candidates
        if isinstance(entry, dict) and bool(entry.get("enabled", True))
    )
def _extract_llm_catalog_summary() -> dict:
    """Summarize the LLM routing configuration for the dashboard.

    No live call probing happens here; the summary only answers two questions:
    1. Does the runtime have any usable scenes and targets?
    2. Which routing setup do the call records seen by the admin roughly
       map onto?

    Returns:
        A dict with ``provider_count``, ``scene_count``, ``target_count``,
        ``default_scene``, ``default_backend`` and ``has_routing``. If the
        lookup raises, a warning is logged and an all-empty summary is
        returned instead.
    """
    try:
        catalog = LLMRegistry.get_catalog() or {}
        if not catalog:
            # Without a catalog model, fall back to the legacy view so the
            # dashboard at least knows whether basic routing config exists.
            legacy_llm = LLMRegistry.get_llm_config() or {}
            legacy_scenes = legacy_llm.get("scenes", {}) or {}
            legacy_backends = legacy_llm.get("backends", {}) or {}
            legacy_default_backend = str(legacy_llm.get("default_backend") or "").strip()
            return {
                "provider_count": 0,
                "scene_count": len(legacy_scenes) if isinstance(legacy_scenes, dict) else 0,
                "target_count": len(legacy_backends) if isinstance(legacy_backends, dict) else 0,
                "default_scene": "",
                "default_backend": legacy_default_backend,
                "has_routing": bool(legacy_scenes) or bool(legacy_default_backend),
            }

        default_scene = str(catalog.get("default_scene") or "").strip()
        default_backend = ""
        if default_scene:
            default_backend = str(LLMRegistry.get_scene_backend_name(default_scene) or "").strip()

        scene_count = _count_enabled_runtime_items(catalog.get("scenes", {}) or {})
        backend_count = _count_enabled_runtime_items(catalog.get("backends", {}) or {})
        dify_app_count = _count_enabled_runtime_items(catalog.get("dify_apps", {}) or {})
        return {
            "provider_count": _count_enabled_runtime_items(catalog.get("providers", {}) or {}),
            "scene_count": scene_count,
            # Targets are enabled backends plus enabled Dify apps.
            "target_count": backend_count + dify_app_count,
            "default_scene": default_scene,
            "default_backend": default_backend,
            "has_routing": scene_count > 0,
        }
    except Exception as llm_catalog_error:
        logger.warning(f"提取 LLM 路由摘要失败: {llm_catalog_error}")
        return {
            "provider_count": 0,
            "scene_count": 0,
            "target_count": 0,
            "default_scene": "",
            "default_backend": "",
            "has_routing": False,
        }
def _extract_ai_runtime_snapshot() -> dict:
    """Build the LLM runtime summary shown on the dashboard.

    Design principles:
    1. Only passive observations from the recent-call window are shown; no
       probe request is ever sent.
    2. Recent calls are merged with the static routing summary so an admin
       sees which route was taken, not just "success/failure".
    3. When there are no recent calls, "not configured" is kept distinct
       from "configured but currently idle".

    Returns:
        The raw runtime snapshot extended with the routing counts, the
        ``last_*`` fields flattened out of ``last_call``, and a ``status``
        (``"healthy"``, ``"warning"`` or ``"danger"``) with a human-readable
        ``summary``.
    """
    runtime_snapshot = UnifiedLLMClient.get_runtime_snapshot() or {}
    last_call = dict(runtime_snapshot.get("last_call") or {})
    catalog_summary = _extract_llm_catalog_summary()

    total_calls = _safe_int(runtime_snapshot.get("total_calls"))
    failed_calls = _safe_int(runtime_snapshot.get("failed_calls"))
    success_rate = _safe_float(runtime_snapshot.get("success_rate"))
    avg_latency_ms = _safe_float(runtime_snapshot.get("avg_latency_ms"))
    last_error = str(runtime_snapshot.get("last_error") or "").strip()

    snapshot = {
        **runtime_snapshot,
        "last_call": last_call,
        "provider_count": catalog_summary.get("provider_count", 0),
        "scene_count": catalog_summary.get("scene_count", 0),
        "target_count": catalog_summary.get("target_count", 0),
        "default_scene": catalog_summary.get("default_scene", ""),
        "default_backend": catalog_summary.get("default_backend", ""),
        "has_routing": bool(catalog_summary.get("has_routing")),
        "last_provider": str(last_call.get("provider") or "").strip(),
        "last_backend": str(last_call.get("backend") or "").strip(),
        "last_scene": str(last_call.get("scene") or "").strip(),
        "last_model": str(last_call.get("model") or "").strip(),
        "last_timestamp": str(last_call.get("timestamp") or "").strip(),
        "last_latency_ms": _safe_float(last_call.get("latency_ms")),
    }

    if not snapshot["has_routing"]:
        snapshot["status"] = "warning"
        snapshot["summary"] = "当前未发现完整的 LLM 路由配置,建议先检查默认场景与后端绑定"
        return snapshot

    if total_calls <= 0:
        snapshot["status"] = "warning"
        snapshot["summary"] = (
            f"已配置 {snapshot['scene_count']} 个场景、{snapshot['target_count']} 个目标,"
            "最近窗口内暂无统一 LLM 调用记录"
        )
        return snapshot

    # Shared tail of every call-based summary. The two metrics are joined
    # with an explicit separator; the previous adjacent f-string literals
    # were concatenated with nothing between "%" and "平均耗时".
    metrics_text = f"成功率 {success_rate:.2f}%,平均耗时 {avg_latency_ms:.2f}ms"

    # total_calls is known to be positive from here on.
    if failed_calls >= total_calls:
        snapshot["status"] = "danger"
        snapshot["summary"] = f"最近 {total_calls} 次调用全部失败,{metrics_text}"
        return snapshot

    # A recorded last_error downgrades the card even when no call in the
    # window is counted as failed.
    if failed_calls > 0 or last_error:
        snapshot["status"] = "warning"
        snapshot["summary"] = f"最近 {total_calls} 次调用中失败 {failed_calls} 次,{metrics_text}"
        return snapshot

    snapshot["status"] = "healthy"
    snapshot["summary"] = f"最近 {total_calls} 次调用全部成功,{metrics_text}"
    return snapshot
def _extract_scheduler_runtime_snapshot() -> dict:
    """Aggregate the async_job runtime state into the dashboard scheduler summary.

    This is not a replacement for the full task page; it answers the
    questions an admin asks most often:
    1. Were the jobs loaded properly?
    2. Are there failed jobs or invalid schedules?
    3. Roughly when does the next job run?
    4. Is the load mostly system jobs or plugin jobs?

    Returns:
        A dict with ``status`` (``"healthy"``, ``"warning"`` or ``"danger"``),
        a human-readable ``summary``, the job counters, ``next_run_at`` (the
        earliest next run among enabled jobs, or ``""``), and the name and
        error (truncated to 120 characters) of the most recent failed job.
    """
    # Guard against an empty/None snapshot so the dashboard degrades to
    # "no jobs loaded" instead of raising while iterating or calling len().
    runtime_rows = list(async_job.get_jobs_snapshot() or [])

    def _failure_sort_key(row):
        # Most recently touched failure wins; rows without any usable
        # timestamp sort last via datetime.min.
        return (
            _parse_snapshot_datetime(row.get("updated_at"))
            or _parse_snapshot_datetime(row.get("last_run_at"))
            or datetime.min
        )

    next_run_candidates = []
    failed_rows = []
    system_job_count = 0
    plugin_job_count = 0
    enabled_jobs = 0
    running_jobs = 0
    invalid_jobs = 0
    never_run_jobs = 0

    # Single pass over the rows; every counter below is independent.
    for row in runtime_rows:
        job_key = str(row.get("job_key") or "").strip()
        owner_name = str(row.get("owner_name") or "system").strip().lower()
        last_status = str(row.get("last_status") or "").strip().lower()

        # Anything not owned by "system", or keyed as a plugin schedule,
        # is counted as a plugin job.
        if job_key.startswith("plugin_schedule:") or owner_name != "system":
            plugin_job_count += 1
        else:
            system_job_count += 1

        if bool(row.get("enabled")):
            enabled_jobs += 1
            next_run_at = _parse_snapshot_datetime(row.get("next_run_at"))
            if next_run_at:
                next_run_candidates.append(next_run_at)

        if bool(row.get("running")):
            running_jobs += 1

        if last_status == "invalid_schedule":
            invalid_jobs += 1
        elif last_status == "never":
            never_run_jobs += 1

        # Invalid schedules are also counted as failures.
        if last_status in {"failed", "invalid_schedule"}:
            failed_rows.append(row)

    # max() keeps the first row among equal keys, matching a stable
    # descending sort followed by taking the first element.
    latest_failed_row = max(failed_rows, key=_failure_sort_key, default={})

    total_jobs = len(runtime_rows)
    failed_jobs = len(failed_rows)
    paused_jobs = total_jobs - enabled_jobs
    next_run_at_text = min(next_run_candidates).strftime("%Y-%m-%d %H:%M:%S") if next_run_candidates else ""
    next_run_label = next_run_at_text or '暂未计算'

    latest_failed_error = str(latest_failed_row.get("last_error") or "").strip()
    if len(latest_failed_error) > 120:
        # Keep the card compact: 117 characters plus an ellipsis.
        latest_failed_error = f"{latest_failed_error[:117]}..."

    snapshot = {
        "status": "healthy",
        "summary": "任务调度运行正常",
        "total_jobs": total_jobs,
        "enabled_jobs": enabled_jobs,
        "running_jobs": running_jobs,
        "failed_jobs": failed_jobs,
        "invalid_jobs": invalid_jobs,
        "paused_jobs": paused_jobs,
        "never_run_jobs": never_run_jobs,
        "system_job_count": system_job_count,
        "plugin_job_count": plugin_job_count,
        "next_run_at": next_run_at_text,
        "latest_failed_job_name": str(latest_failed_row.get("name") or "").strip(),
        "latest_failed_error": latest_failed_error,
    }

    # Checks run from most to least severe; the first match decides the card.
    if total_jobs <= 0:
        snapshot["status"] = "warning"
        snapshot["summary"] = "当前没有加载任何定时任务"
        return snapshot

    if invalid_jobs > 0:
        snapshot["status"] = "danger"
        snapshot["summary"] = f"发现 {invalid_jobs} 个任务调度配置非法,建议立即检查任务页"
        return snapshot

    if failed_jobs > 0:
        snapshot["status"] = "warning"
        snapshot["summary"] = f"最近有 {failed_jobs} 个任务执行失败,下一次执行 {next_run_label}"
        return snapshot

    if enabled_jobs <= 0:
        snapshot["status"] = "warning"
        snapshot["summary"] = "任务已加载,但当前没有启用中的调度任务"
        return snapshot

    if running_jobs > 0:
        snapshot["summary"] = f"当前有 {running_jobs} 个任务执行中,下一次执行 {next_run_label}"
        return snapshot

    snapshot["summary"] = f"已启用 {enabled_jobs} 个任务,下一次执行 {next_run_label}"
    return snapshot
def _legacy_llm_to_catalog(legacy_llm: dict) -> dict:
"""把旧 llm(backends/scenes) 结构转换为新目录结构(仅用于兜底展示)。
@@ -872,6 +608,20 @@ def api_system_health_summary():
mysql_snapshot = _extract_mysql_runtime_snapshot(server.db_manager)
redis_snapshot = _extract_redis_runtime_snapshot(server.db_manager)
# md2img 健康快照已经有现成实现,这里只做聚合,不主动预热运行时。
md2img_snapshot = get_md2img_health_snapshot(ensure_runtime=False) or {}
browser_ready = bool(
md2img_snapshot.get("browser_ready")
or md2img_snapshot.get("playwright_ready")
or md2img_snapshot.get("ready")
)
runtime_ready = bool(
md2img_snapshot.get("runtime_ready")
or md2img_snapshot.get("runtime_initialized")
or md2img_snapshot.get("initialized")
)
md2img_healthy = runtime_ready and browser_ready
# 首页只需要“够判断”的轻量结论,因此统一产出 status + summary 文本,前端无需重复拼装业务规则。
robot_running = bool(getattr(robot, "ipad_running", False))
robot_nickname = str(getattr(robot, "nickname", "") or "").strip()
@@ -897,11 +647,37 @@ def api_system_health_summary():
error_status = "healthy"
error_summary = "近 24 小时未记录到异常"
# 首页 AI 卡片升级为“运行态 + 路由摘要”,仍然保持被动观测,不主动探活。
ai_runtime = _extract_ai_runtime_snapshot()
if md2img_healthy:
md2img_status = "healthy"
md2img_summary = "运行时与浏览器均已就绪"
elif runtime_ready or browser_ready:
md2img_status = "warning"
md2img_summary = "运行时部分可用,建议检查预热状态"
else:
md2img_status = "danger"
md2img_summary = "运行时未就绪,相关转图能力可能不可用"
# Markdown 转图更适合保留在专门页面里排障,首页右侧改成更通用的任务调度摘要。
scheduler_runtime = _extract_scheduler_runtime_snapshot()
# AI 运行态:
# 1. 统一从 UnifiedLLMClient 最近调用窗口读取,避免各插件单独维护监控数据;
# 2. 若当前窗口还没有调用记录,就明确返回“暂无调用”,避免误判成异常。
ai_runtime = UnifiedLLMClient.get_runtime_snapshot()
ai_total_calls = int(ai_runtime.get("total_calls") or 0)
ai_failed_calls = int(ai_runtime.get("failed_calls") or 0)
if ai_total_calls <= 0:
ai_status = "warning"
ai_summary = "最近窗口内暂无统一 LLM 调用记录"
elif ai_failed_calls > 0:
ai_status = "warning"
ai_summary = (
f"最近 {ai_total_calls} 次调用中失败 {ai_failed_calls} 次,"
f"平均耗时 {ai_runtime.get('avg_latency_ms', 0)}ms"
)
else:
ai_status = "healthy"
ai_summary = (
f"最近 {ai_total_calls} 次调用全部成功,"
f"平均耗时 {ai_runtime.get('avg_latency_ms', 0)}ms"
)
return jsonify({
"success": True,
@@ -943,10 +719,17 @@ def api_system_health_summary():
"redis": redis_snapshot,
},
"ai_runtime": {
"status": ai_status,
"summary": ai_summary,
**ai_runtime,
},
"scheduler": {
**scheduler_runtime,
"md2img": {
"status": md2img_status,
"healthy": md2img_healthy,
"runtime_ready": runtime_ready,
"browser_ready": browser_ready,
"summary": md2img_summary,
"detail": md2img_snapshot,
},
}
})

View File

@@ -131,7 +131,7 @@
<div class="section-heading section-heading--stack">
<div>
<h3>系统健康快照</h3>
<p>把连接状态、插件运行、异常数量、LLM 运行态与任务调度集中到一个面板里。</p>
<p>把连接状态、插件运行、异常数量与转图运行时集中到一个面板里。</p>
</div>
<div class="health-overview-meta">
<span class="health-overview-meta__label">最近刷新</span>
@@ -394,38 +394,15 @@
status: 'warning',
total_calls: 0,
failed_calls: 0,
success_rate: 0,
avg_latency_ms: 0,
summary: '加载中...',
last_call: {},
scene_count: 0,
target_count: 0,
provider_count: 0,
has_routing: false,
default_scene: '',
default_backend: '',
last_provider: '',
last_backend: '',
last_scene: '',
last_model: '',
last_timestamp: '',
last_latency_ms: 0,
last_error: ''
last_call: {}
},
scheduler: {
md2img: {
status: 'warning',
total_jobs: 0,
enabled_jobs: 0,
running_jobs: 0,
failed_jobs: 0,
invalid_jobs: 0,
paused_jobs: 0,
never_run_jobs: 0,
system_job_count: 0,
plugin_job_count: 0,
next_run_at: '',
latest_failed_job_name: '',
latest_failed_error: '',
healthy: false,
runtime_ready: false,
browser_ready: false,
summary: '加载中...'
}
},
@@ -469,7 +446,7 @@
const errors = this.healthSummary.errors || {};
const infrastructure = this.healthSummary.infrastructure || {};
const aiRuntime = this.healthSummary.ai_runtime || {};
const scheduler = this.healthSummary.scheduler || {};
const md2img = this.healthSummary.md2img || {};
return [
{
key: 'robot',
@@ -506,23 +483,19 @@
},
{
key: 'ai_runtime',
title: 'LLM 运行态',
title: 'AI 运行态',
status: aiRuntime.status || 'warning',
value: (aiRuntime.total_calls || 0) > 0
? `${this.formatMetricNumber(aiRuntime.success_rate, 2)}%`
: `${aiRuntime.scene_count || 0} 个场景`,
value: `${aiRuntime.avg_latency_ms || 0} ms`,
summary: aiRuntime.summary || '暂无状态',
serviceBlocks: this.buildAiRuntimeServiceBlocks(aiRuntime),
extra: this.buildAiRuntimeExtra(aiRuntime)
extra: `最近调用 ${aiRuntime.total_calls || 0} 次,失败 ${aiRuntime.failed_calls || 0}`
},
{
key: 'scheduler',
title: '任务调度',
status: scheduler.status || 'warning',
value: `${scheduler.enabled_jobs || 0} / ${scheduler.total_jobs || 0}`,
summary: scheduler.summary || '暂无状态',
serviceBlocks: this.buildSchedulerServiceBlocks(scheduler),
extra: this.buildSchedulerExtra(scheduler)
key: 'md2img',
title: 'Markdown 转图',
status: md2img.status || 'warning',
value: md2img.healthy ? '就绪' : '待检查',
summary: md2img.summary || '暂无状态',
extra: `Runtime ${md2img.runtime_ready ? '已就绪' : '未就绪'} / Browser ${md2img.browser_ready ? '已就绪' : '未就绪'}`
}
];
}
@@ -684,141 +657,6 @@
}
];
},
/**
 * Build the two sub-panel descriptors rendered inside the AI runtime card.
 *
 * @param {Object} aiRuntime - The `ai_runtime` section of the health summary.
 * @returns {Array<Object>} Two blocks: `ai-routing` (routing configuration)
 *   and `ai-last-call` (most recent call), each with key, title, status,
 *   summary and a metrics list of label/value pairs.
 */
buildAiRuntimeServiceBlocks(aiRuntime) {
// The AI card is split into a "routing config" and a "latest call" sub-panel,
// so the dashboard can both judge whether the config is complete and quickly show which route the latest request took.
return [
{
key: 'ai-routing',
title: '路由配置',
status: aiRuntime.has_routing ? 'healthy' : 'warning',
summary: aiRuntime.default_scene
? `默认场景:${aiRuntime.default_scene}`
: '当前未设置默认场景',
metrics: [
{
label: '场景数量',
value: this.formatMetricNumber(aiRuntime.scene_count)
},
{
label: '目标数量',
value: this.formatMetricNumber(aiRuntime.target_count)
},
{
label: 'Provider 模板',
value: this.formatMetricNumber(aiRuntime.provider_count)
},
{
label: '默认后端',
value: aiRuntime.default_backend || '-'
}
]
},
{
key: 'ai-last-call',
title: '最近调用',
// Any failed call yields 'warning'; otherwise 'healthy' only when at least one call was recorded.
status: (aiRuntime.failed_calls || 0) > 0 ? 'warning' : ((aiRuntime.total_calls || 0) > 0 ? 'healthy' : 'warning'),
summary: aiRuntime.last_timestamp
? `最近一次记录时间:${aiRuntime.last_timestamp}`
: '当前窗口内暂无调用记录',
metrics: [
{
label: 'Provider',
value: aiRuntime.last_provider || '-'
},
{
label: 'Backend',
value: aiRuntime.last_backend || '-'
},
{
label: 'Scene',
value: aiRuntime.last_scene || '-'
},
{
label: '模型',
value: aiRuntime.last_model || '-'
},
{
label: '最近耗时',
value: `${this.formatMetricNumber(aiRuntime.last_latency_ms, 2)} ms`
},
{
label: '最近错误',
value: aiRuntime.last_error || '无'
}
]
}
];
},
/**
 * Build the one-line footer text of the AI runtime card.
 *
 * @param {Object} aiRuntime - The `ai_runtime` section of the health summary.
 * @returns {string} Text with the total call count, failed call count
 *   (both defaulting to 0) and the average latency formatted via
 *   `formatMetricNumber` with a second argument of 2.
 */
buildAiRuntimeExtra(aiRuntime) {
return `最近调用 ${aiRuntime.total_calls || 0} 次,失败 ${aiRuntime.failed_calls || 0} 次,平均耗时 ${this.formatMetricNumber(aiRuntime.avg_latency_ms, 2)} ms`;
},
/**
 * Build the two sub-panel descriptors rendered inside the scheduler card.
 *
 * @param {Object} scheduler - The `scheduler` section of the health summary.
 * @returns {Array<Object>} Two blocks: `scheduler-overview` (loaded jobs)
 *   and `scheduler-runtime` (execution state), each with key, title, status,
 *   summary and a metrics list of label/value pairs.
 */
buildSchedulerServiceBlocks(scheduler) {
// The scheduler card keeps only the summary the dashboard needs most:
// number of loaded jobs, execution state, failure count, and the rough split between system jobs and plugin jobs.
return [
{
key: 'scheduler-overview',
title: '任务装载',
// 'healthy' as soon as at least one job is enabled.
status: scheduler.enabled_jobs > 0 ? 'healthy' : 'warning',
summary: scheduler.next_run_at
? `下一次执行:${scheduler.next_run_at}`
: '当前没有可计算的下一次执行时间',
metrics: [
{
label: '启用任务',
value: this.formatMetricNumber(scheduler.enabled_jobs)
},
{
label: '暂停任务',
value: this.formatMetricNumber(scheduler.paused_jobs)
},
{
label: '系统任务',
value: this.formatMetricNumber(scheduler.system_job_count)
},
{
label: '插件任务',
value: this.formatMetricNumber(scheduler.plugin_job_count)
}
]
},
{
key: 'scheduler-runtime',
title: '执行状态',
// Mirrors the overall scheduler status supplied in the summary payload.
status: scheduler.status || 'warning',
summary: scheduler.latest_failed_job_name
? `最近失败任务:${scheduler.latest_failed_job_name}`
: '当前未发现最近失败任务',
metrics: [
{
label: '执行中',
value: this.formatMetricNumber(scheduler.running_jobs)
},
{
label: '失败任务',
value: this.formatMetricNumber(scheduler.failed_jobs)
},
{
label: '非法调度',
value: this.formatMetricNumber(scheduler.invalid_jobs)
},
{
label: '未执行过',
value: this.formatMetricNumber(scheduler.never_run_jobs)
}
]
}
];
},
/**
 * Build the one-line footer text of the scheduler card.
 *
 * @param {Object} scheduler - The `scheduler` section of the health summary.
 * @returns {string} The latest failure reason when one is present;
 *   otherwise the next run time, or a placeholder when none is available.
 */
buildSchedulerExtra(scheduler) {
// A recorded failure reason takes precedence over the next-run hint.
if (scheduler.latest_failed_error) {
return `最近失败原因:${scheduler.latest_failed_error}`;
}
return scheduler.next_run_at
? `下次执行时间:${scheduler.next_run_at}`
: '当前暂无可用的下一次执行时间';
},
renderPieChart(chartId, usageValue, label) {
const ctx = document.getElementById(chartId);
if (!ctx) return;