Revert "增强首页LLM运行态与任务调度监控卡片"

This reverts commit 5487142fe1.
2026-05-01 12:45:39 +08:00
parent d1c2aa06f0
commit 9b9059a6d9
2 changed files with 69 additions and 448 deletions
--- a/admin/dashboard/blueprints/system.py
+++ b/admin/dashboard/blueprints/system.py
@@ -16,7 +16,6 @@ from utils.markdown_to_image import get_md2img_health_snapshot, warmup_md2img_br
 from utils.ai.llm_registry import LLMRegistry
 from base.plugin_common.plugin_interface import PluginStatus
 from utils.ai.unified_llm import UnifiedLLMClient
-from utils.decorator.async_job import async_job

 # 创建系统信息蓝图
 system_bp = Blueprint('system', __name__)
@@ -240,269 +239,6 @@ def _extract_redis_runtime_snapshot(db_manager) -> dict:
        return snapshot


-def _parse_snapshot_datetime(value: str | None) -> datetime | None:
-    """把首页摘要里常用的时间字符串安全转换为 datetime。"""
-    text = str(value or "").strip()
-    if not text:
-        return None
-    try:
-        return datetime.strptime(text, "%Y-%m-%d %H:%M:%S")
-    except ValueError:
-        return None
-
-
-def _count_enabled_runtime_items(items) -> int:
-    """统计启用项数量。
-
-    兼容原因：
-    1. 新版目录模型里 providers/backends/scenes 可能是 dict；
-    2. 后台页面某些兜底逻辑里也可能给出 list；
-    3. 旧配置没有 enabled 字段时，直接按存在即计数。
-    """
-    rows = []
-    if isinstance(items, dict):
-        rows = list(items.values())
-    elif isinstance(items, list):
-        rows = list(items)
-    count = 0
-    for row in rows:
-        if not isinstance(row, dict):
-            continue
-        if "enabled" not in row or bool(row.get("enabled", True)):
-            count += 1
-    return count
-
-
-def _extract_llm_catalog_summary() -> dict:
-    """提取首页 LLM 路由配置摘要。
-
-    这里不做真实调用探测，只回答两个问题：
-    1. 运行时有没有可用的场景与目标；
-    2. 管理员当前看到的调用记录，大致落到了哪一套路由上。
-    """
-    try:
-        catalog = LLMRegistry.get_catalog() or {}
-        if catalog:
-            providers = catalog.get("providers", {}) or {}
-            dify_apps = catalog.get("dify_apps", {}) or {}
-            backends = catalog.get("backends", {}) or {}
-            scenes = catalog.get("scenes", {}) or {}
-            default_scene = str(catalog.get("default_scene") or "").strip()
-            default_backend = str(LLMRegistry.get_scene_backend_name(default_scene) or "").strip() if default_scene else ""
-            return {
-                "provider_count": _count_enabled_runtime_items(providers),
-                "scene_count": _count_enabled_runtime_items(scenes),
-                "target_count": _count_enabled_runtime_items(backends) + _count_enabled_runtime_items(dify_apps),
-                "default_scene": default_scene,
-                "default_backend": default_backend,
-                "has_routing": _count_enabled_runtime_items(scenes) > 0,
-            }
-
-        # 目录模型不存在时回退到 legacy 视图，至少让首页知道“有没有基础路由配置”。
-        legacy_llm = LLMRegistry.get_llm_config() or {}
-        scenes = legacy_llm.get("scenes", {}) or {}
-        backends = legacy_llm.get("backends", {}) or {}
-        default_backend = str(legacy_llm.get("default_backend") or "").strip()
-        return {
-            "provider_count": 0,
-            "scene_count": len(scenes) if isinstance(scenes, dict) else 0,
-            "target_count": len(backends) if isinstance(backends, dict) else 0,
-            "default_scene": "",
-            "default_backend": default_backend,
-            "has_routing": bool(scenes) or bool(default_backend),
-        }
-    except Exception as llm_catalog_error:
-        logger.warning(f"提取 LLM 路由摘要失败: {llm_catalog_error}")
-        return {
-            "provider_count": 0,
-            "scene_count": 0,
-            "target_count": 0,
-            "default_scene": "",
-            "default_backend": "",
-            "has_routing": False,
-        }
-
-
-def _extract_ai_runtime_snapshot() -> dict:
-    """构建首页 LLM 运行态摘要。
-
-    设计原则：
-    1. 首页只展示“最近调用窗口”的被动观测结果，不主动发请求探活；
-    2. 把最近调用和静态路由配置拼在一起，避免管理员只看到“成功/失败”却不知道走的是哪条链路；
-    3. 如果近期没有调用，也明确区分“未配置”和“已配置但当前空闲”。
-    """
-    runtime_snapshot = UnifiedLLMClient.get_runtime_snapshot() or {}
-    last_call = dict(runtime_snapshot.get("last_call") or {})
-    catalog_summary = _extract_llm_catalog_summary()
-
-    total_calls = _safe_int(runtime_snapshot.get("total_calls"))
-    failed_calls = _safe_int(runtime_snapshot.get("failed_calls"))
-    success_rate = _safe_float(runtime_snapshot.get("success_rate"))
-    avg_latency_ms = _safe_float(runtime_snapshot.get("avg_latency_ms"))
-    last_error = str(runtime_snapshot.get("last_error") or "").strip()
-
-    snapshot = {
-        **runtime_snapshot,
-        "last_call": last_call,
-        "provider_count": catalog_summary.get("provider_count", 0),
-        "scene_count": catalog_summary.get("scene_count", 0),
-        "target_count": catalog_summary.get("target_count", 0),
-        "default_scene": catalog_summary.get("default_scene", ""),
-        "default_backend": catalog_summary.get("default_backend", ""),
-        "has_routing": bool(catalog_summary.get("has_routing")),
-        "last_provider": str(last_call.get("provider") or "").strip(),
-        "last_backend": str(last_call.get("backend") or "").strip(),
-        "last_scene": str(last_call.get("scene") or "").strip(),
-        "last_model": str(last_call.get("model") or "").strip(),
-        "last_timestamp": str(last_call.get("timestamp") or "").strip(),
-        "last_latency_ms": _safe_float(last_call.get("latency_ms")),
-    }
-
-    if not snapshot["has_routing"]:
-        snapshot["status"] = "warning"
-        snapshot["summary"] = "当前未发现完整的 LLM 路由配置，建议先检查默认场景与后端绑定"
-        return snapshot
-
-    if total_calls <= 0:
-        snapshot["status"] = "warning"
-        snapshot["summary"] = (
-            f"已配置 {snapshot['scene_count']} 个场景、{snapshot['target_count']} 个目标，"
-            "最近窗口内暂无统一 LLM 调用记录"
-        )
-        return snapshot
-
-    if failed_calls >= total_calls and total_calls > 0:
-        snapshot["status"] = "danger"
-        snapshot["summary"] = (
-            f"最近 {total_calls} 次调用全部失败，成功率 {success_rate:.2f}%，"
-            f"平均耗时 {avg_latency_ms:.2f}ms"
-        )
-        return snapshot
-
-    if failed_calls > 0 or last_error:
-        snapshot["status"] = "warning"
-        snapshot["summary"] = (
-            f"最近 {total_calls} 次调用中失败 {failed_calls} 次，成功率 {success_rate:.2f}%，"
-            f"平均耗时 {avg_latency_ms:.2f}ms"
-        )
-        return snapshot
-
-    snapshot["status"] = "healthy"
-    snapshot["summary"] = (
-        f"最近 {total_calls} 次调用全部成功，成功率 {success_rate:.2f}%，"
-        f"平均耗时 {avg_latency_ms:.2f}ms"
-    )
-    return snapshot
-
-
-def _extract_scheduler_runtime_snapshot() -> dict:
-    """聚合 async_job 运行态，生成首页任务调度摘要。
-
-    这里的目标不是替代完整任务页，而是回答管理员最常问的几件事：
-    1. 任务有没有正常装载；
-    2. 是否存在失败或非法调度；
-    3. 下一次任务大概何时执行；
-    4. 当前更多是系统任务，还是插件任务在跑。
-    """
-    runtime_rows = async_job.get_jobs_snapshot()
-    next_run_candidates = []
-    failed_rows = []
-    system_job_count = 0
-    plugin_job_count = 0
-
-    for row in runtime_rows:
-        job_key = str(row.get("job_key") or "").strip()
-        owner_name = str(row.get("owner_name") or "system").strip().lower()
-        next_run_at = _parse_snapshot_datetime(row.get("next_run_at"))
-        last_status = str(row.get("last_status") or "").strip().lower()
-
-        if job_key.startswith("plugin_schedule:") or owner_name != "system":
-            plugin_job_count += 1
-        else:
-            system_job_count += 1
-
-        if bool(row.get("enabled")) and next_run_at:
-            next_run_candidates.append(next_run_at)
-        if last_status in {"failed", "invalid_schedule"}:
-            failed_rows.append(row)
-
-    latest_failed_row = {}
-    if failed_rows:
-        failed_rows.sort(
-            key=lambda row: (
-                _parse_snapshot_datetime(row.get("updated_at"))
-                or _parse_snapshot_datetime(row.get("last_run_at"))
-                or datetime.min
-            ),
-            reverse=True,
-        )
-        latest_failed_row = failed_rows[0]
-
-    invalid_jobs = sum(
-        1 for row in runtime_rows if str(row.get("last_status") or "").strip().lower() == "invalid_schedule"
-    )
-    total_jobs = len(runtime_rows)
-    enabled_jobs = sum(1 for row in runtime_rows if bool(row.get("enabled")))
-    running_jobs = sum(1 for row in runtime_rows if bool(row.get("running")))
-    failed_jobs = len(failed_rows)
-    paused_jobs = total_jobs - enabled_jobs
-    never_run_jobs = sum(1 for row in runtime_rows if str(row.get("last_status") or "").strip().lower() == "never")
-    next_run_at_text = min(next_run_candidates).strftime("%Y-%m-%d %H:%M:%S") if next_run_candidates else ""
-    latest_failed_error = str(latest_failed_row.get("last_error") or "").strip()
-    if len(latest_failed_error) > 120:
-        latest_failed_error = f"{latest_failed_error[:117]}..."
-
-    snapshot = {
-        "status": "healthy",
-        "summary": "任务调度运行正常",
-        "total_jobs": total_jobs,
-        "enabled_jobs": enabled_jobs,
-        "running_jobs": running_jobs,
-        "failed_jobs": failed_jobs,
-        "invalid_jobs": invalid_jobs,
-        "paused_jobs": paused_jobs,
-        "never_run_jobs": never_run_jobs,
-        "system_job_count": system_job_count,
-        "plugin_job_count": plugin_job_count,
-        "next_run_at": next_run_at_text,
-        "latest_failed_job_name": str(latest_failed_row.get("name") or "").strip(),
-        "latest_failed_error": latest_failed_error,
-    }
-
-    if total_jobs <= 0:
-        snapshot["status"] = "warning"
-        snapshot["summary"] = "当前没有加载任何定时任务"
-        return snapshot
-
-    if invalid_jobs > 0:
-        snapshot["status"] = "danger"
-        snapshot["summary"] = f"发现 {invalid_jobs} 个任务调度配置非法，建议立即检查任务页"
-        return snapshot
-
-    if failed_jobs > 0:
-        snapshot["status"] = "warning"
-        snapshot["summary"] = (
-            f"最近有 {failed_jobs} 个任务执行失败，"
-            f"下一次执行 {next_run_at_text or '暂未计算'}"
-        )
-        return snapshot
-
-    if enabled_jobs <= 0:
-        snapshot["status"] = "warning"
-        snapshot["summary"] = "任务已加载，但当前没有启用中的调度任务"
-        return snapshot
-
-    if running_jobs > 0:
-        snapshot["summary"] = (
-            f"当前有 {running_jobs} 个任务执行中，"
-            f"下一次执行 {next_run_at_text or '暂未计算'}"
-        )
-        return snapshot
-
-    snapshot["summary"] = f"已启用 {enabled_jobs} 个任务，下一次执行 {next_run_at_text or '暂未计算'}"
-    return snapshot
-
-
 def _legacy_llm_to_catalog(legacy_llm: dict) -> dict:
    """把旧 llm(backends/scenes) 结构转换为新目录结构（仅用于兜底展示）。

@@ -872,6 +608,20 @@ def api_system_health_summary():
        mysql_snapshot = _extract_mysql_runtime_snapshot(server.db_manager)
        redis_snapshot = _extract_redis_runtime_snapshot(server.db_manager)

+        # md2img 健康快照已经有现成实现，这里只做聚合，不主动预热运行时。
+        md2img_snapshot = get_md2img_health_snapshot(ensure_runtime=False) or {}
+        browser_ready = bool(
+            md2img_snapshot.get("browser_ready")
+            or md2img_snapshot.get("playwright_ready")
+            or md2img_snapshot.get("ready")
+        )
+        runtime_ready = bool(
+            md2img_snapshot.get("runtime_ready")
+            or md2img_snapshot.get("runtime_initialized")
+            or md2img_snapshot.get("initialized")
+        )
+        md2img_healthy = runtime_ready and browser_ready
+
        # 首页只需要“够判断”的轻量结论，因此统一产出 status + summary 文本，前端无需重复拼装业务规则。
        robot_running = bool(getattr(robot, "ipad_running", False))
        robot_nickname = str(getattr(robot, "nickname", "") or "").strip()
@@ -897,11 +647,37 @@ def api_system_health_summary():
            error_status = "healthy"
            error_summary = "近 24 小时未记录到异常"

-        # 首页 AI 卡片升级为“运行态 + 路由摘要”，仍然保持被动观测，不主动探活。
-        ai_runtime = _extract_ai_runtime_snapshot()
+        if md2img_healthy:
+            md2img_status = "healthy"
+            md2img_summary = "运行时与浏览器均已就绪"
+        elif runtime_ready or browser_ready:
+            md2img_status = "warning"
+            md2img_summary = "运行时部分可用，建议检查预热状态"
+        else:
+            md2img_status = "danger"
+            md2img_summary = "运行时未就绪，相关转图能力可能不可用"

-        # Markdown 转图更适合保留在专门页面里排障，首页右侧改成更通用的任务调度摘要。
-        scheduler_runtime = _extract_scheduler_runtime_snapshot()
+        # AI 运行态：
+        # 1. 统一从 UnifiedLLMClient 最近调用窗口读取，避免各插件单独维护监控数据；
+        # 2. 若当前窗口还没有调用记录，就明确返回“暂无调用”，避免误判成异常。
+        ai_runtime = UnifiedLLMClient.get_runtime_snapshot()
+        ai_total_calls = int(ai_runtime.get("total_calls") or 0)
+        ai_failed_calls = int(ai_runtime.get("failed_calls") or 0)
+        if ai_total_calls <= 0:
+            ai_status = "warning"
+            ai_summary = "最近窗口内暂无统一 LLM 调用记录"
+        elif ai_failed_calls > 0:
+            ai_status = "warning"
+            ai_summary = (
+                f"最近 {ai_total_calls} 次调用中失败 {ai_failed_calls} 次，"
+                f"平均耗时 {ai_runtime.get('avg_latency_ms', 0)}ms"
+            )
+        else:
+            ai_status = "healthy"
+            ai_summary = (
+                f"最近 {ai_total_calls} 次调用全部成功，"
+                f"平均耗时 {ai_runtime.get('avg_latency_ms', 0)}ms"
+            )

        return jsonify({
            "success": True,
@@ -943,10 +719,17 @@ def api_system_health_summary():
                    "redis": redis_snapshot,
                },
                "ai_runtime": {
+                    "status": ai_status,
+                    "summary": ai_summary,
                    **ai_runtime,
                },
-                "scheduler": {
-                    **scheduler_runtime,
+                "md2img": {
+                    "status": md2img_status,
+                    "healthy": md2img_healthy,
+                    "runtime_ready": runtime_ready,
+                    "browser_ready": browser_ready,
+                    "summary": md2img_summary,
+                    "detail": md2img_snapshot,
                },
            }
        })