diff --git a/admin/dashboard/blueprints/system.py b/admin/dashboard/blueprints/system.py index b93d685..1a372d8 100644 --- a/admin/dashboard/blueprints/system.py +++ b/admin/dashboard/blueprints/system.py @@ -16,7 +16,6 @@ from utils.markdown_to_image import get_md2img_health_snapshot, warmup_md2img_br from utils.ai.llm_registry import LLMRegistry from base.plugin_common.plugin_interface import PluginStatus from utils.ai.unified_llm import UnifiedLLMClient -from utils.decorator.async_job import async_job # 创建系统信息蓝图 system_bp = Blueprint('system', __name__) @@ -240,269 +239,6 @@ def _extract_redis_runtime_snapshot(db_manager) -> dict: return snapshot -def _parse_snapshot_datetime(value: str | None) -> datetime | None: - """把首页摘要里常用的时间字符串安全转换为 datetime。""" - text = str(value or "").strip() - if not text: - return None - try: - return datetime.strptime(text, "%Y-%m-%d %H:%M:%S") - except ValueError: - return None - - -def _count_enabled_runtime_items(items) -> int: - """统计启用项数量。 - - 兼容原因: - 1. 新版目录模型里 providers/backends/scenes 可能是 dict; - 2. 后台页面某些兜底逻辑里也可能给出 list; - 3. 旧配置没有 enabled 字段时,直接按存在即计数。 - """ - rows = [] - if isinstance(items, dict): - rows = list(items.values()) - elif isinstance(items, list): - rows = list(items) - count = 0 - for row in rows: - if not isinstance(row, dict): - continue - if "enabled" not in row or bool(row.get("enabled", True)): - count += 1 - return count - - -def _extract_llm_catalog_summary() -> dict: - """提取首页 LLM 路由配置摘要。 - - 这里不做真实调用探测,只回答两个问题: - 1. 运行时有没有可用的场景与目标; - 2. 管理员当前看到的调用记录,大致落到了哪一套路由上。 - """ - try: - catalog = LLMRegistry.get_catalog() or {} - if catalog: - providers = catalog.get("providers", {}) or {} - dify_apps = catalog.get("dify_apps", {}) or {} - backends = catalog.get("backends", {}) or {} - scenes = catalog.get("scenes", {}) or {} - default_scene = str(catalog.get("default_scene") or "").strip() - default_backend = str(LLMRegistry.get_scene_backend_name(default_scene) or "").strip() if default_scene else "" - return { - "provider_count": _count_enabled_runtime_items(providers), - "scene_count": _count_enabled_runtime_items(scenes), - "target_count": _count_enabled_runtime_items(backends) + _count_enabled_runtime_items(dify_apps), - "default_scene": default_scene, - "default_backend": default_backend, - "has_routing": _count_enabled_runtime_items(scenes) > 0, - } - - # 目录模型不存在时回退到 legacy 视图,至少让首页知道“有没有基础路由配置”。 - legacy_llm = LLMRegistry.get_llm_config() or {} - scenes = legacy_llm.get("scenes", {}) or {} - backends = legacy_llm.get("backends", {}) or {} - default_backend = str(legacy_llm.get("default_backend") or "").strip() - return { - "provider_count": 0, - "scene_count": len(scenes) if isinstance(scenes, dict) else 0, - "target_count": len(backends) if isinstance(backends, dict) else 0, - "default_scene": "", - "default_backend": default_backend, - "has_routing": bool(scenes) or bool(default_backend), - } - except Exception as llm_catalog_error: - logger.warning(f"提取 LLM 路由摘要失败: {llm_catalog_error}") - return { - "provider_count": 0, - "scene_count": 0, - "target_count": 0, - "default_scene": "", - "default_backend": "", - "has_routing": False, - } - - -def _extract_ai_runtime_snapshot() -> dict: - """构建首页 LLM 运行态摘要。 - - 设计原则: - 1. 首页只展示“最近调用窗口”的被动观测结果,不主动发请求探活; - 2. 把最近调用和静态路由配置拼在一起,避免管理员只看到“成功/失败”却不知道走的是哪条链路; - 3. 如果近期没有调用,也明确区分“未配置”和“已配置但当前空闲”。 - """ - runtime_snapshot = UnifiedLLMClient.get_runtime_snapshot() or {} - last_call = dict(runtime_snapshot.get("last_call") or {}) - catalog_summary = _extract_llm_catalog_summary() - - total_calls = _safe_int(runtime_snapshot.get("total_calls")) - failed_calls = _safe_int(runtime_snapshot.get("failed_calls")) - success_rate = _safe_float(runtime_snapshot.get("success_rate")) - avg_latency_ms = _safe_float(runtime_snapshot.get("avg_latency_ms")) - last_error = str(runtime_snapshot.get("last_error") or "").strip() - - snapshot = { - **runtime_snapshot, - "last_call": last_call, - "provider_count": catalog_summary.get("provider_count", 0), - "scene_count": catalog_summary.get("scene_count", 0), - "target_count": catalog_summary.get("target_count", 0), - "default_scene": catalog_summary.get("default_scene", ""), - "default_backend": catalog_summary.get("default_backend", ""), - "has_routing": bool(catalog_summary.get("has_routing")), - "last_provider": str(last_call.get("provider") or "").strip(), - "last_backend": str(last_call.get("backend") or "").strip(), - "last_scene": str(last_call.get("scene") or "").strip(), - "last_model": str(last_call.get("model") or "").strip(), - "last_timestamp": str(last_call.get("timestamp") or "").strip(), - "last_latency_ms": _safe_float(last_call.get("latency_ms")), - } - - if not snapshot["has_routing"]: - snapshot["status"] = "warning" - snapshot["summary"] = "当前未发现完整的 LLM 路由配置,建议先检查默认场景与后端绑定" - return snapshot - - if total_calls <= 0: - snapshot["status"] = "warning" - snapshot["summary"] = ( - f"已配置 {snapshot['scene_count']} 个场景、{snapshot['target_count']} 个目标," - "最近窗口内暂无统一 LLM 调用记录" - ) - return snapshot - - if failed_calls >= total_calls and total_calls > 0: - snapshot["status"] = "danger" - snapshot["summary"] = ( - f"最近 {total_calls} 次调用全部失败,成功率 {success_rate:.2f}%," - f"平均耗时 {avg_latency_ms:.2f}ms" - ) - return snapshot - - if failed_calls > 0 or last_error: - snapshot["status"] = "warning" - snapshot["summary"] = ( - f"最近 {total_calls} 次调用中失败 {failed_calls} 次,成功率 {success_rate:.2f}%," - f"平均耗时 {avg_latency_ms:.2f}ms" - ) - return snapshot - - snapshot["status"] = "healthy" - snapshot["summary"] = ( - f"最近 {total_calls} 次调用全部成功,成功率 {success_rate:.2f}%," - f"平均耗时 {avg_latency_ms:.2f}ms" - ) - return snapshot - - -def _extract_scheduler_runtime_snapshot() -> dict: - """聚合 async_job 运行态,生成首页任务调度摘要。 - - 这里的目标不是替代完整任务页,而是回答管理员最常问的几件事: - 1. 任务有没有正常装载; - 2. 是否存在失败或非法调度; - 3. 下一次任务大概何时执行; - 4. 当前更多是系统任务,还是插件任务在跑。 - """ - runtime_rows = async_job.get_jobs_snapshot() - next_run_candidates = [] - failed_rows = [] - system_job_count = 0 - plugin_job_count = 0 - - for row in runtime_rows: - job_key = str(row.get("job_key") or "").strip() - owner_name = str(row.get("owner_name") or "system").strip().lower() - next_run_at = _parse_snapshot_datetime(row.get("next_run_at")) - last_status = str(row.get("last_status") or "").strip().lower() - - if job_key.startswith("plugin_schedule:") or owner_name != "system": - plugin_job_count += 1 - else: - system_job_count += 1 - - if bool(row.get("enabled")) and next_run_at: - next_run_candidates.append(next_run_at) - if last_status in {"failed", "invalid_schedule"}: - failed_rows.append(row) - - latest_failed_row = {} - if failed_rows: - failed_rows.sort( - key=lambda row: ( - _parse_snapshot_datetime(row.get("updated_at")) - or _parse_snapshot_datetime(row.get("last_run_at")) - or datetime.min - ), - reverse=True, - ) - latest_failed_row = failed_rows[0] - - invalid_jobs = sum( - 1 for row in runtime_rows if str(row.get("last_status") or "").strip().lower() == "invalid_schedule" - ) - total_jobs = len(runtime_rows) - enabled_jobs = sum(1 for row in runtime_rows if bool(row.get("enabled"))) - running_jobs = sum(1 for row in runtime_rows if bool(row.get("running"))) - failed_jobs = len(failed_rows) - paused_jobs = total_jobs - enabled_jobs - never_run_jobs = sum(1 for row in runtime_rows if str(row.get("last_status") or "").strip().lower() == "never") - next_run_at_text = min(next_run_candidates).strftime("%Y-%m-%d %H:%M:%S") if next_run_candidates else "" - latest_failed_error = str(latest_failed_row.get("last_error") or "").strip() - if len(latest_failed_error) > 120: - latest_failed_error = f"{latest_failed_error[:117]}..." - - snapshot = { - "status": "healthy", - "summary": "任务调度运行正常", - "total_jobs": total_jobs, - "enabled_jobs": enabled_jobs, - "running_jobs": running_jobs, - "failed_jobs": failed_jobs, - "invalid_jobs": invalid_jobs, - "paused_jobs": paused_jobs, - "never_run_jobs": never_run_jobs, - "system_job_count": system_job_count, - "plugin_job_count": plugin_job_count, - "next_run_at": next_run_at_text, - "latest_failed_job_name": str(latest_failed_row.get("name") or "").strip(), - "latest_failed_error": latest_failed_error, - } - - if total_jobs <= 0: - snapshot["status"] = "warning" - snapshot["summary"] = "当前没有加载任何定时任务" - return snapshot - - if invalid_jobs > 0: - snapshot["status"] = "danger" - snapshot["summary"] = f"发现 {invalid_jobs} 个任务调度配置非法,建议立即检查任务页" - return snapshot - - if failed_jobs > 0: - snapshot["status"] = "warning" - snapshot["summary"] = ( - f"最近有 {failed_jobs} 个任务执行失败," - f"下一次执行 {next_run_at_text or '暂未计算'}" - ) - return snapshot - - if enabled_jobs <= 0: - snapshot["status"] = "warning" - snapshot["summary"] = "任务已加载,但当前没有启用中的调度任务" - return snapshot - - if running_jobs > 0: - snapshot["summary"] = ( - f"当前有 {running_jobs} 个任务执行中," - f"下一次执行 {next_run_at_text or '暂未计算'}" - ) - return snapshot - - snapshot["summary"] = f"已启用 {enabled_jobs} 个任务,下一次执行 {next_run_at_text or '暂未计算'}" - return snapshot - - def _legacy_llm_to_catalog(legacy_llm: dict) -> dict: """把旧 llm(backends/scenes) 结构转换为新目录结构(仅用于兜底展示)。 @@ -872,6 +608,20 @@ def api_system_health_summary(): mysql_snapshot = _extract_mysql_runtime_snapshot(server.db_manager) redis_snapshot = _extract_redis_runtime_snapshot(server.db_manager) + # md2img 健康快照已经有现成实现,这里只做聚合,不主动预热运行时。 + md2img_snapshot = get_md2img_health_snapshot(ensure_runtime=False) or {} + browser_ready = bool( + md2img_snapshot.get("browser_ready") + or md2img_snapshot.get("playwright_ready") + or md2img_snapshot.get("ready") + ) + runtime_ready = bool( + md2img_snapshot.get("runtime_ready") + or md2img_snapshot.get("runtime_initialized") + or md2img_snapshot.get("initialized") + ) + md2img_healthy = runtime_ready and browser_ready + # 首页只需要“够判断”的轻量结论,因此统一产出 status + summary 文本,前端无需重复拼装业务规则。 robot_running = bool(getattr(robot, "ipad_running", False)) robot_nickname = str(getattr(robot, "nickname", "") or "").strip() @@ -897,11 +647,37 @@ def api_system_health_summary(): error_status = "healthy" error_summary = "近 24 小时未记录到异常" - # 首页 AI 卡片升级为“运行态 + 路由摘要”,仍然保持被动观测,不主动探活。 - ai_runtime = _extract_ai_runtime_snapshot() + if md2img_healthy: + md2img_status = "healthy" + md2img_summary = "运行时与浏览器均已就绪" + elif runtime_ready or browser_ready: + md2img_status = "warning" + md2img_summary = "运行时部分可用,建议检查预热状态" + else: + md2img_status = "danger" + md2img_summary = "运行时未就绪,相关转图能力可能不可用" - # Markdown 转图更适合保留在专门页面里排障,首页右侧改成更通用的任务调度摘要。 - scheduler_runtime = _extract_scheduler_runtime_snapshot() + # AI 运行态: + # 1. 统一从 UnifiedLLMClient 最近调用窗口读取,避免各插件单独维护监控数据; + # 2. 若当前窗口还没有调用记录,就明确返回“暂无调用”,避免误判成异常。 + ai_runtime = UnifiedLLMClient.get_runtime_snapshot() + ai_total_calls = int(ai_runtime.get("total_calls") or 0) + ai_failed_calls = int(ai_runtime.get("failed_calls") or 0) + if ai_total_calls <= 0: + ai_status = "warning" + ai_summary = "最近窗口内暂无统一 LLM 调用记录" + elif ai_failed_calls > 0: + ai_status = "warning" + ai_summary = ( + f"最近 {ai_total_calls} 次调用中失败 {ai_failed_calls} 次," + f"平均耗时 {ai_runtime.get('avg_latency_ms', 0)}ms" + ) + else: + ai_status = "healthy" + ai_summary = ( + f"最近 {ai_total_calls} 次调用全部成功," + f"平均耗时 {ai_runtime.get('avg_latency_ms', 0)}ms" + ) return jsonify({ "success": True, @@ -943,10 +719,17 @@ def api_system_health_summary(): "redis": redis_snapshot, }, "ai_runtime": { + "status": ai_status, + "summary": ai_summary, **ai_runtime, }, - "scheduler": { - **scheduler_runtime, + "md2img": { + "status": md2img_status, + "healthy": md2img_healthy, + "runtime_ready": runtime_ready, + "browser_ready": browser_ready, + "summary": md2img_summary, + "detail": md2img_snapshot, }, } }) diff --git a/admin/dashboard/templates/index.html b/admin/dashboard/templates/index.html index eeabcbd..84344f1 100644 --- a/admin/dashboard/templates/index.html +++ b/admin/dashboard/templates/index.html @@ -131,7 +131,7 @@

系统健康快照

-

把连接状态、插件运行、异常数量、LLM 运行态与任务调度集中到一个面板里。

+

把连接状态、插件运行、异常数量与转图运行时集中到一个面板里。

最近刷新 @@ -394,38 +394,15 @@ status: 'warning', total_calls: 0, failed_calls: 0, - success_rate: 0, avg_latency_ms: 0, summary: '加载中...', - last_call: {}, - scene_count: 0, - target_count: 0, - provider_count: 0, - has_routing: false, - default_scene: '', - default_backend: '', - last_provider: '', - last_backend: '', - last_scene: '', - last_model: '', - last_timestamp: '', - last_latency_ms: 0, - last_error: '' + last_call: {} }, - scheduler: { + md2img: { status: 'warning', - total_jobs: 0, - enabled_jobs: 0, - running_jobs: 0, - failed_jobs: 0, - invalid_jobs: 0, - paused_jobs: 0, - never_run_jobs: 0, - system_job_count: 0, - plugin_job_count: 0, - next_run_at: '', - latest_failed_job_name: '', - latest_failed_error: '', + healthy: false, + runtime_ready: false, + browser_ready: false, summary: '加载中...' } }, @@ -469,7 +446,7 @@ const errors = this.healthSummary.errors || {}; const infrastructure = this.healthSummary.infrastructure || {}; const aiRuntime = this.healthSummary.ai_runtime || {}; - const scheduler = this.healthSummary.scheduler || {}; + const md2img = this.healthSummary.md2img || {}; return [ { key: 'robot', @@ -506,23 +483,19 @@ }, { key: 'ai_runtime', - title: 'LLM 运行态', + title: 'AI 运行态', status: aiRuntime.status || 'warning', - value: (aiRuntime.total_calls || 0) > 0 - ? `${this.formatMetricNumber(aiRuntime.success_rate, 2)}%` - : `${aiRuntime.scene_count || 0} 个场景`, + value: `${aiRuntime.avg_latency_ms || 0} ms`, summary: aiRuntime.summary || '暂无状态', - serviceBlocks: this.buildAiRuntimeServiceBlocks(aiRuntime), - extra: this.buildAiRuntimeExtra(aiRuntime) + extra: `最近调用 ${aiRuntime.total_calls || 0} 次,失败 ${aiRuntime.failed_calls || 0} 次` }, { - key: 'scheduler', - title: '任务调度', - status: scheduler.status || 'warning', - value: `${scheduler.enabled_jobs || 0} / ${scheduler.total_jobs || 0}`, - summary: scheduler.summary || '暂无状态', - serviceBlocks: this.buildSchedulerServiceBlocks(scheduler), - extra: this.buildSchedulerExtra(scheduler) + key: 'md2img', + title: 'Markdown 转图', + status: md2img.status || 'warning', + value: md2img.healthy ? '就绪' : '待检查', + summary: md2img.summary || '暂无状态', + extra: `Runtime ${md2img.runtime_ready ? '已就绪' : '未就绪'} / Browser ${md2img.browser_ready ? '已就绪' : '未就绪'}` } ]; } @@ -684,141 +657,6 @@ } ]; }, - buildAiRuntimeServiceBlocks(aiRuntime) { - // AI 卡片拆成“路由配置”和“最近调用”两个子面板, - // 让首页既能判断配置是否完整,也能快速定位最近请求到底走了哪条链路。 - return [ - { - key: 'ai-routing', - title: '路由配置', - status: aiRuntime.has_routing ? 'healthy' : 'warning', - summary: aiRuntime.default_scene - ? `默认场景:${aiRuntime.default_scene}` - : '当前未设置默认场景', - metrics: [ - { - label: '场景数量', - value: this.formatMetricNumber(aiRuntime.scene_count) - }, - { - label: '目标数量', - value: this.formatMetricNumber(aiRuntime.target_count) - }, - { - label: 'Provider 模板', - value: this.formatMetricNumber(aiRuntime.provider_count) - }, - { - label: '默认后端', - value: aiRuntime.default_backend || '-' - } - ] - }, - { - key: 'ai-last-call', - title: '最近调用', - status: (aiRuntime.failed_calls || 0) > 0 ? 'warning' : ((aiRuntime.total_calls || 0) > 0 ? 'healthy' : 'warning'), - summary: aiRuntime.last_timestamp - ? `最近一次记录时间:${aiRuntime.last_timestamp}` - : '当前窗口内暂无调用记录', - metrics: [ - { - label: 'Provider', - value: aiRuntime.last_provider || '-' - }, - { - label: 'Backend', - value: aiRuntime.last_backend || '-' - }, - { - label: 'Scene', - value: aiRuntime.last_scene || '-' - }, - { - label: '模型', - value: aiRuntime.last_model || '-' - }, - { - label: '最近耗时', - value: `${this.formatMetricNumber(aiRuntime.last_latency_ms, 2)} ms` - }, - { - label: '最近错误', - value: aiRuntime.last_error || '无' - } - ] - } - ]; - }, - buildAiRuntimeExtra(aiRuntime) { - return `最近调用 ${aiRuntime.total_calls || 0} 次,失败 ${aiRuntime.failed_calls || 0} 次,平均耗时 ${this.formatMetricNumber(aiRuntime.avg_latency_ms, 2)} ms`; - }, - buildSchedulerServiceBlocks(scheduler) { - // 任务调度卡片只保留首页最需要的摘要: - // 任务装载量、执行态、失败数,以及系统任务/插件任务的大致构成。 - return [ - { - key: 'scheduler-overview', - title: '任务装载', - status: scheduler.enabled_jobs > 0 ? 'healthy' : 'warning', - summary: scheduler.next_run_at - ? `下一次执行:${scheduler.next_run_at}` - : '当前没有可计算的下一次执行时间', - metrics: [ - { - label: '启用任务', - value: this.formatMetricNumber(scheduler.enabled_jobs) - }, - { - label: '暂停任务', - value: this.formatMetricNumber(scheduler.paused_jobs) - }, - { - label: '系统任务', - value: this.formatMetricNumber(scheduler.system_job_count) - }, - { - label: '插件任务', - value: this.formatMetricNumber(scheduler.plugin_job_count) - } - ] - }, - { - key: 'scheduler-runtime', - title: '执行状态', - status: scheduler.status || 'warning', - summary: scheduler.latest_failed_job_name - ? `最近失败任务:${scheduler.latest_failed_job_name}` - : '当前未发现最近失败任务', - metrics: [ - { - label: '执行中', - value: this.formatMetricNumber(scheduler.running_jobs) - }, - { - label: '失败任务', - value: this.formatMetricNumber(scheduler.failed_jobs) - }, - { - label: '非法调度', - value: this.formatMetricNumber(scheduler.invalid_jobs) - }, - { - label: '未执行过', - value: this.formatMetricNumber(scheduler.never_run_jobs) - } - ] - } - ]; - }, - buildSchedulerExtra(scheduler) { - if (scheduler.latest_failed_error) { - return `最近失败原因:${scheduler.latest_failed_error}`; - } - return scheduler.next_run_at - ? `下次执行时间:${scheduler.next_run_at}` - : '当前暂无可用的下一次执行时间'; - }, renderPieChart(chartId, usageValue, label) { const ctx = document.getElementById(chartId); if (!ctx) return;