diff --git a/admin/dashboard/blueprints/system.py b/admin/dashboard/blueprints/system.py index 37db1ff..61a1000 100644 --- a/admin/dashboard/blueprints/system.py +++ b/admin/dashboard/blueprints/system.py @@ -16,6 +16,7 @@ from utils.markdown_to_image import get_md2img_health_snapshot, warmup_md2img_br from utils.ai.llm_registry import LLMRegistry from base.plugin_common.plugin_interface import PluginStatus from utils.ai.unified_llm import UnifiedLLMClient +from utils.decorator.async_job import async_job # 创建系统信息蓝图 system_bp = Blueprint('system', __name__) @@ -42,6 +43,442 @@ def _save_system_yaml(config_obj: dict) -> None: yaml.safe_dump(config_obj, f, allow_unicode=True, sort_keys=False) +def _safe_int(value, default: int = 0) -> int: + """把数据库 / Redis 返回的字符串数字安全转成整数。""" + try: + if value in (None, ""): + return default + return int(float(value)) + except (TypeError, ValueError): + return default + + +def _safe_float(value, default: float = 0.0) -> float: + """把数据库 / Redis 返回的值安全转成浮点数。""" + try: + if value in (None, ""): + return default + return float(value) + except (TypeError, ValueError): + return default + + +def _format_bytes_to_mb(value: int) -> float: + """把字节数转换为 MB,保留两位小数便于首页摘要展示。""" + return round((_safe_float(value, 0.0) / 1024 / 1024), 2) + + +def _extract_mysql_runtime_snapshot(db_manager) -> dict: + """采集 MySQL 运行态摘要。 + + 首页目标不是替代 DBA 工具,而是让管理员一眼判断: + 1. 数据库是不是活着; + 2. 当前连接压力高不高; + 3. 当前库规模是否已经明显变大; + 4. 
有没有必要继续深入到更专业的监控页排查。 + """ + snapshot = { + "status": "healthy", + "summary": "连接正常", + "database": db_manager.get_mysql_database_name(), + "version": "", + "threads_connected": 0, + "threads_running": 0, + "max_connections": 0, + "connection_usage_percent": 0.0, + "questions_per_second": 0.0, + "uptime_seconds": 0, + "table_count": 0, + "schema_size_mb": 0.0, + "slow_query_threshold_ms": db_manager.get_slow_query_threshold_ms(), + } + + mysql_conn = db_manager.get_mysql_connection() + try: + with mysql_conn.cursor(dictionary=True) as cursor: + # 基础探活与版本识别: + # 1. SELECT VERSION() 成本极低; + # 2. 相比只做 SELECT 1,它还能顺便拿到版本信息; + # 3. 首页卡片里显示版本,方便线上排查“是不是某台库版本不一致”。 + cursor.execute("SELECT VERSION() AS version, DATABASE() AS database_name") + version_row = cursor.fetchone() or {} + snapshot["version"] = str(version_row.get("version") or "").strip() + snapshot["database"] = str(version_row.get("database_name") or snapshot["database"] or "").strip() + + cursor.execute( + """ + SHOW GLOBAL STATUS + WHERE Variable_name IN ('Threads_connected', 'Threads_running', 'Questions', 'Uptime') + """ + ) + status_rows = cursor.fetchall() or [] + status_map = { + str(row.get("Variable_name") or "").strip(): row.get("Value") + for row in status_rows + } + + cursor.execute( + """ + SHOW GLOBAL VARIABLES + WHERE Variable_name IN ('max_connections') + """ + ) + variable_rows = cursor.fetchall() or [] + variable_map = { + str(row.get("Variable_name") or "").strip(): row.get("Value") + for row in variable_rows + } + + # information_schema 聚合虽然比 SELECT 1 重一点,但仍属于轻量级元信息查询: + # 1. 只在首页 30 秒级刷新一次,成本可接受; + # 2. 能直接给出当前业务库表数量与体量变化; + # 3. 
对判断“是不是消息表膨胀导致后台变慢”很有帮助。 + cursor.execute( + """ + SELECT + COUNT(*) AS table_count, + COALESCE(SUM(data_length + index_length), 0) AS schema_size_bytes + FROM information_schema.tables + WHERE table_schema = DATABASE() + """ + ) + schema_row = cursor.fetchone() or {} + + snapshot["threads_connected"] = _safe_int(status_map.get("Threads_connected")) + snapshot["threads_running"] = _safe_int(status_map.get("Threads_running")) + snapshot["max_connections"] = _safe_int(variable_map.get("max_connections")) + snapshot["uptime_seconds"] = _safe_int(status_map.get("Uptime")) + total_questions = _safe_int(status_map.get("Questions")) + if snapshot["uptime_seconds"] > 0: + snapshot["questions_per_second"] = round(total_questions / snapshot["uptime_seconds"], 2) + if snapshot["max_connections"] > 0: + snapshot["connection_usage_percent"] = round( + (snapshot["threads_connected"] / snapshot["max_connections"]) * 100, + 1, + ) + snapshot["table_count"] = _safe_int(schema_row.get("table_count")) + snapshot["schema_size_mb"] = _format_bytes_to_mb(schema_row.get("schema_size_bytes")) + + if snapshot["connection_usage_percent"] >= 80 or snapshot["threads_running"] >= 12: + snapshot["status"] = "warning" + snapshot["summary"] = ( + f"连接压力偏高:已连接 {snapshot['threads_connected']} / {snapshot['max_connections']}," + f"运行中线程 {snapshot['threads_running']}" + ) + else: + snapshot["summary"] = ( + f"连接正常:已连接 {snapshot['threads_connected']} / {snapshot['max_connections'] or '-'}," + f"QPS {snapshot['questions_per_second']}" + ) + return snapshot + except Exception as mysql_error: + snapshot["status"] = "danger" + snapshot["summary"] = f"MySQL 探测失败: {mysql_error}" + return snapshot + finally: + mysql_conn.close() + + +def _extract_redis_runtime_snapshot(db_manager) -> dict: + """采集 Redis 运行态摘要。""" + redis_config = getattr(db_manager, "redis_config", {}) or {} + snapshot = { + "status": "healthy", + "summary": "连接正常", + "db_index": _safe_int(redis_config.get("db", 0)), + "key_count": 0, + 
"connected_clients": 0, + "blocked_clients": 0, + "ops_per_sec": 0, + "used_memory_human": "", + "used_memory_peak_human": "", + "memory_usage_percent": 0.0, + "uptime_seconds": 0, + "hit_rate_percent": 0.0, + } + + try: + redis_conn = db_manager.get_redis_connection() + redis_conn.ping() + info = redis_conn.info() or {} + snapshot["key_count"] = _safe_int(redis_conn.dbsize()) + snapshot["connected_clients"] = _safe_int(info.get("connected_clients")) + snapshot["blocked_clients"] = _safe_int(info.get("blocked_clients")) + snapshot["ops_per_sec"] = _safe_int(info.get("instantaneous_ops_per_sec")) + snapshot["used_memory_human"] = str(info.get("used_memory_human") or "").strip() + snapshot["used_memory_peak_human"] = str(info.get("used_memory_peak_human") or "").strip() + snapshot["uptime_seconds"] = _safe_int(info.get("uptime_in_seconds")) + + maxmemory = _safe_int(info.get("maxmemory")) + used_memory = _safe_int(info.get("used_memory")) + if maxmemory > 0: + snapshot["memory_usage_percent"] = round((used_memory / maxmemory) * 100, 1) + + keyspace_hits = _safe_int(info.get("keyspace_hits")) + keyspace_misses = _safe_int(info.get("keyspace_misses")) + if (keyspace_hits + keyspace_misses) > 0: + snapshot["hit_rate_percent"] = round( + (keyspace_hits / (keyspace_hits + keyspace_misses)) * 100, + 1, + ) + + if snapshot["blocked_clients"] > 0 or snapshot["memory_usage_percent"] >= 80: + snapshot["status"] = "warning" + snapshot["summary"] = ( + f"缓存压力需关注:keys {snapshot['key_count']}," + f"clients {snapshot['connected_clients']},ops/s {snapshot['ops_per_sec']}" + ) + else: + snapshot["summary"] = ( + f"缓存正常:keys {snapshot['key_count']}," + f"clients {snapshot['connected_clients']},ops/s {snapshot['ops_per_sec']}" + ) + return snapshot + except Exception as redis_error: + snapshot["status"] = "danger" + snapshot["summary"] = f"Redis 探测失败: {redis_error}" + return snapshot + + +def _parse_snapshot_datetime(value: str | None) -> datetime | None: + """把首页摘要里常用的时间字符串安全转换为 
datetime。""" + text = str(value or "").strip() + if not text: + return None + try: + return datetime.strptime(text, "%Y-%m-%d %H:%M:%S") + except ValueError: + return None + + +def _count_enabled_runtime_items(items) -> int: + """统计启用项数量。""" + rows = [] + if isinstance(items, dict): + rows = list(items.values()) + elif isinstance(items, list): + rows = list(items) + count = 0 + for row in rows: + if not isinstance(row, dict): + continue + if "enabled" not in row or bool(row.get("enabled", True)): + count += 1 + return count + + +def _extract_llm_catalog_summary() -> dict: + """提取首页 LLM 路由配置摘要。""" + try: + catalog = LLMRegistry.get_catalog() or {} + if catalog: + providers = catalog.get("providers", {}) or {} + dify_apps = catalog.get("dify_apps", {}) or {} + backends = catalog.get("backends", {}) or {} + scenes = catalog.get("scenes", {}) or {} + default_scene = str(catalog.get("default_scene") or "").strip() + default_backend = str(LLMRegistry.get_scene_backend_name(default_scene) or "").strip() if default_scene else "" + return { + "provider_count": _count_enabled_runtime_items(providers), + "scene_count": _count_enabled_runtime_items(scenes), + "target_count": _count_enabled_runtime_items(backends) + _count_enabled_runtime_items(dify_apps), + "default_scene": default_scene, + "default_backend": default_backend, + "has_routing": _count_enabled_runtime_items(scenes) > 0, + } + + legacy_llm = LLMRegistry.get_llm_config() or {} + scenes = legacy_llm.get("scenes", {}) or {} + backends = legacy_llm.get("backends", {}) or {} + default_backend = str(legacy_llm.get("default_backend") or "").strip() + return { + "provider_count": 0, + "scene_count": len(scenes) if isinstance(scenes, dict) else 0, + "target_count": len(backends) if isinstance(backends, dict) else 0, + "default_scene": "", + "default_backend": default_backend, + "has_routing": bool(scenes) or bool(default_backend), + } + except Exception as llm_catalog_error: + logger.warning(f"提取 LLM 路由摘要失败: 
{llm_catalog_error}") + return { + "provider_count": 0, + "scene_count": 0, + "target_count": 0, + "default_scene": "", + "default_backend": "", + "has_routing": False, + } + + +def _extract_ai_runtime_snapshot() -> dict: + """构建首页 LLM 运行态摘要。""" + runtime_snapshot = UnifiedLLMClient.get_runtime_snapshot() or {} + last_call = dict(runtime_snapshot.get("last_call") or {}) + catalog_summary = _extract_llm_catalog_summary() + + total_calls = _safe_int(runtime_snapshot.get("total_calls")) + failed_calls = _safe_int(runtime_snapshot.get("failed_calls")) + success_rate = _safe_float(runtime_snapshot.get("success_rate")) + avg_latency_ms = _safe_float(runtime_snapshot.get("avg_latency_ms")) + last_error = str(runtime_snapshot.get("last_error") or "").strip() + + snapshot = { + **runtime_snapshot, + "last_call": last_call, + "provider_count": catalog_summary.get("provider_count", 0), + "scene_count": catalog_summary.get("scene_count", 0), + "target_count": catalog_summary.get("target_count", 0), + "default_scene": catalog_summary.get("default_scene", ""), + "default_backend": catalog_summary.get("default_backend", ""), + "has_routing": bool(catalog_summary.get("has_routing")), + "last_provider": str(last_call.get("provider") or "").strip(), + "last_backend": str(last_call.get("backend") or "").strip(), + "last_scene": str(last_call.get("scene") or "").strip(), + "last_model": str(last_call.get("model") or "").strip(), + "last_timestamp": str(last_call.get("timestamp") or "").strip(), + "last_latency_ms": _safe_float(last_call.get("latency_ms")), + "last_error": last_error, + } + + if not snapshot["has_routing"]: + snapshot["status"] = "warning" + snapshot["summary"] = "当前未发现完整的 LLM 路由配置,建议先检查默认场景与后端绑定" + return snapshot + + if total_calls <= 0: + snapshot["status"] = "warning" + snapshot["summary"] = ( + f"已配置 {snapshot['scene_count']} 个场景、{snapshot['target_count']} 个目标," + "最近窗口内暂无统一 LLM 调用记录" + ) + return snapshot + + if failed_calls >= total_calls and total_calls > 0: + 
snapshot["status"] = "danger" + snapshot["summary"] = ( + f"最近 {total_calls} 次调用全部失败,成功率 {success_rate:.2f}%," + f"平均耗时 {avg_latency_ms:.2f}ms" + ) + return snapshot + + if failed_calls > 0 or last_error: + snapshot["status"] = "warning" + snapshot["summary"] = ( + f"最近 {total_calls} 次调用中失败 {failed_calls} 次,成功率 {success_rate:.2f}%," + f"平均耗时 {avg_latency_ms:.2f}ms" + ) + return snapshot + + snapshot["status"] = "healthy" + snapshot["summary"] = ( + f"最近 {total_calls} 次调用全部成功,成功率 {success_rate:.2f}%," + f"平均耗时 {avg_latency_ms:.2f}ms" + ) + return snapshot + + +def _extract_scheduler_runtime_snapshot() -> dict: + """聚合 async_job 运行态,生成首页任务调度摘要。""" + runtime_rows = async_job.get_jobs_snapshot() + next_run_candidates = [] + failed_rows = [] + system_job_count = 0 + plugin_job_count = 0 + + for row in runtime_rows: + job_key = str(row.get("job_key") or "").strip() + owner_name = str(row.get("owner_name") or "system").strip().lower() + next_run_at = _parse_snapshot_datetime(row.get("next_run_at")) + last_status = str(row.get("last_status") or "").strip().lower() + + if job_key.startswith("plugin_schedule:") or owner_name != "system": + plugin_job_count += 1 + else: + system_job_count += 1 + + if bool(row.get("enabled")) and next_run_at: + next_run_candidates.append(next_run_at) + if last_status in {"failed", "invalid_schedule"}: + failed_rows.append(row) + + latest_failed_row = {} + if failed_rows: + failed_rows.sort( + key=lambda row: ( + _parse_snapshot_datetime(row.get("updated_at")) + or _parse_snapshot_datetime(row.get("last_run_at")) + or datetime.min + ), + reverse=True, + ) + latest_failed_row = failed_rows[0] + + invalid_jobs = sum( + 1 for row in runtime_rows if str(row.get("last_status") or "").strip().lower() == "invalid_schedule" + ) + total_jobs = len(runtime_rows) + enabled_jobs = sum(1 for row in runtime_rows if bool(row.get("enabled"))) + running_jobs = sum(1 for row in runtime_rows if bool(row.get("running"))) + failed_jobs = len(failed_rows) + 
paused_jobs = total_jobs - enabled_jobs + never_run_jobs = sum(1 for row in runtime_rows if str(row.get("last_status") or "").strip().lower() == "never") + next_run_at_text = min(next_run_candidates).strftime("%Y-%m-%d %H:%M:%S") if next_run_candidates else "" + latest_failed_error = str(latest_failed_row.get("last_error") or "").strip() + if len(latest_failed_error) > 120: + latest_failed_error = f"{latest_failed_error[:117]}..." + + snapshot = { + "status": "healthy", + "summary": "任务调度运行正常", + "total_jobs": total_jobs, + "enabled_jobs": enabled_jobs, + "running_jobs": running_jobs, + "failed_jobs": failed_jobs, + "invalid_jobs": invalid_jobs, + "paused_jobs": paused_jobs, + "never_run_jobs": never_run_jobs, + "system_job_count": system_job_count, + "plugin_job_count": plugin_job_count, + "next_run_at": next_run_at_text, + "latest_failed_job_name": str(latest_failed_row.get("name") or "").strip(), + "latest_failed_error": latest_failed_error, + } + + if total_jobs <= 0: + snapshot["status"] = "warning" + snapshot["summary"] = "当前没有加载任何定时任务" + return snapshot + + if invalid_jobs > 0: + snapshot["status"] = "danger" + snapshot["summary"] = f"发现 {invalid_jobs} 个任务调度配置非法,建议立即检查任务页" + return snapshot + + if failed_jobs > 0: + snapshot["status"] = "warning" + snapshot["summary"] = ( + f"最近有 {failed_jobs} 个任务执行失败," + f"下一次执行 {next_run_at_text or '暂未计算'}" + ) + return snapshot + + if enabled_jobs <= 0: + snapshot["status"] = "warning" + snapshot["summary"] = "任务已加载,但当前没有启用中的调度任务" + return snapshot + + if running_jobs > 0: + snapshot["summary"] = ( + f"当前有 {running_jobs} 个任务执行中," + f"下一次执行 {next_run_at_text or '暂未计算'}" + ) + return snapshot + + snapshot["summary"] = f"已启用 {enabled_jobs} 个任务,下一次执行 {next_run_at_text or '暂未计算'}" + return snapshot + + def _legacy_llm_to_catalog(legacy_llm: dict) -> dict: """把旧 llm(backends/scenes) 结构转换为新目录结构(仅用于兜底展示)。 @@ -405,45 +842,11 @@ def api_system_health_summary(): _, recent_error_count = server.stats_db.get_error_logs(days=1, page=1, 
limit=1) # 基础设施健康: - # 1. MySQL 用最轻量的 SELECT 1 做可用性探测; - # 2. Redis 用 PING 验证连接池当前是否可拿到可用连接; + # 1. MySQL / Redis 都在这里做“首页摘要级”探测,而不是完整深度巡检; + # 2. 除了连通性,还补充少量负载指标,方便管理员快速判断是否需要继续下钻; # 3. 即使探测失败也只反馈到看板,不影响主接口整体返回。 - mysql_status = "healthy" - mysql_summary = "连接正常" - try: - mysql_conn = server.db_manager.get_mysql_connection() - try: - with mysql_conn.cursor() as cursor: - cursor.execute("SELECT 1") - cursor.fetchone() - finally: - mysql_conn.close() - except Exception as mysql_error: - mysql_status = "danger" - mysql_summary = f"MySQL 探测失败: {mysql_error}" - - redis_status = "healthy" - redis_summary = "连接正常" - try: - redis_conn = server.db_manager.get_redis_connection() - redis_conn.ping() - except Exception as redis_error: - redis_status = "danger" - redis_summary = f"Redis 探测失败: {redis_error}" - - # md2img 健康快照已经有现成实现,这里只做聚合,不主动预热运行时。 - md2img_snapshot = get_md2img_health_snapshot(ensure_runtime=False) or {} - browser_ready = bool( - md2img_snapshot.get("browser_ready") - or md2img_snapshot.get("playwright_ready") - or md2img_snapshot.get("ready") - ) - runtime_ready = bool( - md2img_snapshot.get("runtime_ready") - or md2img_snapshot.get("runtime_initialized") - or md2img_snapshot.get("initialized") - ) - md2img_healthy = runtime_ready and browser_ready + mysql_snapshot = _extract_mysql_runtime_snapshot(server.db_manager) + redis_snapshot = _extract_redis_runtime_snapshot(server.db_manager) # 首页只需要“够判断”的轻量结论,因此统一产出 status + summary 文本,前端无需重复拼装业务规则。 robot_running = bool(getattr(robot, "ipad_running", False)) @@ -470,37 +873,11 @@ def api_system_health_summary(): error_status = "healthy" error_summary = "近 24 小时未记录到异常" - if md2img_healthy: - md2img_status = "healthy" - md2img_summary = "运行时与浏览器均已就绪" - elif runtime_ready or browser_ready: - md2img_status = "warning" - md2img_summary = "运行时部分可用,建议检查预热状态" - else: - md2img_status = "danger" - md2img_summary = "运行时未就绪,相关转图能力可能不可用" + # 首页 AI 卡片升级为“运行态 + 路由摘要”,仍然保持被动观测,不主动探活。 + ai_runtime = _extract_ai_runtime_snapshot() 
- # AI 运行态: - # 1. 统一从 UnifiedLLMClient 最近调用窗口读取,避免各插件单独维护监控数据; - # 2. 若当前窗口还没有调用记录,就明确返回“暂无调用”,避免误判成异常。 - ai_runtime = UnifiedLLMClient.get_runtime_snapshot() - ai_total_calls = int(ai_runtime.get("total_calls") or 0) - ai_failed_calls = int(ai_runtime.get("failed_calls") or 0) - if ai_total_calls <= 0: - ai_status = "warning" - ai_summary = "最近窗口内暂无统一 LLM 调用记录" - elif ai_failed_calls > 0: - ai_status = "warning" - ai_summary = ( - f"最近 {ai_total_calls} 次调用中失败 {ai_failed_calls} 次," - f"平均耗时 {ai_runtime.get('avg_latency_ms', 0)}ms" - ) - else: - ai_status = "healthy" - ai_summary = ( - f"最近 {ai_total_calls} 次调用全部成功," - f"平均耗时 {ai_runtime.get('avg_latency_ms', 0)}ms" - ) + # Markdown 转图更适合保留在专门页面里排障,首页右侧改成更通用的任务调度摘要。 + scheduler_runtime = _extract_scheduler_runtime_snapshot() return jsonify({ "success": True, @@ -524,33 +901,28 @@ def api_system_health_summary(): "summary": error_summary, }, "infrastructure": { - "status": "healthy" if mysql_status == "healthy" and redis_status == "healthy" else "danger", + "status": ( + "danger" + if "danger" in {mysql_snapshot.get("status"), redis_snapshot.get("status")} + else ("warning" if "warning" in {mysql_snapshot.get("status"), redis_snapshot.get("status")} else "healthy") + ), "summary": ( "MySQL / Redis 均正常" - if mysql_status == "healthy" and redis_status == "healthy" - else "存在基础设施连接异常" + if mysql_snapshot.get("status") == "healthy" and redis_snapshot.get("status") == "healthy" + else ( + "基础设施连接正常,但部分负载指标需要关注" + if mysql_snapshot.get("status") != "danger" and redis_snapshot.get("status") != "danger" + else "存在基础设施连接异常" + ) ), - "mysql": { - "status": mysql_status, - "summary": mysql_summary, - }, - "redis": { - "status": redis_status, - "summary": redis_summary, - }, + "mysql": mysql_snapshot, + "redis": redis_snapshot, }, "ai_runtime": { - "status": ai_status, - "summary": ai_summary, **ai_runtime, }, - "md2img": { - "status": md2img_status, - "healthy": md2img_healthy, - "runtime_ready": runtime_ready, - 
"browser_ready": browser_ready, - "summary": md2img_summary, - "detail": md2img_snapshot, + "scheduler": { + **scheduler_runtime, }, } }) diff --git a/admin/dashboard/templates/index.html b/admin/dashboard/templates/index.html index d14657d..a93d1c4 100644 --- a/admin/dashboard/templates/index.html +++ b/admin/dashboard/templates/index.html @@ -131,7 +131,7 @@

系统健康快照

-

把连接状态、插件运行、异常数量与转图运行时集中到一个面板里。

+

把连接状态、插件运行、异常数量、LLM 运行态与任务调度集中到一个面板里。

最近刷新 @@ -148,6 +148,29 @@
{% raw %}{{ card.value }}{% endraw %}
{% raw %}{{ card.summary }}{% endraw %}
+
+
+
+
+
{% raw %}{{ service.title }}{% endraw %}
+
{% raw %}{{ service.summary }}{% endraw %}
+
+ + {% raw %}{{ getHealthStatusText(service.status) }}{% endraw %} + +
+
+
+ {% raw %}{{ metric.label }}{% endraw %} + {% raw %}{{ metric.value }}{% endraw %} +
+
+
+
{% raw %}{{ card.extra }}{% endraw %}
@@ -371,15 +394,38 @@ status: 'warning', total_calls: 0, failed_calls: 0, + success_rate: 0, avg_latency_ms: 0, summary: '加载中...', - last_call: {} + last_call: {}, + scene_count: 0, + target_count: 0, + provider_count: 0, + has_routing: false, + default_scene: '', + default_backend: '', + last_provider: '', + last_backend: '', + last_scene: '', + last_model: '', + last_timestamp: '', + last_latency_ms: 0, + last_error: '' }, - md2img: { + scheduler: { status: 'warning', - healthy: false, - runtime_ready: false, - browser_ready: false, + total_jobs: 0, + enabled_jobs: 0, + running_jobs: 0, + failed_jobs: 0, + invalid_jobs: 0, + paused_jobs: 0, + never_run_jobs: 0, + system_job_count: 0, + plugin_job_count: 0, + next_run_at: '', + latest_failed_job_name: '', + latest_failed_error: '', summary: '加载中...' } }, @@ -423,7 +469,7 @@ const errors = this.healthSummary.errors || {}; const infrastructure = this.healthSummary.infrastructure || {}; const aiRuntime = this.healthSummary.ai_runtime || {}; - const md2img = this.healthSummary.md2img || {}; + const scheduler = this.healthSummary.scheduler || {}; return [ { key: 'robot', @@ -453,25 +499,30 @@ key: 'infrastructure', title: '基础设施', status: infrastructure.status || 'warning', - value: infrastructure.status === 'healthy' ? '正常' : '异常', + value: `${this.countHealthyInfrastructureServices(infrastructure)} / 2`, summary: infrastructure.summary || '暂无状态', - extra: `MySQL:${((infrastructure.mysql || {}).status === 'healthy') ? '正常' : '异常'} / Redis:${((infrastructure.redis || {}).status === 'healthy') ? '正常' : '异常'}` + serviceBlocks: this.buildInfrastructureServiceBlocks(infrastructure), + extra: '首页展示的是服务摘要;如果后续要做更深入的运维排查,再单独拆详细页会更合适。' }, { key: 'ai_runtime', - title: 'AI 运行态', + title: 'LLM 运行态', status: aiRuntime.status || 'warning', - value: `${aiRuntime.avg_latency_ms || 0} ms`, + value: (aiRuntime.total_calls || 0) > 0 + ? 
/**
 * Format a duration in seconds as a compact label.
 *
 * @param {number|string} seconds - Duration in seconds; parsed with parseInt.
 * @returns {string} '-' when the value is missing, unparsable or <= 0;
 *   otherwise "<d>D <h>H", "<h>H <m>M" or "<m>M" depending on magnitude.
 */
formatCompactDuration(seconds) {
    const totalSeconds = parseInt(seconds) || 0;
    if (totalSeconds <= 0) return '-';
    const days = Math.floor(totalSeconds / 86400);
    const hours = Math.floor((totalSeconds % 86400) / 3600);
    const minutes = Math.floor((totalSeconds % 3600) / 60);
    if (days > 0) return `${days}D ${hours}H`;
    if (hours > 0) return `${hours}H ${minutes}M`;
    return `${minutes}M`;
},
/**
 * Format a metric for display with a fixed number of decimals.
 *
 * @param {*} value - Raw metric value.
 * @param {number} [fractionDigits=0] - Decimals passed to toFixed.
 * @returns {string} '-' for null / undefined / empty string, the original
 *   value as a string when it is not numeric, otherwise the fixed-point text.
 */
formatMetricNumber(value, fractionDigits = 0) {
    if (value === null || value === undefined || value === '') return '-';
    const numeric = Number(value);
    if (Number.isNaN(numeric)) return String(value);
    return numeric.toFixed(fractionDigits);
},
/**
 * Count how many of the two infrastructure services report 'healthy'.
 *
 * @param {Object} infrastructure - Object with optional `mysql` and `redis` entries.
 * @returns {number} 0, 1 or 2.
 */
countHealthyInfrastructureServices(infrastructure) {
    const mysql = infrastructure.mysql || {};
    const redis = infrastructure.redis || {};
    let count = 0;
    if (mysql.status === 'healthy') count += 1;
    if (redis.status === 'healthy') count += 1;
    return count;
},
/**
 * Build the MySQL and Redis service panels for the infrastructure card.
 *
 * @param {Object} infrastructure - Object with optional `mysql` and `redis` snapshots.
 * @returns {Array<Object>} Two panels, each with key, title, status, summary
 *   and a list of { label, value } metrics. A missing status falls back to
 *   'warning'.
 */
buildInfrastructureServiceBlocks(infrastructure) {
    const mysql = infrastructure.mysql || {};
    const redis = infrastructure.redis || {};
    return [
        {
            key: 'mysql',
            title: 'MySQL',
            status: mysql.status || 'warning',
            summary: mysql.summary || '暂无状态',
            metrics: [
                { label: '连接负载', value: `${this.formatMetricNumber(mysql.connection_usage_percent, 1)}%` },
                { label: '连接数', value: `${this.formatMetricNumber(mysql.threads_connected)} / ${mysql.max_connections || '-'}` },
                { label: '运行线程', value: this.formatMetricNumber(mysql.threads_running) },
                { label: 'QPS', value: this.formatMetricNumber(mysql.questions_per_second, 2) },
                { label: '库体积', value: `${this.formatMetricNumber(mysql.schema_size_mb, 2)} MB` },
                { label: '表数量', value: this.formatMetricNumber(mysql.table_count) }
            ]
        },
        {
            key: 'redis',
            title: 'Redis',
            status: redis.status || 'warning',
            summary: redis.summary || '暂无状态',
            metrics: [
                { label: 'Key 数量', value: this.formatMetricNumber(redis.key_count) },
                { label: '客户端', value: this.formatMetricNumber(redis.connected_clients) },
                { label: 'OPS/s', value: this.formatMetricNumber(redis.ops_per_sec) },
                { label: '内存占用', value: redis.used_memory_human || '-' },
                { label: '命中率', value: `${this.formatMetricNumber(redis.hit_rate_percent, 1)}%` },
                { label: '运行时间', value: this.formatCompactDuration(redis.uptime_seconds) }
            ]
        }
    ];
},
/**
 * Build the "routing configuration" and "last call" panels for the LLM card.
 *
 * @param {Object} aiRuntime - The `ai_runtime` section of the health summary.
 * @returns {Array<Object>} Two panels with key, title, status, summary and metrics.
 */
buildAiRuntimeServiceBlocks(aiRuntime) {
    return [
        {
            key: 'ai-routing',
            title: '路由配置',
            status: aiRuntime.has_routing ? 'healthy' : 'warning',
            summary: aiRuntime.default_scene ? `默认场景:${aiRuntime.default_scene}` : '当前未设置默认场景',
            metrics: [
                { label: '场景数量', value: this.formatMetricNumber(aiRuntime.scene_count) },
                { label: '目标数量', value: this.formatMetricNumber(aiRuntime.target_count) },
                { label: 'Provider 模板', value: this.formatMetricNumber(aiRuntime.provider_count) },
                { label: '默认后端', value: aiRuntime.default_backend || '-' }
            ]
        },
        {
            key: 'ai-last-call',
            title: '最近调用',
            // 'warning' when any call failed or when no call was recorded; 'healthy' otherwise.
            status: (aiRuntime.failed_calls || 0) > 0 ? 'warning' : ((aiRuntime.total_calls || 0) > 0 ? 'healthy' : 'warning'),
            summary: aiRuntime.last_timestamp ? `最近一次记录时间:${aiRuntime.last_timestamp}` : '当前窗口内暂无调用记录',
            metrics: [
                { label: 'Provider', value: aiRuntime.last_provider || '-' },
                { label: 'Backend', value: aiRuntime.last_backend || '-' },
                { label: 'Scene', value: aiRuntime.last_scene || '-' },
                { label: '模型', value: aiRuntime.last_model || '-' },
                { label: '最近耗时', value: `${this.formatMetricNumber(aiRuntime.last_latency_ms, 2)} ms` },
                { label: '最近错误', value: aiRuntime.last_error || '无' }
            ]
        }
    ];
},
/**
 * Build the footer line of the LLM card: call count, failure count and
 * average latency.
 *
 * @param {Object} aiRuntime - The `ai_runtime` section of the health summary.
 * @returns {string} The formatted footer text.
 */
buildAiRuntimeExtra(aiRuntime) {
    return `最近调用 ${aiRuntime.total_calls || 0} 次,失败 ${aiRuntime.failed_calls || 0} 次,平均耗时 ${this.formatMetricNumber(aiRuntime.avg_latency_ms, 2)} ms`;
},
/**
 * Build the "job loading" and "execution status" panels for the scheduler card.
 *
 * @param {Object} scheduler - The `scheduler` section of the health summary.
 * @returns {Array<Object>} Two panels with key, title, status, summary and metrics.
 */
buildSchedulerServiceBlocks(scheduler) {
    return [
        {
            key: 'scheduler-overview',
            title: '任务装载',
            // Healthy as soon as at least one job is enabled.
            status: scheduler.enabled_jobs > 0 ? 'healthy' : 'warning',
            summary: scheduler.next_run_at ? `下一次执行:${scheduler.next_run_at}` : '当前没有可计算的下一次执行时间',
            metrics: [
                { label: '启用任务', value: this.formatMetricNumber(scheduler.enabled_jobs) },
                { label: '暂停任务', value: this.formatMetricNumber(scheduler.paused_jobs) },
                { label: '系统任务', value: this.formatMetricNumber(scheduler.system_job_count) },
                { label: '插件任务', value: this.formatMetricNumber(scheduler.plugin_job_count) }
            ]
        },
        {
            key: 'scheduler-runtime',
            title: '执行状态',
            status: scheduler.status || 'warning',
            summary: scheduler.latest_failed_job_name ? `最近失败任务:${scheduler.latest_failed_job_name}` : '当前未发现最近失败任务',
            metrics: [
                { label: '执行中', value: this.formatMetricNumber(scheduler.running_jobs) },
                { label: '失败任务', value: this.formatMetricNumber(scheduler.failed_jobs) },
                { label: '非法调度', value: this.formatMetricNumber(scheduler.invalid_jobs) },
                { label: '未执行过', value: this.formatMetricNumber(scheduler.never_run_jobs) }
            ]
        }
    ];
},
/**
 * Build the footer line of the scheduler card.
 *
 * @param {Object} scheduler - The `scheduler` section of the health summary.
 * @returns {string} The latest failure reason when one exists; otherwise the
 *   next run time, or a placeholder when no next run time is available.
 */
buildSchedulerExtra(scheduler) {
    if (scheduler.latest_failed_error) {
        return `最近失败原因:${scheduler.latest_failed_error}`;
    }
    return scheduler.next_run_at
        ? `下次执行时间:${scheduler.next_run_at}`
        : '当前暂无可用的下一次执行时间';
},
{ + color: #047857; + background: rgba(16, 185, 129, 0.12); + } + + .health-service-panel__badge--warning { + color: #b45309; + background: rgba(245, 158, 11, 0.14); + } + + .health-service-panel__badge--danger { + color: #b91c1c; + background: rgba(239, 68, 68, 0.14); + } + + .health-service-metrics { + display: grid; + grid-template-columns: repeat(2, minmax(0, 1fr)); + gap: 10px 12px; + } + + .health-service-metric { + display: flex; + flex-direction: column; + gap: 4px; + } + + .health-service-metric__label { + font-size: 11px; + color: #94a3b8; + } + + .health-service-metric__value { + font-size: 13px; + font-weight: 600; + color: #1e293b; + word-break: break-word; + } + .health-item__extra { margin-top: 12px; padding-top: 12px; @@ -1450,6 +1726,10 @@ .health-grid { grid-template-columns: 1fr; } + + .health-service-grid { + grid-template-columns: 1fr; + } } @media (max-width: 768px) { @@ -1559,6 +1839,10 @@ font-size: 24px; } + .health-service-metrics { + grid-template-columns: 1fr; + } + .chart-container--large, .chart-container--panel { height: 220px;