diff --git a/admin/dashboard/blueprints/system.py b/admin/dashboard/blueprints/system.py index 1a372d8..37db1ff 100644 --- a/admin/dashboard/blueprints/system.py +++ b/admin/dashboard/blueprints/system.py @@ -42,203 +42,6 @@ def _save_system_yaml(config_obj: dict) -> None: yaml.safe_dump(config_obj, f, allow_unicode=True, sort_keys=False) -def _safe_int(value, default: int = 0) -> int: - """把数据库 / Redis 返回的字符串数字安全转成整数。""" - try: - if value in (None, ""): - return default - return int(float(value)) - except (TypeError, ValueError): - return default - - -def _safe_float(value, default: float = 0.0) -> float: - """把数据库 / Redis 返回的值安全转成浮点数。""" - try: - if value in (None, ""): - return default - return float(value) - except (TypeError, ValueError): - return default - - -def _format_bytes_to_mb(value: int) -> float: - """把字节数转换为 MB,保留两位小数便于首页摘要展示。""" - return round((_safe_float(value, 0.0) / 1024 / 1024), 2) - - -def _extract_mysql_runtime_snapshot(db_manager) -> dict: - """采集 MySQL 运行态摘要。 - - 首页目标不是替代 DBA 工具,而是让管理员一眼判断: - 1. 数据库是不是活着; - 2. 当前连接压力高不高; - 3. 当前库规模是否已经明显变大; - 4. 有没有必要继续深入到更专业的监控页排查。 - """ - snapshot = { - "status": "healthy", - "summary": "连接正常", - "database": db_manager.get_mysql_database_name(), - "version": "", - "threads_connected": 0, - "threads_running": 0, - "max_connections": 0, - "connection_usage_percent": 0.0, - "questions_per_second": 0.0, - "uptime_seconds": 0, - "table_count": 0, - "schema_size_mb": 0.0, - "slow_query_threshold_ms": db_manager.get_slow_query_threshold_ms(), - } - - mysql_conn = db_manager.get_mysql_connection() - try: - with mysql_conn.cursor(dictionary=True) as cursor: - # 基础探活与版本识别: - # 1. SELECT VERSION() 成本极低; - # 2. 相比只做 SELECT 1,它还能顺便拿到版本信息; - # 3. 首页卡片里显示版本,方便线上排查“是不是某台库版本不一致”。 - cursor.execute("SELECT VERSION() AS version, DATABASE() AS database_name") - version_row = cursor.fetchone() or {} - snapshot["version"] = str(version_row.get("version") or "").strip() - snapshot["database"] = str(version_row.get("database_name") or snapshot["database"] or "").strip() - - cursor.execute( - """ - SHOW GLOBAL STATUS - WHERE Variable_name IN ('Threads_connected', 'Threads_running', 'Questions', 'Uptime') - """ - ) - status_rows = cursor.fetchall() or [] - status_map = { - str(row.get("Variable_name") or "").strip(): row.get("Value") - for row in status_rows - } - - cursor.execute( - """ - SHOW GLOBAL VARIABLES - WHERE Variable_name IN ('max_connections') - """ - ) - variable_rows = cursor.fetchall() or [] - variable_map = { - str(row.get("Variable_name") or "").strip(): row.get("Value") - for row in variable_rows - } - - # information_schema 聚合虽然比 SELECT 1 重一点,但仍属于轻量级元信息查询: - # 1. 只在首页 30 秒级刷新一次,成本可接受; - # 2. 能直接给出当前业务库表数量与体量变化; - # 3. 对判断“是不是消息表膨胀导致后台变慢”很有帮助。 - cursor.execute( - """ - SELECT - COUNT(*) AS table_count, - COALESCE(SUM(data_length + index_length), 0) AS schema_size_bytes - FROM information_schema.tables - WHERE table_schema = DATABASE() - """ - ) - schema_row = cursor.fetchone() or {} - - snapshot["threads_connected"] = _safe_int(status_map.get("Threads_connected")) - snapshot["threads_running"] = _safe_int(status_map.get("Threads_running")) - snapshot["max_connections"] = _safe_int(variable_map.get("max_connections")) - snapshot["uptime_seconds"] = _safe_int(status_map.get("Uptime")) - total_questions = _safe_int(status_map.get("Questions")) - if snapshot["uptime_seconds"] > 0: - snapshot["questions_per_second"] = round(total_questions / snapshot["uptime_seconds"], 2) - if snapshot["max_connections"] > 0: - snapshot["connection_usage_percent"] = round( - (snapshot["threads_connected"] / snapshot["max_connections"]) * 100, - 1, - ) - snapshot["table_count"] = _safe_int(schema_row.get("table_count")) - snapshot["schema_size_mb"] = _format_bytes_to_mb(schema_row.get("schema_size_bytes")) - - if snapshot["connection_usage_percent"] >= 80 or snapshot["threads_running"] >= 12: - snapshot["status"] = "warning" - snapshot["summary"] = ( - f"连接压力偏高:已连接 {snapshot['threads_connected']} / {snapshot['max_connections']}," - f"运行中线程 {snapshot['threads_running']}" - ) - else: - snapshot["summary"] = ( - f"连接正常:已连接 {snapshot['threads_connected']} / {snapshot['max_connections'] or '-'}," - f"QPS {snapshot['questions_per_second']}" - ) - return snapshot - except Exception as mysql_error: - snapshot["status"] = "danger" - snapshot["summary"] = f"MySQL 探测失败: {mysql_error}" - return snapshot - finally: - mysql_conn.close() - - -def _extract_redis_runtime_snapshot(db_manager) -> dict: - """采集 Redis 运行态摘要。""" - redis_config = getattr(db_manager, "redis_config", {}) or {} - snapshot = { - "status": "healthy", - "summary": "连接正常", - "db_index": _safe_int(redis_config.get("db", 0)), - "key_count": 0, - "connected_clients": 0, - "blocked_clients": 0, - "ops_per_sec": 0, - "used_memory_human": "", - "used_memory_peak_human": "", - "memory_usage_percent": 0.0, - "uptime_seconds": 0, - "hit_rate_percent": 0.0, - } - - try: - redis_conn = db_manager.get_redis_connection() - redis_conn.ping() - info = redis_conn.info() or {} - snapshot["key_count"] = _safe_int(redis_conn.dbsize()) - snapshot["connected_clients"] = _safe_int(info.get("connected_clients")) - snapshot["blocked_clients"] = _safe_int(info.get("blocked_clients")) - snapshot["ops_per_sec"] = _safe_int(info.get("instantaneous_ops_per_sec")) - snapshot["used_memory_human"] = str(info.get("used_memory_human") or "").strip() - snapshot["used_memory_peak_human"] = str(info.get("used_memory_peak_human") or "").strip() - snapshot["uptime_seconds"] = _safe_int(info.get("uptime_in_seconds")) - - maxmemory = _safe_int(info.get("maxmemory")) - used_memory = _safe_int(info.get("used_memory")) - if maxmemory > 0: - snapshot["memory_usage_percent"] = round((used_memory / maxmemory) * 100, 1) - - keyspace_hits = _safe_int(info.get("keyspace_hits")) - keyspace_misses = _safe_int(info.get("keyspace_misses")) - if (keyspace_hits + keyspace_misses) > 0: - snapshot["hit_rate_percent"] = round( - (keyspace_hits / (keyspace_hits + keyspace_misses)) * 100, - 1, - ) - - if snapshot["blocked_clients"] > 0 or snapshot["memory_usage_percent"] >= 80: - snapshot["status"] = "warning" - snapshot["summary"] = ( - f"缓存压力需关注:keys {snapshot['key_count']}," - f"clients {snapshot['connected_clients']},ops/s {snapshot['ops_per_sec']}" - ) - else: - snapshot["summary"] = ( - f"缓存正常:keys {snapshot['key_count']}," - f"clients {snapshot['connected_clients']},ops/s {snapshot['ops_per_sec']}" - ) - return snapshot - except Exception as redis_error: - snapshot["status"] = "danger" - snapshot["summary"] = f"Redis 探测失败: {redis_error}" - return snapshot - - def _legacy_llm_to_catalog(legacy_llm: dict) -> dict: """把旧 llm(backends/scenes) 结构转换为新目录结构(仅用于兜底展示)。 @@ -602,11 +405,31 @@ def api_system_health_summary(): _, recent_error_count = server.stats_db.get_error_logs(days=1, page=1, limit=1) # 基础设施健康: - # 1. MySQL / Redis 都在这里做“首页摘要级”探测,而不是完整深度巡检; - # 2. 除了连通性,还补充少量负载指标,方便管理员快速判断是否需要继续下钻; + # 1. MySQL 用最轻量的 SELECT 1 做可用性探测; + # 2. Redis 用 PING 验证连接池当前是否可拿到可用连接; # 3. 即使探测失败也只反馈到看板,不影响主接口整体返回。 - mysql_snapshot = _extract_mysql_runtime_snapshot(server.db_manager) - redis_snapshot = _extract_redis_runtime_snapshot(server.db_manager) + mysql_status = "healthy" + mysql_summary = "连接正常" + try: + mysql_conn = server.db_manager.get_mysql_connection() + try: + with mysql_conn.cursor() as cursor: + cursor.execute("SELECT 1") + cursor.fetchone() + finally: + mysql_conn.close() + except Exception as mysql_error: + mysql_status = "danger" + mysql_summary = f"MySQL 探测失败: {mysql_error}" + + redis_status = "healthy" + redis_summary = "连接正常" + try: + redis_conn = server.db_manager.get_redis_connection() + redis_conn.ping() + except Exception as redis_error: + redis_status = "danger" + redis_summary = f"Redis 探测失败: {redis_error}" # md2img 健康快照已经有现成实现,这里只做聚合,不主动预热运行时。 md2img_snapshot = get_md2img_health_snapshot(ensure_runtime=False) or {} @@ -701,22 +524,20 @@ def api_system_health_summary(): "summary": error_summary, }, "infrastructure": { - "status": ( - "danger" - if "danger" in {mysql_snapshot.get("status"), redis_snapshot.get("status")} - else ("warning" if "warning" in {mysql_snapshot.get("status"), redis_snapshot.get("status")} else "healthy") - ), + "status": "healthy" if mysql_status == "healthy" and redis_status == "healthy" else "danger", "summary": ( "MySQL / Redis 均正常" - if mysql_snapshot.get("status") == "healthy" and redis_snapshot.get("status") == "healthy" - else ( - "基础设施连接正常,但部分负载指标需要关注" - if mysql_snapshot.get("status") != "danger" and redis_snapshot.get("status") != "danger" - else "存在基础设施连接异常" - ) + if mysql_status == "healthy" and redis_status == "healthy" + else "存在基础设施连接异常" ), - "mysql": mysql_snapshot, - "redis": redis_snapshot, + "mysql": { + "status": mysql_status, + "summary": mysql_summary, + }, + "redis": { + "status": redis_status, + "summary": redis_summary, + }, }, "ai_runtime": { "status": ai_status, diff --git a/admin/dashboard/templates/index.html b/admin/dashboard/templates/index.html index 84344f1..d14657d 100644 --- a/admin/dashboard/templates/index.html +++ b/admin/dashboard/templates/index.html @@ -148,29 +148,6 @@