diff --git a/admin/GlancesMonitor.py b/admin/GlancesMonitor.py deleted file mode 100644 index 5abb2a3..0000000 --- a/admin/GlancesMonitor.py +++ /dev/null @@ -1,157 +0,0 @@ -import time -import threading -import subprocess -import requests - -from loguru import logger - - -class GlancesMonitor: - def __init__(self, email_sender, host='192.168.2.170', port=61208, - cpu_threshold=80.0, load_threshold=None, io_threshold=80.0, - disk_usage_threshold=80.0, handle_threshold=20000, - monitor_interval=30, recipient=None): - """初始化 Glances 监控组件 - - Args: - email_sender: 已初始化的 EmailSender 实例 - host (str): Glances 主机地址 - port (int): Glances Web 服务端口 - cpu_threshold (float): CPU 使用率阈值 (%) - load_threshold (float): 系统负载阈值(默认 CPU 核心数 * 2) - io_threshold (float): 磁盘 I/O 阈值(MB/s) - disk_usage_threshold (float): 磁盘占用阈值 (%) - handle_threshold (int): 句柄数阈值 - recipient (str): 告警邮件接收者 - """ - self.host = host - self.port = port - self.api_url = f"http://{self.host}:{self.port}/api/4" - self.cpu_threshold = cpu_threshold - self.load_threshold = load_threshold or (self.get_cpu_count() * 2) - self.io_threshold = io_threshold - self.disk_usage_threshold = disk_usage_threshold - self.handle_threshold = handle_threshold - self.email_sender = email_sender - self.recipient = recipient - self.glances_process = None - self.last_alert_times = {} - self._running = False - self.monitor_interval = monitor_interval - self._loop_index = 0 - - def get_cpu_count(self): - """获取 CPU 核心数""" - try: - response = requests.get(f"{self.api_url}/cpu") - response.raise_for_status() - return response.json().get('count', 1) - except Exception as e: - logger.error(e) - return 1 - - def start_glances(self): - """启动 Glances Web 服务""" - try: - subprocess.run(['glances', '--version'], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - self.glances_process = subprocess.Popen( - ['glances', '-w', f'--port', str(self.port)], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE - ) - time.sleep(2) - if self.glances_process.poll() is not None: - raise RuntimeError("Glances 启动失败") - logger.info(f"Glances Web 服务已启动: http://{self.host}:{self.port}") - except subprocess.CalledProcessError: - logger.error("错误: Glances 未安装。请运行: python3.11 -m pip install glances") - raise - except Exception as e: - logger.error(f"启动 Glances 失败: {e}") - raise - - def stop_glances(self): - """停止 Glances 服务""" - if self.glances_process: - self.glances_process.terminate() - self.glances_process.wait() - logger.error("Glances Web 服务已停止") - - def send_alert_email(self, metric, value, threshold): - """发送告警邮件,限制每小时一次""" - if not self.email_sender or not self.recipient: - return - current_time = time.time() - last_alert_time = self.last_alert_times.get(metric, 0) - if current_time - last_alert_time < 3600: - return - subject = f"服务器告警: {metric} 过高" - body = f"警告: {metric} 当前值为 {value},超过阈值 {threshold}!\n时间: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}" - if self.email_sender.send_email(self.recipient, subject, body): - self.last_alert_times[metric] = current_time - - def monitor(self): - """监控服务器指标并触发告警""" - while self._running: - try: - self._loop_index += 1 - - response = requests.get(f"{self.api_url}/cpu/total") - response.raise_for_status() - cpu_usage = response.json().get('total', 0) - if cpu_usage > self.cpu_threshold: - self.send_alert_email("CPU 使用率", cpu_usage, self.cpu_threshold) - - response = requests.get(f"{self.api_url}/load") - response.raise_for_status() - load_avg = response.json().get('min1', 0) - if load_avg > self.load_threshold: - self.send_alert_email("系统负载(1分钟)", load_avg, self.load_threshold) - - if self._loop_index % 6 == 0: - response = requests.get(f"{self.api_url}/diskio") - response.raise_for_status() - disks = response.json() - max_io_usage = 0 - for disk in disks: - read_bytes = disk.get('read_bytes', 0) - write_bytes = disk.get('write_bytes', 0) - io_usage = (read_bytes + write_bytes) / (2048 * 1024) - max_io_usage = max(max_io_usage, io_usage) - if max_io_usage > self.io_threshold: - self.send_alert_email("磁盘 I/O(MB/s)", max_io_usage, self.io_threshold) - - response = requests.get(f"{self.api_url}/fs") - response.raise_for_status() - filesystems = response.json() - for fs in filesystems: - disk_usage = fs.get('percent', 0) - if disk_usage > self.disk_usage_threshold: - self.send_alert_email(f"磁盘占用 ({fs.get('mnt_point')})", disk_usage, - self.disk_usage_threshold) - - response = requests.get(f"{self.api_url}/processcount") - response.raise_for_status() - handle_count = response.json().get('total', 0) - if handle_count > self.handle_threshold: - self.send_alert_email("句柄数", handle_count, self.handle_threshold) - - time.sleep(self.monitor_interval) - except requests.RequestException as e: - logger.error(f"连接 Glances API 失败: {e}") - time.sleep(60) - except Exception as e: - logger.error(f"监控错误: {e}") - time.sleep(60) - - def run(self): - """启动 Glances 服务和监控线程(非阻塞)""" - self._running = True - self.start_glances() - monitor_thread = threading.Thread(target=self.monitor, daemon=True) - monitor_thread.start() - - def stop(self): - """停止 Glances 服务和监控""" - self._running = False - self.stop_glances() diff --git a/admin/dashboard/blueprints/system.py b/admin/dashboard/blueprints/system.py index 072d8a4..21e0efe 100644 --- a/admin/dashboard/blueprints/system.py +++ b/admin/dashboard/blueprints/system.py @@ -4,6 +4,7 @@ from loguru import logger import os import time import subprocess +import socket from datetime import datetime import platform import psutil @@ -23,6 +24,16 @@ system_bp = Blueprint('system', __name__) # 记录应用启动时间 APP_START_TIME = time.time() +# 记录最近一次网络计数器采样,用于在资源监控页估算上/下行速率。 +# 这里故意只做“页面级轻量采样”: +# 1. 不起额外守护线程,避免为了展示速率再引入常驻后台任务; +# 2. 只有用户刷新/轮询资源页时才计算速率,开销接近于零; +# 3. 即便进程重启缓存丢失,也只会让第一次速率显示为 0,不影响整体可用性。 +NETWORK_IO_SAMPLE = { + "timestamp": 0.0, + "bytes_sent": 0, + "bytes_recv": 0, +} def _system_config_path() -> str: @@ -68,6 +79,178 @@ def _format_bytes_to_mb(value: int) -> float: return round((_safe_float(value, 0.0) / 1024 / 1024), 2) +def _safe_divide(numerator: float, denominator: float, default: float = 0.0) -> float: + """安全除法,避免速率与占比计算时被 0 除打断。""" + try: + if not denominator: + return default + return numerator / denominator + except Exception: + return default + + +def _primary_disk_path() -> str: + """返回当前系统最稳妥的主盘路径。""" + # 资源监控页既要兼容你本地 Windows 开发环境,也要兼容线上 Linux: + # 1. 优先用系统根目录,Linux 下是 /; + # 2. Windows 下会自动变成当前盘符根路径; + # 3. 避免把磁盘路径硬编码成 /,导致本地调试时报错。 + return os.path.abspath(os.sep) + + +def _format_datetime_text(timestamp_value: float | int | None) -> str: + """把时间戳格式化为后台页面可直接展示的文本。""" + if not timestamp_value: + return "-" + try: + return datetime.fromtimestamp(float(timestamp_value)).strftime("%Y-%m-%d %H:%M:%S") + except Exception: + return "-" + + +def _sample_network_speed() -> dict: + """根据两次页面采样估算网络上下行速率。""" + counters = psutil.net_io_counters() + now = time.time() + current_sent = _safe_int(getattr(counters, "bytes_sent", 0)) + current_recv = _safe_int(getattr(counters, "bytes_recv", 0)) + last_timestamp = _safe_float(NETWORK_IO_SAMPLE.get("timestamp")) + elapsed = max(now - last_timestamp, 0.0) + + upload_speed = 0.0 + download_speed = 0.0 + if elapsed > 0 and last_timestamp > 0: + upload_speed = _safe_divide(current_sent - _safe_int(NETWORK_IO_SAMPLE.get("bytes_sent")), elapsed, 0.0) + download_speed = _safe_divide(current_recv - _safe_int(NETWORK_IO_SAMPLE.get("bytes_recv")), elapsed, 0.0) + upload_speed = max(upload_speed, 0.0) + download_speed = max(download_speed, 0.0) + + NETWORK_IO_SAMPLE["timestamp"] = now + NETWORK_IO_SAMPLE["bytes_sent"] = current_sent + NETWORK_IO_SAMPLE["bytes_recv"] = current_recv + + return { + "bytes_sent": current_sent, + "bytes_recv": current_recv, + "upload_speed_bps": round(upload_speed, 2), + "download_speed_bps": round(download_speed, 2), + } + + +def _extract_server_runtime_snapshot() -> dict: + """构建资源监控页使用的轻量服务器运行态快照。""" + # 这套快照有意只覆盖“日常观察最有价值”的内容: + # 1. 主机资源:CPU / 内存 / 磁盘 / 网络; + # 2. 应用进程:当前 ABOT 进程是否活着、吃了多少资源; + # 3. 基础设施:MySQL / Redis 继续复用现有摘要探测; + # 4. 不再依赖 glances 进程,部署和运维负担会轻很多。 + server = current_app.dashboard_server + current_process = psutil.Process(os.getpid()) + virtual_memory = psutil.virtual_memory() + swap_memory = psutil.swap_memory() + cpu_usage = psutil.cpu_percent(interval=None) + process_cpu_usage = current_process.cpu_percent(interval=None) + boot_time = psutil.boot_time() + network_sample = _sample_network_speed() + disk_io = psutil.disk_io_counters() + try: + load_values = os.getloadavg() + except (AttributeError, OSError): + load_values = (0.0, 0.0, 0.0) + + disk_items = [] + seen_mountpoints = set() + for partition in psutil.disk_partitions(all=False): + mountpoint = str(getattr(partition, "mountpoint", "") or "").strip() + if not mountpoint or mountpoint in seen_mountpoints: + continue + seen_mountpoints.add(mountpoint) + try: + usage = psutil.disk_usage(mountpoint) + except Exception: + continue + disk_items.append({ + "device": str(getattr(partition, "device", "") or "").strip() or mountpoint, + "mountpoint": mountpoint, + "fstype": str(getattr(partition, "fstype", "") or "").strip(), + "total_bytes": _safe_int(getattr(usage, "total", 0)), + "used_bytes": _safe_int(getattr(usage, "used", 0)), + "free_bytes": _safe_int(getattr(usage, "free", 0)), + "usage_percent": round(_safe_float(getattr(usage, "percent", 0.0)), 1), + }) + disk_items.sort(key=lambda item: item.get("usage_percent", 0.0), reverse=True) + + primary_disk_usage = psutil.disk_usage(_primary_disk_path()) + process_memory = current_process.memory_info() + try: + open_files = len(current_process.open_files()) + except Exception: + open_files = 0 + + try: + tcp_connections = current_process.connections(kind="inet") + established_connections = sum( + 1 for conn in tcp_connections if str(getattr(conn, "status", "") or "").upper() == "ESTABLISHED" + ) + except Exception: + established_connections = 0 + + return { + "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + "server": { + "hostname": socket.gethostname(), + "os": platform.system(), + "os_version": platform.version(), + "python_version": platform.python_version(), + "boot_time": _format_datetime_text(boot_time), + "uptime_seconds": round(max(time.time() - boot_time, 0), 2), + }, + "cpu": { + "usage_percent": round(cpu_usage, 1), + "logical_count": psutil.cpu_count(logical=True) or 0, + "physical_count": psutil.cpu_count(logical=False) or 0, + "load_1": round(_safe_float(load_values[0]), 2), + "load_5": round(_safe_float(load_values[1]), 2), + "load_15": round(_safe_float(load_values[2]), 2), + }, + "memory": { + "usage_percent": round(_safe_float(getattr(virtual_memory, "percent", 0.0)), 1), + "total_bytes": _safe_int(getattr(virtual_memory, "total", 0)), + "used_bytes": _safe_int(getattr(virtual_memory, "used", 0)), + "available_bytes": _safe_int(getattr(virtual_memory, "available", 0)), + "swap_usage_percent": round(_safe_float(getattr(swap_memory, "percent", 0.0)), 1), + "swap_total_bytes": _safe_int(getattr(swap_memory, "total", 0)), + "swap_used_bytes": _safe_int(getattr(swap_memory, "used", 0)), + }, + "disk": { + "primary_usage_percent": round(_safe_float(getattr(primary_disk_usage, "percent", 0.0)), 1), + "primary_total_bytes": _safe_int(getattr(primary_disk_usage, "total", 0)), + "primary_used_bytes": _safe_int(getattr(primary_disk_usage, "used", 0)), + "io_read_bytes": _safe_int(getattr(disk_io, "read_bytes", 0)) if disk_io else 0, + "io_write_bytes": _safe_int(getattr(disk_io, "write_bytes", 0)) if disk_io else 0, + "items": disk_items[:8], + }, + "network": { + **network_sample, + "established_connections": established_connections, + }, + "process": { + "pid": current_process.pid, + "cpu_percent": round(process_cpu_usage, 1), + "memory_percent": round(current_process.memory_percent(), 2), + "memory_rss_bytes": _safe_int(getattr(process_memory, "rss", 0)), + "thread_count": current_process.num_threads(), + "open_files": open_files, + "create_time": _format_datetime_text(current_process.create_time()), + "uptime_seconds": round(max(time.time() - current_process.create_time(), 0), 2), + }, + "infrastructure": { + "mysql": _extract_mysql_runtime_snapshot(server.db_manager), + "redis": _extract_redis_runtime_snapshot(server.db_manager), + }, + } + + def _extract_mysql_runtime_snapshot(db_manager) -> dict: """采集 MySQL 运行态摘要。 @@ -761,17 +944,11 @@ def api_docs(): @system_bp.route('/system_status') @login_required def system_status(): - src = request.args.get('src') - if not src: - try: - server = current_app.dashboard_server - glances = getattr(server.robot, "config").glances if hasattr(server.robot, "config") else {} - host = glances.get("host", "127.0.0.1") - port = glances.get("port", 61208) - src = f"http://{host}:{port}/" - except Exception: - src = "http://127.0.0.1:61208/" - return render_template('system_status.html', src_url=src) + # 资源监控页改为项目内置轻量面板: + # 1. 不再依赖 glances 独立进程; + # 2. 页面只消费当前服务自身的 API; + # 3. 线上部署时不用额外开放 61208 之类的端口。 + return render_template('system_status.html') @system_bp.route('/system_llm') @@ -811,6 +988,20 @@ def api_system_info(): return jsonify({"success": False, "error": str(e)}), 500 +@system_bp.route('/api/system_status_overview') +@login_required +def api_system_status_overview(): + """返回资源监控页使用的轻量服务器快照。""" + try: + return jsonify({ + "success": True, + "data": _extract_server_runtime_snapshot(), + }) + except Exception as e: + logger.error(f"获取资源监控快照失败: {e}") + return jsonify({"success": False, "error": str(e)}), 500 + + @system_bp.route('/api/system_health_summary') @login_required def api_system_health_summary(): diff --git a/admin/dashboard/templates/system_status.html b/admin/dashboard/templates/system_status.html index 7799ba1..f5a3e69 100644 --- a/admin/dashboard/templates/system_status.html +++ b/admin/dashboard/templates/system_status.html @@ -6,9 +6,23 @@
直接在后台查看系统资源变化与运行状态,保持监控入口简洁清晰。
+直接观察 ABOT 所在服务器的关键资源、应用进程和基础设施状态,不再依赖额外的 glances 进程。
+直接在控制台内查看系统资源变化与运行状态。
+观察主机、Python 运行环境和负载情况,快速判断是不是机器层面的问题。
+确认当前应用自身的资源占用,避免只看主机而忽略进程级热点。
+把数据库和缓存的关键摘要放在同一屏里,日常看状态不需要再跳出去。
+按使用率排序展示常用挂载点,方便快速发现哪个分区快满了。
+