服务器监控

This commit is contained in:
liuwei
2025-05-27 09:06:21 +08:00
parent 108735fdad
commit 60b6b9f491
6 changed files with 234 additions and 1 deletions

149
admin/GlancesMonitor.py Normal file
View File

@@ -0,0 +1,149 @@
import time
import threading
import subprocess
import requests
class GlancesMonitor:
    """Poll a Glances web server and e-mail an alert when a metric is too high.

    The monitor can launch ``glances -w`` as a child process (``run``), then
    polls the Glances REST API (version 3 URL prefix) from a daemon thread and
    calls ``email_sender.send_email(recipient, subject, body)`` whenever a
    reading exceeds its threshold. Alerts are throttled per metric.
    """

    # Seconds to wait for a single Glances API response before giving up;
    # without a timeout a stalled server would block the monitor forever.
    REQUEST_TIMEOUT = 5
    # Seconds between two polling cycles when the previous one succeeded.
    POLL_INTERVAL = 10
    # Seconds to back off after a failed polling cycle.
    ERROR_BACKOFF = 60
    # Minimum seconds between two alert e-mails for the same metric.
    ALERT_INTERVAL = 3600
    # Seconds granted to the Glances child process to come up.
    STARTUP_WAIT = 2
    # Seconds granted to the Glances child process to exit after terminate().
    SHUTDOWN_TIMEOUT = 5

    def __init__(self, email_sender, host='localhost', port=61208,
                 cpu_threshold=80.0, load_threshold=None, io_threshold=80.0,
                 disk_usage_threshold=60.0, handle_threshold=20000,
                 recipient=None):
        """Initialise the Glances monitoring component.

        Args:
            email_sender: Object exposing ``send_email(recipient, subject,
                body)``; a truthy return value marks the alert as delivered.
            host (str): Glances host address.
            port (int): Glances web server port.
            cpu_threshold (float): CPU usage threshold (%).
            load_threshold (float): 1-minute load threshold. ``None`` means
                "CPU core count * 2"; an explicit ``0`` is honoured.
            io_threshold (float): Disk I/O threshold (MB/s).
            disk_usage_threshold (float): Filesystem usage threshold (%).
            handle_threshold (int): Threshold compared against the ``total``
                field of the ``processcount`` endpoint.
            recipient (str): Address that receives the alert e-mails.
        """
        self.host = host
        self.port = port
        # Built before anything queries the API: the core-count lookup below
        # needs it, and previously failed silently because it ran first.
        self.api_url = f"http://{self.host}:{self.port}/api/3"
        self.cpu_threshold = cpu_threshold
        self.io_threshold = io_threshold
        self.disk_usage_threshold = disk_usage_threshold
        self.handle_threshold = handle_threshold
        self.email_sender = email_sender
        self.recipient = recipient
        self.glances_process = None
        self.last_alert_times = {}
        self._running = False
        self._stop_event = threading.Event()
        self._monitor_thread = None

        # When the default is requested but Glances is not reachable yet, use
        # the 1-core fallback for now and let run() retry once the server is up.
        self._load_threshold_pending = False
        if load_threshold is None:
            cores = self._query_cpu_count()
            self._load_threshold_pending = cores is None
            load_threshold = (cores or 1) * 2
        self.load_threshold = load_threshold

    def _fetch(self, endpoint):
        """Return the decoded JSON payload of ``<api_url>/<endpoint>``.

        Raises:
            requests.RequestException: On connection errors, timeouts or a
                non-2xx HTTP status.
        """
        response = requests.get(f"{self.api_url}/{endpoint}",
                                timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()
        return response.json()

    def _query_cpu_count(self):
        """Return the core count reported by Glances, or ``None`` if unknown."""
        try:
            stats = self._fetch("cpu")
            # Glances 3 publishes the core count as 'cpucore'; 'count' is
            # still consulted for payloads that use the older key.
            return stats.get('cpucore') or stats.get('count') or None
        except Exception:
            # Deliberate best effort: any failure just means "unknown".
            return None

    def get_cpu_count(self):
        """Return the number of CPU cores, falling back to 1 when unknown."""
        return self._query_cpu_count() or 1

    def start_glances(self):
        """Start the Glances web server as a child process.

        Does nothing when a previously started child is still alive.

        Raises:
            FileNotFoundError, subprocess.CalledProcessError: Glances is not
                installed or ``glances --version`` failed.
            RuntimeError: The child exited during the start-up grace period.
        """
        if self.glances_process is not None and self.glances_process.poll() is None:
            return
        try:
            subprocess.run(['glances', '--version'], check=True,
                           stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
            # Output is discarded rather than piped: nobody reads the pipes,
            # and a full pipe buffer would eventually block the child.
            self.glances_process = subprocess.Popen(
                ['glances', '-w', '--port', str(self.port)],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL
            )
            time.sleep(self.STARTUP_WAIT)
            if self.glances_process.poll() is not None:
                raise RuntimeError("Glances 启动失败")
            print(f"Glances Web 服务已启动: http://{self.host}:{self.port}")
        except (FileNotFoundError, subprocess.CalledProcessError):
            # A missing executable surfaces as FileNotFoundError, not as
            # CalledProcessError, so both map to the "not installed" hint.
            print("错误: Glances 未安装。请运行: python3.11 -m pip install glances")
            raise
        except Exception as e:
            print(f"启动 Glances 失败: {e}")
            raise

    def stop_glances(self):
        """Stop the Glances child process, killing it if it ignores terminate()."""
        process = self.glances_process
        if process is None:
            return
        process.terminate()
        try:
            process.wait(timeout=self.SHUTDOWN_TIMEOUT)
        except subprocess.TimeoutExpired:
            process.kill()
            process.wait()
        self.glances_process = None
        print("Glances Web 服务已停止")

    def send_alert_email(self, metric, value, threshold):
        """Send an alert e-mail for ``metric``, at most once per ALERT_INTERVAL.

        Nothing is sent when no sender or recipient is configured. The
        throttle timestamp is only updated when the sender reports success, so
        a failed delivery is retried on the next breach.
        """
        if not self.email_sender or not self.recipient:
            return
        current_time = time.time()
        last_alert_time = self.last_alert_times.get(metric, 0)
        if current_time - last_alert_time < self.ALERT_INTERVAL:
            return
        subject = f"服务器告警: {metric} 过高"
        body = f"警告: {metric} 当前值为 {value},超过阈值 {threshold}\n时间: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}"
        if self.email_sender.send_email(self.recipient, subject, body):
            self.last_alert_times[metric] = current_time

    def _check_cpu(self):
        """Alert when total CPU usage exceeds ``cpu_threshold``."""
        cpu_usage = self._fetch("cpu/total").get('total', 0)
        if cpu_usage > self.cpu_threshold:
            self.send_alert_email("CPU 使用率", cpu_usage, self.cpu_threshold)

    def _check_load(self):
        """Alert when the 1-minute load average exceeds ``load_threshold``."""
        load_avg = self._fetch("load").get('min1', 0)
        if load_avg > self.load_threshold:
            self.send_alert_email("系统负载1分钟", load_avg, self.load_threshold)

    @staticmethod
    def _disk_io_rate_mb(disk):
        """Return combined read+write throughput of one disk entry in MB/s.

        Glances 3 reports byte counts accumulated since its previous refresh
        together with ``time_since_update``; dividing by that interval yields
        a per-second rate. A missing or zero interval is treated as 1 second.
        """
        transferred = disk.get('read_bytes', 0) + disk.get('write_bytes', 0)
        elapsed = disk.get('time_since_update') or 1
        return transferred / elapsed / (1024 * 1024)

    def _check_disk_io(self):
        """Alert when the busiest disk exceeds ``io_threshold`` (MB/s)."""
        disks = self._fetch("diskio")
        max_io_usage = max((self._disk_io_rate_mb(disk) for disk in disks),
                           default=0)
        if max_io_usage > self.io_threshold:
            self.send_alert_email("磁盘 I/OMB/s", max_io_usage, self.io_threshold)

    def _check_filesystems(self):
        """Alert for every filesystem fuller than ``disk_usage_threshold``."""
        for fs in self._fetch("fs"):
            disk_usage = fs.get('percent', 0)
            if disk_usage > self.disk_usage_threshold:
                self.send_alert_email(f"磁盘占用 ({fs.get('mnt_point')})", disk_usage,
                                      self.disk_usage_threshold)

    def _check_process_count(self):
        """Alert when the ``processcount`` total exceeds ``handle_threshold``."""
        # NOTE(review): the alert is labelled as a handle count, but the value
        # comes from the 'processcount' endpoint -- confirm this is intended.
        handle_count = self._fetch("processcount").get('total', 0)
        if handle_count > self.handle_threshold:
            self.send_alert_email("句柄数", handle_count, self.handle_threshold)

    def monitor(self):
        """Poll all metrics in a loop until the monitor is stopped.

        A failure in any check aborts the current cycle and backs off for
        ERROR_BACKOFF seconds; a clean cycle is repeated after POLL_INTERVAL.
        """
        while self._running and not self._stop_event.is_set():
            try:
                self._check_cpu()
                self._check_load()
                self._check_disk_io()
                self._check_filesystems()
                self._check_process_count()
                delay = self.POLL_INTERVAL
            except requests.RequestException as e:
                print(f"连接 Glances API 失败: {e}")
                delay = self.ERROR_BACKOFF
            except Exception as e:
                print(f"监控错误: {e}")
                delay = self.ERROR_BACKOFF
            # Interruptible sleep: returns as soon as stop() sets the event.
            self._stop_event.wait(delay)

    def run(self):
        """Start Glances and the monitoring thread (non-blocking).

        Calling it again while the monitoring thread is alive is a no-op.
        """
        if self._monitor_thread is not None and self._monitor_thread.is_alive():
            return
        self.start_glances()
        # The default load threshold could not be derived at construction time
        # if Glances was not running yet; retry now that it has been started.
        if self._load_threshold_pending:
            cores = self._query_cpu_count()
            if cores:
                self.load_threshold = cores * 2
                self._load_threshold_pending = False
        # Only flag the monitor as running once Glances actually started.
        self._stop_event.clear()
        self._running = True
        self._monitor_thread = threading.Thread(target=self.monitor, daemon=True)
        self._monitor_thread.start()

    def stop(self):
        """Stop the monitoring thread and the Glances child process."""
        self._running = False
        self._stop_event.set()
        thread = self._monitor_thread
        if thread is not None and thread is not threading.current_thread():
            # Bounded wait so an in-flight request cannot stall shutdown.
            thread.join(timeout=self.REQUEST_TIMEOUT + 1)
        self._monitor_thread = None
        self.stop_glances()