153 lines
6.4 KiB
Python
153 lines
6.4 KiB
Python
import time
|
||
import threading
|
||
import subprocess
|
||
import requests
|
||
|
||
from loguru import logger
|
||
|
||
|
||
class GlancesMonitor:
    """Run a local Glances web server and poll its REST API for alert conditions.

    The monitor checks CPU usage, 1-minute load, disk I/O, filesystem usage
    and process count, and sends a rate-limited e-mail through the supplied
    ``email_sender`` whenever a value exceeds its threshold.
    """

    # Seconds to wait for a single Glances API response; without a timeout a
    # stalled connection would block the monitor thread forever.
    REQUEST_TIMEOUT = 5
    # Seconds between two successful polling cycles.
    POLL_INTERVAL = 10
    # Seconds to back off after a failed polling cycle.
    RETRY_INTERVAL = 60
    # Minimum number of seconds between two alerts for the same metric.
    ALERT_COOLDOWN = 3600
    # Seconds to wait for Glances to exit after SIGTERM before killing it.
    STOP_TIMEOUT = 10

    def __init__(self, email_sender, host='192.168.2.170', port=61208,
                 cpu_threshold=80.0, load_threshold=None, io_threshold=80.0,
                 disk_usage_threshold=60.0, handle_threshold=20000,
                 recipient=None):
        """Initialise the Glances monitoring component.

        Args:
            email_sender: An initialised sender object exposing
                ``send_email(recipient, subject, body)``.
            host (str): Host used to build the Glances API URL.
            port (int): Glances web server port.
            cpu_threshold (float): CPU usage threshold (%).
            load_threshold (float): 1-minute load threshold. When falsy
                (``None`` or 0) it defaults to CPU core count * 2.
            io_threshold (float): Disk I/O threshold (MB/s).
            disk_usage_threshold (float): Filesystem usage threshold (%).
            handle_threshold (int): Threshold compared against the value read
                from the ``processcount`` endpoint.
            recipient (str): Address that receives alert e-mails.
        """
        self.host = host
        self.port = port
        self.api_url = f"http://{self.host}:{self.port}/api/4"
        self.cpu_threshold = cpu_threshold
        # Remember whether the threshold was derived from the core count so
        # run() can recompute it once the Glances server is actually up.
        self._auto_load_threshold = not load_threshold
        if self._auto_load_threshold:
            self.load_threshold = self.get_cpu_count() * 2
        else:
            self.load_threshold = load_threshold
        self.io_threshold = io_threshold
        self.disk_usage_threshold = disk_usage_threshold
        self.handle_threshold = handle_threshold
        self.email_sender = email_sender
        self.recipient = recipient
        self.glances_process = None
        self.last_alert_times = {}
        self._running = False
        # Lets stop() interrupt the sleep between polling cycles.
        self._stop_event = threading.Event()
        self._monitor_thread = None

    def get_cpu_count(self):
        """Return the CPU core count reported by Glances, or 1 on any failure.

        Best effort: every error is logged and swallowed so that construction
        never fails just because the API is not reachable yet.
        """
        try:
            data = self._get_json("cpu")
            # NOTE(review): the original only read 'count'; 'cpucore' is
            # believed to be the field Glances' cpu plugin exposes -- confirm
            # against the Glances version in use.
            return data.get('count') or data.get('cpucore') or 1
        except Exception as e:
            logger.error(e)
            return 1

    def start_glances(self):
        """Start the Glances web server as a child process.

        Raises:
            FileNotFoundError: The ``glances`` executable is not on PATH.
            subprocess.CalledProcessError: ``glances --version`` failed.
            RuntimeError: The server process exited within two seconds.
        """
        try:
            subprocess.run(['glances', '--version'], check=True,
                           stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
            # The output is never read, so it is discarded instead of piped:
            # an unread PIPE would eventually fill up and block the child.
            self.glances_process = subprocess.Popen(
                ['glances', '-w', '--port', str(self.port)],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
            )
            # Give the server a moment to either bind its port or die.
            time.sleep(2)
            if self.glances_process.poll() is not None:
                self.glances_process = None
                raise RuntimeError("Glances 启动失败")
            logger.info(f"Glances Web 服务已启动: http://{self.host}:{self.port}")
        except (FileNotFoundError, subprocess.CalledProcessError):
            # A missing executable surfaces as FileNotFoundError, not as
            # CalledProcessError, so both must map to the install hint.
            logger.error("错误: Glances 未安装。请运行: python3.11 -m pip install glances")
            raise
        except Exception as e:
            logger.error(f"启动 Glances 失败: {e}")
            raise

    def stop_glances(self):
        """Terminate the Glances child process, if one was started."""
        process = self.glances_process
        if process is None:
            return
        process.terminate()
        try:
            process.wait(timeout=self.STOP_TIMEOUT)
        except subprocess.TimeoutExpired:
            # The process ignored SIGTERM; force it down so stop() returns.
            process.kill()
            process.wait()
        self.glances_process = None
        logger.info("Glances Web 服务已停止")

    def send_alert_email(self, metric, value, threshold):
        """Send an alert e-mail for ``metric``, at most once per cooldown window.

        Does nothing when no sender or recipient is configured. The alert
        timestamp is only recorded when ``send_email`` returns a truthy value,
        so a failed delivery is retried on the next polling cycle.
        """
        if not self.email_sender or not self.recipient:
            return
        now = time.time()
        if now - self.last_alert_times.get(metric, 0) < self.ALERT_COOLDOWN:
            return
        subject = f"服务器告警: {metric} 过高"
        timestamp = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(now))
        body = f"警告: {metric} 当前值为 {value},超过阈值 {threshold}!\n时间: {timestamp}"
        if self.email_sender.send_email(self.recipient, subject, body):
            self.last_alert_times[metric] = now

    def monitor(self):
        """Poll all metrics in a loop until the monitor is stopped.

        A failed cycle is logged and followed by a longer back-off; the
        remaining checks of that cycle are skipped.
        """
        while self._running:
            try:
                self._check_cpu()
                self._check_load()
                self._check_disk_io()
                self._check_filesystems()
                self._check_process_count()
                delay = self.POLL_INTERVAL
            except requests.RequestException as e:
                logger.error(f"连接 Glances API 失败: {e}")
                delay = self.RETRY_INTERVAL
            except Exception as e:
                logger.error(f"监控错误: {e}")
                delay = self.RETRY_INTERVAL
            # Sleeps like time.sleep() but returns early when stop() is called.
            self._stop_event.wait(delay)

    def run(self):
        """Start the Glances server and the monitoring thread (non-blocking).

        Raises whatever ``start_glances`` raises; in that case the monitor is
        left in the stopped state.
        """
        if self._running:
            logger.warning("Glances 监控已在运行,忽略重复启动")
            return
        self.start_glances()
        if self._auto_load_threshold:
            # The value computed in __init__ was taken before this server
            # was started; ask again now that the API should answer.
            self.load_threshold = self.get_cpu_count() * 2
        self._stop_event.clear()
        self._running = True
        self._monitor_thread = threading.Thread(target=self.monitor, daemon=True)
        self._monitor_thread.start()

    def stop(self):
        """Stop the monitoring thread and the Glances server."""
        self._running = False
        self._stop_event.set()
        thread = self._monitor_thread
        if thread is not None and thread is not threading.current_thread():
            # Bounded wait so a cycle stuck in a request cannot block stop().
            thread.join(timeout=self.REQUEST_TIMEOUT + 1)
        self._monitor_thread = None
        self.stop_glances()

    def _get_json(self, endpoint):
        """GET ``<api_url>/<endpoint>`` and return the decoded JSON body."""
        response = requests.get(f"{self.api_url}/{endpoint}",
                                timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()
        return response.json()

    def _check_cpu(self):
        """Alert when total CPU usage exceeds ``cpu_threshold``."""
        cpu_usage = self._get_json("cpu/total").get('total', 0)
        if cpu_usage > self.cpu_threshold:
            self.send_alert_email("CPU 使用率", cpu_usage, self.cpu_threshold)

    def _check_load(self):
        """Alert when the 1-minute load exceeds ``load_threshold``."""
        load_avg = self._get_json("load").get('min1', 0)
        if load_avg > self.load_threshold:
            self.send_alert_email("系统负载(1分钟)", load_avg, self.load_threshold)

    def _check_disk_io(self):
        """Alert when the busiest disk exceeds ``io_threshold``."""
        max_io_usage = self._peak_disk_io(self._get_json("diskio"))
        if max_io_usage > self.io_threshold:
            self.send_alert_email("磁盘 I/O(MB/s)", max_io_usage, self.io_threshold)

    def _check_filesystems(self):
        """Alert for every filesystem whose usage exceeds the threshold."""
        for fs in self._get_json("fs"):
            disk_usage = fs.get('percent', 0)
            if disk_usage > self.disk_usage_threshold:
                self.send_alert_email(f"磁盘占用 ({fs.get('mnt_point')})", disk_usage,
                                      self.disk_usage_threshold)

    def _check_process_count(self):
        """Alert when the ``processcount`` total exceeds ``handle_threshold``."""
        # NOTE(review): the alert is labelled as a handle count, but the value
        # comes from the 'processcount' endpoint -- confirm which one is meant.
        handle_count = self._get_json("processcount").get('total', 0)
        if handle_count > self.handle_threshold:
            self.send_alert_email("句柄数", handle_count, self.handle_threshold)

    @staticmethod
    def _disk_io_mb_per_s(disk):
        """Convert one ``diskio`` entry into the value compared to the threshold."""
        read_bytes = disk.get('read_bytes', 0) or 0
        write_bytes = disk.get('write_bytes', 0) or 0
        # NOTE(review): the divisor is kept from the original code; it is
        # presumably 1 MiB times a 2-second refresh interval -- confirm
        # against the configured Glances refresh time.
        return (read_bytes + write_bytes) / (2048 * 1024)

    @staticmethod
    def _peak_disk_io(disks):
        """Return the highest per-disk I/O value, or 0 when there are no disks."""
        return max((GlancesMonitor._disk_io_mb_per_s(d) for d in disks), default=0)
|