完善插件超时保护与熔断恢复
- 为消息插件执行增加统一超时保护,避免单插件长时间卡住消息主链路
- 增加连续失败熔断、冷却后半开探测与成功自动恢复逻辑
- 将插件执行保护状态接入治理快照与后台详情,便于查看连续失败和恢复剩余时间
- 更新工程优化文档,记录 7.2 第一阶段当前进展
This commit is contained in:
138
robot.py
138
robot.py
@@ -649,7 +649,33 @@ class Robot:
|
||||
|
||||
# 检查插件是否可以处理该消息
|
||||
if plugin.can_process(plugin_msg):
|
||||
processed, _ = await plugin.process_message(plugin_msg)
|
||||
protection_policy = self._build_message_plugin_protection_policy(plugin)
|
||||
acquire_result = self.plugin_manager.try_acquire_plugin_execution(
|
||||
plugin,
|
||||
recovery_seconds=protection_policy["circuit_recovery_seconds"],
|
||||
)
|
||||
if not acquire_result.get("allowed", False):
|
||||
# 熔断打开或半开探测占用时,这里只跳过当前插件:
|
||||
# 1. 保护目标是避免单插件持续拖慢主链路,而不是直接关闭整个插件;
|
||||
# 2. 后续插件仍然可以继续尝试处理当前消息,降低功能面损失;
|
||||
# 3. 冷却结束后会自动进入半开恢复探测,无需人工介入恢复。
|
||||
self.LOG.warning(
|
||||
self._trace_message(
|
||||
msg,
|
||||
f"插件保护跳过 plugin={plugin.name} reason={acquire_result.get('reason')} "
|
||||
f"remaining={acquire_result.get('open_remaining_seconds', 0)}s"
|
||||
)
|
||||
)
|
||||
continue
|
||||
|
||||
processed, _ = await asyncio.wait_for(
|
||||
plugin.process_message(plugin_msg),
|
||||
timeout=protection_policy["process_timeout_seconds"],
|
||||
)
|
||||
self.plugin_manager.record_plugin_execution_success(
|
||||
plugin,
|
||||
process_time_ms=self._elapsed_ms(started_at),
|
||||
)
|
||||
self._record_plugin_call_result(
|
||||
plugin=plugin,
|
||||
msg=msg,
|
||||
@@ -670,14 +696,58 @@ class Robot:
|
||||
)
|
||||
)
|
||||
return True
|
||||
except asyncio.TimeoutError as timeout_error:
|
||||
protection_policy = self._build_message_plugin_protection_policy(plugin)
|
||||
failure_record = self.plugin_manager.record_plugin_execution_failure(
|
||||
plugin,
|
||||
failure_type="timeout",
|
||||
error_message=(
|
||||
f"插件执行超时,超过 {protection_policy['process_timeout_seconds']} 秒仍未完成。"
|
||||
),
|
||||
process_time_ms=self._elapsed_ms(started_at),
|
||||
timeout_seconds=protection_policy["process_timeout_seconds"],
|
||||
failure_threshold=protection_policy["failure_threshold"],
|
||||
recovery_seconds=protection_policy["circuit_recovery_seconds"],
|
||||
)
|
||||
self._record_plugin_call_error(
|
||||
plugin=plugin,
|
||||
msg=msg,
|
||||
command_name=command_name,
|
||||
error=timeout_error,
|
||||
)
|
||||
self.LOG.error(
|
||||
self._trace_message(
|
||||
msg,
|
||||
f"插件 {plugin.name} 执行超时,timeout={protection_policy['process_timeout_seconds']}s "
|
||||
f"circuit_state={failure_record.get('circuit_state')} "
|
||||
f"consecutive_failures={failure_record.get('consecutive_failures')}"
|
||||
)
|
||||
)
|
||||
except Exception as e:
|
||||
protection_policy = self._build_message_plugin_protection_policy(plugin)
|
||||
failure_record = self.plugin_manager.record_plugin_execution_failure(
|
||||
plugin,
|
||||
failure_type="error",
|
||||
error_message=str(e),
|
||||
process_time_ms=self._elapsed_ms(started_at),
|
||||
timeout_seconds=0,
|
||||
failure_threshold=protection_policy["failure_threshold"],
|
||||
recovery_seconds=protection_policy["circuit_recovery_seconds"],
|
||||
)
|
||||
self._record_plugin_call_error(
|
||||
plugin=plugin,
|
||||
msg=msg,
|
||||
command_name=command_name,
|
||||
error=e,
|
||||
)
|
||||
self.LOG.error(self._trace_message(msg, f"插件 {plugin.name} 处理消息失败: {e}"))
|
||||
self.LOG.error(
|
||||
self._trace_message(
|
||||
msg,
|
||||
f"插件 {plugin.name} 处理消息失败: {e} "
|
||||
f"circuit_state={failure_record.get('circuit_state')} "
|
||||
f"consecutive_failures={failure_record.get('consecutive_failures')}"
|
||||
)
|
||||
)
|
||||
|
||||
return False
|
||||
|
||||
@@ -726,6 +796,70 @@ class Robot:
|
||||
msg_type = getattr(getattr(msg, "msg_type", None), "name", "")
|
||||
return f"[{msg_type or 'UNKNOWN'}]"
|
||||
|
||||
@staticmethod
|
||||
def _safe_positive_int(value, default: int) -> int:
|
||||
"""把配置中的数字安全转成正整数。"""
|
||||
try:
|
||||
parsed = int(value)
|
||||
return parsed if parsed > 0 else default
|
||||
except (TypeError, ValueError):
|
||||
return default
|
||||
|
||||
def _build_message_plugin_protection_policy(self, plugin) -> dict:
|
||||
"""构建消息插件执行保护策略。"""
|
||||
plugin_config = getattr(plugin, "_config", {}) or {}
|
||||
runtime_config = plugin_config.get("runtime", {}) if isinstance(plugin_config, dict) else {}
|
||||
runtime_config = runtime_config if isinstance(runtime_config, dict) else {}
|
||||
breaker_config = runtime_config.get("circuit_breaker", {}) if isinstance(runtime_config, dict) else {}
|
||||
breaker_config = breaker_config if isinstance(breaker_config, dict) else {}
|
||||
|
||||
# 超时策略尽量遵循“显式配置优先,已有内部超时参数兜底”的思路:
|
||||
# 1. 新插件如果有特殊需求,只需要在 runtime / circuit_breaker 下声明自己的超时;
|
||||
# 2. 老插件不改代码也能自动复用现有的 request / llm / render 超时字段;
|
||||
# 3. 最终统一加一个缓冲区,避免外层 wait_for 比插件内部自己的超时还更早打断。
|
||||
explicit_timeout = (
|
||||
runtime_config.get("plugin_process_timeout_seconds")
|
||||
or runtime_config.get("message_timeout_seconds")
|
||||
or breaker_config.get("timeout_seconds")
|
||||
or getattr(plugin, "plugin_process_timeout_seconds", 0)
|
||||
)
|
||||
timeout_candidates = []
|
||||
for attr_name in [
|
||||
"llm_call_timeout_sec",
|
||||
"_request_timeout_seconds",
|
||||
"default_timeout",
|
||||
"_image_render_timeout_seconds",
|
||||
"image_render_timeout_seconds",
|
||||
"_receive_timeout",
|
||||
"_connect_timeout_seconds",
|
||||
"_connect_timeout",
|
||||
]:
|
||||
attr_value = getattr(plugin, attr_name, 0)
|
||||
if isinstance(attr_value, (int, float)) and attr_value > 0:
|
||||
timeout_candidates.append(int(attr_value))
|
||||
|
||||
if explicit_timeout:
|
||||
resolved_timeout = self._safe_positive_int(explicit_timeout, 30)
|
||||
elif timeout_candidates:
|
||||
resolved_timeout = max(timeout_candidates) + 10
|
||||
else:
|
||||
resolved_timeout = 30
|
||||
|
||||
failure_threshold = self._safe_positive_int(
|
||||
breaker_config.get("failure_threshold") or runtime_config.get("circuit_breaker_failure_threshold") or 3,
|
||||
3,
|
||||
)
|
||||
circuit_recovery_seconds = self._safe_positive_int(
|
||||
breaker_config.get("recovery_seconds") or runtime_config.get("circuit_breaker_recovery_seconds") or 180,
|
||||
180,
|
||||
)
|
||||
|
||||
return {
|
||||
"process_timeout_seconds": max(10, min(int(resolved_timeout), 180)),
|
||||
"failure_threshold": max(2, min(int(failure_threshold), 10)),
|
||||
"circuit_recovery_seconds": max(30, min(int(circuit_recovery_seconds), 900)),
|
||||
}
|
||||
|
||||
def _get_stats_collector_plugin(self):
|
||||
"""获取运行中的统计收集插件实例。"""
|
||||
# 统计插件已经从“事件订阅”切到“主链路直接回调”,
|
||||
|
||||
Reference in New Issue
Block a user