完善插件超时保护与熔断恢复

- 为消息插件执行增加统一超时保护，避免单插件长时间卡住消息主链路
- 增加连续失败熔断、冷却后半开探测与成功自动恢复逻辑
- 将插件执行保护状态接入治理快照与后台详情，便于查看连续失败和恢复剩余时间
- 更新工程优化文档，记录 7.2 第一阶段当前进展
2026-04-30 16:15:53 +08:00
parent b0e11fb9b5
commit 0d7fe5d6f0
4 changed files with 413 additions and 2 deletions
--- a/robot.py
+++ b/robot.py
@@ -649,7 +649,33 @@ class Robot:

                # 检查插件是否可以处理该消息
                if plugin.can_process(plugin_msg):
-                    processed, _ = await plugin.process_message(plugin_msg)
+                    protection_policy = self._build_message_plugin_protection_policy(plugin)
+                    acquire_result = self.plugin_manager.try_acquire_plugin_execution(
+                        plugin,
+                        recovery_seconds=protection_policy["circuit_recovery_seconds"],
+                    )
+                    if not acquire_result.get("allowed", False):
+                        # 熔断打开或半开探测占用时，这里只跳过当前插件：
+                        # 1. 保护目标是避免单插件持续拖慢主链路，而不是直接关闭整个插件；
+                        # 2. 后续插件仍然可以继续尝试处理当前消息，降低功能面损失；
+                        # 3. 冷却结束后会自动进入半开恢复探测，无需人工介入恢复。
+                        self.LOG.warning(
+                            self._trace_message(
+                                msg,
+                                f"插件保护跳过 plugin={plugin.name} reason={acquire_result.get('reason')} "
+                                f"remaining={acquire_result.get('open_remaining_seconds', 0)}s"
+                            )
+                        )
+                        continue
+
+                    processed, _ = await asyncio.wait_for(
+                        plugin.process_message(plugin_msg),
+                        timeout=protection_policy["process_timeout_seconds"],
+                    )
+                    self.plugin_manager.record_plugin_execution_success(
+                        plugin,
+                        process_time_ms=self._elapsed_ms(started_at),
+                    )
                    self._record_plugin_call_result(
                        plugin=plugin,
                        msg=msg,
@@ -670,14 +696,58 @@ class Robot:
                            )
                        )
                        return True
+            except asyncio.TimeoutError as timeout_error:
+                protection_policy = self._build_message_plugin_protection_policy(plugin)
+                failure_record = self.plugin_manager.record_plugin_execution_failure(
+                    plugin,
+                    failure_type="timeout",
+                    error_message=(
+                        f"插件执行超时，超过 {protection_policy['process_timeout_seconds']} 秒仍未完成。"
+                    ),
+                    process_time_ms=self._elapsed_ms(started_at),
+                    timeout_seconds=protection_policy["process_timeout_seconds"],
+                    failure_threshold=protection_policy["failure_threshold"],
+                    recovery_seconds=protection_policy["circuit_recovery_seconds"],
+                )
+                self._record_plugin_call_error(
+                    plugin=plugin,
+                    msg=msg,
+                    command_name=command_name,
+                    error=timeout_error,
+                )
+                self.LOG.error(
+                    self._trace_message(
+                        msg,
+                        f"插件 {plugin.name} 执行超时，timeout={protection_policy['process_timeout_seconds']}s "
+                        f"circuit_state={failure_record.get('circuit_state')} "
+                        f"consecutive_failures={failure_record.get('consecutive_failures')}"
+                    )
+                )
            except Exception as e:
+                protection_policy = self._build_message_plugin_protection_policy(plugin)
+                failure_record = self.plugin_manager.record_plugin_execution_failure(
+                    plugin,
+                    failure_type="error",
+                    error_message=str(e),
+                    process_time_ms=self._elapsed_ms(started_at),
+                    timeout_seconds=0,
+                    failure_threshold=protection_policy["failure_threshold"],
+                    recovery_seconds=protection_policy["circuit_recovery_seconds"],
+                )
                self._record_plugin_call_error(
                    plugin=plugin,
                    msg=msg,
                    command_name=command_name,
                    error=e,
                )
-                self.LOG.error(self._trace_message(msg, f"插件 {plugin.name} 处理消息失败: {e}"))
+                self.LOG.error(
+                    self._trace_message(
+                        msg,
+                        f"插件 {plugin.name} 处理消息失败: {e} "
+                        f"circuit_state={failure_record.get('circuit_state')} "
+                        f"consecutive_failures={failure_record.get('consecutive_failures')}"
+                    )
+                )

        return False

@@ -726,6 +796,70 @@ class Robot:
        msg_type = getattr(getattr(msg, "msg_type", None), "name", "")
        return f"[{msg_type or 'UNKNOWN'}]"

+    @staticmethod
+    def _safe_positive_int(value, default: int) -> int:
+        """Coerce a config value to a positive int; return ``default`` if it is invalid or not > 0."""
+        try:
+            parsed = int(value)
+            return parsed if parsed > 0 else default
+        except (TypeError, ValueError, OverflowError):  # OverflowError: int(float("inf"))
+            return default
+
+    def _build_message_plugin_protection_policy(self, plugin) -> dict:
+        """Build the execution-protection policy for a message plugin (timeout, failure threshold, circuit recovery)."""
+        plugin_config = getattr(plugin, "_config", {}) or {}  # tolerate a missing or None _config
+        runtime_config = plugin_config.get("runtime", {}) if isinstance(plugin_config, dict) else {}
+        runtime_config = runtime_config if isinstance(runtime_config, dict) else {}
+        breaker_config = runtime_config.get("circuit_breaker", {}) if isinstance(runtime_config, dict) else {}
+        breaker_config = breaker_config if isinstance(breaker_config, dict) else {}
+
+        # Timeout resolution follows "explicit configuration first, existing internal timeout attributes as fallback":
+        # 1. A new plugin with special needs only has to declare its own timeout under runtime / circuit_breaker;
+        # 2. Legacy plugins reuse their existing request / llm / render timeout fields without any code change;
+        # 3. A buffer is added at the end so the outer wait_for does not fire before the plugin's own internal timeout.
+        explicit_timeout = (
+            runtime_config.get("plugin_process_timeout_seconds")
+            or runtime_config.get("message_timeout_seconds")
+            or breaker_config.get("timeout_seconds")
+            or getattr(plugin, "plugin_process_timeout_seconds", 0)
+        )  # first truthy value wins; 0 / None / "" fall through to the next source
+        timeout_candidates = []
+        for attr_name in [
+            "llm_call_timeout_sec",
+            "_request_timeout_seconds",
+            "default_timeout",
+            "_image_render_timeout_seconds",
+            "image_render_timeout_seconds",
+            "_receive_timeout",
+            "_connect_timeout_seconds",
+            "_connect_timeout",
+        ]:
+            attr_value = getattr(plugin, attr_name, 0)
+            if isinstance(attr_value, (int, float)) and attr_value > 0:
+                timeout_candidates.append(int(attr_value))  # fractional seconds are truncated
+
+        if explicit_timeout:
+            resolved_timeout = self._safe_positive_int(explicit_timeout, 30)  # unparsable or non-positive -> 30s
+        elif timeout_candidates:
+            resolved_timeout = max(timeout_candidates) + 10  # 10s buffer over the largest internal timeout
+        else:
+            resolved_timeout = 30  # default when nothing is configured
+
+        failure_threshold = self._safe_positive_int(
+            breaker_config.get("failure_threshold") or runtime_config.get("circuit_breaker_failure_threshold") or 3,
+            3,
+        )
+        circuit_recovery_seconds = self._safe_positive_int(
+            breaker_config.get("recovery_seconds") or runtime_config.get("circuit_breaker_recovery_seconds") or 180,
+            180,
+        )
+
+        return {
+            "process_timeout_seconds": max(10, min(int(resolved_timeout), 180)),  # clamped to [10, 180]; NOTE(review): the 180s cap can undercut a longer internal plugin timeout
+            "failure_threshold": max(2, min(int(failure_threshold), 10)),  # clamped to [2, 10]
+            "circuit_recovery_seconds": max(30, min(int(circuit_recovery_seconds), 900)),  # clamped to [30, 900]
+        }
+
    def _get_stats_collector_plugin(self):
        """获取运行中的统计收集插件实例。"""
        # 统计插件已经从“事件订阅”切到“主链路直接回调”，