修复转图运行时重复启动与高频误重建问题

变更项:
1. 修复 md2img 运行时并发启动竞争,新增启动中标记,避免同名线程被重复拉起。
2. 优化浏览器心跳探测策略:由高频激进重建改为断连连续判定后再重建,降低误判。
3. 新增截图进行中保护,截图期间心跳跳过探测,避免与业务并发导致误重建。
4. 为浏览器重建增加 reason 日志字段,便于线上追踪重建触发原因。
5. 保留截图后断连自愈能力,但改为更稳健的触发路径,减少无意义重建。
This commit is contained in:
liuwei
2026-04-17 10:26:10 +08:00
parent 97bc4560b6
commit 3d98b3c0a2

View File

@@ -523,6 +523,10 @@ class _PersistentBrowser:
self._owner_loop_id: Optional[int] = None
# 保活心跳任务:定期探测浏览器连通性,异常时自动重建。
self._heartbeat_task: Optional[asyncio.Task] = None
# 心跳断连计数:避免单次抖动就触发重建。
self._disconnect_streak = 0
# 截图进行中标记:心跳期间若业务在跑,跳过本轮探测以避免误判。
self._capture_in_progress = False
async def _launch_browser(self):
if self._playwright is None:
@@ -581,7 +585,7 @@ class _PersistentBrowser:
self._ensure_heartbeat_task()
return self._browser
async def restart_browser(self):
async def restart_browser(self, reason: str = "unknown"):
async with self._lock:
if self._browser:
try:
@@ -591,10 +595,11 @@ class _PersistentBrowser:
self._browser = None
self._browser = await self._launch_browser()
self._owner_loop_id = id(asyncio.get_running_loop())
self._disconnect_streak = 0
browser_pid = getattr(getattr(self._browser, "process", None), "pid", None)
logger.info(
f"[md2img] 常驻浏览器已重建: source={self._last_launch_source}, "
f"loop={self._owner_loop_id}, pid={browser_pid}"
f"loop={self._owner_loop_id}, pid={browser_pid}, reason={reason}"
)
self._ensure_heartbeat_task()
return self._browser
@@ -613,13 +618,24 @@ class _PersistentBrowser:
"""周期性探测浏览器可用性,断连后自动重建。"""
while True:
try:
await asyncio.sleep(10)
await asyncio.sleep(20)
# 没有浏览器实例时只保持心跳存活,不主动创建,避免空闲时不必要消耗。
if not self._browser:
self._disconnect_streak = 0
continue
if not await self._is_browser_alive(self._browser, timeout_seconds=2.0):
logger.warning("[md2img] 心跳探测发现浏览器已断连,准备自动重建")
await self.restart_browser()
if self._capture_in_progress:
# 截图期间跳过探测,避免与业务并发导致误判。
continue
if self._browser and self._browser.is_connected():
self._disconnect_streak = 0
continue
self._disconnect_streak += 1
if self._disconnect_streak >= 3:
logger.warning(
f"[md2img] 心跳探测连续{self._disconnect_streak}次发现浏览器断连,准备自动重建"
)
await self.restart_browser(reason="heartbeat_disconnected")
except asyncio.CancelledError:
raise
except Exception as e:
@@ -635,8 +651,10 @@ class _PersistentBrowser:
browser = await self.ensure_browser()
async def _capture_with_browser(active_browser):
context = await active_browser.new_context(viewport={"width": 780, "height": 960}, device_scale_factor=1.2)
self._capture_in_progress = True
context = None
try:
context = await active_browser.new_context(viewport={"width": 780, "height": 960}, device_scale_factor=1.2)
page = await context.new_page()
logger.debug("Set page content")
await page.set_content(html_content, wait_until='domcontentloaded', timeout=15000)
@@ -649,9 +667,11 @@ class _PersistentBrowser:
raise RuntimeError(f"截图失败,输出文件不存在: {output_image}")
finally:
try:
await context.close()
if context:
await context.close()
except Exception:
pass
self._capture_in_progress = False
try:
await _capture_with_browser(browser)
@@ -659,11 +679,11 @@ class _PersistentBrowser:
# 在部分系统环境中,浏览器可能在任务完成后迅速断连,这里主动重建保证“常驻”语义。
if not await self._is_browser_alive(browser, timeout_seconds=2.0):
logger.warning("[md2img] 截图后浏览器已断连,立即执行自动重建")
await self.restart_browser()
await self.restart_browser(reason="post_capture_disconnected")
except Exception as e:
# 首次失败后重建一次浏览器再重试,提升抗偶发故障能力。
logger.warning(f"[md2img] 常驻浏览器截图失败,准备重建后重试: {e}")
browser = await self.restart_browser()
browser = await self.restart_browser(reason="capture_exception_retry")
await _capture_with_browser(browser)
@@ -685,6 +705,8 @@ class _Md2ImgRuntime:
self._loop: Optional[asyncio.AbstractEventLoop] = None
self._lock = threading.Lock()
self._ready = threading.Event()
# 启动中标记:避免并发调用 ensure_started 时重复创建线程。
self._starting = False
@property
def loop(self) -> Optional[asyncio.AbstractEventLoop]:
@@ -706,11 +728,21 @@ class _Md2ImgRuntime:
with self._lock:
if self._thread and self._thread.is_alive() and self._loop and self._loop.is_running():
return
self._ready.clear()
self._thread = threading.Thread(target=self._thread_main, name="md2img-runtime", daemon=True)
self._thread.start()
if not self._ready.wait(timeout=10):
raise RuntimeError("md2img 专用运行时启动超时")
if self._starting:
# 已有其他调用在启动中,当前线程等待启动完成即可。
pass
else:
self._starting = True
self._ready.clear()
self._thread = threading.Thread(target=self._thread_main, name="md2img-runtime", daemon=True)
self._thread.start()
# 注意:等待动作放到锁外,避免阻塞其他读取逻辑。
if not self._ready.wait(timeout=10):
with self._lock:
self._starting = False
raise RuntimeError("md2img 专用运行时启动超时")
with self._lock:
self._starting = False
def submit(self, coro) -> ConcurrentFuture:
"""向专用运行时提交协程任务。"""