抖音解析新增本地无水印提取后备链路

变更项：
- 在抖音解析插件中加入三级解析策略：内网接口 -> 外部接口 -> 本地提取后备
- 新增本地提取能力：优先使用 yt_dlp Python 库，失败后自动降级到 yt-dlp 命令行
- 新增 yt-dlp 数据标准化映射，统一输出视频/图集结构（url/images/title/author/cover）
- 新增格式选择策略：优先含视频轨道且 http/https 的高质量链接，降低发送失败概率
- requirements 增加 yt-dlp 依赖，确保后备方案可安装可运行
2026-04-23 15:27:00 +08:00
parent c22b4cf055
commit e942ee70ed
3 changed files with 174 additions and 2 deletions
--- a/plugins/douyin_parser/main.py
+++ b/plugins/douyin_parser/main.py
@@ -1,6 +1,9 @@
 import os
 import re
 import time
+import json
+import shutil
+import subprocess
 import traceback
 import requests
 import io
@@ -231,12 +234,23 @@ class DouyinParserPlugin(MessagePluginInterface):
    def _parse_douyin(self, url: str) -> Dict[str, Any]:
        try:
            clean_url = self._clean_url(url)
+            # 第一优先级：你现有的内网解析服务，速度快、稳定性高，优先命中。
            primary = self._parse_from_internal_api(clean_url)
            if primary and (primary.get('url') or primary.get('images')):
                return self._clean_response_data(primary)
+
+            # 第二优先级：你现有的外部付费接口，作为内网服务不可用时的兜底。
            secondary = self._parse_from_external_api(clean_url)
-            if secondary and secondary.get('url'):
+            if secondary and (secondary.get('url') or secondary.get('images')):
                return self._clean_response_data(secondary)
+
+            # 第三优先级：本地提取后备方案（无需依赖远端解析API）：
+            # 1) 优先走 yt_dlp Python 库；
+            # 2) 若库不可用，再尝试系统已安装的 yt-dlp 命令行。
+            # 这样当接口异常/限流时，仍可在本机直接提取无水印直链和元数据。
+            local_fallback = self._parse_from_local_extractor(clean_url)
+            if local_fallback and (local_fallback.get('url') or local_fallback.get('images')):
+                return self._clean_response_data(local_fallback)
            raise DouyinParserError("未获取到有效媒资数据")
        except Exception as e:
            self.LOG.error(f"[抖音] 解析过程发生未知错误: {str(e)}\n{traceback.format_exc()}")
@@ -486,6 +500,162 @@ class DouyinParserPlugin(MessagePluginInterface):
        except Exception:
            return None

+    def _parse_from_local_extractor(self, clean_url: str) -> Optional[Dict[str, Any]]:
+        """
+        Local extraction fallback, tried after the remote parsing APIs yield nothing.
+
+        Runs the yt_dlp Python library first, then the yt-dlp command line. An
+        extractor that raises or returns nothing hands over to the next one, so a
+        library failure does not prevent the command-line attempt.
+
+        Returns the normalized media_info dict, or None when nothing usable is found.
+        """
+        for extract in (self._extract_with_yt_dlp_python, self._extract_with_yt_dlp_cli):
+            try:
+                info = extract(clean_url)
+                if info:
+                    return self._normalize_yt_dlp_info(info)
+            except Exception as e:
+                # Best effort: log and move on so one failing path cannot block the next.
+                self.LOG.warning(f"[抖音] 本地提取后备失败: {e}")
+        return None
+
+    def _extract_with_yt_dlp_python(self, clean_url: str) -> Optional[Dict[str, Any]]:
+        """
+        Extract metadata for clean_url through the yt_dlp Python library.
+
+        Nothing is downloaded: skip_download is set and extract_info is called
+        with download=False. Returns the info dict, or None when yt_dlp cannot be
+        imported or the result is not a dict.
+        """
+        try:
+            import yt_dlp as ytdl_lib  # type: ignore
+        except Exception:
+            return None
+
+        options = dict(
+            quiet=True,
+            no_warnings=True,
+            skip_download=True,
+            proxy=self.http_proxy or None,
+            # NOTE(review): TLS certificate verification is disabled here.
+            nocheckcertificate=True,
+        )
+
+        with ytdl_lib.YoutubeDL(options) as extractor:
+            extracted = extractor.extract_info(clean_url, download=False)
+        return extracted if isinstance(extracted, dict) else None
+
+    def _extract_with_yt_dlp_cli(self, clean_url: str) -> Optional[Dict[str, Any]]:
+        """
+        Extract metadata for clean_url by running the yt-dlp executable.
+
+        Returns the parsed JSON object, or None if the binary is not on PATH, exits
+        non-zero, or prints something that is not a JSON object. The 25-second
+        subprocess timeout is not caught here and propagates to the caller.
+        """
+        executable = shutil.which("yt-dlp")
+        if not executable:
+            return None
+        proxy_args = ["--proxy", self.http_proxy] if self.http_proxy else []
+        argv = [executable, "-J", "--no-warnings", "--skip-download", clean_url, *proxy_args]
+
+        proc = subprocess.run(argv, capture_output=True, text=True, timeout=25)
+        if proc.returncode != 0:
+            self.LOG.warning(f"[抖音] yt-dlp 命令行提取失败: code={proc.returncode}, err={proc.stderr[:200]}")
+            return None
+        try:
+            payload = json.loads(proc.stdout or "{}")
+        except Exception:
+            return None
+        return payload if isinstance(payload, dict) else None
+
+    def _normalize_yt_dlp_info(self, info: Dict[str, Any]) -> Optional[Dict[str, Any]]:
+        """
+        Map a raw yt-dlp info dict onto the plugin's internal media_info structure.
+
+        Result shapes:
+        - video: {"type": "video", "url", "title", "author", "cover"}
+        - image set: {"type": "image", "images": [...], "title", "author", "cover"}
+
+        Returns None when no http(s) media URL can be found.
+        """
+        # Title prefers description; author falls back uploader -> creator -> channel.
+        title = str(info.get("description") or info.get("title") or "无标题")
+        author = str(info.get("uploader") or info.get("creator") or info.get("channel") or "未知作者")
+
+        # Cover: the thumbnail field, else the last entry of the thumbnails list.
+        cover = str(info.get("thumbnail") or "")
+        if not cover:
+            thumbs = info.get("thumbnails") or []
+            if isinstance(thumbs, list) and thumbs:
+                last = thumbs[-1] if isinstance(thumbs[-1], dict) else {}
+                cover = str(last.get("url") or "")
+
+        # Image set: a playlist result; each dict entry contributes one URL.
+        if info.get("_type") == "playlist":
+            entries = info.get("entries") or []
+            image_urls: List[str] = []
+            if isinstance(entries, list):
+                for item in entries:
+                    if not isinstance(item, dict):
+                        continue
+                    # Prefer the raw URL, then the page URL, then the thumbnail.
+                    candidate = str(item.get("url") or item.get("webpage_url") or item.get("thumbnail") or "")
+                    if candidate.startswith("http"):
+                        image_urls.append(candidate)
+            if image_urls:
+                return {
+                    "type": "image",
+                    "images": image_urls,
+                    "title": title,
+                    "author": author,
+                    "cover": image_urls[0],
+                }
+
+        # Video: rank every format that has a video track and an http(s) URL.
+        # The rank is a tuple so a plain http/https protocol always beats other
+        # protocols; height, then bitrate, only break ties. An additive score
+        # would let a tall non-direct format outrank a direct one.
+        def _number(value: Any) -> float:
+            # Missing or malformed numeric fields rank lowest instead of raising.
+            try:
+                return float(value or 0)
+            except (TypeError, ValueError):
+                return 0.0
+
+        formats = info.get("formats") or []
+        ranked = []
+        if isinstance(formats, list):
+            for fmt in formats:
+                if not isinstance(fmt, dict):
+                    continue
+                fmt_url = str(fmt.get("url") or "")
+                if not fmt_url.startswith("http"):
+                    continue
+                if str(fmt.get("vcodec") or "").lower() == "none":  # audio-only
+                    continue
+                is_direct = str(fmt.get("protocol") or "") in ("https", "http")
+                ranked.append(((is_direct, _number(fmt.get("height")), _number(fmt.get("tbr"))), fmt_url))
+        # max() keeps the first of equally ranked formats.
+        best_url = max(ranked, key=lambda pair: pair[0])[1] if ranked else ""
+
+        # Fall back to a top-level url field when no format qualified.
+        if not best_url:
+            fallback_url = str(info.get("url") or "")
+            if fallback_url.startswith("http"):
+                best_url = fallback_url
+
+        if best_url:
+            return {
+                "type": "video",
+                "url": best_url,
+                "title": title,
+                "author": author,
+                "cover": cover,
+            }
+        return None
+
    def _append_title_to_image(self, image_bytes: bytes, title: str) -> bytes:
        """
        将标题绘制到图片顶部，返回新的图片二进制数据。