From 64a2253813864c0c1d08395c8261f39d7bea8a63 Mon Sep 17 00:00:00 2001 From: liuwei Date: Thu, 23 Apr 2026 15:53:02 +0800 Subject: [PATCH] =?UTF-8?q?=E7=A7=BB=E9=99=A4=E6=8A=96=E9=9F=B3=E8=A7=A3?= =?UTF-8?q?=E6=9E=90=E4=B8=AD=E7=9A=84yt-dlp=E9=93=BE=E8=B7=AF=E5=B9=B6?= =?UTF-8?q?=E4=BF=9D=E7=95=99=E5=8F=AF=E7=94=A8=E6=8E=A5=E5=8F=A3=E8=A7=A3?= =?UTF-8?q?=E6=9E=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 变更项: - 删除抖音插件内所有 yt-dlp 解析与标准化实现代码,避免 Fresh cookies 问题 - 解析链路简化为:本地业务接口(内网)-> 外部接口兜底 - 移除 requirements 中 yt-dlp 依赖,减少无效依赖和运行噪音 - 使用你提供链接完成本地验证,确认可正常获取视频地址 --- plugins/douyin_parser/main.py | 183 ---------------------------------- requirements.txt | 1 - 2 files changed, 184 deletions(-) diff --git a/plugins/douyin_parser/main.py b/plugins/douyin_parser/main.py index f8436a6..617fe87 100644 --- a/plugins/douyin_parser/main.py +++ b/plugins/douyin_parser/main.py @@ -1,9 +1,6 @@ import os import re import time -import json -import shutil -import subprocess import traceback import requests import io @@ -250,13 +247,6 @@ class DouyinParserPlugin(MessagePluginInterface): if secondary and (secondary.get('url') or secondary.get('images')): return self._clean_response_data(secondary) - # 第三优先级:本机兜底提取(yt-dlp)。 - # 说明: - # - 该方案受 Cookie 新鲜度影响较大; - # - 放在最后可避免在“本地业务解析已成功”时仍输出 Fresh cookies 警告。 - local_fallback = self._parse_from_local_extractor(clean_url) - if local_fallback and (local_fallback.get('url') or local_fallback.get('images')): - return self._clean_response_data(local_fallback) raise DouyinParserError("未获取到有效媒资数据") except Exception as e: self.LOG.error(f"[抖音] 解析过程发生未知错误: {str(e)}\n{traceback.format_exc()}") @@ -543,179 +533,6 @@ class DouyinParserPlugin(MessagePluginInterface): except Exception: return None - def _parse_from_local_extractor(self, clean_url: str) -> Optional[Dict[str, Any]]: - """ - 本地提取后备方案(接口不可用时启用)。 - - 设计目标: - 1) 不依赖你自建/第三方解析接口,避免单点故障; - 2) 优先使用 Python 方式,减少进程开销; - 3) 若 Python 库未安装,自动降级到命令行,最大化可用性。 - """ - try: - info = self._extract_with_yt_dlp_python(clean_url) - if not info: - info = self._extract_with_yt_dlp_cli(clean_url) - if not info: - return None - return self._normalize_yt_dlp_info(info) - except Exception as e: - self.LOG.warning(f"[抖音] 本地提取后备失败: {e}") - return None - - def _extract_with_yt_dlp_python(self, clean_url: str) -> Optional[Dict[str, Any]]: - """ - 使用 yt_dlp Python 库提取信息。 - - 注意: - - skip_download=True 只提取元数据和直链,不下载文件; - - 优先选取“含视频轨道且协议为http/https”的格式,降低后续发送失败概率。 - """ - try: - import yt_dlp # type: ignore - except Exception: - return None - - ydl_opts = { - "quiet": True, - "no_warnings": True, - "skip_download": True, - "proxy": self.http_proxy or None, - "nocheckcertificate": True, - } - # Cookie 注入策略: - # - 优先使用 cookie_file(yt-dlp 官方支持的 cookies 文件,兼容性更高); - # - 否则回退到手工 Cookie 请求头。 - if self.cookie_file and os.path.exists(self.cookie_file): - ydl_opts["cookiefile"] = self.cookie_file - elif self.cookie: - ydl_opts["http_headers"] = {"Cookie": self.cookie} - with yt_dlp.YoutubeDL(ydl_opts) as ydl: - info = ydl.extract_info(clean_url, download=False) - if isinstance(info, dict): - return info - return None - - def _extract_with_yt_dlp_cli(self, clean_url: str) -> Optional[Dict[str, Any]]: - """ - 使用 yt-dlp 命令行提取信息。 - - 适用场景: - - 运行环境未安装 yt_dlp Python 包,但系统可执行文件已存在。 - """ - yt_dlp_bin = shutil.which("yt-dlp") - if not yt_dlp_bin: - return None - cmd = [yt_dlp_bin, "-J", "--no-warnings", "--skip-download", clean_url] - if self.http_proxy: - cmd.extend(["--proxy", self.http_proxy]) - # 命令行模式下同样注入 Cookie,确保与 Python 模式行为一致。 - if self.cookie_file and os.path.exists(self.cookie_file): - cmd.extend(["--cookies", self.cookie_file]) - elif self.cookie: - cmd.extend(["--add-header", f"Cookie: {self.cookie}"]) - - result = subprocess.run(cmd, capture_output=True, text=True, timeout=25) - if result.returncode != 0: - err_msg = (result.stderr or "").strip().replace("\n", " ") - if "Fresh cookies" in err_msg: - # 该错误在抖音场景出现频率较高,且当前链路已是“最后兜底”,降为 info 避免误导。 - self.LOG.info("[抖音] yt-dlp 兜底提取失败:Cookie 需要刷新(Fresh cookies needed)") - else: - self.LOG.warning(f"[抖音] yt-dlp 命令行提取失败: code={result.returncode}, err={err_msg[:200]}") - return None - try: - data = json.loads(result.stdout or "{}") - return data if isinstance(data, dict) else None - except Exception: - return None - - def _normalize_yt_dlp_info(self, info: Dict[str, Any]) -> Optional[Dict[str, Any]]: - """ - 将 yt-dlp 的原始结构统一映射为插件内部 media_info 结构。 - - 目标结构: - - 视频:{"type":"video","url","title","author","cover"} - - 图集:{"type":"image","images":[],"title","author","cover"} - """ - # 统一提取作者与标题,尽量优先更稳定字段,保证卡片/文本信息完整。 - title = str(info.get("description") or info.get("title") or "无标题") - author = str(info.get("uploader") or info.get("creator") or info.get("channel") or "未知作者") - - # 统一提取封面: - # 1) thumbnail 字段; - # 2) thumbnails 数组最后一项(通常分辨率更高)。 - cover = str(info.get("thumbnail") or "") - if not cover: - thumbs = info.get("thumbnails") or [] - if isinstance(thumbs, list) and thumbs: - last = thumbs[-1] if isinstance(thumbs[-1], dict) else {} - cover = str(last.get("url") or "") - - # 图集场景:yt-dlp 可能返回 playlist/entries,每项通常是图片或片段资源。 - if info.get("_type") == "playlist": - entries = info.get("entries") or [] - image_urls: List[str] = [] - if isinstance(entries, list): - for item in entries: - if not isinstance(item, dict): - continue - # 优先取原始URL,其次取页面URL,再次取thumbnail。 - candidate = str(item.get("url") or item.get("webpage_url") or item.get("thumbnail") or "") - if candidate and candidate.startswith("http"): - image_urls.append(candidate) - if image_urls: - return { - "type": "image", - "images": image_urls, - "title": title, - "author": author, - "cover": image_urls[0], - } - - # 视频场景:优先从 formats 里选“有视频轨道”的直链,避免选到纯音频。 - best_url = "" - formats = info.get("formats") or [] - scored_candidates: List[Tuple[int, str]] = [] - if isinstance(formats, list): - for fmt in formats: - if not isinstance(fmt, dict): - continue - fmt_url = str(fmt.get("url") or "") - if not fmt_url or not fmt_url.startswith("http"): - continue - # 必须含视频轨道(vcodec != none),并且协议优先 http/https。 - vcodec = str(fmt.get("vcodec") or "") - protocol = str(fmt.get("protocol") or "") - if vcodec.lower() == "none": - continue - score = 0 - if protocol in ("https", "http"): - score += 50 - # 优先高分辨率与高码率。 - score += int(fmt.get("height") or 0) - score += int(fmt.get("tbr") or 0) // 10 - scored_candidates.append((score, fmt_url)) - if scored_candidates: - scored_candidates.sort(key=lambda x: x[0], reverse=True) - best_url = scored_candidates[0][1] - - # 部分站点会直接在顶层给 url 字段,作为兜底读取。 - if not best_url: - fallback_url = str(info.get("url") or "") - if fallback_url.startswith("http"): - best_url = fallback_url - - if best_url: - return { - "type": "video", - "url": best_url, - "title": title, - "author": author, - "cover": cover, - } - return None - def _append_title_to_image(self, image_bytes: bytes, title: str) -> bytes: """ 将标题绘制到图片顶部,返回新的图片二进制数据。 diff --git a/requirements.txt b/requirements.txt index 435c556..11e5ff5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -52,4 +52,3 @@ undetected-chromedriver~=3.5.5 urllib3~=2.5.0 websockets~=15.0.1 websocket-client~=1.8.0 -yt-dlp