From e942ee70ed9749a3f7fcc03d48939bb89e003cdd Mon Sep 17 00:00:00 2001
From: liuwei
Date: Thu, 23 Apr 2026 15:27:00 +0800
Subject: [PATCH] =?UTF-8?q?=E6=8A=96=E9=9F=B3=E8=A7=A3=E6=9E=90=E6=96=B0?=
 =?UTF-8?q?=E5=A2=9E=E6=9C=AC=E5=9C=B0=E6=97=A0=E6=B0=B4=E5=8D=B0=E6=8F=90?=
 =?UTF-8?q?=E5=8F=96=E5=90=8E=E5=A4=87=E9=93=BE=E8=B7=AF?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

变更项:
- 在抖音解析插件中加入三级解析策略:内网接口 -> 外部接口 -> 本地提取后备
- 新增本地提取能力:优先使用 yt_dlp Python 库,失败后自动降级到 yt-dlp 命令行
- 新增 yt-dlp 数据标准化映射,统一输出视频/图集结构(url/images/title/author/cover)
- 新增格式选择策略:优先含视频轨道且 http/https 的高质量链接,降低发送失败概率
- requirements 增加 yt-dlp 依赖,确保后备方案可安装可运行
---
 plugins/douyin_parser/main.py       | 178 +++++++++++++++++++++++++++-
 requirements.txt                    |   3 +-
 temp/ext/Douyin_TikTok_Download_API |   1 +
 3 files changed, 180 insertions(+), 2 deletions(-)
 create mode 160000 temp/ext/Douyin_TikTok_Download_API

diff --git a/plugins/douyin_parser/main.py b/plugins/douyin_parser/main.py
index fac1aab..fb3b73a 100644
--- a/plugins/douyin_parser/main.py
+++ b/plugins/douyin_parser/main.py
@@ -1,6 +1,9 @@
 import os
 import re
 import time
+import json
+import shutil
+import subprocess
 import traceback
 import requests
 import io
@@ -231,12 +234,23 @@ class DouyinParserPlugin(MessagePluginInterface):
     def _parse_douyin(self, url: str) -> Dict[str, Any]:
         try:
             clean_url = self._clean_url(url)
+            # Priority 1: the internal parsing service (fast and stable), tried first.
             primary = self._parse_from_internal_api(clean_url)
             if primary and (primary.get('url') or primary.get('images')):
                 return self._clean_response_data(primary)
+
+            # Priority 2: the external paid API, used when the internal service yields nothing.
             secondary = self._parse_from_external_api(clean_url)
-            if secondary and secondary.get('url'):
+            if secondary and (secondary.get('url') or secondary.get('images')):
                 return self._clean_response_data(secondary)
+
+            # Priority 3: local extraction fallback (no remote parsing API needed):
+            # 1) try the yt_dlp Python library first;
+            # 2) if it is unavailable or fails, try the installed yt-dlp CLI.
+            # This still yields direct links and metadata when the APIs fail or throttle.
+            local_fallback = self._parse_from_local_extractor(clean_url)
+            if local_fallback and (local_fallback.get('url') or local_fallback.get('images')):
+                return self._clean_response_data(local_fallback)
             raise DouyinParserError("未获取到有效媒资数据")
         except Exception as e:
             self.LOG.error(f"[抖音] 解析过程发生未知错误: {str(e)}\n{traceback.format_exc()}")
@@ -486,6 +500,168 @@ class DouyinParserPlugin(MessagePluginInterface):
         except Exception:
             return None
 
+    def _parse_from_local_extractor(self, clean_url: str) -> Optional[Dict[str, Any]]:
+        """
+        Local extraction fallback, used when the parsing APIs yield nothing.
+
+        Design goals:
+        1) no dependency on the self-hosted / third-party parsing APIs;
+        2) prefer the in-process yt_dlp library to avoid spawning a process;
+        3) fall back to the yt-dlp CLI when the library is missing or fails.
+        """
+        try:
+            info = self._extract_with_yt_dlp_python(clean_url)
+            if not info:
+                info = self._extract_with_yt_dlp_cli(clean_url)
+            if not info:
+                return None
+            return self._normalize_yt_dlp_info(info)
+        except Exception as e:
+            self.LOG.warning(f"[抖音] 本地提取后备失败: {e}")
+            return None
+
+    def _extract_with_yt_dlp_python(self, clean_url: str) -> Optional[Dict[str, Any]]:
+        """
+        Extract media info with the yt_dlp Python library.
+
+        Notes:
+        - metadata and direct links only; nothing is downloaded;
+        - returns None when the library is missing or extraction fails.
+        """
+        try:
+            import yt_dlp  # type: ignore
+        except Exception:
+            return None
+
+        ydl_opts = {
+            "quiet": True,
+            "no_warnings": True,
+            "skip_download": True,
+            "proxy": self.http_proxy or None,
+            # NOTE(review): TLS verification is disabled here; confirm intended.
+            "nocheckcertificate": True,
+        }
+        try:
+            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+                info = ydl.extract_info(clean_url, download=False)
+        except Exception as e:
+            # Report failure as None so the caller still tries the CLI.
+            self.LOG.warning(f"[抖音] yt_dlp 库提取失败: {e}")
+            return None
+        return info if isinstance(info, dict) else None
+
+    def _extract_with_yt_dlp_cli(self, clean_url: str) -> Optional[Dict[str, Any]]:
+        """
+        Extract media info with the yt-dlp command-line tool.
+
+        Used when:
+        - the yt_dlp package gave no result but a yt-dlp executable is on PATH.
+        """
+        yt_dlp_bin = shutil.which("yt-dlp")
+        if not yt_dlp_bin:
+            return None
+        cmd = [yt_dlp_bin, "-J", "--no-warnings", "--skip-download"]
+        if self.http_proxy:
+            cmd.extend(["--proxy", self.http_proxy])
+        # "--" ends option parsing, so the URL can never be read as an option.
+        cmd.extend(["--", clean_url])
+
+        result = subprocess.run(cmd, capture_output=True, text=True, timeout=25)
+        if result.returncode != 0:
+            self.LOG.warning(f"[抖音] yt-dlp 命令行提取失败: code={result.returncode}, err={result.stderr[:200]}")
+            return None
+        try:
+            data = json.loads(result.stdout or "{}")
+            return data if isinstance(data, dict) else None
+        except Exception:
+            return None
+
+    def _normalize_yt_dlp_info(self, info: Dict[str, Any]) -> Optional[Dict[str, Any]]:
+        """
+        Map the raw yt-dlp info dict onto the plugin's media_info structure.
+
+        Result shapes (None when no usable http(s) link is found):
+        - video: {"type":"video","url","title","author","cover"}
+        - image set: {"type":"image","images":[],"title","author","cover"}
+        """
+        # Title and author: take the first non-empty field, else a placeholder.
+        title = str(info.get("description") or info.get("title") or "无标题")
+        author = str(info.get("uploader") or info.get("creator") or info.get("channel") or "未知作者")
+
+        # Cover image:
+        # 1) the "thumbnail" field;
+        # 2) otherwise the last "thumbnails" entry (usually the largest one).
+        cover = str(info.get("thumbnail") or "")
+        if not cover:
+            thumbs = info.get("thumbnails") or []
+            if isinstance(thumbs, list) and thumbs:
+                last = thumbs[-1] if isinstance(thumbs[-1], dict) else {}
+                cover = str(last.get("url") or "")
+
+        # Image-set case: a playlist result is treated as a list of image entries.
+        if info.get("_type") == "playlist":
+            entries = info.get("entries") or []
+            image_urls: List[str] = []
+            if isinstance(entries, list):
+                for item in entries:
+                    if not isinstance(item, dict):
+                        continue
+                    # Prefer the raw URL, then the page URL, then the thumbnail.
+                    candidate = str(item.get("url") or item.get("webpage_url") or item.get("thumbnail") or "")
+                    if candidate and candidate.startswith("http"):
+                        image_urls.append(candidate)
+            if image_urls:
+                return {
+                    "type": "image",
+                    "images": image_urls,
+                    "title": title,
+                    "author": author,
+                    "cover": image_urls[0],
+                }
+
+        # Video case: pick a link from "formats" that has a video track (no audio-only).
+        best_url = ""
+        formats = info.get("formats") or []
+        scored_candidates: List[Tuple[int, str]] = []
+        if isinstance(formats, list):
+            for fmt in formats:
+                if not isinstance(fmt, dict):
+                    continue
+                fmt_url = str(fmt.get("url") or "")
+                if not fmt_url or not fmt_url.startswith("http"):
+                    continue
+                # A video track is required (vcodec != none); http/https protocols score higher.
+                vcodec = str(fmt.get("vcodec") or "")
+                protocol = str(fmt.get("protocol") or "")
+                if vcodec.lower() == "none":
+                    continue
+                score = 0
+                if protocol in ("https", "http"):
+                    score += 50
+                # Prefer higher resolution and higher bitrate.
+                score += int(fmt.get("height") or 0)
+                score += int(fmt.get("tbr") or 0) // 10
+                scored_candidates.append((score, fmt_url))
+        if scored_candidates:
+            scored_candidates.sort(key=lambda x: x[0], reverse=True)
+            best_url = scored_candidates[0][1]
+
+        # Last resort: some results carry the link in a top-level "url" field.
+        if not best_url:
+            fallback_url = str(info.get("url") or "")
+            if fallback_url.startswith("http"):
+                best_url = fallback_url
+
+        if best_url:
+            return {
+                "type": "video",
+                "url": best_url,
+                "title": title,
+                "author": author,
+                "cover": cover,
+            }
+        return None
+
     def _append_title_to_image(self, image_bytes: bytes, title: str) -> bytes:
         """
         将标题绘制到图片顶部,返回新的图片二进制数据。
diff --git a/requirements.txt b/requirements.txt
index 1cd0876..435c556 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -51,4 +51,5 @@ aiofiles~=24.1.0
 undetected-chromedriver~=3.5.5
 urllib3~=2.5.0
 websockets~=15.0.1
-websocket-client~=1.8.0
\ No newline at end of file
+websocket-client~=1.8.0
+yt-dlp
diff --git a/temp/ext/Douyin_TikTok_Download_API b/temp/ext/Douyin_TikTok_Download_API
new file mode 160000
index 0000000..42784ff
--- /dev/null
+++ b/temp/ext/Douyin_TikTok_Download_API
@@ -0,0 +1 @@
+Subproject commit 42784ffc83a72a516bfe952153ad7e2a3998d16c