抖音解析新增本地无水印提取后备链路

变更项:

- 在抖音解析插件中加入三级解析策略:内网接口 -> 外部接口 -> 本地提取后备

- 新增本地提取能力:优先使用 yt_dlp Python 库,失败后自动降级到 yt-dlp 命令行

- 新增 yt-dlp 数据标准化映射,统一输出视频/图集结构(url/images/title/author/cover)

- 新增格式选择策略:优先含视频轨道且 http/https 的高质量链接,降低发送失败概率

- requirements 增加 yt-dlp 依赖,确保后备方案可安装可运行
This commit is contained in:
liuwei
2026-04-23 15:27:00 +08:00
parent c22b4cf055
commit e942ee70ed
3 changed files with 174 additions and 2 deletions

View File

@@ -1,6 +1,9 @@
import os
import re
import time
import json
import shutil
import subprocess
import traceback
import requests
import io
@@ -231,12 +234,23 @@ class DouyinParserPlugin(MessagePluginInterface):
def _parse_douyin(self, url: str) -> Dict[str, Any]:
try:
clean_url = self._clean_url(url)
# 第一优先级:你现有的内网解析服务,速度快、稳定性高,优先命中。
primary = self._parse_from_internal_api(clean_url)
if primary and (primary.get('url') or primary.get('images')):
return self._clean_response_data(primary)
# 第二优先级:你现有的外部付费接口,作为内网服务不可用时的兜底。
secondary = self._parse_from_external_api(clean_url)
if secondary and secondary.get('url'):
if secondary and (secondary.get('url') or secondary.get('images')):
return self._clean_response_data(secondary)
# 第三优先级本地提取后备方案无需依赖远端解析API
# 1) 优先走 yt_dlp Python 库;
# 2) 若库不可用,再尝试系统已安装的 yt-dlp 命令行。
# 这样当接口异常/限流时,仍可在本机直接提取无水印直链和元数据。
local_fallback = self._parse_from_local_extractor(clean_url)
if local_fallback and (local_fallback.get('url') or local_fallback.get('images')):
return self._clean_response_data(local_fallback)
raise DouyinParserError("未获取到有效媒资数据")
except Exception as e:
self.LOG.error(f"[抖音] 解析过程发生未知错误: {str(e)}\n{traceback.format_exc()}")
@@ -486,6 +500,162 @@ class DouyinParserPlugin(MessagePluginInterface):
except Exception:
return None
def _parse_from_local_extractor(self, clean_url: str) -> Optional[Dict[str, Any]]:
    """
    Local extraction fallback, used when the remote parsing APIs yield nothing.

    Strategy:
      1) try the yt_dlp Python package first (no extra process);
      2) if that returns nothing OR raises, fall back to the yt-dlp
         command-line executable;
      3) normalize whatever was extracted into the plugin's media_info dict.

    Args:
        clean_url: The already-cleaned share URL to extract.

    Returns:
        The normalized media_info dict, or None when neither extractor
        produced usable data. This method never raises; failures are logged
        as warnings.
    """
    info = None
    try:
        info = self._extract_with_yt_dlp_python(clean_url)
    except Exception as e:
        # A failure inside the library must not prevent the CLI attempt;
        # otherwise the second fallback tier would be unreachable whenever
        # the package is installed but extraction fails.
        self.LOG.warning(f"[抖音] yt_dlp Python 库提取失败,尝试命令行: {e}")

    try:
        if not info:
            info = self._extract_with_yt_dlp_cli(clean_url)
        if not info:
            return None
        return self._normalize_yt_dlp_info(info)
    except Exception as e:
        self.LOG.warning(f"[抖音] 本地提取后备失败: {e}")
        return None
def _extract_with_yt_dlp_python(self, clean_url: str) -> Optional[Dict[str, Any]]:
    """
    Extract metadata for ``clean_url`` through the yt_dlp Python package.

    Nothing is downloaded: ``skip_download`` is set in the options and
    ``extract_info`` is called with ``download=False``. The proxy comes from
    ``self.http_proxy`` (an empty value is passed as None).

    Returns:
        The dict returned by ``extract_info``, or None when yt_dlp cannot be
        imported or the result is not a dict.
    """
    try:
        import yt_dlp  # type: ignore
    except Exception:
        # Package missing or unusable: signal "no result" to the caller.
        return None

    options = dict(
        quiet=True,
        no_warnings=True,
        skip_download=True,
        proxy=self.http_proxy or None,
        nocheckcertificate=True,
    )
    with yt_dlp.YoutubeDL(options) as downloader:
        extracted = downloader.extract_info(clean_url, download=False)
    return extracted if isinstance(extracted, dict) else None
def _extract_with_yt_dlp_cli(self, clean_url: str) -> Optional[Dict[str, Any]]:
    """
    Extract metadata for ``clean_url`` by running the yt-dlp executable.

    Intended for environments where the yt_dlp Python package is not
    installed but a ``yt-dlp`` binary is available on PATH.

    Args:
        clean_url: The URL handed to yt-dlp as its single positional argument.

    Returns:
        The parsed JSON object printed by ``yt-dlp -J``, or None when the
        executable is not found, exits non-zero, or prints something that is
        not a JSON object.

    Raises:
        subprocess.TimeoutExpired: if yt-dlp runs longer than 25 seconds.
    """
    yt_dlp_bin = shutil.which("yt-dlp")
    if not yt_dlp_bin:
        return None

    cmd = [yt_dlp_bin, "-J", "--no-warnings", "--skip-download"]
    if self.http_proxy:
        cmd.extend(["--proxy", self.http_proxy])
    # "--" ends option parsing, so a URL that happens to start with "-"
    # can never be interpreted as a yt-dlp option. All real options
    # therefore have to be placed before it.
    cmd.extend(["--", clean_url])

    result = subprocess.run(cmd, capture_output=True, text=True, timeout=25)
    if result.returncode != 0:
        self.LOG.warning(f"[抖音] yt-dlp 命令行提取失败: code={result.returncode}, err={result.stderr[:200]}")
        return None

    try:
        data = json.loads(result.stdout or "{}")
    except ValueError:
        # json.JSONDecodeError is a ValueError: output was not valid JSON.
        return None
    return data if isinstance(data, dict) else None
def _normalize_yt_dlp_info(self, info: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """
    Map a raw yt-dlp info dict onto the plugin's internal media_info layout.

    Output shapes:
      - video: {"type": "video", "url", "title", "author", "cover"}
      - image set: {"type": "image", "images": [...], "title", "author", "cover"}

    Args:
        info: The dict produced by yt-dlp (Python API or ``-J`` output).

    Returns:
        A media_info dict, or None when no usable http(s) media URL is found.
    """

    def _as_int(value: Any) -> int:
        # Tolerate missing or malformed numeric fields so that one bad
        # format entry cannot abort the whole selection.
        try:
            return int(value or 0)
        except (TypeError, ValueError, OverflowError):
            return 0

    # Title and author: take the first non-empty field, else a placeholder.
    title = str(info.get("description") or info.get("title") or "无标题")
    author = str(info.get("uploader") or info.get("creator") or info.get("channel") or "未知作者")

    # Cover: the "thumbnail" field, else the last entry of "thumbnails".
    cover = str(info.get("thumbnail") or "")
    if not cover:
        thumbs = info.get("thumbnails") or []
        if isinstance(thumbs, list) and thumbs:
            last = thumbs[-1] if isinstance(thumbs[-1], dict) else {}
            cover = str(last.get("url") or "")

    # Image-set case: a playlist whose entries each contribute one URL.
    if info.get("_type") == "playlist":
        entries = info.get("entries") or []
        image_urls: List[str] = []
        if isinstance(entries, list):
            for item in entries:
                if not isinstance(item, dict):
                    continue
                # Preference order: url, then webpage_url, then thumbnail.
                candidate = str(item.get("url") or item.get("webpage_url") or item.get("thumbnail") or "")
                if candidate and candidate.startswith("http"):
                    image_urls.append(candidate)
        if image_urls:
            return {
                "type": "image",
                "images": image_urls,
                "title": title,
                "author": author,
                "cover": image_urls[0],
            }

    # Video case: pick the best format that carries a video track.
    best_url = ""
    best_rank: Optional[Tuple[bool, int]] = None
    formats = info.get("formats") or []
    if isinstance(formats, list):
        for fmt in formats:
            if not isinstance(fmt, dict):
                continue
            fmt_url = str(fmt.get("url") or "")
            if not fmt_url or not fmt_url.startswith("http"):
                continue
            # vcodec == "none" marks an audio-only format.
            if str(fmt.get("vcodec") or "").lower() == "none":
                continue
            is_direct_http = str(fmt.get("protocol") or "") in ("https", "http")
            quality = _as_int(fmt.get("height")) + _as_int(fmt.get("tbr")) // 10
            # Protocol is compared first so a plain http(s) link always
            # beats a streaming-manifest URL regardless of resolution;
            # quality only breaks ties within the same protocol class.
            rank = (is_direct_http, quality)
            # Strict ">" keeps the earliest format among equal ranks.
            if best_rank is None or rank > best_rank:
                best_rank = rank
                best_url = fmt_url

    # Some extractors expose the media URL only at the top level.
    if not best_url:
        fallback_url = str(info.get("url") or "")
        if fallback_url.startswith("http"):
            best_url = fallback_url

    if best_url:
        return {
            "type": "video",
            "url": best_url,
            "title": title,
            "author": author,
            "cover": cover,
        }
    return None
def _append_title_to_image(self, image_bytes: bytes, title: str) -> bytes:
"""
将标题绘制到图片顶部,返回新的图片二进制数据。