抖音解析改为本地页面优先

1. 参考外部 DouyinParser 项目,新增基于分享页 HTML 和 _ROUTER_DATA 的本地解析链路。
2. 抖音解析现在按本地页面解析 -> 原内网接口 -> 原外部接口的顺序依次兜底。
3. 放宽链接匹配范围到 douyin.com / iesdouyin.com,并新增本地解析超时配置项。
This commit is contained in:
Liu
2026-05-01 11:49:46 +08:00
parent c0a6ee6c21
commit 6e0483a49e
2 changed files with 257 additions and 8 deletions

View File

@@ -4,6 +4,12 @@ enable = true
# 发送模式: card(发送卡片) 或 file(下载并发送文件) # 发送模式: card(发送卡片) 或 file(下载并发送文件)
download_mode = "card" download_mode = "card"
# 本地页面解析超时(秒):
# 1. 现在抖音解析会优先直接请求分享页并从 HTML 中提取作品数据;
# 2. 如果本地网络偶尔较慢,可以适当调大这个值;
# 3. 本地解析超时或失败后,插件仍会继续走原来的内网接口和外部接口兜底。
local_parse_timeout_seconds = 12
# Http代理设置用于获取真实链接发送卡片如果家里有ipv6可以设置为空 # Http代理设置用于获取真实链接发送卡片如果家里有ipv6可以设置为空
# 格式: http://用户名:密码@代理地址:代理端口 # 格式: http://用户名:密码@代理地址:代理端口
# 例如http://127.0.0.1:7890 # 例如http://127.0.0.1:7890

View File

@@ -2,6 +2,8 @@ import os
import re import re
import time import time
import traceback import traceback
import html
import json
import requests import requests
import io import io
from typing import Dict, Any, List, Optional, Tuple from typing import Dict, Any, List, Optional, Tuple
@@ -31,6 +33,17 @@ class DouyinParserPlugin(MessagePluginInterface):
# 功能权限常量 # 功能权限常量
FEATURE_KEY = "DOUYIN_PARSER" FEATURE_KEY = "DOUYIN_PARSER"
FEATURE_DESCRIPTION = "🎵 抖音解析功能 [自动解析抖音链接]" FEATURE_DESCRIPTION = "🎵 抖音解析功能 [自动解析抖音链接]"
# Following the local-parsing reference project, link matching is widened to
# douyin.com / iesdouyin.com (including any of their subdomains):
# 1. Previously only `v.douyin.com` short links matched, so long links that
#    users forwarded directly were never picked up;
# 2. Local page parsing works on the real share-page HTML, so long links belong
#    to the same entry point;
# 3. The URL is extracted first and cleaned afterwards, so trailing punctuation
#    is not carried into the request.
# The host is anchored on purpose: the pattern must start at the scheme, may
# contain subdomain labels only, and must be followed by an optional numeric
# port and then a non-hostname character. This rejects look-alikes such as
# `notdouyin.com`, `douyin.com.evil.example`, `douyin.com@evil.example` and
# URLs that merely mention "douyin.com" in their path or query, which matters
# because the matched URL is later fetched with the configured request headers.
DOUYIN_URL_RE = re.compile(
    r'https?://(?:[A-Za-z0-9-]+\.)*(?:ies)?douyin\.com'
    r'(?::\d+)?(?![A-Za-z0-9.@:-])[^\s<>"]*'
)
# The reference project reads `loaderData -> videoInfoRes -> item_list[0]` out of
# `window._ROUTER_DATA` first. According to the original author:
# 1. it is the most stable structured first-screen payload on the share page;
# 2. it covers both video posts and image posts;
# 3. a hit lets the plugin skip the external interfaces entirely.
ROUTER_DATA_RE = re.compile(r"window\._ROUTER_DATA\s*=\s*({.*?})\s*</script>", re.S)
# Fallback for older page layouts: captures the raw contents of
# `play_addr.url_list` (everything between the square brackets).
LEGACY_PLAY_ADDR_RE = re.compile(r'"play_addr":\s*{\s*"uri":\s*"[^"]*",\s*"url_list":\s*\[([^\]]*)\]')
@property @property
def name(self) -> str: def name(self) -> str:
@@ -67,7 +80,7 @@ class DouyinParserPlugin(MessagePluginInterface):
def __init__(self): def __init__(self):
super().__init__() super().__init__()
self.LOG = logger self.LOG = logger
self.url_pattern = re.compile(r'https?://v\.douyin\.com/[^\s/]+/?') self.url_pattern = self.DOUYIN_URL_RE
# 注册功能权限 # 注册功能权限
self.feature = self.register_feature() self.feature = self.register_feature()
# 修改为使用插件目录下的down_load_dir文件夹 # 修改为使用插件目录下的down_load_dir文件夹
@@ -95,6 +108,11 @@ class DouyinParserPlugin(MessagePluginInterface):
self.cookie = douyin_config.get("cookie", "") or "" self.cookie = douyin_config.get("cookie", "") or ""
self.cookie_file = douyin_config.get("cookie_file", "") or "" self.cookie_file = douyin_config.get("cookie_file", "") or ""
self.download_mode = douyin_config.get("download_mode", "card") # card或file self.download_mode = douyin_config.get("download_mode", "card") # card或file
# 本地页面解析走真实抖音分享页,网络链路通常比内网接口更长一些:
# 1. 这里单独给一个本地解析超时,避免抖音页面偶发慢响应时无限挂起;
# 2. 超时只作用于“本地 HTML 解析优先链路”,不会改变后续旧接口的既有配置;
# 3. 若后续你觉得本地网络较慢,只需要改配置即可,不必再动代码。
self.local_parse_timeout_seconds = max(int(douyin_config.get("local_parse_timeout_seconds", 12) or 12), 5)
self.LOG.debug(f"[{self.name}] 插件初始化完成,代理设置: {self.http_proxy}") self.LOG.debug(f"[{self.name}] 插件初始化完成,代理设置: {self.http_proxy}")
return True return True
@@ -118,8 +136,7 @@ class DouyinParserPlugin(MessagePluginInterface):
if message.get("type") != MessageType.TEXT: if message.get("type") != MessageType.TEXT:
return False return False
content = str(message.get("content", "")).strip() content = str(message.get("content", "")).strip()
match = self.url_pattern.search(content) return self._extract_douyin_url(content) is not None
return match is not None
@plugin_stats_decorator(plugin_name="抖音解析") @plugin_stats_decorator(plugin_name="抖音解析")
async def process_message(self, message: Dict[str, Any]) -> Tuple[bool, Optional[str]]: async def process_message(self, message: Dict[str, Any]) -> Tuple[bool, Optional[str]]:
@@ -136,11 +153,10 @@ class DouyinParserPlugin(MessagePluginInterface):
return False, "没有权限" return False, "没有权限"
try: try:
match = self.url_pattern.search(content) original_url = self._extract_douyin_url(content)
if not match: if not original_url:
return False, "未找到抖音链接" return False, "未找到抖音链接"
original_url = self._clean_url(match.group(0))
self.LOG.info(f"发现抖音链接: {original_url}") self.LOG.info(f"发现抖音链接: {original_url}")
media_info = self._parse_douyin(original_url) media_info = self._parse_douyin(original_url)
@@ -216,6 +232,22 @@ class DouyinParserPlugin(MessagePluginInterface):
self.LOG.debug(f"[抖音] 清理后的URL: {cleaned_url}") self.LOG.debug(f"[抖音] 清理后的URL: {cleaned_url}")
return cleaned_url return cleaned_url
def _extract_douyin_url(self, content: str) -> Optional[str]:
"""从消息文本中提取第一条抖音链接。
这里参考外部项目的做法,把句尾常见中文标点一并裁掉:
1. 用户经常直接把“复制打开抖音……”整段文案贴进群里;
2. 链接后面常跟着 `,。!?)` 这类符号,若不清洗会导致请求 404 或跳错页;
3. 抽取逻辑统一收口后,`can_process` 和 `process_message` 可以复用同一套结果。
"""
text = str(content or "").strip()
if not text:
return None
match = self.url_pattern.search(text)
if not match:
return None
return self._clean_url(match.group(0).rstrip(",。,.!?)"))
def _clean_response_data(self, data: Dict[str, Any]) -> Dict[str, Any]: def _clean_response_data(self, data: Dict[str, Any]) -> Dict[str, Any]:
"""清理响应数据""" """清理响应数据"""
if not data: if not data:
@@ -237,12 +269,20 @@ class DouyinParserPlugin(MessagePluginInterface):
def _parse_douyin(self, url: str) -> Dict[str, Any]: def _parse_douyin(self, url: str) -> Dict[str, Any]:
try: try:
clean_url = self._clean_url(url) clean_url = self._clean_url(url)
# 第一优先级:本地业务解析服务(内网),该链路与你指定的项目实现思路最接近,稳定性最高 # 第一优先级:本地页面解析
# 1. 参考你给的 DouyinParser 项目,先直接请求分享页并解析 HTML 中的 `_ROUTER_DATA`
# 2. 这样成功时完全不依赖第三方解析 API也更符合“本地优先”的目标
# 3. 只有页面结构变化或网络异常时,才继续走你原来的内网接口和外部接口兜底。
local_primary = self._parse_from_local_page(clean_url)
if local_primary and (local_primary.get('url') or local_primary.get('images')):
return self._clean_response_data(local_primary)
# 第二优先级:保留原有本地业务解析服务(内网)。
primary = self._parse_from_internal_api(clean_url) primary = self._parse_from_internal_api(clean_url)
if primary and (primary.get('url') or primary.get('images')): if primary and (primary.get('url') or primary.get('images')):
return self._clean_response_data(primary) return self._clean_response_data(primary)
# 第二优先级:外部接口兜底。 # 第三优先级:外部接口兜底。
secondary = self._parse_from_external_api(clean_url) secondary = self._parse_from_external_api(clean_url)
if secondary and (secondary.get('url') or secondary.get('images')): if secondary and (secondary.get('url') or secondary.get('images')):
return self._clean_response_data(secondary) return self._clean_response_data(secondary)
@@ -276,6 +316,209 @@ class DouyinParserPlugin(MessagePluginInterface):
headers["Cookie"] = self.cookie headers["Cookie"] = self.cookie
return headers return headers
def _build_local_parse_headers(self) -> Dict[str, str]:
"""构建本地页面解析专用请求头。
这里刻意切成移动端 Safari UA原因有三点
1. 抖音分享页在移动端更容易直接返回完整作品页,而不是额外的跳转或限制提示;
2. 参考项目就是用移动端 UA 解析,现成经验已经验证过这条链路更稳;
3. 只在本地 HTML 解析链路生效,不会影响你原来的内网/外部接口调用头。
"""
headers = self._build_request_headers()
headers["User-Agent"] = (
"Mozilla/5.0 (iPhone; CPU iPhone OS 14_0 like Mac OS X) "
"AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1"
)
headers["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
return headers
def _parse_from_local_page(self, clean_url: str) -> Optional[Dict[str, Any]]:
    """Try to resolve a Douyin link by parsing the share page itself.

    Flow:
    1. GET the link (following redirects) with the local-parse headers, the
       configured proxies and ``self.local_parse_timeout_seconds``;
    2. hand the HTML to ``_parse_local_page_html``;
    3. on success, record the final URL under ``"source_url"``.

    This is a best-effort step: any failure returns ``None`` so the caller can
    fall back to the other parsers, but the reason is logged at debug level
    instead of being dropped silently.

    Args:
        clean_url: The already-cleaned share URL.

    Returns:
        The parsed media dict, or ``None`` on any failure.
    """
    try:
        response = requests.get(
            clean_url,
            headers=self._build_local_parse_headers(),
            timeout=self.local_parse_timeout_seconds,
            proxies=self._build_proxies(),
            allow_redirects=True,
        )
        if response.status_code != 200:
            self.LOG.debug(f"[抖音] 本地页面解析失败: HTTP {response.status_code}")
            return None
        html_content = response.text or ""
        if not html_content:
            self.LOG.debug("[抖音] 本地页面解析失败: 页面内容为空")
            return None
        result = self._parse_local_page_html(html_content)
        if result:
            # Keep the post-redirect URL so later steps can see the real page.
            result["source_url"] = str(response.url or clean_url)
        return result
    except Exception as e:
        # Deliberately broad: this path must never break the fallback chain.
        self.LOG.debug(f"[抖音] 本地页面解析异常: {e}")
        return None
def _parse_local_page_html(self, html_content: str) -> Optional[Dict[str, Any]]:
"""解析抖音分享页 HTML。"""
item = self._extract_aweme_item_from_router_data(html_content)
if item:
note_result = self._parse_local_note_item(item)
if note_result:
return note_result
video_result = self._parse_local_video_item(item)
if video_result:
return video_result
return self._parse_local_legacy_video(html_content)
def _extract_aweme_item_from_router_data(self, html_content: str) -> Optional[Dict[str, Any]]:
"""从 `_ROUTER_DATA` 中抽出作品主数据节点。"""
match = self.ROUTER_DATA_RE.search(html_content or "")
if not match:
return None
try:
router_data = json.loads(match.group(1))
except json.JSONDecodeError as e:
self.LOG.debug(f"[抖音] 解析 _ROUTER_DATA 失败: {e}")
return None
loader_data = router_data.get("loaderData")
if not isinstance(loader_data, dict):
return None
for page_data in loader_data.values():
if not isinstance(page_data, dict):
continue
video_info = page_data.get("videoInfoRes")
if not isinstance(video_info, dict):
continue
item_list = video_info.get("item_list")
if isinstance(item_list, list) and item_list and isinstance(item_list[0], dict):
return item_list[0]
return None
def _parse_local_note_item(self, item: Dict[str, Any]) -> Optional[Dict[str, Any]]:
"""解析图文作品。"""
image_url_groups = self._pick_local_image_url_groups(item)
if not image_url_groups:
return None
desc = self._clean_local_text(item.get("desc"))
author = self._clean_local_text((item.get("author") or {}).get("nickname"))
images = [group[0] for group in image_url_groups if group]
if not images:
return None
return {
"type": "image",
"images": images,
"image_url_groups": image_url_groups,
"title": desc,
"author": author,
"cover": images[0],
}
def _pick_local_image_url_groups(self, item: Dict[str, Any]) -> List[List[str]]:
"""从图文作品中提取每一张图的候选地址列表。"""
image_url_groups: List[List[str]] = []
seen_groups = set()
for image_info in item.get("images") or item.get("image_infos") or []:
if not isinstance(image_info, dict):
continue
candidates: List[str] = []
seen_urls = set()
for image_url in image_info.get("url_list") or []:
if not isinstance(image_url, str) or not image_url.startswith("http"):
continue
decoded_url = self._decode_local_value(image_url)
if decoded_url in seen_urls:
continue
candidates.append(decoded_url)
seen_urls.add(decoded_url)
group_key = tuple(candidates)
if candidates and group_key not in seen_groups:
image_url_groups.append(candidates)
seen_groups.add(group_key)
return image_url_groups
def _parse_local_video_item(self, item: Dict[str, Any]) -> Optional[Dict[str, Any]]:
"""解析视频作品。"""
video = item.get("video")
if not isinstance(video, dict):
return None
if int(video.get("duration") or 1) == 0:
return None
play_addr = video.get("play_addr") or {}
urls = play_addr.get("url_list") or []
cleaned_urls = [self._decode_local_value(url).replace("playwm", "play") for url in urls if isinstance(url, str) and url]
video_url = self._prefer_v3_v10(cleaned_urls)
if not video_url:
return None
cover = video.get("cover") or {}
cover_urls = cover.get("url_list") or []
cover_url = self._decode_local_value(cover_urls[0]) if cover_urls else ""
return {
"type": "video",
"url": video_url,
"title": self._clean_local_text(item.get("desc")),
"author": self._clean_local_text((item.get("author") or {}).get("nickname")),
"cover": cover_url,
}
def _parse_local_legacy_video(self, html_content: str) -> Optional[Dict[str, Any]]:
"""旧版页面结构兜底:直接从 HTML 里正则抽 `play_addr.url_list`。"""
match = self.LEGACY_PLAY_ADDR_RE.search(html_content or "")
if not match:
return None
raw_urls = [url.strip().strip('"') for url in match.group(1).split(",")]
cleaned_urls = [self._decode_local_value(url).replace("playwm", "play") for url in raw_urls if url]
video_url = self._prefer_v3_v10(cleaned_urls)
if not video_url:
return None
title = self._match_local_json_string(html_content, "desc")
author = self._match_local_json_string(html_content, "nickname")
cover_match = re.search(r'"cover":\s*{\s*"url_list":\s*\[\s*"([^"]+)"', html_content or "")
cover_url = self._decode_local_value(cover_match.group(1)) if cover_match else ""
return {
"type": "video",
"url": video_url,
"title": title,
"author": author,
"cover": cover_url,
}
def _match_local_json_string(self, text: str, key: str) -> str:
"""从页面原始 JSON 片段中提取单个字符串字段。"""
match = re.search(rf'"{re.escape(key)}":\s*"([^"]*)"', text or "")
if not match:
return ""
return self._clean_local_text(self._decode_local_value(match.group(1)))
def _decode_local_value(self, value: str) -> str:
"""解码 HTML 实体和 `\\uXXXX` 形式的转义文本。"""
text = str(value or "")
try:
text = text.encode("utf-8").decode("unicode_escape")
except Exception:
pass
return html.unescape(text)
def _clean_local_text(self, value: Any) -> str:
"""清洗页面里读出来的标题、作者等文本字段。"""
if value is None:
return ""
return html.unescape(str(value)).strip()
def _parse_from_internal_api(self, clean_url: str) -> Optional[Dict[str, Any]]: def _parse_from_internal_api(self, clean_url: str) -> Optional[Dict[str, Any]]:
try: try:
endpoint = "http://192.168.2.32:8999/api/hybrid/video_data" endpoint = "http://192.168.2.32:8999/api/hybrid/video_data"