抖音解析改为本地页面优先
1. 参考外部 DouyinParser 项目,新增基于分享页 HTML 和 _ROUTER_DATA 的本地解析链路。 2. 抖音解析现在按本地页面解析 -> 原内网接口 -> 原外部接口的顺序依次兜底。 3. 放宽链接匹配范围到 douyin.com / iesdouyin.com,并新增本地解析超时配置项。
This commit is contained in:
@@ -4,6 +4,12 @@ enable = true
|
||||
# 发送模式: card(发送卡片) 或 file(下载并发送文件)
|
||||
download_mode = "card"
|
||||
|
||||
# 本地页面解析超时(秒):
|
||||
# 1. 现在抖音解析会优先直接请求分享页并从 HTML 中提取作品数据;
|
||||
# 2. 如果本地网络偶尔较慢,可以适当调大这个值;
|
||||
# 3. 本地解析超时或失败后,插件仍会继续走原来的内网接口和外部接口兜底。
|
||||
local_parse_timeout_seconds = 12
|
||||
|
||||
# Http代理设置(用于获取真实链接发送卡片,如果家里有ipv6,可以设置为空)
|
||||
# 格式: http://用户名:密码@代理地址:代理端口
|
||||
# 例如:http://127.0.0.1:7890
|
||||
|
||||
@@ -2,6 +2,8 @@ import os
|
||||
import re
|
||||
import time
|
||||
import traceback
|
||||
import html
|
||||
import json
|
||||
import requests
|
||||
import io
|
||||
from typing import Dict, Any, List, Optional, Tuple
|
||||
@@ -31,6 +33,17 @@ class DouyinParserPlugin(MessagePluginInterface):
|
||||
# 功能权限常量
|
||||
FEATURE_KEY = "DOUYIN_PARSER"
|
||||
FEATURE_DESCRIPTION = "🎵 抖音解析功能 [自动解析抖音链接]"
|
||||
# 参考本地解析项目,把链接匹配范围放宽到 douyin.com / iesdouyin.com:
|
||||
# 1. 原来只匹配 `v.douyin.com` 短链,用户直接转发长链时插件不会命中;
|
||||
# 2. 本地页面解析本身就是基于真实分享页 HTML,因此长链也应该纳入同一套入口;
|
||||
# 3. 这里统一抽 URL 后再做清洗,避免句尾标点被误带入请求。
|
||||
# Match only URLs whose *host* is douyin.com / iesdouyin.com or one of their
# subdomains. The host is anchored directly after the scheme so that:
#   * bare `https://douyin.com/...` links are accepted (the previous pattern
#     demanded at least one extra character before "douyin.com");
#   * unrelated hosts that merely mention "douyin.com" in their path or query,
#     or use it as a prefix (`douyin.com.example`, `douyin.com@example`), are
#     rejected -- the extracted URL is later requested by the plugin.
# An optional port and an optional path/query/fragment may follow the host.
DOUYIN_URL_RE = re.compile(
    r'https?://(?:[A-Za-z0-9-]+\.)*(?:ies)?douyin\.com'
    r'(?![A-Za-z0-9.@-])(?::\d+)?(?:[/?#][^\s<>"]*)?'
)
|
||||
# 参考项目优先从 `window._ROUTER_DATA` 里拿 `loaderData -> videoInfoRes -> item_list[0]`:
|
||||
# 1. 这是当前抖音分享页里最稳定的一份结构化首屏数据;
|
||||
# 2. 能同时覆盖视频作品和图文作品;
|
||||
# 3. 命中后可以直接绕开外部接口,减少第三方依赖。
|
||||
ROUTER_DATA_RE = re.compile(r"window\._ROUTER_DATA\s*=\s*({.*?})\s*</script>", re.S)
|
||||
LEGACY_PLAY_ADDR_RE = re.compile(r'"play_addr":\s*{\s*"uri":\s*"[^"]*",\s*"url_list":\s*\[([^\]]*)\]')
|
||||
|
||||
@property
|
||||
def name(self) -> str:
|
||||
@@ -67,7 +80,7 @@ class DouyinParserPlugin(MessagePluginInterface):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.LOG = logger
|
||||
self.url_pattern = re.compile(r'https?://v\.douyin\.com/[^\s/]+/?')
|
||||
self.url_pattern = self.DOUYIN_URL_RE
|
||||
# 注册功能权限
|
||||
self.feature = self.register_feature()
|
||||
# 修改为使用插件目录下的down_load_dir文件夹
|
||||
@@ -95,6 +108,11 @@ class DouyinParserPlugin(MessagePluginInterface):
|
||||
self.cookie = douyin_config.get("cookie", "") or ""
|
||||
self.cookie_file = douyin_config.get("cookie_file", "") or ""
|
||||
self.download_mode = douyin_config.get("download_mode", "card") # card或file
|
||||
# 本地页面解析走真实抖音分享页,网络链路通常比内网接口更长一些:
|
||||
# 1. 这里单独给一个本地解析超时,避免抖音页面偶发慢响应时无限挂起;
|
||||
# 2. 超时只作用于“本地 HTML 解析优先链路”,不会改变后续旧接口的既有配置;
|
||||
# 3. 若后续你觉得本地网络较慢,只需要改配置即可,不必再动代码。
|
||||
self.local_parse_timeout_seconds = max(int(douyin_config.get("local_parse_timeout_seconds", 12) or 12), 5)
|
||||
|
||||
self.LOG.debug(f"[{self.name}] 插件初始化完成,代理设置: {self.http_proxy}")
|
||||
return True
|
||||
@@ -118,8 +136,7 @@ class DouyinParserPlugin(MessagePluginInterface):
|
||||
if message.get("type") != MessageType.TEXT:
|
||||
return False
|
||||
content = str(message.get("content", "")).strip()
|
||||
match = self.url_pattern.search(content)
|
||||
return match is not None
|
||||
return self._extract_douyin_url(content) is not None
|
||||
|
||||
@plugin_stats_decorator(plugin_name="抖音解析")
|
||||
async def process_message(self, message: Dict[str, Any]) -> Tuple[bool, Optional[str]]:
|
||||
@@ -136,11 +153,10 @@ class DouyinParserPlugin(MessagePluginInterface):
|
||||
return False, "没有权限"
|
||||
|
||||
try:
|
||||
match = self.url_pattern.search(content)
|
||||
if not match:
|
||||
original_url = self._extract_douyin_url(content)
|
||||
if not original_url:
|
||||
return False, "未找到抖音链接"
|
||||
|
||||
original_url = self._clean_url(match.group(0))
|
||||
self.LOG.info(f"发现抖音链接: {original_url}")
|
||||
|
||||
media_info = self._parse_douyin(original_url)
|
||||
@@ -216,6 +232,22 @@ class DouyinParserPlugin(MessagePluginInterface):
|
||||
self.LOG.debug(f"[抖音] 清理后的URL: {cleaned_url}")
|
||||
return cleaned_url
|
||||
|
||||
def _extract_douyin_url(self, content: str) -> Optional[str]:
    """Return the first Douyin link found in a message, or ``None``.

    The message is coerced to ``str`` and stripped; the first match of
    ``self.url_pattern`` is taken. Trailing ASCII / full-width punctuation
    (comma, full stop, exclamation mark, question mark, closing parenthesis)
    is trimmed from the match before it is handed to ``self._clean_url``, so
    sentence punctuation glued to a pasted link is not sent with the request.

    Args:
        content: Raw message text; falsy values are treated as empty.

    Returns:
        The cleaned URL, or ``None`` when the text is empty or has no match.
    """
    text = str(content or "").strip()
    found = self.url_pattern.search(text) if text else None
    if found is None:
        return None
    trimmed = found.group(0).rstrip(",。,.!!??))")
    return self._clean_url(trimmed)
|
||||
|
||||
def _clean_response_data(self, data: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""清理响应数据"""
|
||||
if not data:
|
||||
@@ -237,12 +269,20 @@ class DouyinParserPlugin(MessagePluginInterface):
|
||||
def _parse_douyin(self, url: str) -> Dict[str, Any]:
|
||||
try:
|
||||
clean_url = self._clean_url(url)
|
||||
# 第一优先级:本地业务解析服务(内网),该链路与你指定的项目实现思路最接近,稳定性最高。
|
||||
# 第一优先级:本地页面解析。
|
||||
# 1. 参考你给的 DouyinParser 项目,先直接请求分享页并解析 HTML 中的 `_ROUTER_DATA`;
|
||||
# 2. 这样成功时完全不依赖第三方解析 API,也更符合“本地优先”的目标;
|
||||
# 3. 只有页面结构变化或网络异常时,才继续走你原来的内网接口和外部接口兜底。
|
||||
local_primary = self._parse_from_local_page(clean_url)
|
||||
if local_primary and (local_primary.get('url') or local_primary.get('images')):
|
||||
return self._clean_response_data(local_primary)
|
||||
|
||||
# 第二优先级:保留原有本地业务解析服务(内网)。
|
||||
primary = self._parse_from_internal_api(clean_url)
|
||||
if primary and (primary.get('url') or primary.get('images')):
|
||||
return self._clean_response_data(primary)
|
||||
|
||||
# 第二优先级:外部接口兜底。
|
||||
# 第三优先级:外部接口兜底。
|
||||
secondary = self._parse_from_external_api(clean_url)
|
||||
if secondary and (secondary.get('url') or secondary.get('images')):
|
||||
return self._clean_response_data(secondary)
|
||||
@@ -276,6 +316,209 @@ class DouyinParserPlugin(MessagePluginInterface):
|
||||
headers["Cookie"] = self.cookie
|
||||
return headers
|
||||
|
||||
def _build_local_parse_headers(self) -> Dict[str, str]:
    """Build the request headers used only by the local share-page parser.

    Starts from ``self._build_request_headers()`` and overrides two entries:
    ``User-Agent`` becomes an iPhone Safari UA and ``Accept`` becomes an
    HTML-first value. Per the original author's note, the mobile UA was
    chosen because the share page is more likely to return the full work
    page for mobile clients -- NOTE(review): not verifiable from this file.

    Returns:
        The header mapping returned by ``_build_request_headers`` with the
        two overrides applied.
    """
    mobile_safari_ua = (
        "Mozilla/5.0 (iPhone; CPU iPhone OS 14_0 like Mac OS X) "
        "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1"
    )
    html_accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"

    headers = self._build_request_headers()
    headers.update({"User-Agent": mobile_safari_ua, "Accept": html_accept})
    return headers
|
||||
|
||||
def _parse_from_local_page(self, clean_url: str) -> Optional[Dict[str, Any]]:
    """Fetch the share page directly and parse the work data out of its HTML.

    Steps:
      1. GET ``clean_url`` (redirects followed) with the local-parse headers,
         the configured proxies and ``self.local_parse_timeout_seconds``.
      2. Hand the HTML to ``self._parse_local_page_html``.
      3. On success, record the final URL after redirects as ``source_url``.

    This is a best-effort step: every failure returns ``None`` so the caller
    can continue with its next parsing strategy. Failures are logged at debug
    level instead of being swallowed silently, so a broken local path can be
    diagnosed.

    Args:
        clean_url: Already-cleaned Douyin share URL.

    Returns:
        The parsed media dict, or ``None`` on any network/parsing failure.
    """
    try:
        response = requests.get(
            clean_url,
            headers=self._build_local_parse_headers(),
            timeout=self.local_parse_timeout_seconds,
            proxies=self._build_proxies(),
            allow_redirects=True,
        )
        if response.status_code != 200:
            self.LOG.debug(f"[抖音] 本地页面解析请求失败,状态码: {response.status_code}")
            return None

        html_content = response.text or ""
        if not html_content:
            self.LOG.debug("[抖音] 本地页面解析失败: 响应内容为空")
            return None

        result = self._parse_local_page_html(html_content)
        if result:
            # Keep the post-redirect URL so later steps can see the real page.
            result["source_url"] = str(response.url or clean_url)
        return result
    except Exception as e:
        # Deliberately broad: this path must never break the fallback chain,
        # but the reason is logged rather than discarded.
        self.LOG.debug(f"[抖音] 本地页面解析异常: {e}")
        return None
|
||||
|
||||
def _parse_local_page_html(self, html_content: str) -> Optional[Dict[str, Any]]:
    """Parse a Douyin share page's HTML into a media dict.

    When a work item can be extracted from ``_ROUTER_DATA``, the image-note
    parser is tried first and the video parser second; the first truthy
    result wins. If no item is found, or neither parser yields a result, the
    legacy ``play_addr`` regex parser is applied to the raw HTML.

    Args:
        html_content: Full HTML of the share page.

    Returns:
        The parsed media dict, or whatever the legacy parser returns
        (``None`` when nothing could be extracted).
    """
    item = self._extract_aweme_item_from_router_data(html_content)
    if item:
        for parse_item in (self._parse_local_note_item, self._parse_local_video_item):
            parsed = parse_item(item)
            if parsed:
                return parsed
    return self._parse_local_legacy_video(html_content)
|
||||
|
||||
def _extract_aweme_item_from_router_data(self, html_content: str) -> Optional[Dict[str, Any]]:
    """Pull the main work item out of the page's ``window._ROUTER_DATA`` blob.

    The blob is located with ``self.ROUTER_DATA_RE`` and decoded as JSON.
    Each value of ``loaderData`` is then inspected for
    ``videoInfoRes -> item_list``; the first list whose first element is a
    dict provides the result.

    Args:
        html_content: Full HTML of the share page (``None`` is treated as "").

    Returns:
        The first work item dict, or ``None`` if the blob is missing, is not
        valid JSON, or has no matching structure.
    """
    found = self.ROUTER_DATA_RE.search(html_content or "")
    if found is None:
        return None

    try:
        router_data = json.loads(found.group(1))
    except json.JSONDecodeError as e:
        self.LOG.debug(f"[抖音] 解析 _ROUTER_DATA 失败: {e}")
        return None

    loader_data = router_data.get("loaderData")
    if not isinstance(loader_data, dict):
        return None

    for page_data in loader_data.values():
        video_info = page_data.get("videoInfoRes") if isinstance(page_data, dict) else None
        item_list = video_info.get("item_list") if isinstance(video_info, dict) else None
        if isinstance(item_list, list) and item_list and isinstance(item_list[0], dict):
            return item_list[0]
    return None
|
||||
|
||||
def _parse_local_note_item(self, item: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """Convert a work item into an image-note result, if it carries images.

    Args:
        item: Work item dict taken from ``_ROUTER_DATA``.

    Returns:
        A dict with ``type="image"``, the first candidate URL of every image
        (``images``), all candidate URLs per image (``image_url_groups``),
        the cleaned ``title`` and ``author``, and the first image as
        ``cover``; ``None`` when the item yields no image URLs.
    """
    groups = self._pick_local_image_url_groups(item)
    if not groups:
        return None

    title = self._clean_local_text(item.get("desc"))
    nickname = self._clean_local_text((item.get("author") or {}).get("nickname"))

    # One URL per picture: the first candidate of each non-empty group.
    first_choices = [candidates[0] for candidates in groups if candidates]
    if not first_choices:
        return None

    return {
        "type": "image",
        "images": first_choices,
        "image_url_groups": groups,
        "title": title,
        "author": nickname,
        "cover": first_choices[0],
    }
|
||||
|
||||
def _pick_local_image_url_groups(self, item: Dict[str, Any]) -> List[List[str]]:
    """Collect, per image of a note, the list of candidate download URLs.

    Reads ``item["images"]`` (falling back to ``item["image_infos"]``). For
    each dict entry, every ``url_list`` string starting with ``http`` is
    decoded with ``self._decode_local_value`` and de-duplicated in order.
    Entries whose candidate list is empty, or identical to one already
    collected, are skipped.

    Args:
        item: Work item dict taken from ``_ROUTER_DATA``.

    Returns:
        A list with one non-empty URL list per distinct image.
    """
    groups: List[List[str]] = []
    seen_keys = set()

    for image_info in item.get("images") or item.get("image_infos") or []:
        if not isinstance(image_info, dict):
            continue

        decoded_urls = (
            self._decode_local_value(raw_url)
            for raw_url in image_info.get("url_list") or []
            if isinstance(raw_url, str) and raw_url.startswith("http")
        )
        # dict.fromkeys keeps the first occurrence of each URL, in order.
        candidates = list(dict.fromkeys(decoded_urls))

        key = tuple(candidates)
        if not candidates or key in seen_keys:
            continue
        groups.append(candidates)
        seen_keys.add(key)

    return groups
|
||||
|
||||
def _parse_local_video_item(self, item: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """Convert a work item into a video result.

    Fixes over the previous version:
      * The zero-duration guard now actually fires. The old expression
        ``int(duration or 1) == 0`` replaced a numeric ``0`` with ``1``
        before comparing, so it could never be true for ``0``.
      * A non-numeric ``duration`` or non-dict ``play_addr`` / ``cover`` /
        ``author`` no longer raises; such values are treated as absent, so
        the caller can still reach its legacy fallback.

    Args:
        item: Work item dict taken from ``_ROUTER_DATA``.

    Returns:
        A dict with ``type="video"``, ``url``, ``title``, ``author`` and
        ``cover``; ``None`` when the item has no ``video`` dict, reports a
        duration of zero, or yields no usable play URL.
    """
    video = item.get("video")
    if not isinstance(video, dict):
        return None

    duration = video.get("duration")
    if duration is not None:
        try:
            if int(duration) == 0:
                return None
        except (TypeError, ValueError):
            # Unparseable duration: treat as unknown and keep going.
            pass

    play_addr = video.get("play_addr")
    raw_urls = play_addr.get("url_list") if isinstance(play_addr, dict) else None
    candidates = [
        # "playwm" -> "play" mirrors the original URL rewrite.
        self._decode_local_value(raw_url).replace("playwm", "play")
        for raw_url in (raw_urls if isinstance(raw_urls, list) else [])
        if isinstance(raw_url, str) and raw_url
    ]
    video_url = self._prefer_v3_v10(candidates)
    if not video_url:
        return None

    cover = video.get("cover")
    cover_urls = cover.get("url_list") if isinstance(cover, dict) else None
    cover_url = ""
    if isinstance(cover_urls, list) and cover_urls:
        cover_url = self._decode_local_value(cover_urls[0])

    author = item.get("author")
    nickname = author.get("nickname") if isinstance(author, dict) else None

    return {
        "type": "video",
        "url": video_url,
        "title": self._clean_local_text(item.get("desc")),
        "author": self._clean_local_text(nickname),
        "cover": cover_url,
    }
|
||||
|
||||
def _parse_local_legacy_video(self, html_content: str) -> Optional[Dict[str, Any]]:
    """Fallback for older page layouts: regex ``play_addr.url_list`` from HTML.

    The URL list captured by ``self.LEGACY_PLAY_ADDR_RE`` is split on commas;
    each piece is stripped of whitespace and quotes, decoded, and has
    ``playwm`` replaced by ``play``. ``self._prefer_v3_v10`` then picks the
    URL to use. Title, author and cover are regex-matched from the same HTML.

    Args:
        html_content: Full HTML of the share page (``None`` is treated as "").

    Returns:
        A dict with ``type="video"``, ``url``, ``title``, ``author`` and
        ``cover``; ``None`` when no play address or usable URL is found.
    """
    page = html_content or ""
    found = self.LEGACY_PLAY_ADDR_RE.search(page)
    if found is None:
        return None

    candidates = []
    for piece in found.group(1).split(","):
        raw_url = piece.strip().strip('"')
        if raw_url:
            candidates.append(self._decode_local_value(raw_url).replace("playwm", "play"))

    video_url = self._prefer_v3_v10(candidates)
    if not video_url:
        return None

    title = self._match_local_json_string(html_content, "desc")
    author = self._match_local_json_string(html_content, "nickname")
    cover_found = re.search(r'"cover":\s*{\s*"url_list":\s*\[\s*"([^"]+)"', page)
    cover_url = self._decode_local_value(cover_found.group(1)) if cover_found else ""

    return {
        "type": "video",
        "url": video_url,
        "title": title,
        "author": author,
        "cover": cover_url,
    }
|
||||
|
||||
def _match_local_json_string(self, text: str, key: str) -> str:
    """Extract one string field from a raw JSON fragment embedded in a page.

    The value pattern accepts backslash-escaped characters, so a value that
    contains an escaped quote (``\\"``) is captured in full. The previous
    pattern ``[^"]*`` stopped at the first quote character and truncated
    such values, leaving a dangling backslash.

    Args:
        text: Raw page text to search (``None`` is treated as "").
        key: JSON key whose string value is wanted; matched literally.

    Returns:
        The first matching value, decoded with ``self._decode_local_value``
        and cleaned with ``self._clean_local_text``; "" when nothing matches.
    """
    # Value body: any escaped character, or any character except '"' and '\'.
    pattern = rf'"{re.escape(key)}":\s*"((?:\\.|[^"\\])*)"'
    found = re.search(pattern, text or "")
    if found is None:
        return ""
    return self._clean_local_text(self._decode_local_value(found.group(1)))
|
||||
|
||||
def _decode_local_value(self, value: str) -> str:
|
||||
"""解码 HTML 实体和 `\\uXXXX` 形式的转义文本。"""
|
||||
text = str(value or "")
|
||||
try:
|
||||
text = text.encode("utf-8").decode("unicode_escape")
|
||||
except Exception:
|
||||
pass
|
||||
return html.unescape(text)
|
||||
|
||||
def _clean_local_text(self, value: Any) -> str:
|
||||
"""清洗页面里读出来的标题、作者等文本字段。"""
|
||||
if value is None:
|
||||
return ""
|
||||
return html.unescape(str(value)).strip()
|
||||
|
||||
def _parse_from_internal_api(self, clean_url: str) -> Optional[Dict[str, Any]]:
|
||||
try:
|
||||
endpoint = "http://192.168.2.32:8999/api/hybrid/video_data"
|
||||
|
||||
Reference in New Issue
Block a user