增强抖音解析插件本地页面优先解析能力

This commit is contained in:
liuwei
2026-05-06 09:52:06 +08:00
parent d036de3138
commit b526f8f398
2 changed files with 331 additions and 12 deletions

View File

@@ -1,6 +1,11 @@
[Douyin] [Douyin]
enable = true enable = true
# 解析顺序说明:
# 1. 优先直接解析抖音分享页 HTML,本地解析不依赖第三方接口;
# 2. 本地解析失败后,再回退到现有内网解析服务;
# 3. 最后才使用外部接口兜底。
# 发送模式: card(发送卡片) 或 file(下载并发送文件) # 发送模式: card(发送卡片) 或 file(下载并发送文件)
download_mode = "card" download_mode = "card"

View File

@@ -2,6 +2,8 @@ import os
import re import re
import time import time
import traceback import traceback
import json
import html
import requests import requests
import io import io
from typing import Dict, Any, List, Optional, Tuple from typing import Dict, Any, List, Optional, Tuple
@@ -67,7 +69,12 @@ class DouyinParserPlugin(MessagePluginInterface):
def __init__(self): def __init__(self):
super().__init__() super().__init__()
self.LOG = logger self.LOG = logger
self.url_pattern = re.compile(r'https?://v\.douyin\.com/[^\s/]+/?') # 既兼容 v.douyin.com 短链,也兼容分享页已经展开后的 douyin / iesdouyin 链接。
# 这样用户直接转发短链、长链或者带标点的分享文案时,都能进入统一解析链路。
self.url_pattern = re.compile(r'https?://[^\s<>\"]+?(?:douyin\.com|iesdouyin\.com)[^\s<>\"]*')
# 本地页面解析优先复用分享页中的 _ROUTER_DATA。
# 这是参考 DouyinParser 项目接入的核心能力,可以在不依赖第三方接口的情况下直接拿到图文/视频元数据。
self.router_data_pattern = re.compile(r"window\._ROUTER_DATA\s*=\s*({.*?})\s*</script>", re.S)
# 注册功能权限 # 注册功能权限
self.feature = self.register_feature() self.feature = self.register_feature()
# 修改为使用插件目录下的down_load_dir文件夹 # 修改为使用插件目录下的down_load_dir文件夹
@@ -118,8 +125,7 @@ class DouyinParserPlugin(MessagePluginInterface):
if message.get("type") != MessageType.TEXT: if message.get("type") != MessageType.TEXT:
return False return False
content = str(message.get("content", "")).strip() content = str(message.get("content", "")).strip()
match = self.url_pattern.search(content) return self._extract_douyin_url(content) is not None
return match is not None
@plugin_stats_decorator(plugin_name="抖音解析") @plugin_stats_decorator(plugin_name="抖音解析")
async def process_message(self, message: Dict[str, Any]) -> Tuple[bool, Optional[str]]: async def process_message(self, message: Dict[str, Any]) -> Tuple[bool, Optional[str]]:
@@ -136,11 +142,11 @@ class DouyinParserPlugin(MessagePluginInterface):
return False, "没有权限" return False, "没有权限"
try: try:
match = self.url_pattern.search(content) extracted_url = self._extract_douyin_url(content)
if not match: if not extracted_url:
return False, "未找到抖音链接" return False, "未找到抖音链接"
original_url = self._clean_url(match.group(0)) original_url = self._clean_url(extracted_url)
self.LOG.info(f"发现抖音链接: {original_url}") self.LOG.info(f"发现抖音链接: {original_url}")
media_info = self._parse_douyin(original_url) media_info = self._parse_douyin(original_url)
@@ -150,12 +156,17 @@ class DouyinParserPlugin(MessagePluginInterface):
media_type = media_info.get('type', 'video') media_type = media_info.get('type', 'video')
if media_type == 'image': if media_type == 'image':
imgs = media_info.get('images') or [] image_candidates = media_info.get('image_candidates') or []
if not imgs: if not image_candidates:
raw_images = media_info.get('images') or []
image_candidates = [[str(url).strip()] for url in raw_images if str(url).strip()]
if not image_candidates:
return False, "未获取到图片地址" return False, "未获取到图片地址"
img_bytes_list: List[bytes] = [] img_bytes_list: List[bytes] = []
for u in imgs: # 本地页面解析会尽量给出每张图的多个候选地址。
b = self._download_image_bytes(u) # 这里逐组兜底下载,避免首选链接偶发 403/失效时整条图文直接失败。
for candidates in image_candidates:
b = self._download_first_available_image_bytes(candidates)
if b: if b:
img_bytes_list.append(b) img_bytes_list.append(b)
if not img_bytes_list: if not img_bytes_list:
@@ -213,9 +224,24 @@ class DouyinParserPlugin(MessagePluginInterface):
def _clean_url(self, url: str) -> str: def _clean_url(self, url: str) -> str:
"""清理URL""" """清理URL"""
cleaned_url = url.strip().replace(';', '').replace('\n', '').replace('\r', '') cleaned_url = url.strip().replace(';', '').replace('\n', '').replace('\r', '')
cleaned_url = cleaned_url.rstrip(",。,.!?)]}")
self.LOG.debug(f"[抖音] 清理后的URL: {cleaned_url}") self.LOG.debug(f"[抖音] 清理后的URL: {cleaned_url}")
return cleaned_url return cleaned_url
def _extract_douyin_url(self, content: str) -> Optional[str]:
"""
从消息文本中提取第一条抖音链接。
说明:
1. 分享文案里经常会把链接夹在中文标点中间,这里统一做一次裁剪;
2. 后续无论是本地页面解析还是外部兜底,都使用这一条标准化后的 URL
3. 入口收口后,后面如果要补充更多抖音域名,也只需要改这一处。
"""
match = self.url_pattern.search(str(content or ""))
if not match:
return None
return self._clean_url(match.group(0))
def _clean_response_data(self, data: Dict[str, Any]) -> Dict[str, Any]: def _clean_response_data(self, data: Dict[str, Any]) -> Dict[str, Any]:
"""清理响应数据""" """清理响应数据"""
if not data: if not data:
@@ -237,12 +263,21 @@ class DouyinParserPlugin(MessagePluginInterface):
def _parse_douyin(self, url: str) -> Dict[str, Any]: def _parse_douyin(self, url: str) -> Dict[str, Any]:
try: try:
clean_url = self._clean_url(url) clean_url = self._clean_url(url)
# 第一优先级:本地业务解析服务(内网),该链路与你指定的项目实现思路最接近,稳定性最高 # 第一优先级:本地页面解析
# 这里参考 DouyinParser 项目,直接展开短链并解析分享页里的 _ROUTER_DATA
# 优点是不依赖外部第三方接口,命中成功时可直接拿到图文/视频的原始元数据。
local_result = self._parse_from_local_page(clean_url)
if local_result and (local_result.get('url') or local_result.get('images')):
return self._clean_response_data(local_result)
# 第二优先级:现有内网业务解析服务。
# 保留这条链路作为本地页面解析失败后的第一层兜底,避免线上能力回退。
primary = self._parse_from_internal_api(clean_url) primary = self._parse_from_internal_api(clean_url)
if primary and (primary.get('url') or primary.get('images')): if primary and (primary.get('url') or primary.get('images')):
return self._clean_response_data(primary) return self._clean_response_data(primary)
# 第二优先级:外部接口兜底。 # 第三优先级:外部接口兜底。
# 这一层只在本地解析和内网解析都失败时再尝试,避免主路径对外部服务形成硬依赖。
secondary = self._parse_from_external_api(clean_url) secondary = self._parse_from_external_api(clean_url)
if secondary and (secondary.get('url') or secondary.get('images')): if secondary and (secondary.get('url') or secondary.get('images')):
return self._clean_response_data(secondary) return self._clean_response_data(secondary)
@@ -252,6 +287,251 @@ class DouyinParserPlugin(MessagePluginInterface):
self.LOG.error(f"[抖音] 解析过程发生未知错误: {str(e)}\n{traceback.format_exc()}") self.LOG.error(f"[抖音] 解析过程发生未知错误: {str(e)}\n{traceback.format_exc()}")
raise DouyinParserError(f"未知错误: {str(e)}") raise DouyinParserError(f"未知错误: {str(e)}")
def _parse_from_local_page(self, clean_url: str) -> Optional[Dict[str, Any]]:
"""
直接解析抖音分享页 HTML。
实现思路参考外部 DouyinParser 项目,但这里做了两点本地化适配:
1. 继续沿用当前插件已有的 requests / proxy / cookie 配置,避免额外引入异步 HTTP 依赖;
2. 解析结果统一映射成当前插件现有的数据结构,尽量不改发送链路。
"""
try:
resolved_url = self._resolve_douyin_share_url(clean_url)
html_content = self._fetch_douyin_page_html(resolved_url)
if not html_content:
return None
result = self._parse_douyin_page_html(html_content)
if result and resolved_url and not result.get("source_url"):
result["source_url"] = resolved_url
return result
except Exception as e:
self.LOG.warning(f"[抖音] 本地页面解析失败,准备进入兜底链路: {e}")
return None
def _resolve_douyin_share_url(self, url: str) -> str:
    """
    Expand a Douyin short link and return the final share-page URL.

    Only the post-redirect URL is needed here; the page body is downloaded
    separately afterwards. The request is therefore issued with
    ``stream=True`` inside a ``with`` block, so the body is not read a
    second time and the connection is released as soon as the URL is known.

    Args:
        url: Douyin URL (short or already expanded).

    Returns:
        The URL reported by the final response, stripped of surrounding
        whitespace; falls back to ``url`` if the response reports none.

    Raises:
        requests.HTTPError: If the final response has an error status.
        requests.RequestException: On network failure or timeout.
    """
    with requests.get(
        url,
        headers=self._build_page_request_headers(),
        timeout=10,
        proxies=self._build_proxies(),
        allow_redirects=True,
        stream=True,  # headers and redirects only; skip the body download
    ) as response:
        response.raise_for_status()
        final_url = str(response.url or url).strip()
    self.LOG.debug(f"[抖音] 展开后的分享页地址: {final_url}")
    return final_url
def _fetch_douyin_page_html(self, url: str) -> str:
    """
    Download the HTML of a Douyin share page.

    Args:
        url: Share-page URL to fetch.

    Returns:
        The decoded page text.

    Raises:
        requests.HTTPError: If the response has an error status.
        DouyinParserError: If the page body is empty or whitespace only.
    """
    request_options = {
        "headers": self._build_page_request_headers(),
        "timeout": 15,
        "proxies": self._build_proxies(),
    }
    response = requests.get(url, **request_options)
    response.raise_for_status()

    # Prefer the encoding detected from the body, then the declared one, then UTF-8.
    response.encoding = response.apparent_encoding or response.encoding or "utf-8"
    page_text = response.text or ""
    if page_text.strip():
        return page_text
    raise DouyinParserError("抖音分享页内容为空")
def _parse_douyin_page_html(self, html_content: str) -> Dict[str, Any]:
"""
解析分享页 HTML兼容图文与视频作品。
解析顺序:
1. 优先尝试新版页面里的 _ROUTER_DATA
2. 如果没有命中,再回退到旧页面中可直接正则提取的 video 字段。
"""
item = self._extract_aweme_item(html_content)
if item:
note = self._parse_note_item(item)
if note:
return note
video = self._parse_video_item(item)
if video:
return video
legacy_video = self._parse_legacy_video(html_content)
if legacy_video:
return legacy_video
raise DouyinParserError("未找到可解析的抖音图文或视频内容")
def _extract_aweme_item(self, html_content: str) -> Optional[Dict[str, Any]]:
"""
从页面中的 _ROUTER_DATA 提取第一条作品数据。
这是当前抖音分享页最稳定的数据来源,图文、视频都可以从这里统一解析。
"""
match = self.router_data_pattern.search(html_content or "")
if not match:
return None
try:
router_data = json.loads(match.group(1))
except json.JSONDecodeError as e:
self.LOG.warning(f"[抖音] 解析 _ROUTER_DATA 失败: {e}")
return None
loader_data = router_data.get("loaderData")
if not isinstance(loader_data, dict):
return None
for page_data in loader_data.values():
if not isinstance(page_data, dict):
continue
video_info = page_data.get("videoInfoRes")
if not isinstance(video_info, dict):
continue
item_list = video_info.get("item_list")
if isinstance(item_list, list) and item_list and isinstance(item_list[0], dict):
return item_list[0]
return None
def _parse_note_item(self, item: Dict[str, Any]) -> Optional[Dict[str, Any]]:
"""
从作品数据中解析图文作品。
这里保留每张图的候选 URL 列表,后续下载阶段可以逐个重试,提升图文成功率。
"""
image_url_groups = self._pick_image_url_groups(item)
if not image_url_groups:
return None
return {
"type": "image",
"title": self._clean_text(item.get("desc")),
"author": self._clean_text((item.get("author") or {}).get("nickname")),
"images": [group[0] for group in image_url_groups if group],
"image_candidates": image_url_groups,
"cover": image_url_groups[0][0] if image_url_groups and image_url_groups[0] else "",
}
def _pick_image_url_groups(self, item: Dict[str, Any]) -> List[List[str]]:
"""提取图文中每一页图片的候选地址列表,并做去重。"""
image_url_groups: List[List[str]] = []
seen_groups = set()
for image_info in item.get("images") or item.get("image_infos") or []:
if not isinstance(image_info, dict):
continue
candidates: List[str] = []
seen_urls = set()
for image_url in image_info.get("url_list") or []:
if not isinstance(image_url, str) or not image_url.startswith("http"):
continue
decoded_url = self._decode_text(image_url)
if decoded_url in seen_urls:
continue
candidates.append(decoded_url)
seen_urls.add(decoded_url)
group_key = tuple(candidates)
if candidates and group_key not in seen_groups:
image_url_groups.append(candidates)
seen_groups.add(group_key)
return image_url_groups
def _parse_video_item(self, item: Dict[str, Any]) -> Optional[Dict[str, Any]]:
"""从作品数据中解析视频作品,并优先挑选无水印播放地址。"""
video = item.get("video")
if not isinstance(video, dict) or video.get("duration") == 0:
return None
play_addr = video.get("play_addr") or {}
urls = play_addr.get("url_list") or []
video_url = self._pick_video_url(urls)
if not video_url:
return None
cover = video.get("cover") or {}
cover_urls = cover.get("url_list") or []
cover_url = self._decode_text(cover_urls[0]) if cover_urls else ""
return {
"type": "video",
"url": video_url,
"title": self._clean_text(item.get("desc")),
"author": self._clean_text((item.get("author") or {}).get("nickname")),
"cover": cover_url,
}
def _parse_legacy_video(self, html_content: str) -> Optional[Dict[str, Any]]:
"""
兼容旧分享页结构。
有些页面没有 _ROUTER_DATA但仍然能从 play_addr / cover / desc 中拼出完整视频卡片。
"""
pattern = re.compile(r'"play_addr":\s*{\s*"uri":\s*"[^"]*",\s*"url_list":\s*\[([^\]]*)\]')
match = pattern.search(html_content or "")
if not match:
return None
raw_urls = [url.strip().strip('"') for url in match.group(1).split(",")]
video_url = self._pick_video_url(raw_urls)
if not video_url:
return None
title = self._match_json_string(html_content, "desc")
author = self._match_json_string(html_content, "nickname")
cover_match = re.search(r'"cover":\s*{\s*"url_list":\s*\[\s*"([^"]+)"', html_content or "")
return {
"type": "video",
"url": video_url,
"title": title,
"author": author,
"cover": self._decode_text(cover_match.group(1)) if cover_match else "",
}
def _pick_video_url(self, urls: List[Any]) -> str:
"""
从多个视频地址里优先挑选更适合直发的无水印链接。
规则:
1. 优先把 playwm 改成 play尽量拿无水印地址
2. 优先选择 aweme.snssdk.com 这类直链;
3. 如果没有,再退回现有 v3/v10 / douyinvod 选择逻辑。
"""
decoded_urls = [
self._decode_text(str(url)).replace("playwm", "play")
for url in urls
if isinstance(url, str) and str(url).strip()
]
snssdk_urls = [url for url in decoded_urls if "aweme.snssdk.com" in url]
if snssdk_urls:
return snssdk_urls[0]
return self._prefer_v3_v10(decoded_urls) or ""
def _match_json_string(self, text: str, key: str) -> str:
"""从 HTML 文本中的 JSON 片段抽取字符串字段。"""
match = re.search(rf'"{re.escape(key)}":\s*"([^"]*)"', text or "")
return self._clean_text(self._decode_text(match.group(1))) if match else ""
def _decode_text(self, value: Any) -> str:
"""同时处理 HTML 转义与 unicode 转义,避免标题和 URL 出现 \\uXXXX / &amp;。"""
if value is None:
return ""
text = html.unescape(str(value))
# 只有在文本里明显存在 \uXXXX / \xXX 这类转义片段时才做 unicode_escape 解码,
# 避免把本来已经是正常中文的字符串再次错误解码成乱码。
if "\\u" in text or "\\x" in text:
try:
text = text.encode("utf-8").decode("unicode_escape")
except Exception:
pass
return text
def _clean_text(self, value: Any) -> str:
"""统一清理文本字段,避免标题/作者带空白或转义残留。"""
return "" if value is None else self._decode_text(value).strip()
def _build_proxies(self) -> Optional[Dict[str, str]]: def _build_proxies(self) -> Optional[Dict[str, str]]:
if self.http_proxy: if self.http_proxy:
return {"http": self.http_proxy, "https": self.http_proxy} return {"http": self.http_proxy, "https": self.http_proxy}
@@ -276,6 +556,24 @@ class DouyinParserPlugin(MessagePluginInterface):
headers["Cookie"] = self.cookie headers["Cookie"] = self.cookie
return headers return headers
def _build_page_request_headers(self) -> Dict[str, str]:
"""
构建用于访问抖音分享页的请求头。
这里单独使用移动端 Safari UA是因为参考项目和线上经验都表明
- 分享页 HTML 在移动端更稳定地携带 _ROUTER_DATA
- 图文作品在移动端页面中的结构更统一;
- 不影响现有 API 兜底链路,因为只用于本地页面抓取。
"""
headers = self._build_request_headers()
headers["User-Agent"] = (
"Mozilla/5.0 (iPhone; CPU iPhone OS 14_0 like Mac OS X) "
"AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1"
)
headers["Referer"] = "https://www.douyin.com/"
headers["Accept-Language"] = "zh-CN,zh;q=0.9"
return headers
def _parse_from_internal_api(self, clean_url: str) -> Optional[Dict[str, Any]]: def _parse_from_internal_api(self, clean_url: str) -> Optional[Dict[str, Any]]:
try: try:
endpoint = "http://192.168.2.32:8999/api/hybrid/video_data" endpoint = "http://192.168.2.32:8999/api/hybrid/video_data"
@@ -533,6 +831,22 @@ class DouyinParserPlugin(MessagePluginInterface):
except Exception: except Exception:
return None return None
def _download_first_available_image_bytes(self, candidates: List[str]) -> Optional[bytes]:
"""
按候选列表顺序下载第一张可用图片。
本地页面解析拿到的图片地址通常会给出多份 url_list
这里逐个尝试可以减少单一 CDN 地址失效导致的图文整条失败。
"""
for candidate in candidates or []:
clean_candidate = self._clean_url(str(candidate or ""))
if not clean_candidate:
continue
image_bytes = self._download_image_bytes(clean_candidate)
if image_bytes:
return image_bytes
return None
def _append_title_to_image(self, image_bytes: bytes, title: str) -> bytes: def _append_title_to_image(self, image_bytes: bytes, title: str) -> bytes:
""" """
将标题绘制到图片顶部,返回新的图片二进制数据。 将标题绘制到图片顶部,返回新的图片二进制数据。