From 6e0483a49e27d32f99f9af38403a6ff5511dc9db Mon Sep 17 00:00:00 2001 From: Liu Date: Fri, 1 May 2026 11:49:46 +0800 Subject: [PATCH] =?UTF-8?q?=E6=8A=96=E9=9F=B3=E8=A7=A3=E6=9E=90=E6=94=B9?= =?UTF-8?q?=E4=B8=BA=E6=9C=AC=E5=9C=B0=E9=A1=B5=E9=9D=A2=E4=BC=98=E5=85=88?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. 参考外部 DouyinParser 项目,新增基于分享页 HTML 和 _ROUTER_DATA 的本地解析链路。 2. 抖音解析现在按本地页面解析 -> 原内网接口 -> 原外部接口的顺序依次兜底。 3. 放宽链接匹配范围到 douyin.com / iesdouyin.com,并新增本地解析超时配置项。 --- plugins/douyin_parser/config.toml | 6 + plugins/douyin_parser/main.py | 259 +++++++++++++++++++++++++++++- 2 files changed, 257 insertions(+), 8 deletions(-) diff --git a/plugins/douyin_parser/config.toml b/plugins/douyin_parser/config.toml index cac4242..d4835f4 100644 --- a/plugins/douyin_parser/config.toml +++ b/plugins/douyin_parser/config.toml @@ -4,6 +4,12 @@ enable = true # 发送模式: card(发送卡片) 或 file(下载并发送文件) download_mode = "card" +# 本地页面解析超时(秒): +# 1. 现在抖音解析会优先直接请求分享页并从 HTML 中提取作品数据; +# 2. 如果本地网络偶尔较慢,可以适当调大这个值; +# 3. 本地解析超时或失败后,插件仍会继续走原来的内网接口和外部接口兜底。 +local_parse_timeout_seconds = 12 + # Http代理设置(用于获取真实链接发送卡片,如果家里有ipv6,可以设置为空) # 格式: http://用户名:密码@代理地址:代理端口 # 例如:http://127.0.0.1:7890 diff --git a/plugins/douyin_parser/main.py b/plugins/douyin_parser/main.py index 617fe87..c1d3c64 100644 --- a/plugins/douyin_parser/main.py +++ b/plugins/douyin_parser/main.py @@ -2,6 +2,8 @@ import os import re import time import traceback +import html +import json import requests import io from typing import Dict, Any, List, Optional, Tuple @@ -31,6 +33,17 @@ class DouyinParserPlugin(MessagePluginInterface): # 功能权限常量 FEATURE_KEY = "DOUYIN_PARSER" FEATURE_DESCRIPTION = "🎵 抖音解析功能 [自动解析抖音链接]" + # 参考本地解析项目,把链接匹配范围放宽到 douyin.com / iesdouyin.com: + # 1. 原来只匹配 `v.douyin.com` 短链,用户直接转发长链时插件不会命中; + # 2. 本地页面解析本身就是基于真实分享页 HTML,因此长链也应该纳入同一套入口; + # 3. 
# Matches a Douyin share link embedded in free-form chat text.
#
# The host part is anchored: the URL's authority must be `douyin.com`,
# `iesdouyin.com`, or one of their subdomains (e.g. `v.douyin.com`,
# `www.iesdouyin.com`). The previous pattern accepted any URL that merely
# *contained* "douyin.com" somewhere (query string, path, or a look-alike
# host such as `notdouyin.com`), and it could not match a bare
# `https://douyin.com/...` link because it required at least one character
# before the domain.
#
# The negative lookahead rejects hosts that continue after ".com"
# (`douyin.com.evil.example`, `douyin.community`) and userinfo tricks
# (`douyin.com@evil.example`). Trailing sentence punctuation glued to the
# path is still expected to be stripped by the caller.
DOUYIN_URL_RE = re.compile(
    r'https?://(?:[A-Za-z0-9-]+\.)*(?:ies)?douyin\.com'
    r'(?![A-Za-z0-9@-]|\.[A-Za-z0-9])'
    r'(?::\d+)?(?:[/?#][^\s<>"]*)?'
)

# Captures the JSON object assigned to `window._ROUTER_DATA` in the share page.
# Group 1 starts at the opening brace and runs to the last brace before the
# closing `</script>` tag. Without that terminator the non-greedy body stops
# at the first `}` and the capture is not valid JSON for any nested payload.
ROUTER_DATA_RE = re.compile(
    r"window\._ROUTER_DATA\s*=\s*(\{.*?\})\s*;?\s*</script>",
    re.S,
)

# Legacy fallback: captures the raw, comma-separated contents of
# `play_addr.url_list` straight from the page source (group 1).
LEGACY_PLAY_ADDR_RE = re.compile(
    r'"play_addr":\s*{\s*"uri":\s*"[^"]*",\s*"url_list":\s*\[([^\]]*)\]'
)
def _extract_douyin_url(self, content: str) -> Optional[str]:
    """Return the first genuine Douyin link found in a chat message.

    Users often paste the whole "copy and open Douyin ..." share blurb, so
    the link is frequently followed by sentence punctuation; that trailing
    punctuation is stripped before the URL is handed to `self._clean_url`.

    Every regex hit is additionally checked by hostname: only `douyin.com`,
    `iesdouyin.com` and their subdomains are accepted. The extracted URL is
    later fetched with the plugin's request headers (which may carry the
    configured cookie), so a URL that merely contains "douyin.com" in its
    path or query must not be returned.

    Args:
        content: Raw message text; `None` and non-string values are tolerated.

    Returns:
        The cleaned URL, or `None` when the message has no acceptable link.
    """
    # Function-scope import keeps this method self-contained.
    from urllib.parse import urlsplit

    text = str(content or "").strip()
    if not text:
        return None

    allowed_domains = ("douyin.com", "iesdouyin.com")
    for match in self.url_pattern.finditer(text):
        candidate = match.group(0).rstrip(",。,.!!??))")
        try:
            host = (urlsplit(candidate).hostname or "").lower()
        except ValueError:
            # Malformed authority (e.g. broken IPv6 brackets): not a usable link.
            continue
        if any(host == d or host.endswith("." + d) for d in allowed_domains):
            return self._clean_url(candidate)
    return None
def _build_local_parse_headers(self) -> Dict[str, str]:
    """Build the request headers used only by the local share-page parser.

    Starts from the plugin's common headers and overrides `User-Agent` with
    a mobile Safari string and `Accept` with an HTML-oriented value. Per the
    original author's note, the mobile UA is used because the share page is
    served more reliably to mobile clients. The common headers are copied so
    the override never leaks into other request paths.
    """
    headers = dict(self._build_request_headers())
    headers["User-Agent"] = (
        "Mozilla/5.0 (iPhone; CPU iPhone OS 14_0 like Mac OS X) "
        "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1"
    )
    headers["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
    return headers


def _parse_from_local_page(self, clean_url: str) -> Optional[Dict[str, Any]]:
    """Fetch the share page and parse the work out of its HTML.

    Steps:
      1. Follow redirects from the share link to the final page.
      2. Parse the structured `window._ROUTER_DATA` payload (video or image post).
      3. Fall back to the legacy `play_addr` regex for videos.

    This is a best-effort first stage: any failure returns `None` so the
    caller can fall through to its other parsers. Failures are logged at
    debug level instead of being swallowed silently.

    Returns:
        A media dict (with `source_url` set to the final URL), or `None`.
    """
    try:
        response = requests.get(
            clean_url,
            headers=self._build_local_parse_headers(),
            timeout=self.local_parse_timeout_seconds,
            proxies=self._build_proxies(),
            allow_redirects=True,
        )
        if response.status_code != 200:
            self.LOG.debug(f"[抖音] 本地页面解析失败,状态码: {response.status_code}")
            return None
        html_content = response.text or ""
        if not html_content:
            return None
        result = self._parse_local_page_html(html_content)
        if result:
            result["source_url"] = str(response.url or clean_url)
        return result
    except Exception as e:
        # Boundary of a best-effort stage: never propagate, but leave a trace.
        self.LOG.debug(f"[抖音] 本地页面解析异常: {e}")
        return None


def _parse_local_page_html(self, html_content: str) -> Optional[Dict[str, Any]]:
    """Parse share-page HTML: image post first, then video, then legacy regex."""
    item = self._extract_aweme_item_from_router_data(html_content)
    if item:
        # Image posts are tried first; an item with images is treated as a note.
        parsed = self._parse_local_note_item(item) or self._parse_local_video_item(item)
        if parsed:
            return parsed
    return self._parse_local_legacy_video(html_content)


def _extract_aweme_item_from_router_data(self, html_content: str) -> Optional[Dict[str, Any]]:
    """Return `loaderData -> * -> videoInfoRes -> item_list[0]` from `_ROUTER_DATA`.

    `self.ROUTER_DATA_RE` is used only to locate the opening brace (group 1).
    The object itself is decoded with `JSONDecoder.raw_decode`, which reads
    exactly one complete JSON value from that position. The result therefore
    does not depend on how much text the regex happened to capture.
    """
    page = html_content or ""
    match = self.ROUTER_DATA_RE.search(page)
    if not match:
        return None

    try:
        router_data, _end = json.JSONDecoder().raw_decode(page, match.start(1))
    except ValueError as e:  # json.JSONDecodeError is a ValueError subclass
        self.LOG.debug(f"[抖音] 解析 _ROUTER_DATA 失败: {e}")
        return None

    loader_data = router_data.get("loaderData") if isinstance(router_data, dict) else None
    if not isinstance(loader_data, dict):
        return None

    for page_data in loader_data.values():
        if not isinstance(page_data, dict):
            continue
        video_info = page_data.get("videoInfoRes")
        if not isinstance(video_info, dict):
            continue
        item_list = video_info.get("item_list")
        if isinstance(item_list, list) and item_list and isinstance(item_list[0], dict):
            return item_list[0]
    return None


def _parse_local_note_item(self, item: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """Build the result dict for an image post, or `None` if it has no images."""
    image_url_groups = self._pick_local_image_url_groups(item)
    if not image_url_groups:
        return None

    images = [group[0] for group in image_url_groups if group]
    if not images:
        return None

    author_info = item.get("author")
    nickname = author_info.get("nickname") if isinstance(author_info, dict) else None
    return {
        "type": "image",
        "images": images,
        "image_url_groups": image_url_groups,
        "title": self._clean_local_text(item.get("desc")),
        "author": self._clean_local_text(nickname),
        "cover": images[0],
    }


def _pick_local_image_url_groups(self, item: Dict[str, Any]) -> List[List[str]]:
    """Collect, per image, the de-duplicated list of candidate URLs.

    Reads `item["images"]` (or `item["image_infos"]`), keeps only string
    URLs starting with "http", decodes them, and drops both duplicate URLs
    inside a group and groups identical to an earlier one. Order is kept.
    """
    image_url_groups: List[List[str]] = []
    seen_groups = set()
    for image_info in item.get("images") or item.get("image_infos") or []:
        if not isinstance(image_info, dict):
            continue
        decoded = (
            self._decode_local_value(url)
            for url in image_info.get("url_list") or []
            if isinstance(url, str) and url.startswith("http")
        )
        # dict.fromkeys de-duplicates while preserving first-seen order.
        candidates = list(dict.fromkeys(decoded))
        group_key = tuple(candidates)
        if candidates and group_key not in seen_groups:
            image_url_groups.append(candidates)
            seen_groups.add(group_key)
    return image_url_groups


def _parse_local_video_item(self, item: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """Build the result dict for a video work, or `None` if no URL is usable.

    An item whose `video.duration` is explicitly zero is rejected. A missing
    or unparsable duration is treated as unknown and does not reject it.
    NOTE(review): the original `int(duration or 1) == 0` could never be true
    for an integer 0; rejecting explicit zero is the assumed intent — confirm.
    """
    video = item.get("video")
    if not isinstance(video, dict):
        return None

    duration = video.get("duration")
    try:
        explicit_zero = duration is not None and float(duration) == 0
    except (TypeError, ValueError):
        explicit_zero = False
    if explicit_zero:
        return None

    play_addr = video.get("play_addr")
    urls = (play_addr.get("url_list") if isinstance(play_addr, dict) else None) or []
    # "playwm" -> "play" rewrite is kept exactly as in the original code.
    cleaned_urls = [
        self._decode_local_value(url).replace("playwm", "play")
        for url in urls
        if isinstance(url, str) and url
    ]
    video_url = self._prefer_v3_v10(cleaned_urls)
    if not video_url:
        return None

    cover = video.get("cover")
    cover_urls = (cover.get("url_list") if isinstance(cover, dict) else None) or []
    cover_url = self._decode_local_value(cover_urls[0]) if cover_urls else ""

    author_info = item.get("author")
    nickname = author_info.get("nickname") if isinstance(author_info, dict) else None
    return {
        "type": "video",
        "url": video_url,
        "title": self._clean_local_text(item.get("desc")),
        "author": self._clean_local_text(nickname),
        "cover": cover_url,
    }


def _parse_local_legacy_video(self, html_content: str) -> Optional[Dict[str, Any]]:
    """Legacy fallback: regex `play_addr.url_list` directly out of the HTML."""
    page = html_content or ""
    match = self.LEGACY_PLAY_ADDR_RE.search(page)
    if not match:
        return None

    raw_urls = (part.strip().strip('"') for part in match.group(1).split(","))
    cleaned_urls = [
        self._decode_local_value(url).replace("playwm", "play")
        for url in raw_urls
        if url
    ]
    video_url = self._prefer_v3_v10(cleaned_urls)
    if not video_url:
        return None

    cover_match = re.search(r'"cover":\s*{\s*"url_list":\s*\[\s*"([^"]+)"', page)
    return {
        "type": "video",
        "url": video_url,
        "title": self._match_local_json_string(page, "desc"),
        "author": self._match_local_json_string(page, "nickname"),
        "cover": self._decode_local_value(cover_match.group(1)) if cover_match else "",
    }


def _match_local_json_string(self, text: str, key: str) -> str:
    """Extract the first `"key": "value"` string from raw JSON-like page text.

    The value pattern accepts backslash-escaped characters, so a title that
    contains an escaped quote (`\\"`) is no longer cut off at that quote.
    Returns "" when the key is not found.
    """
    pattern = rf'"{re.escape(key)}":\s*"((?:[^"\\]|\\.)*)"'
    match = re.search(pattern, text or "")
    if not match:
        return ""
    return self._clean_local_text(self._decode_local_value(match.group(1)))


def _decode_local_value(self, value: str) -> str:
    """Decode JSON-style backslash escapes and HTML entities in `value`.

    Handles `\\uXXXX` (including surrogate pairs) and the JSON single-char
    escapes. Unlike a `encode("utf-8").decode("unicode_escape")` round-trip,
    literal non-ASCII characters (e.g. Chinese titles) pass through intact
    instead of being turned into mojibake.
    """
    text = str(value or "")
    if "\\" in text:
        simple = {
            '"': '"', "\\": "\\", "/": "/",
            "b": "\b", "f": "\f", "n": "\n", "r": "\r", "t": "\t",
        }

        def _unescape(m):
            if m.group(1) is not None:
                return chr(int(m.group(1), 16))
            return simple[m.group(2)]

        # Single pass so an escaped backslash is never re-interpreted.
        text = re.sub(r'\\(?:u([0-9a-fA-F]{4})|(["\\/bfnrt]))', _unescape, text)
        # Merge \uD83D\uDE00-style surrogate pairs into one code point;
        # a lone surrogate becomes U+FFFD rather than raising later.
        text = text.encode("utf-16", "surrogatepass").decode("utf-16", "replace")
    return html.unescape(text)


def _clean_local_text(self, value: Any) -> str:
    """Normalize a title/author field: `None` -> "", unescape entities, strip."""
    if value is None:
        return ""
    return html.unescape(str(value)).strip()