diff --git a/plugins/douyin_parser/config.toml b/plugins/douyin_parser/config.toml index d4835f4..cac4242 100644 --- a/plugins/douyin_parser/config.toml +++ b/plugins/douyin_parser/config.toml @@ -4,12 +4,6 @@ enable = true # 发送模式: card(发送卡片) 或 file(下载并发送文件) download_mode = "card" -# 本地页面解析超时(秒): -# 1. 现在抖音解析会优先直接请求分享页并从 HTML 中提取作品数据; -# 2. 如果本地网络偶尔较慢,可以适当调大这个值; -# 3. 本地解析超时或失败后,插件仍会继续走原来的内网接口和外部接口兜底。 -local_parse_timeout_seconds = 12 - # Http代理设置(用于获取真实链接发送卡片,如果家里有ipv6,可以设置为空) # 格式: http://用户名:密码@代理地址:代理端口 # 例如:http://127.0.0.1:7890 diff --git a/plugins/douyin_parser/main.py b/plugins/douyin_parser/main.py index c1d3c64..617fe87 100644 --- a/plugins/douyin_parser/main.py +++ b/plugins/douyin_parser/main.py @@ -2,8 +2,6 @@ import os import re import time import traceback -import html -import json import requests import io from typing import Dict, Any, List, Optional, Tuple @@ -33,17 +31,6 @@ class DouyinParserPlugin(MessagePluginInterface): # 功能权限常量 FEATURE_KEY = "DOUYIN_PARSER" FEATURE_DESCRIPTION = "🎵 抖音解析功能 [自动解析抖音链接]" - # 参考本地解析项目,把链接匹配范围放宽到 douyin.com / iesdouyin.com: - # 1. 原来只匹配 `v.douyin.com` 短链,用户直接转发长链时插件不会命中; - # 2. 本地页面解析本身就是基于真实分享页 HTML,因此长链也应该纳入同一套入口; - # 3. 这里统一抽 URL 后再做清洗,避免句尾标点被误带入请求。 - DOUYIN_URL_RE = re.compile(r'https?://[^\s<>"]+?(?:douyin\.com|iesdouyin\.com)[^\s<>"]*') - # 参考项目优先从 `window._ROUTER_DATA` 里拿 `loaderData -> videoInfoRes -> item_list[0]`: - # 1. 这是当前抖音分享页里最稳定的一份结构化首屏数据; - # 2. 能同时覆盖视频作品和图文作品; - # 3. 命中后可以直接绕开外部接口,减少第三方依赖。 - ROUTER_DATA_RE = re.compile(r"window\._ROUTER_DATA\s*=\s*({.*?})\s*", re.S) - LEGACY_PLAY_ADDR_RE = re.compile(r'"play_addr":\s*{\s*"uri":\s*"[^"]*",\s*"url_list":\s*\[([^\]]*)\]') @property def name(self) -> str: @@ -80,7 +67,7 @@ class DouyinParserPlugin(MessagePluginInterface): def __init__(self): super().__init__() self.LOG = logger - self.url_pattern = self.DOUYIN_URL_RE + self.url_pattern = re.compile(r'https?://v\.douyin\.com/[^\s/]+/?') # 注册功能权限 self.feature = self.register_feature() # 修改为使用插件目录下的down_load_dir文件夹 @@ -108,11 +95,6 @@ class DouyinParserPlugin(MessagePluginInterface): self.cookie = douyin_config.get("cookie", "") or "" self.cookie_file = douyin_config.get("cookie_file", "") or "" self.download_mode = douyin_config.get("download_mode", "card") # card或file - # 本地页面解析走真实抖音分享页,网络链路通常比内网接口更长一些: - # 1. 这里单独给一个本地解析超时,避免抖音页面偶发慢响应时无限挂起; - # 2. 超时只作用于“本地 HTML 解析优先链路”,不会改变后续旧接口的既有配置; - # 3. 若后续你觉得本地网络较慢,只需要改配置即可,不必再动代码。 - self.local_parse_timeout_seconds = max(int(douyin_config.get("local_parse_timeout_seconds", 12) or 12), 5) self.LOG.debug(f"[{self.name}] 插件初始化完成,代理设置: {self.http_proxy}") return True @@ -136,7 +118,8 @@ class DouyinParserPlugin(MessagePluginInterface): if message.get("type") != MessageType.TEXT: return False content = str(message.get("content", "")).strip() - return self._extract_douyin_url(content) is not None + match = self.url_pattern.search(content) + return match is not None @plugin_stats_decorator(plugin_name="抖音解析") async def process_message(self, message: Dict[str, Any]) -> Tuple[bool, Optional[str]]: @@ -153,10 +136,11 @@ class DouyinParserPlugin(MessagePluginInterface): return False, "没有权限" try: - original_url = self._extract_douyin_url(content) - if not original_url: + match = self.url_pattern.search(content) + if not match: return False, "未找到抖音链接" + original_url = self._clean_url(match.group(0)) self.LOG.info(f"发现抖音链接: {original_url}") media_info = self._parse_douyin(original_url) @@ -232,22 +216,6 @@ class DouyinParserPlugin(MessagePluginInterface): self.LOG.debug(f"[抖音] 清理后的URL: {cleaned_url}") return cleaned_url - def _extract_douyin_url(self, content: str) -> Optional[str]: - """从消息文本中提取第一条抖音链接。 - - 这里参考外部项目的做法,把句尾常见中文标点一并裁掉: - 1. 用户经常直接把“复制打开抖音……”整段文案贴进群里; - 2. 链接后面常跟着 `,。!?)` 这类符号,若不清洗会导致请求 404 或跳错页; - 3. 抽取逻辑统一收口后,`can_process` 和 `process_message` 可以复用同一套结果。 - """ - text = str(content or "").strip() - if not text: - return None - match = self.url_pattern.search(text) - if not match: - return None - return self._clean_url(match.group(0).rstrip(",。,.!!??))")) - def _clean_response_data(self, data: Dict[str, Any]) -> Dict[str, Any]: """清理响应数据""" if not data: @@ -269,20 +237,12 @@ class DouyinParserPlugin(MessagePluginInterface): def _parse_douyin(self, url: str) -> Dict[str, Any]: try: clean_url = self._clean_url(url) - # 第一优先级:本地页面解析。 - # 1. 参考你给的 DouyinParser 项目,先直接请求分享页并解析 HTML 中的 `_ROUTER_DATA`; - # 2. 这样成功时完全不依赖第三方解析 API,也更符合“本地优先”的目标; - # 3. 只有页面结构变化或网络异常时,才继续走你原来的内网接口和外部接口兜底。 - local_primary = self._parse_from_local_page(clean_url) - if local_primary and (local_primary.get('url') or local_primary.get('images')): - return self._clean_response_data(local_primary) - - # 第二优先级:保留原有本地业务解析服务(内网)。 + # 第一优先级:本地业务解析服务(内网),该链路与你指定的项目实现思路最接近,稳定性最高。 primary = self._parse_from_internal_api(clean_url) if primary and (primary.get('url') or primary.get('images')): return self._clean_response_data(primary) - # 第三优先级:外部接口兜底。 + # 第二优先级:外部接口兜底。 secondary = self._parse_from_external_api(clean_url) if secondary and (secondary.get('url') or secondary.get('images')): return self._clean_response_data(secondary) @@ -316,209 +276,6 @@ class DouyinParserPlugin(MessagePluginInterface): headers["Cookie"] = self.cookie return headers - def _build_local_parse_headers(self) -> Dict[str, str]: - """构建本地页面解析专用请求头。 - - 这里刻意切成移动端 Safari UA,原因有三点: - 1. 抖音分享页在移动端更容易直接返回完整作品页,而不是额外的跳转或限制提示; - 2. 参考项目就是用移动端 UA 解析,现成经验已经验证过这条链路更稳; - 3. 只在本地 HTML 解析链路生效,不会影响你原来的内网/外部接口调用头。 - """ - headers = self._build_request_headers() - headers["User-Agent"] = ( - "Mozilla/5.0 (iPhone; CPU iPhone OS 14_0 like Mac OS X) " - "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1" - ) - headers["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8" - return headers - - def _parse_from_local_page(self, clean_url: str) -> Optional[Dict[str, Any]]: - """优先走本地页面解析。 - - 处理流程: - 1. 先跟随分享短链跳转,拿到最终作品页 HTML; - 2. 优先解析 `window._ROUTER_DATA`,提取视频或图文结构化数据; - 3. 若新版结构失效,再用旧版 `play_addr` 正则做一次视频兜底。 - """ - try: - response = requests.get( - clean_url, - headers=self._build_local_parse_headers(), - timeout=self.local_parse_timeout_seconds, - proxies=self._build_proxies(), - allow_redirects=True, - ) - if response.status_code != 200: - return None - html_content = response.text or "" - if not html_content: - return None - result = self._parse_local_page_html(html_content) - if result: - result["source_url"] = str(response.url or clean_url) - return result - except Exception: - return None - - def _parse_local_page_html(self, html_content: str) -> Optional[Dict[str, Any]]: - """解析抖音分享页 HTML。""" - item = self._extract_aweme_item_from_router_data(html_content) - if item: - note_result = self._parse_local_note_item(item) - if note_result: - return note_result - - video_result = self._parse_local_video_item(item) - if video_result: - return video_result - - return self._parse_local_legacy_video(html_content) - - def _extract_aweme_item_from_router_data(self, html_content: str) -> Optional[Dict[str, Any]]: - """从 `_ROUTER_DATA` 中抽出作品主数据节点。""" - match = self.ROUTER_DATA_RE.search(html_content or "") - if not match: - return None - - try: - router_data = json.loads(match.group(1)) - except json.JSONDecodeError as e: - self.LOG.debug(f"[抖音] 解析 _ROUTER_DATA 失败: {e}") - return None - - loader_data = router_data.get("loaderData") - if not isinstance(loader_data, dict): - return None - - for page_data in loader_data.values(): - if not isinstance(page_data, dict): - continue - video_info = page_data.get("videoInfoRes") - if not isinstance(video_info, dict): - continue - item_list = video_info.get("item_list") - if isinstance(item_list, list) and item_list and isinstance(item_list[0], dict): - return item_list[0] - return None - - def _parse_local_note_item(self, item: Dict[str, Any]) -> Optional[Dict[str, Any]]: - """解析图文作品。""" - image_url_groups = self._pick_local_image_url_groups(item) - if not image_url_groups: - return None - - desc = self._clean_local_text(item.get("desc")) - author = self._clean_local_text((item.get("author") or {}).get("nickname")) - images = [group[0] for group in image_url_groups if group] - if not images: - return None - - return { - "type": "image", - "images": images, - "image_url_groups": image_url_groups, - "title": desc, - "author": author, - "cover": images[0], - } - - def _pick_local_image_url_groups(self, item: Dict[str, Any]) -> List[List[str]]: - """从图文作品中提取每一张图的候选地址列表。""" - image_url_groups: List[List[str]] = [] - seen_groups = set() - for image_info in item.get("images") or item.get("image_infos") or []: - if not isinstance(image_info, dict): - continue - candidates: List[str] = [] - seen_urls = set() - for image_url in image_info.get("url_list") or []: - if not isinstance(image_url, str) or not image_url.startswith("http"): - continue - decoded_url = self._decode_local_value(image_url) - if decoded_url in seen_urls: - continue - candidates.append(decoded_url) - seen_urls.add(decoded_url) - group_key = tuple(candidates) - if candidates and group_key not in seen_groups: - image_url_groups.append(candidates) - seen_groups.add(group_key) - return image_url_groups - - def _parse_local_video_item(self, item: Dict[str, Any]) -> Optional[Dict[str, Any]]: - """解析视频作品。""" - video = item.get("video") - if not isinstance(video, dict): - return None - if int(video.get("duration") or 1) == 0: - return None - - play_addr = video.get("play_addr") or {} - urls = play_addr.get("url_list") or [] - cleaned_urls = [self._decode_local_value(url).replace("playwm", "play") for url in urls if isinstance(url, str) and url] - video_url = self._prefer_v3_v10(cleaned_urls) - if not video_url: - return None - - cover = video.get("cover") or {} - cover_urls = cover.get("url_list") or [] - cover_url = self._decode_local_value(cover_urls[0]) if cover_urls else "" - - return { - "type": "video", - "url": video_url, - "title": self._clean_local_text(item.get("desc")), - "author": self._clean_local_text((item.get("author") or {}).get("nickname")), - "cover": cover_url, - } - - def _parse_local_legacy_video(self, html_content: str) -> Optional[Dict[str, Any]]: - """旧版页面结构兜底:直接从 HTML 里正则抽 `play_addr.url_list`。""" - match = self.LEGACY_PLAY_ADDR_RE.search(html_content or "") - if not match: - return None - - raw_urls = [url.strip().strip('"') for url in match.group(1).split(",")] - cleaned_urls = [self._decode_local_value(url).replace("playwm", "play") for url in raw_urls if url] - video_url = self._prefer_v3_v10(cleaned_urls) - if not video_url: - return None - - title = self._match_local_json_string(html_content, "desc") - author = self._match_local_json_string(html_content, "nickname") - cover_match = re.search(r'"cover":\s*{\s*"url_list":\s*\[\s*"([^"]+)"', html_content or "") - cover_url = self._decode_local_value(cover_match.group(1)) if cover_match else "" - - return { - "type": "video", - "url": video_url, - "title": title, - "author": author, - "cover": cover_url, - } - - def _match_local_json_string(self, text: str, key: str) -> str: - """从页面原始 JSON 片段中提取单个字符串字段。""" - match = re.search(rf'"{re.escape(key)}":\s*"([^"]*)"', text or "") - if not match: - return "" - return self._clean_local_text(self._decode_local_value(match.group(1))) - - def _decode_local_value(self, value: str) -> str: - """解码 HTML 实体和 `\\uXXXX` 形式的转义文本。""" - text = str(value or "") - try: - text = text.encode("utf-8").decode("unicode_escape") - except Exception: - pass - return html.unescape(text) - - def _clean_local_text(self, value: Any) -> str: - """清洗页面里读出来的标题、作者等文本字段。""" - if value is None: - return "" - return html.unescape(str(value)).strip() - def _parse_from_internal_api(self, clean_url: str) -> Optional[Dict[str, Any]]: try: endpoint = "http://192.168.2.32:8999/api/hybrid/video_data"