From a97e2fc0929506b2b7392e48822a8807bbc35075 Mon Sep 17 00:00:00 2001
From: liuwei
Date: Wed, 6 May 2026 13:35:11 +0800
Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E6=8A=96=E9=9F=B3=E5=9B=BE?=
 =?UTF-8?q?=E6=96=87=E4=B8=8E=E5=8D=A1=E7=89=87=E6=96=87=E6=A1=88=E4=B9=B1?=
 =?UTF-8?q?=E7=A0=81=E9=97=AE=E9=A2=98?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- 抖音分享页改为优先按 UTF-8 解码响应内容
- 新增常见中文乱码识别与温和修复逻辑
- 统一清洗抖音解析返回的标题与作者字段
---
Reviewer note: comments on added lines were translated to English, and
_decode_http_response_text no longer lets the implicit ISO-8859-1 default
of requests pre-empt the remaining candidate encodings. The blob ids on
the "index" line below still refer to the original revision of this patch.

 plugins/douyin_parser/main.py | 88 ++++++++++++++++++++++++++++++++++-
 1 file changed, 85 insertions(+), 3 deletions(-)

diff --git a/plugins/douyin_parser/main.py b/plugins/douyin_parser/main.py
index 1180413..e4771f5 100644
--- a/plugins/douyin_parser/main.py
+++ b/plugins/douyin_parser/main.py
@@ -258,6 +258,10 @@ class DouyinParserPlugin(MessagePluginInterface):
             return data
         default_cover = "https://is1-ssl.mzstatic.com/image/thumb/Purple221/v4/7c/49/e1/7c49e1af-ce92-d1c4-9a93-0a316e47ba94/AppIcon_TikTok-0-0-1x_U007epad-0-1-0-0-85-220.png/512x512bb.jpg"
         media_type = data.get('type') or 'video'
+        # All three parsing paths (local page parsing, the intranet API and the
+        # external API) end up here, so title/author are cleaned once more in one place.
+        data['title'] = self._clean_text(data.get('title'))
+        data['author'] = self._clean_text(data.get('author'))
         if media_type == 'video':
             cover = data.get('cover')
             if isinstance(cover, str):
@@ -345,8 +349,14 @@ class DouyinParserPlugin(MessagePluginInterface):
             proxies=self._build_proxies(),
         )
         response.raise_for_status()
-        response.encoding = response.apparent_encoding or response.encoding or "utf-8"
-        html_content = response.text or ""
+        # Douyin share pages are UTF-8 in the vast majority of cases.
+        # This used to prefer apparent_encoding, which is easily misdetected as
+        # GBK/Latin-1 on short pages, so captions and card titles were already
+        # garbled when they entered the parsing pipeline. The new order is:
+        #   1. decode the raw bytes as UTF-8 first;
+        #   2. if that fails, fall back to the header charset / apparent_encoding;
+        #   3. finally decode with errors="replace" so the flow never breaks.
+        html_content = self._decode_http_response_text(response)
         if not html_content.strip():
             raise DouyinParserError("抖音分享页内容为空")
         return html_content
@@ -536,12 +546,84 @@ class DouyinParserPlugin(MessagePluginInterface):
             text = text.encode("utf-8").decode("unicode_escape")
         except Exception:
             pass
-        return text
+        # Some upstream paths have already mis-decoded the text as Latin-1/CP1252.
+        # Repair it only when it clearly looks garbled, so normal Chinese is untouched.
+        return self._repair_mojibake_text(text)
 
     def _clean_text(self, value: Any) -> str:
         """统一清理文本字段,避免标题/作者带空白或转义残留。"""
         return "" if value is None else self._decode_text(value).strip()
 
+    def _decode_http_response_text(self, response: requests.Response) -> str:
+        """Decode an HTTP response body to text, trying UTF-8 first.
+
+        ``apparent_encoding`` is a guess that is easily wrong on short Chinese
+        pages, and ``requests`` reports ISO-8859-1 for ``text/*`` responses whose
+        Content-Type carries no charset. Latin-1 accepts any byte sequence, so
+        that implicit default is only used after every other candidate failed.
+
+        Args:
+            response: Response whose raw ``content`` bytes are decoded.
+
+        Returns:
+            The decoded text, ``""`` for an empty body, or UTF-8 decoded with
+            replacement characters when no candidate encoding fits.
+        """
+        raw_bytes = response.content or b""
+        if not raw_bytes:
+            return ""
+
+        content_type = response.headers.get("Content-Type") or ""
+        declared = response.encoding if "charset" in content_type.lower() else None
+        candidates = ("utf-8", declared, response.apparent_encoding, "gb18030", response.encoding)
+        for encoding in candidates:
+            if not encoding:
+                continue
+            try:
+                decoded_text = raw_bytes.decode(encoding)
+            except (LookupError, ValueError):
+                # Unknown codec name, or bytes that are invalid for this codec.
+                continue
+            # Undo a possible "UTF-8 read as a single-byte codec" round trip.
+            return self._repair_mojibake_text(decoded_text)
+        return raw_bytes.decode("utf-8", errors="replace")
+
+    def _looks_like_mojibake(self, text: str) -> bool:
+        """Return True when *text* resembles UTF-8 misread as Latin-1/CP1252."""
+        if not text:
+            return False
+        suspicious_markers = ("Ã", "Â", "æ", "ä", "å", "ç", "é", "ê", "ï", "ð")
+        marker_hits = sum(text.count(marker) for marker in suspicious_markers)
+        # In Chinese content a cluster of these characters almost always means
+        # UTF-8 bytes were decoded as Latin-1/CP1252.
+        return marker_hits >= 2
+
+    def _repair_mojibake_text(self, text: str) -> str:
+        """Undo a Latin-1/CP1252 mis-decoding of UTF-8 text when it looks garbled.
+
+        Returns *text* unchanged when it does not look garbled or when no
+        round trip yields valid UTF-8.
+        """
+        if not text or not self._looks_like_mojibake(text):
+            return text
+
+        for source_encoding in ("latin1", "cp1252"):
+            try:
+                repaired_text = text.encode(source_encoding).decode("utf-8")
+            except UnicodeError:
+                # Not representable in this codec, or the bytes are not UTF-8.
+                continue
+            # Accept the repair only if it does not lose CJK characters.
+            if repaired_text and self._count_cjk_chars(repaired_text) >= self._count_cjk_chars(text):
+                return repaired_text
+        return text
+
+    def _count_cjk_chars(self, text: str) -> int:
+        """Count characters in the CJK Unified Ideographs block (U+4E00..U+9FFF)."""
+        if not text:
+            return 0
+        return sum(1 for ch in text if "\u4e00" <= ch <= "\u9fff")
+
     def _build_proxies(self) -> Optional[Dict[str, str]]:
         if self.http_proxy:
             return {"http": self.http_proxy, "https": self.http_proxy}