修复抖音图文与卡片文案乱码问题

- 抖音分享页改为优先按 UTF-8 解码响应内容
- 新增常见中文乱码识别与温和修复逻辑
- 统一清洗抖音解析返回的标题与作者字段
2026-05-06 13:35:11 +08:00
parent 14aa2ba067
commit a97e2fc092
1 changed files with 70 additions and 3 deletions
--- a/plugins/douyin_parser/main.py
+++ b/plugins/douyin_parser/main.py
@@ -258,6 +258,10 @@ class DouyinParserPlugin(MessagePluginInterface):
            return data
        default_cover = "https://is1-ssl.mzstatic.com/image/thumb/Purple221/v4/7c/49/e1/7c49e1af-ce92-d1c4-9a93-0a316e47ba94/AppIcon_TikTok-0-0-1x_U007epad-0-1-0-0-85-220.png/512x512bb.jpg"
        media_type = data.get('type') or 'video'
+        # 三条解析链路最终都会走到这里，因此把标题/作者统一再清洗一遍，
+        # 可以同时兜住“本地页面解析”“内网接口”“外部接口”三种来源的乱码问题。
+        data['title'] = self._clean_text(data.get('title'))
+        data['author'] = self._clean_text(data.get('author'))
        if media_type == 'video':
            cover = data.get('cover')
            if isinstance(cover, str):
@@ -345,8 +349,14 @@ class DouyinParserPlugin(MessagePluginInterface):
            proxies=self._build_proxies(),
        )
        response.raise_for_status()
-        response.encoding = response.apparent_encoding or response.encoding or "utf-8"
-        html_content = response.text or ""
+        # 抖音分享页绝大多数场景实际都是 UTF-8。
+        # 之前这里优先使用 apparent_encoding，容易被短文本页面误判成 GBK/Latin-1，
+        # 最终导致图文文案和卡片标题一进解析链路就已经变成乱码。
+        # 这里改成：
+        # 1. 优先按 UTF-8 直接解原始 bytes；
+        # 2. UTF-8 失败时，再回退到响应头 / apparent_encoding；
+        # 3. 最后兜底 replace，至少保证流程不断。
+        html_content = self._decode_http_response_text(response)
        if not html_content.strip():
            raise DouyinParserError("抖音分享页内容为空")
        return html_content
@@ -536,12 +546,69 @@ class DouyinParserPlugin(MessagePluginInterface):
                text = text.encode("utf-8").decode("unicode_escape")
            except Exception:
                pass
-        return text
+        # 某些链路里文本已经在上游被错误按 Latin-1 / CP1252 解过一次，
+        # 这里做一层“仅在明显像乱码时才尝试”的温和修复，避免正常中文被误伤。
+        return self._repair_mojibake_text(text)

    def _clean_text(self, value: Any) -> str:
        """统一清理文本字段，避免标题/作者带空白或转义残留。"""
        return "" if value is None else self._decode_text(value).strip()

+    def _decode_http_response_text(self, response: requests.Response) -> str:
+        """更稳妥地把 HTTP 响应转成文本。
+
+        设计说明：
+        1. 抖音分享页和大部分 JSON/HTML 实际都用 UTF-8；
+        2. `apparent_encoding` 在中文短文本页面上很容易误判，直接用会把整段中文解坏；
+        3. 因此先信任 UTF-8，再逐步回退到 header / apparent / replace。
+        """
+        raw_bytes = response.content or b""
+        if not raw_bytes:
+            return ""
+
+        for encoding in ("utf-8", response.encoding, response.apparent_encoding, "gb18030"):
+            if not encoding:
+                continue
+            try:
+                decoded_text = raw_bytes.decode(encoding)
+                # 如果解出来明显像“UTF-8 被错按单字节编码解释过”，再试着修一手。
+                repaired_text = self._repair_mojibake_text(decoded_text)
+                if repaired_text:
+                    return repaired_text
+            except Exception:
+                continue
+        return raw_bytes.decode("utf-8", errors="replace")
+
+    def _looks_like_mojibake(self, text: str) -> bool:
+        """判断文本是否像常见的 UTF-8 误解码乱码。"""
+        if not text:
+            return False
+        suspicious_markers = ("Ã", "Â", "æ", "ä", "å", "ç", "é", "ê", "ï", "ð")
+        marker_hits = sum(text.count(marker) for marker in suspicious_markers)
+        # 中文场景里这些字符密集出现时，基本就是“UTF-8 被按 Latin-1/CP1252 解了”。
+        return marker_hits >= 2
+
+    def _repair_mojibake_text(self, text: str) -> str:
+        """修复常见的中文乱码，但只在高置信度时生效。"""
+        if not text or not self._looks_like_mojibake(text):
+            return text
+
+        for source_encoding in ("latin1", "cp1252"):
+            try:
+                repaired_text = text.encode(source_encoding).decode("utf-8")
+                # 修复后若中文比例明显提升，就采用修复结果。
+                if repaired_text and self._count_cjk_chars(repaired_text) >= self._count_cjk_chars(text):
+                    return repaired_text
+            except Exception:
+                continue
+        return text
+
+    def _count_cjk_chars(self, text: str) -> int:
+        """统计字符串中的中日韩统一表意文字数量，用于判断修复是否更合理。"""
+        if not text:
+            return 0
+        return sum(1 for ch in text if "\u4e00" <= ch <= "\u9fff")
+
    def _build_proxies(self) -> Optional[Dict[str, str]]:
        if self.http_proxy:
            return {"http": self.http_proxy, "https": self.http_proxy}