修复抖音图文与卡片文案乱码问题
- 抖音分享页改为优先按 UTF-8 解码响应内容
- 新增常见中文乱码识别与温和修复逻辑
- 统一清洗抖音解析返回的标题与作者字段
This commit is contained in:
@@ -258,6 +258,10 @@ class DouyinParserPlugin(MessagePluginInterface):
|
||||
return data
|
||||
default_cover = "https://is1-ssl.mzstatic.com/image/thumb/Purple221/v4/7c/49/e1/7c49e1af-ce92-d1c4-9a93-0a316e47ba94/AppIcon_TikTok-0-0-1x_U007epad-0-1-0-0-85-220.png/512x512bb.jpg"
|
||||
media_type = data.get('type') or 'video'
|
||||
# 三条解析链路最终都会走到这里,因此把标题/作者统一再清洗一遍,
|
||||
# 可以同时兜住“本地页面解析”“内网接口”“外部接口”三种来源的乱码问题。
|
||||
data['title'] = self._clean_text(data.get('title'))
|
||||
data['author'] = self._clean_text(data.get('author'))
|
||||
if media_type == 'video':
|
||||
cover = data.get('cover')
|
||||
if isinstance(cover, str):
|
||||
@@ -345,8 +349,14 @@ class DouyinParserPlugin(MessagePluginInterface):
|
||||
proxies=self._build_proxies(),
|
||||
)
|
||||
response.raise_for_status()
|
||||
response.encoding = response.apparent_encoding or response.encoding or "utf-8"
|
||||
html_content = response.text or ""
|
||||
# 抖音分享页绝大多数场景实际都是 UTF-8。
|
||||
# 之前这里优先使用 apparent_encoding,容易被短文本页面误判成 GBK/Latin-1,
|
||||
# 最终导致图文文案和卡片标题一进解析链路就已经变成乱码。
|
||||
# 这里改成:
|
||||
# 1. 优先按 UTF-8 直接解原始 bytes;
|
||||
# 2. UTF-8 失败时,再回退到响应头 / apparent_encoding;
|
||||
# 3. 最后兜底 replace,至少保证流程不断。
|
||||
html_content = self._decode_http_response_text(response)
|
||||
if not html_content.strip():
|
||||
raise DouyinParserError("抖音分享页内容为空")
|
||||
return html_content
|
||||
@@ -536,12 +546,69 @@ class DouyinParserPlugin(MessagePluginInterface):
|
||||
text = text.encode("utf-8").decode("unicode_escape")
|
||||
except Exception:
|
||||
pass
|
||||
return text
|
||||
# 某些链路里文本已经在上游被错误按 Latin-1 / CP1252 解过一次,
|
||||
# 这里做一层“仅在明显像乱码时才尝试”的温和修复,避免正常中文被误伤。
|
||||
return self._repair_mojibake_text(text)
|
||||
|
||||
def _clean_text(self, value: Any) -> str:
    """Normalize a text field such as a title or author name.

    Returns an empty string when *value* is None; otherwise the value is
    passed through ``self._decode_text`` and surrounding whitespace is
    stripped from the result.
    """
    if value is None:
        return ""
    decoded = self._decode_text(value)
    return decoded.strip()
|
||||
|
||||
def _decode_http_response_text(self, response: requests.Response) -> str:
    """Turn the body of an HTTP response into text, preferring UTF-8.

    The raw bytes in ``response.content`` are decoded with each candidate
    codec in turn: UTF-8 first, then the encoding reported on the response,
    then ``response.apparent_encoding``, then GB18030. UTF-8 is tried first
    because the guessed ``apparent_encoding`` was observed to misdetect
    short Chinese pages, which garbled the text before any parsing ran.

    Returns:
        The first successfully decoded (and, when it looks like mojibake,
        repaired) non-empty text. An empty body yields "". If every
        candidate fails, the bytes are decoded as UTF-8 with undecodable
        sequences replaced so the caller always receives a string.
    """
    payload = response.content or b""
    if not payload:
        return ""

    candidates = ("utf-8", response.encoding, response.apparent_encoding, "gb18030")
    for codec in candidates:
        # The response-provided encodings may be None or empty; skip those.
        if not codec:
            continue
        try:
            # A successful decode can still be double-encoded text, so give
            # the mojibake repair step a chance before accepting it.
            text = self._repair_mojibake_text(payload.decode(codec))
        except Exception:
            # Best effort: an unknown codec name or a decode failure simply
            # moves on to the next candidate.
            continue
        if text:
            return text

    # Last resort: never fail, accept replacement characters instead.
    return payload.decode("utf-8", errors="replace")
|
||||
|
||||
def _looks_like_mojibake(self, text: str) -> bool:
|
||||
"""判断文本是否像常见的 UTF-8 误解码乱码。"""
|
||||
if not text:
|
||||
return False
|
||||
suspicious_markers = ("Ã", "Â", "æ", "ä", "å", "ç", "é", "ê", "ï", "ð")
|
||||
marker_hits = sum(text.count(marker) for marker in suspicious_markers)
|
||||
# 中文场景里这些字符密集出现时,基本就是“UTF-8 被按 Latin-1/CP1252 解了”。
|
||||
return marker_hits >= 2
|
||||
|
||||
def _repair_mojibake_text(self, text: str) -> str:
    """Try to undo a wrong Latin-1 / CP1252 decode of UTF-8 text.

    Nothing is attempted unless *text* is non-empty and
    ``self._looks_like_mojibake`` flags it. The text is then re-encoded
    with Latin-1, and failing that CP1252, and decoded again as UTF-8.

    Returns:
        The first round-tripped candidate that is non-empty and contains at
        least as many CJK ideographs as the input; otherwise the input
        unchanged.
    """
    if not text:
        return text
    if not self._looks_like_mojibake(text):
        return text

    for codec in ("latin1", "cp1252"):
        try:
            candidate = text.encode(codec).decode("utf-8")
            if not candidate:
                continue
            # Reject a candidate that loses CJK characters compared with
            # the input; an equal count is still accepted.
            if self._count_cjk_chars(candidate) < self._count_cjk_chars(text):
                continue
        except Exception:
            # The round trip is not possible with this codec; try the next.
            continue
        return candidate

    return text
|
||||
|
||||
def _count_cjk_chars(self, text: str) -> int:
|
||||
"""统计字符串中的中日韩统一表意文字数量,用于判断修复是否更合理。"""
|
||||
if not text:
|
||||
return 0
|
||||
return sum(1 for ch in text if "\u4e00" <= ch <= "\u9fff")
|
||||
|
||||
def _build_proxies(self) -> Optional[Dict[str, str]]:
|
||||
if self.http_proxy:
|
||||
return {"http": self.http_proxy, "https": self.http_proxy}
|
||||
|
||||
Reference in New Issue
Block a user