diff --git a/utils/wechat/message_to_db.py b/utils/wechat/message_to_db.py index c8a38c8..b74813e 100644 --- a/utils/wechat/message_to_db.py +++ b/utils/wechat/message_to_db.py @@ -3,6 +3,7 @@ import time from datetime import datetime, timedelta import xml.etree.ElementTree as ET import concurrent.futures # 添加线程池支持 +import html import os import base64 import re @@ -126,26 +127,61 @@ class MessageStorage: if not xml_content: return {} - aeskey_match = self._aeskey_re.search(xml_content) - if not aeskey_match: - return {} + normalized_xml = html.unescape(xml_content) + aeskey = "" + md5 = "" + length = 0 + urls = [] - url_match = ( - self._emoji_cdn_re.search(xml_content) - or self._emoji_encrypt_re.search(xml_content) - or self._emoji_thumb_re.search(xml_content) - ) - if not url_match: - return {} + try: + root = ET.fromstring(normalized_xml) + emoji_node = root.find(".//emoji") + if emoji_node is not None: + aeskey = (emoji_node.attrib.get("aeskey") or "").strip() + md5 = (emoji_node.attrib.get("md5") or "").strip() + try: + length = int((emoji_node.attrib.get("len") or "0").strip() or 0) + except Exception: + length = 0 + urls.extend([ + (emoji_node.attrib.get("encrypturl") or "").strip(), + (emoji_node.attrib.get("cdnurl") or "").strip(), + (emoji_node.attrib.get("thumburl") or "").strip(), + ]) + except Exception: + pass - md5_match = re.search(r'md5="(.*?)"', xml_content) - length_match = re.search(r'len="(\d+)"', xml_content) + if not aeskey: + aeskey_match = self._aeskey_re.search(normalized_xml) + aeskey = aeskey_match.group(1).strip() if aeskey_match else "" + + if not md5: + md5_match = re.search(r'md5="(.*?)"', normalized_xml) + md5 = md5_match.group(1).strip() if md5_match else "" + + if not length: + length_match = re.search(r'len="(\d+)"', normalized_xml) + length = int(length_match.group(1)) if length_match else 0 + + encrypt_match = self._emoji_encrypt_re.search(normalized_xml) + cdn_match = self._emoji_cdn_re.search(normalized_xml) + thumb_match = self._emoji_thumb_re.search(normalized_xml) + urls.extend([ + encrypt_match.group(1).strip() if encrypt_match else "", + cdn_match.group(1).strip() if cdn_match else "", + thumb_match.group(1).strip() if thumb_match else "", + ]) + urls = [url for index, url in enumerate(urls) if url and url not in urls[:index]] + + if not aeskey or not urls: + return {} return { - "aeskey": aeskey_match.group(1), - "url": url_match.group(1), - "md5": md5_match.group(1) if md5_match else "", - "length": int(length_match.group(1)) if length_match else 0, + "aeskey": aeskey, + "url": urls[0], + "urls": urls, + "md5": md5, + "length": length, } async def _process_emoji_record(self, msg_record: Dict) -> bool: @@ -163,8 +199,24 @@ class MessageStorage: return False try: - base64_str = await self.client.download_cdn_file(emoji_info["aeskey"], emoji_info["url"]) + base64_str = None + last_error = None + for file_url in emoji_info.get("urls", []) or [emoji_info.get("url", "")]: + if not file_url: + continue + try: + base64_str = await self.client.download_cdn_file(emoji_info["aeskey"], file_url) + if base64_str: + break + except Exception as e: + last_error = e + logger.warning( + f"表情下载地址尝试失败: msg_id={message_id}, url={file_url[:120]}, error={e}" + ) if not base64_str: + if last_error: + logger.warning(f"表情下载全部地址均失败: msg_id={message_id}, error={last_error}") + return False logger.warning(f"表情下载返回为空: msg_id={message_id}") return False