Files
abot/utils/message_formatter.py

125 lines
6.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import xml.etree.ElementTree as ET
import html
import re
def _clean_text(value: str) -> str:
if not value:
return ""
value = html.unescape(value)
value = re.sub(r"<br\s*/?>", "\n", value, flags=re.IGNORECASE)
value = re.sub(r"<[^>]+>", "", value)
return value.strip()
def _extract_first(pattern: str, text: str, default: str = "") -> str:
match = re.search(pattern, text, re.DOTALL | re.IGNORECASE)
return match.group(1) if match else default
def _format_referenced_content(ref_type: str, quoted_content: str, xml_content: str) -> str:
cleaned = _clean_text(quoted_content)
lower_xml = (quoted_content or "") + (xml_content or "")
lower_xml = lower_xml.lower()
if ref_type in {"3"} or "<img" in lower_xml or "cdnthumburl" in lower_xml:
return "[图片]"
if ref_type in {"43", "62"} or "<videomsg" in lower_xml or "cdnvideourl" in lower_xml:
return "[视频]"
if ref_type in {"47", "1048625", "1090519089"} or "<emoji" in lower_xml or "<emoticonmd5>" in lower_xml:
return "[表情]"
if ref_type in {"34"} or "<voicemsg" in lower_xml:
return "[语音]"
if ref_type in {"48"} or "<location" in lower_xml:
return "[位置]"
if ref_type in {"49"}:
title = _extract_first(r"<title>(.*?)</title>", quoted_content) or _extract_first(r"<title>(.*?)</title>", xml_content)
title = _clean_text(title)
return f"[链接] {title}" if title else "[链接]"
if cleaned:
return cleaned
return "[消息]"
def _extract_media_preview(ref_type: str, quoted_content: str) -> dict:
payload = html.unescape(quoted_content or "")
preview = {"reference_type": "text", "preview_image": "", "preview_video_thumb": ""}
if ref_type in {"3"} or "<img" in payload.lower():
preview["reference_type"] = "image"
preview["preview_image"] = (
_extract_first(r'cdnthumburl="(.*?)"', payload)
or _extract_first(r"<cdnthumburl><!\[CDATA\[(.*?)\]\]></cdnthumburl>", payload)
or _extract_first(r"<cdnmidimgurl><!\[CDATA\[(.*?)\]\]></cdnmidimgurl>", payload)
)
return preview
if ref_type in {"43", "62"} or "<videomsg" in payload.lower():
preview["reference_type"] = "video"
preview["preview_video_thumb"] = (
_extract_first(r'cdnthumburl="(.*?)"', payload)
or _extract_first(r"<cdnthumburl><!\[CDATA\[(.*?)\]\]></cdnthumburl>", payload)
)
return preview
if ref_type in {"47", "1048625", "1090519089"} or "<emoji" in payload.lower():
preview["reference_type"] = "emoji"
return preview
return preview
def parse_quote_message(xml_content: str) -> dict:
    """Parse a quote (reply) message into its display components.

    ``&lt;`` / ``&gt;`` are unescaped across the whole payload first, so that
    embedded markup becomes searchable by the tag patterns below.

    Args:
        xml_content: The message payload as an XML string.

    Returns:
        A dict with the keys ``main_content`` (text of the first ``<title>``),
        ``display_name`` (text of the first ``<displayname>``),
        ``quoted_content`` (display text for the quoted message),
        ``reference_svrid``, ``reference_type``, ``preview_image``,
        ``preview_video_thumb`` and ``formatted_message``.
    """
    xml_content = xml_content.replace('&lt;', '<').replace('&gt;', '>')

    # Reply text and author name, each with a placeholder when missing/empty.
    raw_title = _extract_first(r'<title>(.*?)</title>', xml_content, "[无标题]")
    main_content = _clean_text(raw_title) or "[无标题]"
    raw_name = _extract_first(r'<displayname>(.*?)</displayname>', xml_content, "未知用户")
    display_name = _clean_text(raw_name) or "未知用户"

    # Fields of the message being quoted, taken from inside <refermsg>.
    refer_content = _extract_first(r'<refermsg>.*?<content>(.*?)</content>', xml_content)
    refer_type = _extract_first(r'<refermsg>.*?<type>(.*?)</type>', xml_content)
    refer_svrid = _extract_first(r'<refermsg>.*?<svrid>(.*?)</svrid>', xml_content)

    pretty_reference = _format_referenced_content(refer_type, refer_content, xml_content)
    preview = _extract_media_preview(refer_type, refer_content)

    if display_name and pretty_reference:
        formatted_message = f"{main_content}\n引用 {display_name}{pretty_reference}"
    else:
        formatted_message = main_content

    parsed = {
        "main_content": main_content,
        "display_name": display_name,
        "quoted_content": pretty_reference,
        "reference_svrid": refer_svrid,
    }
    parsed["reference_type"] = preview.get("reference_type", "text")
    parsed["preview_image"] = preview.get("preview_image", "")
    parsed["preview_video_thumb"] = preview.get("preview_video_thumb", "")
    parsed["formatted_message"] = formatted_message
    return parsed
def format_quote_message(xml_content: str) -> str:
    """Format a quote (reply) message as display text.

    Args:
        xml_content: The message payload as an XML string.

    Returns:
        The ``formatted_message`` produced by ``parse_quote_message``.  If
        parsing raises, the raw text of the first ``<title>`` element is
        returned instead; when no title can be found, or the input is not a
        string, the placeholder ``"[引用消息]"`` is returned.  This function
        never raises an ``Exception`` subclass to the caller.
    """
    try:
        return parse_quote_message(xml_content)["formatted_message"]
    except Exception:
        # Deliberate best-effort boundary: callers get a readable fallback
        # rather than an error.  Non-string input is checked explicitly instead
        # of relying on a bare ``except`` around ``re.search``.
        if isinstance(xml_content, str):
            # Same flags as ``_extract_first`` so multi-line or upper-case
            # <title> elements are found here as well.
            match = re.search(r"<title>(.*?)</title>", xml_content, re.DOTALL | re.IGNORECASE)
            if match:
                return match.group(1)
        return "[引用消息]"
if __name__ == '__main__':
    # Manual smoke test: format one captured quote message and print the result.
    sample_xml = """
<?xml version="1.0"?> <msg> <appmsg appid="" sdkver="0"> <title>那也没事,都是富二代,都是送出国镀个金回家继承家业的</title> <des /> <action /> <type>57</type> <showtype>0</showtype> <soundtype>0</soundtype> <mediatagname /> <messageext /> <messageaction /> <content /> <contentattr>0</contentattr> <url /> <lowurl /> <dataurl /> <lowdataurl /> <songalbumurl /> <songlyric /> <appattach> <totallen>0</totallen> <attachid /> <emoticonmd5></emoticonmd5> <fileext /> <aeskey></aeskey> </appattach> <extinfo /> <sourceusername /> <sourcedisplayname /> <thumburl /> <md5 /> <statextstr /> <refermsg> <type>1</type> <svrid>2568355911763278189</svrid> <fromusr>45317011307@chatroom</fromusr> <chatusr>wxid_twrbhdxddlud12</chatusr> <displayname>水牛🐃</displayname> <content>都是富二代。</content> <msgsource>&lt;msgsource&gt; &lt;sec_msg_node&gt; &lt;alnode&gt; &lt;fr&gt;1&lt;/fr&gt; &lt;/alnode&gt; &lt;/sec_msg_node&gt; &lt;pua&gt;1&lt;/pua&gt; &lt;silence&gt;1&lt;/silence&gt; &lt;membercount&gt;125&lt;/membercount&gt; &lt;signature&gt;N0_V1_DaX/Y3s2|v1_PvH3m56P&lt;/signature&gt; &lt;tmp_node&gt; &lt;publisher-id&gt;&lt;/publisher-id&gt; &lt;/tmp_node&gt; &lt;/msgsource&gt; </msgsource> <createtime>1743491847</createtime> </refermsg> </appmsg> <fromusername>maoyijie</fromusername> <scene>0</scene> <appinfo> <version>1</version> <appname /> </appinfo> <commenturl /> </msg>
"""
    print(format_quote_message(sample_xml))