尝试进行总结优化

2026-01-06 16:02:00 +08:00
parent b8e881980f
commit 491c0d16fb
3 changed files with 215 additions and 27 deletions
--- a/utils/compress_chat_data.py
+++ b/utils/compress_chat_data.py
@@ -4,13 +4,20 @@ from datetime import datetime

 def compress_chat_data(chat_data_str, time_threshold=5):
    """
-    压缩聊天数据，减少 token 使用，格式为时间,发信人,内容。
+    压缩聊天数据，减少 token 使用。
+
+    支持两种格式：
+    1. 旧格式：时间,发信人,内容（例如：2025-01-06 08:30,张三,大家好）
+    2. 新格式（优化后）：
+       【时间】
+       发信人：内容
+            内容（续）

    :param chat_data_str: 原始聊天记录的长字符串
-    :param time_threshold: 同一发信人连续发言间隔小于该值(秒)，则合并
+    :param time_threshold: 同一发信人连续发言间隔小于该值(秒)，则合并（仅对旧格式有效）
    :return: 压缩后的聊天数据的长字符串
    """
-    # 如果字符串长度超过30000，则去除前面的聊天记录
+    # 如果字符串长度超过40000，则去除前面的聊天记录（保留最新的）
    if len(chat_data_str) > 40000:
        lines = chat_data_str.splitlines()
        total_length = 0
@@ -27,6 +34,20 @@ def compress_chat_data(chat_data_str, time_threshold=5):
        # 只保留后面的聊天记录
        chat_data_str = '\n'.join(lines[cut_index:])

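+    # Detect format (heuristic): text containing both 【 and 】 anywhere is treated as new format, so old-format data whose message content includes these brackets is returned uncompressed.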
+    has_new_format = '【' in chat_data_str and '】' in chat_data_str
+
+    if has_new_format:
+        # 新格式：已经是压缩格式，直接返回
+        # 只需要确保不超过字符限制（上面已经处理）
+        return chat_data_str
+    else:
+        # 旧格式：需要压缩处理
+        return _compress_old_format(chat_data_str, time_threshold)
+
+
+def _compress_old_format(chat_data_str, time_threshold):
+    """压缩旧格式的聊天数据（时间,发信人,内容）"""
    # 解析原始聊天数据为列表
    chat_data = []
    for line in chat_data_str.splitlines():