From ab302d77d8a1323c62666ddde24aabd7654ef635 Mon Sep 17 00:00:00 2001 From: liuwei Date: Fri, 14 Mar 2025 14:59:00 +0800 Subject: [PATCH] =?UTF-8?q?=E5=8E=BB=E9=99=A430000=E5=AD=97=E7=AC=A6?= =?UTF-8?q?=E4=BB=A5=E4=B8=8A=E5=86=85=E5=AE=B9=EF=BC=8C=E9=98=B2=E6=AD=A2?= =?UTF-8?q?=E8=B6=85=E9=95=BF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- message_summary/compress_chat_data.py | 34 ++++++++++++++++++++------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/message_summary/compress_chat_data.py b/message_summary/compress_chat_data.py index ede217d..e6c5cc4 100644 --- a/message_summary/compress_chat_data.py +++ b/message_summary/compress_chat_data.py @@ -10,6 +10,23 @@ def compress_chat_data(chat_data_str, time_threshold=5): :param time_threshold: 同一发信人连续发言间隔小于该值(秒),则合并 :return: 压缩后的聊天数据的长字符串 """ + # 如果字符串长度超过30000,则去除前面的聊天记录 + if len(chat_data_str) > 30000: + lines = chat_data_str.splitlines() + total_length = 0 + cut_index = 0 + + # 从后往前计算,找到保留哪些行 + for i in range(len(lines) - 1, -1, -1): + line_length = len(lines[i]) + 1 # +1 是为了计入换行符 + total_length += line_length + if total_length > 30000: + cut_index = i + 1 # 保留这个索引之后的行 + break + + # 只保留后面的聊天记录 + chat_data_str = '\n'.join(lines[cut_index:]) + # 解析原始聊天数据为列表 chat_data = [] for line in chat_data_str.splitlines(): @@ -25,6 +42,7 @@ def compress_chat_data(chat_data_str, time_threshold=5): timestamp, sender, content = parts chat_data.append((timestamp, sender, content)) + # 其余代码保持不变 if not chat_data: return "" # 如果没有有效数据,返回空字符串 @@ -37,7 +55,7 @@ def compress_chat_data(chat_data_str, time_threshold=5): for timestamp, sender, content in chat_data: try: time_obj = datetime.strptime(timestamp.strip(), "%Y-%m-%d %H:%M:%S") - + # 去除无意义的空格 content = re.sub(r"\s+", " ", content).strip() if not content: @@ -45,30 +63,30 @@ def compress_chat_data(chat_data_str, time_threshold=5): # 检查日期是否变化(不包含年份) date_str = time_obj.strftime('%m-%d') - + # 如果是第一条消息或日期变化,添加日期标记 if current_date is None or date_str != current_date: current_date = date_str compressed_data.append(f"{date_str}") - + # 检查是否需要合并消息 - if (prev_sender == sender and prev_time_obj is not None and - (time_obj - prev_time_obj).total_seconds() <= time_threshold): + if (prev_sender == sender and prev_time_obj is not None and + (time_obj - prev_time_obj).total_seconds() <= time_threshold): # 合并消息,更新最后一条消息 compressed_data[-1] = f"{prev_time_obj.strftime('%H:%M:%S')},{sender},{prev_content} {content}" else: # 添加新消息 compressed_data.append(f"{time_obj.strftime('%H:%M:%S')},{sender},{content}") - + # 更新前一条消息的信息 prev_time_obj = time_obj prev_sender = sender prev_content = content - + except ValueError as e: # 处理日期格式错误 print(f"日期格式错误: {timestamp}, 错误: {e}") continue # 返回压缩后的数据长字符串 - return '\n'.join(compressed_data) + return '\n'.join(compressed_data) \ No newline at end of file