From 374d3bec49e5f1f37b411c93415f26d1f7f3aa33 Mon Sep 17 00:00:00 2001
From: liuwei <liuwei@wdtrgf.com.cn>
Date: Mon, 3 Mar 2025 15:07:35 +0800
Subject: [PATCH] =?UTF-8?q?=E8=B0=83=E6=95=B4=E8=89=B2=E8=8A=B1=E5=A0=82PD?=
 =?UTF-8?q?F=E6=A0=BC=E5=BC=8F?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 sehuatang/shehuatang.py | 74 ++++++++++++++++++++++++-----------------
 1 file changed, 43 insertions(+), 31 deletions(-)
diff --git a/sehuatang/shehuatang.py b/sehuatang/shehuatang.py
index a470606..49a63bf 100644
--- a/sehuatang/shehuatang.py
+++ b/sehuatang/shehuatang.py
@@ -16,12 +16,12 @@ from reportlab.pdfbase.ttfonts import TTFont
 from reportlab.pdfbase import pdfmetrics
 from datetime import datetime
 from PIL import Image as PILImage
-import re  # 用于正则表达式提取磁力链接
+import re
 from PyPDF2 import PdfReader, PdfWriter
 
 
 def download_image(url):
-    """ 下载大于100KB的图片并返回临时文件路径，仅支持jpg、jpeg和png格式 """
+    """下载大于100KB的图片并返回临时文件路径，仅支持jpg、jpeg和png格式"""
     try:
         if not url.lower().endswith(('.jpg', '.jpeg', '.png')):
             return None
@@ -42,7 +42,7 @@ def download_image(url):
 
 def fetch_and_create_pdf(url):
     """根据给定URL抓取页面并生成PDF"""
-    # 配置Selenium以无头模式（即不显示浏览器窗口）运行
+    # 配置Selenium以无头模式运行
     options = Options()
     options.headless = True
     options.add_argument('--disable-gpu')
@@ -60,7 +60,7 @@ def fetch_and_create_pdf(url):
         enter_button = driver.find_element(By.XPATH, '//a[contains(text(), "满18岁，请点此进入")]')
         enter_button.click()
         print("点击了满18岁按钮")
-        time.sleep(5)  # 等待 5 秒，确保点击后内容加载完成
+        time.sleep(5)  # 等待内容加载
     except Exception as e:
         print("未找到满18岁按钮，跳过此步骤", e)
 
@@ -75,18 +75,18 @@ def fetch_and_create_pdf(url):
     today = datetime.now().strftime('%Y-%m-%d')
 
     # 注册中文字体
-    pdfmetrics.registerFont(TTFont('SamHei', 'fonts/simhei.ttf'))  # 设置中文字体路径
+    pdfmetrics.registerFont(TTFont('SimHei', 'fonts/simhei.ttf'))  # 请确保字体文件路径正确
     styles = getSampleStyleSheet()
 
-    # 设置标题和正文样式都使用SamHei字体
+    # 设置标题和正文样式
     title_style = styles['Heading1']
-    title_style.fontName = 'SamHei'  # 设置字体为SamHei
-    title_style.fontSize = 14  # 设置字体大小
-    title_style.textColor = colors.red  # 设置字体颜色为红色
-    title_style.bold = True  # 设置加粗
+    title_style.fontName = 'SimHei'
+    title_style.fontSize = 14
+    title_style.textColor = colors.red
+    title_style.bold = True
 
     normal_style = styles['Normal']
-    normal_style.fontName = 'SamHei'  # 设置正文使用SamHei字体
+    normal_style.fontName = 'SimHei'
 
     content = []
 
@@ -94,7 +94,7 @@ def fetch_and_create_pdf(url):
     today_posts = []
     for post in posts:
         post_time_span = post.find('span', {'class': 'xi1'})
-        if post_time_span:  # 判断是否存在post_time_span，即认为是当天发布的帖子
+        if post_time_span:  # 判断是否为当天帖子
             today_posts.append(post)
 
     # 设置PDF
@@ -109,7 +109,7 @@ def fetch_and_create_pdf(url):
             post_title = title.get_text()
             post_url = title.get('href')
             print(post_title)
-            # 获取帖子的页面
+            # 获取帖子页面
             post_page_url = 'https://www.sehuatang.net/' + post_url
             driver.get(post_page_url)
             time.sleep(3)
@@ -122,26 +122,39 @@ def fetch_and_create_pdf(url):
             content_div = post_soup.find('div', {'class': 't_fsz'})
 
             if content_div:
-                # 提取文本并将 <br> 标签替换为换行符
-                post_text = content_div.get_text(strip=True)  # 使用 separator='\n' 参数替换 <br> 标签
+                # 提取文本
+                post_text = content_div.get_text(strip=True)
 
                 # 查找磁力链接
-                magnet_links =re.findall(r'magnet:\?[^ \u4e00-\u9fff]+', post_text) # 使用正则表达式查找磁力链接
+                magnet_links = re.findall(r'magnet:\?[^ \u4e00-\u9fff]+', post_text)
 
-                # 添加标题到PDF
+                # 添加标题和来源URL
                 content.append(Paragraph(f"标题:<br /> {post_title}", title_style))
                 content.append(Spacer(1, 12))
                 content.append(Paragraph(f"来源URL:<br /> {post_page_url}<br />", normal_style))
                 content.append(Spacer(1, 12))
-                content.append(Paragraph(f"介绍:<br /> {post_text}<br />", normal_style))
-                content.append(Spacer(1, 12))  # 添加空白区域
 
-                # 如果有磁力链接，将其单独加粗并显示
+                # 解析并格式化 "介绍" 内容
+                intro_content = []
+                intro_lines = post_text.split("【")  # 以【分割字段
+                for line in intro_lines:
+                    if line.strip():
+                        line = "【" + line if not line.startswith("【") else line
+                        if "】：" in line:
+                            key, value = line.split("】：", 1)
+                            intro_content.append(Paragraph(f"{key}】：<br />{value}<br />", normal_style))
+                            intro_content.append(Spacer(1, 6))  # 字段间距
+
+                # 添加格式化的介绍部分
+                content.append(Paragraph("介绍:<br />", normal_style))
+                content.extend(intro_content)
+                content.append(Spacer(1, 12))
+
+                # 添加磁力链接
                 if magnet_links:
                     for magnet_link in magnet_links:
-                        # 将磁力链接作为加粗的内容显示
-                        content.append(Paragraph(f"Magnet Link:<br /><br /> <b>{magnet_link}</b><br /><br />", normal_style))
-                        content.append(Spacer(1, 12))  # 添加空白区域
+                        content.append(Paragraph(f"磁力链接:<br /><br /><b>{magnet_link}</b><br /><br />", normal_style))
+                        content.append(Spacer(1, 12))
 
                 # 添加图片
                 image_links = []
@@ -156,20 +169,19 @@ def fetch_and_create_pdf(url):
                         if image:
                             img = PILImage.open(image)
                             img_width, img_height = img.size
-                            image_width = 400  # 图片宽度
+                            image_width = 400  # 固定宽度
                             image_height = int((img_height / img_width) * image_width)
 
-                            # 将图片加载到内存流中，并添加到PDF
                             img_stream = BytesIO(image.getvalue())
                             content.append(Image(img_stream, width=image_width, height=image_height))
-                            content.append(Spacer(1, 12))  # 添加空白区域
+                            content.append(Spacer(1, 12))
 
-                content.append(Spacer(1, 12))  # 为每个帖子添加间距
+                content.append(Spacer(1, 12))  # 帖子间距
 
     # 生成PDF
     doc.build(content)
 
-    # 获取PDF文件的绝对路径
+    # 获取PDF绝对路径
     absolute_pdf_path = os.path.abspath(pdf_filename)
     print(f"PDF saved as {absolute_pdf_path}")
     # 加密PDF
@@ -182,11 +194,11 @@ def fetch_and_create_pdf(url):
 
 
 def add_pdf_encryption(pdf_file, password="4000"):
-    """ 使用PyPDF2为PDF添加加密保护 """
+    """使用PyPDF2为PDF添加加密保护"""
     pdf_writer = PdfWriter()
     pdf_reader = PdfReader(pdf_file)
 
-    # 将所有页面添加到PDF写入器中
+    # 将所有页面添加到PDF写入器
     for page_num in range(len(pdf_reader.pages)):
         pdf_writer.add_page(pdf_reader.pages[page_num])
 
@@ -208,4 +220,4 @@ def pdf_file_path():
 
 
 if __name__ == "__main__":
-    pdf_file_path()
+    pdf_file_path()
\ No newline at end of file