From 334850ae324f616e75e0039d89178ea6f0b74855 Mon Sep 17 00:00:00 2001
From: liuwei <liuwei@wdtrgf.com.cn>
Date: Mon, 3 Mar 2025 15:31:06 +0800
Subject: [PATCH] =?UTF-8?q?=E5=8A=A0=E5=85=A5=E5=88=86=E9=A1=B5=EF=BC=8C?=
 =?UTF-8?q?=E9=80=86=E5=BA=8F=E6=8E=92=E5=88=97=E5=B8=96=E5=AD=90?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 sehuatang/shehuatang.py | 83 +++++++++++++----------------------------
 1 file changed, 25 insertions(+), 58 deletions(-)

diff --git a/sehuatang/shehuatang.py b/sehuatang/shehuatang.py
index 7165bd0..19aea58 100644
--- a/sehuatang/shehuatang.py
+++ b/sehuatang/shehuatang.py
@@ -10,7 +10,7 @@ from webdriver_manager.chrome import ChromeDriverManager
 from bs4 import BeautifulSoup
 from reportlab.lib.pagesizes import letter
 from reportlab.lib import colors
-from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image
+from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image, PageBreak
 from reportlab.lib.styles import getSampleStyleSheet
 from reportlab.pdfbase.ttfonts import TTFont
 from reportlab.pdfbase import pdfmetrics
@@ -20,6 +20,7 @@ import re
 from PyPDF2 import PdfReader, PdfWriter
 
 
+# download_image: behaviour unchanged in this revision (only inline comments were trimmed).
 def download_image(url):
     """下载大于100KB的图片并返回临时文件路径，仅支持jpg、jpeg和png格式"""
     try:
@@ -28,11 +29,11 @@ def download_image(url):
 
         headers = {
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
-            'Referer': 'https://tu.a7nz4.us',  # 防止403
+            'Referer': 'https://tu.a7nz4.us',
         }
 
         response = requests.get(url, headers=headers)
-        response.raise_for_status()  # 确保请求成功
+        response.raise_for_status()
         image = BytesIO(response.content)
         return image
     except requests.exceptions.RequestException as e:
@@ -42,43 +43,39 @@ def download_image(url):
 
 def fetch_and_create_pdf(url):
     """根据给定URL抓取页面并生成PDF"""
-    # 配置Selenium以无头模式运行
+    # 配置Selenium
     options = Options()
     options.headless = True
     options.add_argument('--disable-gpu')
     options.add_argument('--no-sandbox')
-
-    # 使用webdriver-manager自动下载ChromeDriver
     driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
 
     # 获取目标页面
     driver.get(url)
     time.sleep(5)
 
-    # 处理“满18岁，请点此进入”按钮
+    # 处理年龄验证按钮
     try:
         enter_button = driver.find_element(By.XPATH, '//a[contains(text(), "满18岁，请点此进入")]')
         enter_button.click()
         print("点击了满18岁按钮")
-        time.sleep(5)  # 等待内容加载
+        time.sleep(5)
     except Exception as e:
         print("未找到满18岁按钮，跳过此步骤", e)
 
-    # 使用BeautifulSoup解析页面
+    # 解析页面
     html = driver.page_source
     soup = BeautifulSoup(html, 'html.parser', from_encoding='utf-8')
-
-    # 定位到帖子列表
     posts = soup.find_all('tbody', {'id': lambda x: x and x.startswith('normalthread')})
 
     # 获取今天的日期
     today = datetime.now().strftime('%Y-%m-%d')
 
     # 注册中文字体
-    pdfmetrics.registerFont(TTFont('SimHei', 'fonts/simhei.ttf'))  # 请确保字体文件路径正确
+    pdfmetrics.registerFont(TTFont('SimHei', 'fonts/simhei.ttf'))
     styles = getSampleStyleSheet()
 
-    # 设置标题和正文样式
+    # 设置样式
     title_style = styles['Heading1']
     title_style.fontName = 'SimHei'
     title_style.fontSize = 14
@@ -90,65 +87,43 @@ def fetch_and_create_pdf(url):
 
     content = []
 
-    # 过滤出当天的帖子
+    # 过滤当天帖子并倒序
     today_posts = []
     for post in posts:
         post_time_span = post.find('span', {'class': 'xi1'})
-        if post_time_span:  # 判断是否为当天帖子
+        if post_time_span:
             today_posts.append(post)
+    today_posts = today_posts[::-1]  # 倒序处理
 
     # 设置PDF
     pdf_filename = f"JAV-{today}-{len(today_posts)}.pdf"
     doc = SimpleDocTemplate(pdf_filename, pagesize=letter)
 
-    # 遍历当天的帖子并提取信息
+    # 遍历帖子
     for post in today_posts:
-        # 查找帖子标题
         title = post.find('a', {'class': 's xst'})
         if title:
             post_title = title.get_text()
             post_url = title.get('href')
             print(post_title)
-            # 获取帖子页面
+
+            # 获取帖子内容
             post_page_url = 'https://www.sehuatang.net/' + post_url
             driver.get(post_page_url)
             time.sleep(3)
 
-            # 获取帖子页面内容
             post_html = driver.page_source
             post_soup = BeautifulSoup(post_html, 'html.parser', from_encoding='utf-8')
-
-            # 提取 <div class="t_fsz"> 下的文本和图片
             content_div = post_soup.find('div', {'class': 't_fsz'})
 
             if content_div:
-                # 提取文本
+                # 提取文本和磁力链接
                 post_text = content_div.get_text(strip=True)
-
-                # 查找磁力链接
                 magnet_links = re.findall(r'magnet:\?[^ \u4e00-\u9fff]+', post_text)
 
-                # 添加标题和来源URL
+                # 添加标题
                 content.append(Paragraph(f" {post_title}", title_style))
                 content.append(Spacer(1, 12))
-                # content.append(Paragraph(f"来源URL:<br /> {post_page_url}<br />", normal_style))
-                # content.append(Spacer(1, 12))
-
-                # 解析并格式化 "介绍" 内容
-                # intro_content = []
-                # intro_lines = post_text.split("【")  # 以【分割字段
-                # for line in intro_lines:
-                #     if line.strip():
-                #         line = "【" + line if not line.startswith("【") else line
-                #         if "】：" in line:
-                #             key, value = line.split("】：", 1)
-                #             intro_content.append(Paragraph(f"{key}】：<br />{value}<br />", normal_style))
-                #             intro_content.append(Spacer(1, 6))  # 字段间距
-                #
-                # # 添加格式化的介绍部分
-                # content.append(Paragraph("介绍:<br />", normal_style))
-                # content.extend(intro_content)
-                # content.append(Spacer(1, 12))
 
                 # 添加磁力链接
                 if magnet_links:
@@ -162,53 +137,45 @@ def fetch_and_create_pdf(url):
                 for img in images:
                     if img.get('zoomfile') and 'http' in img.get('zoomfile'):
                         image_links.append(img.get('zoomfile'))
-                print(image_links)
+
                 if image_links:
                     for img_link in image_links:
                         image = download_image(img_link)
                         if image:
                             img = PILImage.open(image)
                             img_width, img_height = img.size
-                            image_width = 500  # 固定宽度
+                            image_width = 500
                             image_height = int((img_height / img_width) * image_width)
-
                             img_stream = BytesIO(image.getvalue())
                             content.append(Image(img_stream, width=image_width, height=image_height))
                             content.append(Spacer(1, 12))
 
-                content.append(Spacer(1, 12))  # 帖子间距
+                # 在每个帖子后添加分页符（除了最后一页）
+                if post != today_posts[-1]:
+                    content.append(PageBreak())
 
     # 生成PDF
     doc.build(content)
-
-    # 获取PDF绝对路径
     absolute_pdf_path = os.path.abspath(pdf_filename)
     print(f"PDF saved as {absolute_pdf_path}")
+
     # 加密PDF
     add_pdf_encryption(absolute_pdf_path)
-
-    # 关闭浏览器
     driver.quit()
 
     return absolute_pdf_path
 
 
+# add_pdf_encryption and pdf_file_path: behaviour unchanged in this revision.
 def add_pdf_encryption(pdf_file, password="4000"):
     """使用PyPDF2为PDF添加加密保护"""
     pdf_writer = PdfWriter()
     pdf_reader = PdfReader(pdf_file)
-
-    # 将所有页面添加到PDF写入器
     for page_num in range(len(pdf_reader.pages)):
         pdf_writer.add_page(pdf_reader.pages[page_num])
-
-    # 添加密码
     pdf_writer.encrypt(password)
-
-    # 保存加密后的PDF
     with open(pdf_file, "wb") as output_pdf:
         pdf_writer.write(output_pdf)
-
     print(f"PDF加密成功，密码为: {password}")