From 374d3bec49e5f1f37b411c93415f26d1f7f3aa33 Mon Sep 17 00:00:00 2001 From: liuwei Date: Mon, 3 Mar 2025 15:07:35 +0800 Subject: [PATCH] =?UTF-8?q?=E8=B0=83=E6=95=B4=E8=89=B2=E8=8A=B1=E5=A0=82PD?= =?UTF-8?q?F=E6=A0=BC=E5=BC=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- sehuatang/shehuatang.py | 74 ++++++++++++++++++++++++----------------- 1 file changed, 43 insertions(+), 31 deletions(-) diff --git a/sehuatang/shehuatang.py b/sehuatang/shehuatang.py index a470606..49a63bf 100644 --- a/sehuatang/shehuatang.py +++ b/sehuatang/shehuatang.py @@ -16,12 +16,12 @@ from reportlab.pdfbase.ttfonts import TTFont from reportlab.pdfbase import pdfmetrics from datetime import datetime from PIL import Image as PILImage -import re # 用于正则表达式提取磁力链接 +import re from PyPDF2 import PdfReader, PdfWriter def download_image(url): - """ 下载大于100KB的图片并返回临时文件路径,仅支持jpg、jpeg和png格式 """ + """下载大于100KB的图片并返回临时文件路径,仅支持jpg、jpeg和png格式""" try: if not url.lower().endswith(('.jpg', '.jpeg', '.png')): return None @@ -42,7 +42,7 @@ def download_image(url): def fetch_and_create_pdf(url): """根据给定URL抓取页面并生成PDF""" - # 配置Selenium以无头模式(即不显示浏览器窗口)运行 + # 配置Selenium以无头模式运行 options = Options() options.headless = True options.add_argument('--disable-gpu') @@ -60,7 +60,7 @@ def fetch_and_create_pdf(url): enter_button = driver.find_element(By.XPATH, '//a[contains(text(), "满18岁,请点此进入")]') enter_button.click() print("点击了满18岁按钮") - time.sleep(5) # 等待 5 秒,确保点击后内容加载完成 + time.sleep(5) # 等待内容加载 except Exception as e: print("未找到满18岁按钮,跳过此步骤", e) @@ -75,18 +75,18 @@ def fetch_and_create_pdf(url): today = datetime.now().strftime('%Y-%m-%d') # 注册中文字体 - pdfmetrics.registerFont(TTFont('SamHei', 'fonts/simhei.ttf')) # 设置中文字体路径 + pdfmetrics.registerFont(TTFont('SimHei', 'fonts/simhei.ttf')) # 请确保字体文件路径正确 styles = getSampleStyleSheet() - # 设置标题和正文样式都使用SamHei字体 + # 设置标题和正文样式 title_style = styles['Heading1'] - title_style.fontName = 'SamHei' # 设置字体为SamHei - title_style.fontSize = 14 # 设置字体大小 - title_style.textColor = colors.red # 设置字体颜色为红色 - title_style.bold = True # 设置加粗 + title_style.fontName = 'SimHei' + title_style.fontSize = 14 + title_style.textColor = colors.red + title_style.bold = True normal_style = styles['Normal'] - normal_style.fontName = 'SamHei' # 设置正文使用SamHei字体 + normal_style.fontName = 'SimHei' content = [] @@ -94,7 +94,7 @@ def fetch_and_create_pdf(url): today_posts = [] for post in posts: post_time_span = post.find('span', {'class': 'xi1'}) - if post_time_span: # 判断是否存在post_time_span,即认为是当天发布的帖子 + if post_time_span: # 判断是否为当天帖子 today_posts.append(post) # 设置PDF @@ -109,7 +109,7 @@ def fetch_and_create_pdf(url): post_title = title.get_text() post_url = title.get('href') print(post_title) - # 获取帖子的页面 + # 获取帖子页面 post_page_url = 'https://www.sehuatang.net/' + post_url driver.get(post_page_url) time.sleep(3) @@ -122,26 +122,39 @@ def fetch_and_create_pdf(url): content_div = post_soup.find('div', {'class': 't_fsz'}) if content_div: - # 提取文本并将
标签替换为换行符 - post_text = content_div.get_text(strip=True) # 使用 separator='\n' 参数替换
标签 + # 提取文本 + post_text = content_div.get_text(strip=True) # 查找磁力链接 - magnet_links =re.findall(r'magnet:\?[^ \u4e00-\u9fff]+', post_text) # 使用正则表达式查找磁力链接 + magnet_links = re.findall(r'magnet:\?[^ \u4e00-\u9fff]+', post_text) - # 添加标题到PDF + # 添加标题和来源URL content.append(Paragraph(f"标题:
{post_title}", title_style)) content.append(Spacer(1, 12)) content.append(Paragraph(f"来源URL:
{post_page_url}
", normal_style)) content.append(Spacer(1, 12)) - content.append(Paragraph(f"介绍:
{post_text}
", normal_style)) - content.append(Spacer(1, 12)) # 添加空白区域 - # 如果有磁力链接,将其单独加粗并显示 + # 解析并格式化 "介绍" 内容 + intro_content = [] + intro_lines = post_text.split("【") # 以【分割字段 + for line in intro_lines: + if line.strip(): + line = "【" + line if not line.startswith("【") else line + if "】:" in line: + key, value = line.split("】:", 1) + intro_content.append(Paragraph(f"{key}】:
{value}
", normal_style)) + intro_content.append(Spacer(1, 6)) # 字段间距 + + # 添加格式化的介绍部分 + content.append(Paragraph("介绍:
", normal_style)) + content.extend(intro_content) + content.append(Spacer(1, 12)) + + # 添加磁力链接 if magnet_links: for magnet_link in magnet_links: - # 将磁力链接作为加粗的内容显示 - content.append(Paragraph(f"Magnet Link:

{magnet_link}

", normal_style)) - content.append(Spacer(1, 12)) # 添加空白区域 + content.append(Paragraph(f"磁力链接:

{magnet_link}

", normal_style)) + content.append(Spacer(1, 12)) # 添加图片 image_links = [] @@ -156,20 +169,19 @@ def fetch_and_create_pdf(url): if image: img = PILImage.open(image) img_width, img_height = img.size - image_width = 400 # 图片宽度 + image_width = 400 # 固定宽度 image_height = int((img_height / img_width) * image_width) - # 将图片加载到内存流中,并添加到PDF img_stream = BytesIO(image.getvalue()) content.append(Image(img_stream, width=image_width, height=image_height)) - content.append(Spacer(1, 12)) # 添加空白区域 + content.append(Spacer(1, 12)) - content.append(Spacer(1, 12)) # 为每个帖子添加间距 + content.append(Spacer(1, 12)) # 帖子间距 # 生成PDF doc.build(content) - # 获取PDF文件的绝对路径 + # 获取PDF绝对路径 absolute_pdf_path = os.path.abspath(pdf_filename) print(f"PDF saved as {absolute_pdf_path}") # 加密PDF @@ -182,11 +194,11 @@ def fetch_and_create_pdf(url): def add_pdf_encryption(pdf_file, password="4000"): - """ 使用PyPDF2为PDF添加加密保护 """ + """使用PyPDF2为PDF添加加密保护""" pdf_writer = PdfWriter() pdf_reader = PdfReader(pdf_file) - # 将所有页面添加到PDF写入器中 + # 将所有页面添加到PDF写入器 for page_num in range(len(pdf_reader.pages)): pdf_writer.add_page(pdf_reader.pages[page_num]) @@ -208,4 +220,4 @@ def pdf_file_path(): if __name__ == "__main__": - pdf_file_path() + pdf_file_path() \ No newline at end of file