调整色花堂PDF格式

This commit is contained in:
liuwei
2025-03-03 15:07:35 +08:00
parent e0ae83583d
commit 374d3bec49

View File

@@ -16,12 +16,12 @@ from reportlab.pdfbase.ttfonts import TTFont
from reportlab.pdfbase import pdfmetrics from reportlab.pdfbase import pdfmetrics
from datetime import datetime from datetime import datetime
from PIL import Image as PILImage from PIL import Image as PILImage
import re # 用于正则表达式提取磁力链接 import re
from PyPDF2 import PdfReader, PdfWriter from PyPDF2 import PdfReader, PdfWriter
def download_image(url): def download_image(url):
""" 下载大于100KB的图片并返回临时文件路径仅支持jpg、jpeg和png格式 """ """下载大于100KB的图片并返回临时文件路径仅支持jpg、jpeg和png格式"""
try: try:
if not url.lower().endswith(('.jpg', '.jpeg', '.png')): if not url.lower().endswith(('.jpg', '.jpeg', '.png')):
return None return None
@@ -42,7 +42,7 @@ def download_image(url):
def fetch_and_create_pdf(url): def fetch_and_create_pdf(url):
"""根据给定URL抓取页面并生成PDF""" """根据给定URL抓取页面并生成PDF"""
# 配置Selenium以无头模式(即不显示浏览器窗口)运行 # 配置Selenium以无头模式运行
options = Options() options = Options()
options.headless = True options.headless = True
options.add_argument('--disable-gpu') options.add_argument('--disable-gpu')
@@ -60,7 +60,7 @@ def fetch_and_create_pdf(url):
enter_button = driver.find_element(By.XPATH, '//a[contains(text(), "满18岁请点此进入")]') enter_button = driver.find_element(By.XPATH, '//a[contains(text(), "满18岁请点此进入")]')
enter_button.click() enter_button.click()
print("点击了满18岁按钮") print("点击了满18岁按钮")
time.sleep(5) # 等待 5 秒,确保点击后内容加载完成 time.sleep(5) # 等待内容加载
except Exception as e: except Exception as e:
print("未找到满18岁按钮跳过此步骤", e) print("未找到满18岁按钮跳过此步骤", e)
@@ -75,18 +75,18 @@ def fetch_and_create_pdf(url):
today = datetime.now().strftime('%Y-%m-%d') today = datetime.now().strftime('%Y-%m-%d')
# 注册中文字体 # 注册中文字体
pdfmetrics.registerFont(TTFont('SamHei', 'fonts/simhei.ttf')) # 设置中文字体路径 pdfmetrics.registerFont(TTFont('SimHei', 'fonts/simhei.ttf')) # 请确保字体文件路径正确
styles = getSampleStyleSheet() styles = getSampleStyleSheet()
# 设置标题和正文样式都使用SamHei字体 # 设置标题和正文样式
title_style = styles['Heading1'] title_style = styles['Heading1']
title_style.fontName = 'SamHei' # 设置字体为SamHei title_style.fontName = 'SimHei'
title_style.fontSize = 14 # 设置字体大小 title_style.fontSize = 14
title_style.textColor = colors.red # 设置字体颜色为红色 title_style.textColor = colors.red
title_style.bold = True # 设置加粗 title_style.bold = True
normal_style = styles['Normal'] normal_style = styles['Normal']
normal_style.fontName = 'SamHei' # 设置正文使用SamHei字体 normal_style.fontName = 'SimHei'
content = [] content = []
@@ -94,7 +94,7 @@ def fetch_and_create_pdf(url):
today_posts = [] today_posts = []
for post in posts: for post in posts:
post_time_span = post.find('span', {'class': 'xi1'}) post_time_span = post.find('span', {'class': 'xi1'})
if post_time_span: # 判断是否存在post_time_span即认为是当天发布的帖子 if post_time_span: # 判断是否为当天帖子
today_posts.append(post) today_posts.append(post)
# 设置PDF # 设置PDF
@@ -109,7 +109,7 @@ def fetch_and_create_pdf(url):
post_title = title.get_text() post_title = title.get_text()
post_url = title.get('href') post_url = title.get('href')
print(post_title) print(post_title)
# 获取帖子页面 # 获取帖子页面
post_page_url = 'https://www.sehuatang.net/' + post_url post_page_url = 'https://www.sehuatang.net/' + post_url
driver.get(post_page_url) driver.get(post_page_url)
time.sleep(3) time.sleep(3)
@@ -122,26 +122,39 @@ def fetch_and_create_pdf(url):
content_div = post_soup.find('div', {'class': 't_fsz'}) content_div = post_soup.find('div', {'class': 't_fsz'})
if content_div: if content_div:
# 提取文本并将 <br> 标签替换为换行符 # 提取文本
post_text = content_div.get_text(strip=True) # 使用 separator='\n' 参数替换 <br> 标签 post_text = content_div.get_text(strip=True)
# 查找磁力链接 # 查找磁力链接
magnet_links =re.findall(r'magnet:\?[^ \u4e00-\u9fff]+', post_text) # 使用正则表达式查找磁力链接 magnet_links = re.findall(r'magnet:\?[^ \u4e00-\u9fff]+', post_text)
# 添加标题到PDF # 添加标题和来源URL
content.append(Paragraph(f"标题:<br /> {post_title}", title_style)) content.append(Paragraph(f"标题:<br /> {post_title}", title_style))
content.append(Spacer(1, 12)) content.append(Spacer(1, 12))
content.append(Paragraph(f"来源URL:<br /> {post_page_url}<br />", normal_style)) content.append(Paragraph(f"来源URL:<br /> {post_page_url}<br />", normal_style))
content.append(Spacer(1, 12)) content.append(Spacer(1, 12))
content.append(Paragraph(f"介绍:<br /> {post_text}<br />", normal_style))
content.append(Spacer(1, 12)) # 添加空白区域
# 如果有磁力链接,将其单独加粗并显示 # 解析并格式化 "介绍" 内容
intro_content = []
intro_lines = post_text.split("【") # 以【分割字段
for line in intro_lines:
if line.strip():
line = "【" + line if not line.startswith("【") else line
if "】:" in line:
key, value = line.split("】:", 1)
intro_content.append(Paragraph(f"{key}】:<br />{value}<br />", normal_style))
intro_content.append(Spacer(1, 6)) # 字段间距
# 添加格式化的介绍部分
content.append(Paragraph("介绍:<br />", normal_style))
content.extend(intro_content)
content.append(Spacer(1, 12))
# 添加磁力链接
if magnet_links: if magnet_links:
for magnet_link in magnet_links: for magnet_link in magnet_links:
# 将磁力链接作为加粗的内容显示 content.append(Paragraph(f"磁力链接:<br /><br /><b>{magnet_link}</b><br /><br />", normal_style))
content.append(Paragraph(f"Magnet Link:<br /><br /> <b>{magnet_link}</b><br /><br />", normal_style)) content.append(Spacer(1, 12))
content.append(Spacer(1, 12)) # 添加空白区域
# 添加图片 # 添加图片
image_links = [] image_links = []
@@ -156,20 +169,19 @@ def fetch_and_create_pdf(url):
if image: if image:
img = PILImage.open(image) img = PILImage.open(image)
img_width, img_height = img.size img_width, img_height = img.size
image_width = 400 # 图片宽度 image_width = 400 # 固定宽度
image_height = int((img_height / img_width) * image_width) image_height = int((img_height / img_width) * image_width)
# 将图片加载到内存流中并添加到PDF
img_stream = BytesIO(image.getvalue()) img_stream = BytesIO(image.getvalue())
content.append(Image(img_stream, width=image_width, height=image_height)) content.append(Image(img_stream, width=image_width, height=image_height))
content.append(Spacer(1, 12)) # 添加空白区域 content.append(Spacer(1, 12))
content.append(Spacer(1, 12)) # 为每个帖子添加间距 content.append(Spacer(1, 12)) # 帖子间距
# 生成PDF # 生成PDF
doc.build(content) doc.build(content)
# 获取PDF文件的绝对路径 # 获取PDF绝对路径
absolute_pdf_path = os.path.abspath(pdf_filename) absolute_pdf_path = os.path.abspath(pdf_filename)
print(f"PDF saved as {absolute_pdf_path}") print(f"PDF saved as {absolute_pdf_path}")
# 加密PDF # 加密PDF
@@ -182,11 +194,11 @@ def fetch_and_create_pdf(url):
def add_pdf_encryption(pdf_file, password="4000"): def add_pdf_encryption(pdf_file, password="4000"):
""" 使用PyPDF2为PDF添加加密保护 """ """使用PyPDF2为PDF添加加密保护"""
pdf_writer = PdfWriter() pdf_writer = PdfWriter()
pdf_reader = PdfReader(pdf_file) pdf_reader = PdfReader(pdf_file)
# 将所有页面添加到PDF写入器 # 将所有页面添加到PDF写入器
for page_num in range(len(pdf_reader.pages)): for page_num in range(len(pdf_reader.pages)):
pdf_writer.add_page(pdf_reader.pages[page_num]) pdf_writer.add_page(pdf_reader.pages[page_num])
@@ -208,4 +220,4 @@ def pdf_file_path():
if __name__ == "__main__": if __name__ == "__main__":
pdf_file_path() pdf_file_path()