diff --git a/sehuatang/shehuatang.py b/sehuatang/shehuatang.py
index a470606..49a63bf 100644
--- a/sehuatang/shehuatang.py
+++ b/sehuatang/shehuatang.py
@@ -16,12 +16,12 @@ from reportlab.pdfbase.ttfonts import TTFont
from reportlab.pdfbase import pdfmetrics
from datetime import datetime
from PIL import Image as PILImage
-import re # 用于正则表达式提取磁力链接
+import re
from PyPDF2 import PdfReader, PdfWriter
def download_image(url):
- """ 下载大于100KB的图片并返回临时文件路径,仅支持jpg、jpeg和png格式 """
+ """下载大于100KB的图片并返回临时文件路径,仅支持jpg、jpeg和png格式"""
try:
if not url.lower().endswith(('.jpg', '.jpeg', '.png')):
return None
@@ -42,7 +42,7 @@ def download_image(url):
def fetch_and_create_pdf(url):
"""根据给定URL抓取页面并生成PDF"""
- # 配置Selenium以无头模式(即不显示浏览器窗口)运行
+ # 配置Selenium以无头模式运行
options = Options()
options.headless = True
options.add_argument('--disable-gpu')
@@ -60,7 +60,7 @@ def fetch_and_create_pdf(url):
enter_button = driver.find_element(By.XPATH, '//a[contains(text(), "满18岁,请点此进入")]')
enter_button.click()
print("点击了满18岁按钮")
- time.sleep(5) # 等待 5 秒,确保点击后内容加载完成
+ time.sleep(5) # 等待内容加载
except Exception as e:
print("未找到满18岁按钮,跳过此步骤", e)
@@ -75,18 +75,18 @@ def fetch_and_create_pdf(url):
today = datetime.now().strftime('%Y-%m-%d')
# 注册中文字体
- pdfmetrics.registerFont(TTFont('SamHei', 'fonts/simhei.ttf')) # 设置中文字体路径
+ pdfmetrics.registerFont(TTFont('SimHei', 'fonts/simhei.ttf')) # 请确保字体文件路径正确
styles = getSampleStyleSheet()
- # 设置标题和正文样式都使用SamHei字体
+ # 设置标题和正文样式
title_style = styles['Heading1']
- title_style.fontName = 'SamHei' # 设置字体为SamHei
- title_style.fontSize = 14 # 设置字体大小
- title_style.textColor = colors.red # 设置字体颜色为红色
- title_style.bold = True # 设置加粗
+ title_style.fontName = 'SimHei'
+ title_style.fontSize = 14
+ title_style.textColor = colors.red
+ title_style.bold = True
normal_style = styles['Normal']
- normal_style.fontName = 'SamHei' # 设置正文使用SamHei字体
+ normal_style.fontName = 'SimHei'
content = []
@@ -94,7 +94,7 @@ def fetch_and_create_pdf(url):
today_posts = []
for post in posts:
post_time_span = post.find('span', {'class': 'xi1'})
- if post_time_span: # 判断是否存在post_time_span,即认为是当天发布的帖子
+ if post_time_span: # 判断是否为当天帖子
today_posts.append(post)
# 设置PDF
@@ -109,7 +109,7 @@ def fetch_and_create_pdf(url):
post_title = title.get_text()
post_url = title.get('href')
print(post_title)
- # 获取帖子的页面
+ # 获取帖子页面
post_page_url = 'https://www.sehuatang.net/' + post_url
driver.get(post_page_url)
time.sleep(3)
@@ -122,26 +122,39 @@ def fetch_and_create_pdf(url):
content_div = post_soup.find('div', {'class': 't_fsz'})
if content_div:
- # 提取文本并将<br>标签替换为换行符
- post_text = content_div.get_text(strip=True) # 使用 separator='\n' 参数替换<br>标签
+ # 提取文本
+ post_text = content_div.get_text(strip=True)
# 查找磁力链接
- magnet_links =re.findall(r'magnet:\?[^ \u4e00-\u9fff]+', post_text) # 使用正则表达式查找磁力链接
+ magnet_links = re.findall(r'magnet:\?[^ \u4e00-\u9fff]+', post_text)
- # 添加标题到PDF
+ # 添加标题和来源URL
content.append(Paragraph(f"标题:
{post_title}", title_style))
content.append(Spacer(1, 12))
content.append(Paragraph(f"来源URL:
{post_page_url}
", normal_style))
content.append(Spacer(1, 12))
- content.append(Paragraph(f"介绍:
{post_text}
", normal_style))
- content.append(Spacer(1, 12)) # 添加空白区域
- # 如果有磁力链接,将其单独加粗并显示
+ # 解析并格式化 "介绍" 内容
+ intro_content = []
+ intro_lines = post_text.split("【") # 以【分割字段
+ for line in intro_lines:
+ if line.strip():
+ line = "【" + line if not line.startswith("【") else line
+ if "】:" in line:
+ key, value = line.split("】:", 1)
+ intro_content.append(Paragraph(f"{key}】:
{value}
", normal_style))
+ intro_content.append(Spacer(1, 6)) # 字段间距
+
+ # 添加格式化的介绍部分
+ content.append(Paragraph("介绍:
", normal_style))
+ content.extend(intro_content)
+ content.append(Spacer(1, 12))
+
+ # 添加磁力链接
if magnet_links:
for magnet_link in magnet_links:
- # 将磁力链接作为加粗的内容显示
- content.append(Paragraph(f"Magnet Link:
{magnet_link}
", normal_style))
- content.append(Spacer(1, 12)) # 添加空白区域
+ content.append(Paragraph(f"磁力链接:
{magnet_link}
", normal_style))
+ content.append(Spacer(1, 12))
# 添加图片
image_links = []
@@ -156,20 +169,19 @@ def fetch_and_create_pdf(url):
if image:
img = PILImage.open(image)
img_width, img_height = img.size
- image_width = 400 # 图片宽度
+ image_width = 400 # 固定宽度
image_height = int((img_height / img_width) * image_width)
- # 将图片加载到内存流中,并添加到PDF
img_stream = BytesIO(image.getvalue())
content.append(Image(img_stream, width=image_width, height=image_height))
- content.append(Spacer(1, 12)) # 添加空白区域
+ content.append(Spacer(1, 12))
- content.append(Spacer(1, 12)) # 为每个帖子添加间距
+ content.append(Spacer(1, 12)) # 帖子间距
# 生成PDF
doc.build(content)
- # 获取PDF文件的绝对路径
+ # 获取PDF绝对路径
absolute_pdf_path = os.path.abspath(pdf_filename)
print(f"PDF saved as {absolute_pdf_path}")
# 加密PDF
@@ -182,11 +194,11 @@ def fetch_and_create_pdf(url):
def add_pdf_encryption(pdf_file, password="4000"):
- """ 使用PyPDF2为PDF添加加密保护 """
+ """使用PyPDF2为PDF添加加密保护"""
pdf_writer = PdfWriter()
pdf_reader = PdfReader(pdf_file)
- # 将所有页面添加到PDF写入器中
+ # 将所有页面添加到PDF写入器
for page_num in range(len(pdf_reader.pages)):
pdf_writer.add_page(pdf_reader.pages[page_num])
@@ -208,4 +220,4 @@ def pdf_file_path():
if __name__ == "__main__":
- pdf_file_path()
+ pdf_file_path()
\ No newline at end of file