加入分页,逆序排列帖子

This commit is contained in:
liuwei
2025-03-03 15:31:06 +08:00
parent be27e440c7
commit 334850ae32

View File

@@ -10,7 +10,7 @@ from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from reportlab.lib.pagesizes import letter
from reportlab.lib import colors
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image, PageBreak
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.pdfbase import pdfmetrics
@@ -20,6 +20,7 @@ import re
from PyPDF2 import PdfReader, PdfWriter
# download_image 函数保持不变
def download_image(url):
"""下载大于100KB的图片并返回临时文件路径仅支持jpg、jpeg和png格式"""
try:
@@ -28,11 +29,11 @@ def download_image(url):
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
'Referer': 'https://tu.a7nz4.us', # 防止403
'Referer': 'https://tu.a7nz4.us',
}
response = requests.get(url, headers=headers)
response.raise_for_status() # 确保请求成功
response.raise_for_status()
image = BytesIO(response.content)
return image
except requests.exceptions.RequestException as e:
@@ -42,43 +43,39 @@ def download_image(url):
def fetch_and_create_pdf(url):
"""根据给定URL抓取页面并生成PDF"""
# 配置Selenium以无头模式运行
# 配置Selenium
options = Options()
options.headless = True
options.add_argument('--disable-gpu')
options.add_argument('--no-sandbox')
# 使用webdriver-manager自动下载ChromeDriver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
# 获取目标页面
driver.get(url)
time.sleep(5)
# 处理“满18岁请点此进入”按钮
# 处理年龄验证按钮
try:
enter_button = driver.find_element(By.XPATH, '//a[contains(text(), "满18岁请点此进入")]')
enter_button.click()
print("点击了满18岁按钮")
time.sleep(5) # 等待内容加载
time.sleep(5)
except Exception as e:
print("未找到满18岁按钮跳过此步骤", e)
# 使用BeautifulSoup解析页面
# 解析页面
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser', from_encoding='utf-8')
# 定位到帖子列表
posts = soup.find_all('tbody', {'id': lambda x: x and x.startswith('normalthread')})
# 获取今天的日期
today = datetime.now().strftime('%Y-%m-%d')
# 注册中文字体
pdfmetrics.registerFont(TTFont('SimHei', 'fonts/simhei.ttf')) # 请确保字体文件路径正确
pdfmetrics.registerFont(TTFont('SimHei', 'fonts/simhei.ttf'))
styles = getSampleStyleSheet()
# 设置标题和正文样式
# 设置样式
title_style = styles['Heading1']
title_style.fontName = 'SimHei'
title_style.fontSize = 14
@@ -90,65 +87,43 @@ def fetch_and_create_pdf(url):
content = []
# 过滤当天帖子
# 过滤当天帖子并倒序
today_posts = []
for post in posts:
post_time_span = post.find('span', {'class': 'xi1'})
if post_time_span: # 判断是否为当天帖子
if post_time_span:
today_posts.append(post)
today_posts = today_posts[::-1] # 倒序处理
# 设置PDF
pdf_filename = f"JAV-{today}-{len(today_posts)}.pdf"
doc = SimpleDocTemplate(pdf_filename, pagesize=letter)
# 遍历当天的帖子并提取信息
# 遍历帖子
for post in today_posts:
# 查找帖子标题
title = post.find('a', {'class': 's xst'})
if title:
post_title = title.get_text()
post_url = title.get('href')
print(post_title)
# 获取帖子页面
# 获取帖子内容
post_page_url = 'https://www.sehuatang.net/' + post_url
driver.get(post_page_url)
time.sleep(3)
# 获取帖子页面内容
post_html = driver.page_source
post_soup = BeautifulSoup(post_html, 'html.parser', from_encoding='utf-8')
# 提取 <div class="t_fsz"> 下的文本和图片
content_div = post_soup.find('div', {'class': 't_fsz'})
if content_div:
# 提取文本
# 提取文本和磁力链接
post_text = content_div.get_text(strip=True)
# 查找磁力链接
magnet_links = re.findall(r'magnet:\?[^ \u4e00-\u9fff]+', post_text)
# 添加标题和来源URL
# 添加标题
content.append(Paragraph(f" {post_title}", title_style))
content.append(Spacer(1, 12))
# content.append(Paragraph(f"来源URL:<br /> {post_page_url}<br />", normal_style))
# content.append(Spacer(1, 12))
# 解析并格式化 "介绍" 内容
# intro_content = []
# intro_lines = post_text.split("【") # 以【分割字段
# for line in intro_lines:
# if line.strip():
# line = "【" + line if not line.startswith("【") else line
# if "】:" in line:
# key, value = line.split("】:", 1)
# intro_content.append(Paragraph(f"{key}】:<br />{value}<br />", normal_style))
# intro_content.append(Spacer(1, 6)) # 字段间距
#
# # 添加格式化的介绍部分
# content.append(Paragraph("介绍:<br />", normal_style))
# content.extend(intro_content)
# content.append(Spacer(1, 12))
# 添加磁力链接
if magnet_links:
@@ -162,53 +137,45 @@ def fetch_and_create_pdf(url):
for img in images:
if img.get('zoomfile') and 'http' in img.get('zoomfile'):
image_links.append(img.get('zoomfile'))
print(image_links)
if image_links:
for img_link in image_links:
image = download_image(img_link)
if image:
img = PILImage.open(image)
img_width, img_height = img.size
image_width = 500 # 固定宽度
image_width = 500
image_height = int((img_height / img_width) * image_width)
img_stream = BytesIO(image.getvalue())
content.append(Image(img_stream, width=image_width, height=image_height))
content.append(Spacer(1, 12))
content.append(Spacer(1, 12)) # 帖子间距
# 在每个帖子后添加分页符(除了最后一页)
if post != today_posts[-1]:
content.append(PageBreak())
# 生成PDF
doc.build(content)
# 获取PDF绝对路径
absolute_pdf_path = os.path.abspath(pdf_filename)
print(f"PDF saved as {absolute_pdf_path}")
# 加密PDF
add_pdf_encryption(absolute_pdf_path)
# 关闭浏览器
driver.quit()
return absolute_pdf_path
# add_pdf_encryption 和 pdf_file_path 函数保持不变
def add_pdf_encryption(pdf_file, password="4000"):
    """Password-protect a PDF in place using PyPDF2.

    Every page of ``pdf_file`` is copied into a fresh writer, the writer is
    encrypted with ``password``, and the result is written back to the same
    path, replacing the unencrypted file.

    Args:
        pdf_file: Path of the PDF to read and then overwrite.
        password: Password applied to the document; defaults to "4000".
    """
    source = PdfReader(pdf_file)
    encrypted = PdfWriter()

    # Copy the pages across unchanged, in their original order.
    for page in source.pages:
        encrypted.add_page(page)

    encrypted.encrypt(password)

    # The output path is the input path, so the original file is replaced.
    with open(pdf_file, "wb") as target:
        encrypted.write(target)

    print(f"PDF加密成功密码为: {password}")