# Files
# abot/sehuatang/shehuatang.py
# 2025-02-17 17:06:03 +08:00
#
# 212 lines
# 7.9 KiB
# Python
# Raw Blame History
#
# This file contains ambiguous Unicode characters
# This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import time
import os
import requests
from io import BytesIO
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from reportlab.lib.pagesizes import letter
from reportlab.lib import colors
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.pdfbase import pdfmetrics
from datetime import datetime
from PIL import Image as PILImage
import re # 用于正则表达式提取磁力链接
from PyPDF2 import PdfReader, PdfWriter
def download_image(url):
    """Download a JPG/JPEG/PNG image and return it as an in-memory stream.

    Only URLs whose text ends in ``.jpg``, ``.jpeg`` or ``.png``
    (case-insensitive) are fetched; no size filtering is performed.

    Args:
        url: Absolute URL of the image. ``None`` or an empty string is
            treated as "nothing to download".

    Returns:
        A ``BytesIO`` holding the response body, or ``None`` when the URL is
        missing, has an unsupported extension, or the request fails.
    """
    # Guard against missing URLs before calling str methods on them.
    if not url or not url.lower().endswith(('.jpg', '.jpeg', '.png')):
        return None

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Referer': 'https://tu.a7nz4.us',  # sent to avoid HTTP 403 responses
    }
    try:
        # Without a timeout a stalled server would block the whole run.
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()  # treat HTTP error statuses as failures
    except requests.exceptions.RequestException as e:
        print(f"下载图片失败: {e}")
        return None
    return BytesIO(response.content)
def fetch_and_create_pdf(url):
    """Scrape a forum listing page and render its posts into an encrypted PDF.

    The listing at ``url`` is loaded in headless Chrome. Every thread row
    that carries a ``span.xi1`` marker is treated as posted today; each such
    thread is opened, and its title, URL, body text, magnet links and
    ``zoomfile`` images are appended to the PDF. The finished file is then
    encrypted in place with ``add_pdf_encryption``.

    Args:
        url: URL of the forum listing page to scrape.

    Returns:
        Absolute path of the generated PDF file.
    """
    # Function-scope imports keep this block self-contained.
    from urllib.parse import urljoin
    from xml.sax.saxutils import escape

    # Run Chrome without a visible window. The ``options.headless`` attribute
    # is deprecated/removed in current Selenium, so pass the flag explicitly.
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')
    # webdriver-manager downloads a matching ChromeDriver automatically.
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    try:
        driver.get(url)
        time.sleep(5)

        # Dismiss the age-confirmation link if present (best effort).
        try:
            enter_button = driver.find_element(By.XPATH, '//a[contains(text(), "满18岁请点此进入")]')
            enter_button.click()
            print("点击了满18岁按钮")
            time.sleep(5)  # give the page time to load after the click
        except Exception as e:
            print("未找到满18岁按钮跳过此步骤", e)

        # page_source is already a str, so no from_encoding hint is passed.
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        posts = soup.find_all('tbody', {'id': lambda x: x and x.startswith('normalthread')})

        today = datetime.now().strftime('%Y-%m-%d')

        # Register the CJK-capable font and apply it to both styles.
        pdfmetrics.registerFont(TTFont('SamHei', 'fonts/simhei.ttf'))
        styles = getSampleStyleSheet()
        title_style = styles['Heading1']
        title_style.fontName = 'SamHei'
        title_style.fontSize = 14
        title_style.textColor = colors.red
        title_style.bold = True
        normal_style = styles['Normal']
        normal_style.fontName = 'SamHei'

        content = []

        # A thread row containing span.xi1 is taken to be posted today.
        today_posts = [post for post in posts if post.find('span', {'class': 'xi1'})]

        pdf_filename = f"JAV-{today}-{len(today_posts)}.pdf"
        doc = SimpleDocTemplate(pdf_filename, pagesize=letter)

        for post in today_posts:
            title = post.find('a', {'class': 's xst'})
            if not title:
                continue
            post_title = title.get_text()
            post_url = title.get('href')
            print(post_title)
            if not post_url:
                # No link to follow; previously this raised TypeError.
                continue

            # urljoin handles relative, root-relative and absolute hrefs.
            post_page_url = urljoin('https://www.sehuatang.net/', post_url)
            driver.get(post_page_url)
            time.sleep(3)

            post_soup = BeautifulSoup(driver.page_source, 'html.parser')
            content_div = post_soup.find('div', {'class': 't_fsz'})
            if not content_div:
                continue

            # Text fragments are stripped and concatenated without separator.
            post_text = content_div.get_text(strip=True)
            # A magnet link runs until the first space or CJK character.
            magnet_links = re.findall(r'magnet:\?[^ \u4e00-\u9fff]+', post_text)

            # Paragraph parses its text as XML-like markup, so scraped text
            # (magnet links contain '&') must be escaped before insertion.
            content.append(Paragraph(f"标题:<br /> {escape(post_title)}", title_style))
            content.append(Spacer(1, 12))
            content.append(Paragraph(f"来源URL:<br /> {escape(post_page_url)}<br />", normal_style))
            content.append(Spacer(1, 12))
            content.append(Paragraph(f"介绍:<br /> {escape(post_text)}<br />", normal_style))
            content.append(Spacer(1, 12))

            # Repeat each magnet link on its own, in bold.
            for magnet_link in magnet_links:
                content.append(Paragraph(f"Magnet Link:<br /><br /> <b>{escape(magnet_link)}</b><br /><br />", normal_style))
                content.append(Spacer(1, 12))

            # Full-size image URLs live in the 'zoomfile' attribute.
            image_links = [
                img.get('zoomfile')
                for img in content_div.find_all('img')
                if img.get('zoomfile') and 'http' in img.get('zoomfile')
            ]
            print(image_links)

            for img_link in image_links:
                image = download_image(img_link)
                if not image:
                    continue
                try:
                    picture = PILImage.open(image)
                    img_width, img_height = picture.size
                except OSError as e:
                    # An undecodable download should not abort the whole PDF.
                    print(f"无法解析图片, 已跳过: {img_link} ({e})")
                    continue
                # Scale to a fixed width, preserving the aspect ratio.
                image_width = 400
                image_height = int((img_height / img_width) * image_width)
                # Hand ReportLab a fresh stream positioned at the start.
                img_stream = BytesIO(image.getvalue())
                content.append(Image(img_stream, width=image_width, height=image_height))
                content.append(Spacer(1, 12))

            content.append(Spacer(1, 12))  # spacing between posts

        doc.build(content)

        absolute_pdf_path = os.path.abspath(pdf_filename)
        print(f"PDF saved as {absolute_pdf_path}")

        add_pdf_encryption(absolute_pdf_path)
        return absolute_pdf_path
    finally:
        # Always shut the browser down, even when scraping or rendering fails.
        driver.quit()
def add_pdf_encryption(pdf_file, password="4000"):
    """Password-protect a PDF in place using PyPDF2.

    Every page of ``pdf_file`` is copied into a new writer, the writer is
    encrypted with ``password``, and the result overwrites ``pdf_file``.

    Args:
        pdf_file: Path of the PDF to encrypt; the file is overwritten.
        password: Password applied to the output document.
    """
    source = PdfReader(pdf_file)
    encrypted = PdfWriter()

    # Copy the pages across in their original order.
    for page in source.pages:
        encrypted.add_page(page)

    encrypted.encrypt(password)

    # Overwrite the input file with the protected copy.
    with open(pdf_file, "wb") as target:
        encrypted.write(target)
    print(f"PDF加密成功密码为: {password}")
def pdf_file_path():
    """Build the PDF for the fixed forum listing and return its path.

    Delegates to ``fetch_and_create_pdf`` with a hard-coded listing URL,
    prints the resulting path, and returns it.
    """
    listing_url = 'https://www.sehuatang.net/forum.php?mod=forumdisplay&fid=103&filter=typeid&typeid=481'
    generated_path = fetch_and_create_pdf(listing_url)
    print(f"返回的PDF文件路径{generated_path}")
    return generated_path
# Script entry point: generate the PDF only when run directly, not on import.
if __name__ == "__main__":
    pdf_file_path()