Files
abot/sehuatang/shehuatang.py
2025-03-03 15:45:43 +08:00

191 lines
6.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import os
import re
import time
from datetime import datetime
from io import BytesIO
from urllib.parse import urljoin
from xml.sax.saxutils import escape

import requests
from bs4 import BeautifulSoup
from PIL import Image as PILImage
from PyPDF2 import PdfReader, PdfWriter
from reportlab.lib import colors
from reportlab.lib.pagesizes import letter, A3
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image, PageBreak
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
# download_image 函数保持不变
def download_image(url):
    """Download a JPG/JPEG/PNG image and return it as an in-memory stream.

    The URL must end in ``.jpg``, ``.jpeg`` or ``.png`` (case-insensitive);
    any other URL is rejected before a request is made. No minimum-size
    filtering is applied to the downloaded data.

    Args:
        url: URL of the image to fetch.

    Returns:
        A ``BytesIO`` holding the response body, or ``None`` when the
        extension is unsupported or the HTTP request fails.
    """
    if not url.lower().endswith(('.jpg', '.jpeg', '.png')):
        return None
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Referer': 'https://tu.a7nz4.us',
    }
    try:
        # requests waits indefinitely unless a timeout is given; bound the
        # (connect, read) phases so one stalled host cannot hang the run.
        # Timeouts surface as RequestException subclasses and are handled below.
        response = requests.get(url, headers=headers, timeout=(10, 30))
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"下载图片失败: {e}")
        return None
    return BytesIO(response.content)
# Root used to resolve the relative thread links found on the listing page.
_SITE_ROOT = 'https://www.sehuatang.net/'
# Compiled once: a magnet URI runs until the first space or CJK ideograph.
_MAGNET_RE = re.compile(r'magnet:\?[^ \u4e00-\u9fff]+')
# Width (in points) that images are scaled to before the height clamp.
_IMAGE_WIDTH = 700
# Default padding ReportLab's Frame applies on each side, in points.
_FRAME_PADDING = 6


def _new_headless_driver():
    """Start a headless Chrome WebDriver."""
    options = Options()
    # Pass the flag explicitly: the `Options.headless` property was deprecated
    # and later removed in Selenium 4, where assigning it has no effect.
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')
    return webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)


def _dismiss_age_gate(driver):
    """Click the age-confirmation link if the page shows one (best effort)."""
    try:
        enter_button = driver.find_element(By.XPATH, '//a[contains(text(), "满18岁请点此进入")]')
        enter_button.click()
        print("点击了满18岁按钮")
        time.sleep(5)
    except Exception as e:
        # Deliberately broad: the gate is optional and must never abort the run.
        print("未找到满18岁按钮跳过此步骤", e)


def _build_pdf_styles():
    """Register the SimHei font and return ``(title_style, normal_style)``."""
    pdfmetrics.registerFont(TTFont('SimHei', 'fonts/simhei.ttf'))
    styles = getSampleStyleSheet()
    title_style = styles['Heading1']
    title_style.fontName = 'SimHei'
    title_style.fontSize = 14
    title_style.textColor = colors.red
    title_style.bold = True
    normal_style = styles['Normal']
    normal_style.fontName = 'SimHei'
    normal_style.fontSize = 14
    return title_style, normal_style


def _image_flowable(image, max_height):
    """Build a scaled ReportLab Image from downloaded bytes, or None if unreadable."""
    try:
        img_width, img_height = PILImage.open(image).size
    except OSError as e:
        # A truncated or non-image response must not abort the whole PDF.
        print(f"解析图片失败: {e}")
        return None
    if img_width <= 0 or img_height <= 0:
        return None
    width = _IMAGE_WIDTH
    height = max(1, int((img_height / img_width) * width))
    if height > max_height:
        # A flowable taller than the frame cannot be placed on any page and
        # makes doc.build() fail, so shrink proportionally to fit.
        width = max(1, int(width * max_height / height))
        height = int(max_height)
    # Hand ReportLab a fresh stream; PIL has already consumed the original.
    return Image(BytesIO(image.getvalue()), width=width, height=height)


def _render_post(driver, post, title_style, normal_style, max_image_height):
    """Open one thread and return its flowables (title, magnet links, images).

    Returns an empty list when the listing row has no title link, the link has
    no href, or the thread page has no ``div.t_fsz`` content block.
    """
    title = post.find('a', {'class': 's xst'})
    if title is None:
        return []
    post_title = title.get_text()
    post_url = title.get('href')
    if not post_url:
        return []
    print(post_title)

    # urljoin copes with relative, root-relative and absolute hrefs alike.
    driver.get(urljoin(_SITE_ROOT, post_url))
    time.sleep(3)
    # page_source is already str, so no from_encoding hint is needed.
    post_soup = BeautifulSoup(driver.page_source, 'html.parser')
    content_div = post_soup.find('div', {'class': 't_fsz'})
    if content_div is None:
        return []

    post_text = content_div.get_text(strip=True)
    # Paragraph text is parsed as XML-like markup: escape scraped text so that
    # '&' (present in magnet URIs) and '<' are rendered literally.
    flowables = [Paragraph(f" {escape(post_title)}", title_style), Spacer(1, 5)]
    for magnet_link in _MAGNET_RE.findall(post_text):
        flowables.append(Paragraph(f"<br /><b>{escape(magnet_link)}</b><br />", normal_style))
        flowables.append(Spacer(1, 12))

    for img_tag in content_div.find_all('img'):
        zoom_url = img_tag.get('zoomfile')
        if not zoom_url or 'http' not in zoom_url:
            continue
        image = download_image(zoom_url)
        if image is None:
            continue
        flowable = _image_flowable(image, max_image_height)
        if flowable is not None:
            flowables.append(flowable)
            flowables.append(Spacer(1, 4))
    return flowables


def fetch_and_create_pdf(url):
    """Scrape the forum listing at *url* and build an encrypted PDF from it.

    The listing is loaded in headless Chrome. Every ``tbody`` whose id starts
    with ``normalthread`` and that contains a ``span.xi1`` element is kept
    (NOTE(review): presumably the forum's marker for recent threads; no date
    comparison is performed), and the kept threads are processed in reverse
    listing order. Each thread contributes its title, magnet links and images,
    separated from the previous thread by a page break.

    The PDF is written to ``JAV-<YYYY-MM-DD>-<thread count>.pdf`` in the
    current working directory and then encrypted with ``add_pdf_encryption``.

    Args:
        url: URL of the forum listing page.

    Returns:
        Absolute path of the generated PDF.
    """
    driver = _new_headless_driver()
    try:
        driver.get(url)
        time.sleep(5)
        _dismiss_age_gate(driver)

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        posts = soup.find_all('tbody', {'id': lambda x: x and x.startswith('normalthread')})
        today = datetime.now().strftime('%Y-%m-%d')
        title_style, normal_style = _build_pdf_styles()

        today_posts = [post for post in posts if post.find('span', {'class': 'xi1'}) is not None]
        today_posts.reverse()

        pdf_filename = f"JAV-{today}-{len(today_posts)}.pdf"
        doc = SimpleDocTemplate(pdf_filename, pagesize=A3)
        # Usable frame height: page height minus margins minus frame padding.
        max_image_height = doc.height - 2 * _FRAME_PADDING

        content = []
        for post in today_posts:
            flowables = _render_post(driver, post, title_style, normal_style, max_image_height)
            if not flowables:
                continue
            # Break *between* rendered threads only, so skipped threads and
            # the final thread never leave a blank page behind.
            if content:
                content.append(PageBreak())
            content.extend(flowables)
    finally:
        # Always release the browser, even when scraping fails part-way.
        driver.quit()

    doc.build(content)
    absolute_pdf_path = os.path.abspath(pdf_filename)
    print(f"PDF saved as {absolute_pdf_path}")
    add_pdf_encryption(absolute_pdf_path)
    return absolute_pdf_path
# add_pdf_encryption 和 pdf_file_path 函数保持不变
def add_pdf_encryption(pdf_file, password="4000"):
    """Encrypt the PDF at *pdf_file* in place using PyPDF2.

    Every page is copied into a new writer, the writer is encrypted with
    *password*, and the result replaces the original file.

    Args:
        pdf_file: Path of the PDF to encrypt; the file is overwritten.
        password: Password applied to the encrypted PDF.
    """
    pdf_reader = PdfReader(pdf_file)
    pdf_writer = PdfWriter()
    for page in pdf_reader.pages:
        pdf_writer.add_page(page)
    pdf_writer.encrypt(password)

    # Write next to the target and swap it in afterwards, so a failure while
    # writing cannot truncate or corrupt the only copy of the PDF.
    tmp_path = f"{pdf_file}.tmp"
    try:
        with open(tmp_path, "wb") as output_pdf:
            pdf_writer.write(output_pdf)
        os.replace(tmp_path, pdf_file)
    finally:
        # After a successful replace the temp file no longer exists.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
    print(f"PDF加密成功密码为: {password}")
def pdf_file_path():
    """Build the PDF for the hard-coded forum listing and return its path.

    Returns:
        Whatever ``fetch_and_create_pdf`` returns for the listing URL.
    """
    listing_url = 'https://www.sehuatang.net/forum.php?mod=forumdisplay&fid=103&filter=typeid&typeid=481'
    generated_pdf = fetch_and_create_pdf(listing_url)
    print(f"返回的PDF文件路径{generated_pdf}")
    return generated_pdf
if __name__ == "__main__":
    # Script entry point: generate the PDF when the file is executed directly.
    pdf_file_path()