From 334850ae324f616e75e0039d89178ea6f0b74855 Mon Sep 17 00:00:00 2001 From: liuwei Date: Mon, 3 Mar 2025 15:31:06 +0800 Subject: [PATCH] =?UTF-8?q?=E5=8A=A0=E5=85=A5=E5=88=86=E9=A1=B5=EF=BC=8C?= =?UTF-8?q?=E9=80=86=E5=BA=8F=E6=8E=92=E5=88=97=E5=B8=96=E5=AD=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- sehuatang/shehuatang.py | 83 +++++++++++++---------------------------- 1 file changed, 25 insertions(+), 58 deletions(-) diff --git a/sehuatang/shehuatang.py b/sehuatang/shehuatang.py index 7165bd0..19aea58 100644 --- a/sehuatang/shehuatang.py +++ b/sehuatang/shehuatang.py @@ -10,7 +10,7 @@ from webdriver_manager.chrome import ChromeDriverManager from bs4 import BeautifulSoup from reportlab.lib.pagesizes import letter from reportlab.lib import colors -from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image +from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image, PageBreak from reportlab.lib.styles import getSampleStyleSheet from reportlab.pdfbase.ttfonts import TTFont from reportlab.pdfbase import pdfmetrics @@ -20,6 +20,7 @@ import re from PyPDF2 import PdfReader, PdfWriter +# download_image 函数保持不变 def download_image(url): """下载大于100KB的图片并返回临时文件路径,仅支持jpg、jpeg和png格式""" try: @@ -28,11 +29,11 @@ def download_image(url): headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', - 'Referer': 'https://tu.a7nz4.us', # 防止403 + 'Referer': 'https://tu.a7nz4.us', } response = requests.get(url, headers=headers) - response.raise_for_status() # 确保请求成功 + response.raise_for_status() image = BytesIO(response.content) return image except requests.exceptions.RequestException as e: @@ -42,43 +43,39 @@ def download_image(url): def fetch_and_create_pdf(url): """根据给定URL抓取页面并生成PDF""" - # 配置Selenium以无头模式运行 + # 配置Selenium options = Options() options.headless = True options.add_argument('--disable-gpu') options.add_argument('--no-sandbox') - - # 使用webdriver-manager自动下载ChromeDriver driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options) # 获取目标页面 driver.get(url) time.sleep(5) - # 处理“满18岁,请点此进入”按钮 + # 处理年龄验证按钮 try: enter_button = driver.find_element(By.XPATH, '//a[contains(text(), "满18岁,请点此进入")]') enter_button.click() print("点击了满18岁按钮") - time.sleep(5) # 等待内容加载 + time.sleep(5) except Exception as e: print("未找到满18岁按钮,跳过此步骤", e) - # 使用BeautifulSoup解析页面 + # 解析页面 html = driver.page_source soup = BeautifulSoup(html, 'html.parser', from_encoding='utf-8') - - # 定位到帖子列表 posts = soup.find_all('tbody', {'id': lambda x: x and x.startswith('normalthread')}) # 获取今天的日期 today = datetime.now().strftime('%Y-%m-%d') # 注册中文字体 - pdfmetrics.registerFont(TTFont('SimHei', 'fonts/simhei.ttf')) # 请确保字体文件路径正确 + pdfmetrics.registerFont(TTFont('SimHei', 'fonts/simhei.ttf')) styles = getSampleStyleSheet() - # 设置标题和正文样式 + # 设置样式 title_style = styles['Heading1'] title_style.fontName = 'SimHei' title_style.fontSize = 14 @@ -90,65 +87,43 @@ def fetch_and_create_pdf(url): content = [] - # 过滤出当天的帖子 + # 过滤当天帖子并倒序 today_posts = [] for post in posts: post_time_span = post.find('span', {'class': 'xi1'}) - if post_time_span: # 判断是否为当天帖子 + if post_time_span: today_posts.append(post) + today_posts = today_posts[::-1] # 倒序处理 # 设置PDF pdf_filename = f"JAV-{today}-{len(today_posts)}.pdf" doc = SimpleDocTemplate(pdf_filename, pagesize=letter) - # 遍历当天的帖子并提取信息 + # 遍历帖子 for post in today_posts: - # 查找帖子标题 title = post.find('a', {'class': 's xst'}) if title: post_title = title.get_text() post_url = title.get('href') print(post_title) - # 获取帖子页面 + + # 获取帖子内容 post_page_url = 'https://www.sehuatang.net/' + post_url driver.get(post_page_url) time.sleep(3) - # 获取帖子页面内容 post_html = driver.page_source post_soup = BeautifulSoup(post_html, 'html.parser', from_encoding='utf-8') - - # 提取
下的文本和图片 content_div = post_soup.find('div', {'class': 't_fsz'}) if content_div: - # 提取文本 + # 提取文本和磁力链接 post_text = content_div.get_text(strip=True) - - # 查找磁力链接 magnet_links = re.findall(r'magnet:\?[^ \u4e00-\u9fff]+', post_text) - # 添加标题和来源URL + # 添加标题 content.append(Paragraph(f" {post_title}", title_style)) content.append(Spacer(1, 12)) - # content.append(Paragraph(f"来源URL:
{post_page_url}
", normal_style)) - # content.append(Spacer(1, 12)) - - # 解析并格式化 "介绍" 内容 - # intro_content = [] - # intro_lines = post_text.split("【") # 以【分割字段 - # for line in intro_lines: - # if line.strip(): - # line = "【" + line if not line.startswith("【") else line - # if "】:" in line: - # key, value = line.split("】:", 1) - # intro_content.append(Paragraph(f"{key}】:
{value}
", normal_style)) - # intro_content.append(Spacer(1, 6)) # 字段间距 - # - # # 添加格式化的介绍部分 - # content.append(Paragraph("介绍:
", normal_style)) - # content.extend(intro_content) - # content.append(Spacer(1, 12)) # 添加磁力链接 if magnet_links: @@ -162,53 +137,45 @@ def fetch_and_create_pdf(url): for img in images: if img.get('zoomfile') and 'http' in img.get('zoomfile'): image_links.append(img.get('zoomfile')) - print(image_links) + if image_links: for img_link in image_links: image = download_image(img_link) if image: img = PILImage.open(image) img_width, img_height = img.size - image_width = 500 # 固定宽度 + image_width = 500 image_height = int((img_height / img_width) * image_width) - img_stream = BytesIO(image.getvalue()) content.append(Image(img_stream, width=image_width, height=image_height)) content.append(Spacer(1, 12)) - content.append(Spacer(1, 12)) # 帖子间距 + # 在每个帖子后添加分页符(除了最后一页) + if post != today_posts[-1]: + content.append(PageBreak()) # 生成PDF doc.build(content) - - # 获取PDF绝对路径 absolute_pdf_path = os.path.abspath(pdf_filename) print(f"PDF saved as {absolute_pdf_path}") + # 加密PDF add_pdf_encryption(absolute_pdf_path) - - # 关闭浏览器 driver.quit() return absolute_pdf_path +# add_pdf_encryption 和 pdf_file_path 函数保持不变 def add_pdf_encryption(pdf_file, password="4000"): """使用PyPDF2为PDF添加加密保护""" pdf_writer = PdfWriter() pdf_reader = PdfReader(pdf_file) - - # 将所有页面添加到PDF写入器 for page_num in range(len(pdf_reader.pages)): pdf_writer.add_page(pdf_reader.pages[page_num]) - - # 添加密码 pdf_writer.encrypt(password) - - # 保存加密后的PDF with open(pdf_file, "wb") as output_pdf: pdf_writer.write(output_pdf) - print(f"PDF加密成功,密码为: {password}")