import time
import os
from datetime import datetime
from io import BytesIO

import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from reportlab.lib.pagesizes import letter
from reportlab.lib import colors
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.pdfbase import pdfmetrics
from PIL import Image as PILImage
from PyPDF2 import PdfReader, PdfWriter  # used to encrypt the generated PDF


def download_image(url, timeout=30):
    """Download a JPG/JPEG/PNG image and return its bytes in memory.

    No size filtering is performed here and nothing is written to disk; the
    response body is simply wrapped in a ``BytesIO``.

    Args:
        url: Image URL. Only URLs whose text ends with ``.jpg``, ``.jpeg``
            or ``.png`` (case-insensitive) are fetched.
        timeout: Seconds to wait for the server before giving up, so that a
            single stalled image cannot hang the whole run.

    Returns:
        A ``BytesIO`` holding the response body, or ``None`` when the URL is
        empty/``None``, has an unsupported extension, or the request fails.
    """
    # Guard against a missing URL; calling .lower() on None would raise an
    # AttributeError that the RequestException handler below does not catch.
    if not url:
        return None
    if not url.lower().endswith(('.jpg', '.jpeg', '.png')):
        return None

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Referer': 'https://tu.a7nz4.us',  # sent to avoid HTTP 403 responses
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()  # turn HTTP error statuses into exceptions
    except requests.exceptions.RequestException as e:
        # Timeouts are RequestException subclasses, so they land here too.
        print(f"下载图片失败: {e}")
        return None
    return BytesIO(response.content)


def add_pdf_encryption(pdf_file, password="4000"):
    """Encrypt a PDF in place with a password using PyPDF2.

    Every page of ``pdf_file`` is copied into a new writer, the writer is
    encrypted with ``password``, and the result is written back to the same
    path. The password is echoed to stdout on success.

    Args:
        pdf_file: Path of the PDF to read and then overwrite.
        password: Password applied to the encrypted output.
    """
    pdf_reader = PdfReader(pdf_file)
    pdf_writer = PdfWriter()

    # Copy all pages into the writer before applying the password.
    for page in pdf_reader.pages:
        pdf_writer.add_page(page)

    pdf_writer.encrypt(password)

    # Overwrite the original file with the encrypted version.
    with open(pdf_file, "wb") as output_pdf:
        pdf_writer.write(output_pdf)

    print(f"PDF加密成功,密码为: {password}")


def fetch_and_create_pdf(url): """根据给定URL抓取页面并生成PDF""" # 配置Selenium以无头模式(即不显示浏览器窗口)运行 options = Options() options.headless = True options.add_argument('--disable-gpu') options.add_argument('--no-sandbox') # 使用webdriver-manager自动下载ChromeDriver driver = 
webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options) # 获取目标页面 driver.get(url) time.sleep(5) # 处理“满18岁,请点此进入”按钮 try: enter_button = driver.find_element(By.XPATH, '//a[contains(text(), "满18岁,请点此进入")]') enter_button.click() print("点击了满18岁按钮") time.sleep(5) # 等待 5 秒,确保点击后内容加载完成 except Exception as e: print("未找到满18岁按钮,跳过此步骤", e) # 使用BeautifulSoup解析页面 html = driver.page_source soup = BeautifulSoup(html, 'html.parser', from_encoding='utf-8') # 定位到帖子列表 posts = soup.find_all('tbody', {'id': lambda x: x and x.startswith('normalthread')}) # 获取今天的日期 today = datetime.now().strftime('%Y-%m-%d') # 设置PDF pdf_filename = f"JAV-{today}-{len(posts)}.pdf" doc = SimpleDocTemplate(pdf_filename, pagesize=letter) # 注册中文字体 pdfmetrics.registerFont(TTFont('SamHei', 'fonts/simhei.ttf')) # 设置中文字体路径 styles = getSampleStyleSheet() # 设置标题和正文样式都使用SamHei字体 title_style = styles['Heading1'] title_style.fontName = 'SamHei' # 设置标题使用SamHei字体 normal_style = styles['Normal'] normal_style.fontName = 'SamHei' # 设置正文使用SamHei字体 content = [] # 过滤出当天的帖子 today_posts = [] for post in posts: post_time_span = post.find('span', {'class': 'xi1'}) if post_time_span: # 判断是否存在post_time_span,即认为是当天发布的帖子 today_posts.append(post) # 遍历当天的帖子并提取信息 for post in today_posts: # 查找帖子标题 title = post.find('a', {'class': 's xst'}) if title: post_title = title.get_text() post_url = title.get('href') # 获取帖子的页面 post_page_url = 'https://www.sehuatang.net/' + post_url driver.get(post_page_url) time.sleep(3) # 获取帖子页面内容 post_html = driver.page_source post_soup = BeautifulSoup(post_html, 'html.parser', from_encoding='utf-8') # 提取