# abot/utils/sehuatang/shehuatang.py

import os
import re
import time
from datetime import datetime
from io import BytesIO
from urllib.parse import urljoin
from xml.sax.saxutils import escape

import requests
from bs4 import BeautifulSoup
from loguru import logger
from PIL import Image as PILImage
from PyPDF2 import PdfReader, PdfWriter
from reportlab.lib import colors
from reportlab.lib.pagesizes import letter, A3
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image, PageBreak
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager


def download_image(url, timeout=15):
    """Download a JPG/JPEG/PNG image into memory.

    Args:
        url: Image URL. Only strings ending in ``.jpg``, ``.jpeg`` or ``.png``
            (case-insensitive) are fetched; anything else is rejected without
            touching the network.
        timeout: Seconds passed to ``requests.get`` so a stalled server cannot
            block the caller indefinitely.

    Returns:
        A ``BytesIO`` holding the response body, or ``None`` when the URL is
        empty / not a string, has an unsupported extension, or the request
        fails (the failure is logged as a warning).
    """
    # Guard against None / non-string input, which would otherwise raise
    # AttributeError on ``.lower()`` and escape the handler below.
    if not url or not isinstance(url, str):
        return None
    if not url.lower().endswith(('.jpg', '.jpeg', '.png')):
        return None

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Referer': 'https://tu.a7nz4.us',
    }

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        return BytesIO(response.content)
    except requests.exceptions.RequestException as e:
        logger.warning(f"下载图片失败: {e}")
        return None


_SEHUATANG_BASE_URL = 'https://www.sehuatang.net/'
_AGE_GATE_XPATH = '//a[contains(text(), "满18岁，请点此进入")]'


def _create_chrome_driver():
    """Start a headless Chrome WebDriver.

    A local chromedriver (bundled path on Windows, ``/usr/bin/chromedriver``
    otherwise) is tried first; if that fails for any reason a driver installed
    by webdriver_manager is used instead. Errors from the fallback propagate.
    """
    options = Options()
    for flag in (
        '--headless',
        '--disable-gpu',
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-logging',
        '--log-level=3',
    ):
        options.add_argument(flag)
    options.add_experimental_option('excludeSwitches', ['enable-automation', 'enable-logging'])

    if os.name == 'nt':  # Windows: driver shipped next to the package
        chrome_driver_path = os.path.join(
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
            "utils", "chromedriver", "chromedriver.exe"
        )
    else:
        chrome_driver_path = '/usr/bin/chromedriver'

    try:
        if os.name == 'nt' and not os.path.exists(chrome_driver_path):
            chrome_driver_path = ChromeDriverManager().install()
        service = Service(chrome_driver_path, log_path=os.devnull)
        return webdriver.Chrome(service=service, options=options)
    except Exception as e:
        logger.debug(f"初始化ChromeDriver失败: {e}")
        chrome_driver_path = ChromeDriverManager().install()
        service = Service(chrome_driver_path, log_path=os.devnull)
        return webdriver.Chrome(service=service, options=options)


def _session_with_driver_cookies(driver):
    """Return a requests.Session carrying the browser's cookies and headers."""
    session = requests.Session()
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Referer': 'https://www.sehuatang.net/'
    })
    for c in driver.get_cookies():
        try:
            session.cookies.set(c['name'], c['value'], domain=c.get('domain'), path=c.get('path', '/'))
        except Exception:
            # Fall back to a cookie without domain/path rather than dropping it.
            session.cookies.set(c['name'], c['value'])
    return session


def _scaled_image_flowable(image, max_width, max_height):
    """Wrap downloaded image bytes in a ReportLab Image that fits the page.

    The image is only ever shrunk (scale is capped at 1.0) and keeps its
    aspect ratio. Raises whatever PIL/ReportLab raise for unreadable data.
    """
    raw = image.getvalue()
    with PILImage.open(image) as picture:
        img_width, img_height = picture.size
    scale = min(max_width / img_width, max_height / img_height, 1.0)
    new_width = img_width * scale
    new_height = img_height * scale
    # Hand ReportLab its own fresh stream so it is independent of PIL's handle.
    flowable = Image(BytesIO(raw), width=new_width, height=new_height)
    logger.debug(
        f"处理图片: 原始尺寸 {img_width}x{img_height}, 新尺寸 {new_width}x{new_height}")
    return flowable


def _build_post_flowables(post_title, content_div, title_style, normal_style,
                          max_image_width, max_image_height):
    """Build the title, magnet-link and image flowables for one post."""
    post_text = content_div.get_text(strip=True)
    magnet_links = re.findall(r'magnet:\?[^ \u4e00-\u9fff]+', post_text)

    # Paragraph text is XML-like markup, so scraped text must be escaped;
    # otherwise '&' / '<' in titles or magnet links corrupt or break parsing.
    flowables = [Paragraph(f" {escape(post_title)}", title_style), Spacer(1, 5)]
    for magnet_link in magnet_links:
        flowables.append(Paragraph(f"<br /><b>{escape(magnet_link)}</b><br />", normal_style))
        flowables.append(Spacer(1, 12))

    for img in content_div.find_all('img'):
        # NOTE(review): 'zoomfile' is presumably the full-size image URL — confirm.
        zoomfile = img.get('zoomfile')
        if not zoomfile or 'http' not in zoomfile:
            continue
        image = download_image(zoomfile)
        if image is None:
            continue
        try:
            flowable = _scaled_image_flowable(image, max_image_width, max_image_height)
        except Exception as e:
            logger.error(f"处理图片时出错: {e}")
            continue
        flowables.append(flowable)
        flowables.append(Spacer(1, 4))
    return flowables


def fetch_and_create_pdf(url):
    """Scrape the thread list at *url* and render its posts into a PDF.

    The list page is loaded with headless Chrome (clicking through the age
    confirmation link if present); each post page is then fetched with a
    requests session that reuses the browser cookies. Title, magnet links and
    images of every post go onto their own page of an A3 PDF written to the
    ``temp`` directory three levels above this file. Encryption of the result
    is attempted via ``add_pdf_encryption``.

    Args:
        url: URL of the forum thread-list page.

    Returns:
        Absolute path of the generated PDF, or ``""`` if scraping or PDF
        generation failed (the error is logged).
    """
    driver = None
    try:
        driver = _create_chrome_driver()

        # Load the list page and pass the age gate if it shows up.
        driver.get(url)
        try:
            enter_button = WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable((By.XPATH, _AGE_GATE_XPATH)))
            enter_button.click()
            logger.debug("点击了满18岁按钮")
        except Exception as e:
            logger.warning(f"未找到满18岁按钮，跳过此步骤: {e}")
        WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'tbody[id^="normalthread"]')))

        # Second attempt in case the age gate is still (or again) present.
        try:
            enter_button = driver.find_element(By.XPATH, _AGE_GATE_XPATH)
            enter_button.click()
            logger.debug("点击了满18岁按钮")
            time.sleep(5)
        except Exception as e:
            logger.warning(f"未找到满18岁按钮，跳过此步骤: {e}")

        # page_source is already str, so no from_encoding (bs4 ignores it and warns).
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        posts = soup.find_all('tbody', {'id': lambda x: x and x.startswith('normalthread')})

        today = datetime.now().strftime('%Y-%m-%d')

        # Register the CJK font (path is relative to the working directory).
        pdfmetrics.registerFont(TTFont('SimHei', 'fonts/simhei.ttf'))
        styles = getSampleStyleSheet()

        title_style = styles['Heading1']
        title_style.fontName = 'SimHei'
        title_style.fontSize = 14
        title_style.textColor = colors.red
        title_style.bold = True

        normal_style = styles['Normal']
        normal_style.fontName = 'SimHei'
        normal_style.fontSize = 14

        # Keep posts that contain a span.xi1, in reverse page order.
        # NOTE(review): presumably xi1 marks fresh posts; no date comparison is done here.
        today_posts = [post for post in posts if post.find('span', {'class': 'xi1'})]
        today_posts.reverse()

        # Output goes to <project root>/temp; create it so the build cannot
        # fail merely because the directory is missing.
        temp_dir = os.path.join(
            os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))),
            'temp')
        os.makedirs(temp_dir, exist_ok=True)
        pdf_filename = os.path.join(temp_dir, f"JAV-{today}-{len(today_posts)}.pdf")
        doc = SimpleDocTemplate(pdf_filename, pagesize=A3)

        page_width, page_height = A3
        content_width = page_width - doc.rightMargin - doc.leftMargin
        content_height = page_height - doc.topMargin - doc.bottomMargin

        # Leave a margin, and vertical room for the text above the images.
        max_image_width = content_width * 0.95
        max_image_height = content_height * 0.7

        session = _session_with_driver_cookies(driver)

        sections = []  # one list of flowables per successfully scraped post
        for post in today_posts:
            title = post.find('a', {'class': 's xst'})
            if not title:
                continue
            post_title = title.get_text()
            post_url = title.get('href')
            logger.info(post_title)
            if not post_url:
                # A missing href used to raise TypeError and abort the whole run.
                logger.warning(f"帖子缺少链接，跳过: {post_title}")
                continue

            # urljoin handles both relative and absolute hrefs.
            post_page_url = urljoin(_SEHUATANG_BASE_URL, post_url)
            try:
                resp = session.get(post_page_url, timeout=15)
                resp.raise_for_status()
                post_html = resp.text
            except Exception as e:
                logger.warning(f"获取帖子内容失败: {e}")
                continue

            post_soup = BeautifulSoup(post_html, 'html.parser')
            content_div = post_soup.find('div', {'class': 't_fsz'})
            if content_div is None:
                continue
            sections.append(_build_post_flowables(
                post_title, content_div, title_style, normal_style,
                max_image_width, max_image_height))

        # Page break between posts only, never trailing after the last one.
        content = []
        for index, section in enumerate(sections):
            if index:
                content.append(PageBreak())
            content.extend(section)

        try:
            doc.build(content)
            absolute_pdf_path = os.path.abspath(pdf_filename)
            logger.info(f"PDF saved as {absolute_pdf_path}")

            add_pdf_encryption(absolute_pdf_path)
            return absolute_pdf_path
        except Exception as e:
            logger.error(f"生成PDF时出错: {e}")
            return ""
    except Exception as e:
        logger.error(f"抓取帖子时出错: {e}")
        return ""
    finally:
        # Always shut the browser down, whatever happened above.
        if driver:
            try:
                driver.quit()
                logger.debug("Chrome driver已成功关闭")
            except Exception as e:
                logger.error(f"关闭Chrome driver时出错: {e}")
                # Last resort: terminate the driver service process directly.
                try:
                    import psutil
                    process = psutil.Process(driver.service.process.pid)
                    process.terminate()
                    logger.debug("已强制终止Chrome进程")
                except Exception as e2:
                    logger.error(f"强制终止Chrome进程失败: {e2}")


def add_pdf_encryption(pdf_file, password="4000"):
    """Encrypt *pdf_file* in place with *password* using PyPDF2.

    The encrypted copy is written to a sibling ``.tmp`` file and then moved
    over the original, so a failure while writing cannot leave a truncated
    PDF behind.

    Args:
        pdf_file: Path of the PDF to encrypt.
        password: Password applied to the document.

    Returns:
        True if the file was encrypted, False otherwise. Failures are logged
        and never raised.
    """
    tmp_path = f"{pdf_file}.tmp"
    try:
        # Read the source fully and close it before the file is replaced.
        with open(pdf_file, "rb") as source_pdf:
            pdf_reader = PdfReader(BytesIO(source_pdf.read()))

        pdf_writer = PdfWriter()
        for page in pdf_reader.pages:
            pdf_writer.add_page(page)
        pdf_writer.encrypt(password)

        with open(tmp_path, "wb") as output_pdf:
            pdf_writer.write(output_pdf)
        os.replace(tmp_path, pdf_file)
        logger.debug(f"PDF加密成功，密码为: {password}")
        return True
    except Exception as e:
        logger.error(f"PDF加密失败: {e}")
        try:
            os.remove(tmp_path)
        except OSError:
            # Best-effort cleanup: the temp file may never have been created.
            pass
        return False


def pdf_file_path():
    """Generate the PDF for the fixed forum listing and report the outcome.

    Returns:
        ``(True, path)`` with the generated PDF path on success, otherwise
        ``(False, default_path)`` where ``default_path`` is ``default.pdf``
        next to this module.
    """
    def _fallback_path():
        # Computed lazily, only on the failure paths.
        module_dir = os.path.dirname(os.path.abspath(__file__))
        return os.path.join(module_dir, "default.pdf")

    try:
        generated = fetch_and_create_pdf(
            'https://www.sehuatang.net/forum.php?mod=forumdisplay&fid=103&filter=typeid&typeid=481'
        )
        if not generated:
            # Generation reported failure: hand back the default PDF instead.
            fallback = _fallback_path()
            logger.info(f"PDF生成失败，返回默认路径: {fallback}")
            return False, fallback

        logger.info(f"返回的PDF文件路径：{generated}")
        return True, generated
    except Exception as err:
        logger.error(f"生成PDF路径时出错: {err}")
        return False, _fallback_path()


# Script entry point: running this module directly performs one PDF
# generation run; the (success, path) result is discarded.
if __name__ == "__main__":
    pdf_file_path()