搬迁sehuat

2025-05-26 17:10:47 +08:00
parent 6adef5277b
commit d01662a397
3 changed files with 1 additions and 1 deletions
--- a/utils/sehuatang/init.py
+++ b/utils/sehuatang/init.py
--- a/utils/sehuatang/shehuatang.py
+++ b/utils/sehuatang/shehuatang.py
@@ -0,0 +1,274 @@
+import time
+import os
+import requests
+from io import BytesIO
+from selenium import webdriver
+from selenium.webdriver.chrome.service import Service
+from selenium.webdriver.common.by import By
+from selenium.webdriver.chrome.options import Options
+from webdriver_manager.chrome import ChromeDriverManager
+from bs4 import BeautifulSoup
+from reportlab.lib.pagesizes import letter, A3
+from reportlab.lib import colors
+from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image, PageBreak
+from reportlab.lib.styles import getSampleStyleSheet
+from reportlab.pdfbase.ttfonts import TTFont
+from reportlab.pdfbase import pdfmetrics
+from datetime import datetime
+from PIL import Image as PILImage
+import re
+from PyPDF2 import PdfReader, PdfWriter
+
+from loguru import logger
+
+# download_image is kept unchanged from the previous revision.
+def download_image(url, timeout=30):
+    """Download a JPG/JPEG/PNG image and return its bytes as an in-memory stream.
+
+    No size filtering is performed and nothing is written to disk; the whole
+    response body is wrapped in a ``BytesIO``.
+
+    Args:
+        url: Image URL. Only URLs whose text ends with ``.jpg``, ``.jpeg`` or
+            ``.png`` (case-insensitive) are fetched.
+        timeout: Seconds passed to ``requests.get`` so that a stalled image
+            host cannot block the caller forever.
+
+    Returns:
+        A ``BytesIO`` positioned at offset 0, or ``None`` when the URL is
+        missing, is not a string, has an unsupported extension, or the
+        request fails.
+    """
+    # Reject None / non-string values up front instead of crashing on .lower().
+    if not isinstance(url, str):
+        return None
+    if not url.lower().endswith(('.jpg', '.jpeg', '.png')):
+        return None
+
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+        # NOTE(review): presumably the image host checks the Referer - confirm.
+        'Referer': 'https://tu.a7nz4.us',
+    }
+
+    try:
+        response = requests.get(url, headers=headers, timeout=timeout)
+        response.raise_for_status()
+    except requests.exceptions.RequestException as e:
+        # Timeouts are RequestException subclasses, so they land here too.
+        logger.warning(f"下载图片失败: {e}")
+        return None
+
+    return BytesIO(response.content)
+
+
+def _create_chrome_driver(options):
+    """Start one Chrome session, preferring a local chromedriver binary.
+
+    Falls back to a binary fetched by ``webdriver_manager`` when the local
+    path does not exist or Chrome fails to start with it.
+    """
+    if os.name == 'nt':  # Windows
+        # NOTE(review): this resolves to <parent of this file's dir>/utils/chromedriver/;
+        # confirm the path is still right after the module was relocated.
+        chrome_driver_path = os.path.join(
+            os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
+            "utils", "chromedriver", "chromedriver.exe"
+        )
+    else:  # Linux
+        chrome_driver_path = '/usr/bin/chromedriver'
+
+    if os.path.exists(chrome_driver_path):
+        logger.info(f"使用本地ChromeDriver: {chrome_driver_path}")
+        try:
+            return webdriver.Chrome(service=Service(chrome_driver_path), options=options)
+        except Exception as e:
+            logger.warning(f"初始化ChromeDriver失败: {e}")
+
+    chrome_driver_path = ChromeDriverManager().install()
+    logger.info(f"ChromeDriver已下载到: {chrome_driver_path}")
+    return webdriver.Chrome(service=Service(chrome_driver_path), options=options)
+
+
+def _scaled_image_flowable(image, max_width, max_height):
+    """Wrap downloaded image bytes in a reportlab ``Image`` that fits the given box.
+
+    The image is only ever shrunk (scale is capped at 1.0), never enlarged.
+    """
+    raw = image.getvalue()
+    with PILImage.open(BytesIO(raw)) as pil_image:
+        img_width, img_height = pil_image.size
+
+    scale = min(max_width / img_width, max_height / img_height, 1.0)
+    new_width = img_width * scale
+    new_height = img_height * scale
+    logger.info(f"处理图片: 原始尺寸 {img_width}x{img_height}, 新尺寸 {new_width}x{new_height}")
+    # Give reportlab its own stream so it reads from offset 0.
+    return Image(BytesIO(raw), width=new_width, height=new_height)
+
+
+def fetch_and_create_pdf(url):
+    """Scrape a forum listing page and render the selected threads into a PDF.
+
+    Steps: open ``url`` in headless Chrome, click the age-confirmation link if
+    present, collect the ``normalthread*`` rows that contain a ``span.xi1``
+    element, visit each thread, and append its title, magnet links and
+    ``zoomfile`` images to an A3 PDF named ``JAV-<date>-<count>.pdf`` in the
+    current working directory. The PDF is then passed to
+    ``add_pdf_encryption``.
+
+    Args:
+        url: URL of the thread listing page. Relative thread links are
+            resolved against it.
+
+    Returns:
+        Absolute path of the generated PDF, or ``""`` when scraping or PDF
+        generation fails.
+    """
+    from urllib.parse import urljoin
+    from xml.sax.saxutils import escape
+
+    driver = None
+    try:
+        options = Options()
+        options.add_argument('--headless')
+        options.add_argument('--disable-gpu')
+        options.add_argument('--no-sandbox')
+        options.add_argument('--disable-dev-shm-usage')
+
+        # Exactly one browser is started; it is released in ``finally``.
+        driver = _create_chrome_driver(options)
+
+        driver.get(url)
+        time.sleep(5)
+
+        # Best effort: the age gate is not always shown.
+        try:
+            enter_button = driver.find_element(By.XPATH, '//a[contains(text(), "满18岁，请点此进入")]')
+            enter_button.click()
+            logger.info("点击了满18岁按钮")
+            time.sleep(5)
+        except Exception as e:
+            logger.info("未找到满18岁按钮，跳过此步骤: {}", type(e).__name__)
+
+        # page_source is already a str, so no from_encoding hint is needed.
+        soup = BeautifulSoup(driver.page_source, 'html.parser')
+        posts = soup.find_all('tbody', {'id': lambda x: x and x.startswith('normalthread')})
+
+        today = datetime.now().strftime('%Y-%m-%d')
+
+        # NOTE: the font path is relative to the current working directory.
+        pdfmetrics.registerFont(TTFont('SimHei', 'fonts/simhei.ttf'))
+        styles = getSampleStyleSheet()
+
+        title_style = styles['Heading1']
+        title_style.fontName = 'SimHei'
+        title_style.fontSize = 14
+        title_style.textColor = colors.red
+        title_style.bold = True
+
+        normal_style = styles['Normal']
+        normal_style.fontName = 'SimHei'
+        normal_style.fontSize = 14
+
+        # NOTE(review): rows are kept when they contain span.xi1 - presumably
+        # the forum's marker for same-day posts; no date comparison is made.
+        today_posts = [
+            post for post in posts
+            if post.find('span', {'class': 'xi1'}) is not None
+        ]
+        today_posts.reverse()
+
+        pdf_filename = f"JAV-{today}-{len(today_posts)}.pdf"
+        doc = SimpleDocTemplate(pdf_filename, pagesize=A3)
+
+        page_width, page_height = A3
+        content_width = page_width - doc.rightMargin - doc.leftMargin
+        content_height = page_height - doc.topMargin - doc.bottomMargin
+
+        # Leave room on the page for the title and magnet links.
+        max_image_width = content_width * 0.95
+        max_image_height = content_height * 0.7
+
+        content = []
+        for post in today_posts:
+            title = post.find('a', {'class': 's xst'})
+            if title is None:
+                continue
+            post_title = title.get_text()
+            post_url = title.get('href')
+            if not post_url:
+                continue
+            logger.info(post_title)
+
+            driver.get(urljoin(url, post_url))
+            time.sleep(3)
+
+            post_soup = BeautifulSoup(driver.page_source, 'html.parser')
+            content_div = post_soup.find('div', {'class': 't_fsz'})
+            if content_div is None:
+                continue
+
+            post_text = content_div.get_text(strip=True)
+            magnet_links = re.findall(r'magnet:\?[^ \u4e00-\u9fff]+', post_text)
+
+            # Break before every post except the first, so the document never
+            # ends with a dangling page break.
+            if content:
+                content.append(PageBreak())
+
+            # Paragraph text is parsed as markup, so scraped text is escaped.
+            content.append(Paragraph(f" {escape(post_title)}", title_style))
+            content.append(Spacer(1, 5))
+
+            for magnet_link in magnet_links:
+                content.append(Paragraph(f"<br /><b>{escape(magnet_link)}</b><br />", normal_style))
+                content.append(Spacer(1, 12))
+
+            image_links = [
+                tag.get('zoomfile') for tag in content_div.find_all('img')
+                if tag.get('zoomfile') and 'http' in tag.get('zoomfile')
+            ]
+            for img_link in image_links:
+                image = download_image(img_link)
+                if image is None:
+                    continue
+                try:
+                    content.append(_scaled_image_flowable(image, max_image_width, max_image_height))
+                    content.append(Spacer(1, 4))
+                except Exception as e:
+                    # A single broken image must not abort the whole document.
+                    logger.warning(f"处理图片时出错: {e}")
+
+        try:
+            doc.build(content)
+            absolute_pdf_path = os.path.abspath(pdf_filename)
+            logger.info(f"PDF saved as {absolute_pdf_path}")
+
+            add_pdf_encryption(absolute_pdf_path)
+            return absolute_pdf_path
+        except Exception as e:
+            logger.error(f"生成PDF时出错: {e}")
+            return ""
+    except Exception as e:
+        logger.error(f"抓取帖子时出错: {e}")
+        return ""
+    finally:
+        # Single cleanup point; driver is still None if Chrome never started.
+        if driver is not None:
+            try:
+                driver.quit()
+            except Exception as e:
+                logger.warning(f"关闭浏览器失败: {e}")
+
+# add_pdf_encryption and pdf_file_path are kept unchanged from the previous revision.
+def add_pdf_encryption(pdf_file, password="4000"):
+    """Encrypt a PDF in place with PyPDF2.
+
+    The encrypted copy is written to ``<pdf_file>.tmp`` and then moved over
+    the original, so a failure part-way through leaves the original file
+    untouched.
+
+    Args:
+        pdf_file: Path of the PDF to encrypt.
+        password: Password passed to ``PdfWriter.encrypt``.
+
+    Returns:
+        ``True`` when the encrypted file replaced the original, ``False``
+        otherwise. Errors are logged and never raised.
+    """
+    from contextlib import suppress
+
+    temp_file = f"{pdf_file}.tmp"
+    try:
+        pdf_reader = PdfReader(pdf_file)
+        pdf_writer = PdfWriter()
+        for page in pdf_reader.pages:
+            pdf_writer.add_page(page)
+        pdf_writer.encrypt(password)
+
+        with open(temp_file, "wb") as output_pdf:
+            pdf_writer.write(output_pdf)
+        os.replace(temp_file, pdf_file)
+
+        # The password is deliberately kept out of the log.
+        logger.info("PDF加密成功")
+        return True
+    except Exception as e:
+        logger.error(f"PDF加密失败: {e}")
+        # Remove a partially written temp file, if any.
+        with suppress(OSError):
+            os.remove(temp_file)
+        return False
+
+
+def pdf_file_path():
+    """Generate the PDF for the hard-coded listing URL and return its path.
+
+    Returns:
+        The path returned by ``fetch_and_create_pdf`` when it is non-empty;
+        otherwise the path of ``default.pdf`` located next to this module.
+        Any exception is logged and also results in the ``default.pdf`` path.
+    """
+    def _fallback_path():
+        # Resolved lazily so it is only evaluated on the failure paths.
+        module_dir = os.path.dirname(os.path.abspath(__file__))
+        return os.path.join(module_dir, "default.pdf")
+
+    try:
+        listing_url = 'https://www.sehuatang.net/forum.php?mod=forumdisplay&fid=103&filter=typeid&typeid=481'
+        generated = fetch_and_create_pdf(listing_url)
+        if not generated:
+            # An empty result means generation failed.
+            fallback = _fallback_path()
+            logger.info(f"PDF生成失败，返回默认路径: {fallback}")
+            return fallback
+        logger.info(f"返回的PDF文件路径：{generated}")
+        return generated
+    except Exception as e:
+        logger.info(f"生成PDF路径时出错: {e}")
+        return _fallback_path()
+
+
+# Script entry point: run the scrape-and-build-PDF pipeline when executed directly.
+if __name__ == "__main__":
+    pdf_file_path()