# abot/utils/sehuatang/shehuatang_undetected.py

import time
import os
import requests
from io import BytesIO
import undetected_chromedriver as uc

# NOTE: do not disable the destructor below, otherwise Chrome processes will leak.
# if os.name == 'nt':
#     try:
#         uc.Chrome.__del__ = lambda self: None
#     except Exception:
#         pass
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from reportlab.lib.pagesizes import A3
from reportlab.lib import colors
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image, PageBreak
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.pdfbase import pdfmetrics
from datetime import datetime
from PIL import Image as PILImage
import re
from PyPDF2 import PdfReader, PdfWriter
from loguru import logger


def download_image(url, session):
    """Download an image through the given session and return it in memory.

    Only URLs that look like JPEG/PNG files are fetched. The extension is
    checked case-insensitively against both the raw URL and the URL with its
    query string / fragment removed, so ``.../a.jpg?v=1`` is accepted as well
    as ``.../a.jpg``.

    Args:
        url: Image URL to fetch.
        session: Object exposing ``get(url, timeout=...)`` whose result has
            ``raise_for_status()`` and ``content`` (e.g. ``requests.Session``).
            Reusing the caller's session keeps cookies consistent.

    Returns:
        A ``BytesIO`` holding the response body, or ``None`` when the URL is
        not an accepted image type or the download fails for any reason.
    """
    image_suffixes = ('.jpg', '.jpeg', '.png')
    try:
        lowered = url.lower()
        # Drop "?query" and "#fragment" so they cannot hide the file extension.
        bare = lowered.split('?', 1)[0].split('#', 1)[0]
        if not (lowered.endswith(image_suffixes) or bare.endswith(image_suffixes)):
            return None
        response = session.get(url, timeout=15)
        response.raise_for_status()
        return BytesIO(response.content)
    except Exception as e:
        # Best effort: a single failed image must not abort the caller.
        logger.warning(f"下载图片失败: {e}")
        return None


def add_pdf_encryption(pdf_file, password="4000"):
    """Encrypt an existing PDF in place with a user password.

    All pages of ``pdf_file`` are copied into a new writer, encrypted, and
    written to a sibling temporary file that then replaces the original. The
    original is therefore never truncated before the encrypted copy has been
    written completely; if anything fails, the unencrypted file is left as it
    was.

    Args:
        pdf_file: Path of the PDF to encrypt; the file is replaced on success.
        password: Password passed to ``PdfWriter.encrypt``.

    Returns:
        None. Failures are logged and swallowed (best effort).
    """
    tmp_file = f"{pdf_file}.enc.tmp"
    try:
        pdf_reader = PdfReader(pdf_file)
        pdf_writer = PdfWriter()
        for page in pdf_reader.pages:
            pdf_writer.add_page(page)
        pdf_writer.encrypt(password)
        # Write next to the target, then swap, so a failed write cannot
        # destroy the only copy of the document.
        with open(tmp_file, "wb") as output_pdf:
            pdf_writer.write(output_pdf)
        os.replace(tmp_file, pdf_file)
        logger.debug("PDF加密成功")
    except Exception as e:
        logger.error(f"PDF加密失败: {e}")
        # Remove a partially written temp file, if one was left behind.
        try:
            if os.path.exists(tmp_file):
                os.remove(tmp_file)
        except OSError as cleanup_error:
            logger.debug(f"清理临时文件失败: {cleanup_error}")


def _build_chrome_options():
    """Return the ChromeOptions used for the scraping browser."""
    options = uc.ChromeOptions()
    # Run headless everywhere except Windows, where a visible window is kept.
    if os.name != 'nt':
        options.headless = True
        options.add_argument('--headless=new')  # new-style headless mode
    else:
        options.headless = False

    for flag in (
        '--no-sandbox',
        '--disable-gpu',
        '--disable-dev-shm-usage',
        '--disable-extensions',
        '--disable-background-networking',
        # Flags below are intended to help the browser shut down cleanly.
        '--disable-crash-reporter',
        '--disable-in-process-stack-traces',
        '--disable-logging',
    ):
        options.add_argument(flag)
    return options


def _confirm_age_gate(driver):
    """Click the age-confirmation link if it appears; otherwise do nothing."""
    try:
        enter_btn = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, '//a[contains(text(), "满18岁，请点此进入")]'))
        )
        enter_btn.click()
        logger.debug("点击了年龄确认按钮")
        time.sleep(3)
    except Exception:
        # Timeout or click failure: treated as "no age gate shown".
        logger.debug("未发现年龄验证按钮，可能已过检测")


def _session_from_driver(driver):
    """Build a requests session carrying the browser's user agent and cookies."""
    session = requests.Session()
    ua = driver.execute_script("return navigator.userAgent")
    session.headers.update({'User-Agent': ua, 'Referer': 'https://www.sehuatang.net/'})
    for c in driver.get_cookies():
        session.cookies.set(c['name'], c['value'])
    return session


def _image_flowable(src, session, max_w, max_h):
    """Download one image and wrap it in a size-limited Image flowable.

    Returns None when the download is skipped/fails or the data cannot be
    read as an image, so one bad image never aborts the rest of a post.
    """
    img_io = download_image(src, session)
    if not img_io:
        return None
    try:
        with PILImage.open(img_io) as p_img:
            iw, ih = p_img.size
            # Only ever shrink (factor capped at 1.0), keeping aspect ratio.
            sc = min(max_w / iw, max_h / ih, 1.0)
            img_io.seek(0)
            return Image(img_io, width=iw * sc, height=ih * sc)
    except Exception as e:
        logger.warning(f"图片处理失败: {e}")
        return None


def _append_post(post, session, content, title_style, normal_style, max_w, max_h):
    """Fetch one thread and append its title, magnet links and images to content."""
    from urllib.parse import urljoin
    from xml.sax.saxutils import escape

    title_tag = post.find('a', {'class': 's xst'})
    if not title_tag:
        return

    p_title = title_tag.get_text()
    href = title_tag.get('href')
    if not href:
        # Without a link there is no detail page to fetch; skip this post
        # instead of failing the whole run.
        logger.warning(f"帖子缺少链接，已跳过: {p_title}")
        return
    p_url = urljoin('https://www.sehuatang.net/', href)
    logger.info(f"详情页: {p_title}")

    try:
        resp = session.get(p_url, timeout=15)
        resp.raise_for_status()
        p_soup = BeautifulSoup(resp.text, 'html.parser')
        div = p_soup.find('div', {'class': 't_fsz'})
        if not div:
            return

        # Start every rendered post after the first on a fresh page. Breaking
        # *before* a post (rather than after) never leaves a trailing blank
        # page when the last post is skipped or fails.
        if content:
            content.append(PageBreak())

        # Paragraph text is parsed as XML-like markup, so escape page text.
        content.append(Paragraph(f" {escape(p_title)}", title_style))
        for m in re.findall(r'magnet:\?[^ \u4e00-\u9fff]+', div.get_text()):
            content.append(Paragraph(f"<b>{escape(m)}</b>", normal_style))

        for img_tag in div.find_all('img'):
            src = img_tag.get('zoomfile')
            if src and 'http' in src:
                flowable = _image_flowable(src, session, max_w, max_h)
                if flowable is not None:
                    content.append(flowable)
    except Exception as e:
        logger.error(f"帖子处理失败: {e}")


def _shutdown_driver(driver):
    """Close the current window and quit the driver, logging any failure."""
    try:
        logger.debug("正在安全关闭浏览器...")
        try:
            driver.close()
        except Exception as e:
            logger.warning(f"关闭浏览器窗口时出错: {e}")

        driver.quit()
        logger.debug("浏览器已完全关闭")
    except Exception as e:
        logger.error(f"关闭浏览器时出错: {e}")


def _kill_leftover_chrome(user_data_dir=None):
    """Kill leftover Chrome processes of the current user (non-Windows only).

    A process is killed when its name contains "chrome", it belongs to the
    user running this process, and its command line either contains the
    legacy ``--user-data-dir=/tmp/playwright`` marker or names exactly the
    profile directory of the driver started by this run (``user_data_dir``).
    Does nothing when psutil is not installed.
    """
    if os.name == 'nt':
        return
    try:
        import psutil

        # Ask psutil for the owner of this process; os.getlogin() needs a
        # controlling terminal and fails under cron/systemd/containers.
        current_user = psutil.Process().username()
        own_marker = f'--user-data-dir={user_data_dir}' if user_data_dir else None

        for proc in psutil.process_iter(['pid', 'name', 'cmdline', 'username']):
            try:
                name = proc.info['name']
                if not name or 'chrome' not in name.lower():
                    continue
                if proc.info['username'] != current_user:
                    continue
                cmdline = [str(arg) for arg in (proc.info.get('cmdline') or [])]
                # NOTE(review): the playwright marker is kept from the
                # previous implementation; confirm it is still wanted here.
                is_target = any(
                    '--user-data-dir=/tmp/playwright' in arg
                    or (own_marker is not None and arg == own_marker)
                    for arg in cmdline
                )
                if is_target:
                    logger.info(f"强制终止残留Chrome进程: PID={proc.info['pid']}")
                    proc.kill()
            except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
                # Process vanished or is not ours to inspect; nothing to do.
                pass
    except ImportError:
        logger.debug("未安装psutil，跳过强制清理")
    except Exception as e:
        logger.warning(f"强制清理Chrome进程时出错: {e}")


def fetch_and_create_pdf(url, chrome_version_main=144):
    """Scrape a forum listing page and render its posts into an encrypted PDF.

    Opens ``url`` in an undetected Chrome instance, waits for the thread list,
    then fetches each listed thread with a requests session that reuses the
    browser's cookies and user agent. For every thread the title, magnet
    links and images are added to an A3 PDF saved under ``temp/JAV`` (three
    directory levels above this file). The PDF is encrypted afterwards via
    ``add_pdf_encryption``. The browser is always shut down on exit.

    Args:
        url: Listing page to open.
        chrome_version_main: Major Chrome version passed to
            ``uc.Chrome(version_main=...)``. Defaults to 144; pass ``None`` to
            forward no explicit version.

    Returns:
        The path of the generated PDF, or ``""`` if anything failed.
    """
    driver = None
    user_data_dir = None
    try:
        driver = uc.Chrome(options=_build_chrome_options(), version_main=chrome_version_main)
        # Remember the browser profile directory so leftovers of *this*
        # browser can be recognised during cleanup.
        user_data_dir = getattr(driver, 'user_data_dir', None)

        logger.info(f"正在访问: {url}")
        driver.get(url)

        # Give the anti-bot interstitial time to finish before interacting.
        time.sleep(8)
        _confirm_age_gate(driver)

        # Make sure the thread list has been rendered.
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'tbody[id^="normalthread"]'))
        )

        # Collect thread rows containing a span.xi1 (presumably the marker
        # for today's posts - verify against the page) in reversed order.
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        posts = [
            p for p in soup.find_all('tbody', {'id': lambda x: x and x.startswith('normalthread')})
            if p.find('span', {'class': 'xi1'})
        ]
        today_posts = posts[::-1]

        # Register the CJK font (path is relative to the working directory).
        pdfmetrics.registerFont(TTFont('SimHei', 'fonts/simhei.ttf'))
        styles = getSampleStyleSheet()
        title_style = styles['Heading1']
        title_style.fontName = 'SimHei'
        title_style.textColor = colors.red
        normal_style = styles['Normal']
        normal_style.fontName = 'SimHei'

        # Output goes to <three levels up>/temp/JAV.
        base_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
        save_path = os.path.join(base_dir, 'temp', 'JAV')
        os.makedirs(save_path, exist_ok=True)
        pdf_filename = os.path.join(save_path, f"JAV-{datetime.now().strftime('%Y-%m-%d')}-{len(today_posts)}.pdf")

        doc = SimpleDocTemplate(pdf_filename, pagesize=A3)
        content = []
        max_w, max_h = (A3[0] - 72) * 0.95, (A3[1] - 72) * 0.7

        session = _session_from_driver(driver)
        for post in today_posts:
            _append_post(post, session, content, title_style, normal_style, max_w, max_h)

        doc.build(content)
        add_pdf_encryption(pdf_filename)
        return pdf_filename

    except Exception as e:
        logger.exception(f"抓取异常: {e}")
        return ""
    finally:
        # Always shut the browser down, then sweep for leftover processes.
        if driver:
            _shutdown_driver(driver)
        _kill_leftover_chrome(user_data_dir)


def pdf_file_path_undetected():
    """Generate the PDF for the fixed forum listing and report the outcome.

    Returns:
        ``(True, pdf_path)`` when ``fetch_and_create_pdf`` returned a path,
        otherwise ``(False, default_path)`` where ``default_path`` is
        ``default.pdf`` in this file's directory.
    """
    def _fallback_pdf():
        # Placeholder path handed back whenever generation did not succeed.
        return os.path.join(os.path.dirname(os.path.abspath(__file__)), "default.pdf")

    try:
        url = 'https://www.sehuatang.net/forum.php?mod=forumdisplay&fid=103&filter=typeid&typeid=481'
        pdf_path = fetch_and_create_pdf(url)
        if not pdf_path:
            # Generation failed: fall back to the default PDF path.
            default_path = _fallback_pdf()
            logger.info(f"PDF生成失败，返回默认路径: {default_path}")
            return False, default_path
        logger.info(f"返回的PDF文件路径：{pdf_path}")
        return True, pdf_path
    except Exception as e:
        logger.error(f"生成PDF路径时出错: {e}")
        return False, _fallback_pdf()


# Script entry point: run one scrape-and-build pass when executed directly.
# The (success, path) result is discarded here.
if __name__ == "__main__":
    pdf_file_path_undetected()