From e484263cb9465c06eb7cc54f65d12ecf9cc92370 Mon Sep 17 00:00:00 2001
From: liuwei
Date: Mon, 22 Dec 2025 16:45:43 +0800
Subject: [PATCH] =?UTF-8?q?=E5=8A=A0=E5=85=A5=E6=89=8B=E5=8A=A8=E5=A4=84?=
 =?UTF-8?q?=E7=90=86=E9=80=BB=E8=BE=91=E3=80=82?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Reviewer note: the hunk below was edited by hand during review; the blob id
on the "index" line is stale until the patch is regenerated with
git format-patch.

 utils/sehuatang/shehuatang-undetected.py | 294 +++++++++++++++++++++++
 1 file changed, 294 insertions(+)
 create mode 100644 utils/sehuatang/shehuatang-undetected.py

diff --git a/utils/sehuatang/shehuatang-undetected.py b/utils/sehuatang/shehuatang-undetected.py
new file mode 100644
index 0000000..2d6ec9e
--- /dev/null
+++ b/utils/sehuatang/shehuatang-undetected.py
@@ -0,0 +1,294 @@
+import os
+import re
+import time
+from contextlib import suppress
+from datetime import datetime
+from io import BytesIO
+from urllib.parse import urljoin, urlparse
+from xml.sax.saxutils import escape
+
+import requests
+import undetected_chromedriver as uc
+from bs4 import BeautifulSoup
+from loguru import logger
+from PIL import Image as PILImage
+from PyPDF2 import PdfReader, PdfWriter
+from reportlab.lib import colors
+from reportlab.lib.pagesizes import A3
+from reportlab.lib.styles import getSampleStyleSheet
+from reportlab.pdfbase import pdfmetrics
+from reportlab.pdfbase.ttfonts import TTFont
+from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image, PageBreak
+from selenium.common.exceptions import WebDriverException
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.ui import WebDriverWait
+
+BASE_URL = 'https://www.sehuatang.net/'
+LIST_URL = BASE_URL + 'forum.php?mod=forumdisplay&fid=103&filter=typeid&typeid=481'
+IMAGE_SUFFIXES = ('.jpg', '.jpeg', '.png')
+FONT_PATH = 'fonts/simhei.ttf'
+# Stop at whitespace or CJK text so a link does not swallow what follows it.
+MAGNET_RE = re.compile(r'magnet:\?[^\s\u4e00-\u9fff]+')
+# Three directory levels above this file; the output "temp" folder lives there.
+PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+
+def download_image(url, session):
+    """Download an image through *session* so it shares the browser's cookies.
+
+    Only .jpg/.jpeg/.png URLs are fetched. The extension is read from the URL
+    path, so a query string or fragment does not hide it.
+
+    Returns a BytesIO holding the image bytes, or None when the URL is not a
+    supported image or the request fails.
+    """
+    if not url or not urlparse(url).path.lower().endswith(IMAGE_SUFFIXES):
+        return None
+    try:
+        response = session.get(url, timeout=15)
+        response.raise_for_status()
+    except requests.RequestException as e:
+        logger.warning(f"下载图片失败: {e}")
+        return None
+    return BytesIO(response.content)
+
+
+def add_pdf_encryption(pdf_file, password="4000"):
+    """Encrypt *pdf_file* in place with *password*.
+
+    The encrypted copy is written to a sibling temporary file and swapped in
+    with os.replace, so a failed write cannot truncate the original PDF.
+
+    Returns True on success; on failure the error is logged, the original
+    file is left as it was, and False is returned.
+    """
+    tmp_file = f"{pdf_file}.tmp"
+    try:
+        # Read into memory first so no handle on pdf_file outlives this block.
+        with open(pdf_file, "rb") as source_pdf:
+            pdf_reader = PdfReader(BytesIO(source_pdf.read()))
+        pdf_writer = PdfWriter()
+        for page in pdf_reader.pages:
+            pdf_writer.add_page(page)
+        pdf_writer.encrypt(password)
+        with open(tmp_file, "wb") as output_pdf:
+            pdf_writer.write(output_pdf)
+        os.replace(tmp_file, pdf_file)
+    except Exception as e:
+        logger.error(f"PDF加密失败: {e}")
+        with suppress(OSError):
+            os.remove(tmp_file)
+        return False
+    logger.debug("PDF加密成功")
+    return True
+
+
+def _create_driver(headless):
+    """Start an undetected Chrome instance with the flags this script needs."""
+    options = uc.ChromeOptions()
+    # Key settings for evading bot detection.
+    options.add_argument('--no-sandbox')
+    options.add_argument('--disable-gpu')
+    options.add_argument('--disable-dev-shm-usage')
+    # NOTE(review): presumably kept so the attribute exists for
+    # undetected_chromedriver releases that read it - confirm. The mode itself
+    # is chosen only by the constructor flag below.
+    options.headless = False
+    return uc.Chrome(options=options, headless=headless)
+
+
+def _pass_age_gate(driver):
+    """Click the age-confirmation link if the site shows one."""
+    try:
+        enter_btn = WebDriverWait(driver, 10).until(
+            EC.element_to_be_clickable((By.XPATH, '//a[contains(text(), "满18岁,请点此进入")]'))
+        )
+        enter_btn.click()
+    except WebDriverException:
+        logger.debug("未发现年龄验证按钮,可能已过检测")
+        return
+    logger.debug("点击了年龄确认按钮")
+    time.sleep(3)
+
+
+def _pdf_styles():
+    """Register the SimHei font and return (title_style, normal_style)."""
+    font_path = FONT_PATH
+    root_font = os.path.join(PROJECT_ROOT, FONT_PATH)
+    # Prefer the working-directory path as before; only fall back when it is
+    # missing and a copy exists under the project root.
+    if not os.path.exists(font_path) and os.path.exists(root_font):
+        font_path = root_font
+    pdfmetrics.registerFont(TTFont('SimHei', font_path))
+
+    styles = getSampleStyleSheet()
+    title_style = styles['Heading1']
+    title_style.fontName = 'SimHei'
+    title_style.textColor = colors.red
+    normal_style = styles['Normal']
+    normal_style.fontName = 'SimHei'
+    return title_style, normal_style
+
+
+def _build_session(driver):
+    """Return a requests session carrying the browser's user agent and cookies."""
+    session = requests.Session()
+    ua = driver.execute_script("return navigator.userAgent")
+    session.headers.update({'User-Agent': ua, 'Referer': BASE_URL})
+    for c in driver.get_cookies():
+        # Keep each cookie scoped to its own domain and path so site cookies
+        # are not sent along with requests to other image hosts.
+        session.cookies.set(c['name'], c['value'],
+                            domain=c.get('domain', ''), path=c.get('path', '/'))
+    return session
+
+
+def _scaled_image(src, session, max_w, max_h):
+    """Download *src* and return an Image scaled down to fit, or None."""
+    if not src or 'http' not in src:
+        return None
+    img_io = download_image(src, session)
+    if img_io is None:
+        return None
+    try:
+        with PILImage.open(img_io) as p_img:
+            iw, ih = p_img.size
+    except OSError as e:
+        # An unreadable image should cost one picture, not the whole post.
+        logger.warning(f"图片解析失败: {e}")
+        return None
+    sc = min(max_w / iw, max_h / ih, 1.0)
+    img_io.seek(0)
+    return Image(img_io, width=iw * sc, height=ih * sc)
+
+
+def _post_flowables(post, session, title_style, normal_style, max_w, max_h):
+    """Fetch one post's detail page and return its PDF flowables.
+
+    Returns an empty list when the row has no usable title link or the detail
+    page has no content block. Network errors propagate to the caller.
+    """
+    title_tag = post.find('a', {'class': 's xst'})
+    href = title_tag.get('href') if title_tag else None
+    if not href:
+        return []
+
+    p_title = title_tag.get_text()
+    logger.info(f"详情页: {p_title}")
+    resp = session.get(urljoin(BASE_URL, href), timeout=15)
+    resp.raise_for_status()
+    div = BeautifulSoup(resp.text, 'html.parser').find('div', {'class': 't_fsz'})
+    if not div:
+        return []
+
+    # Paragraph parses its text as markup, so escape &, < and > first.
+    flowables = [Paragraph(escape(f" {p_title}"), title_style)]
+    for m in MAGNET_RE.findall(div.get_text()):
+        flowables.append(Paragraph(escape(m), normal_style))
+    for img_tag in div.find_all('img'):
+        image = _scaled_image(img_tag.get('zoomfile'), session, max_w, max_h)
+        if image is not None:
+            flowables.append(image)
+    return flowables
+
+
+def _close_driver(driver):
+    """Close the window, then quit the browser; each step is best-effort."""
+    if not driver:
+        return
+    logger.debug("正在安全关闭浏览器...")
+    for step in (driver.close, driver.quit):
+        try:
+            step()
+        except Exception as e:
+            # A dead handle (e.g. [WinError 6]) must not stop quit() from
+            # running, and must not replace the caller's return value.
+            logger.debug(f"关闭浏览器时忽略异常: {e}")
+
+
+def fetch_and_create_pdf(url, *, headless=True):
+    """Scrape the forum listing at *url* into an encrypted PDF.
+
+    Each listed post's detail page contributes its title, magnet links and
+    images; posts that produced content are separated by page breaks.
+
+    Args:
+        url: Listing page to open in the browser.
+        headless: Run Chrome without a window (the default). Pass False to
+            get a visible window, e.g. to clear a challenge by hand.
+
+    Returns:
+        The path of the generated PDF, or "" when scraping fails or no post
+        produced any content.
+    """
+    driver = None
+    session = None
+    try:
+        driver = _create_driver(headless)
+        logger.info(f"正在访问: {url}")
+        driver.get(url)
+
+        # Wait out the Cloudflare interstitial before touching the page.
+        time.sleep(8)
+        _pass_age_gate(driver)
+
+        # Make sure the thread list has loaded.
+        WebDriverWait(driver, 20).until(
+            EC.presence_of_element_located((By.CSS_SELECTOR, 'tbody[id^="normalthread"]'))
+        )
+
+        soup = BeautifulSoup(driver.page_source, 'html.parser')
+        posts = [p for p in soup.find_all('tbody', {'id': lambda x: x and x.startswith('normalthread')})
+                 if p.find('span', {'class': 'xi1'})]
+        today_posts = posts[::-1]
+
+        title_style, normal_style = _pdf_styles()
+        max_w, max_h = (A3[0] - 72) * 0.95, (A3[1] - 72) * 0.7
+        session = _build_session(driver)
+
+        sections = []
+        for post in today_posts:
+            try:
+                flowables = _post_flowables(post, session, title_style, normal_style, max_w, max_h)
+            except Exception as e:
+                logger.error(f"帖子处理失败: {e}")
+                continue
+            if flowables:
+                sections.append(flowables)
+
+        if not sections:
+            logger.warning("没有可写入PDF的帖子内容")
+            return ""
+
+        # Break pages only between posts that produced content, so skipped or
+        # failed posts leave no blank pages behind.
+        content = []
+        for index, section in enumerate(sections):
+            if index:
+                content.append(PageBreak())
+            content.extend(section)
+
+        save_path = os.path.join(PROJECT_ROOT, 'temp')
+        os.makedirs(save_path, exist_ok=True)
+        pdf_filename = os.path.join(save_path, f"JAV-{datetime.now().strftime('%Y-%m-%d')}-{len(today_posts)}.pdf")
+
+        SimpleDocTemplate(pdf_filename, pagesize=A3).build(content)
+        add_pdf_encryption(pdf_filename)
+        return pdf_filename
+    except Exception as e:
+        logger.exception(f"抓取异常: {e}")
+        return ""
+    finally:
+        if session is not None:
+            session.close()
+        _close_driver(driver)
+
+
+def pdf_file_path():
+    """Build the PDF for the default forum listing and return its path ("" on failure)."""
+    return fetch_and_create_pdf(LIST_URL)
+
+
+if __name__ == "__main__":
+    pdf_file_path()