diff --git a/utils/sehuatang/sehuatang_bot.py b/utils/sehuatang/sehuatang_bot.py index 2988007..c15f62a 100644 --- a/utils/sehuatang/sehuatang_bot.py +++ b/utils/sehuatang/sehuatang_bot.py @@ -1,15 +1,13 @@ import time import os import re +import requests import mysql.connector from mysql.connector import Error -from selenium import webdriver -from selenium.webdriver.chrome.service import Service from selenium.webdriver.common.by import By -from selenium.webdriver.chrome.options import Options +import undetected_chromedriver as uc from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC -from webdriver_manager.chrome import ChromeDriverManager from bs4 import BeautifulSoup from loguru import logger from datetime import datetime, timedelta @@ -67,6 +65,7 @@ class SehuatangCrawler: self._connect_db() self._init_db_table() self.driver = self._init_driver() + self.session = None self.today_str = datetime.now().strftime('%Y-%m-%d') def _connect_db(self): @@ -110,34 +109,15 @@ class SehuatangCrawler: if cursor: cursor.close() def _init_driver(self): - options = Options() - # options.add_argument('--headless') - options.add_argument('--disable-gpu') - options.add_argument('--headless') # 使用新的headless模式 + options = uc.ChromeOptions() + # 规避检测的关键配置 + options.headless = False options.add_argument('--no-sandbox') + options.add_argument('--disable-gpu') options.add_argument('--disable-dev-shm-usage') - options.add_argument('--blink-settings=imagesEnabled=false') - options.add_argument('--disable-logging') - options.add_argument('--log-level=3') - options.add_argument('--disable-blink-features=AutomationControlled') - options.add_experimental_option('excludeSwitches', ['enable-automation', 'enable-logging']) - options.add_experimental_option('useAutomationExtension', False) - options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36') - if os.name == 'nt': - chrome_driver_path = os.path.join(os.getcwd(), "utils", "chromedriver", "chromedriver.exe") - else: - chrome_driver_path = '/usr/bin/chromedriver' - - try: - if os.name == 'nt' and not os.path.exists(chrome_driver_path): - chrome_driver_path = ChromeDriverManager().install() - service = Service(chrome_driver_path, log_path=os.devnull) - driver = webdriver.Chrome(service=service, options=options) - except Exception: - chrome_driver_path = ChromeDriverManager().install() - driver = webdriver.Chrome(service=Service(chrome_driver_path, log_path=os.devnull), options=options) - driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {"source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"}) + # 如果依然在 Headless 触发检测,建议第一次运行设为 False 手动通过 + driver = uc.Chrome(options=options) return driver def bypass_age_verification(self): @@ -154,6 +134,14 @@ class SehuatangCrawler: logger.success("通过年龄验证") except Exception: pass + ua = self.driver.execute_script("return navigator.userAgent") + self.session = requests.Session() + self.session.headers.update({'User-Agent': ua, 'Referer': 'https://www.sehuatang.org/'}) + for c in self.driver.get_cookies(): + try: + self.session.cookies.set(c['name'], c['value'], domain=c.get('domain'), path=c.get('path', '/')) + except Exception: + self.session.cookies.set(c['name'], c['value']) except Exception as e: logger.warning(f"主页访问异常: {e}") @@ -284,17 +272,14 @@ class SehuatangCrawler: def parse_detail_page(self, post_url): magnet_link = "" cover_image = "" - actress_in_body = "" # 详情页提取到的女优 + actress_in_body = "" try: - self.driver.get(post_url) - time.sleep(1 if RUN_MODE == 'full' else 2) - - soup = BeautifulSoup(self.driver.page_source, 'html.parser') + resp = self.session.get(post_url, timeout=15) if self.session else requests.get(post_url, timeout=15) + soup = BeautifulSoup(resp.text, 'html.parser') content_div = soup.find('div', {'class': 't_fsz'}) if content_div: - # 1. 提取磁力链 magnet_tags = content_div.find_all('a', href=re.compile(r'^magnet:\?')) for tag in magnet_tags: href = tag.get('href', '') @@ -307,7 +292,6 @@ class SehuatangCrawler: if match: magnet_link = match.group(0) magnet_link = self.clean_magnet(magnet_link) - # 2. 提取图片 imgs = content_div.find_all('img') for img in imgs: zoomfile = img.get('zoomfile') @@ -319,16 +303,10 @@ class SehuatangCrawler: cover_image = file_attr break - # 3. [新] 提取【出演女优】 - # 使用 separator='\n' 保持换行,防止文字粘连 text_content = content_div.get_text(separator='\n') - - # 正则匹配:支持 【】 或 [],支持冒号或空格 - # 匹配逻辑:找 "女优" 关键词,后面跟冒号,再取剩下的一整行文字 actress_match = re.search(r'(?:【|\[)(?:出演)?女优(?:】|\])\s*[::]\s*(.*)', text_content) if actress_match: raw_actress = actress_match.group(1).strip() - # 再次清洗一下,防止后面有HTML标签残留 actress_in_body = raw_actress.split('<')[0].strip() except Exception: @@ -419,7 +397,14 @@ class SehuatangCrawler: if self.conn and self.conn.is_connected(): self.conn.close() if self.driver: - self.driver.quit() + try: + self.driver.close() + except Exception: + pass + try: + self.driver.quit() + except Exception: + pass def run(self): try: diff --git a/utils/sehuatang/shehuatang-undetected.py b/utils/sehuatang/shehuatang-undetected.py index 2d6ec9e..769ec99 100644 --- a/utils/sehuatang/shehuatang-undetected.py +++ b/utils/sehuatang/shehuatang-undetected.py @@ -58,7 +58,7 @@ def fetch_and_create_pdf(url): options.add_argument('--disable-dev-shm-usage') # 如果依然在 Headless 触发检测,建议第一次运行设为 False 手动通过 - driver = uc.Chrome(options=options, headless=True) + driver = uc.Chrome(options=options) logger.info(f"正在访问: {url}") driver.get(url)