调整sehuatang内容

This commit is contained in:
liuwei
2025-12-22 17:04:45 +08:00
parent e484263cb9
commit 1d94bbcc35
2 changed files with 29 additions and 44 deletions

View File

@@ -1,15 +1,13 @@
import time
import os
import re
import requests
import mysql.connector
from mysql.connector import Error
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import undetected_chromedriver as uc
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from loguru import logger
from datetime import datetime, timedelta
@@ -67,6 +65,7 @@ class SehuatangCrawler:
self._connect_db()
self._init_db_table()
self.driver = self._init_driver()
self.session = None
self.today_str = datetime.now().strftime('%Y-%m-%d')
def _connect_db(self):
@@ -110,34 +109,15 @@ class SehuatangCrawler:
if cursor: cursor.close()
def _init_driver(self):
options = Options()
# options.add_argument('--headless')
options.add_argument('--disable-gpu')
options.add_argument('--headless') # 使用新的headless模式
options = uc.ChromeOptions()
# 规避检测的关键配置
options.headless = False
options.add_argument('--no-sandbox')
options.add_argument('--disable-gpu')
options.add_argument('--disable-dev-shm-usage')
options.add_argument('--blink-settings=imagesEnabled=false')
options.add_argument('--disable-logging')
options.add_argument('--log-level=3')
options.add_argument('--disable-blink-features=AutomationControlled')
options.add_experimental_option('excludeSwitches', ['enable-automation', 'enable-logging'])
options.add_experimental_option('useAutomationExtension', False)
options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36')
if os.name == 'nt':
chrome_driver_path = os.path.join(os.getcwd(), "utils", "chromedriver", "chromedriver.exe")
else:
chrome_driver_path = '/usr/bin/chromedriver'
try:
if os.name == 'nt' and not os.path.exists(chrome_driver_path):
chrome_driver_path = ChromeDriverManager().install()
service = Service(chrome_driver_path, log_path=os.devnull)
driver = webdriver.Chrome(service=service, options=options)
except Exception:
chrome_driver_path = ChromeDriverManager().install()
driver = webdriver.Chrome(service=Service(chrome_driver_path, log_path=os.devnull), options=options)
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {"source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"})
# 如果依然在 Headless 触发检测,建议第一次运行设为 False 手动通过
driver = uc.Chrome(options=options)
return driver
def bypass_age_verification(self):
@@ -154,6 +134,14 @@ class SehuatangCrawler:
logger.success("通过年龄验证")
except Exception:
pass
ua = self.driver.execute_script("return navigator.userAgent")
self.session = requests.Session()
self.session.headers.update({'User-Agent': ua, 'Referer': 'https://www.sehuatang.org/'})
for c in self.driver.get_cookies():
try:
self.session.cookies.set(c['name'], c['value'], domain=c.get('domain'), path=c.get('path', '/'))
except Exception:
self.session.cookies.set(c['name'], c['value'])
except Exception as e:
logger.warning(f"主页访问异常: {e}")
@@ -284,17 +272,14 @@ class SehuatangCrawler:
def parse_detail_page(self, post_url):
magnet_link = ""
cover_image = ""
actress_in_body = "" # 详情页提取到的女优
actress_in_body = ""
try:
self.driver.get(post_url)
time.sleep(1 if RUN_MODE == 'full' else 2)
soup = BeautifulSoup(self.driver.page_source, 'html.parser')
resp = self.session.get(post_url, timeout=15) if self.session else requests.get(post_url, timeout=15)
soup = BeautifulSoup(resp.text, 'html.parser')
content_div = soup.find('div', {'class': 't_fsz'})
if content_div:
# 1. 提取磁力链
magnet_tags = content_div.find_all('a', href=re.compile(r'^magnet:\?'))
for tag in magnet_tags:
href = tag.get('href', '')
@@ -307,7 +292,6 @@ class SehuatangCrawler:
if match: magnet_link = match.group(0)
magnet_link = self.clean_magnet(magnet_link)
# 2. 提取图片
imgs = content_div.find_all('img')
for img in imgs:
zoomfile = img.get('zoomfile')
@@ -319,16 +303,10 @@ class SehuatangCrawler:
cover_image = file_attr
break
# 3. [新] 提取【出演女优】
# 使用 separator='\n' 保持换行,防止文字粘连
text_content = content_div.get_text(separator='\n')
# 正则匹配:支持 【】 或 [],支持冒号或空格
# 匹配逻辑:找 "女优" 关键词,后面跟冒号,再取剩下的一整行文字
actress_match = re.search(r'(?:【|\[)(?:出演)?女优(?:】|\])\s*[:]\s*(.*)', text_content)
if actress_match:
raw_actress = actress_match.group(1).strip()
# 再次清洗一下防止后面有HTML标签残留
actress_in_body = raw_actress.split('<')[0].strip()
except Exception:
@@ -419,7 +397,14 @@ class SehuatangCrawler:
if self.conn and self.conn.is_connected():
self.conn.close()
if self.driver:
self.driver.quit()
try:
self.driver.close()
except Exception:
pass
try:
self.driver.quit()
except Exception:
pass
def run(self):
try:

View File

@@ -58,7 +58,7 @@ def fetch_and_create_pdf(url):
options.add_argument('--disable-dev-shm-usage')
# 如果依然在 Headless 触发检测,建议第一次运行设为 False 手动通过
driver = uc.Chrome(options=options, headless=True)
driver = uc.Chrome(options=options)
logger.info(f"正在访问: {url}")
driver.get(url)