调整sehuatang内容

This commit is contained in:
liuwei
2025-12-22 17:04:45 +08:00
parent e484263cb9
commit 1d94bbcc35
2 changed files with 29 additions and 44 deletions

View File

@@ -1,15 +1,13 @@
import time import time
import os import os
import re import re
import requests
import mysql.connector import mysql.connector
from mysql.connector import Error from mysql.connector import Error
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options import undetected_chromedriver as uc
from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from loguru import logger from loguru import logger
from datetime import datetime, timedelta from datetime import datetime, timedelta
@@ -67,6 +65,7 @@ class SehuatangCrawler:
self._connect_db() self._connect_db()
self._init_db_table() self._init_db_table()
self.driver = self._init_driver() self.driver = self._init_driver()
self.session = None
self.today_str = datetime.now().strftime('%Y-%m-%d') self.today_str = datetime.now().strftime('%Y-%m-%d')
def _connect_db(self): def _connect_db(self):
@@ -110,34 +109,15 @@ class SehuatangCrawler:
if cursor: cursor.close() if cursor: cursor.close()
def _init_driver(self): def _init_driver(self):
options = Options() options = uc.ChromeOptions()
# options.add_argument('--headless') # 规避检测的关键配置
options.add_argument('--disable-gpu') options.headless = False
options.add_argument('--headless') # 使用新的headless模式
options.add_argument('--no-sandbox') options.add_argument('--no-sandbox')
options.add_argument('--disable-gpu')
options.add_argument('--disable-dev-shm-usage') options.add_argument('--disable-dev-shm-usage')
options.add_argument('--blink-settings=imagesEnabled=false')
options.add_argument('--disable-logging')
options.add_argument('--log-level=3')
options.add_argument('--disable-blink-features=AutomationControlled')
options.add_experimental_option('excludeSwitches', ['enable-automation', 'enable-logging'])
options.add_experimental_option('useAutomationExtension', False)
options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36')
if os.name == 'nt': # 如果依然在 Headless 触发检测,建议第一次运行设为 False 手动通过
chrome_driver_path = os.path.join(os.getcwd(), "utils", "chromedriver", "chromedriver.exe") driver = uc.Chrome(options=options)
else:
chrome_driver_path = '/usr/bin/chromedriver'
try:
if os.name == 'nt' and not os.path.exists(chrome_driver_path):
chrome_driver_path = ChromeDriverManager().install()
service = Service(chrome_driver_path, log_path=os.devnull)
driver = webdriver.Chrome(service=service, options=options)
except Exception:
chrome_driver_path = ChromeDriverManager().install()
driver = webdriver.Chrome(service=Service(chrome_driver_path, log_path=os.devnull), options=options)
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {"source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"})
return driver return driver
def bypass_age_verification(self): def bypass_age_verification(self):
@@ -154,6 +134,14 @@ class SehuatangCrawler:
logger.success("通过年龄验证") logger.success("通过年龄验证")
except Exception: except Exception:
pass pass
ua = self.driver.execute_script("return navigator.userAgent")
self.session = requests.Session()
self.session.headers.update({'User-Agent': ua, 'Referer': 'https://www.sehuatang.org/'})
for c in self.driver.get_cookies():
try:
self.session.cookies.set(c['name'], c['value'], domain=c.get('domain'), path=c.get('path', '/'))
except Exception:
self.session.cookies.set(c['name'], c['value'])
except Exception as e: except Exception as e:
logger.warning(f"主页访问异常: {e}") logger.warning(f"主页访问异常: {e}")
@@ -284,17 +272,14 @@ class SehuatangCrawler:
def parse_detail_page(self, post_url): def parse_detail_page(self, post_url):
magnet_link = "" magnet_link = ""
cover_image = "" cover_image = ""
actress_in_body = "" # 详情页提取到的女优 actress_in_body = ""
try: try:
self.driver.get(post_url) resp = self.session.get(post_url, timeout=15) if self.session else requests.get(post_url, timeout=15)
time.sleep(1 if RUN_MODE == 'full' else 2) soup = BeautifulSoup(resp.text, 'html.parser')
soup = BeautifulSoup(self.driver.page_source, 'html.parser')
content_div = soup.find('div', {'class': 't_fsz'}) content_div = soup.find('div', {'class': 't_fsz'})
if content_div: if content_div:
# 1. 提取磁力链
magnet_tags = content_div.find_all('a', href=re.compile(r'^magnet:\?')) magnet_tags = content_div.find_all('a', href=re.compile(r'^magnet:\?'))
for tag in magnet_tags: for tag in magnet_tags:
href = tag.get('href', '') href = tag.get('href', '')
@@ -307,7 +292,6 @@ class SehuatangCrawler:
if match: magnet_link = match.group(0) if match: magnet_link = match.group(0)
magnet_link = self.clean_magnet(magnet_link) magnet_link = self.clean_magnet(magnet_link)
# 2. 提取图片
imgs = content_div.find_all('img') imgs = content_div.find_all('img')
for img in imgs: for img in imgs:
zoomfile = img.get('zoomfile') zoomfile = img.get('zoomfile')
@@ -319,16 +303,10 @@ class SehuatangCrawler:
cover_image = file_attr cover_image = file_attr
break break
# 3. [新] 提取【出演女优】
# 使用 separator='\n' 保持换行,防止文字粘连
text_content = content_div.get_text(separator='\n') text_content = content_div.get_text(separator='\n')
# 正则匹配:支持 【】 或 [],支持冒号或空格
# 匹配逻辑:找 "女优" 关键词,后面跟冒号,再取剩下的一整行文字
actress_match = re.search(r'(?:【|\[)(?:出演)?女优(?:】|\])\s*[:]\s*(.*)', text_content) actress_match = re.search(r'(?:【|\[)(?:出演)?女优(?:】|\])\s*[:]\s*(.*)', text_content)
if actress_match: if actress_match:
raw_actress = actress_match.group(1).strip() raw_actress = actress_match.group(1).strip()
# 再次清洗一下防止后面有HTML标签残留
actress_in_body = raw_actress.split('<')[0].strip() actress_in_body = raw_actress.split('<')[0].strip()
except Exception: except Exception:
@@ -419,7 +397,14 @@ class SehuatangCrawler:
if self.conn and self.conn.is_connected(): if self.conn and self.conn.is_connected():
self.conn.close() self.conn.close()
if self.driver: if self.driver:
self.driver.quit() try:
self.driver.close()
except Exception:
pass
try:
self.driver.quit()
except Exception:
pass
def run(self): def run(self):
try: try:

View File

@@ -58,7 +58,7 @@ def fetch_and_create_pdf(url):
options.add_argument('--disable-dev-shm-usage') options.add_argument('--disable-dev-shm-usage')
# 如果依然在 Headless 触发检测,建议第一次运行设为 False 手动通过 # 如果依然在 Headless 触发检测,建议第一次运行设为 False 手动通过
driver = uc.Chrome(options=options, headless=True) driver = uc.Chrome(options=options)
logger.info(f"正在访问: {url}") logger.info(f"正在访问: {url}")
driver.get(url) driver.get(url)