调整sehuatang内容
This commit is contained in:
@@ -1,15 +1,13 @@
|
||||
import time
|
||||
import os
|
||||
import re
|
||||
import requests
|
||||
import mysql.connector
|
||||
from mysql.connector import Error
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.chrome.service import Service
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.chrome.options import Options
|
||||
import undetected_chromedriver as uc
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from webdriver_manager.chrome import ChromeDriverManager
|
||||
from bs4 import BeautifulSoup
|
||||
from loguru import logger
|
||||
from datetime import datetime, timedelta
|
||||
@@ -67,6 +65,7 @@ class SehuatangCrawler:
|
||||
self._connect_db()
|
||||
self._init_db_table()
|
||||
self.driver = self._init_driver()
|
||||
self.session = None
|
||||
self.today_str = datetime.now().strftime('%Y-%m-%d')
|
||||
|
||||
def _connect_db(self):
|
||||
@@ -110,34 +109,15 @@ class SehuatangCrawler:
|
||||
if cursor: cursor.close()
|
||||
|
||||
def _init_driver(self):
|
||||
options = Options()
|
||||
# options.add_argument('--headless')
|
||||
options.add_argument('--disable-gpu')
|
||||
options.add_argument('--headless') # 使用新的headless模式
|
||||
options = uc.ChromeOptions()
|
||||
# 规避检测的关键配置
|
||||
options.headless = False
|
||||
options.add_argument('--no-sandbox')
|
||||
options.add_argument('--disable-gpu')
|
||||
options.add_argument('--disable-dev-shm-usage')
|
||||
options.add_argument('--blink-settings=imagesEnabled=false')
|
||||
options.add_argument('--disable-logging')
|
||||
options.add_argument('--log-level=3')
|
||||
options.add_argument('--disable-blink-features=AutomationControlled')
|
||||
options.add_experimental_option('excludeSwitches', ['enable-automation', 'enable-logging'])
|
||||
options.add_experimental_option('useAutomationExtension', False)
|
||||
options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36')
|
||||
|
||||
if os.name == 'nt':
|
||||
chrome_driver_path = os.path.join(os.getcwd(), "utils", "chromedriver", "chromedriver.exe")
|
||||
else:
|
||||
chrome_driver_path = '/usr/bin/chromedriver'
|
||||
|
||||
try:
|
||||
if os.name == 'nt' and not os.path.exists(chrome_driver_path):
|
||||
chrome_driver_path = ChromeDriverManager().install()
|
||||
service = Service(chrome_driver_path, log_path=os.devnull)
|
||||
driver = webdriver.Chrome(service=service, options=options)
|
||||
except Exception:
|
||||
chrome_driver_path = ChromeDriverManager().install()
|
||||
driver = webdriver.Chrome(service=Service(chrome_driver_path, log_path=os.devnull), options=options)
|
||||
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {"source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"})
|
||||
# 如果依然在 Headless 触发检测,建议第一次运行设为 False 手动通过
|
||||
driver = uc.Chrome(options=options)
|
||||
return driver
|
||||
|
||||
def bypass_age_verification(self):
|
||||
@@ -154,6 +134,14 @@ class SehuatangCrawler:
|
||||
logger.success("通过年龄验证")
|
||||
except Exception:
|
||||
pass
|
||||
ua = self.driver.execute_script("return navigator.userAgent")
|
||||
self.session = requests.Session()
|
||||
self.session.headers.update({'User-Agent': ua, 'Referer': 'https://www.sehuatang.org/'})
|
||||
for c in self.driver.get_cookies():
|
||||
try:
|
||||
self.session.cookies.set(c['name'], c['value'], domain=c.get('domain'), path=c.get('path', '/'))
|
||||
except Exception:
|
||||
self.session.cookies.set(c['name'], c['value'])
|
||||
except Exception as e:
|
||||
logger.warning(f"主页访问异常: {e}")
|
||||
|
||||
@@ -284,17 +272,14 @@ class SehuatangCrawler:
|
||||
def parse_detail_page(self, post_url):
|
||||
magnet_link = ""
|
||||
cover_image = ""
|
||||
actress_in_body = "" # 详情页提取到的女优
|
||||
actress_in_body = ""
|
||||
|
||||
try:
|
||||
self.driver.get(post_url)
|
||||
time.sleep(1 if RUN_MODE == 'full' else 2)
|
||||
|
||||
soup = BeautifulSoup(self.driver.page_source, 'html.parser')
|
||||
resp = self.session.get(post_url, timeout=15) if self.session else requests.get(post_url, timeout=15)
|
||||
soup = BeautifulSoup(resp.text, 'html.parser')
|
||||
content_div = soup.find('div', {'class': 't_fsz'})
|
||||
|
||||
if content_div:
|
||||
# 1. 提取磁力链
|
||||
magnet_tags = content_div.find_all('a', href=re.compile(r'^magnet:\?'))
|
||||
for tag in magnet_tags:
|
||||
href = tag.get('href', '')
|
||||
@@ -307,7 +292,6 @@ class SehuatangCrawler:
|
||||
if match: magnet_link = match.group(0)
|
||||
magnet_link = self.clean_magnet(magnet_link)
|
||||
|
||||
# 2. 提取图片
|
||||
imgs = content_div.find_all('img')
|
||||
for img in imgs:
|
||||
zoomfile = img.get('zoomfile')
|
||||
@@ -319,16 +303,10 @@ class SehuatangCrawler:
|
||||
cover_image = file_attr
|
||||
break
|
||||
|
||||
# 3. [新] 提取【出演女优】
|
||||
# 使用 separator='\n' 保持换行,防止文字粘连
|
||||
text_content = content_div.get_text(separator='\n')
|
||||
|
||||
# 正则匹配:支持 【】 或 [],支持冒号或空格
|
||||
# 匹配逻辑:找 "女优" 关键词,后面跟冒号,再取剩下的一整行文字
|
||||
actress_match = re.search(r'(?:【|\[)(?:出演)?女优(?:】|\])\s*[::]\s*(.*)', text_content)
|
||||
if actress_match:
|
||||
raw_actress = actress_match.group(1).strip()
|
||||
# 再次清洗一下,防止后面有HTML标签残留
|
||||
actress_in_body = raw_actress.split('<')[0].strip()
|
||||
|
||||
except Exception:
|
||||
@@ -419,7 +397,14 @@ class SehuatangCrawler:
|
||||
if self.conn and self.conn.is_connected():
|
||||
self.conn.close()
|
||||
if self.driver:
|
||||
self.driver.quit()
|
||||
try:
|
||||
self.driver.close()
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
self.driver.quit()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
def run(self):
|
||||
try:
|
||||
|
||||
@@ -58,7 +58,7 @@ def fetch_and_create_pdf(url):
|
||||
options.add_argument('--disable-dev-shm-usage')
|
||||
|
||||
# 如果依然在 Headless 触发检测,建议第一次运行设为 False 手动通过
|
||||
driver = uc.Chrome(options=options, headless=True)
|
||||
driver = uc.Chrome(options=options)
|
||||
|
||||
logger.info(f"正在访问: {url}")
|
||||
driver.get(url)
|
||||
|
||||
Reference in New Issue
Block a user