使用新的网址

This commit is contained in:
liuwei
2025-12-22 15:45:40 +08:00
parent d5d7a2a34d
commit 8a68338ffe
2 changed files with 38 additions and 22 deletions

View File

@@ -132,7 +132,7 @@ class SehuatangCrawler:
def bypass_age_verification(self): def bypass_age_verification(self):
try: try:
self.driver.get("https://www.sehuatang.net/forum.php") self.driver.get("https://www.sehuatang.org/forum.php")
time.sleep(2) time.sleep(2)
try: try:
enter_button = self.driver.find_element(By.XPATH, '//a[contains(text(), "满18岁")]') enter_button = self.driver.find_element(By.XPATH, '//a[contains(text(), "满18岁")]')
@@ -332,7 +332,7 @@ class SehuatangCrawler:
if RUN_MODE == 'daily' and consecutive_old_posts > 20: if RUN_MODE == 'daily' and consecutive_old_posts > 20:
break break
list_url = f"https://www.sehuatang.net/forum-{fid}-{page}.html" list_url = f"https://www.sehuatang.org/forum-{fid}-{page}.html"
logger.info(f"正在爬取第 {page}") logger.info(f"正在爬取第 {page}")
try: try:
@@ -369,7 +369,7 @@ class SehuatangCrawler:
continue continue
partial_url = title_tag.get('href') partial_url = title_tag.get('href')
full_url = f"https://www.sehuatang.net/{partial_url}" full_url = f"https://www.sehuatang.org/{partial_url}"
# 获取详情页数据(含女优) # 获取详情页数据(含女优)
magnet, cover, body_actress = self.parse_detail_page(full_url) magnet, cover, body_actress = self.parse_detail_page(full_url)

View File

@@ -6,6 +6,8 @@ from selenium import webdriver
from selenium.webdriver.chrome.service import Service from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from reportlab.lib.pagesizes import letter, A3 from reportlab.lib.pagesizes import letter, A3
@@ -52,6 +54,9 @@ def fetch_and_create_pdf(url):
options.add_argument('--disable-gpu') options.add_argument('--disable-gpu')
options.add_argument('--no-sandbox') options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage') # 添加Linux特定配置 options.add_argument('--disable-dev-shm-usage') # 添加Linux特定配置
options.add_argument('--disable-logging')
options.add_argument('--log-level=3')
options.add_experimental_option('excludeSwitches', ['enable-automation', 'enable-logging'])
# 根据操作系统选择不同的ChromeDriver路径处理方式 # 根据操作系统选择不同的ChromeDriver路径处理方式
if os.name == 'nt': # Windows if os.name == 'nt': # Windows
@@ -65,25 +70,23 @@ def fetch_and_create_pdf(url):
try: try:
if os.name == 'nt' and not os.path.exists(chrome_driver_path): if os.name == 'nt' and not os.path.exists(chrome_driver_path):
chrome_driver_path = ChromeDriverManager().install() chrome_driver_path = ChromeDriverManager().install()
service = Service(chrome_driver_path, log_path=os.devnull)
driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options) driver = webdriver.Chrome(service=service, options=options)
except Exception as e: except Exception as e:
logger.debug(f"初始化ChromeDriver失败: {e}") logger.debug(f"初始化ChromeDriver失败: {e}")
chrome_driver_path = ChromeDriverManager().install() chrome_driver_path = ChromeDriverManager().install()
driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options) service = Service(chrome_driver_path, log_path=os.devnull)
driver = webdriver.Chrome(service=service, options=options)
# 如果本地没有chromedriver.exe则使用webdriver_manager下载一次
if not os.path.exists(chrome_driver_path):
chrome_driver_path = ChromeDriverManager().install()
logger.debug(f"ChromeDriver已下载到: {chrome_driver_path}")
else:
logger.debug(f"使用本地ChromeDriver: {chrome_driver_path}")
driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options)
# 获取目标页面 # 获取目标页面
driver.get(url) driver.get(url)
time.sleep(10) try:
enter_button = WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.XPATH, '//a[contains(text(), "满18岁请点此进入")]')))
enter_button.click()
logger.debug("点击了满18岁按钮")
except Exception as e:
logger.warning(f"未找到满18岁按钮跳过此步骤: {e}")
WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'tbody[id^="normalthread"]')))
# 处理年龄验证按钮 # 处理年龄验证按钮
try: try:
@@ -92,7 +95,7 @@ def fetch_and_create_pdf(url):
logger.debug("点击了满18岁按钮") logger.debug("点击了满18岁按钮")
time.sleep(5) time.sleep(5)
except Exception as e: except Exception as e:
logger.warning("未找到满18岁按钮跳过此步骤", e) logger.warning(f"未找到满18岁按钮跳过此步骤: {e}")
# 解析页面 # 解析页面
html = driver.page_source html = driver.page_source
@@ -144,6 +147,16 @@ def fetch_and_create_pdf(url):
max_image_height = content_height * 0.7 # 留出足够空间给文本和其他元素 max_image_height = content_height * 0.7 # 留出足够空间给文本和其他元素
# 遍历帖子 # 遍历帖子
session = requests.Session()
session.headers.update({
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
'Referer': 'https://www.sehuatang.net/'
})
for c in driver.get_cookies():
try:
session.cookies.set(c['name'], c['value'], domain=c.get('domain'), path=c.get('path', '/'))
except Exception:
session.cookies.set(c['name'], c['value'])
for post in today_posts: for post in today_posts:
title = post.find('a', {'class': 's xst'}) title = post.find('a', {'class': 's xst'})
if title: if title:
@@ -153,10 +166,13 @@ def fetch_and_create_pdf(url):
# 获取帖子内容 # 获取帖子内容
post_page_url = 'https://www.sehuatang.net/' + post_url post_page_url = 'https://www.sehuatang.net/' + post_url
driver.get(post_page_url) try:
time.sleep(3) resp = session.get(post_page_url, timeout=15)
resp.raise_for_status()
post_html = driver.page_source post_html = resp.text
except Exception as e:
logger.warning(f"获取帖子内容失败: {e}")
continue
post_soup = BeautifulSoup(post_html, 'html.parser', from_encoding='utf-8') post_soup = BeautifulSoup(post_html, 'html.parser', from_encoding='utf-8')
content_div = post_soup.find('div', {'class': 't_fsz'}) content_div = post_soup.find('div', {'class': 't_fsz'})
@@ -284,4 +300,4 @@ def pdf_file_path():
if __name__ == "__main__": if __name__ == "__main__":
pdf_file_path() pdf_file_path()