使用新的网址

This commit is contained in:
liuwei
2025-12-22 15:45:40 +08:00
parent d5d7a2a34d
commit 8a68338ffe
2 changed files with 38 additions and 22 deletions

View File

@@ -132,7 +132,7 @@ class SehuatangCrawler:
def bypass_age_verification(self):
try:
self.driver.get("https://www.sehuatang.net/forum.php")
self.driver.get("https://www.sehuatang.org/forum.php")
time.sleep(2)
try:
enter_button = self.driver.find_element(By.XPATH, '//a[contains(text(), "满18岁")]')
@@ -332,7 +332,7 @@ class SehuatangCrawler:
if RUN_MODE == 'daily' and consecutive_old_posts > 20:
break
list_url = f"https://www.sehuatang.net/forum-{fid}-{page}.html"
list_url = f"https://www.sehuatang.org/forum-{fid}-{page}.html"
logger.info(f"正在爬取第 {page}")
try:
@@ -369,7 +369,7 @@ class SehuatangCrawler:
continue
partial_url = title_tag.get('href')
full_url = f"https://www.sehuatang.net/{partial_url}"
full_url = f"https://www.sehuatang.org/{partial_url}"
# 获取详情页数据(含女优)
magnet, cover, body_actress = self.parse_detail_page(full_url)

View File

@@ -6,6 +6,8 @@ from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from reportlab.lib.pagesizes import letter, A3
@@ -52,6 +54,9 @@ def fetch_and_create_pdf(url):
options.add_argument('--disable-gpu')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage') # 添加Linux特定配置
options.add_argument('--disable-logging')
options.add_argument('--log-level=3')
options.add_experimental_option('excludeSwitches', ['enable-automation', 'enable-logging'])
# 根据操作系统选择不同的ChromeDriver路径处理方式
if os.name == 'nt': # Windows
@@ -65,25 +70,23 @@ def fetch_and_create_pdf(url):
try:
if os.name == 'nt' and not os.path.exists(chrome_driver_path):
chrome_driver_path = ChromeDriverManager().install()
driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options)
service = Service(chrome_driver_path, log_path=os.devnull)
driver = webdriver.Chrome(service=service, options=options)
except Exception as e:
logger.debug(f"初始化ChromeDriver失败: {e}")
chrome_driver_path = ChromeDriverManager().install()
driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options)
# 如果本地没有chromedriver.exe则使用webdriver_manager下载一次
if not os.path.exists(chrome_driver_path):
chrome_driver_path = ChromeDriverManager().install()
logger.debug(f"ChromeDriver已下载到: {chrome_driver_path}")
else:
logger.debug(f"使用本地ChromeDriver: {chrome_driver_path}")
driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options)
service = Service(chrome_driver_path, log_path=os.devnull)
driver = webdriver.Chrome(service=service, options=options)
# 获取目标页面
driver.get(url)
time.sleep(10)
try:
enter_button = WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.XPATH, '//a[contains(text(), "满18岁请点此进入")]')))
enter_button.click()
logger.debug("点击了满18岁按钮")
except Exception as e:
logger.warning(f"未找到满18岁按钮跳过此步骤: {e}")
WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'tbody[id^="normalthread"]')))
# 处理年龄验证按钮
try:
@@ -92,7 +95,7 @@ def fetch_and_create_pdf(url):
logger.debug("点击了满18岁按钮")
time.sleep(5)
except Exception as e:
logger.warning("未找到满18岁按钮跳过此步骤", e)
logger.warning(f"未找到满18岁按钮跳过此步骤: {e}")
# 解析页面
html = driver.page_source
@@ -144,6 +147,16 @@ def fetch_and_create_pdf(url):
max_image_height = content_height * 0.7 # 留出足够空间给文本和其他元素
# 遍历帖子
session = requests.Session()
session.headers.update({
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
'Referer': 'https://www.sehuatang.net/'
})
for c in driver.get_cookies():
try:
session.cookies.set(c['name'], c['value'], domain=c.get('domain'), path=c.get('path', '/'))
except Exception:
session.cookies.set(c['name'], c['value'])
for post in today_posts:
title = post.find('a', {'class': 's xst'})
if title:
@@ -153,10 +166,13 @@ def fetch_and_create_pdf(url):
# 获取帖子内容
post_page_url = 'https://www.sehuatang.net/' + post_url
driver.get(post_page_url)
time.sleep(3)
post_html = driver.page_source
try:
resp = session.get(post_page_url, timeout=15)
resp.raise_for_status()
post_html = resp.text
except Exception as e:
logger.warning(f"获取帖子内容失败: {e}")
continue
post_soup = BeautifulSoup(post_html, 'html.parser', from_encoding='utf-8')
content_div = post_soup.find('div', {'class': 't_fsz'})
@@ -284,4 +300,4 @@ def pdf_file_path():
if __name__ == "__main__":
pdf_file_path()
pdf_file_path()