使用新的网址
This commit is contained in:
@@ -132,7 +132,7 @@ class SehuatangCrawler:
|
||||
|
||||
def bypass_age_verification(self):
|
||||
try:
|
||||
- self.driver.get("https://www.sehuatang.net/forum.php")
|
||||
+ self.driver.get("https://www.sehuatang.org/forum.php")
|
||||
time.sleep(2)
|
||||
try:
|
||||
enter_button = self.driver.find_element(By.XPATH, '//a[contains(text(), "满18岁")]')
|
||||
@@ -332,7 +332,7 @@ class SehuatangCrawler:
|
||||
if RUN_MODE == 'daily' and consecutive_old_posts > 20:
|
||||
break
|
||||
|
||||
- list_url = f"https://www.sehuatang.net/forum-{fid}-{page}.html"
|
||||
+ list_url = f"https://www.sehuatang.org/forum-{fid}-{page}.html"
|
||||
logger.info(f"正在爬取第 {page} 页")
|
||||
|
||||
try:
|
||||
@@ -369,7 +369,7 @@ class SehuatangCrawler:
|
||||
continue
|
||||
|
||||
partial_url = title_tag.get('href')
|
||||
- full_url = f"https://www.sehuatang.net/{partial_url}"
|
||||
+ full_url = f"https://www.sehuatang.org/{partial_url}"
|
||||
|
||||
# 获取详情页数据(含女优)
|
||||
magnet, cover, body_actress = self.parse_detail_page(full_url)
|
||||
|
||||
@@ -6,6 +6,8 @@ from selenium import webdriver
|
||||
from selenium.webdriver.chrome.service import Service
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.chrome.options import Options
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from webdriver_manager.chrome import ChromeDriverManager
|
||||
from bs4 import BeautifulSoup
|
||||
from reportlab.lib.pagesizes import letter, A3
|
||||
@@ -52,6 +54,9 @@ def fetch_and_create_pdf(url):
|
||||
options.add_argument('--disable-gpu')
|
||||
options.add_argument('--no-sandbox')
|
||||
options.add_argument('--disable-dev-shm-usage') # 添加Linux特定配置
|
||||
options.add_argument('--disable-logging')
|
||||
options.add_argument('--log-level=3')
|
||||
options.add_experimental_option('excludeSwitches', ['enable-automation', 'enable-logging'])
|
||||
|
||||
# 根据操作系统选择不同的ChromeDriver路径处理方式
|
||||
if os.name == 'nt': # Windows
|
||||
@@ -65,25 +70,23 @@ def fetch_and_create_pdf(url):
|
||||
try:
|
||||
if os.name == 'nt' and not os.path.exists(chrome_driver_path):
|
||||
chrome_driver_path = ChromeDriverManager().install()
|
||||
|
||||
driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options)
|
||||
service = Service(chrome_driver_path, log_path=os.devnull)
|
||||
driver = webdriver.Chrome(service=service, options=options)
|
||||
except Exception as e:
|
||||
logger.debug(f"初始化ChromeDriver失败: {e}")
|
||||
chrome_driver_path = ChromeDriverManager().install()
|
||||
driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options)
|
||||
|
||||
# 如果本地没有chromedriver.exe,则使用webdriver_manager下载一次
|
||||
if not os.path.exists(chrome_driver_path):
|
||||
chrome_driver_path = ChromeDriverManager().install()
|
||||
logger.debug(f"ChromeDriver已下载到: {chrome_driver_path}")
|
||||
else:
|
||||
logger.debug(f"使用本地ChromeDriver: {chrome_driver_path}")
|
||||
|
||||
driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options)
|
||||
service = Service(chrome_driver_path, log_path=os.devnull)
|
||||
driver = webdriver.Chrome(service=service, options=options)
|
||||
|
||||
# 获取目标页面
|
||||
driver.get(url)
|
||||
time.sleep(10)
|
||||
try:
|
||||
enter_button = WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.XPATH, '//a[contains(text(), "满18岁,请点此进入")]')))
|
||||
enter_button.click()
|
||||
logger.debug("点击了满18岁按钮")
|
||||
except Exception as e:
|
||||
logger.warning(f"未找到满18岁按钮,跳过此步骤: {e}")
|
||||
WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'tbody[id^="normalthread"]')))
|
||||
|
||||
# 处理年龄验证按钮
|
||||
try:
|
||||
@@ -92,7 +95,7 @@ def fetch_and_create_pdf(url):
|
||||
logger.debug("点击了满18岁按钮")
|
||||
time.sleep(5)
|
||||
except Exception as e:
|
||||
- logger.warning("未找到满18岁按钮,跳过此步骤", e)
|
||||
+ logger.warning(f"未找到满18岁按钮,跳过此步骤: {e}")
|
||||
|
||||
# 解析页面
|
||||
html = driver.page_source
|
||||
@@ -144,6 +147,16 @@ def fetch_and_create_pdf(url):
|
||||
max_image_height = content_height * 0.7 # 留出足够空间给文本和其他元素
|
||||
|
||||
# 遍历帖子
|
||||
session = requests.Session()
|
||||
session.headers.update({
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
|
||||
'Referer': 'https://www.sehuatang.net/'
|
||||
})
|
||||
for c in driver.get_cookies():
|
||||
try:
|
||||
session.cookies.set(c['name'], c['value'], domain=c.get('domain'), path=c.get('path', '/'))
|
||||
except Exception:
|
||||
session.cookies.set(c['name'], c['value'])
|
||||
for post in today_posts:
|
||||
title = post.find('a', {'class': 's xst'})
|
||||
if title:
|
||||
@@ -153,10 +166,13 @@ def fetch_and_create_pdf(url):
|
||||
|
||||
# 获取帖子内容
|
||||
post_page_url = 'https://www.sehuatang.net/' + post_url
|
||||
driver.get(post_page_url)
|
||||
time.sleep(3)
|
||||
|
||||
post_html = driver.page_source
|
||||
try:
|
||||
resp = session.get(post_page_url, timeout=15)
|
||||
resp.raise_for_status()
|
||||
post_html = resp.text
|
||||
except Exception as e:
|
||||
logger.warning(f"获取帖子内容失败: {e}")
|
||||
continue
|
||||
post_soup = BeautifulSoup(post_html, 'html.parser', from_encoding='utf-8')
|
||||
content_div = post_soup.find('div', {'class': 't_fsz'})
|
||||
|
||||
@@ -284,4 +300,4 @@ def pdf_file_path():
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
- pdf_file_path()
|
||||
+ pdf_file_path()
|
||||
|
||||
Reference in New Issue
Block a user