From 8a68338ffe68735552bc58e41d13b75902d284a5 Mon Sep 17 00:00:00 2001 From: liuwei Date: Mon, 22 Dec 2025 15:45:40 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BD=BF=E7=94=A8=E6=96=B0=E7=9A=84=E7=BD=91?= =?UTF-8?q?=E5=9D=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- utils/sehuatang/sehuatang_bot.py | 6 ++-- utils/sehuatang/shehuatang.py | 54 +++++++++++++++++++++----------- 2 files changed, 38 insertions(+), 22 deletions(-) diff --git a/utils/sehuatang/sehuatang_bot.py b/utils/sehuatang/sehuatang_bot.py index d099d53..5dde251 100644 --- a/utils/sehuatang/sehuatang_bot.py +++ b/utils/sehuatang/sehuatang_bot.py @@ -132,7 +132,7 @@ class SehuatangCrawler: def bypass_age_verification(self): try: - self.driver.get("https://www.sehuatang.net/forum.php") + self.driver.get("https://www.sehuatang.org/forum.php") time.sleep(2) try: enter_button = self.driver.find_element(By.XPATH, '//a[contains(text(), "满18岁")]') @@ -332,7 +332,7 @@ class SehuatangCrawler: if RUN_MODE == 'daily' and consecutive_old_posts > 20: break - list_url = f"https://www.sehuatang.net/forum-{fid}-{page}.html" + list_url = f"https://www.sehuatang.org/forum-{fid}-{page}.html" logger.info(f"正在爬取第 {page} 页") try: @@ -369,7 +369,7 @@ class SehuatangCrawler: continue partial_url = title_tag.get('href') - full_url = f"https://www.sehuatang.net/{partial_url}" + full_url = f"https://www.sehuatang.org/{partial_url}" # 获取详情页数据(含女优) magnet, cover, body_actress = self.parse_detail_page(full_url) diff --git a/utils/sehuatang/shehuatang.py b/utils/sehuatang/shehuatang.py index 89dc7fb..5879548 100644 --- a/utils/sehuatang/shehuatang.py +++ b/utils/sehuatang/shehuatang.py @@ -6,6 +6,8 @@ from selenium import webdriver from selenium.webdriver.chrome.service import Service from selenium.webdriver.common.by import By from selenium.webdriver.chrome.options import Options +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support import expected_conditions as EC from webdriver_manager.chrome import ChromeDriverManager from bs4 import BeautifulSoup from reportlab.lib.pagesizes import letter, A3 @@ -52,6 +54,9 @@ def fetch_and_create_pdf(url): options.add_argument('--disable-gpu') options.add_argument('--no-sandbox') options.add_argument('--disable-dev-shm-usage') # 添加Linux特定配置 + options.add_argument('--disable-logging') + options.add_argument('--log-level=3') + options.add_experimental_option('excludeSwitches', ['enable-automation', 'enable-logging']) # 根据操作系统选择不同的ChromeDriver路径处理方式 if os.name == 'nt': # Windows @@ -65,25 +70,23 @@ def fetch_and_create_pdf(url): try: if os.name == 'nt' and not os.path.exists(chrome_driver_path): chrome_driver_path = ChromeDriverManager().install() - - driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options) + service = Service(chrome_driver_path, log_path=os.devnull) + driver = webdriver.Chrome(service=service, options=options) except Exception as e: logger.debug(f"初始化ChromeDriver失败: {e}") chrome_driver_path = ChromeDriverManager().install() - driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options) - - # 如果本地没有chromedriver.exe,则使用webdriver_manager下载一次 - if not os.path.exists(chrome_driver_path): - chrome_driver_path = ChromeDriverManager().install() - logger.debug(f"ChromeDriver已下载到: {chrome_driver_path}") - else: - logger.debug(f"使用本地ChromeDriver: {chrome_driver_path}") - - driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options) + service = Service(chrome_driver_path, log_path=os.devnull) + driver = webdriver.Chrome(service=service, options=options) # 获取目标页面 driver.get(url) - time.sleep(10) + try: + enter_button = WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.XPATH, '//a[contains(text(), "满18岁,请点此进入")]'))) + enter_button.click() + logger.debug("点击了满18岁按钮") + except Exception as e: + logger.warning(f"未找到满18岁按钮,跳过此步骤: {e}") + WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'tbody[id^="normalthread"]'))) # 处理年龄验证按钮 try: @@ -92,7 +95,7 @@ def fetch_and_create_pdf(url): logger.debug("点击了满18岁按钮") time.sleep(5) except Exception as e: - logger.warning("未找到满18岁按钮,跳过此步骤", e) + logger.warning(f"未找到满18岁按钮,跳过此步骤: {e}") # 解析页面 html = driver.page_source @@ -144,6 +147,16 @@ def fetch_and_create_pdf(url): max_image_height = content_height * 0.7 # 留出足够空间给文本和其他元素 # 遍历帖子 + session = requests.Session() + session.headers.update({ + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', + 'Referer': 'https://www.sehuatang.net/' + }) + for c in driver.get_cookies(): + try: + session.cookies.set(c['name'], c['value'], domain=c.get('domain'), path=c.get('path', '/')) + except Exception: + session.cookies.set(c['name'], c['value']) for post in today_posts: title = post.find('a', {'class': 's xst'}) if title: @@ -153,10 +166,13 @@ def fetch_and_create_pdf(url): # 获取帖子内容 post_page_url = 'https://www.sehuatang.net/' + post_url - driver.get(post_page_url) - time.sleep(3) - - post_html = driver.page_source + try: + resp = session.get(post_page_url, timeout=15) + resp.raise_for_status() + post_html = resp.text + except Exception as e: + logger.warning(f"获取帖子内容失败: {e}") + continue post_soup = BeautifulSoup(post_html, 'html.parser', from_encoding='utf-8') content_div = post_soup.find('div', {'class': 't_fsz'}) @@ -284,4 +300,4 @@ def pdf_file_path(): if __name__ == "__main__": - pdf_file_path() \ No newline at end of file + pdf_file_path()