From 8a68338ffe68735552bc58e41d13b75902d284a5 Mon Sep 17 00:00:00 2001
From: liuwei <liuwei@wdtrgf.com.cn>
Date: Mon, 22 Dec 2025 15:45:40 +0800
Subject: [PATCH] =?UTF-8?q?=E4=BD=BF=E7=94=A8=E6=96=B0=E7=9A=84=E7=BD=91?=
 =?UTF-8?q?=E5=9D=80?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 utils/sehuatang/sehuatang_bot.py |  6 ++--
 utils/sehuatang/shehuatang.py    | 54 +++++++++++++++++++++-----------
 2 files changed, 38 insertions(+), 22 deletions(-)

diff --git a/utils/sehuatang/sehuatang_bot.py b/utils/sehuatang/sehuatang_bot.py
index d099d53..5dde251 100644
--- a/utils/sehuatang/sehuatang_bot.py
+++ b/utils/sehuatang/sehuatang_bot.py
@@ -132,7 +132,7 @@ class SehuatangCrawler:
 
     def bypass_age_verification(self):
         try:
-            self.driver.get("https://www.sehuatang.net/forum.php")
+            self.driver.get("https://www.sehuatang.org/forum.php")
             time.sleep(2)
             try:
                 enter_button = self.driver.find_element(By.XPATH, '//a[contains(text(), "满18岁")]')
@@ -332,7 +332,7 @@ class SehuatangCrawler:
             if RUN_MODE == 'daily' and consecutive_old_posts > 20:
                 break
 
-            list_url = f"https://www.sehuatang.net/forum-{fid}-{page}.html"
+            list_url = f"https://www.sehuatang.org/forum-{fid}-{page}.html"
             logger.info(f"正在爬取第 {page} 页")
 
             try:
@@ -369,7 +369,7 @@ class SehuatangCrawler:
                             continue
 
                         partial_url = title_tag.get('href')
-                        full_url = f"https://www.sehuatang.net/{partial_url}"
+                        full_url = f"https://www.sehuatang.org/{partial_url}"
 
                         # 获取详情页数据（含女优）
                         magnet, cover, body_actress = self.parse_detail_page(full_url)
diff --git a/utils/sehuatang/shehuatang.py b/utils/sehuatang/shehuatang.py
index 89dc7fb..5879548 100644
--- a/utils/sehuatang/shehuatang.py
+++ b/utils/sehuatang/shehuatang.py
@@ -6,6 +6,8 @@ from selenium import webdriver
 from selenium.webdriver.chrome.service import Service
 from selenium.webdriver.common.by import By
 from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
 from webdriver_manager.chrome import ChromeDriverManager
 from bs4 import BeautifulSoup
 from reportlab.lib.pagesizes import letter, A3
@@ -52,6 +54,9 @@ def fetch_and_create_pdf(url):
         options.add_argument('--disable-gpu')
         options.add_argument('--no-sandbox')
         options.add_argument('--disable-dev-shm-usage')  # 添加Linux特定配置
+        options.add_argument('--disable-logging')
+        options.add_argument('--log-level=3')
+        options.add_experimental_option('excludeSwitches', ['enable-automation', 'enable-logging'])
         
         # 根据操作系统选择不同的ChromeDriver路径处理方式
         if os.name == 'nt':  # Windows
@@ -65,25 +70,23 @@ def fetch_and_create_pdf(url):
         try:
             if os.name == 'nt' and not os.path.exists(chrome_driver_path):
                 chrome_driver_path = ChromeDriverManager().install()
-            
-            driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options)
+            service = Service(chrome_driver_path, log_path=os.devnull)
+            driver = webdriver.Chrome(service=service, options=options)
         except Exception as e:
             logger.debug(f"初始化ChromeDriver失败: {e}")
             chrome_driver_path = ChromeDriverManager().install()
-            driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options)
-        
-        # 如果本地没有chromedriver.exe，则使用webdriver_manager下载一次
-        if not os.path.exists(chrome_driver_path):
-            chrome_driver_path = ChromeDriverManager().install()
-            logger.debug(f"ChromeDriver已下载到: {chrome_driver_path}")
-        else:
-            logger.debug(f"使用本地ChromeDriver: {chrome_driver_path}")
-            
-        driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options)
+            service = Service(chrome_driver_path, log_path=os.devnull)
+            driver = webdriver.Chrome(service=service, options=options)
 
         # 获取目标页面
         driver.get(url)
-        time.sleep(10)
+        try:
+            enter_button = WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.XPATH, '//a[contains(text(), "满18岁，请点此进入")]')))
+            enter_button.click()
+            logger.debug("点击了满18岁按钮")
+        except Exception as e:
+            logger.warning(f"未找到满18岁按钮，跳过此步骤: {e}")
+        WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'tbody[id^="normalthread"]')))
 
         # 处理年龄验证按钮
         try:
@@ -92,7 +95,7 @@ def fetch_and_create_pdf(url):
             logger.debug("点击了满18岁按钮")
             time.sleep(5)
         except Exception as e:
-            logger.warning("未找到满18岁按钮，跳过此步骤", e)
+            logger.warning(f"未找到满18岁按钮，跳过此步骤: {e}")
 
         # 解析页面
         html = driver.page_source
@@ -144,6 +147,16 @@ def fetch_and_create_pdf(url):
         max_image_height = content_height * 0.7  # 留出足够空间给文本和其他元素
 
         # 遍历帖子
+        session = requests.Session()  # one shared HTTP session for every post-detail request
+        session.headers.update({
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+            'Referer': 'https://www.sehuatang.org/',  # new site domain, as already used in sehuatang_bot.py
+        })
+        for c in driver.get_cookies():  # copy the Selenium browser's cookies into the requests session
+            try:
+                session.cookies.set(c['name'], c['value'], domain=c.get('domain'), path=c.get('path', '/'))
+            except Exception:  # fall back to setting the cookie without domain/path scoping
+                session.cookies.set(c['name'], c['value'])
         for post in today_posts:
             title = post.find('a', {'class': 's xst'})
             if title:
@@ -153,10 +166,13 @@ def fetch_and_create_pdf(url):
 
                 # 获取帖子内容
                 post_page_url = 'https://www.sehuatang.net/' + post_url
-                driver.get(post_page_url)
-                time.sleep(3)
-
-                post_html = driver.page_source
+                try:
+                    resp = session.get(post_page_url, timeout=15)
+                    resp.raise_for_status()
+                    post_html = resp.text
+                except Exception as e:
+                    logger.warning(f"获取帖子内容失败: {e}")
+                    continue
                 post_soup = BeautifulSoup(post_html, 'html.parser', from_encoding='utf-8')
                 content_div = post_soup.find('div', {'class': 't_fsz'})
 
@@ -284,4 +300,4 @@ def pdf_file_path():
 
 
 if __name__ == "__main__":
-    pdf_file_path()
\ No newline at end of file
+    pdf_file_path()