使用新的网址

2025-12-22 15:45:40 +08:00
parent d5d7a2a34d
commit 8a68338ffe
2 changed files with 38 additions and 22 deletions
--- a/utils/sehuatang/sehuatang_bot.py
+++ b/utils/sehuatang/sehuatang_bot.py
@@ -132,7 +132,7 @@ class SehuatangCrawler:

    def bypass_age_verification(self):
        try:
-            self.driver.get("https://www.sehuatang.net/forum.php")
+            self.driver.get("https://www.sehuatang.org/forum.php")
            time.sleep(2)
            try:
                enter_button = self.driver.find_element(By.XPATH, '//a[contains(text(), "满18岁")]')
@@ -332,7 +332,7 @@ class SehuatangCrawler:
            if RUN_MODE == 'daily' and consecutive_old_posts > 20:
                break

-            list_url = f"https://www.sehuatang.net/forum-{fid}-{page}.html"
+            list_url = f"https://www.sehuatang.org/forum-{fid}-{page}.html"
            logger.info(f"正在爬取第 {page} 页")

            try:
@@ -369,7 +369,7 @@ class SehuatangCrawler:
                            continue

                        partial_url = title_tag.get('href')
-                        full_url = f"https://www.sehuatang.net/{partial_url}"
+                        full_url = f"https://www.sehuatang.org/{partial_url}"

                        # 获取详情页数据（含女优）
                        magnet, cover, body_actress = self.parse_detail_page(full_url)
--- a/utils/sehuatang/shehuatang.py
+++ b/utils/sehuatang/shehuatang.py
@@ -6,6 +6,8 @@ from selenium import webdriver
 from selenium.webdriver.chrome.service import Service
 from selenium.webdriver.common.by import By
 from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
 from webdriver_manager.chrome import ChromeDriverManager
 from bs4 import BeautifulSoup
 from reportlab.lib.pagesizes import letter, A3
@@ -52,6 +54,9 @@ def fetch_and_create_pdf(url):
        options.add_argument('--disable-gpu')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')  # 添加Linux特定配置
+        options.add_argument('--disable-logging')
+        options.add_argument('--log-level=3')
+        options.add_experimental_option('excludeSwitches', ['enable-automation', 'enable-logging'])
        
        # 根据操作系统选择不同的ChromeDriver路径处理方式
        if os.name == 'nt':  # Windows
@@ -65,25 +70,23 @@ def fetch_and_create_pdf(url):
        try:
            if os.name == 'nt' and not os.path.exists(chrome_driver_path):
                chrome_driver_path = ChromeDriverManager().install()
-            
-            driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options)
+            service = Service(chrome_driver_path, log_path=os.devnull)
+            driver = webdriver.Chrome(service=service, options=options)
        except Exception as e:
            logger.debug(f"初始化ChromeDriver失败: {e}")
            chrome_driver_path = ChromeDriverManager().install()
-            driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options)
-        
-        # 如果本地没有chromedriver.exe，则使用webdriver_manager下载一次
-        if not os.path.exists(chrome_driver_path):
-            chrome_driver_path = ChromeDriverManager().install()
-            logger.debug(f"ChromeDriver已下载到: {chrome_driver_path}")
-        else:
-            logger.debug(f"使用本地ChromeDriver: {chrome_driver_path}")
-            
-        driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options)
+            service = Service(chrome_driver_path, log_path=os.devnull)
+            driver = webdriver.Chrome(service=service, options=options)

        # 获取目标页面
        driver.get(url)
-        time.sleep(10)
+        try:
+            enter_button = WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.XPATH, '//a[contains(text(), "满18岁，请点此进入")]')))
+            enter_button.click()
+            logger.debug("点击了满18岁按钮")
+        except Exception as e:
+            logger.warning(f"未找到满18岁按钮，跳过此步骤: {e}")
+        WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'tbody[id^="normalthread"]')))

        # 处理年龄验证按钮
        try:
@@ -92,7 +95,7 @@ def fetch_and_create_pdf(url):
            logger.debug("点击了满18岁按钮")
            time.sleep(5)
        except Exception as e:
-            logger.warning("未找到满18岁按钮，跳过此步骤", e)
+            logger.warning(f"未找到满18岁按钮，跳过此步骤: {e}")

        # 解析页面
        html = driver.page_source
@@ -144,6 +147,16 @@ def fetch_and_create_pdf(url):
        max_image_height = content_height * 0.7  # 留出足够空间给文本和其他元素

        # 遍历帖子
+        session = requests.Session()
+        session.headers.update({
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+            'Referer': 'https://www.sehuatang.net/'
+        })
+        for c in driver.get_cookies():
+            try:
+                session.cookies.set(c['name'], c['value'], domain=c.get('domain'), path=c.get('path', '/'))
+            except Exception:
+                session.cookies.set(c['name'], c['value'])
        for post in today_posts:
            title = post.find('a', {'class': 's xst'})
            if title:
@@ -153,10 +166,13 @@ def fetch_and_create_pdf(url):

                # 获取帖子内容
                post_page_url = 'https://www.sehuatang.net/' + post_url
-                driver.get(post_page_url)
-                time.sleep(3)
-
-                post_html = driver.page_source
+                try:
+                    resp = session.get(post_page_url, timeout=15)
+                    resp.raise_for_status()
+                    post_html = resp.text
+                except Exception as e:
+                    logger.warning(f"获取帖子内容失败: {e}")
+                    continue
                post_soup = BeautifulSoup(post_html, 'html.parser', from_encoding='utf-8')
                content_div = post_soup.find('div', {'class': 't_fsz'})

@@ -284,4 +300,4 @@ def pdf_file_path():


 if __name__ == "__main__":
-    pdf_file_path()
+    pdf_file_path()