diff --git a/main.py b/main.py index ea1fbe0..d2629c4 100644 --- a/main.py +++ b/main.py @@ -110,9 +110,9 @@ def jobs(robot: Robot): await robot.generate_and_send_ranking() # ✅ 每天 15:30 发涩图 PDF - @async_job.at_times(["15:30"]) - async def sehuatang_pdf_job(): - await robot.generate_sehuatang_pdf() + # @async_job.at_times(["15:30"]) + # async def sehuatang_pdf_job(): + # await robot.generate_sehuatang_pdf() # ✅ 每天 01:30 下载秀人网帖子 @async_job.at_times(["01:30"]) diff --git a/utils/sehuatang/sehuatang_bot.py b/utils/sehuatang/sehuatang_bot.py index 5dde251..2988007 100644 --- a/utils/sehuatang/sehuatang_bot.py +++ b/utils/sehuatang/sehuatang_bot.py @@ -7,6 +7,8 @@ from selenium import webdriver from selenium.webdriver.chrome.service import Service from selenium.webdriver.common.by import By from selenium.webdriver.chrome.options import Options +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support import expected_conditions as EC from webdriver_manager.chrome import ChromeDriverManager from bs4 import BeautifulSoup from loguru import logger @@ -15,7 +17,7 @@ from datetime import datetime, timedelta # ================= 配置区域 ================= # 运行模式: 'full' (全量) 或 'daily' (增量) -RUN_MODE = 'full' +RUN_MODE = 'daily' DB_CONFIG = { 'host': '192.168.2.41', @@ -111,9 +113,16 @@ class SehuatangCrawler: options = Options() # options.add_argument('--headless') options.add_argument('--disable-gpu') + options.add_argument('--headless') # 使用新的headless模式 options.add_argument('--no-sandbox') options.add_argument('--disable-dev-shm-usage') options.add_argument('--blink-settings=imagesEnabled=false') + options.add_argument('--disable-logging') + options.add_argument('--log-level=3') + options.add_argument('--disable-blink-features=AutomationControlled') + options.add_experimental_option('excludeSwitches', ['enable-automation', 'enable-logging']) + options.add_experimental_option('useAutomationExtension', False) + options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36') if os.name == 'nt': chrome_driver_path = os.path.join(os.getcwd(), "utils", "chromedriver", "chromedriver.exe") @@ -123,24 +132,28 @@ class SehuatangCrawler: try: if os.name == 'nt' and not os.path.exists(chrome_driver_path): chrome_driver_path = ChromeDriverManager().install() - service = Service(chrome_driver_path) + service = Service(chrome_driver_path, log_path=os.devnull) driver = webdriver.Chrome(service=service, options=options) except Exception: chrome_driver_path = ChromeDriverManager().install() - driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options) + driver = webdriver.Chrome(service=Service(chrome_driver_path, log_path=os.devnull), options=options) + driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {"source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"}) return driver def bypass_age_verification(self): try: - self.driver.get("https://www.sehuatang.org/forum.php") - time.sleep(2) + self.driver.get("https://www.sehuatang.org/") try: - enter_button = self.driver.find_element(By.XPATH, '//a[contains(text(), "满18岁")]') - enter_button.click() + btn = WebDriverWait(self.driver, 6).until(EC.element_to_be_clickable((By.XPATH, '//a[contains(text(), "满18岁,请点此进入")]'))) + btn.click() logger.success("通过年龄验证") - time.sleep(2) except Exception: - pass + try: + btn2 = WebDriverWait(self.driver, 4).until(EC.element_to_be_clickable((By.XPATH, '//a[contains(text(), "满18岁")]'))) + btn2.click() + logger.success("通过年龄验证") + except Exception: + pass except Exception as e: logger.warning(f"主页访问异常: {e}") @@ -337,7 +350,7 @@ class SehuatangCrawler: try: self.driver.get(list_url) - time.sleep(0.5) + WebDriverWait(self.driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'tbody[id^="normalthread"]'))) soup = BeautifulSoup(self.driver.page_source, 'html.parser') threads = soup.find_all('tbody', {'id': lambda x: x and x.startswith('normalthread')}) @@ -419,4 +432,4 @@ class SehuatangCrawler: if __name__ == "__main__": crawler = SehuatangCrawler() - crawler.run() \ No newline at end of file + crawler.run()