去掉sehuatang自动处理

This commit is contained in:
liuwei
2025-12-22 16:20:55 +08:00
parent c1f927a425
commit 190271e239
2 changed files with 27 additions and 14 deletions

View File

@@ -110,9 +110,9 @@ def jobs(robot: Robot):
await robot.generate_and_send_ranking()
# ✅ 每天 15:30 发涩图 PDF
@async_job.at_times(["15:30"])
async def sehuatang_pdf_job():
await robot.generate_sehuatang_pdf()
# @async_job.at_times(["15:30"])
# async def sehuatang_pdf_job():
# await robot.generate_sehuatang_pdf()
# ✅ 每天 01:30 下载秀人网帖子
@async_job.at_times(["01:30"])

View File

@@ -7,6 +7,8 @@ from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from loguru import logger
@@ -15,7 +17,7 @@ from datetime import datetime, timedelta
# ================= 配置区域 =================
# 运行模式: 'full' (全量) 或 'daily' (增量)
RUN_MODE = 'full'
RUN_MODE = 'daily'
DB_CONFIG = {
'host': '192.168.2.41',
@@ -111,9 +113,16 @@ class SehuatangCrawler:
options = Options()
# options.add_argument('--headless')
options.add_argument('--disable-gpu')
options.add_argument('--headless') # 使用新的headless模式
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument('--blink-settings=imagesEnabled=false')
options.add_argument('--disable-logging')
options.add_argument('--log-level=3')
options.add_argument('--disable-blink-features=AutomationControlled')
options.add_experimental_option('excludeSwitches', ['enable-automation', 'enable-logging'])
options.add_experimental_option('useAutomationExtension', False)
options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36')
if os.name == 'nt':
chrome_driver_path = os.path.join(os.getcwd(), "utils", "chromedriver", "chromedriver.exe")
@@ -123,24 +132,28 @@ class SehuatangCrawler:
try:
if os.name == 'nt' and not os.path.exists(chrome_driver_path):
chrome_driver_path = ChromeDriverManager().install()
service = Service(chrome_driver_path)
service = Service(chrome_driver_path, log_path=os.devnull)
driver = webdriver.Chrome(service=service, options=options)
except Exception:
chrome_driver_path = ChromeDriverManager().install()
driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options)
driver = webdriver.Chrome(service=Service(chrome_driver_path, log_path=os.devnull), options=options)
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {"source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"})
return driver
def bypass_age_verification(self):
try:
self.driver.get("https://www.sehuatang.org/forum.php")
time.sleep(2)
self.driver.get("https://www.sehuatang.org/")
try:
enter_button = self.driver.find_element(By.XPATH, '//a[contains(text(), "满18岁")]')
enter_button.click()
btn = WebDriverWait(self.driver, 6).until(EC.element_to_be_clickable((By.XPATH, '//a[contains(text(), "满18岁,请点此进入")]')))
btn.click()
logger.success("通过年龄验证")
time.sleep(2)
except Exception:
pass
try:
btn2 = WebDriverWait(self.driver, 4).until(EC.element_to_be_clickable((By.XPATH, '//a[contains(text(), "满18岁")]')))
btn2.click()
logger.success("通过年龄验证")
except Exception:
pass
except Exception as e:
logger.warning(f"主页访问异常: {e}")
@@ -337,7 +350,7 @@ class SehuatangCrawler:
try:
self.driver.get(list_url)
time.sleep(0.5)
WebDriverWait(self.driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'tbody[id^="normalthread"]')))
soup = BeautifulSoup(self.driver.page_source, 'html.parser')
threads = soup.find_all('tbody', {'id': lambda x: x and x.startswith('normalthread')})
@@ -419,4 +432,4 @@ class SehuatangCrawler:
if __name__ == "__main__":
crawler = SehuatangCrawler()
crawler.run()
crawler.run()