去掉sehuatang自动处理

This commit is contained in:
liuwei
2025-12-22 16:20:55 +08:00
parent c1f927a425
commit 190271e239
2 changed files with 27 additions and 14 deletions

View File

@@ -110,9 +110,9 @@ def jobs(robot: Robot):
await robot.generate_and_send_ranking() await robot.generate_and_send_ranking()
# ✅ 每天 15:30 发涩图 PDF # ✅ 每天 15:30 发涩图 PDF
@async_job.at_times(["15:30"]) # @async_job.at_times(["15:30"])
async def sehuatang_pdf_job(): # async def sehuatang_pdf_job():
await robot.generate_sehuatang_pdf() # await robot.generate_sehuatang_pdf()
# ✅ 每天 01:30 下载秀人网帖子 # ✅ 每天 01:30 下载秀人网帖子
@async_job.at_times(["01:30"]) @async_job.at_times(["01:30"])

View File

@@ -7,6 +7,8 @@ from selenium import webdriver
from selenium.webdriver.chrome.service import Service from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from loguru import logger from loguru import logger
@@ -15,7 +17,7 @@ from datetime import datetime, timedelta
# ================= 配置区域 ================= # ================= 配置区域 =================
# 运行模式: 'full' (全量) 或 'daily' (增量) # 运行模式: 'full' (全量) 或 'daily' (增量)
RUN_MODE = 'full' RUN_MODE = 'daily'
DB_CONFIG = { DB_CONFIG = {
'host': '192.168.2.41', 'host': '192.168.2.41',
@@ -111,9 +113,16 @@ class SehuatangCrawler:
options = Options() options = Options()
# options.add_argument('--headless') # options.add_argument('--headless')
options.add_argument('--disable-gpu') options.add_argument('--disable-gpu')
options.add_argument('--headless') # 使用新的headless模式
options.add_argument('--no-sandbox') options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage') options.add_argument('--disable-dev-shm-usage')
options.add_argument('--blink-settings=imagesEnabled=false') options.add_argument('--blink-settings=imagesEnabled=false')
options.add_argument('--disable-logging')
options.add_argument('--log-level=3')
options.add_argument('--disable-blink-features=AutomationControlled')
options.add_experimental_option('excludeSwitches', ['enable-automation', 'enable-logging'])
options.add_experimental_option('useAutomationExtension', False)
options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36')
if os.name == 'nt': if os.name == 'nt':
chrome_driver_path = os.path.join(os.getcwd(), "utils", "chromedriver", "chromedriver.exe") chrome_driver_path = os.path.join(os.getcwd(), "utils", "chromedriver", "chromedriver.exe")
@@ -123,24 +132,28 @@ class SehuatangCrawler:
try: try:
if os.name == 'nt' and not os.path.exists(chrome_driver_path): if os.name == 'nt' and not os.path.exists(chrome_driver_path):
chrome_driver_path = ChromeDriverManager().install() chrome_driver_path = ChromeDriverManager().install()
service = Service(chrome_driver_path) service = Service(chrome_driver_path, log_path=os.devnull)
driver = webdriver.Chrome(service=service, options=options) driver = webdriver.Chrome(service=service, options=options)
except Exception: except Exception:
chrome_driver_path = ChromeDriverManager().install() chrome_driver_path = ChromeDriverManager().install()
driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options) driver = webdriver.Chrome(service=Service(chrome_driver_path, log_path=os.devnull), options=options)
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {"source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"})
return driver return driver
def bypass_age_verification(self): def bypass_age_verification(self):
try: try:
self.driver.get("https://www.sehuatang.org/forum.php") self.driver.get("https://www.sehuatang.org/")
time.sleep(2)
try: try:
enter_button = self.driver.find_element(By.XPATH, '//a[contains(text(), "满18岁")]') btn = WebDriverWait(self.driver, 6).until(EC.element_to_be_clickable((By.XPATH, '//a[contains(text(), "满18岁,请点此进入")]')))
enter_button.click() btn.click()
logger.success("通过年龄验证") logger.success("通过年龄验证")
time.sleep(2)
except Exception: except Exception:
pass try:
btn2 = WebDriverWait(self.driver, 4).until(EC.element_to_be_clickable((By.XPATH, '//a[contains(text(), "满18岁")]')))
btn2.click()
logger.success("通过年龄验证")
except Exception:
pass
except Exception as e: except Exception as e:
logger.warning(f"主页访问异常: {e}") logger.warning(f"主页访问异常: {e}")
@@ -337,7 +350,7 @@ class SehuatangCrawler:
try: try:
self.driver.get(list_url) self.driver.get(list_url)
time.sleep(0.5) WebDriverWait(self.driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'tbody[id^="normalthread"]')))
soup = BeautifulSoup(self.driver.page_source, 'html.parser') soup = BeautifulSoup(self.driver.page_source, 'html.parser')
threads = soup.find_all('tbody', {'id': lambda x: x and x.startswith('normalthread')}) threads = soup.find_all('tbody', {'id': lambda x: x and x.startswith('normalthread')})
@@ -419,4 +432,4 @@ class SehuatangCrawler:
if __name__ == "__main__": if __name__ == "__main__":
crawler = SehuatangCrawler() crawler = SehuatangCrawler()
crawler.run() crawler.run()