去掉sehuatang自动处理
This commit is contained in:
6
main.py
6
main.py
@@ -110,9 +110,9 @@ def jobs(robot: Robot):
|
|||||||
await robot.generate_and_send_ranking()
|
await robot.generate_and_send_ranking()
|
||||||
|
|
||||||
# ✅ 每天 15:30 发涩图 PDF
|
# ✅ 每天 15:30 发涩图 PDF
|
||||||
@async_job.at_times(["15:30"])
|
# @async_job.at_times(["15:30"])
|
||||||
async def sehuatang_pdf_job():
|
# async def sehuatang_pdf_job():
|
||||||
await robot.generate_sehuatang_pdf()
|
# await robot.generate_sehuatang_pdf()
|
||||||
|
|
||||||
# ✅ 每天 01:30 下载秀人网帖子
|
# ✅ 每天 01:30 下载秀人网帖子
|
||||||
@async_job.at_times(["01:30"])
|
@async_job.at_times(["01:30"])
|
||||||
|
|||||||
@@ -7,6 +7,8 @@ from selenium import webdriver
|
|||||||
from selenium.webdriver.chrome.service import Service
|
from selenium.webdriver.chrome.service import Service
|
||||||
from selenium.webdriver.common.by import By
|
from selenium.webdriver.common.by import By
|
||||||
from selenium.webdriver.chrome.options import Options
|
from selenium.webdriver.chrome.options import Options
|
||||||
|
from selenium.webdriver.support.ui import WebDriverWait
|
||||||
|
from selenium.webdriver.support import expected_conditions as EC
|
||||||
from webdriver_manager.chrome import ChromeDriverManager
|
from webdriver_manager.chrome import ChromeDriverManager
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
@@ -15,7 +17,7 @@ from datetime import datetime, timedelta
|
|||||||
# ================= 配置区域 =================
|
# ================= 配置区域 =================
|
||||||
|
|
||||||
# 运行模式: 'full' (全量) 或 'daily' (增量)
|
# 运行模式: 'full' (全量) 或 'daily' (增量)
|
||||||
RUN_MODE = 'full'
|
RUN_MODE = 'daily'
|
||||||
|
|
||||||
DB_CONFIG = {
|
DB_CONFIG = {
|
||||||
'host': '192.168.2.41',
|
'host': '192.168.2.41',
|
||||||
@@ -111,9 +113,16 @@ class SehuatangCrawler:
|
|||||||
options = Options()
|
options = Options()
|
||||||
# options.add_argument('--headless')
|
# options.add_argument('--headless')
|
||||||
options.add_argument('--disable-gpu')
|
options.add_argument('--disable-gpu')
|
||||||
|
options.add_argument('--headless') # 使用新的headless模式
|
||||||
options.add_argument('--no-sandbox')
|
options.add_argument('--no-sandbox')
|
||||||
options.add_argument('--disable-dev-shm-usage')
|
options.add_argument('--disable-dev-shm-usage')
|
||||||
options.add_argument('--blink-settings=imagesEnabled=false')
|
options.add_argument('--blink-settings=imagesEnabled=false')
|
||||||
|
options.add_argument('--disable-logging')
|
||||||
|
options.add_argument('--log-level=3')
|
||||||
|
options.add_argument('--disable-blink-features=AutomationControlled')
|
||||||
|
options.add_experimental_option('excludeSwitches', ['enable-automation', 'enable-logging'])
|
||||||
|
options.add_experimental_option('useAutomationExtension', False)
|
||||||
|
options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36')
|
||||||
|
|
||||||
if os.name == 'nt':
|
if os.name == 'nt':
|
||||||
chrome_driver_path = os.path.join(os.getcwd(), "utils", "chromedriver", "chromedriver.exe")
|
chrome_driver_path = os.path.join(os.getcwd(), "utils", "chromedriver", "chromedriver.exe")
|
||||||
@@ -123,24 +132,28 @@ class SehuatangCrawler:
|
|||||||
try:
|
try:
|
||||||
if os.name == 'nt' and not os.path.exists(chrome_driver_path):
|
if os.name == 'nt' and not os.path.exists(chrome_driver_path):
|
||||||
chrome_driver_path = ChromeDriverManager().install()
|
chrome_driver_path = ChromeDriverManager().install()
|
||||||
service = Service(chrome_driver_path)
|
service = Service(chrome_driver_path, log_path=os.devnull)
|
||||||
driver = webdriver.Chrome(service=service, options=options)
|
driver = webdriver.Chrome(service=service, options=options)
|
||||||
except Exception:
|
except Exception:
|
||||||
chrome_driver_path = ChromeDriverManager().install()
|
chrome_driver_path = ChromeDriverManager().install()
|
||||||
driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options)
|
driver = webdriver.Chrome(service=Service(chrome_driver_path, log_path=os.devnull), options=options)
|
||||||
|
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {"source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"})
|
||||||
return driver
|
return driver
|
||||||
|
|
||||||
def bypass_age_verification(self):
|
def bypass_age_verification(self):
|
||||||
try:
|
try:
|
||||||
self.driver.get("https://www.sehuatang.org/forum.php")
|
self.driver.get("https://www.sehuatang.org/")
|
||||||
time.sleep(2)
|
|
||||||
try:
|
try:
|
||||||
enter_button = self.driver.find_element(By.XPATH, '//a[contains(text(), "满18岁")]')
|
btn = WebDriverWait(self.driver, 6).until(EC.element_to_be_clickable((By.XPATH, '//a[contains(text(), "满18岁,请点此进入")]')))
|
||||||
enter_button.click()
|
btn.click()
|
||||||
logger.success("通过年龄验证")
|
logger.success("通过年龄验证")
|
||||||
time.sleep(2)
|
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
try:
|
||||||
|
btn2 = WebDriverWait(self.driver, 4).until(EC.element_to_be_clickable((By.XPATH, '//a[contains(text(), "满18岁")]')))
|
||||||
|
btn2.click()
|
||||||
|
logger.success("通过年龄验证")
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"主页访问异常: {e}")
|
logger.warning(f"主页访问异常: {e}")
|
||||||
|
|
||||||
@@ -337,7 +350,7 @@ class SehuatangCrawler:
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
self.driver.get(list_url)
|
self.driver.get(list_url)
|
||||||
time.sleep(0.5)
|
WebDriverWait(self.driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'tbody[id^="normalthread"]')))
|
||||||
|
|
||||||
soup = BeautifulSoup(self.driver.page_source, 'html.parser')
|
soup = BeautifulSoup(self.driver.page_source, 'html.parser')
|
||||||
threads = soup.find_all('tbody', {'id': lambda x: x and x.startswith('normalthread')})
|
threads = soup.find_all('tbody', {'id': lambda x: x and x.startswith('normalthread')})
|
||||||
@@ -419,4 +432,4 @@ class SehuatangCrawler:
|
|||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
crawler = SehuatangCrawler()
|
crawler = SehuatangCrawler()
|
||||||
crawler.run()
|
crawler.run()
|
||||||
|
|||||||
Reference in New Issue
Block a user