加入手动处理逻辑。
This commit is contained in:
175
utils/sehuatang/shehuatang-undetected.py
Normal file
175
utils/sehuatang/shehuatang-undetected.py
Normal file
@@ -0,0 +1,175 @@
|
|||||||
|
import time
|
||||||
|
import os
|
||||||
|
import requests
|
||||||
|
from io import BytesIO
|
||||||
|
import undetected_chromedriver as uc
|
||||||
|
from selenium.webdriver.common.by import By
|
||||||
|
from selenium.webdriver.support.ui import WebDriverWait
|
||||||
|
from selenium.webdriver.support import expected_conditions as EC
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from reportlab.lib.pagesizes import A3
|
||||||
|
from reportlab.lib import colors
|
||||||
|
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image, PageBreak
|
||||||
|
from reportlab.lib.styles import getSampleStyleSheet
|
||||||
|
from reportlab.pdfbase.ttfonts import TTFont
|
||||||
|
from reportlab.pdfbase import pdfmetrics
|
||||||
|
from datetime import datetime
|
||||||
|
from PIL import Image as PILImage
|
||||||
|
import re
|
||||||
|
from PyPDF2 import PdfReader, PdfWriter
|
||||||
|
from loguru import logger
|
||||||
|
|
||||||
|
|
||||||
|
def download_image(url, session):
    """Download an image through an existing ``requests``-style session.

    Re-using the caller's session keeps the cookies and headers identical to
    the ones used for the page requests.

    Args:
        url: Absolute URL of the image.
        session: Object exposing ``get(url, timeout=...)`` whose response
            provides ``raise_for_status()`` and ``content``.

    Returns:
        A ``BytesIO`` holding the raw response bytes, or ``None`` when the URL
        does not look like a JPEG/PNG file or the download fails.
    """
    from urllib.parse import urlsplit

    image_suffixes = ('.jpg', '.jpeg', '.png')
    try:
        # Accept the URL when either the full string or just its path ends in
        # an image extension, so ".../a.jpg?size=large" is no longer skipped.
        looks_like_image = (
            url.lower().endswith(image_suffixes)
            or urlsplit(url).path.lower().endswith(image_suffixes)
        )
        if not looks_like_image:
            return None
        response = session.get(url, timeout=15)
        response.raise_for_status()
        return BytesIO(response.content)
    except Exception as e:
        # Best effort: one broken image must not abort the whole document.
        logger.warning(f"下载图片失败: {e}")
        return None
|
||||||
|
|
||||||
|
|
||||||
|
def add_pdf_encryption(pdf_file, password="4000"):
    """Encrypt a PDF file in place with a password.

    Every page of ``pdf_file`` is copied into a new writer, the writer is
    encrypted with ``password`` and the result replaces the original file.
    The encrypted output is first written to a sibling ``.tmp`` file and then
    swapped in with ``os.replace`` so that a failure while writing cannot
    leave a truncated PDF behind.

    Args:
        pdf_file: Path of the PDF to encrypt; it is overwritten on success.
        password: Password applied to the PDF.

    Returns:
        None. Failures are logged and swallowed, leaving the original file
        untouched.
    """
    tmp_path = None
    try:
        tmp_path = f"{os.fspath(pdf_file)}.tmp"
        pdf_reader = PdfReader(pdf_file)
        pdf_writer = PdfWriter()
        for page in pdf_reader.pages:
            pdf_writer.add_page(page)
        pdf_writer.encrypt(password)
        with open(tmp_path, "wb") as output_pdf:
            pdf_writer.write(output_pdf)
        # Swap the finished file in only after it was written completely.
        os.replace(tmp_path, pdf_file)
        logger.debug("PDF加密成功")
    except Exception as e:
        logger.error(f"PDF加密失败: {e}")
        # Remove a partially written temp file, if any; cleanup is best effort.
        if tmp_path is not None and os.path.exists(tmp_path):
            try:
                os.remove(tmp_path)
            except OSError:
                pass
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_and_create_pdf(url):
    """Render the threads listed on a forum page into an encrypted PDF.

    The listing at ``url`` is opened in an undetected-chromedriver Chrome
    instance. After a fixed wait the age-confirmation link is clicked when it
    is present, and the listing rows (``tbody`` elements whose id starts with
    ``normalthread`` and that contain a ``span.xi1``) are collected in reverse
    page order. Each thread is then fetched with a ``requests`` session that
    carries the browser's user agent and cookies; its title, magnet links and
    images (taken from the ``zoomfile`` attribute) are appended to an A3
    document. The PDF is written to the ``temp`` directory three ``dirname``
    steps above this file as ``JAV-<YYYY-MM-DD>-<post count>.pdf`` and finally
    passed to ``add_pdf_encryption``.

    Args:
        url: URL of the forum listing page.

    Returns:
        Path of the generated PDF, or ``""`` when anything outside the
        per-thread handling fails (the exception is logged).
    """
    from urllib.parse import urljoin
    from xml.sax.saxutils import escape

    base_url = 'https://www.sehuatang.net/'
    driver = None
    try:
        options = uc.ChromeOptions()
        # Key settings for avoiding bot detection.
        # NOTE(review): options.headless is set to False here while uc.Chrome()
        # below is called with headless=True — confirm which mode is intended.
        options.headless = False
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-gpu')
        options.add_argument('--disable-dev-shm-usage')

        # If headless mode still triggers detection, run once with headless
        # disabled and pass the check manually.
        driver = uc.Chrome(options=options, headless=True)

        logger.info(f"正在访问: {url}")
        driver.get(url)

        # Give the Cloudflare interstitial time to finish before looking for
        # the age-confirmation link.
        time.sleep(8)

        try:
            enter_btn = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, '//a[contains(text(), "满18岁,请点此进入")]'))
            )
            enter_btn.click()
            logger.debug("点击了年龄确认按钮")
            time.sleep(3)
        except Exception:
            # The link is optional; its absence is not an error.
            logger.debug("未发现年龄验证按钮,可能已过检测")

        # Make sure the thread list has been loaded.
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'tbody[id^="normalthread"]'))
        )

        # Extract the listing rows.
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        posts = [
            p
            for p in soup.find_all('tbody', {'id': lambda x: x and x.startswith('normalthread')})
            if p.find('span', {'class': 'xi1'})
        ]
        today_posts = posts[::-1]
        last_post = today_posts[-1] if today_posts else None

        # Register the font used for the document text.
        pdfmetrics.registerFont(TTFont('SimHei', 'fonts/simhei.ttf'))
        styles = getSampleStyleSheet()
        title_style = styles['Heading1']
        title_style.fontName = 'SimHei'
        title_style.textColor = colors.red
        normal_style = styles['Normal']
        normal_style.fontName = 'SimHei'

        # Output location.
        save_path = os.path.join(
            os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))),
            'temp',
        )
        os.makedirs(save_path, exist_ok=True)
        pdf_filename = os.path.join(
            save_path,
            f"JAV-{datetime.now().strftime('%Y-%m-%d')}-{len(today_posts)}.pdf",
        )

        doc = SimpleDocTemplate(pdf_filename, pagesize=A3)
        content = []
        # Upper bounds for image width/height derived from the A3 page size.
        max_w, max_h = (A3[0] - 72) * 0.95, (A3[1] - 72) * 0.7

        # Mirror the browser's user agent and cookies into a requests session.
        session = requests.Session()
        ua = driver.execute_script("return navigator.userAgent")
        session.headers.update({'User-Agent': ua, 'Referer': base_url})
        for c in driver.get_cookies():
            session.cookies.set(c['name'], c['value'])

        # Process each thread.
        for post in today_posts:
            title_tag = post.find('a', {'class': 's xst'})
            if not title_tag:
                continue

            href = title_tag.get('href')
            if not href:
                # Without a link there is nothing to fetch; skip the row
                # instead of failing the whole run.
                continue

            p_title = title_tag.get_text()
            # urljoin resolves site-relative hrefs against the base URL and
            # leaves absolute hrefs intact.
            p_url = urljoin(base_url, href)
            logger.info(f"详情页: {p_title}")

            try:
                resp = session.get(p_url, timeout=15)
                p_soup = BeautifulSoup(resp.text, 'html.parser')
                div = p_soup.find('div', {'class': 't_fsz'})

                if div:
                    # Paragraph text is parsed as XML-like markup, so literal
                    # '&', '<' and '>' have to be escaped.
                    content.append(Paragraph(f" {escape(p_title)}", title_style))
                    magnets = re.findall(r'magnet:\?[^ \u4e00-\u9fff]+', div.get_text())
                    for m in magnets:
                        content.append(Paragraph(f"<b>{escape(m)}</b>", normal_style))

                    for img_tag in div.find_all('img'):
                        src = img_tag.get('zoomfile')
                        if src and 'http' in src:
                            img_io = download_image(src, session)
                            if img_io:
                                with PILImage.open(img_io) as p_img:
                                    iw, ih = p_img.size
                                    # Shrink to fit the bounds; never enlarge.
                                    sc = min(max_w / iw, max_h / ih, 1.0)
                                    # Rewind so the flowable reads the buffer
                                    # from the start.
                                    img_io.seek(0)
                                    content.append(Image(img_io, width=iw * sc, height=ih * sc))

                    # Separate threads with a page break, except after the
                    # last listed one.
                    if post is not last_post:
                        content.append(PageBreak())
            except Exception as e:
                # One failing thread must not abort the remaining ones.
                logger.error(f"帖子处理失败: {e}")

        doc.build(content)
        add_pdf_encryption(pdf_filename)
        return pdf_filename

    except Exception as e:
        logger.exception(f"抓取异常: {e}")
        return ""
    finally:
        # Closing the window before quitting the process is the workaround
        # for "[WinError 6] The handle is invalid".
        if driver:
            logger.debug("正在安全关闭浏览器...")
            # Attempt each step on its own so that a failing close() does not
            # skip quit() and leave the browser process running.
            for shutdown_step in (driver.close, driver.quit):
                try:
                    shutdown_step()
                except Exception:
                    # Errors from an already-invalid handle are expected here
                    # and deliberately kept off the console.
                    pass
|
||||||
|
|
||||||
|
|
||||||
|
def pdf_file_path():
    """Build the PDF for the fixed forum listing URL and return its path.

    Returns:
        Whatever ``fetch_and_create_pdf`` returns for that URL: the PDF path,
        or ``""`` on failure.
    """
    return fetch_and_create_pdf(
        'https://www.sehuatang.net/forum.php?mod=forumdisplay&fid=103&filter=typeid&typeid=481'
    )
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: generate the PDF only when this file is run directly.
if __name__ == "__main__":
    pdf_file_path()
|
||||||
Reference in New Issue
Block a user