From 5c43d5dc39b3a862972006e903f37b931e8f6b8a Mon Sep 17 00:00:00 2001 From: liuwei Date: Tue, 3 Feb 2026 15:53:56 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BC=98=E5=8C=96sehuat?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- utils/sehuatang/shehuatang_undetected.py | 40 +++++++++++++++--------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/utils/sehuatang/shehuatang_undetected.py b/utils/sehuatang/shehuatang_undetected.py index a5f20b3..b2afb67 100644 --- a/utils/sehuatang/shehuatang_undetected.py +++ b/utils/sehuatang/shehuatang_undetected.py @@ -25,25 +25,30 @@ from PIL import Image as PILImage import re from PyPDF2 import PdfReader, PdfWriter from loguru import logger -from urllib.parse import urlparse +from urllib.parse import urlparse, urljoin from requests.adapters import HTTPAdapter from urllib3.util import Retry UNREACHABLE_HOSTS = set() -def download_image(url, session): +def download_image(url, session, referer=None): """使用同步的 session 下载图片,确保 Cookie 一致""" try: - if not url.lower().endswith(('.jpg', '.jpeg', '.png')): - return None parsed = urlparse(url) host = parsed.netloc.lower() if host in UNREACHABLE_HOSTS: return None - headers = {'Referer': f'{parsed.scheme}://{host}/'} + headers = {} + if referer: + headers['Referer'] = referer + else: + headers['Referer'] = f'{parsed.scheme}://{host}/' response = session.get(url, headers=headers, timeout=10) response.raise_for_status() + ctype = response.headers.get('Content-Type', '') + if 'image' not in ctype.lower() and not url.lower().endswith(('.jpg', '.jpeg', '.png', '.webp', '.gif')): + return None return BytesIO(response.content) except requests.exceptions.RequestException as e: try: @@ -187,16 +192,23 @@ def fetch_and_create_pdf(url): for m in magnets: content.append(Paragraph(f"{m}", normal_style)) + page_url = resp.url for img_tag in div.find_all('img'): - src = img_tag.get('zoomfile') - if src and 'http' in src: - img_io = download_image(src, session) - if img_io: - with PILImage.open(img_io) as p_img: - iw, ih = p_img.size - sc = min(max_w / iw, max_h / ih, 1.0) - img_io.seek(0) - content.append(Image(img_io, width=iw * sc, height=ih * sc)) + src = img_tag.get('zoomfile') or img_tag.get('file') or img_tag.get('src') or img_tag.get('data-src') + if not src: + continue + if src.startswith('//'): + src = 'https:' + src + elif not src.startswith('http'): + src = urljoin(page_url, src) + img_io = download_image(src, session, referer=page_url) + if img_io: + with PILImage.open(img_io) as p_img: + iw, ih = p_img.size + sc = min(max_w / iw, max_h / ih, 1.0) + img_io.seek(0) + content.append(Image(img_io, width=iw * sc, height=ih * sc)) + time.sleep(0.15) if post != today_posts[-1]: content.append(PageBreak()) except Exception as e: