diff --git a/utils/sehuatang/shehuatang_undetected.py b/utils/sehuatang/shehuatang_undetected.py index b2afb67..c009e2d 100644 --- a/utils/sehuatang/shehuatang_undetected.py +++ b/utils/sehuatang/shehuatang_undetected.py @@ -25,42 +25,16 @@ from PIL import Image as PILImage import re from PyPDF2 import PdfReader, PdfWriter from loguru import logger -from urllib.parse import urlparse, urljoin -from requests.adapters import HTTPAdapter -from urllib3.util import Retry - -UNREACHABLE_HOSTS = set() -def download_image(url, session, referer=None): +def download_image(url, session): """使用同步的 session 下载图片,确保 Cookie 一致""" try: - parsed = urlparse(url) - host = parsed.netloc.lower() - if host in UNREACHABLE_HOSTS: + if not url.lower().endswith(('.jpg', '.jpeg', '.png')): return None - headers = {} - if referer: - headers['Referer'] = referer - else: - headers['Referer'] = f'{parsed.scheme}://{host}/' - response = session.get(url, headers=headers, timeout=10) + response = session.get(url, timeout=15) response.raise_for_status() - ctype = response.headers.get('Content-Type', '') - if 'image' not in ctype.lower() and not url.lower().endswith(('.jpg', '.jpeg', '.png', '.webp', '.gif')): - return None return BytesIO(response.content) - except requests.exceptions.RequestException as e: - try: - parsed = urlparse(url) - host = parsed.netloc.lower() - msg = str(e) - if 'Network is unreachable' in msg or 'Failed to establish a new connection' in msg: - UNREACHABLE_HOSTS.add(host) - except Exception: - pass - logger.warning(f"下载图片失败: {e}") - return None except Exception as e: logger.warning(f"下载图片失败: {e}") return None @@ -159,16 +133,7 @@ def fetch_and_create_pdf(url): # 同步 Session session = requests.Session() ua = driver.execute_script("return navigator.userAgent") - session.headers.update({ - 'User-Agent': ua, - 'Accept': 'image/avif,image/webp,image/apng,image/svg+xml,image/*;q=0.8,*/*;q=0.5', - 'Accept-Language': 'zh-CN,zh;q=0.9', - 'Connection': 'keep-alive' - }) - retry = Retry(total=3, connect=3, read=3, backoff_factor=0.5, status_forcelist=[429, 500, 502, 503, 504], allowed_methods=frozenset(['GET'])) - adapter = HTTPAdapter(max_retries=retry) - session.mount('https://', adapter) - session.mount('http://', adapter) + session.headers.update({'User-Agent': ua, 'Referer': 'https://www.sehuatang.net/'}) for c in driver.get_cookies(): session.cookies.set(c['name'], c['value']) @@ -192,23 +157,16 @@ def fetch_and_create_pdf(url): for m in magnets: content.append(Paragraph(f"{m}", normal_style)) - page_url = resp.url for img_tag in div.find_all('img'): - src = img_tag.get('zoomfile') or img_tag.get('file') or img_tag.get('src') or img_tag.get('data-src') - if not src: - continue - if src.startswith('//'): - src = 'https:' + src - elif not src.startswith('http'): - src = urljoin(page_url, src) - img_io = download_image(src, session, referer=page_url) - if img_io: - with PILImage.open(img_io) as p_img: - iw, ih = p_img.size - sc = min(max_w / iw, max_h / ih, 1.0) - img_io.seek(0) - content.append(Image(img_io, width=iw * sc, height=ih * sc)) - time.sleep(0.15) + src = img_tag.get('zoomfile') + if src and 'http' in src: + img_io = download_image(src, session) + if img_io: + with PILImage.open(img_io) as p_img: + iw, ih = p_img.size + sc = min(max_w / iw, max_h / ih, 1.0) + img_io.seek(0) + content.append(Image(img_io, width=iw * sc, height=ih * sc)) if post != today_posts[-1]: content.append(PageBreak()) except Exception as e: