优化sehuat

This commit is contained in:
liuwei
2026-02-03 15:56:49 +08:00
parent 5c43d5dc39
commit 6b28e375cf

View File

@@ -25,42 +25,16 @@ from PIL import Image as PILImage
import re
from PyPDF2 import PdfReader, PdfWriter
from loguru import logger
from urllib.parse import urlparse, urljoin
from requests.adapters import HTTPAdapter
from urllib3.util import Retry
UNREACHABLE_HOSTS = set()
def download_image(url, session, referer=None):
    """Download an image through the shared session so cookies stay consistent.

    Args:
        url: Absolute URL of the image to fetch.
        session: Session-like object whose ``get(url, timeout=..., headers=...)``
            returns a response exposing ``raise_for_status()`` and ``content``.
        referer: Optional ``Referer`` header sent with this request only.
            When omitted, no per-request headers are passed and the
            session's own defaults are used.

    Returns:
        A ``BytesIO`` holding the response body, or ``None`` when the URL is
        empty, does not point at a .jpg/.jpeg/.png resource, or the request
        fails for any reason (the failure is logged as a warning).
    """
    # Imported locally so the function does not depend on a module-level
    # import of urllib.parse being present in the file.
    from urllib.parse import urlparse

    if not url:
        return None

    allowed_suffixes = ('.jpg', '.jpeg', '.png')
    try:
        lowered = url.lower()
        # Match on either the raw URL or its path component, so a query
        # string or fragment ("a.jpg?x=1") does not hide the extension while
        # URLs that only matched on the raw string keep working.
        if not (lowered.endswith(allowed_suffixes)
                or urlparse(lowered).path.endswith(allowed_suffixes)):
            return None

        request_kwargs = {'timeout': 15}
        if referer:
            request_kwargs['headers'] = {'Referer': referer}

        response = session.get(url, **request_kwargs)
        response.raise_for_status()
        return BytesIO(response.content)
    except Exception as e:
        # Deliberately best-effort: one broken image must not abort the
        # caller, so every failure is logged and reported as None.
        logger.warning(f"下载图片失败: {e}")
        return None
@@ -159,16 +133,7 @@ def fetch_and_create_pdf(url):
# 同步 Session
session = requests.Session()
ua = driver.execute_script("return navigator.userAgent")
session.headers.update({
'User-Agent': ua,
'Accept': 'image/avif,image/webp,image/apng,image/svg+xml,image/*;q=0.8,*/*;q=0.5',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Connection': 'keep-alive'
})
retry = Retry(total=3, connect=3, read=3, backoff_factor=0.5, status_forcelist=[429, 500, 502, 503, 504], allowed_methods=frozenset(['GET']))
adapter = HTTPAdapter(max_retries=retry)
session.mount('https://', adapter)
session.mount('http://', adapter)
session.headers.update({'User-Agent': ua, 'Referer': 'https://www.sehuatang.net/'})
for c in driver.get_cookies():
session.cookies.set(c['name'], c['value'])
@@ -192,23 +157,16 @@ def fetch_and_create_pdf(url):
for m in magnets:
content.append(Paragraph(f"<b>{m}</b>", normal_style))
page_url = resp.url
for img_tag in div.find_all('img'):
src = img_tag.get('zoomfile') or img_tag.get('file') or img_tag.get('src') or img_tag.get('data-src')
if not src:
continue
if src.startswith('//'):
src = 'https:' + src
elif not src.startswith('http'):
src = urljoin(page_url, src)
img_io = download_image(src, session, referer=page_url)
if img_io:
with PILImage.open(img_io) as p_img:
iw, ih = p_img.size
sc = min(max_w / iw, max_h / ih, 1.0)
img_io.seek(0)
content.append(Image(img_io, width=iw * sc, height=ih * sc))
time.sleep(0.15)
src = img_tag.get('zoomfile')
if src and 'http' in src:
img_io = download_image(src, session)
if img_io:
with PILImage.open(img_io) as p_img:
iw, ih = p_img.size
sc = min(max_w / iw, max_h / ih, 1.0)
img_io.seek(0)
content.append(Image(img_io, width=iw * sc, height=ih * sc))
if post != today_posts[-1]: content.append(PageBreak())
except Exception as e: