优化sehuat

This commit is contained in:
liuwei
2026-02-03 15:56:49 +08:00
parent 5c43d5dc39
commit 6b28e375cf

View File

@@ -25,42 +25,16 @@ from PIL import Image as PILImage
import re import re
from PyPDF2 import PdfReader, PdfWriter from PyPDF2 import PdfReader, PdfWriter
from loguru import logger from loguru import logger
from urllib.parse import urlparse, urljoin
from requests.adapters import HTTPAdapter
from urllib3.util import Retry
UNREACHABLE_HOSTS = set()
def download_image(url, session, referer=None): def download_image(url, session):
"""使用同步的 session 下载图片,确保 Cookie 一致""" """使用同步的 session 下载图片,确保 Cookie 一致"""
try: try:
parsed = urlparse(url) if not url.lower().endswith(('.jpg', '.jpeg', '.png')):
host = parsed.netloc.lower()
if host in UNREACHABLE_HOSTS:
return None return None
headers = {} response = session.get(url, timeout=15)
if referer:
headers['Referer'] = referer
else:
headers['Referer'] = f'{parsed.scheme}://{host}/'
response = session.get(url, headers=headers, timeout=10)
response.raise_for_status() response.raise_for_status()
ctype = response.headers.get('Content-Type', '')
if 'image' not in ctype.lower() and not url.lower().endswith(('.jpg', '.jpeg', '.png', '.webp', '.gif')):
return None
return BytesIO(response.content) return BytesIO(response.content)
except requests.exceptions.RequestException as e:
try:
parsed = urlparse(url)
host = parsed.netloc.lower()
msg = str(e)
if 'Network is unreachable' in msg or 'Failed to establish a new connection' in msg:
UNREACHABLE_HOSTS.add(host)
except Exception:
pass
logger.warning(f"下载图片失败: {e}")
return None
except Exception as e: except Exception as e:
logger.warning(f"下载图片失败: {e}") logger.warning(f"下载图片失败: {e}")
return None return None
@@ -159,16 +133,7 @@ def fetch_and_create_pdf(url):
# 同步 Session # 同步 Session
session = requests.Session() session = requests.Session()
ua = driver.execute_script("return navigator.userAgent") ua = driver.execute_script("return navigator.userAgent")
session.headers.update({ session.headers.update({'User-Agent': ua, 'Referer': 'https://www.sehuatang.net/'})
'User-Agent': ua,
'Accept': 'image/avif,image/webp,image/apng,image/svg+xml,image/*;q=0.8,*/*;q=0.5',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Connection': 'keep-alive'
})
retry = Retry(total=3, connect=3, read=3, backoff_factor=0.5, status_forcelist=[429, 500, 502, 503, 504], allowed_methods=frozenset(['GET']))
adapter = HTTPAdapter(max_retries=retry)
session.mount('https://', adapter)
session.mount('http://', adapter)
for c in driver.get_cookies(): for c in driver.get_cookies():
session.cookies.set(c['name'], c['value']) session.cookies.set(c['name'], c['value'])
@@ -192,23 +157,16 @@ def fetch_and_create_pdf(url):
for m in magnets: for m in magnets:
content.append(Paragraph(f"<b>{m}</b>", normal_style)) content.append(Paragraph(f"<b>{m}</b>", normal_style))
page_url = resp.url
for img_tag in div.find_all('img'): for img_tag in div.find_all('img'):
src = img_tag.get('zoomfile') or img_tag.get('file') or img_tag.get('src') or img_tag.get('data-src') src = img_tag.get('zoomfile')
if not src: if src and 'http' in src:
continue img_io = download_image(src, session)
if src.startswith('//'): if img_io:
src = 'https:' + src with PILImage.open(img_io) as p_img:
elif not src.startswith('http'): iw, ih = p_img.size
src = urljoin(page_url, src) sc = min(max_w / iw, max_h / ih, 1.0)
img_io = download_image(src, session, referer=page_url) img_io.seek(0)
if img_io: content.append(Image(img_io, width=iw * sc, height=ih * sc))
with PILImage.open(img_io) as p_img:
iw, ih = p_img.size
sc = min(max_w / iw, max_h / ih, 1.0)
img_io.seek(0)
content.append(Image(img_io, width=iw * sc, height=ih * sc))
time.sleep(0.15)
if post != today_posts[-1]: content.append(PageBreak()) if post != today_posts[-1]: content.append(PageBreak())
except Exception as e: except Exception as e: