"""Daily scraper: download a limited number of new posts as JPG image folders.

The script walks the paginated list starting at ``START_URL``, skips posts
whose ID already has a folder under ``download_root``, and for each new post
downloads every image, converting it to JPEG with Pillow.
"""

import os
import time
from io import BytesIO
from urllib.parse import urljoin

import requests
import urllib3
from bs4 import BeautifulSoup
from PIL import Image

# --- Configuration ---
BASE_URL = "https://www.hentaiclub.net"
START_URL = "https://www.hentaiclub.net/sort/r15.html/1/"

# Root directory that receives one sub-folder per downloaded post.
download_root = "/mnt/nfs_share"

# Maximum number of new posts to download per run.
DAILY_LIMIT = 10

# Image modes handed to the JPEG writer without conversion; every other mode
# (RGBA, P, LA, ...) is converted to RGB before saving.
JPEG_SAFE_MODES = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")

# Silence the warning emitted for every request made with verify=False.
# NOTE(review): certificate verification is deliberately disabled throughout
# this script; responses are therefore not authenticated.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Browser-like request headers.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Referer": BASE_URL
}


def get_post_id(url):
    """Extract the post ID from a post URL.

    Example: ``.../64068.html`` -> ``"64068"``.

    Args:
        url: Post URL or href; may be None when the link has no href.

    Returns:
        The last path segment up to its first dot, or None when ``url`` is
        not a non-empty string or the segment is empty.
    """
    if not url or not isinstance(url, str):
        return None
    filename = url.split('/')[-1]
    return filename.split('.')[0] or None


def is_downloaded(post_id):
    """Report whether ``download_root`` already holds a folder for this post.

    A folder matches when it is named exactly ``post_id`` or starts with
    ``"<post_id>_"``.

    Returns:
        True when a matching entry exists; False otherwise, including when
        the root directory is missing or cannot be listed.
    """
    try:
        existing_folders = os.listdir(download_root)
    except FileNotFoundError:
        # No root directory means nothing has been downloaded yet.
        return False
    except OSError as e:
        print(f"无法读取目录 {download_root}: {e}")
        return False

    prefix = f"{post_id}_"
    return any(
        folder == post_id or folder.startswith(prefix)
        for folder in existing_folders
    )


def get_soup(url):
    """Fetch ``url`` and parse it with BeautifulSoup's ``html.parser``.

    SSL verification is disabled for the request.

    Returns:
        The parsed document, or None when the request or parsing fails
        (the failure is printed).
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=10, verify=False)
        response.raise_for_status()
        return BeautifulSoup(response.text, 'html.parser')
    except Exception as e:
        # Best-effort boundary: any failure is reported and turned into None.
        print(f"请求失败: {url} - 错误: {e}")
        return None


def download_image_as_jpg(img_url, folder_path, file_name):
    """Download one image and save it as a JPEG file.

    Nothing is done when the target file already exists. The image is first
    written to a ``.part`` file and then renamed, so an interrupted or failed
    save never leaves a corrupt file that later runs would treat as complete.

    Args:
        img_url: Absolute URL of the image.
        folder_path: Directory that receives the file.
        file_name: Target file name (expected to end in ``.jpg``).

    Returns:
        None. Failures are printed, not raised.
    """
    tmp_path = None
    try:
        file_path = os.path.join(folder_path, file_name)
        if os.path.exists(file_path):
            return
        tmp_path = file_path + ".part"

        # verify=False: certificate errors are ignored.
        resp = requests.get(img_url, headers=HEADERS, timeout=20, verify=False)
        resp.raise_for_status()

        with Image.open(BytesIO(resp.content)) as source:
            # The JPEG writer rejects alpha/palette and several other modes,
            # so anything it cannot take directly is converted to RGB.
            if source.mode in JPEG_SAFE_MODES:
                converted = source
            else:
                converted = source.convert("RGB")
            # The format is passed explicitly because the temporary name does
            # not end in .jpg.
            converted.save(tmp_path, "JPEG", quality=90)

        os.replace(tmp_path, file_path)
    except Exception as e:
        # Best-effort boundary: one bad image must not abort the whole post.
        print(f" -> 图片处理失败 {img_url}: {e}")
        if tmp_path:
            try:
                os.remove(tmp_path)
            except OSError:
                # The temporary file may never have been created.
                pass


def parse_detail_page(post_url, post_title, post_id):
    """Download every image of one post into its own folder.

    The folder is ``<download_root>/<post_id>_<sanitized title>``; images are
    named ``001.jpg``, ``002.jpg``, ... by their position on the page.

    Args:
        post_url: URL of the post's detail page.
        post_title: Title used (after sanitizing) in the folder name.
        post_id: Unique post ID used as the folder-name prefix.

    Returns:
        False when the page cannot be fetched or the folder cannot be
        created; True otherwise, even if individual images failed.
    """
    # Keep only characters that are safe in a directory name.
    safe_title = "".join(
        c for c in post_title if c.isalnum() or c in (' ', '-', '_')
    ).strip()
    folder_name = f"{post_id}_{safe_title}"
    post_dir = os.path.join(download_root, folder_name)

    print(f"正在处理: {folder_name}")

    soup = get_soup(post_url)
    if not soup:
        return False

    try:
        os.makedirs(post_dir, exist_ok=True)
    except OSError as e:
        print(f"无法创建目录 {post_dir}: {e}")
        return False

    items = soup.select('#masonry .post-item')
    print(f" 包含 {len(items)} 张图片,开始下载并转为 JPG...")

    for index, item in enumerate(items, start=1):
        # Prefer the full-size link stored on the item itself.
        img_url = item.get('data-src')
        if not img_url:
            img_tag = item.find('img')
            if img_tag:
                img_url = img_tag.get('data-original') or img_tag.get('src')
        if not img_url:
            continue

        # Resolve site-relative links against the site root.
        if not img_url.startswith('http'):
            img_url = urljoin(BASE_URL, img_url)

        # Files are always named .jpg because every image is re-encoded.
        file_name = f"{index:03d}.jpg"
        download_image_as_jpg(img_url, post_dir, file_name)

    print(f" 完成.\n")
    return True


def run_daily_job():
    """Scan list pages and download up to ``DAILY_LIMIT`` new posts.

    Pages are followed through the ``.page-navigator .next a`` link. The scan
    stops when the limit is reached, a page fails to load or is empty, there
    is no usable next link, or the next link points to a page already
    visited in this run.
    """
    current_download_count = 0
    current_page_url = START_URL
    visited_pages = set()

    print(f"下载目录: {download_root}")

    while current_download_count < DAILY_LIMIT:
        visited_pages.add(current_page_url)
        print(f"正在扫描列表页: {current_page_url}")
        soup = get_soup(current_page_url)
        if not soup:
            break

        items = soup.select('#masonry .item')
        if not items:
            print("本页无内容。")
            break

        for item in items:
            if current_download_count >= DAILY_LIMIT:
                break

            link_tag = item.select_one('a.item-link')
            if not link_tag:
                continue

            href = link_tag.get('href')
            if not href:
                continue
            # Relative hrefs are resolved against the page they came from;
            # absolute ones pass through unchanged.
            post_url = urljoin(current_page_url, href)

            # 1. Extract the post ID.
            post_id = get_post_id(post_url)
            if not post_id:
                continue

            # 2. Skip posts that already have a folder under download_root.
            if is_downloaded(post_id):
                continue

            # 3. Download the new post.
            title_div = link_tag.select_one('.item-link-text')
            title = title_div.get_text(strip=True) if title_div else "未命名"

            if parse_detail_page(post_url, title, post_id):
                current_download_count += 1
                print(f"=== 进度: {current_download_count}/{DAILY_LIMIT} ===\n")
                time.sleep(2)  # Throttle between posts.

        if current_download_count >= DAILY_LIMIT:
            break

        # Move on to the next list page.
        next_page = soup.select_one('.page-navigator .next a')
        next_href = next_page.get('href') if next_page else None
        next_url = urljoin(current_page_url, next_href) if next_href else None
        if not next_url or next_url in visited_pages:
            print("已到达最后一页,没有更多帖子了。")
            break
        current_page_url = next_url
        time.sleep(1)

    if current_download_count >= DAILY_LIMIT:
        print(f"=== 今日任务已完成 ({DAILY_LIMIT}个) ===")


if __name__ == "__main__":
    # Create the download root when it is missing.
    if not os.path.exists(download_root):
        try:
            os.makedirs(download_root, exist_ok=True)
            print(f"创建目录成功: {download_root}")
        except OSError as e:
            print(f"错误: 无法创建根目录 {download_root}。请检查权限或手动挂载 NFS。")
            print(f"系统报错: {e}")
            # SystemExit is raised directly; the exit() helper is only
            # installed by the site module.
            raise SystemExit(1)

    print(f"开始任务:下载 {DAILY_LIMIT} 个新帖子 (JPG格式, 忽略SSL)")
    run_daily_job()