diff --git a/main.py b/main.py
index 056e8cc..ea1fbe0 100644
--- a/main.py
+++ b/main.py
@@ -119,6 +119,11 @@ def jobs(robot: Robot):
     async def xiuren_download_job():
         await robot.xiu_ren_download_task()
 
+    # ✅ Every day at 02:30: download the ShenShi R15 posts
+    @async_job.at_times(["2:30"])
+    async def shenshiR15_download_job():
+        await robot.shen_shi_download_task()
+
     # ✅ 每天 17:30 发秀人 PDF(如果启用)
     # @async_job.at_times(["17:30"])
     # async def xiuren_pdf_send_job():
diff --git a/plugins/xiuren_image/shenshi_r15.py b/plugins/xiuren_image/shenshi_r15.py
new file mode 100644
index 0000000..e8e00f1
--- /dev/null
+++ b/plugins/xiuren_image/shenshi_r15.py
@@ -0,0 +1,271 @@
+"""Daily downloader for the R15 listing of hentaiclub.net.
+
+Walks the paginated listing, skips posts that already have a folder under
+``download_root`` and stores the images of up to ``DAILY_LIMIT`` new posts
+as JPEG files.
+"""
+import os
+import time
+import requests
+import urllib3
+from bs4 import BeautifulSoup
+from urllib.parse import urljoin
+from io import BytesIO
+from PIL import Image
+
+# --- Configuration ---
+BASE_URL = "https://www.hentaiclub.net"
+START_URL = "https://www.hentaiclub.net/sort/r15.html/1/"
+
+# Root directory that every post folder is created under
+download_root = "/mnt/nfs_share"
+
+# Maximum number of new posts downloaded per run
+DAILY_LIMIT = 10
+
+# Requests below pass verify=False; silence the resulting urllib3 warnings
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+
+# Browser-like request headers
+HEADERS = {
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+    "Referer": BASE_URL
+}
+
+# Image modes the JPEG writer accepts as-is; anything else is converted
+JPEG_MODES = ("L", "RGB", "CMYK")
+
+
+def get_post_id(url):
+    """Return the post ID embedded in a post URL.
+
+    Example: ``.../64068.html`` -> ``"64068"``.
+
+    Returns None when *url* is not a string or its last path segment
+    yields an empty ID.
+    """
+    if not isinstance(url, str):
+        return None
+    filename = url.split('/')[-1]
+    return filename.split('.')[0] or None
+
+
+def is_downloaded(post_id):
+    """Return True when ``download_root`` already holds a folder for *post_id*.
+
+    A folder matches when it is named ``<post_id>`` or starts with
+    ``<post_id>_``.
+    """
+    try:
+        existing_folders = os.listdir(download_root)
+    except FileNotFoundError:
+        # No root directory yet, so nothing can have been downloaded
+        return False
+    except OSError as e:
+        print(f"无法读取目录 {download_root}: {e}")
+        return False
+
+    prefix = f"{post_id}_"
+    return any(
+        folder == post_id or folder.startswith(prefix)
+        for folder in existing_folders
+    )
+
+
+def get_soup(url):
+    """Fetch *url* and return the parsed page, or None when that fails.
+
+    TLS certificate verification is disabled for the request.
+    """
+    try:
+        response = requests.get(url, headers=HEADERS, timeout=10, verify=False)
+        response.raise_for_status()
+        return BeautifulSoup(response.text, 'html.parser')
+    except Exception as e:
+        print(f"请求失败: {url} - 错误: {e}")
+        return None
+
+
+def download_image_as_jpg(img_url, folder_path, file_name):
+    """Download *img_url* and store it as a JPEG named *file_name*.
+
+    Returns True when the file is on disk afterwards (already present or
+    freshly saved) and False when downloading or converting failed.
+    """
+    file_path = os.path.join(folder_path, file_name)
+    if os.path.exists(file_path):
+        return True
+
+    # Saved under a temporary name and renamed afterwards, so an interrupted
+    # save never leaves a truncated file that later runs would skip
+    temp_path = file_path + ".part"
+    try:
+        # verify=False skips TLS certificate verification
+        resp = requests.get(img_url, headers=HEADERS, timeout=20, verify=False)
+        resp.raise_for_status()
+
+        img = Image.open(BytesIO(resp.content))
+
+        # Modes such as RGBA, LA or P cannot be written as JPEG directly
+        if img.mode not in JPEG_MODES:
+            img = img.convert("RGB")
+
+        img.save(temp_path, "JPEG", quality=90)
+        os.replace(temp_path, file_path)
+        return True
+    except Exception as e:
+        print(f" -> 图片处理失败 {img_url}: {e}")
+        try:
+            os.remove(temp_path)
+        except OSError:
+            # Nothing was written, or cleanup is not possible; either is fine
+            pass
+        return False
+
+
+def parse_detail_page(post_url, post_title, post_id):
+    """Download every image of one post into its own folder.
+
+    The folder is ``<post_id>_<sanitised title>`` under ``download_root``.
+
+    Returns True when at least one image of the post is on disk afterwards,
+    and False when the page cannot be fetched, the folder cannot be created,
+    or no image could be saved.
+    """
+    # Keep only characters that are safe in a directory name
+    safe_title = "".join(c for c in post_title if c.isalnum() or c in (' ', '-', '_')).strip()
+    folder_name = f"{post_id}_{safe_title}"
+    post_dir = os.path.join(download_root, folder_name)
+
+    print(f"正在处理: {folder_name}")
+
+    soup = get_soup(post_url)
+    if soup is None:
+        return False
+
+    items = soup.select('#masonry .post-item')
+    print(f" 包含 {len(items)} 张图片,开始下载并转为 JPG...")
+
+    try:
+        os.makedirs(post_dir, exist_ok=True)
+    except OSError as e:
+        print(f"无法创建目录 {post_dir}: {e}")
+        return False
+
+    saved_count = 0
+    for index, item in enumerate(items, start=1):
+        # Prefer the full-size link stored on the item itself
+        img_url = item.get('data-src')
+        if not img_url:
+            img_tag = item.find('img')
+            if img_tag:
+                img_url = img_tag.get('data-original') or img_tag.get('src')
+        if not img_url:
+            continue
+
+        # urljoin resolves relative links and leaves absolute ones untouched
+        img_url = urljoin(BASE_URL, img_url)
+
+        # Files are always named NNN.jpg by their position on the page
+        if download_image_as_jpg(img_url, post_dir, f"{index:03d}.jpg"):
+            saved_count += 1
+
+    if saved_count == 0:
+        # An empty folder would make is_downloaded() skip this post forever
+        print(f" 未保存任何图片,已跳过: {folder_name}")
+        try:
+            os.rmdir(post_dir)
+        except OSError as e:
+            print(f"无法删除目录 {post_dir}: {e}")
+        return False
+
+    print(" 完成.\n")
+    return True
+
+
+def run_daily_job():
+    """Download up to ``DAILY_LIMIT`` posts that are not on disk yet.
+
+    Listing pages are followed from ``START_URL`` through their "next" link
+    until the limit is reached, a page cannot be fetched or has no items,
+    or no unvisited next page is left.
+    """
+    current_download_count = 0
+    current_page_url = START_URL
+    # Pages already scanned, so a pagination loop cannot run forever
+    visited_pages = set()
+
+    print(f"下载目录: {download_root}")
+
+    while current_download_count < DAILY_LIMIT:
+        visited_pages.add(current_page_url)
+        print(f"正在扫描列表页: {current_page_url}")
+        soup = get_soup(current_page_url)
+        if soup is None:
+            break
+
+        items = soup.select('#masonry .item')
+        if not items:
+            print("本页无内容。")
+            break
+
+        for item in items:
+            link_tag = item.select_one('a.item-link')
+            if not link_tag:
+                continue
+
+            href = link_tag.get('href')
+
+            # 1. Extract the post ID
+            post_id = get_post_id(href)
+            if not post_id:
+                continue
+
+            # 2. Skip posts that already have a folder under download_root
+            if is_downloaded(post_id):
+                continue
+
+            # 3. Download the new post
+            title_div = link_tag.select_one('.item-link-text')
+            title = title_div.get_text(strip=True) if title_div else "未命名"
+
+            # Listing links may be relative to the page they appear on
+            post_url = urljoin(current_page_url, href)
+            success = parse_detail_page(post_url, title, post_id)
+
+            if success:
+                current_download_count += 1
+                print(f"=== 进度: {current_download_count}/{DAILY_LIMIT} ===\n")
+                if current_download_count >= DAILY_LIMIT:
+                    print(f"=== 今日任务已完成 ({DAILY_LIMIT}个) ===")
+                    return
+
+            # Pause after every detail-page fetch, successful or not
+            time.sleep(2)
+
+        # Follow the pagination link, if any
+        next_page = soup.select_one('.page-navigator .next a')
+        next_href = next_page.get('href') if next_page else None
+        next_page_url = urljoin(current_page_url, next_href) if next_href else None
+        if not next_page_url or next_page_url in visited_pages:
+            print("已到达最后一页,没有更多帖子了。")
+            break
+
+        current_page_url = next_page_url
+        time.sleep(1)
+
+
+if __name__ == "__main__":
+    # Create the root directory when it is missing
+    if not os.path.exists(download_root):
+        try:
+            os.makedirs(download_root)
+            print(f"创建目录成功: {download_root}")
+        except OSError as e:
+            print(f"错误: 无法创建根目录 {download_root}。请检查权限或手动挂载 NFS。")
+            print(f"系统报错: {e}")
+            # SystemExit is raised directly; exit() is a site-module helper
+            raise SystemExit(1)
+
+    print(f"开始任务:下载 {DAILY_LIMIT} 个新帖子 (JPG格式, 忽略SSL)")
+    run_daily_job()
diff --git a/robot.py b/robot.py
index 7410c3a..ec0f9a6 100644
--- a/robot.py
+++ b/robot.py
@@ -20,6 +20,7 @@ from configuration import Config
 from db.connection import DBConnectionManager
 from db.contacts_db import ContactsDBOperator
 from plugins.xiuren_image.meitu_dl import meitu_dowload_pub_pic
+from plugins.xiuren_image.shenshi_r15 import run_daily_job
 from utils.email_util import EmailSender
 from utils.revoke.message_auto_revoke import MessageAutoRevoke
 from utils.robot_cmd.robot_command import GroupBotManager, Feature, PermissionStatus
@@ -709,6 +710,19 @@ class Robot:
         except Exception as e:
             self.LOG.error(f"xiu_ren_download_task error:{e}")
 
+    async def shen_shi_download_task(self):
+        """Run the ShenShi R15 download job; failures are logged, not raised."""
+        # Function-scope import keeps this method self-contained
+        import asyncio
+
+        try:
+            # run_daily_job() blocks on HTTP requests and time.sleep(), so it
+            # runs in the default executor instead of stalling the event loop
+            loop = asyncio.get_running_loop()
+            await loop.run_in_executor(None, run_daily_job)
+        except Exception as e:
+            self.LOG.error(f"shen_shi_download_task error:{e}")
+
     async def generate_and_send_ranking(self):
         try:
             receivers = self.gbm.get_group_list()