加入绅士R15的内容

2025-12-10 14:53:54 +08:00
parent 33897dc1cd
commit e204610bc5
3 changed files with 239 additions and 0 deletions
--- a/main.py
+++ b/main.py
@@ -119,6 +119,11 @@ def jobs(robot: Robot):
    async def xiuren_download_job():
        await robot.xiu_ren_download_task()
    # ✅ Runs at 02:30 (presumably daily, per async_job.at_times): download new Shenshi R15 posts
    @async_job.at_times(["2:30"])
    async def shenshiR15_download_job():
        """Scheduled job that delegates to ``robot.shen_shi_download_task()``."""
        await robot.shen_shi_download_task()
    # ✅ 每天 17:30 发秀人 PDF（如果启用）
    # @async_job.at_times(["17:30"])
    # async def xiuren_pdf_send_job():
--- a/plugins/xiuren_image/shenshi_r15.py
+++ b/plugins/xiuren_image/shenshi_r15.py
@@ -0,0 +1,226 @@
 import os
 import time
 import requests
 import urllib3
 from bs4 import BeautifulSoup
 from urllib.parse import urljoin
 from io import BytesIO
 from PIL import Image
 # --- Configuration ---
 # Site root; also sent as the Referer header and used to absolutize image URLs.
 BASE_URL = "https://www.hentaiclub.net"
 # First list page scanned by run_daily_job().
 START_URL = "https://www.hentaiclub.net/sort/r15.html/1/"
 # Root directory under which one folder per post is created
 # (module-level so every function shares it).
 download_root = "/mnt/nfs_share"
 # Maximum number of new posts downloaded per run.
 DAILY_LIMIT = 10
 # Silence urllib3's InsecureRequestWarning: the requests in this module are
 # made with verify=False.
 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
 # Browser-like request headers sent with every request.
 HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Referer": BASE_URL
 }
 def get_post_id(url):
    """
    Extract the post ID from a post URL.

    The ID is the last path segment up to its first dot, for example
    ``.../64068.html`` -> ``"64068"``. A query string or fragment is removed
    first so that slashes or dots inside it are not mistaken for part of
    the path.

    Args:
        url: Post URL (absolute or relative).

    Returns:
        The ID as a string, or None when ``url`` is not a string or the
        resulting ID is empty.
    """
    # Explicit type check instead of a bare ``except:``, which would also
    # swallow KeyboardInterrupt / SystemExit.
    if not isinstance(url, str):
        return None
    path = url.split('#', 1)[0].split('?', 1)[0]
    filename = path.split('/')[-1]
    # An empty ID (e.g. a URL ending in "/") is reported as None.
    return filename.split('.')[0] or None
 def is_downloaded(post_id):
    """
    Tell whether a folder for ``post_id`` already exists under download_root.

    A folder counts as a match when its name is exactly the ID or starts
    with ``"<ID>_"`` (the ``ID_title`` naming scheme). A missing or
    unreadable root directory is treated as "not downloaded".
    """
    if os.path.exists(download_root):
        try:
            entries = os.listdir(download_root)
        except OSError as err:
            print(f"无法读取目录 {download_root}: {err}")
        else:
            # Match either "ID" or "ID_title".
            marker = f"{post_id}_"
            return any(
                name == post_id or name.startswith(marker)
                for name in entries
            )
    return False
 def get_soup(url):
    """
    Fetch ``url`` and return its HTML parsed with BeautifulSoup.

    The request is sent with HEADERS, a 10 second timeout and SSL
    verification disabled. Any failure (network error, HTTP error status,
    parsing error) is printed and reported as None.
    """
    soup = None
    try:
        page = requests.get(url, headers=HEADERS, timeout=10, verify=False)
        page.raise_for_status()
        soup = BeautifulSoup(page.text, 'html.parser')
    except Exception as err:
        print(f"请求失败: {url} - 错误: {err}")
    return soup
 def download_image_as_jpg(img_url, folder_path, file_name):
    """
    Download an image and store it as a JPEG file using Pillow.

    Nothing is done when the target file already exists. The image is first
    written to ``<file_name>.part`` and renamed into place only after a
    successful save, so an interrupted or failed save never leaves a
    truncated ``.jpg`` that later runs would mistake for a finished download.

    Args:
        img_url: Absolute URL of the image.
        folder_path: Existing directory that receives the file.
        file_name: Target file name (expected to end in ``.jpg``).

    Returns:
        None. Failures are printed, not raised (best-effort download).
    """
    # Modes the Pillow JPEG writer accepts as-is; everything else
    # (RGBA, P, LA, PA, I, F, ...) is converted to RGB before saving.
    jpeg_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")
    tmp_path = None
    try:
        file_path = os.path.join(folder_path, file_name)
        if os.path.exists(file_path):
            return
        # verify=False: certificate errors are deliberately ignored.
        resp = requests.get(img_url, headers=HEADERS, timeout=20, verify=False)
        resp.raise_for_status()
        tmp_path = file_path + ".part"
        # Decode from memory; the context manager releases the image handle.
        with Image.open(BytesIO(resp.content)) as src:
            img = src if src.mode in jpeg_modes else src.convert("RGB")
            # The format is passed explicitly because the temporary name
            # does not end in ".jpg".
            img.save(tmp_path, "JPEG", quality=90)
        os.replace(tmp_path, file_path)
    except Exception as e:
        print(f"    -> 图片处理失败 {img_url}: {e}")
        # Remove a half-written temporary file, if any.
        if tmp_path is not None:
            try:
                os.remove(tmp_path)
            except OSError:
                pass
 def parse_detail_page(post_url, post_title, post_id):
    """
    Parse a post's detail page and download all of its images.

    Images are saved as ``001.jpg``, ``002.jpg``, ... inside
    ``<download_root>/<post_id>_<sanitized title>``.

    Because is_downloaded() treats the mere existence of that folder as
    "already downloaded", the folder is only created once the page is known
    to contain image items, and it is removed again if it is still empty
    after the download attempts.

    Args:
        post_url: URL of the detail page.
        post_title: Human-readable title used in the folder name.
        post_id: Unique post ID used as the folder-name prefix.

    Returns:
        True when the post was processed and at least one file is present
        in its folder, False otherwise.
    """
    # Keep only characters that are safe in a directory name.
    safe_title = "".join(
        c for c in post_title if c.isalnum() or c in (' ', '-', '_')
    ).strip()
    folder_name = f"{post_id}_{safe_title}"
    post_dir = os.path.join(download_root, folder_name)
    print(f"正在处理: {folder_name}")
    soup = get_soup(post_url)
    if not soup:
        return False
    items = soup.select('#masonry .post-item')
    if not items:
        # Do not create a folder: an empty one would mark the post as done.
        print(f"  未找到图片，跳过: {post_url}")
        return False
    try:
        os.makedirs(post_dir, exist_ok=True)
    except OSError as e:
        print(f"无法创建目录 {post_dir}: {e}")
        return False
    print(f"  包含 {len(items)} 张图片，开始下载并转为 JPG...")
    for index, item in enumerate(items, start=1):
        # Prefer the full-size image link.
        img_url = item.get('data-src')
        if not img_url:
            img_tag = item.find('img')
            if img_tag:
                img_url = img_tag.get('data-original') or img_tag.get('src')
        if not img_url:
            continue
        # Turn site-relative links into absolute URLs.
        if not img_url.startswith('http'):
            img_url = urljoin(BASE_URL, img_url)
        # Files are always named *.jpg, numbered by position on the page.
        download_image_as_jpg(img_url, post_dir, f"{index:03d}.jpg")
    try:
        if not os.listdir(post_dir):
            # Every download failed: drop the empty folder so that the post
            # is retried by a later run.
            os.rmdir(post_dir)
            print(f"  没有成功下载任何图片，已移除空目录: {post_dir}")
            return False
    except OSError as e:
        print(f"无法检查目录 {post_dir}: {e}")
    print("  完成.\n")
    return True
 def run_daily_job():
    """
    Download up to DAILY_LIMIT posts that are not on disk yet.

    Starting at START_URL, each list page is scanned; posts whose ID already
    has a folder under download_root are skipped, the others are downloaded
    through parse_detail_page(). Scanning follows the "next" pager link until
    the limit is reached, a page cannot be fetched, a page is empty, there is
    no next page, or the pager points back to a page that was already
    visited.

    Post and pager links are resolved against the current list page, so
    relative hrefs work as well as absolute ones.
    """
    current_download_count = 0
    current_page_url = START_URL
    # Guards against an endless loop when the pager links to a visited page.
    visited_pages = {current_page_url}
    print(f"下载目录: {download_root}")
    while current_download_count < DAILY_LIMIT:
        print(f"正在扫描列表页: {current_page_url}")
        soup = get_soup(current_page_url)
        if not soup:
            break
        items = soup.select('#masonry .item')
        if not items:
            print("本页无内容。")
            break
        for item in items:
            link_tag = item.select_one('a.item-link')
            if not link_tag:
                continue
            href = link_tag.get('href')
            # 1. Extract the ID (None/empty for missing or unusable hrefs).
            post_id = get_post_id(href)
            if not post_id:
                continue
            # 2. Skip posts that already have a folder on disk.
            if is_downloaded(post_id):
                continue
            # 3. Download the new post.
            title_div = link_tag.select_one('.item-link-text')
            title = title_div.get_text(strip=True) if title_div else "未命名"
            post_url = urljoin(current_page_url, href)
            if not parse_detail_page(post_url, title, post_id):
                continue
            current_download_count += 1
            print(f"=== 进度: {current_download_count}/{DAILY_LIMIT} ===\n")
            if current_download_count >= DAILY_LIMIT:
                print(f"=== 今日任务已完成 ({DAILY_LIMIT}个) ===")
                return
            time.sleep(2)  # throttle requests between posts
        # Pagination
        next_page = soup.select_one('.page-navigator .next a')
        next_href = next_page.get('href') if next_page else None
        if not next_href:
            print("已到达最后一页，没有更多帖子了。")
            break
        next_url = urljoin(current_page_url, next_href)
        if next_url in visited_pages:
            print(f"检测到重复的分页链接，停止翻页: {next_url}")
            break
        visited_pages.add(next_url)
        current_page_url = next_url
        time.sleep(1)
 if __name__ == "__main__":
    # Script entry point: make sure the download root exists, then run one
    # download pass.
    if not os.path.exists(download_root):
        try:
            # exist_ok=True tolerates the directory appearing between the
            # check above and this call (e.g. the share being mounted).
            os.makedirs(download_root, exist_ok=True)
            print(f"创建目录成功: {download_root}")
        except OSError as e:
            print(f"错误: 无法创建根目录 {download_root}。请检查权限或手动挂载 NFS。")
            print(f"系统报错: {e}")
            # SystemExit instead of the site-provided exit() helper, which
            # is meant for interactive use and may be absent (python -S).
            raise SystemExit(1)
    print(f"开始任务：下载 {DAILY_LIMIT} 个新帖子 (JPG格式, 忽略SSL)")
    run_daily_job()
--- a/robot.py
+++ b/robot.py
@@ -20,6 +20,7 @@ from configuration import Config
 from db.connection import DBConnectionManager
 from db.contacts_db import ContactsDBOperator
 from plugins.xiuren_image.meitu_dl import meitu_dowload_pub_pic
 from plugins.xiuren_image.shenshi_r15 import run_daily_job
 from utils.email_util import EmailSender
 from utils.revoke.message_auto_revoke import MessageAutoRevoke
 from utils.robot_cmd.robot_command import GroupBotManager, Feature, PermissionStatus
@@ -709,6 +710,13 @@ class Robot:
        except Exception as e:
            self.LOG.error(f"xiu_ren_download_task error：{e}")
    async def shen_shi_download_task(self):
        """
        Run the Shenshi R15 download job without blocking the event loop.

        ``run_daily_job`` is a synchronous function, so it is executed in the
        loop's default executor and awaited from there. Any exception is
        logged through ``self.LOG`` and not propagated.
        """
        # Local import keeps this method self-contained.
        import asyncio
        try:
            # Calling run_daily_job() directly would stall every other
            # coroutine until the whole download pass finishes.
            loop = asyncio.get_running_loop()
            await loop.run_in_executor(None, run_daily_job)
        except Exception as e:
            self.LOG.error(f"shen_shi_download_task error：{e}")
    async def generate_and_send_ranking(self):
        try:
            receivers = self.gbm.get_group_list()