加入绅士R15的内容

2025-12-10 14:53:54 +08:00
parent 33897dc1cd
commit e204610bc5
3 changed files with 239 additions and 0 deletions
--- a/main.py
+++ b/main.py
@@ -119,6 +119,11 @@ def jobs(robot: Robot):
    async def xiuren_download_job():
        await robot.xiu_ren_download_task()

+    # ✅ Download Shenshi R15 posts daily at 02:30
+    @async_job.at_times(["2:30"])
+    async def shenshiR15_download_job():
+        await robot.shen_shi_download_task()
+
    # ✅ 每天 17:30 发秀人 PDF（如果启用）
    # @async_job.at_times(["17:30"])
    # async def xiuren_pdf_send_job():
--- a/plugins/xiuren_image/shenshi_r15.py
+++ b/plugins/xiuren_image/shenshi_r15.py
@@ -0,0 +1,226 @@
+import os
+import time
+import requests
+import urllib3
+from bs4 import BeautifulSoup
+from urllib.parse import urljoin
+from io import BytesIO
+from PIL import Image
+
+# --- Configuration ---
+BASE_URL = "https://www.hentaiclub.net"
+# First listing page of the R15 category; the crawl starts here.
+START_URL = f"{BASE_URL}/sort/r15.html/1/"
+
+# Root directory that receives one sub-folder per downloaded post.
+download_root = "/mnt/nfs_share"
+
+# Maximum number of new posts fetched per run.
+DAILY_LIMIT = 10
+
+# Requests in this module pass verify=False; silence the resulting warning.
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+
+# Browser-like request headers sent with every request.
+HEADERS = {
+    "User-Agent": (
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+        "AppleWebKit/537.36 (KHTML, like Gecko) "
+        "Chrome/91.0.4472.124 Safari/537.36"
+    ),
+    "Referer": BASE_URL,
+}
+
+
+def get_post_id(url):
+    """Extract the post ID from a post URL.
+
+    Example: ``.../64068.html`` -> ``"64068"``.
+
+    Args:
+        url: Post URL, absolute or relative. A query string or fragment,
+            if present, is ignored.
+
+    Returns:
+        The part of the last path segment before the first dot, or ``None``
+        when *url* is not a string or that part is empty (for example a URL
+        ending in ``/``).
+    """
+    if not isinstance(url, str):
+        return None
+    # Drop "#fragment" and "?query" so they cannot leak into the ID.
+    path = url.split('#', 1)[0].split('?', 1)[0]
+    filename = path.split('/')[-1]
+    return filename.split('.')[0] or None
+
+
+def is_downloaded(post_id):
+    """Report whether ``download_root`` already holds an entry for *post_id*.
+
+    An entry matches when its name equals the ID or starts with ``"<ID>_"``.
+    A missing or unreadable root directory counts as "not downloaded".
+    """
+    # No root directory means nothing can have been downloaded yet.
+    if os.path.exists(download_root):
+        try:
+            entries = os.listdir(download_root)
+        except OSError as e:
+            print(f"无法读取目录 {download_root}: {e}")
+            return False
+
+        # Accept both "ID" and "ID_title" names.
+        id_prefix = f"{post_id}_"
+        return any(
+            name.startswith(id_prefix) or name == post_id for name in entries
+        )
+    return False
+
+
+def get_soup(url):
+    """Fetch *url* and return it parsed as a ``BeautifulSoup`` tree.
+
+    TLS certificate verification is disabled for the request. Any failure
+    (network error, HTTP error status, parse error) is printed and reported
+    as ``None`` instead of being raised.
+    """
+    parsed = None
+    try:
+        page = requests.get(url, headers=HEADERS, timeout=10, verify=False)
+        page.raise_for_status()
+        parsed = BeautifulSoup(page.text, 'html.parser')
+    except Exception as e:
+        print(f"请求失败: {url} - 错误: {e}")
+    return parsed
+
+
+def download_image_as_jpg(img_url, folder_path, file_name):
+    """Download one image and store it as a JPEG file.
+
+    Does nothing when the target file already exists. The image is written
+    to a temporary ``.part`` file first and only renamed to its final name
+    once saving succeeded, so an interrupted or failed save can never leave
+    a truncated file that later runs would mistake for a finished download.
+
+    Args:
+        img_url: Absolute URL of the image.
+        folder_path: Existing directory that receives the file.
+        file_name: Target file name inside *folder_path*.
+
+    Returns:
+        ``None``. Failures are printed, not raised (best effort).
+    """
+    tmp_path = None
+    try:
+        file_path = os.path.join(folder_path, file_name)
+
+        if os.path.exists(file_path):
+            return
+
+        # verify=False: certificate errors are deliberately ignored.
+        resp = requests.get(img_url, headers=HEADERS, timeout=20, verify=False)
+        resp.raise_for_status()
+
+        # Decode from memory; nothing touches the disk until the save below.
+        img = Image.open(BytesIO(resp.content))
+
+        # JPEG cannot store alpha or palette data. Convert every mode except
+        # the ones kept as-is (greyscale, RGB, CMYK), so that e.g. "LA"
+        # images do not fail on save.
+        if img.mode not in ("L", "RGB", "CMYK"):
+            img = img.convert("RGB")
+
+        tmp_path = f"{file_path}.part"
+        img.save(tmp_path, "JPEG", quality=90)
+        os.replace(tmp_path, file_path)
+
+    except Exception as e:
+        print(f"    -> 图片处理失败 {img_url}: {e}")
+        # Best-effort cleanup of a half-written temporary file.
+        if tmp_path is not None:
+            try:
+                os.remove(tmp_path)
+            except OSError:
+                pass
+
+
+def parse_detail_page(post_url, post_title, post_id):
+    """Download all images of one post into ``download_root``.
+
+    Images land in ``<download_root>/<post_id>_<sanitised title>/`` and are
+    named ``001.jpg``, ``002.jpg``, ... following their order on the page.
+
+    Args:
+        post_url: URL of the post's detail page.
+        post_title: Human-readable title; only used for the folder name.
+        post_id: Unique post ID; becomes the folder-name prefix.
+
+    Returns:
+        ``True`` if at least one image file is present in the post folder
+        afterwards; ``False`` if the page could not be fetched, listed no
+        images, the folder could not be created, or no image was saved.
+    """
+    # Keep only characters that are safe in a directory name.
+    safe_title = "".join(
+        c for c in post_title if c.isalnum() or c in (' ', '-', '_')
+    ).strip()
+    folder_name = f"{post_id}_{safe_title}"
+    post_dir = os.path.join(download_root, folder_name)
+
+    print(f"正在处理: {folder_name}")
+
+    soup = get_soup(post_url)
+    if soup is None:
+        return False
+
+    items = soup.select('#masonry .post-item')
+    print(f"  包含 {len(items)} 张图片，开始下载并转为 JPG...")
+    if not items:
+        # The folder's existence is what marks a post as downloaded, so do
+        # not create one for a page without images.
+        return False
+
+    try:
+        os.makedirs(post_dir, exist_ok=True)
+    except OSError as e:
+        print(f"无法创建目录 {post_dir}: {e}")
+        return False
+
+    saved = 0
+    for index, item in enumerate(items, start=1):
+        # Prefer the item's own data-src link, then fall back to the <img>.
+        img_url = item.get('data-src')
+        if not img_url:
+            img_tag = item.find('img')
+            if img_tag:
+                img_url = img_tag.get('data-original') or img_tag.get('src')
+        if not img_url:
+            continue
+
+        # Resolves relative links; absolute URLs pass through unchanged.
+        img_url = urljoin(BASE_URL, img_url)
+
+        # Always store with a .jpg name, numbered by position on the page.
+        file_name = f"{index:03d}.jpg"
+        download_image_as_jpg(img_url, post_dir, file_name)
+
+        # The downloader returns nothing, so check the result on disk.
+        if os.path.exists(os.path.join(post_dir, file_name)):
+            saved += 1
+
+    if saved == 0:
+        # Remove the empty folder so the post is retried on a later run
+        # instead of being treated as already downloaded.
+        try:
+            os.rmdir(post_dir)
+        except OSError:
+            pass
+        print(f"  未能保存任何图片: {folder_name}\n")
+        return False
+
+    print(f"  完成.\n")
+    return True
+
+
+def run_daily_job():
+    """Download up to ``DAILY_LIMIT`` posts that are not yet on disk.
+
+    Walks the listing pages starting at ``START_URL``. Posts that
+    ``is_downloaded`` reports as present are skipped; every post for which
+    ``parse_detail_page`` returns ``True`` counts towards the limit. The run
+    stops when the limit is reached, a listing page cannot be fetched or is
+    empty, or there is no further (unvisited) page.
+    """
+    current_download_count = 0
+    current_page_url = START_URL
+    # Guards against a "next" link that points back to a page already seen,
+    # which would otherwise loop forever.
+    visited_pages = set()
+
+    print(f"下载目录: {download_root}")
+
+    while current_download_count < DAILY_LIMIT:
+        if current_page_url in visited_pages:
+            print("已到达最后一页，没有更多帖子了。")
+            break
+        visited_pages.add(current_page_url)
+
+        print(f"正在扫描列表页: {current_page_url}")
+        soup = get_soup(current_page_url)
+        if soup is None:
+            break
+
+        items = soup.select('#masonry .item')
+        if not items:
+            print("本页无内容。")
+            break
+
+        for item in items:
+            if current_download_count >= DAILY_LIMIT:
+                break
+
+            link_tag = item.select_one('a.item-link')
+            if link_tag is None:
+                continue
+
+            href = link_tag.get('href')
+
+            # 1. Derive the post ID from the link.
+            post_id = get_post_id(href)
+            if not post_id:
+                continue
+
+            # 2. Skip posts that already have a folder in the download root.
+            if is_downloaded(post_id):
+                continue
+
+            # 3. Download the new post.
+            title_div = link_tag.select_one('.item-link-text')
+            title = title_div.get_text(strip=True) if title_div else "未命名"
+
+            # Listing links may be relative; resolve against the current page.
+            post_url = urljoin(current_page_url, href)
+
+            if parse_detail_page(post_url, title, post_id):
+                current_download_count += 1
+                print(f"=== 进度: {current_download_count}/{DAILY_LIMIT} ===\n")
+                time.sleep(2)  # throttle between posts
+
+        # Checked after the loop so the message also appears when the limit
+        # is hit on the last item of a page.
+        if current_download_count >= DAILY_LIMIT:
+            print(f"=== 今日任务已完成 ({DAILY_LIMIT}个) ===")
+            return
+
+        # Pagination: follow the "next" link, if any.
+        next_page = soup.select_one('.page-navigator .next a')
+        next_href = next_page.get('href') if next_page is not None else None
+        if not next_href:
+            print("已到达最后一页，没有更多帖子了。")
+            break
+        current_page_url = urljoin(current_page_url, next_href)
+        time.sleep(1)
+
+
+if __name__ == "__main__":
+    # Create the download root when it is missing; abort if that fails
+    # (the error message points at permissions or an unmounted NFS share).
+    if not os.path.exists(download_root):
+        try:
+            os.makedirs(download_root)
+            print(f"创建目录成功: {download_root}")
+        except OSError as e:
+            print(f"错误: 无法创建根目录 {download_root}。请检查权限或手动挂载 NFS。")
+            print(f"系统报错: {e}")
+            # exit() is injected by the site module and is not guaranteed to
+            # exist; SystemExit is always available and gives exit status 1.
+            raise SystemExit(1)
+
+    print(f"开始任务：下载 {DAILY_LIMIT} 个新帖子 (JPG格式, 忽略SSL)")
+    run_daily_job()
--- a/robot.py
+++ b/robot.py
@@ -20,6 +20,7 @@ from configuration import Config
 from db.connection import DBConnectionManager
 from db.contacts_db import ContactsDBOperator
 from plugins.xiuren_image.meitu_dl import meitu_dowload_pub_pic
+from plugins.xiuren_image.shenshi_r15 import run_daily_job
 from utils.email_util import EmailSender
 from utils.revoke.message_auto_revoke import MessageAutoRevoke
 from utils.robot_cmd.robot_command import GroupBotManager, Feature, PermissionStatus
@@ -709,6 +710,13 @@ class Robot:
        except Exception as e:
            self.LOG.error(f"xiu_ren_download_task error：{e}")

+    async def shen_shi_download_task(self):
+        """Run the Shenshi R15 download job without stalling the event loop.
+
+        ``run_daily_job`` is synchronous (blocking HTTP requests and
+        ``time.sleep`` calls), so it is handed to the loop's default executor
+        and awaited from there. Any exception is logged through ``self.LOG``
+        and swallowed.
+        """
+        # Local import keeps this method independent of file-level imports.
+        import asyncio
+
+        try:
+            loop = asyncio.get_running_loop()
+            await loop.run_in_executor(None, run_daily_job)
+        except Exception as e:
+            self.LOG.error(f"shen_shi_download_task error：{e}")
+
    async def generate_and_send_ranking(self):
        try:
            receivers = self.gbm.get_group_list()