feature：秀人网抓图功能开放

2025-02-24 16:23:44 +08:00
parent aae2d1beae
commit a2f4a9dfc7
3 changed files with 122 additions and 85 deletions
--- a/main.py
+++ b/main.py
@@ -47,6 +47,9 @@ def main(chat_type: int):

    # 游戏的定时任务每小时执行
    robot.onEveryTime("18:00", robot.game_auto_tasks)
+
+    # 秀人网每天自动下载帖子
+    robot.onEveryTime("03:00", robot.xiu_ren_download_task)
    # 让机器人一直跑
    robot.keepRunningAndBlockProcess()

--- a/robot.py
+++ b/robot.py
@@ -40,6 +40,7 @@ from message_report.write_db import write_to_db, generate_and_send_ranking
 from message_storage.message_to_db import archive_message, get_messages
 from message_summary.message_summary_4o import message_summary
 from sehuatang.shehuatang import pdf_file_path
+from xiuren.xiuren_dl import xiuren_dowload_pic


 class Robot(Job):
@@ -525,3 +526,9 @@ class Robot(Job):
                self.sendTextMsg(rep, gid)
        except Exception as e:
            self.LOG.error(f"message_summary_robot error：{e}")
+
+    def xiu_ren_download_task(self):
+        try:
+            xiuren_dowload_pic()
+        except Exception as e:
+            self.LOG.error(f"xiuren_dowload_pic error：{e}")
--- a/xiuren/xiuren_dl.py
+++ b/xiuren/xiuren_dl.py
@@ -3,143 +3,170 @@ from bs4 import BeautifulSoup
 import os
 import time
 import random
+import re


-def get_html(url):
+def get_html(url, session):
    headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
-                      'AppleWebKit/537.36 (KHTML, like Gecko) '
-                      'Chrome/114.0.0.0 Safari/537.36'
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
+        'Referer': 'https://www.xiurenwang.cc/'
    }
    try:
-        response = requests.get(url, headers=headers)
+        response = session.get(url, headers=headers, verify=False)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
-        print(f"Error fetching {url}: {e}")
+        print(f"请求 {url} 失败: {e}")
        return None


 def parse_initial_page(html):
    soup = BeautifulSoup(html, 'html.parser')
    posts = soup.find_all('a', href=lambda x: x and x.endswith('.html'))
-    first_two_posts = posts[:2]
+    # 取所有帖子，而不是仅前两个，以便后续查找未下载的帖子
    post_info = []

-    print(posts)
-    for post in first_two_posts:
+    for post in posts:
        text = post.text.strip()
-        print(f"Post text: '{text}'")  # 调试输出，检查实际内容
-        if not text:
-            print("Empty post text, skipping...")
-            continue
-
-        parts = text.split()
-        if len(parts) < 2:
-            print(f"Unexpected format in '{text}', skipping...")
-            continue
-
-        # 提取编号和图片总数
-        number = parts[0].replace('No.', '') if parts[0].startswith('No.') else None
-        pages = parts[-1].replace('P', '') if parts[-1].endswith('P') else None
-
-        if not number or not pages:
-            print(f"Failed to parse number or pages from '{text}', skipping...")
-            continue
-
-        try:
-            total_images = int(pages)
+        number_match = re.search(r'No\.(\d+)', text)
+        number = number_match.group(1) if number_match else None
+        if number:
            url = 'https://www.xiurenwang.cc/' + post['href']
-            post_info.append({'url': url, 'number': number, 'total_images': total_images})
-        except ValueError:
-            print(f"Invalid total_images value in '{text}', skipping...")
-            continue
-        print(f"post_info:{post_info}")
+            post_info.append({'url': url, 'number': number})
+
    return post_info


-def extract_title_and_first_image(html):
+def extract_post_details(html):
    soup = BeautifulSoup(html, 'html.parser')
-    title = soup.title.text.strip()
-    images = soup.find_all('img', src=lambda x: x and 'pic/' in x)
-    if images:
-        first_image = images[0]
-        first_image_src = first_image['src']
-        return title, first_image_src
-    else:
-        return None, None
+
+    title = soup.title.text.strip() if soup.title else "未知标题"
+
+    # 提取可见图片URL
+    image_div = soup.find('div', id='image')
+    visible_image_urls = []
+    if image_div:
+        images = image_div.find_all('img', {'data-original': True})
+        visible_image_urls = [img.get('data-original') for img in images]
+
+    # 提取总图片数量
+    total_images = None
+    sp_div = soup.find('div', class_='sp')
+    if sp_div:
+        i_tags = sp_div.find_all('i', class_='i1')
+        if i_tags:
+            total_text = i_tags[0].text.strip()
+            number_match = re.search(r'(\d+)', total_text)
+            total_images = int(number_match.group(1)) if number_match else None
+
+    return title, visible_image_urls, total_images


-def parse_image_url(src):
-    image_filename = src.split('/')[-1]
-    starting_number = int(image_filename.split('.')[0])
-    return starting_number
+def generate_image_urls(visible_image_urls, total_images):
+    if not visible_image_urls or not total_images:
+        print("未找到可见图片URL或总图片数")
+        return []
+
+    # 提取编号和基础路径
+    numbers = [int(url.split('/')[-1].split('.')[0]) for url in visible_image_urls]
+    min_number = min(numbers)
+    base_url = visible_image_urls[0].rsplit('/', 1)[0] + '/'
+
+    # 如果base_url已包含https://，不需要再次添加
+    if not base_url.startswith('https://'):
+        base_url = 'https://' + base_url.lstrip('/')
+
+    # 生成所有图片URL
+    image_urls = []
+    for i in range(total_images):
+        image_number = min_number + i
+        image_url = f"{base_url}{image_number}.jpg"
+        image_urls.append(image_url)
+
+    return image_urls


-def download_image(image_url, filename):
+def download_image(image_url, filename, session, post_url):
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
+        'Referer': post_url
+    }
    try:
-        response = requests.get(image_url)
+        response = session.get(image_url, headers=headers, verify=False)
        response.raise_for_status()
        with open(filename, 'wb') as f:
            f.write(response.content)
+        print(f"已下载 {image_url}")
    except requests.exceptions.RequestException as e:
-        print(f"Error downloading {image_url}: {e}")
+        print(f"下载 {image_url} 失败: {e}")


-def download_images(image_urls, output_dir):
+def download_images(image_urls, output_dir, session, post_url):
+    if not image_urls:
+        print("没有可下载的图片URL")
+        return
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    for i, image_url in enumerate(image_urls):
+        if not image_url:
+            print(f"无效URL在索引 {i}")
+            continue
        filename = os.path.join(output_dir, f"{i + 1}.jpg")
-        download_image(image_url, filename)
-        time.sleep(random.uniform(1, 3))
+        download_image(image_url, filename, session, post_url)
+        # time.sleep(random.uniform(1, 3))


-def main():
+def xiuren_dowload_pic():
+    session = requests.Session()
+
    initial_url = 'https://www.xiurenwang.cc/bang?f=7'
-    initial_html = get_html(initial_url)
+    initial_html = get_html(initial_url, session)
    if not initial_html:
+        print("无法获取初始页面")
        return

    post_info = parse_initial_page(initial_html)
    if not post_info:
-        print("No valid posts found.")
+        print("未找到有效帖子")
        return

+    processed_count = 0  # 记录已处理的帖子数量
+    target_count = 2  # 目标处理2个新帖子
+
    for post in post_info:
+        if processed_count >= target_count:
+            break
+
        post_url = post['url']
        post_number = post['number']
-        total_images = post['total_images']
-
-        print(f"Processing post {post_number} with {total_images} images...")
-        post_html = get_html(post_url)
-        if not post_html:
-            continue
-
-        title, first_image_src = extract_title_and_first_image(post_html)
-        if not first_image_src:
-            print(f"No image found for post {post_number}")
-            continue
-
-        starting_number = parse_image_url(first_image_src)
-
-        # Construct full base URL
-        base_url = first_image_src.rsplit('/', 1)[0] + '/'
-        full_base_url = 'https:' + base_url
-
-        # Generate image URLs
-        image_urls = []
-        for i in range(total_images):
-            image_number = starting_number + i
-            image_url = full_base_url + str(image_number) + '.jpg'
-            image_urls.append(image_url)
-
-        # Create output directory
        output_dir = post_number
-        download_images(image_urls, output_dir)

-        print(f"Downloaded {total_images} images for post {post_number}")
+        # 检查本地文件夹是否已存在
+        if os.path.exists(output_dir):
+            print(f"帖子 {post_number} 的文件夹已存在，跳过")
+            continue
+
+        post_html = get_html(post_url, session)
+        if not post_html:
+            print(f"无法获取帖子 {post_number} 的页面")
+            continue
+
+        title, visible_image_urls, total_images = extract_post_details(post_html)
+        print(f"处理帖子 {post_number} - 标题: {title}, 总图片数: {total_images}")
+
+        if not visible_image_urls or not total_images:
+            print(f"帖子 {post_number} 缺少图片URL或总数，跳过")
+            continue
+
+        image_urls = generate_image_urls(visible_image_urls, total_images)
+        if not image_urls:
+            print(f"帖子 {post_number} 未生成图片URL，跳过")
+            continue
+
+        download_images(image_urls, output_dir, session, post_url)
+        print(f"完成处理帖子 {post_number}")
+        processed_count += 1


 if __name__ == '__main__':
-    main()
+    xiuren_dowload_pic()