自动任务发放功能调整

2025-02-24 15:05:45 +08:00
parent 7dddaedb9a
commit de8eb09ff1
4 changed files with 139 additions and 39 deletions
--- a/xiuren/xiuren_dl.py
+++ b/xiuren/xiuren_dl.py
@@ -1,49 +1,145 @@
 import requests
-from lxml import etree
+from bs4 import BeautifulSoup
 import os
+import time
+import random

-# 设置目标URL和请求头
-url = "https://www.xiurenwang.cc/bang?f=7"
-headers = {
-    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
-    "Referer": "https://www.xiurenwang.cc/"
-}

-# 发送请求获取网页内容
-response = requests.get(url, headers=headers)
-response.encoding = "utf-8"  # 确保正确解码
+def get_html(url):
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
+                      'AppleWebKit/537.36 (KHTML, like Gecko) '
+                      'Chrome/114.0.0.0 Safari/537.36'
+    }
+    try:
+        response = requests.get(url, headers=headers)
+        response.raise_for_status()
+        return response.text
+    except requests.exceptions.RequestException as e:
+        print(f"Error fetching {url}: {e}")
+        return None

-# 解析HTML
-html = etree.HTML(response.text)

-# 提取图片链接和标题（假设最新图片在列表页面中）
-image_items = html.xpath('//div[@class="list"]/li/a[@class="img"]/@href')
-titles = html.xpath('//div[@class="tit"]/a/text()')
+def parse_initial_page(html):
+    soup = BeautifulSoup(html, 'html.parser')
+    posts = soup.find_all('a', href=lambda x: x and x.endswith('.html'))
+    first_two_posts = posts[:2]
+    post_info = []

-# 创建保存图片的文件夹
-save_dir = "./xiuren_images"
-if not os.path.exists(save_dir):
-    os.makedirs(save_dir)
+    print(posts)
+    for post in first_two_posts:
+        text = post.text.strip()
+        print(f"Post text: '{text}'")  # 调试输出，检查实际内容
+        if not text:
+            print("Empty post text, skipping...")
+            continue

-# 只获取最新的一个条目（假设第一个是最新的）
-if image_items:
-    latest_url = "https://www.xiurenwang.cc" + image_items[0]  # 拼接详情页URL
-    latest_title = titles[0] if titles else "latest_image"
+        parts = text.split()
+        if len(parts) < 2:
+            print(f"Unexpected format in '{text}', skipping...")
+            continue

-    # 访问详情页获取图片
-    detail_response = requests.get(latest_url, headers=headers)
-    detail_html = etree.HTML(detail_response.text)
-    image_urls = detail_html.xpath('//div[@id="image"]/a/@href')
+        # 提取编号和图片总数
+        number = parts[0].replace('No.', '') if parts[0].startswith('No.') else None
+        pages = parts[-1].replace('P', '') if parts[-1].endswith('P') else None

-    # 下载图片
-    for idx, img_url in enumerate(image_urls):
-        img_response = requests.get(img_url, headers=headers)
-        img_name = f"{latest_title}_{idx + 1}.jpg"
-        img_path = os.path.join(save_dir, img_name.replace('/', '_'))  # 避免文件名中的斜杠
-        with open(img_path, "wb") as f:
-            f.write(img_response.content)
-        print(f"已下载: {img_path}")
-else:
-    print("未找到图片链接，可能需要调整XPath或检查网站结构。")
+        if not number or not pages:
+            print(f"Failed to parse number or pages from '{text}', skipping...")
+            continue

-print("最新图片下载完成！")
+        try:
+            total_images = int(pages)
+            url = 'https://www.xiurenwang.cc/' + post['href']
+            post_info.append({'url': url, 'number': number, 'total_images': total_images})
+        except ValueError:
+            print(f"Invalid total_images value in '{text}', skipping...")
+            continue
+        print(f"post_info:{post_info}")
+    return post_info
+
+
+def extract_title_and_first_image(html):
+    soup = BeautifulSoup(html, 'html.parser')
+    title = soup.title.text.strip()
+    images = soup.find_all('img', src=lambda x: x and 'pic/' in x)
+    if images:
+        first_image = images[0]
+        first_image_src = first_image['src']
+        return title, first_image_src
+    else:
+        return None, None
+
+
+def parse_image_url(src):
+    image_filename = src.split('/')[-1]
+    starting_number = int(image_filename.split('.')[0])
+    return starting_number
+
+
+def download_image(image_url, filename):
+    try:
+        response = requests.get(image_url)
+        response.raise_for_status()
+        with open(filename, 'wb') as f:
+            f.write(response.content)
+    except requests.exceptions.RequestException as e:
+        print(f"Error downloading {image_url}: {e}")
+
+
+def download_images(image_urls, output_dir):
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+    for i, image_url in enumerate(image_urls):
+        filename = os.path.join(output_dir, f"{i + 1}.jpg")
+        download_image(image_url, filename)
+        time.sleep(random.uniform(1, 3))
+
+
+def main():
+    initial_url = 'https://www.xiurenwang.cc/bang?f=7'
+    initial_html = get_html(initial_url)
+    if not initial_html:
+        return
+
+    post_info = parse_initial_page(initial_html)
+    if not post_info:
+        print("No valid posts found.")
+        return
+
+    for post in post_info:
+        post_url = post['url']
+        post_number = post['number']
+        total_images = post['total_images']
+
+        print(f"Processing post {post_number} with {total_images} images...")
+        post_html = get_html(post_url)
+        if not post_html:
+            continue
+
+        title, first_image_src = extract_title_and_first_image(post_html)
+        if not first_image_src:
+            print(f"No image found for post {post_number}")
+            continue
+
+        starting_number = parse_image_url(first_image_src)
+
+        # Construct full base URL
+        base_url = first_image_src.rsplit('/', 1)[0] + '/'
+        full_base_url = 'https:' + base_url
+
+        # Generate image URLs
+        image_urls = []
+        for i in range(total_images):
+            image_number = starting_number + i
+            image_url = full_base_url + str(image_number) + '.jpg'
+            image_urls.append(image_url)
+
+        # Create output directory
+        output_dir = post_number
+        download_images(image_urls, output_dir)
+
+        print(f"Downloaded {total_images} images for post {post_number}")
+
+
+if __name__ == '__main__':
+    main()