# abot/xiuren/xiuren_dl.py

import os
import random
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup


def get_html(url, timeout=10):
    """Fetch ``url`` and return the response body as text.

    The request is sent with a desktop-browser ``User-Agent`` header.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The response text, or ``None`` when the request fails (connection
        error, timeout, or an HTTP error status). The failure is printed.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/114.0.0.0 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        # Turn 4xx/5xx answers into exceptions so they share the error path.
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None


def parse_initial_page(html):
    """Collect download info for the first two ``.html`` links on a page.

    Each link's text is split on whitespace; the first token must start with
    ``No.`` and the last token must end with ``P``. Links whose text does not
    match, or whose image count is not an integer, are skipped with a
    printed message.

    Args:
        html: Markup of the listing page.

    Returns:
        A list of dicts with the keys ``url`` (absolute post URL),
        ``number`` (the first token with ``No.`` removed, kept as a string)
        and ``total_images`` (the last token with ``P`` removed, as an int).
    """
    soup = BeautifulSoup(html, 'html.parser')
    posts = soup.find_all('a', href=lambda x: x and x.endswith('.html'))
    first_two_posts = posts[:2]
    post_info = []

    print(posts)
    for post in first_two_posts:
        text = post.text.strip()
        print(f"Post text: '{text}'")  # Debug output: inspect the actual link text
        if not text:
            print("Empty post text, skipping...")
            continue

        parts = text.split()
        if len(parts) < 2:
            print(f"Unexpected format in '{text}', skipping...")
            continue

        # Extract the post number and the total image count
        number = parts[0].replace('No.', '') if parts[0].startswith('No.') else None
        pages = parts[-1].replace('P', '') if parts[-1].endswith('P') else None

        if not number or not pages:
            print(f"Failed to parse number or pages from '{text}', skipping...")
            continue

        try:
            total_images = int(pages)
        except ValueError:
            print(f"Invalid total_images value in '{text}', skipping...")
            continue

        # urljoin copes with relative, root-relative ("/x.html") and absolute
        # hrefs; plain concatenation only produced a clean URL for the first.
        url = urljoin('https://www.xiurenwang.cc/', post['href'])
        post_info.append({'url': url, 'number': number, 'total_images': total_images})
        print(f"post_info:{post_info}")
    return post_info


def extract_title_and_first_image(html):
    """Return the page title and the ``src`` of the first matching image.

    An image matches when its ``src`` attribute contains ``pic/``.

    Args:
        html: Markup of a post page.

    Returns:
        ``(title, src)`` for the first matching ``<img>``, or
        ``(None, None)`` when no image matches. ``title`` is an empty string
        when the page has no ``<title>`` element.
    """
    soup = BeautifulSoup(html, 'html.parser')
    first_image = soup.find('img', src=lambda x: x and 'pic/' in x)
    if first_image is None:
        return None, None

    # A page without <title> makes soup.title None; guard it so a missing
    # title does not raise AttributeError and abort the download.
    title_tag = soup.title
    title = title_tag.text.strip() if title_tag is not None else ''
    return title, first_image['src']


def parse_image_url(src):
    """Return the integer encoded in the file name of an image URL.

    The file name is whatever follows the last ``/`` in ``src``; the number
    is the part of that name before its first ``.``.

    Args:
        src: Image URL or path, e.g. ``//host/pic/1/0045.jpg``.

    Returns:
        The numeric stem as an ``int``.

    Raises:
        ValueError: If the stem is not a valid integer literal.
    """
    _, _, basename = src.rpartition('/')
    stem, _, _ = basename.partition('.')
    return int(stem)


def download_image(image_url, filename, timeout=30):
    """Download ``image_url`` and write the response body to ``filename``.

    Request failures (connection error, timeout, HTTP error status) are
    printed and swallowed; nothing is written in that case.

    Args:
        image_url: Address of the image to fetch.
        filename: Path of the file to create or overwrite.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the whole run.
    """
    try:
        response = requests.get(image_url, timeout=timeout)
        response.raise_for_status()
        # The file is opened only after a successful response, so a failed
        # request never leaves an empty file behind.
        with open(filename, 'wb') as f:
            f.write(response.content)
    except requests.exceptions.RequestException as e:
        print(f"Error downloading {image_url}: {e}")

def download_images(image_urls, output_dir):
    """Download every URL into ``output_dir`` as ``1.jpg``, ``2.jpg``, ...

    Files are numbered by position in ``image_urls``. The function sleeps
    for a random 1-3 seconds after each download.

    Args:
        image_urls: Iterable of image addresses, in the order to save them.
        output_dir: Directory to store the files in; created if missing.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)
    for index, image_url in enumerate(image_urls, start=1):
        filename = os.path.join(output_dir, f"{index}.jpg")
        download_image(image_url, filename)
        # Randomized pause between requests.
        time.sleep(random.uniform(1, 3))


def main():
    """Download the image sets of the posts found on the listing page.

    For each post returned by ``parse_initial_page`` the post page is
    fetched and its first matching image located. The remaining image URLs
    are generated by counting up from that image's numeric file name, and
    everything is saved into a directory named after the post number.
    Posts that cannot be fetched or parsed are skipped.
    """
    initial_url = 'https://www.xiurenwang.cc/bang?f=7'
    initial_html = get_html(initial_url)
    if not initial_html:
        return

    post_info = parse_initial_page(initial_html)
    if not post_info:
        print("No valid posts found.")
        return

    for post in post_info:
        post_url = post['url']
        post_number = post['number']
        total_images = post['total_images']

        print(f"Processing post {post_number} with {total_images} images...")
        post_html = get_html(post_url)
        if not post_html:
            continue

        title, first_image_src = extract_title_and_first_image(post_html)
        if not first_image_src:
            print(f"No image found for post {post_number}")
            continue

        try:
            starting_number = parse_image_url(first_image_src)
        except ValueError:
            # A non-numeric file name must not abort the remaining posts.
            print(f"Unexpected image filename in '{first_image_src}' "
                  f"for post {post_number}, skipping...")
            continue

        # Resolve the src against the post URL so protocol-relative
        # ("//host/..."), root-relative and absolute values all yield a valid
        # absolute URL; blindly prefixing "https:" only worked for the first.
        absolute_src = urljoin(post_url, first_image_src)
        full_base_url = absolute_src.rsplit('/', 1)[0] + '/'

        # Generate image URLs by counting up from the first image's number.
        # NOTE(review): zero padding in the original file name (e.g. "0045")
        # is dropped by the int conversion -- confirm the site does not pad.
        image_urls = [
            f"{full_base_url}{starting_number + offset}.jpg"
            for offset in range(total_images)
        ]

        # The post number doubles as the output directory name.
        output_dir = post_number
        download_images(image_urls, output_dir)

        print(f"Downloaded {total_images} images for post {post_number}")


# Run the downloader only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()