"""Scrape image posts from xiurenwang.cc and download their images.

The script fetches a listing page, parses the first posts found on it
(link text shaped like ``No.<number> ... <count>P``), opens each post, locates
the first image whose ``src`` contains ``pic/`` and then downloads
``<count>`` sequentially numbered ``.jpg`` files starting at that image.
"""

import os
import random
import re
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

BASE_URL = 'https://www.xiurenwang.cc/'

# Seconds to wait for a server before giving up; without a timeout
# ``requests`` may block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 30

# Browser-like User-Agent sent with every request (pages and images alike).
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/114.0.0.0 Safari/537.36'
}


def get_html(url):
    """Fetch *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded response text, or ``None`` when the request fails
        (connection error, timeout or non-2xx status). Failures are printed.
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None
    return response.text


def parse_initial_page(html, limit=2):
    """Extract post descriptors from the listing page.

    Only anchors whose ``href`` ends with ``.html`` are considered. The anchor
    text is split on whitespace; its first token must start with ``No.`` and
    its last token must end with ``P`` (the image count).

    Args:
        html: Markup of the listing page.
        limit: Maximum number of anchors to inspect (``None`` for all).

    Returns:
        A list of dicts with the keys ``url``, ``number`` and
        ``total_images``. Anchors that do not match the expected format are
        skipped with a printed message.
    """
    soup = BeautifulSoup(html, 'html.parser')
    posts = soup.find_all('a', href=lambda x: x and x.endswith('.html'))
    post_info = []
    print(posts)  # debug output: every candidate anchor found on the page

    for post in posts[:limit]:
        text = post.text.strip()
        print(f"Post text: '{text}'")  # debug output: check the actual content
        if not text:
            print("Empty post text, skipping...")
            continue

        parts = text.split()
        if len(parts) < 2:
            print(f"Unexpected format in '{text}', skipping...")
            continue

        # Extract the post number and the total number of images.
        number = parts[0].replace('No.', '') if parts[0].startswith('No.') else None
        pages = parts[-1].replace('P', '') if parts[-1].endswith('P') else None
        if not number or not pages:
            print(f"Failed to parse number or pages from '{text}', skipping...")
            continue

        try:
            total_images = int(pages)
        except ValueError:
            print(f"Invalid total_images value in '{text}', skipping...")
            continue

        # urljoin copes with relative, root-relative and absolute hrefs alike.
        url = urljoin(BASE_URL, post['href'])
        post_info.append({'url': url, 'number': number, 'total_images': total_images})

    print(f"post_info:{post_info}")
    return post_info


def extract_title_and_first_image(html):
    """Return the page title and the ``src`` of the first matching image.

    Args:
        html: Markup of a post page.

    Returns:
        ``(title, src)`` for the first ``<img>`` whose ``src`` contains
        ``pic/``; ``title`` is an empty string when the page has no
        ``<title>`` element. ``(None, None)`` when no such image exists.
    """
    soup = BeautifulSoup(html, 'html.parser')
    images = soup.find_all('img', src=lambda x: x and 'pic/' in x)
    if not images:
        return None, None

    # A page without <title> must not abort the whole run.
    title = soup.title.text.strip() if soup.title else ''
    return title, images[0]['src']


def parse_image_url(src):
    """Return the number encoded in the file name of *src*.

    Args:
        src: Image URL or path such as ``//host/pic/123/45.jpg``.

    Returns:
        The part of the file name before the first ``.`` as an ``int``.

    Raises:
        ValueError: If that part of the file name is not an integer.
    """
    image_filename = src.split('/')[-1]
    return int(image_filename.split('.')[0])


def download_image(image_url, filename):
    """Download *image_url* and store the bytes in *filename*.

    Args:
        image_url: Absolute URL of the image.
        filename: Destination path; an existing file is overwritten.

    Returns:
        ``True`` when the file was written, ``False`` when the request or the
        write failed (the failure is printed, never raised).
    """
    try:
        response = requests.get(image_url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error downloading {image_url}: {e}")
        return False

    try:
        with open(filename, 'wb') as f:
            f.write(response.content)
    except OSError as e:
        print(f"Error saving {filename}: {e}")
        return False
    return True


def download_images(image_urls, output_dir):
    """Download every URL in *image_urls* into *output_dir*.

    Files are named ``1.jpg``, ``2.jpg``, ... in the order of *image_urls*.
    A random pause of 1-3 seconds follows each download attempt.

    Args:
        image_urls: Iterable of absolute image URLs.
        output_dir: Target directory; created when missing.

    Returns:
        The number of images that were saved successfully.
    """
    os.makedirs(output_dir, exist_ok=True)

    saved = 0
    for i, image_url in enumerate(image_urls):
        filename = os.path.join(output_dir, f"{i + 1}.jpg")
        if download_image(image_url, filename):
            saved += 1
        time.sleep(random.uniform(1, 3))
    return saved


def _safe_dir_name(name):
    """Reduce scraped text *name* to a single harmless path component."""
    # The name comes from page content, so strip path separators and
    # leading/trailing dots that could point outside the working directory.
    cleaned = re.sub(r'[^\w.-]', '_', name).strip('.')
    return cleaned or '_'


def _build_image_urls(first_image_src, page_url, total_images):
    """Return *total_images* sequential ``.jpg`` URLs starting at the first image.

    Raises:
        ValueError: If the file name of *first_image_src* is not numeric.
    """
    starting_number = parse_image_url(first_image_src)

    # Resolve protocol-relative, root-relative and absolute sources uniformly.
    absolute_src = urljoin(page_url, first_image_src)
    base_url = absolute_src.rsplit('/', 1)[0] + '/'

    # Keep zero-padding (e.g. 0045 -> 0046) so the first generated URL
    # reproduces the image that was actually found on the page.
    stem = first_image_src.split('/')[-1].split('.')[0]
    width = len(stem) if stem.startswith('0') else 0

    return [
        f"{base_url}{str(starting_number + i).zfill(width)}.jpg"
        for i in range(total_images)
    ]


def main():
    """Fetch the listing page and download the images of each parsed post."""
    initial_url = 'https://www.xiurenwang.cc/bang?f=7'
    initial_html = get_html(initial_url)
    if not initial_html:
        return

    post_info = parse_initial_page(initial_html)
    if not post_info:
        print("No valid posts found.")
        return

    for post in post_info:
        post_url = post['url']
        post_number = post['number']
        total_images = post['total_images']
        print(f"Processing post {post_number} with {total_images} images...")

        post_html = get_html(post_url)
        if not post_html:
            continue

        _title, first_image_src = extract_title_and_first_image(post_html)
        if not first_image_src:
            print(f"No image found for post {post_number}")
            continue

        try:
            image_urls = _build_image_urls(first_image_src, post_url, total_images)
        except ValueError:
            print(f"Could not parse image number from '{first_image_src}' "
                  f"for post {post_number}, skipping...")
            continue

        # One output directory per post, named after the post number.
        output_dir = _safe_dir_name(post_number)
        downloaded = download_images(image_urls, output_dir)
        # Report what was really saved, not what was merely requested.
        print(f"Downloaded {downloaded} images for post {post_number}")


if __name__ == '__main__':
    main()