"""Download image posts from xiurenwang.cc into per-post folders.

Each post is saved into a folder named after its ``No.<digits>`` number in
the current working directory; afterwards the working directory is handed
to ``generate_pdf_from_images`` (defined in ``xiuren.xiuren_pdf``).
"""

import os
import random
import re
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

from xiuren.xiuren_pdf import generate_pdf_from_images

BASE_URL = 'https://www.xiurenwang.cc/'
LISTING_URL = 'https://www.xiurenwang.cc/bang?f=7'
USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
)
# Seconds to wait for a response before giving up; without a timeout a
# stalled server would block the whole run indefinitely.
REQUEST_TIMEOUT = 30

_POST_NUMBER_RE = re.compile(r'No\.(\d+)')


def get_html(url, session):
    """Fetch ``url`` with ``session`` and return the response body as text.

    Returns ``None`` (after printing the error) when the request fails,
    times out, or the server answers with an HTTP error status.
    """
    headers = {'User-Agent': USER_AGENT, 'Referer': BASE_URL}
    try:
        # NOTE(review): verify=False disables TLS certificate validation.
        # Kept as in the original; confirm it is still required for this site.
        response = session.get(
            url, headers=headers, verify=False, timeout=REQUEST_TIMEOUT
        )
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"请求 {url} 失败: {e}")
        return None


def parse_initial_page(html):
    """Extract post links from the listing page.

    Every ``<a>`` whose ``href`` ends in ``.html`` and whose text contains
    ``No.<digits>`` is kept. All matching posts are returned (not only the
    first few) so the caller can look for ones that are not downloaded yet.

    Returns a list of ``{'url': ..., 'number': ...}`` dicts, where
    ``number`` is the digit string taken from the link text.
    """
    soup = BeautifulSoup(html, 'html.parser')
    links = soup.find_all('a', href=lambda x: x and x.endswith('.html'))

    post_info = []
    for link in links:
        number_match = _POST_NUMBER_RE.search(link.text.strip())
        if number_match is None:
            continue
        # urljoin copes with relative, root-relative and absolute hrefs,
        # which plain string concatenation does not.
        url = urljoin(BASE_URL, link['href'])
        post_info.append({'url': url, 'number': number_match.group(1)})
    return post_info


def extract_post_details(html):
    """Parse a post page.

    Returns a tuple ``(title, visible_image_urls, total_images)``:

    * ``title`` - text of ``<title>``, or ``"未知标题"`` when absent;
    * ``visible_image_urls`` - ``data-original`` values of the ``<img>``
      tags inside ``<div id="image">`` (empty list when the div is missing);
    * ``total_images`` - first integer found in the first ``<i class="i1">``
      inside ``<div class="sp">``, or ``None`` when it cannot be found.
    """
    soup = BeautifulSoup(html, 'html.parser')
    title = soup.title.text.strip() if soup.title else "未知标题"

    # URLs of the images that are present in the page markup.
    visible_image_urls = []
    image_div = soup.find('div', id='image')
    if image_div is not None:
        images = image_div.find_all('img', {'data-original': True})
        visible_image_urls = [img.get('data-original') for img in images]

    # Total image count as shown on the page.
    total_images = None
    sp_div = soup.find('div', class_='sp')
    if sp_div is not None:
        counter_tag = sp_div.find('i', class_='i1')
        if counter_tag is not None:
            number_match = re.search(r'(\d+)', counter_tag.text.strip())
            if number_match:
                total_images = int(number_match.group(1))

    return title, visible_image_urls, total_images


def generate_image_urls(visible_image_urls, total_images):
    """Build the URL list for all images of a post.

    The file names of the visible images are read as integers; the
    smallest one is taken as the first image, and ``total_images``
    consecutive ``<number>.jpg`` URLs are generated under the directory of
    the first visible URL.

    Returns an empty list when the inputs are empty or when no visible
    URL has a numeric file name.
    """
    if not visible_image_urls or not total_images:
        print("未找到可见图片URL或总图片数")
        return []

    # Collect the numeric file names; skip anything that is not a number
    # instead of letting int() abort the whole run.
    numbers = []
    for url in visible_image_urls:
        if not url:
            continue
        stem = url.split('/')[-1].split('.')[0]
        try:
            numbers.append(int(stem))
        except ValueError:
            continue
    if not numbers:
        print("可见图片URL中没有数字编号")
        return []
    min_number = min(numbers)

    base_url = visible_image_urls[0].rsplit('/', 1)[0] + '/'
    # Only add a scheme when none is present; an explicit http:// URL must
    # not be turned into "https://http://...".
    if not base_url.startswith(('http://', 'https://')):
        base_url = 'https://' + base_url.lstrip('/')

    return [f"{base_url}{min_number + i}.jpg" for i in range(total_images)]


def download_image(image_url, filename, session, post_url):
    """Download ``image_url`` to ``filename``.

    ``post_url`` is sent as the ``Referer`` header. Request errors are
    printed, not raised. Returns ``True`` when the file was written and
    ``False`` when the request failed.
    """
    headers = {'User-Agent': USER_AGENT, 'Referer': post_url}
    try:
        # NOTE(review): verify=False disables TLS certificate validation.
        response = session.get(
            image_url, headers=headers, verify=False, timeout=REQUEST_TIMEOUT
        )
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"下载 {image_url} 失败: {e}")
        return False

    with open(filename, 'wb') as f:
        f.write(response.content)
    print(f"已下载 {image_url}")
    return True


def download_images(image_urls, output_dir, session, post_url, *,
                    delay_range=None):
    """Download every URL in ``image_urls`` into ``output_dir``.

    Files are named ``1.jpg``, ``2.jpg``, ... by position in the list;
    ``output_dir`` is created when missing. ``delay_range`` may be a
    ``(min_seconds, max_seconds)`` pair to sleep a random time after each
    download; by default there is no delay.

    Returns the number of images that were downloaded successfully.
    """
    if not image_urls:
        print("没有可下载的图片URL")
        return 0

    os.makedirs(output_dir, exist_ok=True)

    downloaded = 0
    for i, image_url in enumerate(image_urls):
        if not image_url:
            print(f"无效URL在索引 {i}")
            continue
        filename = os.path.join(output_dir, f"{i + 1}.jpg")
        if download_image(image_url, filename, session, post_url):
            downloaded += 1
        if delay_range is not None:
            time.sleep(random.uniform(*delay_range))
    return downloaded


def _process_post(post, session):
    """Download one post; return True when at least one image was saved."""
    post_url = post['url']
    post_number = post['number']
    output_dir = post_number

    # An existing local folder means the post was handled in an earlier run.
    if os.path.exists(output_dir):
        print(f"帖子 {post_number} 的文件夹已存在,跳过")
        return False

    post_html = get_html(post_url, session)
    if not post_html:
        print(f"无法获取帖子 {post_number} 的页面")
        return False

    title, visible_image_urls, total_images = extract_post_details(post_html)
    print(f"处理帖子 {post_number} - 标题: {title}, 总图片数: {total_images}")

    if not visible_image_urls or not total_images:
        print(f"帖子 {post_number} 缺少图片URL或总数,跳过")
        return False

    image_urls = generate_image_urls(visible_image_urls, total_images)
    if not image_urls:
        print(f"帖子 {post_number} 未生成图片URL,跳过")
        return False

    downloaded = download_images(image_urls, output_dir, session, post_url)
    if not downloaded:
        print(f"帖子 {post_number} 没有成功下载任何图片,跳过")
        # Remove the empty folder; otherwise the existence check above
        # would treat this post as already downloaded on every later run.
        try:
            os.rmdir(output_dir)
        except OSError:
            # Folder is missing or not empty - nothing to clean up.
            pass
        return False

    print(f"完成处理帖子 {post_number}")
    return True


def xiuren_dowload_pic(target_count=1):
    """Download up to ``target_count`` not-yet-downloaded posts.

    Walks the listing page in order, skipping posts whose folder already
    exists, until ``target_count`` posts have been downloaded. Returns
    ``None`` when the listing page cannot be fetched or contains no posts;
    otherwise returns the result of ``generate_pdf_from_images('.')``.
    """
    with requests.Session() as session:
        initial_html = get_html(LISTING_URL, session)
        if not initial_html:
            print("无法获取初始页面")
            return None

        post_info = parse_initial_page(initial_html)
        if not post_info:
            print("未找到有效帖子")
            return None

        processed_count = 0  # number of posts downloaded in this run
        for post in post_info:
            if processed_count >= target_count:
                break
            if _process_post(post, session):
                processed_count += 1

    # Generate PDFs from the downloaded posts.
    return generate_pdf_from_images('.')


if __name__ == '__main__':
    xiuren_dowload_pic()