# @@ -1,49 +1,145 @@  (unified-diff hunk header left over in the file; kept as a comment so the module parses)
import os
import random
import time
from urllib.parse import urljoin, urlsplit

import requests
from bs4 import BeautifulSoup
from lxml import etree
# Target listing URL and request headers.
# NOTE(review): this module-level fetch runs on import and overlaps with what
# main() does through get_html(); it looks like a remnant of an earlier
# version of the script -- confirm whether it is still needed.
url = "https://www.xiurenwang.cc/bang?f=7"
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    "Referer": "https://www.xiurenwang.cc/",
}

# Send the request and fetch the page content. A timeout is set so a stalled
# server cannot block the import forever.
response = requests.get(url, headers=headers, timeout=10)
response.encoding = "utf-8"  # force UTF-8 so the page text decodes correctly
def get_html(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely on a stalled connection.

    Returns:
        The decoded response body, or ``None`` when the request fails or the
        server answers with an HTTP error status (the error is printed).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/114.0.0.0 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None
    return response.text
# Parse the HTML of the listing page fetched above.
html = etree.HTML(response.text)

# Extract the detail-page links and the titles (assumes the newest galleries
# are on this listing page).
image_items = html.xpath('//div[@class="list"]/li/a[@class="img"]/@href')
titles = html.xpath('//div[@class="tit"]/a/text()')
def parse_initial_page(html, limit=2):
    """Collect the first *limit* usable posts from the listing page.

    A post is an ``<a>`` tag whose ``href`` ends in ``.html`` and whose text
    starts with a ``No.<number>`` token and ends with a ``<count>P`` token.
    Anchors that do not match (empty text, too few tokens, non-numeric count)
    are reported and skipped, and scanning continues until *limit* valid
    posts have been found, so leading image-only or navigation links no
    longer cause an empty result.

    Args:
        html: Markup of the listing page.
        limit: Maximum number of posts to return.

    Returns:
        A list of dicts with the keys ``'url'`` (absolute post URL),
        ``'number'`` (text after ``No.``) and ``'total_images'`` (int).
    """
    soup = BeautifulSoup(html, 'html.parser')
    posts = soup.find_all('a', href=lambda x: x and x.endswith('.html'))
    post_info = []
    seen_urls = set()
    print(posts)
    for post in posts:
        if len(post_info) >= limit:
            break

        text = post.text.strip()
        print(f"Post text: '{text}'")  # debug output: shows the actual anchor text
        if not text:
            print("Empty post text, skipping...")
            continue

        parts = text.split()
        if len(parts) < 2:
            print(f"Unexpected format in '{text}', skipping...")
            continue

        # Extract the post number and the total image count.
        head, tail = parts[0], parts[-1]
        number = head[len('No.'):] if head.startswith('No.') else None
        pages = tail[:-1] if tail.endswith('P') else None
        if not number or not pages:
            print(f"Failed to parse number or pages from '{text}', skipping...")
            continue

        try:
            total_images = int(pages)
        except ValueError:
            print(f"Invalid total_images value in '{text}', skipping...")
            continue

        # urljoin copes with relative, root-relative and absolute hrefs alike.
        url = urljoin('https://www.xiurenwang.cc/', post['href'])
        if url in seen_urls:
            # The same post may be linked more than once on the page.
            continue
        seen_urls.add(url)
        post_info.append({'url': url, 'number': number, 'total_images': total_images})

    print(f"post_info: {post_info}")
    return post_info


# --- Legacy top-level download of the newest gallery ------------------------
# NOTE(review): these statements were interleaved with parse_initial_page()
# in the source and appear to belong to an earlier, script-style version of
# this file. They run on import and rely on the module-level ``headers``,
# ``image_items`` and ``titles`` -- confirm whether they should be removed.

# Create the folder the images are saved into.
save_dir = "./xiuren_images"
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

# Only handle the newest entry (assumes the first one is the newest).
if image_items:
    latest_url = "https://www.xiurenwang.cc" + image_items[0]  # build the detail-page URL
    latest_title = titles[0] if titles else "latest_image"

    # Visit the detail page to collect the image links.
    detail_response = requests.get(latest_url, headers=headers)
    detail_html = etree.HTML(detail_response.text)
    image_urls = detail_html.xpath('//div[@id="image"]/a/@href')

    # Download the images.
    for idx, img_url in enumerate(image_urls):
        img_response = requests.get(img_url, headers=headers)
        img_name = f"{latest_title}_{idx + 1}.jpg"
        img_path = os.path.join(save_dir, img_name.replace('/', '_'))  # avoid slashes in the file name
        with open(img_path, "wb") as f:
            f.write(img_response.content)
        print(f"已下载: {img_path}")
else:
    print("未找到图片链接, 可能需要调整XPath或检查网站结构。")

print("最新图片下载完成!")
def extract_title_and_first_image(html):
    """Return the page title and the ``src`` of the first gallery image.

    A gallery image is an ``<img>`` tag whose ``src`` contains ``pic/``.

    Args:
        html: Markup of a post page.

    Returns:
        ``(title, src)`` for the first matching image, or ``(None, None)``
        when the page has no matching image. ``title`` is ``None`` when the
        page has no ``<title>`` element.
    """
    soup = BeautifulSoup(html, 'html.parser')
    first_image = soup.find('img', src=lambda x: x and 'pic/' in x)
    if first_image is None:
        return None, None

    # A page without <title> must not crash the scraper: soup.title is None
    # in that case.
    title = soup.title.text.strip() if soup.title else None
    return title, first_image['src']
def parse_image_url(src):
    """Return the integer encoded in the file name of an image URL.

    Only the path component of *src* is inspected, so a query string or
    fragment (even one containing ``/``) does not disturb the result.

    Args:
        src: Image address such as ``//host/pic/1234/57.jpg``.

    Returns:
        The part of the file name before the first ``.`` as an ``int``.

    Raises:
        ValueError: If that part of the file name is not an integer.
    """
    path = urlsplit(src).path
    image_filename = path.rsplit('/', 1)[-1]
    stem = image_filename.split('.')[0]
    try:
        return int(stem)
    except ValueError as err:
        raise ValueError(
            f"Cannot derive an image number from {src!r}"
        ) from err
def download_image(image_url, filename, headers=None, timeout=10):
    """Download *image_url* and write the response body to *filename*.

    Args:
        image_url: Address of the image.
        filename: Path of the file to write (opened in binary mode).
        headers: Optional request headers. Defaults to a browser-like
            ``User-Agent``, matching what the page fetches send.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        ``True`` when the image was saved, ``False`` when the request failed
        (the error is printed). Nothing is written on failure.
    """
    if headers is None:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/114.0.0.0 Safari/537.36'
        }
    try:
        response = requests.get(image_url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error downloading {image_url}: {e}")
        return False

    # File-system errors are intentionally not swallowed here.
    with open(filename, 'wb') as f:
        f.write(response.content)
    return True
def download_images(image_urls, output_dir, delay_range=(1, 3)):
    """Download every URL in *image_urls* into *output_dir*.

    Files are named ``1.jpg``, ``2.jpg``, ... in iteration order. A random
    pause is inserted between consecutive downloads to avoid hammering the
    server; no pause follows the last download.

    Args:
        image_urls: Iterable of image addresses.
        output_dir: Directory to save into; created if it does not exist.
        delay_range: ``(low, high)`` bounds in seconds for the random pause.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_dir, exist_ok=True)
    for i, image_url in enumerate(image_urls):
        if i:
            # Pause before every download except the first one.
            time.sleep(random.uniform(*delay_range))
        filename = os.path.join(output_dir, f"{i + 1}.jpg")
        download_image(image_url, filename)
def main():
    """Scrape the listing page and download the images of each parsed post.

    For every post returned by ``parse_initial_page`` the post page is
    fetched, the first gallery image is located, and the remaining image
    URLs are generated by counting upwards from the number in that image's
    file name. Images are saved into a directory named after the post number.
    Posts whose page cannot be fetched or parsed are skipped.
    """
    initial_url = 'https://www.xiurenwang.cc/bang?f=7'
    initial_html = get_html(initial_url)
    if not initial_html:
        return

    post_info = parse_initial_page(initial_html)
    if not post_info:
        print("No valid posts found.")
        return

    for post in post_info:
        post_url = post['url']
        post_number = post['number']
        total_images = post['total_images']
        print(f"Processing post {post_number} with {total_images} images...")

        post_html = get_html(post_url)
        if not post_html:
            continue

        _title, first_image_src = extract_title_and_first_image(post_html)
        if not first_image_src:
            print(f"No image found for post {post_number}")
            continue

        try:
            starting_number = parse_image_url(first_image_src)
        except ValueError as e:
            # One oddly named image must not abort the remaining posts.
            print(f"Cannot determine first image number for post {post_number}: {e}")
            continue

        # Resolve the image address against the post URL so protocol-relative,
        # root-relative and absolute sources all yield a valid absolute URL.
        first_image_url = urljoin(post_url, first_image_src)

        # Generate the image URLs: urljoin with a bare file name replaces the
        # last path segment (and drops any query string) of the first image.
        image_urls = [
            urljoin(first_image_url, f"{starting_number + i}.jpg")
            for i in range(total_images)
        ]

        # Create the output directory name. The post number comes from scraped
        # text, so path separators and leading dots are neutralised to keep
        # the downloads inside the working directory.
        output_dir = (
            post_number.replace('/', '_').replace('\\', '_').lstrip('.') or 'post'
        )
        download_images(image_urls, output_dir)
        print(f"Downloaded {total_images} images for post {post_number}")
# Run the scraper only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()