调整日志信息
This commit is contained in:
@@ -1,5 +1,5 @@
|
|||||||
import random
|
import random
|
||||||
|
from loguru import logger
|
||||||
import requests
|
import requests
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
import time
|
import time
|
||||||
@@ -33,7 +33,7 @@ def fetch_posts(base_url,dl_path, posts_per_batch=10):
|
|||||||
|
|
||||||
post_elements = soup.select('posts.posts-item.card h2.item-heading a')
|
post_elements = soup.select('posts.posts-item.card h2.item-heading a')
|
||||||
if not post_elements:
|
if not post_elements:
|
||||||
print(f"页面 {page} 未找到帖子,停止爬取")
|
logger.info(f"页面 {page} 未找到帖子,停止爬取")
|
||||||
break
|
break
|
||||||
|
|
||||||
for post in post_elements:
|
for post in post_elements:
|
||||||
@@ -58,7 +58,7 @@ def fetch_posts(base_url,dl_path, posts_per_batch=10):
|
|||||||
page += 1
|
page += 1
|
||||||
time.sleep(1)
|
time.sleep(1)
|
||||||
except requests.RequestException as e:
|
except requests.RequestException as e:
|
||||||
print(f"请求 {url} 失败: {e}")
|
logger.info(f"请求 {url} 失败: {e}")
|
||||||
break
|
break
|
||||||
|
|
||||||
return posts
|
return posts
|
||||||
@@ -74,14 +74,14 @@ def get_total_pages(post_url):
|
|||||||
pages = [int(link.text) for link in page_links if link.text.isdigit()]
|
pages = [int(link.text) for link in page_links if link.text.isdigit()]
|
||||||
return max(pages) if pages else 1
|
return max(pages) if pages else 1
|
||||||
except requests.RequestException as e:
|
except requests.RequestException as e:
|
||||||
print(f"请求 {post_url} 失败,默认1页: {e}")
|
logger.info(f"请求 {post_url} 失败,默认1页: {e}")
|
||||||
return 1
|
return 1
|
||||||
|
|
||||||
|
|
||||||
def fetch_images(post_url):
|
def fetch_images(post_url):
|
||||||
images = []
|
images = []
|
||||||
total_pages = get_total_pages(post_url)
|
total_pages = get_total_pages(post_url)
|
||||||
print(f"帖子 {post_url} 共有 {total_pages} 页")
|
logger.info(f"帖子 {post_url} 共有 {total_pages} 页")
|
||||||
|
|
||||||
options = Options()
|
options = Options()
|
||||||
options.add_argument('--headless') # 使用新的headless模式
|
options.add_argument('--headless') # 使用新的headless模式
|
||||||
@@ -101,11 +101,11 @@ def fetch_images(post_url):
|
|||||||
# 如果本地没有chromedriver.exe,则使用默认方式
|
# 如果本地没有chromedriver.exe,则使用默认方式
|
||||||
if not os.path.exists(chrome_driver_path):
|
if not os.path.exists(chrome_driver_path):
|
||||||
driver = webdriver.Chrome(options=options)
|
driver = webdriver.Chrome(options=options)
|
||||||
print("使用默认ChromeDriver")
|
logger.debug("使用默认ChromeDriver")
|
||||||
else:
|
else:
|
||||||
from selenium.webdriver.chrome.service import Service
|
from selenium.webdriver.chrome.service import Service
|
||||||
driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options)
|
driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options)
|
||||||
print(f"使用本地ChromeDriver: {chrome_driver_path}")
|
logger.debug(f"使用本地ChromeDriver: {chrome_driver_path}")
|
||||||
|
|
||||||
for page in range(1, total_pages + 1):
|
for page in range(1, total_pages + 1):
|
||||||
url = f"{post_url}/{page}" if page > 1 else post_url
|
url = f"{post_url}/{page}" if page > 1 else post_url
|
||||||
@@ -118,7 +118,7 @@ def fetch_images(post_url):
|
|||||||
if img_url and img_url.startswith('http'):
|
if img_url and img_url.startswith('http'):
|
||||||
images.append(img_url)
|
images.append(img_url)
|
||||||
|
|
||||||
print(f"已爬取 {url},找到 {len(img_elements)} 张图片")
|
logger.info(f"已爬取 {url},找到 {len(img_elements)} 张图片")
|
||||||
|
|
||||||
driver.quit()
|
driver.quit()
|
||||||
return images
|
return images
|
||||||
@@ -146,7 +146,7 @@ def download_image(img_url, folder_path, img_index, max_retries=3):
|
|||||||
# 验证内容类型
|
# 验证内容类型
|
||||||
content_type = response.headers.get('content-type', '')
|
content_type = response.headers.get('content-type', '')
|
||||||
if not content_type.startswith('image/'):
|
if not content_type.startswith('image/'):
|
||||||
print(f"尝试 {attempt + 1}/{max_retries}: 返回内容不是图片类型 ({content_type}),等待后重试...")
|
logger.info(f"尝试 {attempt + 1}/{max_retries}: 返回内容不是图片类型 ({content_type}),等待后重试...")
|
||||||
time.sleep(2 * (attempt + 1))
|
time.sleep(2 * (attempt + 1))
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@@ -155,18 +155,18 @@ def download_image(img_url, folder_path, img_index, max_retries=3):
|
|||||||
img_path = os.path.join(folder_path, img_name)
|
img_path = os.path.join(folder_path, img_name)
|
||||||
|
|
||||||
img.save(img_path, 'JPEG', quality=95)
|
img.save(img_path, 'JPEG', quality=95)
|
||||||
print(f"已下载并转换为JPG: {img_path}")
|
logger.info(f"已下载并转换为JPG: {img_path}")
|
||||||
return True
|
return True
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"尝试 {attempt + 1}/{max_retries} 下载图片失败: {e}")
|
logger.warning(f"尝试 {attempt + 1}/{max_retries} 下载图片失败: {e}")
|
||||||
if attempt < max_retries - 1:
|
if attempt < max_retries - 1:
|
||||||
wait_time = random.uniform(2, 5) * (attempt + 1) # 随机递增等待时间
|
wait_time = random.uniform(2, 5) * (attempt + 1) # 随机递增等待时间
|
||||||
print(f"等待 {wait_time:.1f} 秒后重试...")
|
logger.info(f"等待 {wait_time:.1f} 秒后重试...")
|
||||||
time.sleep(wait_time)
|
time.sleep(wait_time)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
print(f"图片 {img_url} 下载失败,已达到最大重试次数")
|
logger.warning(f"图片 {img_url} 下载失败,已达到最大重试次数")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
@@ -175,17 +175,17 @@ def meitu_dowload_pic(dl_path, dl_url):
|
|||||||
if not os.path.exists(dl_path):
|
if not os.path.exists(dl_path):
|
||||||
os.makedirs(dl_path)
|
os.makedirs(dl_path)
|
||||||
|
|
||||||
print(f"开始爬取 {base_url} 的帖子...")
|
logger.info(f"开始爬取 {base_url} 的帖子...")
|
||||||
posts = fetch_posts(base_url,dl_path, 10)
|
posts = fetch_posts(base_url,dl_path, 10)
|
||||||
|
|
||||||
if not posts:
|
if not posts:
|
||||||
print("未获取到符合条件的帖子,请检查选择器或网络连接。")
|
logger.info("未获取到符合条件的帖子,请检查选择器或网络连接。")
|
||||||
return
|
return
|
||||||
|
|
||||||
print(f"成功选择 {len(posts)} 个未下载的帖子,开始下载图片...")
|
logger.info(f"成功选择 {len(posts)} 个未下载的帖子,开始下载图片...")
|
||||||
for i, post in enumerate(posts, 1):
|
for i, post in enumerate(posts, 1):
|
||||||
print(f"\n{i}. 标题: {post['title']}")
|
logger.info(f"\n{i}. 标题: {post['title']}")
|
||||||
print(f" 链接: {post['url']}")
|
logger.info(f" 链接: {post['url']}")
|
||||||
|
|
||||||
match = re.search(r'(?:[Nn][Oo]|[Vv][Oo][Ll])\.(\d+)', post['title']) # 支持 "No." 或 "
|
match = re.search(r'(?:[Nn][Oo]|[Vv][Oo][Ll])\.(\d+)', post['title']) # 支持 "No." 或 "
|
||||||
folder_name = match.group(1) if match else post['title']
|
folder_name = match.group(1) if match else post['title']
|
||||||
@@ -194,11 +194,11 @@ def meitu_dowload_pic(dl_path, dl_url):
|
|||||||
os.makedirs(folder_path, exist_ok=True) # 创建目录,exist_ok=True 避免重复创建报错
|
os.makedirs(folder_path, exist_ok=True) # 创建目录,exist_ok=True 避免重复创建报错
|
||||||
images = fetch_images(post['url'])
|
images = fetch_images(post['url'])
|
||||||
if images:
|
if images:
|
||||||
print(f"共找到 {len(images)} 张图片,开始下载...")
|
logger.info(f"共找到 {len(images)} 张图片,开始下载...")
|
||||||
for idx, img_url in enumerate(images, 1):
|
for idx, img_url in enumerate(images, 1):
|
||||||
# 增加随机延时
|
# 增加随机延时
|
||||||
if not download_image(img_url, folder_path, idx):
|
if not download_image(img_url, folder_path, idx):
|
||||||
print(f"图片 {img_url} 下载失败,继续下一张...")
|
logger.info(f"图片 {img_url} 下载失败,继续下一张...")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# 每个帖子之间增加随机延时
|
# 每个帖子之间增加随机延时
|
||||||
|
|||||||
Reference in New Issue
Block a user