diff --git a/xiuren/meitu_dl.py b/xiuren/meitu_dl.py index ac7ea43..d5b0417 100644 --- a/xiuren/meitu_dl.py +++ b/xiuren/meitu_dl.py @@ -1,5 +1,5 @@ import random - +from loguru import logger import requests from bs4 import BeautifulSoup import time @@ -33,7 +33,7 @@ def fetch_posts(base_url,dl_path, posts_per_batch=10): post_elements = soup.select('posts.posts-item.card h2.item-heading a') if not post_elements: - print(f"页面 {page} 未找到帖子,停止爬取") + logger.info(f"页面 {page} 未找到帖子,停止爬取") break for post in post_elements: @@ -58,7 +58,7 @@ def fetch_posts(base_url,dl_path, posts_per_batch=10): page += 1 time.sleep(1) except requests.RequestException as e: - print(f"请求 {url} 失败: {e}") + logger.info(f"请求 {url} 失败: {e}") break return posts @@ -74,14 +74,14 @@ def get_total_pages(post_url): pages = [int(link.text) for link in page_links if link.text.isdigit()] return max(pages) if pages else 1 except requests.RequestException as e: - print(f"请求 {post_url} 失败,默认1页: {e}") + logger.info(f"请求 {post_url} 失败,默认1页: {e}") return 1 def fetch_images(post_url): images = [] total_pages = get_total_pages(post_url) - print(f"帖子 {post_url} 共有 {total_pages} 页") + logger.info(f"帖子 {post_url} 共有 {total_pages} 页") options = Options() options.add_argument('--headless') # 使用新的headless模式 @@ -101,11 +101,11 @@ def fetch_images(post_url): # 如果本地没有chromedriver.exe,则使用默认方式 if not os.path.exists(chrome_driver_path): driver = webdriver.Chrome(options=options) - print("使用默认ChromeDriver") + logger.debug("使用默认ChromeDriver") else: from selenium.webdriver.chrome.service import Service driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options) - print(f"使用本地ChromeDriver: {chrome_driver_path}") + logger.debug(f"使用本地ChromeDriver: {chrome_driver_path}") for page in range(1, total_pages + 1): url = f"{post_url}/{page}" if page > 1 else post_url @@ -118,7 +118,7 @@ def fetch_images(post_url): if img_url and img_url.startswith('http'): images.append(img_url) - print(f"已爬取 {url},找到 {len(img_elements)} 张图片") + logger.info(f"已爬取 {url},找到 {len(img_elements)} 张图片") driver.quit() return images @@ -146,7 +146,7 @@ def download_image(img_url, folder_path, img_index, max_retries=3): # 验证内容类型 content_type = response.headers.get('content-type', '') if not content_type.startswith('image/'): - print(f"尝试 {attempt + 1}/{max_retries}: 返回内容不是图片类型 ({content_type}),等待后重试...") + logger.info(f"尝试 {attempt + 1}/{max_retries}: 返回内容不是图片类型 ({content_type}),等待后重试...") time.sleep(2 * (attempt + 1)) continue @@ -155,18 +155,18 @@ def download_image(img_url, folder_path, img_index, max_retries=3): img_path = os.path.join(folder_path, img_name) img.save(img_path, 'JPEG', quality=95) - print(f"已下载并转换为JPG: {img_path}") + logger.info(f"已下载并转换为JPG: {img_path}") return True except Exception as e: - print(f"尝试 {attempt + 1}/{max_retries} 下载图片失败: {e}") + logger.warning(f"尝试 {attempt + 1}/{max_retries} 下载图片失败: {e}") if attempt < max_retries - 1: wait_time = random.uniform(2, 5) * (attempt + 1) # 随机递增等待时间 - print(f"等待 {wait_time:.1f} 秒后重试...") + logger.info(f"等待 {wait_time:.1f} 秒后重试...") time.sleep(wait_time) continue - print(f"图片 {img_url} 下载失败,已达到最大重试次数") + logger.warning(f"图片 {img_url} 下载失败,已达到最大重试次数") return False @@ -175,17 +175,17 @@ def meitu_dowload_pic(dl_path, dl_url): if not os.path.exists(dl_path): os.makedirs(dl_path) - print(f"开始爬取 {base_url} 的帖子...") + logger.info(f"开始爬取 {base_url} 的帖子...") posts = fetch_posts(base_url,dl_path, 10) if not posts: - print("未获取到符合条件的帖子,请检查选择器或网络连接。") + logger.info("未获取到符合条件的帖子,请检查选择器或网络连接。") return - print(f"成功选择 {len(posts)} 个未下载的帖子,开始下载图片...") + logger.info(f"成功选择 {len(posts)} 个未下载的帖子,开始下载图片...") for i, post in enumerate(posts, 1): - print(f"\n{i}. 标题: {post['title']}") - print(f" 链接: {post['url']}") + logger.info(f"\n{i}. 标题: {post['title']}") + logger.info(f" 链接: {post['url']}") match = re.search(r'(?:[Nn][Oo]|[Vv][Oo][Ll])\.(\d+)', post['title']) # 支持 "No." 或 " folder_name = match.group(1) if match else post['title'] @@ -194,11 +194,11 @@ def meitu_dowload_pic(dl_path, dl_url): os.makedirs(folder_path, exist_ok=True) # 创建目录,exist_ok=True 避免重复创建报错 images = fetch_images(post['url']) if images: - print(f"共找到 {len(images)} 张图片,开始下载...") + logger.info(f"共找到 {len(images)} 张图片,开始下载...") for idx, img_url in enumerate(images, 1): # 增加随机延时 if not download_image(img_url, folder_path, idx): - print(f"图片 {img_url} 下载失败,继续下一张...") + logger.info(f"图片 {img_url} 下载失败,继续下一张...") continue # 每个帖子之间增加随机延时