From 2475fb8d7f6f7fbd24eeeb4333ff838f77cf3aca Mon Sep 17 00:00:00 2001 From: liuwei Date: Mon, 26 May 2025 10:33:19 +0800 Subject: [PATCH] =?UTF-8?q?=E8=A7=A3=E5=86=B3chrome=20quit=E7=9A=84?= =?UTF-8?q?=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- plugins/xiuren_image/meitu_dl.py | 99 +++++++++++++++++--------------- 1 file changed, 53 insertions(+), 46 deletions(-) diff --git a/plugins/xiuren_image/meitu_dl.py b/plugins/xiuren_image/meitu_dl.py index b7a46aa..7ed7a6b 100644 --- a/plugins/xiuren_image/meitu_dl.py +++ b/plugins/xiuren_image/meitu_dl.py @@ -20,7 +20,8 @@ headers = { seen_posts = set() download_root = "/mnt/nfs_share" # 全局定义下载根目录 -def fetch_posts(base_url,dl_path, posts_per_batch=10): + +def fetch_posts(base_url, dl_path, posts_per_batch=10): posts = [] page = 1 @@ -79,49 +80,56 @@ def get_total_pages(post_url): def fetch_images(post_url): - images = [] - total_pages = get_total_pages(post_url) - logger.info(f"帖子 {post_url} 共有 {total_pages} 页") + driver = None + try: + images = [] + total_pages = get_total_pages(post_url) + logger.info(f"帖子 {post_url} 共有 {total_pages} 页") - options = Options() - options.add_argument('--headless') # 使用新的headless模式 - options.add_argument('--disable-gpu') - options.add_argument('--disable-dev-shm-usage') # 添加Linux特定配置 + options = Options() + options.add_argument('--headless') # 使用新的headless模式 + options.add_argument('--disable-gpu') + options.add_argument('--disable-dev-shm-usage') # 添加Linux特定配置 - options.headless = True - # 根据操作系统选择不同的ChromeDriver路径处理方式 - if os.name == 'nt': # Windows - chrome_driver_path = os.path.join( - os.path.dirname(os.path.dirname(os.path.abspath(__file__))), - "utils", "chromedriver", "chromedriver.exe" - ) - else: # Linux - chrome_driver_path = '/usr/bin/chromedriver' # 使用系统PATH中的chromedriver - - # 如果本地没有chromedriver.exe,则使用默认方式 - if not os.path.exists(chrome_driver_path): - driver = webdriver.Chrome(options=options) - logger.debug("使用默认ChromeDriver") - else: - from selenium.webdriver.chrome.service import Service - driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options) - logger.debug(f"使用本地ChromeDriver: {chrome_driver_path}") + options.headless = True + # 根据操作系统选择不同的ChromeDriver路径处理方式 + if os.name == 'nt': # Windows + chrome_driver_path = os.path.join( + os.path.dirname(os.path.dirname(os.path.abspath(__file__))), + "utils", "chromedriver", "chromedriver.exe" + ) + else: # Linux + chrome_driver_path = '/usr/bin/chromedriver' # 使用系统PATH中的chromedriver - for page in range(1, total_pages + 1): - url = f"{post_url}/{page}" if page > 1 else post_url - driver.get(url) - time.sleep(2) + # 如果本地没有chromedriver.exe,则使用默认方式 + if not os.path.exists(chrome_driver_path): + driver = webdriver.Chrome(options=options) + logger.debug("使用默认ChromeDriver") + else: + from selenium.webdriver.chrome.service import Service + driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options) + logger.debug(f"使用本地ChromeDriver: {chrome_driver_path}") - img_elements = driver.find_elements(By.CSS_SELECTOR, 'figure.wp-block-gallery figure.wp-block-image img') - for img in img_elements: - img_url = img.get_attribute('src') - if img_url and img_url.startswith('http'): - images.append(img_url) + for page in range(1, total_pages + 1): + url = f"{post_url}/{page}" if page > 1 else post_url + driver.get(url) + time.sleep(2) - logger.info(f"已爬取 {url},找到 {len(img_elements)} 张图片") + img_elements = driver.find_elements(By.CSS_SELECTOR, 'figure.wp-block-gallery figure.wp-block-image img') + for img in img_elements: + img_url = img.get_attribute('src') + if img_url and img_url.startswith('http'): + images.append(img_url) - driver.quit() - return images + logger.info(f"已爬取 {url},找到 {len(img_elements)} 张图片") + + driver.quit() + return images + except Exception as e: + logger.info(f"爬取 {post_url} 失败: {e}") + return [] + finally: + driver.quit() def download_image(img_url, folder_path, img_index, max_retries=3): @@ -130,7 +138,7 @@ def download_image(img_url, folder_path, img_index, max_retries=3): # 构建特定的headers local_headers = headers.copy() local_headers['Referer'] = img_url # 使用图片URL作为referer - + # 添加一些额外的headers模拟真实浏览器 local_headers.update({ 'Accept': 'image/avif,image/webp,image/apng,image/*,*/*;q=0.8', @@ -139,10 +147,10 @@ def download_image(img_url, folder_path, img_index, max_retries=3): 'Cache-Control': 'no-cache', 'Pragma': 'no-cache' }) - + response = requests.get(img_url, headers=local_headers, timeout=10) response.raise_for_status() - + # 验证内容类型 content_type = response.headers.get('content-type', '') if not content_type.startswith('image/'): @@ -156,7 +164,7 @@ def download_image(img_url, folder_path, img_index, max_retries=3): img.save(img_path, 'JPEG', quality=95) logger.info(f"已下载并转换为JPG: {img_path}") return True - + except Exception as e: logger.warning(f"尝试 {attempt + 1}/{max_retries} 下载图片失败: {e}") if attempt < max_retries - 1: @@ -164,7 +172,7 @@ def download_image(img_url, folder_path, img_index, max_retries=3): logger.info(f"等待 {wait_time:.1f} 秒后重试...") time.sleep(wait_time) continue - + logger.warning(f"图片 {img_url} 下载失败,已达到最大重试次数") return False @@ -175,7 +183,7 @@ def meitu_dowload_pic(dl_path, dl_url): os.makedirs(dl_path) logger.info(f"开始爬取 {base_url} 的帖子...") - posts = fetch_posts(base_url,dl_path, 10) + posts = fetch_posts(base_url, dl_path, 10) if not posts: logger.info("未获取到符合条件的帖子,请检查选择器或网络连接。") @@ -199,7 +207,7 @@ def meitu_dowload_pic(dl_path, dl_url): if not download_image(img_url, folder_path, idx): logger.info(f"图片 {img_url} 下载失败,继续下一张...") continue - + # 每个帖子之间增加随机延时 time.sleep(random.uniform(3, 6)) @@ -210,6 +218,5 @@ def meitu_dowload_pub_pic(): meitu_dowload_pic(download_root, "https://www.mntuce.com/") - if __name__ == "__main__": meitu_dowload_pub_pic()