解决chrome quit的问题
This commit is contained in:
@@ -20,7 +20,8 @@ headers = {
|
||||
seen_posts = set()
|
||||
download_root = "/mnt/nfs_share" # 全局定义下载根目录
|
||||
|
||||
def fetch_posts(base_url,dl_path, posts_per_batch=10):
|
||||
|
||||
def fetch_posts(base_url, dl_path, posts_per_batch=10):
|
||||
posts = []
|
||||
page = 1
|
||||
|
||||
@@ -79,49 +80,56 @@ def get_total_pages(post_url):
|
||||
|
||||
|
||||
def fetch_images(post_url):
    """Collect the image URLs from every page of a post using headless Chrome.

    Args:
        post_url: URL of the first page of the post. Page ``n`` (n > 1) is
            fetched from ``f"{post_url}/{n}"``.

    Returns:
        A list of image ``src`` URLs (only those starting with ``http``)
        found in the post's gallery on all pages, or an empty list if any
        step of the crawl raised an exception.
    """
    # Stays None until the browser is actually started, so the cleanup in
    # ``finally`` can tell whether there is anything to shut down.
    driver = None
    try:
        images = []
        total_pages = get_total_pages(post_url)
        logger.info(f"帖子 {post_url} 共有 {total_pages} 页")

        options = Options()
        # The command-line flag is sufficient to run headless; the deprecated
        # ``options.headless`` setter is intentionally not used.
        options.add_argument('--headless')
        options.add_argument('--disable-gpu')
        options.add_argument('--disable-dev-shm-usage')  # Linux-specific setting

        # Pick the ChromeDriver location depending on the operating system.
        if os.name == 'nt':  # Windows: bundled binary under utils/chromedriver
            chrome_driver_path = os.path.join(
                os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
                "utils", "chromedriver", "chromedriver.exe"
            )
        else:  # Linux: system-wide chromedriver
            chrome_driver_path = '/usr/bin/chromedriver'

        # Fall back to Selenium's default driver lookup when the expected
        # binary is not present on disk.
        if not os.path.exists(chrome_driver_path):
            driver = webdriver.Chrome(options=options)
            logger.debug("使用默认ChromeDriver")
        else:
            from selenium.webdriver.chrome.service import Service
            driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options)
            logger.debug(f"使用本地ChromeDriver: {chrome_driver_path}")

        for page in range(1, total_pages + 1):
            url = f"{post_url}/{page}" if page > 1 else post_url
            driver.get(url)
            time.sleep(2)  # fixed pause after navigation before querying the DOM

            img_elements = driver.find_elements(
                By.CSS_SELECTOR,
                'figure.wp-block-gallery figure.wp-block-image img',
            )
            for img in img_elements:
                img_url = img.get_attribute('src')
                if img_url and img_url.startswith('http'):
                    images.append(img_url)

            logger.info(f"已爬取 {url},找到 {len(img_elements)} 张图片")

        return images
    except Exception as e:
        logger.info(f"爬取 {post_url} 失败: {e}")
        return []
    finally:
        # Single shutdown point for both the success and the failure path.
        # Guard against ``driver`` never having been created (e.g. when
        # get_total_pages() or webdriver.Chrome() raised); an unguarded
        # quit() would raise AttributeError and override the return value.
        if driver is not None:
            driver.quit()
|
||||
|
||||
|
||||
def download_image(img_url, folder_path, img_index, max_retries=3):
|
||||
@@ -130,7 +138,7 @@ def download_image(img_url, folder_path, img_index, max_retries=3):
|
||||
# 构建特定的headers
|
||||
local_headers = headers.copy()
|
||||
local_headers['Referer'] = img_url # 使用图片URL作为referer
|
||||
|
||||
|
||||
# 添加一些额外的headers模拟真实浏览器
|
||||
local_headers.update({
|
||||
'Accept': 'image/avif,image/webp,image/apng,image/*,*/*;q=0.8',
|
||||
@@ -139,10 +147,10 @@ def download_image(img_url, folder_path, img_index, max_retries=3):
|
||||
'Cache-Control': 'no-cache',
|
||||
'Pragma': 'no-cache'
|
||||
})
|
||||
|
||||
|
||||
response = requests.get(img_url, headers=local_headers, timeout=10)
|
||||
response.raise_for_status()
|
||||
|
||||
|
||||
# 验证内容类型
|
||||
content_type = response.headers.get('content-type', '')
|
||||
if not content_type.startswith('image/'):
|
||||
@@ -156,7 +164,7 @@ def download_image(img_url, folder_path, img_index, max_retries=3):
|
||||
img.save(img_path, 'JPEG', quality=95)
|
||||
logger.info(f"已下载并转换为JPG: {img_path}")
|
||||
return True
|
||||
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"尝试 {attempt + 1}/{max_retries} 下载图片失败: {e}")
|
||||
if attempt < max_retries - 1:
|
||||
@@ -164,7 +172,7 @@ def download_image(img_url, folder_path, img_index, max_retries=3):
|
||||
logger.info(f"等待 {wait_time:.1f} 秒后重试...")
|
||||
time.sleep(wait_time)
|
||||
continue
|
||||
|
||||
|
||||
logger.warning(f"图片 {img_url} 下载失败,已达到最大重试次数")
|
||||
return False
|
||||
|
||||
@@ -175,7 +183,7 @@ def meitu_dowload_pic(dl_path, dl_url):
|
||||
os.makedirs(dl_path)
|
||||
|
||||
logger.info(f"开始爬取 {base_url} 的帖子...")
|
||||
posts = fetch_posts(base_url,dl_path, 10)
|
||||
posts = fetch_posts(base_url, dl_path, 10)
|
||||
|
||||
if not posts:
|
||||
logger.info("未获取到符合条件的帖子,请检查选择器或网络连接。")
|
||||
@@ -199,7 +207,7 @@ def meitu_dowload_pic(dl_path, dl_url):
|
||||
if not download_image(img_url, folder_path, idx):
|
||||
logger.info(f"图片 {img_url} 下载失败,继续下一张...")
|
||||
continue
|
||||
|
||||
|
||||
# 每个帖子之间增加随机延时
|
||||
time.sleep(random.uniform(3, 6))
|
||||
|
||||
@@ -210,6 +218,5 @@ def meitu_dowload_pub_pic():
|
||||
meitu_dowload_pic(download_root, "https://www.mntuce.com/")
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
meitu_dowload_pub_pic()
|
||||
|
||||
Reference in New Issue
Block a user