解决chrome quit的问题
This commit is contained in:
@@ -44,188 +44,196 @@ def download_image(url):
|
||||
|
||||
def fetch_and_create_pdf(url):
    """Scrape the thread listing at *url* and render its posts into a PDF.

    The listing page is opened in headless Chrome, the age-confirmation link
    is clicked when present, and every listed thread is visited in turn.  For
    each thread the title, any magnet links and the images carrying a
    ``zoomfile`` attribute are appended to an A3 document, which is finally
    passed to ``add_pdf_encryption``.

    Args:
        url: Address of the listing page to scrape.

    Returns:
        The absolute path of the generated PDF, or ``""`` when scraping or
        PDF generation fails.
    """
    driver = None
    try:
        driver = _create_chrome_driver()

        # Load the listing page and give it time to render.
        driver.get(url)
        time.sleep(5)

        # The age-confirmation link is optional; its absence is not an error.
        try:
            enter_button = driver.find_element(By.XPATH, '//a[contains(text(), "满18岁,请点此进入")]')
            enter_button.click()
            logger.info("点击了满18岁按钮")
            time.sleep(5)
        except Exception as e:
            logger.info(f"未找到满18岁按钮,跳过此步骤: {e}")

        # page_source is already a str, so no from_encoding hint is needed.
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        posts = soup.find_all('tbody', {'id': lambda x: x and x.startswith('normalthread')})

        today = datetime.now().strftime('%Y-%m-%d')

        # Register a font that can render Chinese text.
        pdfmetrics.registerFont(TTFont('SimHei', 'fonts/simhei.ttf'))
        styles = getSampleStyleSheet()

        title_style = styles['Heading1']
        title_style.fontName = 'SimHei'
        title_style.fontSize = 14
        title_style.textColor = colors.red
        title_style.bold = True

        normal_style = styles['Normal']
        normal_style.fontName = 'SimHei'
        normal_style.fontSize = 14

        # Keep only threads that carry a <span class="xi1"> marker
        # (presumably the "posted today" highlight - verify against the
        # site markup), then process them in reverse listing order.
        today_posts = [post for post in posts if post.find('span', {'class': 'xi1'})]
        today_posts.reverse()

        pdf_filename = f"JAV-{today}-{len(today_posts)}.pdf"
        doc = SimpleDocTemplate(pdf_filename, pagesize=A3)

        # Usable area inside the page margins.
        page_width, page_height = A3
        content_width = page_width - doc.rightMargin - doc.leftMargin
        content_height = page_height - doc.topMargin - doc.bottomMargin

        # Leave some slack so an image plus its surrounding text fits a page.
        max_image_width = content_width * 0.95
        max_image_height = content_height * 0.7

        content = []
        for post in today_posts:
            post_flowables = _collect_post_flowables(
                driver, post, title_style, normal_style,
                max_image_width, max_image_height,
            )
            if not post_flowables:
                continue
            # Break pages only between posts that actually produced content,
            # so the document never ends with (or contains) an empty page.
            if content:
                content.append(PageBreak())
            content.extend(post_flowables)

        try:
            doc.build(content)
            absolute_pdf_path = os.path.abspath(pdf_filename)
            logger.info(f"PDF saved as {absolute_pdf_path}")

            add_pdf_encryption(absolute_pdf_path)
            return absolute_pdf_path
        except Exception as e:
            logger.info(f"生成PDF时出错: {e}")
            return ""
    except Exception as e:
        logger.info(f"抓取帖子时出错: {e}")
        return ""
    finally:
        # Single cleanup point: runs on every exit path, and tolerates the
        # case where the driver could not be created at all.
        if driver is not None:
            driver.quit()


def _create_chrome_driver():
    """Start one headless Chrome WebDriver, downloading ChromeDriver if needed."""
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')  # Linux-specific setting

    # Windows uses the chromedriver.exe bundled under utils/chromedriver;
    # other systems use the system-wide binary.
    if os.name == 'nt':
        chrome_driver_path = os.path.join(
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
            "utils", "chromedriver", "chromedriver.exe"
        )
    else:
        chrome_driver_path = '/usr/bin/chromedriver'

    # Fall back to webdriver_manager when no local binary exists.
    if not os.path.exists(chrome_driver_path):
        chrome_driver_path = ChromeDriverManager().install()
        logger.info(f"ChromeDriver已下载到: {chrome_driver_path}")
    else:
        logger.info(f"使用本地ChromeDriver: {chrome_driver_path}")

    # Exactly one browser is started; a second attempt is made only when the
    # first one raised, so no Chrome process is left behind without an owner.
    try:
        return webdriver.Chrome(service=Service(chrome_driver_path), options=options)
    except Exception as e:
        logger.info(f"初始化ChromeDriver失败: {e}")
        chrome_driver_path = ChromeDriverManager().install()
        return webdriver.Chrome(service=Service(chrome_driver_path), options=options)


def _collect_post_flowables(driver, post, title_style, normal_style,
                            max_image_width, max_image_height):
    """Visit one listed thread and return the PDF flowables built from it."""
    from xml.sax.saxutils import escape

    title = post.find('a', {'class': 's xst'})
    if not title:
        return []

    post_title = title.get_text()
    post_url = title.get('href')
    logger.info(post_title)
    if not post_url:
        # Without a link there is no thread page to open.
        return []

    driver.get('https://www.sehuatang.net/' + post_url)
    time.sleep(3)

    post_soup = BeautifulSoup(driver.page_source, 'html.parser')
    content_div = post_soup.find('div', {'class': 't_fsz'})
    if not content_div:
        return []

    post_text = content_div.get_text(strip=True)
    # A magnet link runs until the first space or CJK character.
    magnet_links = re.findall(r'magnet:\?[^ \u4e00-\u9fff]+', post_text)

    # Scraped text is escaped because Paragraph interprets XML-like markup.
    flowables = [
        Paragraph(f" {escape(post_title)}", title_style),
        Spacer(1, 5),
    ]

    for magnet_link in magnet_links:
        flowables.append(Paragraph(f"<br /><b>{escape(magnet_link)}</b><br />", normal_style))
        flowables.append(Spacer(1, 12))

    for img_tag in content_div.find_all('img'):
        img_link = img_tag.get('zoomfile')
        if not img_link or 'http' not in img_link:
            continue
        image = download_image(img_link)
        if not image:
            continue
        try:
            flowables.extend(_image_flowables(image, max_image_width, max_image_height))
        except Exception as e:
            # One broken image must not abort the whole post.
            logger.info(f"处理图片时出错: {e}")

    return flowables


def _image_flowables(image, max_width, max_height):
    """Return an Image scaled to fit within the given bounds, plus a spacer."""
    # Work on a private copy of the bytes so neither PIL nor ReportLab depends
    # on the position or open/closed state of the downloaded stream.
    data = image.getvalue()

    with PILImage.open(BytesIO(data)) as img:
        img_width, img_height = img.size

    # Shrink to fit the page, but never enlarge beyond the original size.
    scale = min(max_width / img_width, max_height / img_height, 1.0)
    new_width = img_width * scale
    new_height = img_height * scale

    logger.info(f"处理图片: 原始尺寸 {img_width}x{img_height}, 新尺寸 {new_width}x{new_height}")
    return [
        Image(BytesIO(data), width=new_width, height=new_height),
        Spacer(1, 4),
    ]
|
||||
|
||||
# add_pdf_encryption 和 pdf_file_path 函数保持不变
|
||||
def add_pdf_encryption(pdf_file, password="4000"):
|
||||
|
||||
Reference in New Issue
Block a user