解决chrome quit的问题

2025-05-26 10:35:49 +08:00
parent 2475fb8d7f
commit ec92d4218c
1 changed files with 181 additions and 173 deletions
--- a/sehuatang/shehuatang.py
+++ b/sehuatang/shehuatang.py
@@ -44,188 +44,196 @@ def download_image(url):

 def fetch_and_create_pdf(url):
-    """根据给定URL抓取页面并生成PDF"""
-    # 配置Selenium
-    options = Options()
-    options.add_argument('--headless')  # 使用新的headless模式
-    options.add_argument('--disable-gpu')
-    options.add_argument('--no-sandbox')
-    options.add_argument('--disable-dev-shm-usage')  # 添加Linux特定配置
-    
-    # 根据操作系统选择不同的ChromeDriver路径处理方式
-    if os.name == 'nt':  # Windows
-        chrome_driver_path = os.path.join(
-            os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
-            "utils", "chromedriver", "chromedriver.exe"
-        )
-    else:  # Linux
-        chrome_driver_path = '/usr/bin/chromedriver'  # 使用系统PATH中的chromedriver
-    
+    """Scrape the thread list at ``url`` and build an encrypted PDF from it.
+
+    Args:
+        url: Address of the thread-list page to scrape.
+
+    Returns:
+        Absolute path of the generated PDF, or ``""`` when scraping or PDF
+        generation fails.
+    """
+    # Stays None until Chrome starts, so cleanup knows if there is a browser to quit.
+    driver = None
     try:
-        if os.name == 'nt' and not os.path.exists(chrome_driver_path):
+        # Configure Selenium.
+        options = Options()
+        options.add_argument('--headless')  # no visible browser window
+        options.add_argument('--disable-gpu')
+        options.add_argument('--no-sandbox')
+        options.add_argument('--disable-dev-shm-usage')  # Linux-specific setting
+
+        # Pick the ChromeDriver location for the current operating system.
+        if os.name == 'nt':  # Windows
+            chrome_driver_path = os.path.join(
+                os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
+                "utils", "chromedriver", "chromedriver.exe"
+            )
+        else:  # Linux
+            chrome_driver_path = '/usr/bin/chromedriver'  # system-installed chromedriver
+
+        # Start Chrome once here; creating a second driver would orphan the first browser.
+        try:
+            if os.name == 'nt' and not os.path.exists(chrome_driver_path):
+                chrome_driver_path = ChromeDriverManager().install()
+            driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options)
+        except Exception as e:
+            logger.info(f"初始化ChromeDriver失败: {e}")
             chrome_driver_path = ChromeDriverManager().install()
+            driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options)
         
-        driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options)
-    except Exception as e:
-        logger.info(f"初始化ChromeDriver失败: {e}")
-        chrome_driver_path = ChromeDriverManager().install()
-        driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options)
-    
-    # 如果本地没有chromedriver.exe，则使用webdriver_manager下载一次
-    if not os.path.exists(chrome_driver_path):
-        chrome_driver_path = ChromeDriverManager().install()
-        logger.info(f"ChromeDriver已下载到: {chrome_driver_path}")
-    else:
         logger.info(f"使用本地ChromeDriver: {chrome_driver_path}")
-        
-    driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options)

-    # 获取目标页面
-    driver.get(url)
-    time.sleep(5)
-
-    # 处理年龄验证按钮
-    try:
-        enter_button = driver.find_element(By.XPATH, '//a[contains(text(), "满18岁，请点此进入")]')
-        enter_button.click()
-        logger.info("点击了满18岁按钮")
+        # Load the target page.
+        driver.get(url)
         time.sleep(5)
-    except Exception as e:
-        logger.info("未找到满18岁按钮，跳过此步骤", e)

-    # 解析页面
-    html = driver.page_source
-    soup = BeautifulSoup(html, 'html.parser', from_encoding='utf-8')
-    posts = soup.find_all('tbody', {'id': lambda x: x and x.startswith('normalthread')})
-
-    # 获取今天的日期
-    today = datetime.now().strftime('%Y-%m-%d')
-
-    # 注册中文字体
-    pdfmetrics.registerFont(TTFont('SimHei', 'fonts/simhei.ttf'))
-    styles = getSampleStyleSheet()
-
-    # 设置样式
-    title_style = styles['Heading1']
-    title_style.fontName = 'SimHei'
-    title_style.fontSize = 14
-    title_style.textColor = colors.red
-    title_style.bold = True
-
-    normal_style = styles['Normal']
-    normal_style.fontName = 'SimHei'
-    normal_style.fontSize = 14
-
-    content = []
-
-    # 过滤当天帖子并倒序
-    today_posts = []
-    for post in posts:
-        post_time_span = post.find('span', {'class': 'xi1'})
-        if post_time_span:
-            today_posts.append(post)
-    today_posts = today_posts[::-1]  # 倒序处理
-
-    # 设置PDF
-    pdf_filename = f"JAV-{today}-{len(today_posts)}.pdf"
-    doc = SimpleDocTemplate(pdf_filename, pagesize=A3)
-    
-    # 计算内容区域的宽度和高度
-    page_width, page_height = A3
-    content_width = page_width - doc.rightMargin - doc.leftMargin
-    content_height = page_height - doc.topMargin - doc.bottomMargin
-    
-    # 设置最大图片尺寸，留出一些边距
-    max_image_width = content_width * 0.95
-    max_image_height = content_height * 0.7  # 留出足够空间给文本和其他元素
-
-    # 遍历帖子
-    for post in today_posts:
-        title = post.find('a', {'class': 's xst'})
-        if title:
-            post_title = title.get_text()
-            post_url = title.get('href')
-            logger.info(post_title)
-
-            # 获取帖子内容
-            post_page_url = 'https://www.sehuatang.net/' + post_url
-            driver.get(post_page_url)
-            time.sleep(3)
-
-            post_html = driver.page_source
-            post_soup = BeautifulSoup(post_html, 'html.parser', from_encoding='utf-8')
-            content_div = post_soup.find('div', {'class': 't_fsz'})
-
-            if content_div:
-                # 提取文本和磁力链接
-                post_text = content_div.get_text(strip=True)
-                magnet_links = re.findall(r'magnet:\?[^ \u4e00-\u9fff]+', post_text)
-
-                # 添加标题
-                content.append(Paragraph(f" {post_title}", title_style))
-                content.append(Spacer(1, 5))
-
-                # 添加磁力链接
-                if magnet_links:
-                    for magnet_link in magnet_links:
-                        content.append(Paragraph(f"<br /><b>{magnet_link}</b><br />", normal_style))
-                        content.append(Spacer(1, 12))
-
-                # 添加图片
-                image_links = []
-                images = content_div.find_all('img')
-                for img in images:
-                    if img.get('zoomfile') and 'http' in img.get('zoomfile'):
-                        image_links.append(img.get('zoomfile'))
-
-                if image_links:
-                    for img_link in image_links:
-                        image = download_image(img_link)
-                        if image:
-                            try:
-                                # 使用PIL处理图片尺寸
-                                with PILImage.open(image) as img:
-                                    img_width, img_height = img.size
-                                    # 计算缩放比例，确保图片适应页面
-                                    scale_width = max_image_width / img_width
-                                    scale_height = max_image_height / img_height
-                                    scale = min(scale_width, scale_height, 1.0)  # 不超过原始大小
-                                    
-                                    # 计算新的尺寸
-                                    new_width = img_width * scale
-                                    new_height = img_height * scale
-                                    
-                                    # 重置文件指针
-                                    image.seek(0)
-                                    img_stream = BytesIO(image.getvalue())
-                                    
-                                    # 添加图片到内容中，使用计算后的尺寸
-                                    content.append(Image(img_stream, width=new_width, height=new_height))
-                                    content.append(Spacer(1, 4))
-                                    logger.info(f"处理图片: 原始尺寸 {img_width}x{img_height}, 新尺寸 {new_width}x{new_height}")
-                            except Exception as e:
-                                logger.info(f"处理图片时出错: {e}")
-
-                # 在每个帖子后添加分页符（除了最后一页）
-                if post != today_posts[-1]:
-                    content.append(PageBreak())
-
-    # 生成PDF
-    try:
-        doc.build(content)
-        absolute_pdf_path = os.path.abspath(pdf_filename)
-        logger.info(f"PDF saved as {absolute_pdf_path}")
-
-        # 加密PDF
-        add_pdf_encryption(absolute_pdf_path)
-        driver.quit()
-
-        return absolute_pdf_path
+        # Click through the age-verification link when it is shown.
+        try:
+            enter_button = driver.find_element(By.XPATH, '//a[contains(text(), "满18岁，请点此进入")]')
+            enter_button.click()
+            logger.info("点击了满18岁按钮")
+            time.sleep(5)
+        except Exception as e:
+            # Put the error in the message itself; a bare extra argument has no placeholder.
+            logger.info(f"未找到满18岁按钮，跳过此步骤: {e}")
+
+        # Parse the thread list.
+        html = driver.page_source
+        soup = BeautifulSoup(html, 'html.parser', from_encoding='utf-8')
+        posts = soup.find_all('tbody', {'id': lambda x: x and x.startswith('normalthread')})
+
+        # Today's date, used in the output file name.
+        today = datetime.now().strftime('%Y-%m-%d')
+
+        # Register the SimHei font used for Chinese text.
+        pdfmetrics.registerFont(TTFont('SimHei', 'fonts/simhei.ttf'))
+        styles = getSampleStyleSheet()
+
+        # Configure the paragraph styles.
+        title_style = styles['Heading1']
+        title_style.fontName = 'SimHei'
+        title_style.fontSize = 14
+        title_style.textColor = colors.red
+        title_style.bold = True
+
+        normal_style = styles['Normal']
+        normal_style.fontName = 'SimHei'
+        normal_style.fontSize = 14
+
+        content = []
+
+        # Keep the posts that contain a span.xi1 element, then reverse their order.
+        # NOTE(review): presumably span.xi1 marks posts dated today - confirm.
+        today_posts = []
+        for post in posts:
+            post_time_span = post.find('span', {'class': 'xi1'})
+            if post_time_span:
+                today_posts.append(post)
+        today_posts = today_posts[::-1]  # process in reverse order
+
+        # Set up the PDF document.
+        pdf_filename = f"JAV-{today}-{len(today_posts)}.pdf"
+        doc = SimpleDocTemplate(pdf_filename, pagesize=A3)
+
+        # Width and height of the area inside the page margins.
+        page_width, page_height = A3
+        content_width = page_width - doc.rightMargin - doc.leftMargin
+        content_height = page_height - doc.topMargin - doc.bottomMargin
+
+        # Largest allowed image size, leaving some margin.
+        max_image_width = content_width * 0.95
+        max_image_height = content_height * 0.7  # leave room for text and other elements
+
+        # Walk through the posts.
+        for post in today_posts:
+            title = post.find('a', {'class': 's xst'})
+            if title:
+                post_title = title.get_text()
+                post_url = title.get('href')
+                logger.info(post_title)
+
+                # Fetch the post page.
+                post_page_url = 'https://www.sehuatang.net/' + post_url
+                driver.get(post_page_url)
+                time.sleep(3)
+
+                post_html = driver.page_source
+                post_soup = BeautifulSoup(post_html, 'html.parser', from_encoding='utf-8')
+                content_div = post_soup.find('div', {'class': 't_fsz'})
+
+                if content_div:
+                    # Extract the text and the magnet links.
+                    post_text = content_div.get_text(strip=True)
+                    magnet_links = re.findall(r'magnet:\?[^ \u4e00-\u9fff]+', post_text)
+
+                    # Add the title.
+                    content.append(Paragraph(f" {post_title}", title_style))
+                    content.append(Spacer(1, 5))
+
+                    # Add the magnet links.
+                    if magnet_links:
+                        for magnet_link in magnet_links:
+                            content.append(Paragraph(f"<br /><b>{magnet_link}</b><br />", normal_style))
+                            content.append(Spacer(1, 12))
+
+                    # Add the images.
+                    image_links = []
+                    images = content_div.find_all('img')
+                    for img in images:
+                        if img.get('zoomfile') and 'http' in img.get('zoomfile'):
+                            image_links.append(img.get('zoomfile'))
+
+                    if image_links:
+                        for img_link in image_links:
+                            image = download_image(img_link)
+                            if image:
+                                try:
+                                    # Read the image dimensions with PIL.
+                                    with PILImage.open(image) as img:
+                                        img_width, img_height = img.size
+                                        # Scale factor that makes the image fit the page.
+                                        scale_width = max_image_width / img_width
+                                        scale_height = max_image_height / img_height
+                                        scale = min(scale_width, scale_height, 1.0)  # never upscale
+
+                                        # Compute the scaled dimensions.
+                                        new_width = img_width * scale
+                                        new_height = img_height * scale
+
+                                        # Reset the file pointer.
+                                        image.seek(0)
+                                        img_stream = BytesIO(image.getvalue())
+
+                                        # Add the image to the content at the computed size.
+                                        content.append(Image(img_stream, width=new_width, height=new_height))
+                                        content.append(Spacer(1, 4))
+                                        logger.info(f"处理图片: 原始尺寸 {img_width}x{img_height}, 新尺寸 {new_width}x{new_height}")
+                                except Exception as e:
+                                    logger.info(f"处理图片时出错: {e}")
+
+                    # Add a page break after every post except the last one.
+                    if post != today_posts[-1]:
+                        content.append(PageBreak())
+
+        # Build the PDF.
+        try:
+            doc.build(content)
+            absolute_pdf_path = os.path.abspath(pdf_filename)
+            logger.info(f"PDF saved as {absolute_pdf_path}")
+
+            # Encrypt the PDF.
+            add_pdf_encryption(absolute_pdf_path)
+            return absolute_pdf_path
+        except Exception as e:
+            logger.info(f"生成PDF时出错: {e}")
+            return ""
     except Exception as e:
-        logger.info(f"生成PDF时出错: {e}")
-        driver.quit()
-        # 如果生成失败，返回一个默认路径或空字符串
+        logger.info(f"抓取帖子时出错: {e}")
         return ""
+    finally:
+        if driver is not None:
+            driver.quit()


 # add_pdf_encryption 和 pdf_file_path 函数保持不变
 def add_pdf_encryption(pdf_file, password="4000"):