From ec92d4218ca1f87508e4dbfab5894d5dd4668251 Mon Sep 17 00:00:00 2001 From: liuwei Date: Mon, 26 May 2025 10:35:49 +0800 Subject: [PATCH] =?UTF-8?q?=E8=A7=A3=E5=86=B3chrome=20quit=E7=9A=84?= =?UTF-8?q?=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- sehuatang/shehuatang.py | 354 ++++++++++++++++++++-------------------- 1 file changed, 181 insertions(+), 173 deletions(-) diff --git a/sehuatang/shehuatang.py b/sehuatang/shehuatang.py index 575d12e..f76c426 100644 --- a/sehuatang/shehuatang.py +++ b/sehuatang/shehuatang.py @@ -44,188 +44,196 @@ def download_image(url): def fetch_and_create_pdf(url): """根据给定URL抓取页面并生成PDF""" - # 配置Selenium - options = Options() - options.add_argument('--headless') # 使用新的headless模式 - options.add_argument('--disable-gpu') - options.add_argument('--no-sandbox') - options.add_argument('--disable-dev-shm-usage') # 添加Linux特定配置 - - # 根据操作系统选择不同的ChromeDriver路径处理方式 - if os.name == 'nt': # Windows - chrome_driver_path = os.path.join( - os.path.dirname(os.path.dirname(os.path.abspath(__file__))), - "utils", "chromedriver", "chromedriver.exe" - ) - else: # Linux - chrome_driver_path = '/usr/bin/chromedriver' # 使用系统PATH中的chromedriver - + driver =None try: - if os.name == 'nt' and not os.path.exists(chrome_driver_path): + # 配置Selenium + options = Options() + options.add_argument('--headless') # 使用新的headless模式 + options.add_argument('--disable-gpu') + options.add_argument('--no-sandbox') + options.add_argument('--disable-dev-shm-usage') # 添加Linux特定配置 + + # 根据操作系统选择不同的ChromeDriver路径处理方式 + if os.name == 'nt': # Windows + chrome_driver_path = os.path.join( + os.path.dirname(os.path.dirname(os.path.abspath(__file__))), + "utils", "chromedriver", "chromedriver.exe" + ) + else: # Linux + chrome_driver_path = '/usr/bin/chromedriver' # 使用系统PATH中的chromedriver + + try: + if os.name == 'nt' and not os.path.exists(chrome_driver_path): + chrome_driver_path = ChromeDriverManager().install() + + driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options) + except Exception as e: + logger.info(f"初始化ChromeDriver失败: {e}") chrome_driver_path = ChromeDriverManager().install() + driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options) + # 如果本地没有chromedriver.exe,则使用webdriver_manager下载一次 + if not os.path.exists(chrome_driver_path): + chrome_driver_path = ChromeDriverManager().install() + logger.info(f"ChromeDriver已下载到: {chrome_driver_path}") + else: + logger.info(f"使用本地ChromeDriver: {chrome_driver_path}") + driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options) - except Exception as e: - logger.info(f"初始化ChromeDriver失败: {e}") - chrome_driver_path = ChromeDriverManager().install() - driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options) - - # 如果本地没有chromedriver.exe,则使用webdriver_manager下载一次 - if not os.path.exists(chrome_driver_path): - chrome_driver_path = ChromeDriverManager().install() - logger.info(f"ChromeDriver已下载到: {chrome_driver_path}") - else: - logger.info(f"使用本地ChromeDriver: {chrome_driver_path}") - - driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options) - # 获取目标页面 - driver.get(url) - time.sleep(5) - - # 处理年龄验证按钮 - try: - enter_button = driver.find_element(By.XPATH, '//a[contains(text(), "满18岁,请点此进入")]') - enter_button.click() - logger.info("点击了满18岁按钮") + # 获取目标页面 + driver.get(url) time.sleep(5) + + # 处理年龄验证按钮 + try: + enter_button = driver.find_element(By.XPATH, '//a[contains(text(), "满18岁,请点此进入")]') + enter_button.click() + logger.info("点击了满18岁按钮") + time.sleep(5) + except Exception as e: + logger.info("未找到满18岁按钮,跳过此步骤", e) + + # 解析页面 + html = driver.page_source + soup = BeautifulSoup(html, 'html.parser', from_encoding='utf-8') + posts = soup.find_all('tbody', {'id': lambda x: x and x.startswith('normalthread')}) + + # 获取今天的日期 + today = datetime.now().strftime('%Y-%m-%d') + + # 注册中文字体 + pdfmetrics.registerFont(TTFont('SimHei', 'fonts/simhei.ttf')) + styles = getSampleStyleSheet() + + # 设置样式 + title_style = styles['Heading1'] + title_style.fontName = 'SimHei' + title_style.fontSize = 14 + title_style.textColor = colors.red + title_style.bold = True + + normal_style = styles['Normal'] + normal_style.fontName = 'SimHei' + normal_style.fontSize = 14 + + content = [] + + # 过滤当天帖子并倒序 + today_posts = [] + for post in posts: + post_time_span = post.find('span', {'class': 'xi1'}) + if post_time_span: + today_posts.append(post) + today_posts = today_posts[::-1] # 倒序处理 + + # 设置PDF + pdf_filename = f"JAV-{today}-{len(today_posts)}.pdf" + doc = SimpleDocTemplate(pdf_filename, pagesize=A3) + + # 计算内容区域的宽度和高度 + page_width, page_height = A3 + content_width = page_width - doc.rightMargin - doc.leftMargin + content_height = page_height - doc.topMargin - doc.bottomMargin + + # 设置最大图片尺寸,留出一些边距 + max_image_width = content_width * 0.95 + max_image_height = content_height * 0.7 # 留出足够空间给文本和其他元素 + + # 遍历帖子 + for post in today_posts: + title = post.find('a', {'class': 's xst'}) + if title: + post_title = title.get_text() + post_url = title.get('href') + logger.info(post_title) + + # 获取帖子内容 + post_page_url = 'https://www.sehuatang.net/' + post_url + driver.get(post_page_url) + time.sleep(3) + + post_html = driver.page_source + post_soup = BeautifulSoup(post_html, 'html.parser', from_encoding='utf-8') + content_div = post_soup.find('div', {'class': 't_fsz'}) + + if content_div: + # 提取文本和磁力链接 + post_text = content_div.get_text(strip=True) + magnet_links = re.findall(r'magnet:\?[^ \u4e00-\u9fff]+', post_text) + + # 添加标题 + content.append(Paragraph(f" {post_title}", title_style)) + content.append(Spacer(1, 5)) + + # 添加磁力链接 + if magnet_links: + for magnet_link in magnet_links: + content.append(Paragraph(f"
{magnet_link}
", normal_style)) + content.append(Spacer(1, 12)) + + # 添加图片 + image_links = [] + images = content_div.find_all('img') + for img in images: + if img.get('zoomfile') and 'http' in img.get('zoomfile'): + image_links.append(img.get('zoomfile')) + + if image_links: + for img_link in image_links: + image = download_image(img_link) + if image: + try: + # 使用PIL处理图片尺寸 + with PILImage.open(image) as img: + img_width, img_height = img.size + # 计算缩放比例,确保图片适应页面 + scale_width = max_image_width / img_width + scale_height = max_image_height / img_height + scale = min(scale_width, scale_height, 1.0) # 不超过原始大小 + + # 计算新的尺寸 + new_width = img_width * scale + new_height = img_height * scale + + # 重置文件指针 + image.seek(0) + img_stream = BytesIO(image.getvalue()) + + # 添加图片到内容中,使用计算后的尺寸 + content.append(Image(img_stream, width=new_width, height=new_height)) + content.append(Spacer(1, 4)) + logger.info(f"处理图片: 原始尺寸 {img_width}x{img_height}, 新尺寸 {new_width}x{new_height}") + except Exception as e: + logger.info(f"处理图片时出错: {e}") + + # 在每个帖子后添加分页符(除了最后一页) + if post != today_posts[-1]: + content.append(PageBreak()) + + # 生成PDF + try: + doc.build(content) + absolute_pdf_path = os.path.abspath(pdf_filename) + logger.info(f"PDF saved as {absolute_pdf_path}") + + # 加密PDF + add_pdf_encryption(absolute_pdf_path) + driver.quit() + + return absolute_pdf_path + except Exception as e: + logger.info(f"生成PDF时出错: {e}") + driver.quit() + # 如果生成失败,返回一个默认路径或空字符串 + return "" except Exception as e: - logger.info("未找到满18岁按钮,跳过此步骤", e) - - # 解析页面 - html = driver.page_source - soup = BeautifulSoup(html, 'html.parser', from_encoding='utf-8') - posts = soup.find_all('tbody', {'id': lambda x: x and x.startswith('normalthread')}) - - # 获取今天的日期 - today = datetime.now().strftime('%Y-%m-%d') - - # 注册中文字体 - pdfmetrics.registerFont(TTFont('SimHei', 'fonts/simhei.ttf')) - styles = getSampleStyleSheet() - - # 设置样式 - title_style = styles['Heading1'] - title_style.fontName = 'SimHei' - title_style.fontSize = 14 - title_style.textColor = colors.red - title_style.bold = True - - normal_style = styles['Normal'] - normal_style.fontName = 'SimHei' - normal_style.fontSize = 14 - - content = [] - - # 过滤当天帖子并倒序 - today_posts = [] - for post in posts: - post_time_span = post.find('span', {'class': 'xi1'}) - if post_time_span: - today_posts.append(post) - today_posts = today_posts[::-1] # 倒序处理 - - # 设置PDF - pdf_filename = f"JAV-{today}-{len(today_posts)}.pdf" - doc = SimpleDocTemplate(pdf_filename, pagesize=A3) - - # 计算内容区域的宽度和高度 - page_width, page_height = A3 - content_width = page_width - doc.rightMargin - doc.leftMargin - content_height = page_height - doc.topMargin - doc.bottomMargin - - # 设置最大图片尺寸,留出一些边距 - max_image_width = content_width * 0.95 - max_image_height = content_height * 0.7 # 留出足够空间给文本和其他元素 - - # 遍历帖子 - for post in today_posts: - title = post.find('a', {'class': 's xst'}) - if title: - post_title = title.get_text() - post_url = title.get('href') - logger.info(post_title) - - # 获取帖子内容 - post_page_url = 'https://www.sehuatang.net/' + post_url - driver.get(post_page_url) - time.sleep(3) - - post_html = driver.page_source - post_soup = BeautifulSoup(post_html, 'html.parser', from_encoding='utf-8') - content_div = post_soup.find('div', {'class': 't_fsz'}) - - if content_div: - # 提取文本和磁力链接 - post_text = content_div.get_text(strip=True) - magnet_links = re.findall(r'magnet:\?[^ \u4e00-\u9fff]+', post_text) - - # 添加标题 - content.append(Paragraph(f" {post_title}", title_style)) - content.append(Spacer(1, 5)) - - # 添加磁力链接 - if magnet_links: - for magnet_link in magnet_links: - content.append(Paragraph(f"
{magnet_link}
", normal_style)) - content.append(Spacer(1, 12)) - - # 添加图片 - image_links = [] - images = content_div.find_all('img') - for img in images: - if img.get('zoomfile') and 'http' in img.get('zoomfile'): - image_links.append(img.get('zoomfile')) - - if image_links: - for img_link in image_links: - image = download_image(img_link) - if image: - try: - # 使用PIL处理图片尺寸 - with PILImage.open(image) as img: - img_width, img_height = img.size - # 计算缩放比例,确保图片适应页面 - scale_width = max_image_width / img_width - scale_height = max_image_height / img_height - scale = min(scale_width, scale_height, 1.0) # 不超过原始大小 - - # 计算新的尺寸 - new_width = img_width * scale - new_height = img_height * scale - - # 重置文件指针 - image.seek(0) - img_stream = BytesIO(image.getvalue()) - - # 添加图片到内容中,使用计算后的尺寸 - content.append(Image(img_stream, width=new_width, height=new_height)) - content.append(Spacer(1, 4)) - logger.info(f"处理图片: 原始尺寸 {img_width}x{img_height}, 新尺寸 {new_width}x{new_height}") - except Exception as e: - logger.info(f"处理图片时出错: {e}") - - # 在每个帖子后添加分页符(除了最后一页) - if post != today_posts[-1]: - content.append(PageBreak()) - - # 生成PDF - try: - doc.build(content) - absolute_pdf_path = os.path.abspath(pdf_filename) - logger.info(f"PDF saved as {absolute_pdf_path}") - - # 加密PDF - add_pdf_encryption(absolute_pdf_path) + logger.info(f"抓取帖子时出错: {e}") + # 如果抓取失败,返回一个默认路径或空字符串 driver.quit() - - return absolute_pdf_path - except Exception as e: - logger.info(f"生成PDF时出错: {e}") - driver.quit() - # 如果生成失败,返回一个默认路径或空字符串 return "" - + finally: + driver.quit() # add_pdf_encryption 和 pdf_file_path 函数保持不变 def add_pdf_encryption(pdf_file, password="4000"):