diff --git a/sehuatang/shehuatang.py b/sehuatang/shehuatang.py
index 575d12e..f76c426 100644
--- a/sehuatang/shehuatang.py
+++ b/sehuatang/shehuatang.py
@@ -44,188 +44,196 @@ def download_image(url):
def _start_chrome_driver():
    """Start a single headless Chrome WebDriver instance.

    Uses a local ChromeDriver binary when one exists at the expected path;
    otherwise (or if starting with that binary fails) a driver is obtained
    through ``ChromeDriverManager().install()``.

    Returns:
        The started ``webdriver.Chrome`` instance. The caller owns it and is
        responsible for calling ``quit()``.

    Raises:
        Exception: whatever Selenium / webdriver_manager raise if the driver
            cannot be started even after the download fallback.
    """
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')  # Linux-specific setting

    if os.name == 'nt':  # Windows
        # Driver expected in utils/chromedriver/ under the parent of this
        # file's directory.
        chrome_driver_path = os.path.join(
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
            "utils", "chromedriver", "chromedriver.exe"
        )
    else:  # Linux
        chrome_driver_path = '/usr/bin/chromedriver'

    # Download a driver once if there is no local binary.
    if not os.path.exists(chrome_driver_path):
        chrome_driver_path = ChromeDriverManager().install()
        logger.info(f"ChromeDriver已下载到: {chrome_driver_path}")
    else:
        logger.info(f"使用本地ChromeDriver: {chrome_driver_path}")

    # Exactly one browser is started. The previous code started a second
    # Chrome after the first had already come up, leaking the first process.
    try:
        return webdriver.Chrome(service=Service(chrome_driver_path), options=options)
    except Exception as e:
        logger.info(f"初始化ChromeDriver失败: {e}")
        chrome_driver_path = ChromeDriverManager().install()
        return webdriver.Chrome(service=Service(chrome_driver_path), options=options)


def _fit_image_size(img_width, img_height, max_width, max_height):
    """Return (width, height) scaled to fit the given box, never enlarged."""
    scale = min(max_width / img_width, max_height / img_height, 1.0)
    return img_width * scale, img_height * scale


def _build_post_flowables(driver, post, title_style, normal_style,
                          max_image_width, max_image_height):
    """Open one thread and return the PDF flowables rendered for it.

    Args:
        driver: Started WebDriver used to load the thread page.
        post: BeautifulSoup ``tbody`` tag of the thread in the listing.
        title_style: Paragraph style for the thread title.
        normal_style: Paragraph style for the magnet links.
        max_image_width: Maximum rendered image width.
        max_image_height: Maximum rendered image height.

    Returns:
        A list of flowables (title, magnet links, images). The list is empty
        when the thread has no title link, no href, or no ``div.t_fsz``
        content block.
    """
    from urllib.parse import urljoin

    title = post.find('a', {'class': 's xst'})
    if title is None:
        return []

    post_title = title.get_text()
    post_url = title.get('href')
    logger.info(post_title)
    if not post_url:
        # Without an href there is nothing to load; skip this thread instead
        # of failing the whole run on str + None.
        return []

    # urljoin copes with relative, '/'-prefixed and absolute hrefs alike.
    post_page_url = urljoin('https://www.sehuatang.net/', post_url)
    driver.get(post_page_url)
    time.sleep(3)  # give the page time to render

    # page_source is already str, so no from_encoding is passed (bs4 ignores
    # it for str input and warns).
    post_soup = BeautifulSoup(driver.page_source, 'html.parser')
    content_div = post_soup.find('div', {'class': 't_fsz'})
    if content_div is None:
        return []

    # Extract the text and the magnet links contained in it.
    post_text = content_div.get_text(strip=True)
    magnet_links = re.findall(r'magnet:\?[^ \u4e00-\u9fff]+', post_text)

    flowables = [Paragraph(f" {post_title}", title_style), Spacer(1, 5)]

    # NOTE(review): the link text is surrounded by line breaks only; confirm
    # whether inline markup was intended around the link, and that one spacer
    # per link is the intended layout.
    for magnet_link in magnet_links:
        flowables.append(Paragraph(f"\n{magnet_link}\n", normal_style))
        flowables.append(Spacer(1, 12))

    # Only images that carry an http 'zoomfile' attribute are embedded.
    image_links = [
        img.get('zoomfile')
        for img in content_div.find_all('img')
        if img.get('zoomfile') and 'http' in img.get('zoomfile')
    ]
    for img_link in image_links:
        image = download_image(img_link)
        if not image:
            continue
        try:
            # PIL is used only to read the pixel size.
            with PILImage.open(image) as img:
                img_width, img_height = img.size
                new_width, new_height = _fit_image_size(
                    img_width, img_height, max_image_width, max_image_height
                )

                # Rewind, then hand the PDF its own copy of the bytes.
                image.seek(0)
                img_stream = BytesIO(image.getvalue())

            flowables.append(Image(img_stream, width=new_width, height=new_height))
            flowables.append(Spacer(1, 4))
            logger.info(f"处理图片: 原始尺寸 {img_width}x{img_height}, 新尺寸 {new_width}x{new_height}")
        except Exception as e:
            # A broken image must not abort the rest of the thread.
            logger.info(f"处理图片时出错: {e}")

    return flowables


def fetch_and_create_pdf(url):
    """Scrape the thread listing at ``url`` and render the threads to a PDF.

    Loads the listing in headless Chrome, clicks the age-confirmation link
    when present, visits every selected thread, and writes the titles,
    magnet links and images into an A3 PDF named
    ``JAV-<YYYY-MM-DD>-<thread count>.pdf`` in the current working directory.
    The finished PDF is passed to ``add_pdf_encryption``.

    Args:
        url: Address of the thread listing page.

    Returns:
        The absolute path of the generated PDF, or ``""`` if any step fails.
    """
    driver = None
    try:
        driver = _start_chrome_driver()

        # Load the listing page.
        driver.get(url)
        time.sleep(5)

        # Click through the age-confirmation link if it is shown.
        try:
            enter_button = driver.find_element(By.XPATH, '//a[contains(text(), "满18岁,请点此进入")]')
            enter_button.click()
            logger.info("点击了满18岁按钮")
            time.sleep(5)
        except Exception as e:
            # The %s placeholder is required: passing an argument without one
            # makes the logging module report a formatting error.
            logger.info("未找到满18岁按钮,跳过此步骤: %s", e)

        # Parse the listing; page_source is already str, so no from_encoding.
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        posts = soup.find_all('tbody', {'id': lambda x: x and x.startswith('normalthread')})

        today = datetime.now().strftime('%Y-%m-%d')

        # Register a font that can render Chinese text.
        pdfmetrics.registerFont(TTFont('SimHei', 'fonts/simhei.ttf'))
        styles = getSampleStyleSheet()

        title_style = styles['Heading1']
        title_style.fontName = 'SimHei'
        title_style.fontSize = 14
        title_style.textColor = colors.red
        title_style.bold = True

        normal_style = styles['Normal']
        normal_style.fontName = 'SimHei'
        normal_style.fontSize = 14

        # Keep threads that contain a span.xi1, processed in reverse order.
        # NOTE(review): presumably span.xi1 marks threads posted today; the
        # date itself is not compared here. Verify against the page markup.
        today_posts = [
            post for post in posts
            if post.find('span', {'class': 'xi1'}) is not None
        ]
        today_posts.reverse()

        pdf_filename = f"JAV-{today}-{len(today_posts)}.pdf"
        doc = SimpleDocTemplate(pdf_filename, pagesize=A3)

        # Usable area inside the page margins.
        page_width, page_height = A3
        content_width = page_width - doc.rightMargin - doc.leftMargin
        content_height = page_height - doc.topMargin - doc.bottomMargin

        # Leave room for the text and other elements around each image.
        max_image_width = content_width * 0.95
        max_image_height = content_height * 0.7

        content = []
        for post in today_posts:
            flowables = _build_post_flowables(
                driver, post, title_style, normal_style,
                max_image_width, max_image_height,
            )
            if not flowables:
                continue
            # Break pages only between threads that produced output, so the
            # document never starts or ends with an empty page.
            if content:
                content.append(PageBreak())
            content.extend(flowables)

        # Build and encrypt the PDF.
        try:
            doc.build(content)
            absolute_pdf_path = os.path.abspath(pdf_filename)
            logger.info(f"PDF saved as {absolute_pdf_path}")

            add_pdf_encryption(absolute_pdf_path)
            return absolute_pdf_path
        except Exception as e:
            logger.info(f"生成PDF时出错: {e}")
            return ""
    except Exception as e:
        logger.info(f"抓取帖子时出错: {e}")
        return ""
    finally:
        # Single clean-up point. driver is still None when start-up failed,
        # and quitting here means quit() runs once on every path.
        if driver is not None:
            driver.quit()
# add_pdf_encryption 和 pdf_file_path 函数保持不变
def add_pdf_encryption(pdf_file, password="4000"):