加入分页,逆序排列帖子
This commit is contained in:
@@ -10,7 +10,7 @@ from webdriver_manager.chrome import ChromeDriverManager
|
|||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from reportlab.lib.pagesizes import letter
|
from reportlab.lib.pagesizes import letter
|
||||||
from reportlab.lib import colors
|
from reportlab.lib import colors
|
||||||
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image
|
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image, PageBreak
|
||||||
from reportlab.lib.styles import getSampleStyleSheet
|
from reportlab.lib.styles import getSampleStyleSheet
|
||||||
from reportlab.pdfbase.ttfonts import TTFont
|
from reportlab.pdfbase.ttfonts import TTFont
|
||||||
from reportlab.pdfbase import pdfmetrics
|
from reportlab.pdfbase import pdfmetrics
|
||||||
@@ -20,6 +20,7 @@ import re
|
|||||||
from PyPDF2 import PdfReader, PdfWriter
|
from PyPDF2 import PdfReader, PdfWriter
|
||||||
|
|
||||||
|
|
||||||
|
# download_image 函数保持不变
|
||||||
def download_image(url):
|
def download_image(url):
|
||||||
"""下载大于100KB的图片并返回临时文件路径,仅支持jpg、jpeg和png格式"""
|
"""下载大于100KB的图片并返回临时文件路径,仅支持jpg、jpeg和png格式"""
|
||||||
try:
|
try:
|
||||||
@@ -28,11 +29,11 @@ def download_image(url):
|
|||||||
|
|
||||||
headers = {
|
headers = {
|
||||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
|
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
|
||||||
'Referer': 'https://tu.a7nz4.us', # 防止403
|
'Referer': 'https://tu.a7nz4.us',
|
||||||
}
|
}
|
||||||
|
|
||||||
response = requests.get(url, headers=headers)
|
response = requests.get(url, headers=headers)
|
||||||
response.raise_for_status() # 确保请求成功
|
response.raise_for_status()
|
||||||
image = BytesIO(response.content)
|
image = BytesIO(response.content)
|
||||||
return image
|
return image
|
||||||
except requests.exceptions.RequestException as e:
|
except requests.exceptions.RequestException as e:
|
||||||
@@ -42,43 +43,39 @@ def download_image(url):
|
|||||||
|
|
||||||
def fetch_and_create_pdf(url):
|
def fetch_and_create_pdf(url):
|
||||||
"""根据给定URL抓取页面并生成PDF"""
|
"""根据给定URL抓取页面并生成PDF"""
|
||||||
# 配置Selenium以无头模式运行
|
# 配置Selenium
|
||||||
options = Options()
|
options = Options()
|
||||||
options.headless = True
|
options.headless = True
|
||||||
options.add_argument('--disable-gpu')
|
options.add_argument('--disable-gpu')
|
||||||
options.add_argument('--no-sandbox')
|
options.add_argument('--no-sandbox')
|
||||||
|
|
||||||
# 使用webdriver-manager自动下载ChromeDriver
|
|
||||||
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
|
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
|
||||||
|
|
||||||
# 获取目标页面
|
# 获取目标页面
|
||||||
driver.get(url)
|
driver.get(url)
|
||||||
time.sleep(5)
|
time.sleep(5)
|
||||||
|
|
||||||
# 处理“满18岁,请点此进入”按钮
|
# 处理年龄验证按钮
|
||||||
try:
|
try:
|
||||||
enter_button = driver.find_element(By.XPATH, '//a[contains(text(), "满18岁,请点此进入")]')
|
enter_button = driver.find_element(By.XPATH, '//a[contains(text(), "满18岁,请点此进入")]')
|
||||||
enter_button.click()
|
enter_button.click()
|
||||||
print("点击了满18岁按钮")
|
print("点击了满18岁按钮")
|
||||||
time.sleep(5) # 等待内容加载
|
time.sleep(5)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print("未找到满18岁按钮,跳过此步骤", e)
|
print("未找到满18岁按钮,跳过此步骤", e)
|
||||||
|
|
||||||
# 使用BeautifulSoup解析页面
|
# 解析页面
|
||||||
html = driver.page_source
|
html = driver.page_source
|
||||||
soup = BeautifulSoup(html, 'html.parser', from_encoding='utf-8')
|
soup = BeautifulSoup(html, 'html.parser', from_encoding='utf-8')
|
||||||
|
|
||||||
# 定位到帖子列表
|
|
||||||
posts = soup.find_all('tbody', {'id': lambda x: x and x.startswith('normalthread')})
|
posts = soup.find_all('tbody', {'id': lambda x: x and x.startswith('normalthread')})
|
||||||
|
|
||||||
# 获取今天的日期
|
# 获取今天的日期
|
||||||
today = datetime.now().strftime('%Y-%m-%d')
|
today = datetime.now().strftime('%Y-%m-%d')
|
||||||
|
|
||||||
# 注册中文字体
|
# 注册中文字体
|
||||||
pdfmetrics.registerFont(TTFont('SimHei', 'fonts/simhei.ttf')) # 请确保字体文件路径正确
|
pdfmetrics.registerFont(TTFont('SimHei', 'fonts/simhei.ttf'))
|
||||||
styles = getSampleStyleSheet()
|
styles = getSampleStyleSheet()
|
||||||
|
|
||||||
# 设置标题和正文样式
|
# 设置样式
|
||||||
title_style = styles['Heading1']
|
title_style = styles['Heading1']
|
||||||
title_style.fontName = 'SimHei'
|
title_style.fontName = 'SimHei'
|
||||||
title_style.fontSize = 14
|
title_style.fontSize = 14
|
||||||
@@ -90,65 +87,43 @@ def fetch_and_create_pdf(url):
|
|||||||
|
|
||||||
content = []
|
content = []
|
||||||
|
|
||||||
# 过滤出当天的帖子
|
# 过滤当天帖子并倒序
|
||||||
today_posts = []
|
today_posts = []
|
||||||
for post in posts:
|
for post in posts:
|
||||||
post_time_span = post.find('span', {'class': 'xi1'})
|
post_time_span = post.find('span', {'class': 'xi1'})
|
||||||
if post_time_span: # 判断是否为当天帖子
|
if post_time_span:
|
||||||
today_posts.append(post)
|
today_posts.append(post)
|
||||||
|
today_posts = today_posts[::-1] # 倒序处理
|
||||||
|
|
||||||
# 设置PDF
|
# 设置PDF
|
||||||
pdf_filename = f"JAV-{today}-{len(today_posts)}.pdf"
|
pdf_filename = f"JAV-{today}-{len(today_posts)}.pdf"
|
||||||
doc = SimpleDocTemplate(pdf_filename, pagesize=letter)
|
doc = SimpleDocTemplate(pdf_filename, pagesize=letter)
|
||||||
|
|
||||||
# 遍历当天的帖子并提取信息
|
# 遍历帖子
|
||||||
for post in today_posts:
|
for post in today_posts:
|
||||||
# 查找帖子标题
|
|
||||||
title = post.find('a', {'class': 's xst'})
|
title = post.find('a', {'class': 's xst'})
|
||||||
if title:
|
if title:
|
||||||
post_title = title.get_text()
|
post_title = title.get_text()
|
||||||
post_url = title.get('href')
|
post_url = title.get('href')
|
||||||
print(post_title)
|
print(post_title)
|
||||||
# 获取帖子页面
|
|
||||||
|
# 获取帖子内容
|
||||||
post_page_url = 'https://www.sehuatang.net/' + post_url
|
post_page_url = 'https://www.sehuatang.net/' + post_url
|
||||||
driver.get(post_page_url)
|
driver.get(post_page_url)
|
||||||
time.sleep(3)
|
time.sleep(3)
|
||||||
|
|
||||||
# 获取帖子页面内容
|
|
||||||
post_html = driver.page_source
|
post_html = driver.page_source
|
||||||
post_soup = BeautifulSoup(post_html, 'html.parser', from_encoding='utf-8')
|
post_soup = BeautifulSoup(post_html, 'html.parser', from_encoding='utf-8')
|
||||||
|
|
||||||
# 提取 <div class="t_fsz"> 下的文本和图片
|
|
||||||
content_div = post_soup.find('div', {'class': 't_fsz'})
|
content_div = post_soup.find('div', {'class': 't_fsz'})
|
||||||
|
|
||||||
if content_div:
|
if content_div:
|
||||||
# 提取文本
|
# 提取文本和磁力链接
|
||||||
post_text = content_div.get_text(strip=True)
|
post_text = content_div.get_text(strip=True)
|
||||||
|
|
||||||
# 查找磁力链接
|
|
||||||
magnet_links = re.findall(r'magnet:\?[^ \u4e00-\u9fff]+', post_text)
|
magnet_links = re.findall(r'magnet:\?[^ \u4e00-\u9fff]+', post_text)
|
||||||
|
|
||||||
# 添加标题和来源URL
|
# 添加标题
|
||||||
content.append(Paragraph(f" {post_title}", title_style))
|
content.append(Paragraph(f" {post_title}", title_style))
|
||||||
content.append(Spacer(1, 12))
|
content.append(Spacer(1, 12))
|
||||||
# content.append(Paragraph(f"来源URL:<br /> {post_page_url}<br />", normal_style))
|
|
||||||
# content.append(Spacer(1, 12))
|
|
||||||
|
|
||||||
# 解析并格式化 "介绍" 内容
|
|
||||||
# intro_content = []
|
|
||||||
# intro_lines = post_text.split("【") # 以【分割字段
|
|
||||||
# for line in intro_lines:
|
|
||||||
# if line.strip():
|
|
||||||
# line = "【" + line if not line.startswith("【") else line
|
|
||||||
# if "】:" in line:
|
|
||||||
# key, value = line.split("】:", 1)
|
|
||||||
# intro_content.append(Paragraph(f"{key}】:<br />{value}<br />", normal_style))
|
|
||||||
# intro_content.append(Spacer(1, 6)) # 字段间距
|
|
||||||
#
|
|
||||||
# # 添加格式化的介绍部分
|
|
||||||
# content.append(Paragraph("介绍:<br />", normal_style))
|
|
||||||
# content.extend(intro_content)
|
|
||||||
# content.append(Spacer(1, 12))
|
|
||||||
|
|
||||||
# 添加磁力链接
|
# 添加磁力链接
|
||||||
if magnet_links:
|
if magnet_links:
|
||||||
@@ -162,53 +137,45 @@ def fetch_and_create_pdf(url):
|
|||||||
for img in images:
|
for img in images:
|
||||||
if img.get('zoomfile') and 'http' in img.get('zoomfile'):
|
if img.get('zoomfile') and 'http' in img.get('zoomfile'):
|
||||||
image_links.append(img.get('zoomfile'))
|
image_links.append(img.get('zoomfile'))
|
||||||
print(image_links)
|
|
||||||
if image_links:
|
if image_links:
|
||||||
for img_link in image_links:
|
for img_link in image_links:
|
||||||
image = download_image(img_link)
|
image = download_image(img_link)
|
||||||
if image:
|
if image:
|
||||||
img = PILImage.open(image)
|
img = PILImage.open(image)
|
||||||
img_width, img_height = img.size
|
img_width, img_height = img.size
|
||||||
image_width = 500 # 固定宽度
|
image_width = 500
|
||||||
image_height = int((img_height / img_width) * image_width)
|
image_height = int((img_height / img_width) * image_width)
|
||||||
|
|
||||||
img_stream = BytesIO(image.getvalue())
|
img_stream = BytesIO(image.getvalue())
|
||||||
content.append(Image(img_stream, width=image_width, height=image_height))
|
content.append(Image(img_stream, width=image_width, height=image_height))
|
||||||
content.append(Spacer(1, 12))
|
content.append(Spacer(1, 12))
|
||||||
|
|
||||||
content.append(Spacer(1, 12)) # 帖子间距
|
# 在每个帖子后添加分页符(除了最后一页)
|
||||||
|
if post != today_posts[-1]:
|
||||||
|
content.append(PageBreak())
|
||||||
|
|
||||||
# 生成PDF
|
# 生成PDF
|
||||||
doc.build(content)
|
doc.build(content)
|
||||||
|
|
||||||
# 获取PDF绝对路径
|
|
||||||
absolute_pdf_path = os.path.abspath(pdf_filename)
|
absolute_pdf_path = os.path.abspath(pdf_filename)
|
||||||
print(f"PDF saved as {absolute_pdf_path}")
|
print(f"PDF saved as {absolute_pdf_path}")
|
||||||
|
|
||||||
# 加密PDF
|
# 加密PDF
|
||||||
add_pdf_encryption(absolute_pdf_path)
|
add_pdf_encryption(absolute_pdf_path)
|
||||||
|
|
||||||
# 关闭浏览器
|
|
||||||
driver.quit()
|
driver.quit()
|
||||||
|
|
||||||
return absolute_pdf_path
|
return absolute_pdf_path
|
||||||
|
|
||||||
|
|
||||||
|
# add_pdf_encryption 和 pdf_file_path 函数保持不变
|
||||||
def add_pdf_encryption(pdf_file, password="4000"):
    """Encrypt the PDF at ``pdf_file`` in place using PyPDF2.

    Every page of the existing document is copied into a new writer, the
    writer is encrypted with ``password``, and the result overwrites the
    original path.

    Args:
        pdf_file: Path of the PDF to protect; the file is replaced by its
            encrypted version.
        password: Password applied to the document. Defaults to ``"4000"``.

    Returns:
        None. A confirmation message is printed on success.
    """
    from io import BytesIO  # local import keeps this block self-contained

    pdf_reader = PdfReader(pdf_file)
    pdf_writer = PdfWriter()

    # Copy all pages into the writer (iterate directly, no index bookkeeping).
    for page in pdf_reader.pages:
        pdf_writer.add_page(page)

    # Apply the password.
    pdf_writer.encrypt(password)

    # Serialize to memory first: opening the target with "wb" truncates it
    # immediately, so if serialization failed mid-way the original PDF would
    # already be destroyed. Only touch the file once the full output exists.
    encrypted = BytesIO()
    pdf_writer.write(encrypted)

    with open(pdf_file, "wb") as output_pdf:
        output_pdf.write(encrypted.getvalue())

    # NOTE(review): this echoes the password in clear text to stdout; kept
    # because existing output may be relied upon — confirm it is intended.
    print(f"PDF加密成功,密码为: {password}")
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user