jiaru jav内容
This commit is contained in:
BIN
fonts/simhei.ttf
Normal file
BIN
fonts/simhei.ttf
Normal file
Binary file not shown.
BIN
fonts/simsun.ttf
Normal file
BIN
fonts/simsun.ttf
Normal file
Binary file not shown.
2
main.py
2
main.py
@@ -62,6 +62,8 @@ def main(chat_type: int):
|
||||
robot.onEveryTime("00:30", robot.messageCountToDB)
|
||||
# 从db中提取并发送给相关群
|
||||
robot.onEveryTime("09:30", robot.generateAndSendRanking)
|
||||
|
||||
#sehuatang
|
||||
|
||||
# 让机器人一直跑
|
||||
robot.keepRunningAndBlockProcess()
|
||||
|
||||
192
sehuatang/shehuatang.py
Normal file
192
sehuatang/shehuatang.py
Normal file
@@ -0,0 +1,192 @@
|
||||
import time
|
||||
import os
|
||||
import requests
|
||||
from io import BytesIO
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.chrome.service import Service
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.chrome.options import Options
|
||||
from webdriver_manager.chrome import ChromeDriverManager
|
||||
from bs4 import BeautifulSoup
|
||||
from reportlab.lib.pagesizes import letter
|
||||
from reportlab.lib import colors
|
||||
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image
|
||||
from reportlab.lib.styles import getSampleStyleSheet
|
||||
from reportlab.pdfbase.ttfonts import TTFont
|
||||
from reportlab.pdfbase import pdfmetrics
|
||||
from datetime import datetime
|
||||
from PIL import Image as PILImage
|
||||
from PyPDF2 import PdfReader, PdfWriter # 用于PDF加密
|
||||
|
||||
|
||||
def download_image(url, timeout=15):
    """Download an image and return its bytes as an in-memory stream.

    Only URLs whose text ends in ``.jpg``, ``.jpeg`` or ``.png``
    (case-insensitive) are fetched; anything else is rejected without a
    network request. No size filtering is performed here.

    Args:
        url: Absolute URL of the image.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests.get`` can block indefinitely on a stalled host,
            which would hang the whole scraping run.

    Returns:
        A ``BytesIO`` holding the response body, or ``None`` when the
        extension is unsupported or the request fails.
    """
    if not url.lower().endswith(('.jpg', '.jpeg', '.png')):
        return None

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Referer': 'https://tu.a7nz4.us',  # sent to avoid HTTP 403 from the image host
    }

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()  # turn 4xx/5xx into an exception
        return BytesIO(response.content)
    except requests.exceptions.RequestException as e:
        # Timeouts are RequestException subclasses, so they land here too.
        print(f"下载图片失败: {e}")
        return None
|
||||
|
||||
|
||||
def add_pdf_encryption(pdf_file, password="4000"):
    """Password-protect an existing PDF, overwriting it in place.

    Every page of ``pdf_file`` is copied into a new PyPDF2 writer, the
    writer is encrypted with ``password``, and the result is written back
    to the same path.

    Args:
        pdf_file: Path of the PDF to encrypt; the file is overwritten.
        password: Password applied to the output document.
    """
    source = PdfReader(pdf_file)
    encrypted = PdfWriter()

    # Copy the pages across unchanged before applying the password.
    for page in source.pages:
        encrypted.add_page(page)

    encrypted.encrypt(password)

    # Write the protected document over the original file.
    with open(pdf_file, "wb") as handle:
        encrypted.write(handle)

    print(f"PDF加密成功,密码为: {password}")
|
||||
|
||||
|
||||
def fetch_and_create_pdf(url):
    """Scrape the thread listing at ``url`` into an encrypted PDF.

    The listing is loaded in headless Chrome, the age-confirmation link is
    clicked when present, and every thread row containing a ``span.xi1``
    element is visited (the original author treats that marker as "posted
    today" -- presumably the forum highlights same-day dates this way;
    confirm against the site). For each visited thread the title, URL,
    body text and images found inside ``div.t_fsz`` are appended to the
    PDF, which is finally encrypted via ``add_pdf_encryption``.

    Args:
        url: URL of the forum listing page.

    Returns:
        The filename of the generated PDF, ``JAV-<YYYY-MM-DD>-<n>.pdf``,
        where ``n`` is the number of thread rows found on the listing
        (all rows, not only those that were visited).
    """
    # Function-scope imports keep the new stdlib dependencies local to this block.
    from urllib.parse import urljoin
    from xml.sax.saxutils import escape

    def as_markup(text):
        """Escape text for a ReportLab Paragraph, keeping line breaks."""
        # Paragraph parses its input as XML-like markup: raw '<' or '&'
        # from scraped text can break it, and plain newlines are collapsed.
        return escape(text).replace('\n', '<br/>')

    # Run Chrome without a visible window. The ``Options.headless`` setter
    # is deprecated and absent from newer Selenium 4 releases, so pass the
    # command-line switch explicitly.
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')

    # webdriver-manager downloads a matching ChromeDriver automatically.
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    try:
        driver.get(url)
        time.sleep(5)

        # Dismiss the age-confirmation page when it is shown (best effort).
        try:
            enter_button = driver.find_element(By.XPATH, '//a[contains(text(), "满18岁,请点此进入")]')
            enter_button.click()
            print("点击了满18岁按钮")
            time.sleep(5)  # give the real page time to load after the click
        except Exception as e:
            print("未找到满18岁按钮,跳过此步骤", e)

        # page_source is already a decoded str, so no from_encoding is needed
        # (BeautifulSoup ignores it with a warning for str input).
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Thread rows are <tbody id="normalthread_...">.
        posts = soup.find_all('tbody', {'id': lambda x: x and x.startswith('normalthread')})

        today = datetime.now().strftime('%Y-%m-%d')

        pdf_filename = f"JAV-{today}-{len(posts)}.pdf"
        doc = SimpleDocTemplate(pdf_filename, pagesize=letter)

        # Register a font with CJK glyphs; the path is relative to the
        # current working directory.
        pdfmetrics.registerFont(TTFont('SamHei', 'fonts/simhei.ttf'))
        styles = getSampleStyleSheet()

        title_style = styles['Heading1']
        title_style.fontName = 'SamHei'

        normal_style = styles['Normal']
        normal_style.fontName = 'SamHei'

        content = []

        # Keep only rows carrying the span.xi1 marker.
        today_posts = [
            post for post in posts
            if post.find('span', {'class': 'xi1'}) is not None
        ]

        for post in today_posts:
            title = post.find('a', {'class': 's xst'})
            if title is None:
                continue

            post_title = title.get_text()
            post_url = title.get('href')
            if not post_url:
                # A title link without href cannot be followed.
                continue

            # urljoin handles both relative and already-absolute hrefs.
            post_page_url = urljoin('https://www.sehuatang.net/', post_url)
            driver.get(post_page_url)
            time.sleep(3)

            post_soup = BeautifulSoup(driver.page_source, 'html.parser')

            # The post body (text and images) lives in <div class="t_fsz">.
            content_div = post_soup.find('div', {'class': 't_fsz'})
            if content_div is None:
                continue

            # separator='\n' puts each text fragment on its own line.
            post_text = content_div.get_text(separator='\n', strip=True)

            # Only absolute image URLs can be downloaded.
            image_links = [
                img.get('src')
                for img in content_div.find_all('img')
                if img.get('src') and 'http' in img.get('src')
            ]

            content.append(Paragraph(f"Title: {as_markup(post_title)}", title_style))
            content.append(Spacer(1, 12))
            content.append(Paragraph(f"Post URL: {as_markup(post_page_url)}", normal_style))
            content.append(Spacer(1, 12))
            content.append(Paragraph(f"Post Content: {as_markup(post_text)}", normal_style))
            content.append(Spacer(1, 12))

            for img_link in image_links:
                image = download_image(img_link)
                if image is None:
                    continue

                try:
                    img_width, img_height = PILImage.open(image).size
                except OSError as e:
                    # Corrupt or non-image payload: skip it rather than
                    # abort the whole document.
                    print(f"无法解析图片,已跳过: {img_link} ({e})")
                    continue

                # Scale to a fixed width, preserving the aspect ratio.
                image_width = 400
                image_height = int((img_height / img_width) * image_width)

                # Hand ReportLab a fresh stream positioned at the start.
                img_stream = BytesIO(image.getvalue())
                content.append(Image(img_stream, width=image_width, height=image_height))
                content.append(Spacer(1, 12))

            content.append(Spacer(1, 12))  # gap between posts

        doc.build(content)
    finally:
        # Always release the browser, even when scraping or building fails.
        driver.quit()

    print(f"PDF saved as {pdf_filename}")

    add_pdf_encryption(pdf_filename)
    return pdf_filename
|
||||
|
||||
def main():
    """Build the PDF for the hard-coded forum listing and return its filename."""
    listing_url = 'https://www.sehuatang.net/forum.php?mod=forumdisplay&fid=103&filter=typeid&typeid=481'
    pdf_path = fetch_and_create_pdf(listing_url)
    return pdf_path


# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user