def download_image(url, timeout=30):
    """Download a JPG/JPEG/PNG image into memory.

    Only URLs whose text ends in ``.jpg``, ``.jpeg`` or ``.png``
    (case-insensitive) are fetched; everything else is rejected without
    touching the network. No size filtering is done here.

    Args:
        url: Absolute URL of the image. ``None``, non-string and empty
            values are treated as "nothing to download".
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` can block forever on a stalled host.

    Returns:
        A ``BytesIO`` holding the raw response body, or ``None`` when the
        URL is not an accepted image URL or the request fails.
    """
    # Guard first: callers may pass a missing attribute value (None).
    if not isinstance(url, str):
        return None
    if not url.lower().endswith(('.jpg', '.jpeg', '.png')):
        return None

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Referer': 'https://tu.a7nz4.us',  # sent to avoid HTTP 403 responses
    }

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()  # turn 4xx/5xx into an exception
    except requests.exceptions.RequestException as e:
        # Best effort: a failed image must not abort the whole run.
        print(f"下载图片失败: {e}")
        return None

    return BytesIO(response.content)


def add_pdf_encryption(pdf_file, password="4000"):
    """Encrypt an existing PDF in place with PyPDF2.

    Every page of ``pdf_file`` is copied into a new writer, the writer is
    password-protected, and the result replaces the original file.

    Args:
        pdf_file: Path of the PDF to protect; it is overwritten.
        password: Password required to open the encrypted PDF.

    Raises:
        Whatever ``PdfReader``/``PdfWriter`` or the file system raise; in
        that case the original file is left untouched.
    """
    pdf_reader = PdfReader(pdf_file)
    pdf_writer = PdfWriter()

    # Copy all pages into the writer.
    for page in pdf_reader.pages:
        pdf_writer.add_page(page)

    pdf_writer.encrypt(password)

    # Write next to the target and swap it in afterwards, so a failed write
    # never leaves a truncated file where the unencrypted PDF used to be.
    tmp_path = f"{pdf_file}.tmp"
    try:
        with open(tmp_path, "wb") as output_pdf:
            pdf_writer.write(output_pdf)
        os.replace(tmp_path, pdf_file)
    finally:
        # Only still present when the write or the swap failed.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

    print(f"PDF加密成功,密码为: {password}")
+ for post in posts: + post_time_span = post.find('span', {'class': 'xi1'}) + if post_time_span: # 判断是否存在post_time_span,即认为是当天发布的帖子 + today_posts.append(post) + + # 遍历当天的帖子并提取信息 + for post in today_posts: + # 查找帖子标题 + title = post.find('a', {'class': 's xst'}) + if title: + post_title = title.get_text() + post_url = title.get('href') + + # 获取帖子的页面 + post_page_url = 'https://www.sehuatang.net/' + post_url + driver.get(post_page_url) + time.sleep(3) + + # 获取帖子页面内容 + post_html = driver.page_source + post_soup = BeautifulSoup(post_html, 'html.parser', from_encoding='utf-8') + + # 提取