加入 JAV 内容

2025-02-08 14:33:51 +08:00
parent 9ceaea00dd
commit 30c6ac6466
4 changed files with 194 additions and 0 deletions
--- a/fonts/simhei.ttf
+++ b/fonts/simhei.ttf
--- a/fonts/simsun.ttf
+++ b/fonts/simsun.ttf
--- a/main.py
+++ b/main.py
@@ -62,6 +62,8 @@ def main(chat_type: int):
    robot.onEveryTime("00:30", robot.messageCountToDB)
    # 从db中提取并发送给相关群
    robot.onEveryTime("09:30", robot.generateAndSendRanking)
+    
+    #sehuatang

    # 让机器人一直跑
    robot.keepRunningAndBlockProcess()
--- a/sehuatang/shehuatang.py
+++ b/sehuatang/shehuatang.py
@@ -0,0 +1,192 @@
+import time
+import os
+import requests
+from io import BytesIO
+from selenium import webdriver
+from selenium.webdriver.chrome.service import Service
+from selenium.webdriver.common.by import By
+from selenium.webdriver.chrome.options import Options
+from webdriver_manager.chrome import ChromeDriverManager
+from bs4 import BeautifulSoup
+from reportlab.lib.pagesizes import letter
+from reportlab.lib import colors
+from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image
+from reportlab.lib.styles import getSampleStyleSheet
+from reportlab.pdfbase.ttfonts import TTFont
+from reportlab.pdfbase import pdfmetrics
+from datetime import datetime
+from PIL import Image as PILImage
+from PyPDF2 import PdfReader, PdfWriter  # 用于PDF加密
+
+
+def download_image(url):
+    """ 下载大于100KB的图片并返回临时文件路径，仅支持jpg、jpeg和png格式 """
+    try:
+        if not url.lower().endswith(('.jpg', '.jpeg', '.png')):
+            return None
+
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+            'Referer': 'https://tu.a7nz4.us',  # 防止403
+        }
+
+        response = requests.get(url, headers=headers)
+        response.raise_for_status()  # 确保请求成功
+        image = BytesIO(response.content)
+        return image
+    except requests.exceptions.RequestException as e:
+        print(f"下载图片失败: {e}")
+        return None
+
+
+def add_pdf_encryption(pdf_file, password="4000"):
+    """ 使用PyPDF2为PDF添加加密保护 """
+    pdf_writer = PdfWriter()
+    pdf_reader = PdfReader(pdf_file)
+
+    # 将所有页面添加到PDF写入器中
+    for page_num in range(len(pdf_reader.pages)):
+        pdf_writer.add_page(pdf_reader.pages[page_num])
+
+    # 添加密码
+    pdf_writer.encrypt(password)
+
+    # 保存加密后的PDF
+    with open(pdf_file, "wb") as output_pdf:
+        pdf_writer.write(output_pdf)
+
+    print(f"PDF加密成功，密码为: {password}")
+
+
+def fetch_and_create_pdf(url):
+    """根据给定URL抓取页面并生成PDF"""
+    # 配置Selenium以无头模式（即不显示浏览器窗口）运行
+    options = Options()
+    options.headless = True
+    options.add_argument('--disable-gpu')
+    options.add_argument('--no-sandbox')
+
+    # 使用webdriver-manager自动下载ChromeDriver
+    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
+
+    # 获取目标页面
+    driver.get(url)
+    time.sleep(5)
+
+    # 处理“满18岁，请点此进入”按钮
+    try:
+        enter_button = driver.find_element(By.XPATH, '//a[contains(text(), "满18岁，请点此进入")]')
+        enter_button.click()
+        print("点击了满18岁按钮")
+        time.sleep(5)  # 等待 5 秒，确保点击后内容加载完成
+    except Exception as e:
+        print("未找到满18岁按钮，跳过此步骤", e)
+
+    # 使用BeautifulSoup解析页面
+    html = driver.page_source
+    soup = BeautifulSoup(html, 'html.parser', from_encoding='utf-8')
+
+    # 定位到帖子列表
+    posts = soup.find_all('tbody', {'id': lambda x: x and x.startswith('normalthread')})
+
+    # 获取今天的日期
+    today = datetime.now().strftime('%Y-%m-%d')
+
+    # 设置PDF
+    pdf_filename = f"JAV-{today}-{len(posts)}.pdf"
+    doc = SimpleDocTemplate(pdf_filename, pagesize=letter)
+
+    # 注册中文字体
+    pdfmetrics.registerFont(TTFont('SamHei', 'fonts/simhei.ttf'))  # 设置中文字体路径
+    styles = getSampleStyleSheet()
+
+    # 设置标题和正文样式都使用SamHei字体
+    title_style = styles['Heading1']
+    title_style.fontName = 'SamHei'  # 设置标题使用SamHei字体
+
+    normal_style = styles['Normal']
+    normal_style.fontName = 'SamHei'  # 设置正文使用SamHei字体
+
+    content = []
+
+    # 过滤出当天的帖子
+    today_posts = []
+    for post in posts:
+        post_time_span = post.find('span', {'class': 'xi1'})
+        if post_time_span:  # 判断是否存在post_time_span，即认为是当天发布的帖子
+            today_posts.append(post)
+
+    # 遍历当天的帖子并提取信息
+    for post in today_posts:
+        # 查找帖子标题
+        title = post.find('a', {'class': 's xst'})
+        if title:
+            post_title = title.get_text()
+            post_url = title.get('href')
+
+            # 获取帖子的页面
+            post_page_url = 'https://www.sehuatang.net/' + post_url
+            driver.get(post_page_url)
+            time.sleep(3)
+
+            # 获取帖子页面内容
+            post_html = driver.page_source
+            post_soup = BeautifulSoup(post_html, 'html.parser', from_encoding='utf-8')
+
+            # 提取 <div class="t_fsz"> 下的文本和图片
+            content_div = post_soup.find('div', {'class': 't_fsz'})
+
+            if content_div:
+                # 提取文本
+                # 提取文本并将 <br> 标签替换为换行符
+                post_text = content_div.get_text(separator='\n', strip=True)  # 使用 separator='\n' 参数替换 <br> 标签
+                # 提取图片链接
+                image_links = []
+                images = content_div.find_all('img')
+                for img in images:
+                    if img.get('src') and 'http' in img.get('src'):
+                        image_links.append(img.get('src'))
+
+                # 添加标题到PDF
+                content.append(Paragraph(f"Title: {post_title}", title_style))
+                content.append(Spacer(1, 12))
+                content.append(Paragraph(f"Post URL: {post_page_url}", normal_style))
+                content.append(Spacer(1, 12))
+                content.append(Paragraph(f"Post Content: {post_text}", normal_style))
+                content.append(Spacer(1, 12))  # 添加空白区域
+
+                # 添加图片
+                if image_links:
+                    for img_link in image_links:
+                        image = download_image(img_link)
+                        if image:
+                            img = PILImage.open(image)
+                            img_width, img_height = img.size
+                            image_width = 400  # 图片宽度
+                            image_height = int((img_height / img_width) * image_width)
+
+                            # 将图片加载到内存流中，并添加到PDF
+                            img_stream = BytesIO(image.getvalue())
+                            content.append(Image(img_stream, width=image_width, height=image_height))
+                            content.append(Spacer(1, 12))  # 添加空白区域
+
+                content.append(Spacer(1, 12))  # 为每个帖子添加间距
+
+    # 生成PDF
+    doc.build(content)
+
+    # 关闭浏览器
+    driver.quit()
+
+    print(f"PDF saved as {pdf_filename}")
+
+    # 加密PDF
+    add_pdf_encryption(pdf_filename)
+    return pdf_filename
+
+def main():
+    url = 'https://www.sehuatang.net/forum.php?mod=forumdisplay&fid=103&filter=typeid&typeid=481'
+    return fetch_and_create_pdf(url)
+
+# Run the scraper only when this file is executed directly, not when imported.
+if __name__ == "__main__":
+    main()