feature:秀人网抓图功能开放
This commit is contained in:
3
main.py
3
main.py
@@ -47,6 +47,9 @@ def main(chat_type: int):
|
|||||||
|
|
||||||
# 游戏的定时任务每小时执行
|
# 游戏的定时任务每小时执行
|
||||||
robot.onEveryTime("18:00", robot.game_auto_tasks)
|
robot.onEveryTime("18:00", robot.game_auto_tasks)
|
||||||
|
|
||||||
|
# 秀人网每天自动下载帖子
|
||||||
|
robot.onEveryTime("03:00", robot.xiu_ren_download_task)
|
||||||
# 让机器人一直跑
|
# 让机器人一直跑
|
||||||
robot.keepRunningAndBlockProcess()
|
robot.keepRunningAndBlockProcess()
|
||||||
|
|
||||||
|
|||||||
7
robot.py
7
robot.py
@@ -40,6 +40,7 @@ from message_report.write_db import write_to_db, generate_and_send_ranking
|
|||||||
from message_storage.message_to_db import archive_message, get_messages
|
from message_storage.message_to_db import archive_message, get_messages
|
||||||
from message_summary.message_summary_4o import message_summary
|
from message_summary.message_summary_4o import message_summary
|
||||||
from sehuatang.shehuatang import pdf_file_path
|
from sehuatang.shehuatang import pdf_file_path
|
||||||
|
from xiuren.xiuren_dl import xiuren_dowload_pic
|
||||||
|
|
||||||
|
|
||||||
class Robot(Job):
|
class Robot(Job):
|
||||||
@@ -525,3 +526,9 @@ class Robot(Job):
|
|||||||
self.sendTextMsg(rep, gid)
|
self.sendTextMsg(rep, gid)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.LOG.error(f"message_summary_robot error:{e}")
|
self.LOG.error(f"message_summary_robot error:{e}")
|
||||||
|
|
||||||
|
def xiu_ren_download_task(self):
|
||||||
|
try:
|
||||||
|
xiuren_dowload_pic()
|
||||||
|
except Exception as e:
|
||||||
|
self.LOG.error(f"xiuren_dowload_pic error:{e}")
|
||||||
|
|||||||
@@ -3,143 +3,170 @@ from bs4 import BeautifulSoup
|
|||||||
import os
|
import os
|
||||||
import time
|
import time
|
||||||
import random
|
import random
|
||||||
|
import re
|
||||||
|
|
||||||
|
|
||||||
def get_html(url):
|
def get_html(url, session):
|
||||||
headers = {
|
headers = {
|
||||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
|
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
|
||||||
'AppleWebKit/537.36 (KHTML, like Gecko) '
|
'Referer': 'https://www.xiurenwang.cc/'
|
||||||
'Chrome/114.0.0.0 Safari/537.36'
|
|
||||||
}
|
}
|
||||||
try:
|
try:
|
||||||
response = requests.get(url, headers=headers)
|
response = session.get(url, headers=headers, verify=False)
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
return response.text
|
return response.text
|
||||||
except requests.exceptions.RequestException as e:
|
except requests.exceptions.RequestException as e:
|
||||||
print(f"Error fetching {url}: {e}")
|
print(f"请求 {url} 失败: {e}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def parse_initial_page(html):
|
def parse_initial_page(html):
|
||||||
soup = BeautifulSoup(html, 'html.parser')
|
soup = BeautifulSoup(html, 'html.parser')
|
||||||
posts = soup.find_all('a', href=lambda x: x and x.endswith('.html'))
|
posts = soup.find_all('a', href=lambda x: x and x.endswith('.html'))
|
||||||
first_two_posts = posts[:2]
|
# 取所有帖子,而不是仅前两个,以便后续查找未下载的帖子
|
||||||
post_info = []
|
post_info = []
|
||||||
|
|
||||||
print(posts)
|
for post in posts:
|
||||||
for post in first_two_posts:
|
|
||||||
text = post.text.strip()
|
text = post.text.strip()
|
||||||
print(f"Post text: '{text}'") # 调试输出,检查实际内容
|
number_match = re.search(r'No\.(\d+)', text)
|
||||||
if not text:
|
number = number_match.group(1) if number_match else None
|
||||||
print("Empty post text, skipping...")
|
if number:
|
||||||
continue
|
|
||||||
|
|
||||||
parts = text.split()
|
|
||||||
if len(parts) < 2:
|
|
||||||
print(f"Unexpected format in '{text}', skipping...")
|
|
||||||
continue
|
|
||||||
|
|
||||||
# 提取编号和图片总数
|
|
||||||
number = parts[0].replace('No.', '') if parts[0].startswith('No.') else None
|
|
||||||
pages = parts[-1].replace('P', '') if parts[-1].endswith('P') else None
|
|
||||||
|
|
||||||
if not number or not pages:
|
|
||||||
print(f"Failed to parse number or pages from '{text}', skipping...")
|
|
||||||
continue
|
|
||||||
|
|
||||||
try:
|
|
||||||
total_images = int(pages)
|
|
||||||
url = 'https://www.xiurenwang.cc/' + post['href']
|
url = 'https://www.xiurenwang.cc/' + post['href']
|
||||||
post_info.append({'url': url, 'number': number, 'total_images': total_images})
|
post_info.append({'url': url, 'number': number})
|
||||||
except ValueError:
|
|
||||||
print(f"Invalid total_images value in '{text}', skipping...")
|
|
||||||
continue
|
|
||||||
print(f"post_info:{post_info}")
|
|
||||||
return post_info
|
return post_info
|
||||||
|
|
||||||
|
|
||||||
def extract_title_and_first_image(html):
|
def extract_post_details(html):
|
||||||
soup = BeautifulSoup(html, 'html.parser')
|
soup = BeautifulSoup(html, 'html.parser')
|
||||||
title = soup.title.text.strip()
|
|
||||||
images = soup.find_all('img', src=lambda x: x and 'pic/' in x)
|
title = soup.title.text.strip() if soup.title else "未知标题"
|
||||||
if images:
|
|
||||||
first_image = images[0]
|
# 提取可见图片URL
|
||||||
first_image_src = first_image['src']
|
image_div = soup.find('div', id='image')
|
||||||
return title, first_image_src
|
visible_image_urls = []
|
||||||
else:
|
if image_div:
|
||||||
return None, None
|
images = image_div.find_all('img', {'data-original': True})
|
||||||
|
visible_image_urls = [img.get('data-original') for img in images]
|
||||||
|
|
||||||
|
# 提取总图片数量
|
||||||
|
total_images = None
|
||||||
|
sp_div = soup.find('div', class_='sp')
|
||||||
|
if sp_div:
|
||||||
|
i_tags = sp_div.find_all('i', class_='i1')
|
||||||
|
if i_tags:
|
||||||
|
total_text = i_tags[0].text.strip()
|
||||||
|
number_match = re.search(r'(\d+)', total_text)
|
||||||
|
total_images = int(number_match.group(1)) if number_match else None
|
||||||
|
|
||||||
|
return title, visible_image_urls, total_images
|
||||||
|
|
||||||
|
|
||||||
def parse_image_url(src):
|
def generate_image_urls(visible_image_urls, total_images):
|
||||||
image_filename = src.split('/')[-1]
|
if not visible_image_urls or not total_images:
|
||||||
starting_number = int(image_filename.split('.')[0])
|
print("未找到可见图片URL或总图片数")
|
||||||
return starting_number
|
return []
|
||||||
|
|
||||||
|
# 提取编号和基础路径
|
||||||
|
numbers = [int(url.split('/')[-1].split('.')[0]) for url in visible_image_urls]
|
||||||
|
min_number = min(numbers)
|
||||||
|
base_url = visible_image_urls[0].rsplit('/', 1)[0] + '/'
|
||||||
|
|
||||||
|
# 如果base_url已包含https://,不需要再次添加
|
||||||
|
if not base_url.startswith('https://'):
|
||||||
|
base_url = 'https://' + base_url.lstrip('/')
|
||||||
|
|
||||||
|
# 生成所有图片URL
|
||||||
|
image_urls = []
|
||||||
|
for i in range(total_images):
|
||||||
|
image_number = min_number + i
|
||||||
|
image_url = f"{base_url}{image_number}.jpg"
|
||||||
|
image_urls.append(image_url)
|
||||||
|
|
||||||
|
return image_urls
|
||||||
|
|
||||||
|
|
||||||
def download_image(image_url, filename):
|
def download_image(image_url, filename, session, post_url):
|
||||||
|
headers = {
|
||||||
|
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
|
||||||
|
'Referer': post_url
|
||||||
|
}
|
||||||
try:
|
try:
|
||||||
response = requests.get(image_url)
|
response = session.get(image_url, headers=headers, verify=False)
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
with open(filename, 'wb') as f:
|
with open(filename, 'wb') as f:
|
||||||
f.write(response.content)
|
f.write(response.content)
|
||||||
|
print(f"已下载 {image_url}")
|
||||||
except requests.exceptions.RequestException as e:
|
except requests.exceptions.RequestException as e:
|
||||||
print(f"Error downloading {image_url}: {e}")
|
print(f"下载 {image_url} 失败: {e}")
|
||||||
|
|
||||||
|
|
||||||
def download_images(image_urls, output_dir):
|
def download_images(image_urls, output_dir, session, post_url):
|
||||||
|
if not image_urls:
|
||||||
|
print("没有可下载的图片URL")
|
||||||
|
return
|
||||||
if not os.path.exists(output_dir):
|
if not os.path.exists(output_dir):
|
||||||
os.makedirs(output_dir)
|
os.makedirs(output_dir)
|
||||||
for i, image_url in enumerate(image_urls):
|
for i, image_url in enumerate(image_urls):
|
||||||
|
if not image_url:
|
||||||
|
print(f"无效URL在索引 {i}")
|
||||||
|
continue
|
||||||
filename = os.path.join(output_dir, f"{i + 1}.jpg")
|
filename = os.path.join(output_dir, f"{i + 1}.jpg")
|
||||||
download_image(image_url, filename)
|
download_image(image_url, filename, session, post_url)
|
||||||
time.sleep(random.uniform(1, 3))
|
# time.sleep(random.uniform(1, 3))
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def xiuren_dowload_pic():
|
||||||
|
session = requests.Session()
|
||||||
|
|
||||||
initial_url = 'https://www.xiurenwang.cc/bang?f=7'
|
initial_url = 'https://www.xiurenwang.cc/bang?f=7'
|
||||||
initial_html = get_html(initial_url)
|
initial_html = get_html(initial_url, session)
|
||||||
if not initial_html:
|
if not initial_html:
|
||||||
|
print("无法获取初始页面")
|
||||||
return
|
return
|
||||||
|
|
||||||
post_info = parse_initial_page(initial_html)
|
post_info = parse_initial_page(initial_html)
|
||||||
if not post_info:
|
if not post_info:
|
||||||
print("No valid posts found.")
|
print("未找到有效帖子")
|
||||||
return
|
return
|
||||||
|
|
||||||
|
processed_count = 0 # 记录已处理的帖子数量
|
||||||
|
target_count = 2 # 目标处理2个新帖子
|
||||||
|
|
||||||
for post in post_info:
|
for post in post_info:
|
||||||
|
if processed_count >= target_count:
|
||||||
|
break
|
||||||
|
|
||||||
post_url = post['url']
|
post_url = post['url']
|
||||||
post_number = post['number']
|
post_number = post['number']
|
||||||
total_images = post['total_images']
|
|
||||||
|
|
||||||
print(f"Processing post {post_number} with {total_images} images...")
|
|
||||||
post_html = get_html(post_url)
|
|
||||||
if not post_html:
|
|
||||||
continue
|
|
||||||
|
|
||||||
title, first_image_src = extract_title_and_first_image(post_html)
|
|
||||||
if not first_image_src:
|
|
||||||
print(f"No image found for post {post_number}")
|
|
||||||
continue
|
|
||||||
|
|
||||||
starting_number = parse_image_url(first_image_src)
|
|
||||||
|
|
||||||
# Construct full base URL
|
|
||||||
base_url = first_image_src.rsplit('/', 1)[0] + '/'
|
|
||||||
full_base_url = 'https:' + base_url
|
|
||||||
|
|
||||||
# Generate image URLs
|
|
||||||
image_urls = []
|
|
||||||
for i in range(total_images):
|
|
||||||
image_number = starting_number + i
|
|
||||||
image_url = full_base_url + str(image_number) + '.jpg'
|
|
||||||
image_urls.append(image_url)
|
|
||||||
|
|
||||||
# Create output directory
|
|
||||||
output_dir = post_number
|
output_dir = post_number
|
||||||
download_images(image_urls, output_dir)
|
|
||||||
|
|
||||||
print(f"Downloaded {total_images} images for post {post_number}")
|
# 检查本地文件夹是否已存在
|
||||||
|
if os.path.exists(output_dir):
|
||||||
|
print(f"帖子 {post_number} 的文件夹已存在,跳过")
|
||||||
|
continue
|
||||||
|
|
||||||
|
post_html = get_html(post_url, session)
|
||||||
|
if not post_html:
|
||||||
|
print(f"无法获取帖子 {post_number} 的页面")
|
||||||
|
continue
|
||||||
|
|
||||||
|
title, visible_image_urls, total_images = extract_post_details(post_html)
|
||||||
|
print(f"处理帖子 {post_number} - 标题: {title}, 总图片数: {total_images}")
|
||||||
|
|
||||||
|
if not visible_image_urls or not total_images:
|
||||||
|
print(f"帖子 {post_number} 缺少图片URL或总数,跳过")
|
||||||
|
continue
|
||||||
|
|
||||||
|
image_urls = generate_image_urls(visible_image_urls, total_images)
|
||||||
|
if not image_urls:
|
||||||
|
print(f"帖子 {post_number} 未生成图片URL,跳过")
|
||||||
|
continue
|
||||||
|
|
||||||
|
download_images(image_urls, output_dir, session, post_url)
|
||||||
|
print(f"完成处理帖子 {post_number}")
|
||||||
|
processed_count += 1
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
main()
|
xiuren_dowload_pic()
|
||||||
|
|||||||
Reference in New Issue
Block a user