"""Download the image sets of the newest posts listed on xiurenwang.cc.

Integration, as wired up by the accompanying main.py / robot.py changes:
``main.py`` registers ``robot.onEveryTime("03:00", robot.xiu_ren_download_task)``
and ``Robot.xiu_ren_download_task`` simply calls :func:`xiuren_dowload_pic`,
logging any exception it raises.

The third-party packages (``requests``, ``bs4``) are imported inside the
functions that use them: ``robot.py`` imports this module at start-up, and a
missing scraper dependency should surface inside the scheduled task (where it
is logged) rather than prevent the whole robot from starting.
"""
import os
import random
import re
import time
from urllib.parse import urljoin

BASE_URL = 'https://www.xiurenwang.cc/'
LIST_URL = 'https://www.xiurenwang.cc/bang?f=7'
USER_AGENT = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
              '(KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36')

# (connect, read) timeouts in seconds.  Without a timeout a stalled
# connection would block the scheduled task forever.
REQUEST_TIMEOUT = (10, 60)

# NOTE(review): certificate verification is switched off, exactly as in the
# patch under review.  This exposes every request to man-in-the-middle
# tampering; set to True (or a CA bundle path) once the reason for disabling
# it has been confirmed and resolved.
VERIFY_TLS = False

_POST_NUMBER_RE = re.compile(r'No\.(\d+)')
_FIRST_INT_RE = re.compile(r'(\d+)')


def _make_soup(html):
    """Parse ``html`` with BeautifulSoup's built-in ``html.parser``."""
    from bs4 import BeautifulSoup

    return BeautifulSoup(html, 'html.parser')


def _fetch(session, url, referer):
    """GET ``url`` through ``session`` and return the successful response.

    Raises:
        requests.exceptions.RequestException: on connection errors, timeouts
            or a non-2xx status (via ``raise_for_status``).
    """
    headers = {'User-Agent': USER_AGENT, 'Referer': referer}
    response = session.get(url, headers=headers, verify=VERIFY_TLS,
                           timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response


def get_html(url, session):
    """Return the body of ``url`` as text, or ``None`` if the request fails.

    Args:
        url: Page to fetch.
        session: ``requests.Session`` (or compatible object) used for the GET.
    """
    import requests

    try:
        return _fetch(session, url, BASE_URL).text
    except requests.exceptions.RequestException as e:
        print(f"请求 {url} 失败: {e}")
        return None


def parse_initial_page(html):
    """Extract the posts linked from the listing page.

    A link counts as a post when its href ends in ``.html`` and its text
    contains ``No.<digits>``.

    Returns:
        A list of ``{'url': ..., 'number': ...}`` dicts in page order, one
        per distinct post number (the first link for a number wins).
    """
    soup = _make_soup(html)
    links = soup.find_all('a', href=lambda x: x and x.endswith('.html'))

    post_info = []
    seen_numbers = set()
    for link in links:
        match = _POST_NUMBER_RE.search(link.text.strip())
        if not match:
            continue
        number = match.group(1)
        if number in seen_numbers:
            continue
        seen_numbers.add(number)
        # urljoin copes with relative, root-relative and absolute hrefs;
        # plain concatenation produced "...cc//x.html" for "/x.html".
        post_info.append({'url': urljoin(BASE_URL, link['href']),
                          'number': number})
    return post_info


def extract_post_details(html):
    """Read the title, the visible image URLs and the image count of a post.

    Returns:
        ``(title, visible_image_urls, total_images)`` where ``title`` falls
        back to ``"未知标题"`` when the page has no ``<title>``,
        ``visible_image_urls`` holds the non-empty ``data-original`` values
        found under ``<div id="image">`` and ``total_images`` is the first
        integer in the first ``<i class="i1">`` inside ``<div class="sp">``
        (``None`` when it cannot be found).
    """
    soup = _make_soup(html)

    title = soup.title.text.strip() if soup.title else "未知标题"

    visible_image_urls = []
    image_div = soup.find('div', id='image')
    if image_div:
        for img in image_div.find_all('img', {'data-original': True}):
            src = img.get('data-original')
            if src:
                visible_image_urls.append(src)

    total_images = None
    sp_div = soup.find('div', class_='sp')
    if sp_div:
        first_i = sp_div.find('i', class_='i1')
        if first_i:
            match = _FIRST_INT_RE.search(first_i.text.strip())
            if match:
                total_images = int(match.group(1))

    return title, visible_image_urls, total_images


def generate_image_urls(visible_image_urls, total_images):
    """Build the URL of every image of a post from the few visible ones.

    The URLs are assumed to be ``<directory>/<n>.jpg`` with consecutive
    ``n``; the sequence starts at the smallest number seen among the visible
    URLs and runs for ``total_images`` entries.

    Args:
        visible_image_urls: Image URLs scraped from the post page.
        total_images: Number of images the post claims to contain.

    Returns:
        A list of ``total_images`` absolute URLs, or ``[]`` when an input is
        missing or no visible URL has a numeric file name.
    """
    if not visible_image_urls or not total_images:
        print("未找到可见图片URL或总图片数")
        return []

    # Pair each numeric file name with its own directory so the start number
    # and the base path always come from the same URL.  Non-numeric names
    # (e.g. a cover image) are skipped instead of raising ValueError.
    numbered = []
    for url in visible_image_urls:
        directory, _, filename = url.rpartition('/')
        stem = filename.split('.')[0]
        if stem.isdecimal():
            numbered.append((int(stem), directory + '/'))
    if not numbered:
        print("可见图片URL中没有数字编号")
        return []

    min_number, base_url = min(numbered)

    # Only scheme-less bases (protocol-relative "//host/..." and the like)
    # need a scheme; an "http://" base must not become "https://http://...".
    if not base_url.startswith(('http://', 'https://')):
        base_url = 'https://' + base_url.lstrip('/')

    return [f"{base_url}{min_number + i}.jpg" for i in range(total_images)]


def download_image(image_url, filename, session, post_url):
    """Download one image to ``filename``.

    Args:
        image_url: URL of the image.
        filename: Destination path; overwritten if it exists.
        session: ``requests.Session`` used for the GET.
        post_url: URL of the post page, sent as the ``Referer`` header.

    Returns:
        ``True`` when the file was written, ``False`` when the request
        failed.  Errors while writing the file (``OSError``) propagate.
    """
    import requests

    try:
        response = _fetch(session, image_url, post_url)
    except requests.exceptions.RequestException as e:
        print(f"下载 {image_url} 失败: {e}")
        return False

    with open(filename, 'wb') as f:
        f.write(response.content)
    print(f"已下载 {image_url}")
    return True


def download_images(image_urls, output_dir, session, post_url,
                    delay_range=None):
    """Download ``image_urls`` into ``output_dir`` as ``1.jpg``, ``2.jpg``, ...

    The file number is the 1-based position of the URL in ``image_urls``, so
    skipped or failed entries leave gaps rather than shifting later files.

    Args:
        image_urls: URLs to download; falsy entries are reported and skipped.
        output_dir: Directory to write into; created if necessary.
        session: ``requests.Session`` used for the GETs.
        post_url: URL of the post page, sent as the ``Referer`` header.
        delay_range: Optional ``(low, high)`` seconds; when given, a random
            pause from that range follows every download attempt.  The
            default ``None`` means no throttling.

    Returns:
        The number of images actually saved.
    """
    if not image_urls:
        print("没有可下载的图片URL")
        return 0

    os.makedirs(output_dir, exist_ok=True)

    saved = 0
    for i, image_url in enumerate(image_urls):
        if not image_url:
            print(f"无效URL在索引 {i}")
            continue
        filename = os.path.join(output_dir, f"{i + 1}.jpg")
        if download_image(image_url, filename, session, post_url):
            saved += 1
        if delay_range:
            time.sleep(random.uniform(*delay_range))
    return saved


def _remove_if_empty(directory):
    """Delete ``directory`` if it exists and is empty; otherwise do nothing."""
    try:
        os.rmdir(directory)
    except OSError:
        # Missing or non-empty directory: nothing to clean up.
        pass


def xiuren_dowload_pic(target_count=2, output_root=''):
    """Download up to ``target_count`` posts that are not on disk yet.

    Posts are taken in listing order.  A post is considered already
    downloaded when a directory named after its number exists under
    ``output_root``; such posts are skipped and do not count towards
    ``target_count``.  A post for which not a single image could be saved
    has its (empty) directory removed again so that a later run retries it.

    Args:
        target_count: Number of new posts to download per call.
        output_root: Directory that receives one sub-directory per post.
            The default ``''`` means the current working directory.
    """
    import requests

    with requests.Session() as session:
        initial_html = get_html(LIST_URL, session)
        if not initial_html:
            print("无法获取初始页面")
            return

        post_info = parse_initial_page(initial_html)
        if not post_info:
            print("未找到有效帖子")
            return

        processed_count = 0
        for post in post_info:
            if processed_count >= target_count:
                break

            post_url = post['url']
            post_number = post['number']
            output_dir = os.path.join(output_root, post_number)

            if os.path.exists(output_dir):
                print(f"帖子 {post_number} 的文件夹已存在,跳过")
                continue

            post_html = get_html(post_url, session)
            if not post_html:
                print(f"无法获取帖子 {post_number} 的页面")
                continue

            title, visible_image_urls, total_images = extract_post_details(post_html)
            print(f"处理帖子 {post_number} - 标题: {title}, 总图片数: {total_images}")

            if not visible_image_urls or not total_images:
                print(f"帖子 {post_number} 缺少图片URL或总数,跳过")
                continue

            image_urls = generate_image_urls(visible_image_urls, total_images)
            if not image_urls:
                print(f"帖子 {post_number} 未生成图片URL,跳过")
                continue

            saved = download_images(image_urls, output_dir, session, post_url)
            if not saved:
                # The directory doubles as the "done" marker, so an empty one
                # would hide this post from every future run.
                _remove_if_empty(output_dir)
                print(f"帖子 {post_number} 没有成功下载任何图片,已移除空文件夹以便下次重试")
                continue

            print(f"完成处理帖子 {post_number}")
            processed_count += 1


# Correctly spelled alias; the original name stays because robot.py imports it.
xiuren_download_pic = xiuren_dowload_pic


if __name__ == '__main__':
    xiuren_dowload_pic()