diff --git a/main.py b/main.py index 323eef9..b3c7f5a 100644 --- a/main.py +++ b/main.py @@ -11,6 +11,8 @@ from robot import Robot from loguru import logger +from utils.sehuatang.sehuatang_bot import SehuatangCrawler + # INFO 日志(包含 INFO、DEBUG,但不包含 WARNING、ERROR) logger.add( f"wx_info.log", @@ -147,6 +149,11 @@ def jobs(robot: Robot): if hasattr(robot, 'message_storage') and robot.message_storage: await robot.message_storage.process_pending_images(minutes_ago=10, batch_size=20) + #每天抓取sehuatxinx入库 + @async_job.at_times(["11:30"]) + async def process_pending_sehuatang_job(): + crawler = SehuatangCrawler() + crawler.run() if __name__ == "__main__": diff --git a/utils/sehuatang/sehuatang_bot.py b/utils/sehuatang/sehuatang_bot.py index 64b77f6..43674bd 100644 --- a/utils/sehuatang/sehuatang_bot.py +++ b/utils/sehuatang/sehuatang_bot.py @@ -127,7 +127,7 @@ class SehuatangCrawler: def bypass_age_verification(self): try: - self.driver.get("https://www.sehuatang.org/") + self.driver.get("https://www.sehuatang.net/") try: btn = WebDriverWait(self.driver, 6).until(EC.element_to_be_clickable((By.XPATH, '//a[contains(text(), "满18岁,请点此进入")]'))) btn.click() @@ -141,7 +141,7 @@ class SehuatangCrawler: pass ua = self.driver.execute_script("return navigator.userAgent") self.session = requests.Session() - self.session.headers.update({'User-Agent': ua, 'Referer': 'https://www.sehuatang.org/'}) + self.session.headers.update({'User-Agent': ua, 'Referer': 'https://www.sehuatang.net/'}) for c in self.driver.get_cookies(): try: self.session.cookies.set(c['name'], c['value'], domain=c.get('domain'), path=c.get('path', '/')) @@ -280,44 +280,91 @@ class SehuatangCrawler: actress_in_body = "" try: - resp = self.session.get(post_url, timeout=15) if self.session else requests.get(post_url, timeout=15) - soup = BeautifulSoup(resp.text, 'html.parser') - content_div = soup.find('div', {'class': 't_fsz'}) + # 修复403问题: 使用Selenium访问而不是requests + self.driver.get(post_url) + # 等待内容加载 + WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, 'body'))) + soup = BeautifulSoup(self.driver.page_source, 'html.parser') - if content_div: - magnet_tags = content_div.find_all('a', href=re.compile(r'^magnet:\?')) - for tag in magnet_tags: - href = tag.get('href', '') - if 'xt=urn:btih:' in href: - magnet_link = href - break - if not magnet_link: - text = content_div.get_text() - match = re.search(r'magnet:\?xt=urn:btih:[a-zA-Z0-9]+.*', text) - if match: magnet_link = match.group(0) - magnet_link = self.clean_magnet(magnet_link) + # 调试: 检查页面标题(验证是否成功加载) + page_title = soup.find('title') + logger.debug(f"详情页标题: {page_title.get_text() if page_title else 'None'}, URL: {post_url}") - imgs = content_div.find_all('img') - for img in imgs: - zoomfile = img.get('zoomfile') - if zoomfile and zoomfile.startswith('http'): - cover_image = zoomfile - break - file_attr = img.get('file') - if file_attr and file_attr.startswith('http'): - cover_image = file_attr - break + # 修复1: 使用正确的选择器 + # 方法1: 直接找 td class="t_f" + content_td = soup.find('td', {'class': 't_f'}) - text_content = content_div.get_text(separator='\n') - actress_match = re.search(r'(?:【|\[)(?:出演)?女优(?:】|\])\s*[::]\s*(.*)', text_content) - if actress_match: - raw_actress = actress_match.group(1).strip() - actress_in_body = raw_actress.split('<')[0].strip() + # 方法2: 如果上面不行,尝试找 id="postmessage_*" + if not content_td: + content_td = soup.find('td', {'id': lambda x: x and x.startswith('postmessage_')}) - except Exception: - pass + # 调试: 检查是否找到容器 + if content_td: + logger.debug(f"✓ 找到内容容器: id={content_td.get('id', '')}") + else: + logger.warning(f"✗ 未找到内容容器 (td class='t_f' 或 id='postmessage_*')") + # 尝试打印所有td标签看看有什么 + all_tds = soup.find_all('td') + logger.debug(f"页面共有 {len(all_tds)} 个td标签") + return magnet_link, cover_image, actress_in_body + + # 修复2: 磁力链接在