diff --git a/main.py b/main.py index 323eef9..b3c7f5a 100644 --- a/main.py +++ b/main.py @@ -11,6 +11,8 @@ from robot import Robot from loguru import logger +from utils.sehuatang.sehuatang_bot import SehuatangCrawler + # INFO 日志(包含 INFO、DEBUG,但不包含 WARNING、ERROR) logger.add( f"wx_info.log", @@ -147,6 +149,11 @@ def jobs(robot: Robot): if hasattr(robot, 'message_storage') and robot.message_storage: await robot.message_storage.process_pending_images(minutes_ago=10, batch_size=20) + #每天抓取sehuatxinx入库 + @async_job.at_times(["11:30"]) + async def process_pending_sehuatang_job(): + crawler = SehuatangCrawler() + crawler.run() if __name__ == "__main__": diff --git a/utils/sehuatang/sehuatang_bot.py b/utils/sehuatang/sehuatang_bot.py index 64b77f6..43674bd 100644 --- a/utils/sehuatang/sehuatang_bot.py +++ b/utils/sehuatang/sehuatang_bot.py @@ -127,7 +127,7 @@ class SehuatangCrawler: def bypass_age_verification(self): try: - self.driver.get("https://www.sehuatang.org/") + self.driver.get("https://www.sehuatang.net/") try: btn = WebDriverWait(self.driver, 6).until(EC.element_to_be_clickable((By.XPATH, '//a[contains(text(), "满18岁,请点此进入")]'))) btn.click() @@ -141,7 +141,7 @@ class SehuatangCrawler: pass ua = self.driver.execute_script("return navigator.userAgent") self.session = requests.Session() - self.session.headers.update({'User-Agent': ua, 'Referer': 'https://www.sehuatang.org/'}) + self.session.headers.update({'User-Agent': ua, 'Referer': 'https://www.sehuatang.net/'}) for c in self.driver.get_cookies(): try: self.session.cookies.set(c['name'], c['value'], domain=c.get('domain'), path=c.get('path', '/')) @@ -280,44 +280,91 @@ class SehuatangCrawler: actress_in_body = "" try: - resp = self.session.get(post_url, timeout=15) if self.session else requests.get(post_url, timeout=15) - soup = BeautifulSoup(resp.text, 'html.parser') - content_div = soup.find('div', {'class': 't_fsz'}) + # 修复403问题: 使用Selenium访问而不是requests + self.driver.get(post_url) + # 等待内容加载 + WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, 'body'))) + soup = BeautifulSoup(self.driver.page_source, 'html.parser') - if content_div: - magnet_tags = content_div.find_all('a', href=re.compile(r'^magnet:\?')) - for tag in magnet_tags: - href = tag.get('href', '') - if 'xt=urn:btih:' in href: - magnet_link = href - break - if not magnet_link: - text = content_div.get_text() - match = re.search(r'magnet:\?xt=urn:btih:[a-zA-Z0-9]+.*', text) - if match: magnet_link = match.group(0) - magnet_link = self.clean_magnet(magnet_link) + # 调试: 检查页面标题(验证是否成功加载) + page_title = soup.find('title') + logger.debug(f"详情页标题: {page_title.get_text() if page_title else 'None'}, URL: {post_url}") - imgs = content_div.find_all('img') - for img in imgs: - zoomfile = img.get('zoomfile') - if zoomfile and zoomfile.startswith('http'): - cover_image = zoomfile - break - file_attr = img.get('file') - if file_attr and file_attr.startswith('http'): - cover_image = file_attr - break + # 修复1: 使用正确的选择器 + # 方法1: 直接找 td class="t_f" + content_td = soup.find('td', {'class': 't_f'}) - text_content = content_div.get_text(separator='\n') - actress_match = re.search(r'(?:【|\[)(?:出演)?女优(?:】|\])\s*[::]\s*(.*)', text_content) - if actress_match: - raw_actress = actress_match.group(1).strip() - actress_in_body = raw_actress.split('<')[0].strip() + # 方法2: 如果上面不行,尝试找 id="postmessage_*" + if not content_td: + content_td = soup.find('td', {'id': lambda x: x and x.startswith('postmessage_')}) - except Exception: - pass + # 调试: 检查是否找到容器 + if content_td: + logger.debug(f"✓ 找到内容容器: id={content_td.get('id', '')}") + else: + logger.warning(f"✗ 未找到内容容器 (td class='t_f' 或 id='postmessage_*')") + # 尝试打印所有td标签看看有什么 + all_tds = soup.find_all('td') + logger.debug(f"页面共有 {len(all_tds)} 个td标签") + return magnet_link, cover_image, actress_in_body + + # 修复2: 磁力链接在
里的纯文本中 + magnet_div = content_td.find('div', {'class': 'blockcode'}) + if magnet_div: + # 直接提取文本,去掉空白符 + magnet_text = magnet_div.get_text(strip=True) + logger.debug(f"找到blockcode, 内容: {magnet_text[:100]}") + # 用正则匹配磁力链接 + match = re.search(r'(magnet:\?xt=urn:btih:[a-zA-Z0-9]+[^\s"\'<>\u4e00-\u9fa5]*)', magnet_text) + if match: + magnet_link = match.group(1) + logger.debug(f"✓ 提取到磁力链接: {magnet_link[:50]}...") + else: + logger.debug(f"✗ blockcode中未匹配到磁力链接") + else: + logger.debug(f"✗ 未找到div class='blockcode'") + + # 兜底:如果上面没找到,再尝试在整个文本中搜索 + if not magnet_link: + full_text = content_td.get_text() + match = re.search(r'magnet:\?xt=urn:btih:[a-zA-Z0-9]+.*', full_text) + if match: + magnet_link = self.clean_magnet(match.group(0)) + logger.debug(f"✓ 兜底提取到磁力链接: {magnet_link[:50]}...") + else: + logger.debug(f"✗ 整个文本中也未找到磁力链接") + + # 封面图:逻辑正确,但要确保在正确的容器里查找 + imgs = content_td.find_all('img') + logger.debug(f"内容容器中共有 {len(imgs)} 个img标签") + for img in imgs: + zoomfile = img.get('zoomfile') + file_attr = img.get('file') + if zoomfile and zoomfile.startswith('http') and 'static/image/common/none.gif' not in zoomfile: + cover_image = zoomfile + logger.debug(f"✓ 使用zoomfile作为封面: {cover_image[:50]}...") + break + if file_attr and file_attr.startswith('http') and 'static/image/common/none.gif' not in file_attr: + cover_image = file_attr + logger.debug(f"✓ 使用file作为封面: {cover_image[:50]}...") + break + + if not cover_image: + logger.debug(f"✗ 未找到合适的封面图") + + # 女优:从文本中提取 + text_content = content_td.get_text(separator='\n') + actress_match = re.search(r'(?:【|\[)(?:出演)?女优(?:】|\])\s*[::]\s*([^\n<]+)', text_content) + if actress_match: + raw_actress = actress_match.group(1).strip() + actress_in_body = raw_actress.split('<')[0].split()[0].strip() + logger.debug(f"✓ 提取到女优: {actress_in_body}") + else: + logger.debug(f"✗ 未匹配到女优信息") + + except Exception as e: + logger.error(f"详情页解析异常: {e}, URL: {post_url}") - # 返回三个值 return magnet_link, cover_image, actress_in_body def crawl_forum(self, fid, category_name): @@ -328,7 +375,7 @@ class SehuatangCrawler: if RUN_MODE == 'daily' and consecutive_old_posts > 20: break - list_url = f"https://www.sehuatang.org/forum-{fid}-{page}.html" + list_url = f"https://www.sehuatang.net/forum-{fid}-{page}.html" logger.info(f"正在爬取第 {page} 页") try: @@ -349,7 +396,7 @@ class SehuatangCrawler: date_span = thread.find('td', {'class': 'by'}).find('em').find('span') raw_date = date_span.get_text() if date_span else "" publish_date = self.parse_relative_date(raw_date) - + logger.debug("publish_date: "+ publish_date) if RUN_MODE == 'daily' and ONLY_CRAWL_TODAY: if publish_date != self.today_str: consecutive_old_posts += 1 @@ -365,11 +412,11 @@ class SehuatangCrawler: continue partial_url = title_tag.get('href') - full_url = f"https://www.sehuatang.org/{partial_url}" - + full_url = f"https://www.sehuatang.net/{partial_url}" + logger.info("get url : " + full_url) # 获取详情页数据(含女优) magnet, cover, body_actress = self.parse_detail_page(full_url) - + logger.debug(f"magnet: {magnet}, cover: {cover}, body_actress: {body_actress}") # ================= 决策逻辑 ================= # 优先用详情页里抓到的 body_actress # 如果没抓到,再尝试用标题分析