加个每天爬取数据

2026-01-07 16:01:04 +08:00
parent b39e93fead
commit 89455de986
2 changed files with 94 additions and 40 deletions
--- a/main.py
+++ b/main.py
@@ -11,6 +11,8 @@ from robot import Robot

 from loguru import logger

+from utils.sehuatang.sehuatang_bot import SehuatangCrawler
+
 # INFO 日志（包含 INFO、DEBUG，但不包含 WARNING、ERROR）
 logger.add(
    f"wx_info.log",
@@ -147,6 +149,11 @@ def jobs(robot: Robot):
        if hasattr(robot, 'message_storage') and robot.message_storage:
            await robot.message_storage.process_pending_images(minutes_ago=10, batch_size=20)

+    # Daily 11:30 crawl of sehuatang into storage; runs in a worker thread because the Selenium crawler blocks.
+    @async_job.at_times(["11:30"])
+    async def process_pending_sehuatang_job():
+        import asyncio  # local import so this hunk does not depend on main.py's (unseen) import block
+        await asyncio.get_running_loop().run_in_executor(None, lambda: SehuatangCrawler().run())


 if __name__ == "__main__":
--- a/utils/sehuatang/sehuatang_bot.py
+++ b/utils/sehuatang/sehuatang_bot.py
@@ -127,7 +127,7 @@ class SehuatangCrawler:

    def bypass_age_verification(self):
        try:
-            self.driver.get("https://www.sehuatang.org/")
+            self.driver.get("https://www.sehuatang.net/")
            try:
                btn = WebDriverWait(self.driver, 6).until(EC.element_to_be_clickable((By.XPATH, '//a[contains(text(), "满18岁，请点此进入")]')))
                btn.click()
@@ -141,7 +141,7 @@ class SehuatangCrawler:
                    pass
            ua = self.driver.execute_script("return navigator.userAgent")
            self.session = requests.Session()
-            self.session.headers.update({'User-Agent': ua, 'Referer': 'https://www.sehuatang.org/'})
+            self.session.headers.update({'User-Agent': ua, 'Referer': 'https://www.sehuatang.net/'})
            for c in self.driver.get_cookies():
                try:
                    self.session.cookies.set(c['name'], c['value'], domain=c.get('domain'), path=c.get('path', '/'))
@@ -280,44 +280,91 @@ class SehuatangCrawler:
        actress_in_body = ""

        try:
-            resp = self.session.get(post_url, timeout=15) if self.session else requests.get(post_url, timeout=15)
-            soup = BeautifulSoup(resp.text, 'html.parser')
-            content_div = soup.find('div', {'class': 't_fsz'})
+            # 修复403问题: 使用Selenium访问而不是requests
+            self.driver.get(post_url)
+            # 等待内容加载
+            WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, 'body')))
+            soup = BeautifulSoup(self.driver.page_source, 'html.parser')

-            if content_div:
-                magnet_tags = content_div.find_all('a', href=re.compile(r'^magnet:\?'))
-                for tag in magnet_tags:
-                    href = tag.get('href', '')
-                    if 'xt=urn:btih:' in href:
-                        magnet_link = href
-                        break
-                if not magnet_link:
-                    text = content_div.get_text()
-                    match = re.search(r'magnet:\?xt=urn:btih:[a-zA-Z0-9]+.*', text)
-                    if match: magnet_link = match.group(0)
-                magnet_link = self.clean_magnet(magnet_link)
+            # 调试: 检查页面标题（验证是否成功加载）
+            page_title = soup.find('title')
+            logger.debug(f"详情页标题: {page_title.get_text() if page_title else 'None'}, URL: {post_url}")

-                imgs = content_div.find_all('img')
-                for img in imgs:
-                    zoomfile = img.get('zoomfile')
-                    if zoomfile and zoomfile.startswith('http'):
-                        cover_image = zoomfile
-                        break
-                    file_attr = img.get('file')
-                    if file_attr and file_attr.startswith('http'):
-                        cover_image = file_attr
-                        break
+            # 修复1: 使用正确的选择器
+            # 方法1: 直接找 td class="t_f"
+            content_td = soup.find('td', {'class': 't_f'})

-                text_content = content_div.get_text(separator='\n')
-                actress_match = re.search(r'(?:【|\[)(?:出演)?女优(?:】|\])\s*[:：]\s*(.*)', text_content)
-                if actress_match:
-                    raw_actress = actress_match.group(1).strip()
-                    actress_in_body = raw_actress.split('<')[0].strip()
+            # 方法2: 如果上面不行,尝试找 id="postmessage_*"
+            if not content_td:
+                content_td = soup.find('td', {'id': lambda x: x and x.startswith('postmessage_')})

-        except Exception:
-            pass
+            # 调试: 检查是否找到容器
+            if content_td:
+                logger.debug(f"✓ 找到内容容器: id={content_td.get('id', '')}")
+            else:
+                logger.warning(f"✗ 未找到内容容器 (td class='t_f' 或 id='postmessage_*')")
+                # 尝试打印所有td标签看看有什么
+                all_tds = soup.find_all('td')
+                logger.debug(f"页面共有 {len(all_tds)} 个td标签")
+                return magnet_link, cover_image, actress_in_body
+
+            # 修复2: 磁力链接在 <div class="blockcode"> 里的纯文本中
+            magnet_div = content_td.find('div', {'class': 'blockcode'})
+            if magnet_div:
+                # 直接提取文本,去掉空白符
+                magnet_text = magnet_div.get_text(strip=True)
+                logger.debug(f"找到blockcode, 内容: {magnet_text[:100]}")
+                # 用正则匹配磁力链接
+                match = re.search(r'(magnet:\?xt=urn:btih:[a-zA-Z0-9]+[^\s"\'<>\u4e00-\u9fa5]*)', magnet_text)
+                if match:
+                    magnet_link = match.group(1)
+                    logger.debug(f"✓ 提取到磁力链接: {magnet_link[:50]}...")
+                else:
+                    logger.debug(f"✗ blockcode中未匹配到磁力链接")
+            else:
+                logger.debug(f"✗ 未找到div class='blockcode'")
+
+            # 兜底:如果上面没找到,再尝试在整个文本中搜索
+            if not magnet_link:
+                full_text = content_td.get_text()
+                match = re.search(r'magnet:\?xt=urn:btih:[a-zA-Z0-9]+.*', full_text)
+                if match:
+                    magnet_link = self.clean_magnet(match.group(0))
+                    logger.debug(f"✓ 兜底提取到磁力链接: {magnet_link[:50]}...")
+                else:
+                    logger.debug(f"✗ 整个文本中也未找到磁力链接")
+
+            # 封面图:逻辑正确,但要确保在正确的容器里查找
+            imgs = content_td.find_all('img')
+            logger.debug(f"内容容器中共有 {len(imgs)} 个img标签")
+            for img in imgs:
+                zoomfile = img.get('zoomfile')
+                file_attr = img.get('file')
+                if zoomfile and zoomfile.startswith('http') and 'static/image/common/none.gif' not in zoomfile:
+                    cover_image = zoomfile
+                    logger.debug(f"✓ 使用zoomfile作为封面: {cover_image[:50]}...")
+                    break
+                if file_attr and file_attr.startswith('http') and 'static/image/common/none.gif' not in file_attr:
+                    cover_image = file_attr
+                    logger.debug(f"✓ 使用file作为封面: {cover_image[:50]}...")
+                    break
+
+            if not cover_image:
+                logger.debug(f"✗ 未找到合适的封面图")
+
+            # 女优:从文本中提取
+            text_content = content_td.get_text(separator='\n')
+            actress_match = re.search(r'(?:【|\[)(?:出演)?女优(?:】|\])\s*[:：]\s*([^\n<]+)', text_content)
+            if actress_match:
+                raw_actress = actress_match.group(1).strip()
+                actress_in_body = raw_actress.split('<')[0].split()[0].strip()
+                logger.debug(f"✓ 提取到女优: {actress_in_body}")
+            else:
+                logger.debug(f"✗ 未匹配到女优信息")
+
+        except Exception as e:
+            logger.error(f"详情页解析异常: {e}, URL: {post_url}")

-        # 返回三个值
        return magnet_link, cover_image, actress_in_body

    def crawl_forum(self, fid, category_name):
@@ -328,7 +375,7 @@ class SehuatangCrawler:
            if RUN_MODE == 'daily' and consecutive_old_posts > 20:
                break

-            list_url = f"https://www.sehuatang.org/forum-{fid}-{page}.html"
+            list_url = f"https://www.sehuatang.net/forum-{fid}-{page}.html"
            logger.info(f"正在爬取第 {page} 页")

            try:
@@ -349,7 +396,7 @@ class SehuatangCrawler:
                        date_span = thread.find('td', {'class': 'by'}).find('em').find('span')
                        raw_date = date_span.get_text() if date_span else ""
                        publish_date = self.parse_relative_date(raw_date)
-
+                        logger.debug("publish_date: "+ publish_date)
                        if RUN_MODE == 'daily' and ONLY_CRAWL_TODAY:
                            if publish_date != self.today_str:
                                consecutive_old_posts += 1
@@ -365,11 +412,11 @@ class SehuatangCrawler:
                            continue

                        partial_url = title_tag.get('href')
-                        full_url = f"https://www.sehuatang.org/{partial_url}"
-
+                        full_url = f"https://www.sehuatang.net/{partial_url}"
+                        logger.info("get url : " + full_url)
                        # 获取详情页数据（含女优）
                        magnet, cover, body_actress = self.parse_detail_page(full_url)
-
+                        logger.debug(f"magnet: {magnet}, cover: {cover}, body_actress: {body_actress}")
                        # ================= 决策逻辑 =================
                        # 优先用详情页里抓到的 body_actress
                        # 如果没抓到，再尝试用标题分析