加个每天爬取数据

This commit is contained in:
liuwei
2026-01-07 16:01:04 +08:00
parent b39e93fead
commit 89455de986
2 changed files with 94 additions and 40 deletions

View File

@@ -11,6 +11,8 @@ from robot import Robot
from loguru import logger
from utils.sehuatang.sehuatang_bot import SehuatangCrawler
# INFO log (contains INFO and DEBUG, but not WARNING or ERROR)
logger.add(
f"wx_info.log",
@@ -147,6 +149,11 @@ def jobs(robot: Robot):
if hasattr(robot, 'message_storage') and robot.message_storage:
await robot.message_storage.process_pending_images(minutes_ago=10, batch_size=20)
# Crawl Sehuatang data once a day and store it in the database
@async_job.at_times(["11:30"])
async def process_pending_sehuatang_job():
    """Scheduled job: run the Sehuatang crawler at 11:30.

    The crawler is invoked through a plain (non-awaited) ``run()`` call, so
    it is executed on the event loop's default executor instead of directly
    inside this coroutine. That keeps the loop free to service the other
    async jobs while the crawl is in progress.

    Any exception raised by the crawler propagates to the scheduler, exactly
    as it did when ``run()`` was called inline.

    NOTE(review): assumes ``SehuatangCrawler.run`` is an ordinary blocking
    method (it is called without ``await``) -- confirm against the crawler.
    """
    # Local import so this block does not depend on the module's import list.
    import asyncio

    def _crawl():
        # Construct and run on the same worker thread so that any resources
        # the crawler creates are owned by the thread that uses them.
        SehuatangCrawler().run()

    await asyncio.get_running_loop().run_in_executor(None, _crawl)
if __name__ == "__main__":

View File

@@ -127,7 +127,7 @@ class SehuatangCrawler:
def bypass_age_verification(self):
try:
self.driver.get("https://www.sehuatang.org/")
self.driver.get("https://www.sehuatang.net/")
try:
btn = WebDriverWait(self.driver, 6).until(EC.element_to_be_clickable((By.XPATH, '//a[contains(text(), "满18岁请点此进入")]')))
btn.click()
@@ -141,7 +141,7 @@ class SehuatangCrawler:
pass
ua = self.driver.execute_script("return navigator.userAgent")
self.session = requests.Session()
self.session.headers.update({'User-Agent': ua, 'Referer': 'https://www.sehuatang.org/'})
self.session.headers.update({'User-Agent': ua, 'Referer': 'https://www.sehuatang.net/'})
for c in self.driver.get_cookies():
try:
self.session.cookies.set(c['name'], c['value'], domain=c.get('domain'), path=c.get('path', '/'))
@@ -280,44 +280,91 @@ class SehuatangCrawler:
actress_in_body = ""
try:
resp = self.session.get(post_url, timeout=15) if self.session else requests.get(post_url, timeout=15)
soup = BeautifulSoup(resp.text, 'html.parser')
content_div = soup.find('div', {'class': 't_fsz'})
# 修复403问题: 使用Selenium访问而不是requests
self.driver.get(post_url)
# 等待内容加载
WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, 'body')))
soup = BeautifulSoup(self.driver.page_source, 'html.parser')
if content_div:
magnet_tags = content_div.find_all('a', href=re.compile(r'^magnet:\?'))
for tag in magnet_tags:
href = tag.get('href', '')
if 'xt=urn:btih:' in href:
magnet_link = href
break
if not magnet_link:
text = content_div.get_text()
match = re.search(r'magnet:\?xt=urn:btih:[a-zA-Z0-9]+.*', text)
if match: magnet_link = match.group(0)
magnet_link = self.clean_magnet(magnet_link)
# 调试: 检查页面标题(验证是否成功加载)
page_title = soup.find('title')
logger.debug(f"详情页标题: {page_title.get_text() if page_title else 'None'}, URL: {post_url}")
imgs = content_div.find_all('img')
for img in imgs:
zoomfile = img.get('zoomfile')
if zoomfile and zoomfile.startswith('http'):
cover_image = zoomfile
break
file_attr = img.get('file')
if file_attr and file_attr.startswith('http'):
cover_image = file_attr
break
# 修复1: 使用正确的选择器
# 方法1: 直接找 td class="t_f"
content_td = soup.find('td', {'class': 't_f'})
text_content = content_div.get_text(separator='\n')
actress_match = re.search(r'(?:【|\[)(?:出演)?女优(?:】|\])\s*[:]\s*(.*)', text_content)
if actress_match:
raw_actress = actress_match.group(1).strip()
actress_in_body = raw_actress.split('<')[0].strip()
# 方法2: 如果上面不行,尝试找 id="postmessage_*"
if not content_td:
content_td = soup.find('td', {'id': lambda x: x and x.startswith('postmessage_')})
except Exception:
pass
# 调试: 检查是否找到容器
if content_td:
logger.debug(f"✓ 找到内容容器: id={content_td.get('id', '')}")
else:
logger.warning(f"✗ 未找到内容容器 (td class='t_f' 或 id='postmessage_*')")
# 尝试打印所有td标签看看有什么
all_tds = soup.find_all('td')
logger.debug(f"页面共有 {len(all_tds)} 个td标签")
return magnet_link, cover_image, actress_in_body
# 修复2: 磁力链接在 <div class="blockcode"> 里的纯文本中
magnet_div = content_td.find('div', {'class': 'blockcode'})
if magnet_div:
# 直接提取文本,去掉空白符
magnet_text = magnet_div.get_text(strip=True)
logger.debug(f"找到blockcode, 内容: {magnet_text[:100]}")
# 用正则匹配磁力链接
match = re.search(r'(magnet:\?xt=urn:btih:[a-zA-Z0-9]+[^\s"\'<>\u4e00-\u9fa5]*)', magnet_text)
if match:
magnet_link = match.group(1)
logger.debug(f"✓ 提取到磁力链接: {magnet_link[:50]}...")
else:
logger.debug(f"✗ blockcode中未匹配到磁力链接")
else:
logger.debug(f"✗ 未找到div class='blockcode'")
# 兜底:如果上面没找到,再尝试在整个文本中搜索
if not magnet_link:
full_text = content_td.get_text()
match = re.search(r'magnet:\?xt=urn:btih:[a-zA-Z0-9]+.*', full_text)
if match:
magnet_link = self.clean_magnet(match.group(0))
logger.debug(f"✓ 兜底提取到磁力链接: {magnet_link[:50]}...")
else:
logger.debug(f"✗ 整个文本中也未找到磁力链接")
# 封面图:逻辑正确,但要确保在正确的容器里查找
imgs = content_td.find_all('img')
logger.debug(f"内容容器中共有 {len(imgs)} 个img标签")
for img in imgs:
zoomfile = img.get('zoomfile')
file_attr = img.get('file')
if zoomfile and zoomfile.startswith('http') and 'static/image/common/none.gif' not in zoomfile:
cover_image = zoomfile
logger.debug(f"✓ 使用zoomfile作为封面: {cover_image[:50]}...")
break
if file_attr and file_attr.startswith('http') and 'static/image/common/none.gif' not in file_attr:
cover_image = file_attr
logger.debug(f"✓ 使用file作为封面: {cover_image[:50]}...")
break
if not cover_image:
logger.debug(f"✗ 未找到合适的封面图")
# 女优:从文本中提取
text_content = content_td.get_text(separator='\n')
actress_match = re.search(r'(?:【|\[)(?:出演)?女优(?:】|\])\s*[:]\s*([^\n<]+)', text_content)
if actress_match:
raw_actress = actress_match.group(1).strip()
actress_in_body = raw_actress.split('<')[0].split()[0].strip()
logger.debug(f"✓ 提取到女优: {actress_in_body}")
else:
logger.debug(f"✗ 未匹配到女优信息")
except Exception as e:
logger.error(f"详情页解析异常: {e}, URL: {post_url}")
# 返回三个值
return magnet_link, cover_image, actress_in_body
def crawl_forum(self, fid, category_name):
@@ -328,7 +375,7 @@ class SehuatangCrawler:
if RUN_MODE == 'daily' and consecutive_old_posts > 20:
break
list_url = f"https://www.sehuatang.org/forum-{fid}-{page}.html"
list_url = f"https://www.sehuatang.net/forum-{fid}-{page}.html"
logger.info(f"正在爬取第 {page}")
try:
@@ -349,7 +396,7 @@ class SehuatangCrawler:
date_span = thread.find('td', {'class': 'by'}).find('em').find('span')
raw_date = date_span.get_text() if date_span else ""
publish_date = self.parse_relative_date(raw_date)
logger.debug("publish_date: "+ publish_date)
if RUN_MODE == 'daily' and ONLY_CRAWL_TODAY:
if publish_date != self.today_str:
consecutive_old_posts += 1
@@ -365,11 +412,11 @@ class SehuatangCrawler:
continue
partial_url = title_tag.get('href')
full_url = f"https://www.sehuatang.org/{partial_url}"
full_url = f"https://www.sehuatang.net/{partial_url}"
logger.info("get url : " + full_url)
# 获取详情页数据(含女优)
magnet, cover, body_actress = self.parse_detail_page(full_url)
logger.debug(f"magnet: {magnet}, cover: {cover}, body_actress: {body_actress}")
# ================= 决策逻辑 =================
# 优先用详情页里抓到的 body_actress
# 如果没抓到,再尝试用标题分析