加个每天爬取数据的定时任务
This commit is contained in:
7
main.py
7
main.py
@@ -11,6 +11,8 @@ from robot import Robot
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from utils.sehuatang.sehuatang_bot import SehuatangCrawler
|
||||
|
||||
# INFO 日志(包含 INFO、DEBUG,但不包含 WARNING、ERROR)
|
||||
logger.add(
|
||||
f"wx_info.log",
|
||||
@@ -147,6 +149,11 @@ def jobs(robot: Robot):
|
||||
if hasattr(robot, 'message_storage') and robot.message_storage:
|
||||
await robot.message_storage.process_pending_images(minutes_ago=10, batch_size=20)
|
||||
|
||||
# Crawl sehuatang data once a day and store it in the database.
@async_job.at_times(["11:30"])
async def process_pending_sehuatang_job():
    """Run the daily Sehuatang crawl (scheduled for 11:30).

    The crawler is invoked via ``SehuatangCrawler().run()``, which is called
    without ``await`` and drives a Selenium browser, i.e. it is blocking
    work. Calling it directly inside this coroutine would stall the event
    loop (and every other scheduled async job) until the crawl finishes, so
    it is handed to the loop's default thread-pool executor instead.

    NOTE(review): assumes ``run`` is a plain synchronous method that takes
    no arguments -- confirm against ``SehuatangCrawler``.
    """
    # Local import so this block does not depend on the module's import list.
    import asyncio

    crawler = SehuatangCrawler()
    loop = asyncio.get_running_loop()
    # Exceptions raised by run() still propagate here when awaited.
    await loop.run_in_executor(None, crawler.run)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -127,7 +127,7 @@ class SehuatangCrawler:
|
||||
|
||||
def bypass_age_verification(self):
|
||||
try:
|
||||
self.driver.get("https://www.sehuatang.org/")
|
||||
self.driver.get("https://www.sehuatang.net/")
|
||||
try:
|
||||
btn = WebDriverWait(self.driver, 6).until(EC.element_to_be_clickable((By.XPATH, '//a[contains(text(), "满18岁,请点此进入")]')))
|
||||
btn.click()
|
||||
@@ -141,7 +141,7 @@ class SehuatangCrawler:
|
||||
pass
|
||||
ua = self.driver.execute_script("return navigator.userAgent")
|
||||
self.session = requests.Session()
|
||||
self.session.headers.update({'User-Agent': ua, 'Referer': 'https://www.sehuatang.org/'})
|
||||
self.session.headers.update({'User-Agent': ua, 'Referer': 'https://www.sehuatang.net/'})
|
||||
for c in self.driver.get_cookies():
|
||||
try:
|
||||
self.session.cookies.set(c['name'], c['value'], domain=c.get('domain'), path=c.get('path', '/'))
|
||||
@@ -280,44 +280,91 @@ class SehuatangCrawler:
|
||||
actress_in_body = ""
|
||||
|
||||
try:
|
||||
resp = self.session.get(post_url, timeout=15) if self.session else requests.get(post_url, timeout=15)
|
||||
soup = BeautifulSoup(resp.text, 'html.parser')
|
||||
content_div = soup.find('div', {'class': 't_fsz'})
|
||||
# 修复403问题: 使用Selenium访问而不是requests
|
||||
self.driver.get(post_url)
|
||||
# 等待内容加载
|
||||
WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, 'body')))
|
||||
soup = BeautifulSoup(self.driver.page_source, 'html.parser')
|
||||
|
||||
if content_div:
|
||||
magnet_tags = content_div.find_all('a', href=re.compile(r'^magnet:\?'))
|
||||
for tag in magnet_tags:
|
||||
href = tag.get('href', '')
|
||||
if 'xt=urn:btih:' in href:
|
||||
magnet_link = href
|
||||
break
|
||||
if not magnet_link:
|
||||
text = content_div.get_text()
|
||||
match = re.search(r'magnet:\?xt=urn:btih:[a-zA-Z0-9]+.*', text)
|
||||
if match: magnet_link = match.group(0)
|
||||
magnet_link = self.clean_magnet(magnet_link)
|
||||
# 调试: 检查页面标题(验证是否成功加载)
|
||||
page_title = soup.find('title')
|
||||
logger.debug(f"详情页标题: {page_title.get_text() if page_title else 'None'}, URL: {post_url}")
|
||||
|
||||
imgs = content_div.find_all('img')
|
||||
for img in imgs:
|
||||
zoomfile = img.get('zoomfile')
|
||||
if zoomfile and zoomfile.startswith('http'):
|
||||
cover_image = zoomfile
|
||||
break
|
||||
file_attr = img.get('file')
|
||||
if file_attr and file_attr.startswith('http'):
|
||||
cover_image = file_attr
|
||||
break
|
||||
# 修复1: 使用正确的选择器
|
||||
# 方法1: 直接找 td class="t_f"
|
||||
content_td = soup.find('td', {'class': 't_f'})
|
||||
|
||||
text_content = content_div.get_text(separator='\n')
|
||||
actress_match = re.search(r'(?:【|\[)(?:出演)?女优(?:】|\])\s*[::]\s*(.*)', text_content)
|
||||
if actress_match:
|
||||
raw_actress = actress_match.group(1).strip()
|
||||
actress_in_body = raw_actress.split('<')[0].strip()
|
||||
# 方法2: 如果上面不行,尝试找 id="postmessage_*"
|
||||
if not content_td:
|
||||
content_td = soup.find('td', {'id': lambda x: x and x.startswith('postmessage_')})
|
||||
|
||||
except Exception:
|
||||
pass
|
||||
# 调试: 检查是否找到容器
|
||||
if content_td:
|
||||
logger.debug(f"✓ 找到内容容器: id={content_td.get('id', '')}")
|
||||
else:
|
||||
logger.warning(f"✗ 未找到内容容器 (td class='t_f' 或 id='postmessage_*')")
|
||||
# 尝试打印所有td标签看看有什么
|
||||
all_tds = soup.find_all('td')
|
||||
logger.debug(f"页面共有 {len(all_tds)} 个td标签")
|
||||
return magnet_link, cover_image, actress_in_body
|
||||
|
||||
# 修复2: 磁力链接在 <div class="blockcode"> 里的纯文本中
|
||||
magnet_div = content_td.find('div', {'class': 'blockcode'})
|
||||
if magnet_div:
|
||||
# 直接提取文本,去掉空白符
|
||||
magnet_text = magnet_div.get_text(strip=True)
|
||||
logger.debug(f"找到blockcode, 内容: {magnet_text[:100]}")
|
||||
# 用正则匹配磁力链接
|
||||
match = re.search(r'(magnet:\?xt=urn:btih:[a-zA-Z0-9]+[^\s"\'<>\u4e00-\u9fa5]*)', magnet_text)
|
||||
if match:
|
||||
magnet_link = match.group(1)
|
||||
logger.debug(f"✓ 提取到磁力链接: {magnet_link[:50]}...")
|
||||
else:
|
||||
logger.debug(f"✗ blockcode中未匹配到磁力链接")
|
||||
else:
|
||||
logger.debug(f"✗ 未找到div class='blockcode'")
|
||||
|
||||
# 兜底:如果上面没找到,再尝试在整个文本中搜索
|
||||
if not magnet_link:
|
||||
full_text = content_td.get_text()
|
||||
match = re.search(r'magnet:\?xt=urn:btih:[a-zA-Z0-9]+.*', full_text)
|
||||
if match:
|
||||
magnet_link = self.clean_magnet(match.group(0))
|
||||
logger.debug(f"✓ 兜底提取到磁力链接: {magnet_link[:50]}...")
|
||||
else:
|
||||
logger.debug(f"✗ 整个文本中也未找到磁力链接")
|
||||
|
||||
# 封面图:逻辑正确,但要确保在正确的容器里查找
|
||||
imgs = content_td.find_all('img')
|
||||
logger.debug(f"内容容器中共有 {len(imgs)} 个img标签")
|
||||
for img in imgs:
|
||||
zoomfile = img.get('zoomfile')
|
||||
file_attr = img.get('file')
|
||||
if zoomfile and zoomfile.startswith('http') and 'static/image/common/none.gif' not in zoomfile:
|
||||
cover_image = zoomfile
|
||||
logger.debug(f"✓ 使用zoomfile作为封面: {cover_image[:50]}...")
|
||||
break
|
||||
if file_attr and file_attr.startswith('http') and 'static/image/common/none.gif' not in file_attr:
|
||||
cover_image = file_attr
|
||||
logger.debug(f"✓ 使用file作为封面: {cover_image[:50]}...")
|
||||
break
|
||||
|
||||
if not cover_image:
|
||||
logger.debug(f"✗ 未找到合适的封面图")
|
||||
|
||||
# 女优:从文本中提取
|
||||
text_content = content_td.get_text(separator='\n')
|
||||
actress_match = re.search(r'(?:【|\[)(?:出演)?女优(?:】|\])\s*[::]\s*([^\n<]+)', text_content)
|
||||
if actress_match:
|
||||
raw_actress = actress_match.group(1).strip()
|
||||
actress_in_body = raw_actress.split('<')[0].split()[0].strip()
|
||||
logger.debug(f"✓ 提取到女优: {actress_in_body}")
|
||||
else:
|
||||
logger.debug(f"✗ 未匹配到女优信息")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"详情页解析异常: {e}, URL: {post_url}")
|
||||
|
||||
# 返回三个值
|
||||
return magnet_link, cover_image, actress_in_body
|
||||
|
||||
def crawl_forum(self, fid, category_name):
|
||||
@@ -328,7 +375,7 @@ class SehuatangCrawler:
|
||||
if RUN_MODE == 'daily' and consecutive_old_posts > 20:
|
||||
break
|
||||
|
||||
list_url = f"https://www.sehuatang.org/forum-{fid}-{page}.html"
|
||||
list_url = f"https://www.sehuatang.net/forum-{fid}-{page}.html"
|
||||
logger.info(f"正在爬取第 {page} 页")
|
||||
|
||||
try:
|
||||
@@ -349,7 +396,7 @@ class SehuatangCrawler:
|
||||
date_span = thread.find('td', {'class': 'by'}).find('em').find('span')
|
||||
raw_date = date_span.get_text() if date_span else ""
|
||||
publish_date = self.parse_relative_date(raw_date)
|
||||
|
||||
logger.debug("publish_date: "+ publish_date)
|
||||
if RUN_MODE == 'daily' and ONLY_CRAWL_TODAY:
|
||||
if publish_date != self.today_str:
|
||||
consecutive_old_posts += 1
|
||||
@@ -365,11 +412,11 @@ class SehuatangCrawler:
|
||||
continue
|
||||
|
||||
partial_url = title_tag.get('href')
|
||||
full_url = f"https://www.sehuatang.org/{partial_url}"
|
||||
|
||||
full_url = f"https://www.sehuatang.net/{partial_url}"
|
||||
logger.info("get url : " + full_url)
|
||||
# 获取详情页数据(含女优)
|
||||
magnet, cover, body_actress = self.parse_detail_page(full_url)
|
||||
|
||||
logger.debug(f"magnet: {magnet}, cover: {cover}, body_actress: {body_actress}")
|
||||
# ================= 决策逻辑 =================
|
||||
# 优先用详情页里抓到的 body_actress
|
||||
# 如果没抓到,再尝试用标题分析
|
||||
|
||||
Reference in New Issue
Block a user