302 lines
9.2 KiB
Python
302 lines
9.2 KiB
Python
# -*- coding: utf-8 -*-
"""
Program: Global News Crawler
Author: liu.wei (based on MrCrawL's work)
Created Date: 2024-05-01
"""
|
||
import requests
|
||
from time import localtime, sleep
|
||
from lxml import etree
|
||
from loguru import logger
|
||
from datetime import datetime
|
||
import time
|
||
|
||
|
||
# Request configuration
HEADERS = {
    # Browser-like User-Agent sent with every request
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/91.0.4472.124 Safari/537.36'
    ),
}
TIMEOUT = 10      # passed as the ``timeout`` argument of requests.get
MAX_RETRIES = 3   # retries attempted after the first failed request
NEWS_LIMIT = 30   # maximum number of front-page links visited per site
|
||
|
||
|
||
def get_time():
    """Return today's local date formatted as ``YYYY-MM-DD``.

    The clock is sampled exactly once so that the year, month and day
    always come from the same instant, even if the call straddles midnight.

    Returns:
        str: The zero-padded local date, e.g. ``'2024-05-01'``.
    """
    now = localtime()
    return f'{now.tm_year:04d}-{now.tm_mon:02d}-{now.tm_mday:02d}'
|
||
|
||
|
||
def title_tidy(title_list):
    """Collapse runs of equal neighbouring entries, keeping the first of each run.

    Only *adjacent* duplicates are removed; equal values separated by a
    different value are all kept. The list is modified in place and the
    same list object is returned.
    """
    # Walk from the tail so a deletion never shifts positions still to be checked
    for pos in range(len(title_list) - 1, 0, -1):
        if title_list[pos] == title_list[pos - 1]:
            del title_list[pos]
    return title_list
|
||
|
||
|
||
def safe_request(url, retry_count=0):
    """GET *url* with the module headers and timeout, retrying on failure.

    Args:
        url: Address to fetch.
        retry_count: Number of retries already used; retrying stops once
            this reaches ``MAX_RETRIES``.

    Returns:
        The response object when the request succeeds with a non-error
        status, otherwise ``None`` after the retries are exhausted.
    """
    attempt = retry_count
    while True:
        try:
            reply = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
            # Turn 4xx/5xx statuses into exceptions so they are retried as well
            reply.raise_for_status()
            return reply
        except requests.RequestException as err:
            if attempt >= MAX_RETRIES:
                logger.error(f"请求失败: {url}, 错误: {str(err)}")
                return None
            logger.warning(f"请求失败,正在进行第{attempt + 1}次重试: {url}")
            # Short pause before the next attempt
            sleep(1)
            attempt += 1
|
||
|
||
|
||
def nbc():
    """Collect headlines from the NBC News front page.

    Fetches the front page, gathers the links found under ``<h2>``
    headings, drops adjacent duplicates and visits at most ``NEWS_LIMIT``
    of them to read the first ``<h1>`` text node of each page.

    Returns:
        str: One ``Title: <title>. Link: <url>`` line per page that
        yielded a headline (possibly empty), or a short failure message
        when the front page cannot be fetched or an unexpected error occurs.
    """
    from urllib.parse import urljoin

    logger.info("开始获取NBC新闻")
    try:
        base = 'https://www.nbcnews.com/'
        front_page = safe_request(base)
        if not front_page:
            return "获取NBC新闻失败"

        links = title_tidy(etree.HTML(front_page.text).xpath('//h2/a/@href'))

        lines = []
        for link in links[:NEWS_LIMIT]:
            article_url = link
            try:
                # Absolute links pass through unchanged; relative ones are
                # resolved against the site root instead of failing every retry
                article_url = urljoin(base, link)
                page = safe_request(article_url)
                if not page:
                    continue

                headline = etree.HTML(page.text).xpath('//h1/text()')
                if not headline:
                    logger.warning(f'跳过视频或其他类型新闻: {article_url}')
                    continue

                # Strip surrounding whitespace, as the other site scrapers do
                title = headline[0].strip()
                lines.append(f'Title: {title}. Link: {article_url}\n')
                # Small pause between article requests
                sleep(0.1)
            except Exception as e:
                # One bad article must not abort the whole run
                logger.error(f"处理新闻失败: {article_url}, 错误: {str(e)}")
                continue

        logger.info(f"NBC新闻获取完成,共获取{len(lines)}条")
        return ''.join(lines)

    except Exception as e:
        logger.error(f"获取NBC新闻失败: {str(e)}")
        return "获取新闻失败,请查看日志了解详情"
|
||
|
||
|
||
def cnn():
    """Collect headlines from the CNN front page.

    Fetches the front page, gathers the ``href`` of every anchor marked
    ``data-link-type="article"``, drops adjacent duplicates and visits at
    most ``NEWS_LIMIT`` of them to read the headline ``<h1>``.

    Returns:
        str: One ``Title: <title>. Link: <url>`` line per page that
        yielded a headline (possibly empty), or a short failure message
        when the front page cannot be fetched or an unexpected error occurs.
    """
    from urllib.parse import urljoin

    logger.info("开始获取CNN新闻")
    try:
        head = 'https://www.cnn.com'
        front_page = safe_request(head + '/')
        if not front_page:
            return "获取CNN新闻失败"

        links = title_tidy(
            etree.HTML(front_page.text).xpath('//a[@data-link-type="article"]/@href')
        )

        lines = []
        for link in links[:NEWS_LIMIT]:
            full_url = link
            try:
                # Plain concatenation breaks on absolute hrefs
                # ("https://www.cnn.comhttps://..."); urljoin handles
                # absolute, protocol-relative and path-relative links
                full_url = urljoin(head, link)
                page = safe_request(full_url)
                if not page:
                    continue

                headline = etree.HTML(page.text).xpath(
                    '//h1[@data-editable="headlineText"]/text()'
                )
                if not headline:
                    logger.warning(f'跳过视频或其他类型新闻: {full_url}')
                    continue

                title = headline[0].strip()
                lines.append(f'Title: {title}. Link: {full_url}\n')
                # Small pause between article requests
                sleep(0.1)
            except Exception as e:
                # One bad article must not abort the whole run
                logger.error(f"处理新闻失败: {full_url}, 错误: {str(e)}")
                continue

        logger.info(f"CNN新闻获取完成,共获取{len(lines)}条")
        return ''.join(lines)

    except Exception as e:
        logger.error(f"获取CNN新闻失败: {str(e)}")
        return "获取新闻失败,请查看日志了解详情"
|
||
|
||
|
||
def abc():
    """Collect headlines from the ABC News front page.

    Links are gathered from four front-page selectors, concatenated in
    order, stripped of adjacent duplicates and capped at ``NEWS_LIMIT``.
    Each link is turned into an absolute URL and fetched to read its
    headline text.

    Returns:
        str: One ``Title: <title>. Link: <url>`` line per page that
        yielded a headline (possibly empty), or a short failure message
        when the front page cannot be fetched or an unexpected error occurs.
    """
    logger.info("开始获取ABC新闻")
    try:
        head = 'https://abcnews.go.com'  # no trailing slash
        front_page = safe_request(head)
        if not front_page:
            return "获取ABC新闻失败"

        tree = etree.HTML(front_page.text)
        link_queries = (
            '//div[@class="HeadlinesTrio"]/a/@href',
            '//div[@class="title card"]/a[@class="AnchorLink"]/@href | //div[@class="title"]/a[@class="AnchorLink"]/@href',
            '//a[@target="_self"]/@href',
            '//a[@class="AnchorLink VideoTile"]/@href',
        )
        collected = []
        for query in link_queries:
            collected += tree.xpath(query)
        links = title_tidy(collected)

        lines = []
        for url in links[:NEWS_LIMIT]:
            try:
                # Normalise the href into an absolute URL
                if url.startswith('//'):
                    full_url = 'https:' + url
                elif url.startswith('http'):
                    full_url = url
                elif url.startswith('/'):
                    full_url = head + url
                else:
                    full_url = head + '/' + url

                page = safe_request(full_url)
                if not page:
                    continue

                # Headline text may sit directly in the <h1> or in a nested <span>
                headline = etree.HTML(page.text).xpath('//div[@data-testid="prism-headline"]/h1/text() | //div[@data-testid="prism-headline"]//span/text()')
                if not headline:
                    logger.warning(f'跳过视频或其他类型新闻: {full_url}')
                    continue

                title = headline[0].strip()
                lines.append(f'Title: {title}. Link: {full_url}\n')
                # Small pause between article requests
                sleep(0.1)
            except Exception as err:
                # One bad article must not abort the whole run
                logger.error(f"处理新闻失败: {full_url}, 错误: {str(err)}")
                continue

        logger.info(f"ABC新闻获取完成,共获取{len(lines)}条")
        return ''.join(lines)

    except Exception as err:
        logger.error(f"获取ABC新闻失败: {str(err)}")
        return "获取新闻失败,请查看日志了解详情"
|
||
|
||
|
||
def fox():
    """Collect headlines from the Fox News front page.

    Fetches the front page, gathers the links under ``<h3 class="title">``,
    drops adjacent duplicates and visits at most ``NEWS_LIMIT`` of them to
    read the headline ``<h1>``.

    Returns:
        str: One ``Title: <title>. Link: <url>`` line per page that
        yielded a headline (possibly empty), or a short failure message
        when the front page cannot be fetched or an unexpected error occurs.
    """
    from urllib.parse import urljoin

    logger.info("开始获取FOX新闻")
    try:
        head = 'https://www.foxnews.com/'
        front_page = safe_request(head)
        if not front_page:
            return "获取FOX新闻失败"

        links = title_tidy(
            etree.HTML(front_page.text).xpath('//h3[@class="title"]/a/@href')
        )

        lines = []
        for link in links[:NEWS_LIMIT]:
            article_url = link
            try:
                # Prefixing "https:" is only right for protocol-relative
                # links ("//host/path"); urljoin also resolves root-relative
                # and path-relative hrefs and leaves absolute URLs untouched
                article_url = urljoin(head, link)
                page = safe_request(article_url)
                if not page:
                    continue

                # Several alternative headline markups are accepted
                headline = etree.HTML(page.text).xpath('//h1[@itemprop="headline"]/text() | //h1[@class="headline speakable"]/text() | //h1[@class="headline"]/text()')
                if not headline:
                    logger.warning(f'跳过视频或其他类型新闻: {article_url}')
                    continue

                title = headline[0].strip()
                lines.append(f'Title: {title}. Link: {article_url}\n')
                # Small pause between article requests
                sleep(0.1)
            except Exception as e:
                # One bad article must not abort the whole run
                logger.error(f"处理新闻失败: {article_url}, 错误: {str(e)}")
                continue

        logger.info(f"FOX新闻获取完成,共获取{len(lines)}条")
        return ''.join(lines)

    except Exception as e:
        logger.error(f"获取FOX新闻失败: {str(e)}")
        return "获取新闻失败,请查看日志了解详情"
|
||
|
||
|
||
def bbc():
    """Collect headlines from the BBC front page.

    Links are read from the ``href`` of ancestors four and five levels
    above each ``card-headline`` ``<h2>``, stripped of adjacent duplicates
    and capped at ``NEWS_LIMIT``. Hrefs that already start with ``http``
    are skipped; the rest are appended to the site root and fetched.

    Returns:
        str: One ``Title: <title>. Link: <url>`` line per page that
        yielded a headline (possibly empty), or a short failure message
        when the front page cannot be fetched or an unexpected error occurs.
    """
    logger.info("开始获取BBC新闻")
    try:
        head = 'https://www.bbc.com'
        front_page = safe_request(head + '/')
        if not front_page:
            return "获取BBC新闻失败"

        links = title_tidy(etree.HTML(front_page.text).xpath(
            '//h2[@data-testid="card-headline"]/../../../../../@href | //h2[@data-testid="card-headline"]/../../../../@href'))

        lines = []
        for url in links[:NEWS_LIMIT]:
            try:
                # Only site-relative links are followed
                if url.startswith('http'):
                    continue

                full_url = head + url
                page = safe_request(full_url)
                if not page:
                    continue

                headline = etree.HTML(page.text).xpath(
                    '//div[@data-component="headline-block"]/h1/text()')
                if not headline:
                    logger.warning(f'跳过视频或其他类型新闻: {full_url}')
                    continue

                lines.append(f'Title: {headline[0]}. Link: {full_url}\n')
                # Small pause between article requests
                sleep(0.1)
            except Exception as err:
                # One bad article must not abort the whole run
                logger.error(f"处理新闻失败: {full_url}, 错误: {str(err)}")
                continue

        logger.info(f"BBC新闻获取完成,共获取{len(lines)}条")
        return ''.join(lines)

    except Exception as err:
        logger.error(f"获取BBC新闻失败: {str(err)}")
        return "获取新闻失败,请查看日志了解详情"
|
||
|
||
if __name__ == '__main__':
    # Script entry point: runs the Fox News scraper only; its return value is discarded
    fox()