Files
abot/base/func_english_news.py
2025-04-14 16:02:00 +08:00

343 lines
10 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# -*- coding: utf-8 -*-
"""
Program: English Daily News Downloader
Author: MrCrawL
Created Date: 2024-01-21
Last Modified: 2024-03-24
Modified by: MrCrawL
"""
import logging
from datetime import datetime
from time import localtime, sleep
from urllib.parse import urljoin

import requests
from lxml import etree

from utils.ai.dify_news_analyze import dify_news_title_analyze
from utils.markdown_to_image import convert_md_str_to_image

'''Existing problem: text with hyperlink won't be saved'''
# Logging configuration.
# Runs at import time: sets the root logger to INFO and attaches two handlers,
# a file handler whose file name embeds the date on which the module was
# imported (news_crawler_YYYYMMDD.log) and a stream handler.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(f'news_crawler_{datetime.now().strftime("%Y%m%d")}.log'),
        logging.StreamHandler()
    ]
)
# Module-level logger used by every function in this file.
logger = logging.getLogger(__name__)
# Request configuration.
# HEADERS is sent with every GET issued by safe_request; it only sets a
# desktop-browser User-Agent string.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
# Passed as the `timeout` argument of requests.get in safe_request.
TIMEOUT = 10
# Number of retries safe_request performs after the first failed attempt.
MAX_RETRIES = 3
# Upper bound on how many front-page links each site scraper follows.
NEWS_LIMIT = 30
def get_time():
    """Return today's local date formatted as ``YYYY-MM-DD``.

    The clock is read exactly once so that the year, month and day all come
    from the same instant; reading it separately for each field could mix
    two different dates when the call straddles midnight.

    Returns:
        str: The zero-padded local date, e.g. ``'2024-03-24'``.
    """
    now = localtime()
    return f'{now.tm_year:04d}-{now.tm_mon:02d}-{now.tm_mday:02d}'
def title_tidy(title_list):
    """Collapse runs of adjacent equal entries in ``title_list``, in place.

    Only neighbouring duplicates are removed; equal entries separated by a
    different entry are all kept. The first entry of each run survives.

    Args:
        title_list: A mutable list; it is modified in place.

    Returns:
        The same list object that was passed in, after the removal.
    """
    # Keep an entry when it is the first one or differs from its predecessor.
    survivors = [
        entry
        for position, entry in enumerate(title_list)
        if position == 0 or entry != title_list[position - 1]
    ]
    # Slice assignment keeps the caller's list object (in-place semantics).
    title_list[:] = survivors
    return title_list
def text_tidy(p_text):
    """Normalise apostrophes and re-join text split around punctuation.

    First, typographic apostrophes are replaced with the ASCII ``'``. Then
    every blank-line break (``\\n\\n``) that sits directly next to a space,
    comma, semicolon, colon or quote character is replaced by a single space,
    keeping the punctuation. Breaks between plain words are left untouched.

    Args:
        p_text: The text to clean up.

    Returns:
        str: The cleaned text.
    """
    # The search literal here used to be an empty/invisible string, and
    # str.replace('', "'") inserts an apostrophe between every character.
    # Explicit escapes make the intent visible and safe.
    # NOTE(review): assumes the lost character was the right single quotation
    # mark (U+2019) or its Windows-1252 C1 form (U+0092) - confirm against
    # the upstream file.
    text_ = p_text.replace('\u2019', "'").replace('\x92', "'")

    # Order matters: earlier replacements can change what later ones see,
    # so the sequence is identical to the original chain of replace() calls.
    break_repairs = (
        (' \n\n', ' '),
        ('\n\n ', ' '),
        ('\n\n,', ' ,'),
        (',\n\n', ', '),
        (';\n\n', '; '),
        ('\n\n;', ' ;'),
        (':\n\n', ': '),
        ('\n\n:', ' :'),
        ('"\n\n', '" '),
        ('\n\n"', ' "'),
        ("'\n\n", "' "),
        ("\n\n'", " '"),
    )
    for broken, repaired in break_repairs:
        text_ = text_.replace(broken, repaired)
    return text_
def safe_request(url, retry_count=0):
    """GET ``url`` with the module's headers and timeout, retrying on failure.

    A failure is any ``requests.RequestException``, which includes the error
    raised by ``raise_for_status()`` for an HTTP error status. After each
    failure the function waits one second and tries again until the retry
    counter reaches ``MAX_RETRIES``.

    Args:
        url: Address to fetch.
        retry_count: Number of retries already consumed; callers normally
            leave this at 0.

    Returns:
        The ``requests`` response on success, or ``None`` once the retries
        are exhausted.
    """
    attempt = retry_count
    while True:
        try:
            reply = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
            reply.raise_for_status()
        except requests.RequestException as exc:
            if attempt >= MAX_RETRIES:
                logger.error(f"请求失败: {url}, 错误: {str(exc)}")
                return None
            logger.warning(f"请求失败,正在进行第{attempt + 1}次重试: {url}")
            sleep(1)
            attempt += 1
        else:
            return reply
def nbc():
    """Collect article titles and links from the NBC News front page.

    Takes the ``href`` of every ``<h2><a>`` on the front page, collapses
    adjacent duplicates, and visits at most ``NEWS_LIMIT`` of them. Pages
    without a direct ``<h1>`` text node are skipped with a warning.

    Returns:
        str: One ``Title: <title>. Link: <url>`` line per collected article
        (possibly empty), or a short failure message when the front page
        cannot be fetched or an unexpected error occurs.
    """
    logger.info("开始获取NBC新闻")
    try:
        front_page = safe_request('https://www.nbcnews.com/')
        if not front_page:
            return "获取NBC新闻失败"
        links = title_tidy(etree.HTML(front_page.text).xpath('//h2/a/@href'))
        lines = []
        for link in links[:NEWS_LIMIT]:
            try:
                article = safe_request(link)
                if not article:
                    continue
                headings = etree.HTML(article.text).xpath('//h1/text()')
                if not headings:
                    logger.warning(f'跳过视频或其他类型新闻: {link}')
                    continue
                lines.append(f'Title: {headings[0]}. Link: {link}\n')
                # Short pause between article requests.
                sleep(0.1)
            except Exception as exc:
                # One bad article must not abort the whole site.
                logger.error(f"处理新闻失败: {link}, 错误: {str(exc)}")
                continue
        logger.info(f"NBC新闻获取完成共获取{len(lines)}")
        return ''.join(lines)
    except Exception as exc:
        logger.error(f"获取NBC新闻失败: {str(exc)}")
        return "获取新闻失败,请查看日志了解详情"
def cnn():
    """Collect article titles and links from the CNN front page.

    Takes the ``href`` of every anchor marked ``data-link-type="article"``,
    collapses adjacent duplicates, and visits at most ``NEWS_LIMIT`` of them.
    Pages without a ``headlineText`` ``<h1>`` text node are skipped with a
    warning.

    Returns:
        str: One ``Title: <title>. Link: <url>`` line per collected article
        (possibly empty), or a short failure message when the front page
        cannot be fetched or an unexpected error occurs.
    """
    logger.info("开始获取CNN新闻")
    try:
        head = 'https://www.cnn.com'
        response = safe_request(head + '/')
        if not response:
            return "获取CNN新闻失败"
        html = etree.HTML(response.text)
        href = html.xpath('//a[@data-link-type="article"]/@href')
        href = title_tidy(href)
        lines = []
        for url in href[:NEWS_LIMIT]:
            # Bound before the try so the error log below always has a value.
            full_url = url
            try:
                # urljoin leaves absolute links untouched and resolves
                # root-relative, protocol-relative and slash-less links
                # against the site root. Plain `head + url` corrupted
                # everything except root-relative links.
                full_url = urljoin(head + '/', url)
                response = safe_request(full_url)
                if not response:
                    continue
                html = etree.HTML(response.text)
                title = html.xpath('//h1[@data-editable="headlineText"]/text()')
                if not title:
                    logger.warning(f'跳过视频或其他类型新闻: {full_url}')
                    continue
                lines.append(f'Title: {title[0].strip()}. Link: {full_url}\n')
                # Short pause between article requests.
                sleep(0.1)
            except Exception as e:
                # One bad article must not abort the whole site.
                logger.error(f"处理新闻失败: {full_url}, 错误: {str(e)}")
                continue
        logger.info(f"CNN新闻获取完成共获取{len(lines)}")
        return ''.join(lines)
    except Exception as e:
        logger.error(f"获取CNN新闻失败: {str(e)}")
        return "获取新闻失败,请查看日志了解详情"
def abc():
    """Collect article titles and links from the ABC News front page.

    Links are gathered from four XPath queries, concatenated, stripped of
    adjacent duplicates, and at most ``NEWS_LIMIT`` of them are visited.
    Pages without a ``prism-headline`` ``<h1>`` text node are skipped with a
    warning.

    Returns:
        str: One ``Title: <title>. Link: <url>`` line per collected article
        (possibly empty), or a short failure message when the front page
        cannot be fetched or an unexpected error occurs.
    """
    logger.info("开始获取ABC新闻")
    try:
        head = 'https://abcnews.go.com'  # no trailing slash
        front_page = safe_request(head)
        if not front_page:
            return "获取ABC新闻失败"
        tree = etree.HTML(front_page.text)
        trio_links = tree.xpath('//div[@class="HeadlinesTrio"]/a/@href')
        card_links = tree.xpath(
            '//div[@class="title card"]/a[@class="AnchorLink"]/@href | //div[@class="title"]/a[@class="AnchorLink"]/@href')
        self_links = tree.xpath('//a[@target="_self"]/@href')
        video_links = tree.xpath('//a[@class="AnchorLink VideoTile"]/@href')
        links = title_tidy(trio_links + card_links + self_links + video_links)
        lines = []
        for link in links[:NEWS_LIMIT]:
            try:
                # Normalise the link into an absolute URL.
                if link.startswith('http'):
                    full_url = link
                elif link.startswith('//'):
                    full_url = 'https:' + link
                elif link.startswith('/'):
                    full_url = head + link
                else:
                    full_url = head + '/' + link
                article = safe_request(full_url)
                if not article:
                    continue
                headings = etree.HTML(article.text).xpath(
                    '//div[@data-testid="prism-headline"]/h1/text()')
                if not headings:
                    logger.warning(f'跳过视频或其他类型新闻: {full_url}')
                    continue
                lines.append(f'Title: {headings[0]}. Link: {full_url}\n')
                # Short pause between article requests.
                sleep(0.1)
            except Exception as exc:
                # One bad article must not abort the whole site.
                logger.error(f"处理新闻失败: {full_url}, 错误: {str(exc)}")
                continue
        logger.info(f"ABC新闻获取完成共获取{len(lines)}")
        return ''.join(lines)
    except Exception as exc:
        logger.error(f"获取ABC新闻失败: {str(exc)}")
        return "获取新闻失败,请查看日志了解详情"
def fox():
    """Collect article titles and links from the Fox News front page.

    Takes the ``href`` of every ``<h3 class="title"><a>``, collapses adjacent
    duplicates, and visits at most ``NEWS_LIMIT`` of them. Pages without an
    ``itemprop="headline"`` ``<h1>`` text node are skipped with a warning.

    Returns:
        str: One ``Title: <title>. Link: <url>`` line per collected article
        (possibly empty), or a short failure message when the front page
        cannot be fetched or an unexpected error occurs.
    """
    logger.info("开始获取FOX新闻")
    try:
        head = 'https://www.foxnews.com/'
        response = safe_request(head)
        if not response:
            return "获取FOX新闻失败"
        html = etree.HTML(response.text)
        href = html.xpath('//h3[@class="title"]/a/@href')
        href = title_tidy(href)
        lines = []
        for url in href[:NEWS_LIMIT]:
            # Bound before the try so the error log below always has a value.
            full_url = url
            try:
                # Prefixing 'https:' only worked for protocol-relative links
                # ('//host/path'); a root-relative '/path' became the invalid
                # 'https:/path'. urljoin handles both and leaves absolute
                # links untouched.
                full_url = urljoin(head, url)
                response = safe_request(full_url)
                if not response:
                    continue
                html = etree.HTML(response.text)
                title = html.xpath('//h1[@itemprop="headline"]/text()')
                if not title:
                    logger.warning(f'跳过视频或其他类型新闻: {full_url}')
                    continue
                lines.append(f'Title: {title[0]}. Link: {full_url}\n')
                # Short pause between article requests.
                sleep(0.1)
            except Exception as e:
                # One bad article must not abort the whole site.
                logger.error(f"处理新闻失败: {full_url}, 错误: {str(e)}")
                continue
        logger.info(f"FOX新闻获取完成共获取{len(lines)}")
        return ''.join(lines)
    except Exception as e:
        logger.error(f"获取FOX新闻失败: {str(e)}")
        return "获取新闻失败,请查看日志了解详情"
def bbc():
    """Collect article titles and links from the BBC front page.

    Reads the ``href`` of the 4th and 5th ancestors of every
    ``card-headline`` ``<h2>``, collapses adjacent duplicates, and visits at
    most ``NEWS_LIMIT`` of them. Links that already start with ``http`` are
    skipped; the rest are appended to the site root. Pages without a
    ``headline-block`` ``<h1>`` text node are skipped with a warning.

    Returns:
        str: One ``Title: <title>. Link: <url>`` line per collected article
        (possibly empty), or a short failure message when the front page
        cannot be fetched or an unexpected error occurs.
    """
    logger.info("开始获取BBC新闻")
    try:
        head = 'https://www.bbc.com'
        front_page = safe_request(head + '/')
        if not front_page:
            return "获取BBC新闻失败"
        card_links = etree.HTML(front_page.text).xpath(
            '//h2[@data-testid="card-headline"]/../../../../../@href | //h2[@data-testid="card-headline"]/../../../../@href')
        links = title_tidy(card_links)
        lines = []
        for link in links[:NEWS_LIMIT]:
            try:
                # Absolute links are not followed.
                if link.startswith('http'):
                    continue
                full_url = head + link
                article = safe_request(full_url)
                if not article:
                    continue
                headings = etree.HTML(article.text).xpath(
                    '//div[@data-component="headline-block"]/h1/text()')
                if not headings:
                    logger.warning(f'跳过视频或其他类型新闻: {full_url}')
                    continue
                lines.append(f'Title: {headings[0]}. Link: {full_url}\n')
                # Short pause between article requests.
                sleep(0.1)
            except Exception as exc:
                # One bad article must not abort the whole site.
                logger.error(f"处理新闻失败: {full_url}, 错误: {str(exc)}")
                continue
        logger.info(f"BBC新闻获取完成共获取{len(lines)}")
        return ''.join(lines)
    except Exception as exc:
        logger.error(f"获取BBC新闻失败: {str(exc)}")
        return "获取新闻失败,请查看日志了解详情"
def all_english_news():
    """Gather headlines from every site and render the analysis as an image.

    Runs the NBC, CNN, ABC, Fox and BBC scrapers in that order, appends a
    newline after each result, and passes the combined text to
    ``dify_news_title_analyze``. That result is handed to
    ``convert_md_str_to_image`` together with the name ``"news_output.png"``.

    Returns:
        Whatever ``convert_md_str_to_image`` returns (presumably the path of
        the generated image; confirm against that helper).
    """
    scrapers = (nbc, cnn, abc, fox, bbc)
    news_titles = "".join(fetch() + "\n" for fetch in scrapers)
    markdown_news = dify_news_title_analyze(news_titles)
    return convert_md_str_to_image(markdown_news, "news_output.png")