feature:全球政治经济新闻
This commit is contained in:
@@ -6,11 +6,35 @@ Created Date: 2024-01-21
|
|||||||
Last Modified: 2024-03-24
|
Last Modified: 2024-03-24
|
||||||
Modified by: MrCrawL
|
Modified by: MrCrawL
|
||||||
"""
|
"""
|
||||||
|
from utils.ai.dify_news_analyze import dify_news_title_analyze
|
||||||
|
from utils.markdown_to_image import convert_md_str_to_image
|
||||||
|
|
||||||
'''Existing problem: text with hyperlink won't be saved'''
|
'''Existing problem: text with hyperlink won't be saved'''
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
from time import localtime, sleep
|
from time import localtime, sleep
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
|
import logging
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
# 配置日志
|
||||||
|
logging.basicConfig(
|
||||||
|
level=logging.INFO,
|
||||||
|
format='%(asctime)s - %(levelname)s - %(message)s',
|
||||||
|
handlers=[
|
||||||
|
logging.FileHandler(f'news_crawler_{datetime.now().strftime("%Y%m%d")}.log'),
|
||||||
|
logging.StreamHandler()
|
||||||
|
]
|
||||||
|
)
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# 请求配置
|
||||||
|
HEADERS = {
|
||||||
|
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
|
||||||
|
}
|
||||||
|
TIMEOUT = 10
|
||||||
|
MAX_RETRIES = 3
|
||||||
|
NEWS_LIMIT = 30
|
||||||
|
|
||||||
|
|
||||||
def get_time():
|
def get_time():
|
||||||
@@ -18,7 +42,6 @@ def get_time():
|
|||||||
return date_
|
return date_
|
||||||
|
|
||||||
|
|
||||||
# delete duplicated
|
|
||||||
def title_tidy(title_list):
|
def title_tidy(title_list):
|
||||||
t_index = []
|
t_index = []
|
||||||
for i in range(1, len(title_list)):
|
for i in range(1, len(title_list)):
|
||||||
@@ -28,7 +51,6 @@ def title_tidy(title_list):
|
|||||||
return title_list
|
return title_list
|
||||||
|
|
||||||
|
|
||||||
# tidy text, seems a little bit redundant
|
|
||||||
def text_tidy(p_text):
|
def text_tidy(p_text):
|
||||||
text_ = p_text.replace('’', "'")
|
text_ = p_text.replace('’', "'")
|
||||||
text_ = text_.replace(' \n\n', ' ')
|
text_ = text_.replace(' \n\n', ' ')
|
||||||
@@ -46,218 +68,275 @@ def text_tidy(p_text):
|
|||||||
return text_
|
return text_
|
||||||
|
|
||||||
|
|
||||||
def save(text, file_name, mode='w', encoding='utf-8'):
|
def safe_request(url, retry_count=0):
|
||||||
with open(f'{file_name}.txt', mode, encoding=encoding) as f: f.write(text)
|
"""安全的请求方法,包含重试机制"""
|
||||||
|
try:
|
||||||
|
response = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
|
||||||
|
response.raise_for_status()
|
||||||
|
return response
|
||||||
|
except requests.RequestException as e:
|
||||||
|
if retry_count < MAX_RETRIES:
|
||||||
|
logger.warning(f"请求失败,正在进行第{retry_count + 1}次重试: {url}")
|
||||||
|
sleep(1)
|
||||||
|
return safe_request(url, retry_count + 1)
|
||||||
|
else:
|
||||||
|
logger.error(f"请求失败: {url}, 错误: {str(e)}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
def nbc():
|
def nbc():
|
||||||
|
logger.info("开始获取NBC新闻")
|
||||||
|
try:
|
||||||
url = 'https://www.nbcnews.com/'
|
url = 'https://www.nbcnews.com/'
|
||||||
res = requests.get(url)
|
response = safe_request(url)
|
||||||
html = etree.HTML(res.text)
|
if not response:
|
||||||
|
return "获取NBC新闻失败"
|
||||||
|
|
||||||
|
html = etree.HTML(response.text)
|
||||||
href = html.xpath('//h2/a/@href')
|
href = html.xpath('//h2/a/@href')
|
||||||
href = title_tidy(href)
|
href = title_tidy(href)
|
||||||
# quant = int(input(f'There are {len(href)} pieces detected. How many would you download:'))
|
|
||||||
# if quant > len(href) or quant < 1:
|
msg = ''
|
||||||
# print("Outnumber!")
|
|
||||||
# quit()
|
|
||||||
count = 0
|
count = 0
|
||||||
# save('', f'NBC_news_title_{get_time()}')
|
|
||||||
# save('', f'NBC_news_text_{get_time()}')
|
for url in href[:NEWS_LIMIT]:
|
||||||
msg =''
|
try:
|
||||||
for i in range(30):
|
response = safe_request(url)
|
||||||
url = href[i]
|
if not response:
|
||||||
sleep(0.1) # delete to speed up
|
|
||||||
res = requests.get(url)
|
|
||||||
html = etree.HTML(res.text)
|
|
||||||
title = html.xpath('//h1/text()')
|
|
||||||
if len(title) == 0:
|
|
||||||
print(f'Video or other news. Link: {url}')
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
html = etree.HTML(response.text)
|
||||||
|
title = html.xpath('//h1/text()')
|
||||||
|
|
||||||
|
if not title:
|
||||||
|
logger.warning(f'跳过视频或其他类型新闻: {url}')
|
||||||
|
continue
|
||||||
|
|
||||||
title = title[0]
|
title = title[0]
|
||||||
author = html.xpath('//span[@class="byline-name"]/a/text() | //span[@class="byline-name" and not(a)]/text()')
|
msg += f'Title: {title}. Link: {url}\n'
|
||||||
author = ', '.join(author)
|
|
||||||
text = html.xpath('//p[@class=""]/text()')
|
|
||||||
text = '\n\n'.join(text)
|
|
||||||
text = text_tidy(text)
|
|
||||||
count += 1
|
count += 1
|
||||||
# save(f'Title: {title}\nLink: {url}\n\n', f'NBC_news_title_{get_time()}', 'a') # news title
|
sleep(0.1)
|
||||||
# save(f'Title: {title}\n\nOrigin: {url}\n\nAuthor: {author}\n\n\n', f'NBC_news_text_{get_time()}', 'a')
|
|
||||||
# save(f'{text}' + '\n\n------------------------------\n\n', f'NBC_news_text_{get_time()}', 'a')
|
except Exception as e:
|
||||||
# print(f'Title: {title}. Link: {href[i]}.')
|
logger.error(f"处理新闻失败: {url}, 错误: {str(e)}")
|
||||||
msg += f'Title: {title}. Link: {href[i]}.\n'
|
continue
|
||||||
|
|
||||||
|
logger.info(f"NBC新闻获取完成,共获取{count}条")
|
||||||
return msg
|
return msg
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"获取NBC新闻失败: {str(e)}")
|
||||||
|
return "获取新闻失败,请查看日志了解详情"
|
||||||
|
|
||||||
|
|
||||||
def cnn():
|
def cnn():
|
||||||
|
logger.info("开始获取CNN新闻")
|
||||||
|
try:
|
||||||
head = 'https://www.cnn.com'
|
head = 'https://www.cnn.com'
|
||||||
res = requests.get(head + '/')
|
response = safe_request(head + '/')
|
||||||
html = etree.HTML(res.text)
|
if not response:
|
||||||
|
return "获取CNN新闻失败"
|
||||||
|
|
||||||
|
html = etree.HTML(response.text)
|
||||||
href = html.xpath('//a[@data-link-type="article"]/@href')
|
href = html.xpath('//a[@data-link-type="article"]/@href')
|
||||||
href = title_tidy(href)
|
href = title_tidy(href)
|
||||||
# quant = int(input(f'{len(href)} data detected. How many would you like to download:'))
|
|
||||||
# if quant > len(href) or quant < 1:
|
|
||||||
# print("Outnumber!")
|
|
||||||
# quit()
|
|
||||||
count = 0
|
|
||||||
msg = ''
|
msg = ''
|
||||||
# save('', f'CNN_news_title_{get_time()}')
|
count = 0
|
||||||
# save('', f'CNN_news_text_{get_time()}')
|
|
||||||
for i in range(30):
|
for url in href[:NEWS_LIMIT]:
|
||||||
url = head + href[i]
|
try:
|
||||||
sleep(0.1) # delete to speed up
|
full_url = head + url
|
||||||
res = requests.get(url)
|
response = safe_request(full_url)
|
||||||
html = etree.HTML(res.text)
|
if not response:
|
||||||
title = html.xpath('//h1[@data-editable="headlineText"]/text()')
|
|
||||||
if len(title) == 0:
|
|
||||||
print(f'Video or other news. Link: {url}')
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
html = etree.HTML(response.text)
|
||||||
|
title = html.xpath('//h1[@data-editable="headlineText"]/text()')
|
||||||
|
|
||||||
|
if not title:
|
||||||
|
logger.warning(f'跳过视频或其他类型新闻: {full_url}')
|
||||||
|
continue
|
||||||
|
|
||||||
title = title[0].strip()
|
title = title[0].strip()
|
||||||
author = html.xpath('//span[@class="byline__name"]/text()')
|
msg += f'Title: {title}. Link: {full_url}\n'
|
||||||
author = ', '.join(author)
|
|
||||||
text = html.xpath('//p[@class="paragraph inline-placeholder"]/text()')
|
|
||||||
for k in range(len(text)): text[k].strip()
|
|
||||||
text = ''.join(text)
|
|
||||||
text = text_tidy(text)
|
|
||||||
count += 1
|
count += 1
|
||||||
# save(f'Title: {title}\nLink: {url}\n\n', f'CNN_news_title_{get_time()}', 'a') # news title
|
sleep(0.1)
|
||||||
# save(f'Title: {title}\n\nOrigin: {url}\n\nAuthor: {author}\n\n\n', f'CNN_news_text_{get_time()}', 'a')
|
|
||||||
# save(f'{text}' + '\n\n------------------------------\n\n', f'CNN_news_text_{get_time()}', 'a')
|
except Exception as e:
|
||||||
# print(f'Title: {title}. Link: {url}')
|
logger.error(f"处理新闻失败: {full_url}, 错误: {str(e)}")
|
||||||
msg +=f'Title: {title}. Link: {url}\n'
|
continue
|
||||||
# print(f'Files saved with {count} articles available.')
|
|
||||||
|
logger.info(f"CNN新闻获取完成,共获取{count}条")
|
||||||
return msg
|
return msg
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"获取CNN新闻失败: {str(e)}")
|
||||||
|
return "获取新闻失败,请查看日志了解详情"
|
||||||
|
|
||||||
|
|
||||||
def abc():
|
def abc():
|
||||||
head = 'https://abcnews.go.com/'
|
logger.info("开始获取ABC新闻")
|
||||||
res = requests.get(head)
|
try:
|
||||||
html = etree.HTML(res.text)
|
head = 'https://abcnews.go.com' # 移除末尾的斜杠
|
||||||
|
response = safe_request(head)
|
||||||
|
if not response:
|
||||||
|
return "获取ABC新闻失败"
|
||||||
|
|
||||||
|
html = etree.HTML(response.text)
|
||||||
href1 = html.xpath('//div[@class="HeadlinesTrio"]/a/@href')
|
href1 = html.xpath('//div[@class="HeadlinesTrio"]/a/@href')
|
||||||
href2 = html.xpath('//div[@class="title card"]/a[@class="AnchorLink"]/@href | //div[@class="title"]/a[@class="AnchorLink"]/@href')
|
href2 = html.xpath(
|
||||||
|
'//div[@class="title card"]/a[@class="AnchorLink"]/@href | //div[@class="title"]/a[@class="AnchorLink"]/@href')
|
||||||
href3 = html.xpath('//a[@target="_self"]/@href')
|
href3 = html.xpath('//a[@target="_self"]/@href')
|
||||||
href4 = html.xpath('//a[@class="AnchorLink VideoTile"]/@href')
|
href4 = html.xpath('//a[@class="AnchorLink VideoTile"]/@href')
|
||||||
href = href1 + href2 + href3 + href4
|
href = title_tidy(href1 + href2 + href3 + href4)
|
||||||
href = title_tidy(href)
|
|
||||||
# quant = int(input(f'{len(href)} data detected. How many would you like to download:'))
|
|
||||||
# if quant > len(href) or quant < 1:
|
|
||||||
# print("Outnumber!")
|
|
||||||
# quit()
|
|
||||||
count = 0
|
|
||||||
msg = ''
|
msg = ''
|
||||||
# save('', f'ABC_news_title_{get_time()}')
|
count = 0
|
||||||
# save('', f'ABC_news_text_{get_time()}')
|
|
||||||
for i in range(30):
|
for url in href[:NEWS_LIMIT]:
|
||||||
url = href[i]
|
try:
|
||||||
sleep(0.1) # delete to speed up
|
# 处理URL格式
|
||||||
res = requests.get(url)
|
if url.startswith('http'):
|
||||||
html = etree.HTML(res.text)
|
full_url = url
|
||||||
title = html.xpath('//div[@data-testid="prism-headline"]/h1/text()')
|
elif url.startswith('//'):
|
||||||
if len(title) == 0:
|
full_url = 'https:' + url
|
||||||
print(f'Video or other news. Link: {url}')
|
else:
|
||||||
|
full_url = head + ('' if url.startswith('/') else '/') + url
|
||||||
|
|
||||||
|
response = safe_request(full_url)
|
||||||
|
if not response:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
html = etree.HTML(response.text)
|
||||||
|
title = html.xpath('//div[@data-testid="prism-headline"]/h1/text()')
|
||||||
|
|
||||||
|
if not title:
|
||||||
|
logger.warning(f'跳过视频或其他类型新闻: {full_url}')
|
||||||
|
continue
|
||||||
|
|
||||||
title = title[0]
|
title = title[0]
|
||||||
author = html.xpath('//a[@data-testid="prism-linkbase"]/text()')
|
msg += f'Title: {title}. Link: {full_url}\n'
|
||||||
author = ', '.join(author)
|
|
||||||
text = html.xpath('//div[@data-testid="prism-article-body"]/p/text()')
|
|
||||||
text = '\n\n'.join(text)
|
|
||||||
text = text_tidy(text)
|
|
||||||
count += 1
|
count += 1
|
||||||
# save(f'Title: {title}\nLink: {url}\n\n', f'ABC_news_title_{get_time()}', 'a') # news title
|
sleep(0.1)
|
||||||
# save(f'Title: {title}\n\nOrigin: {url}\n\nAuthor: {author}\n\n\n', f'ABC_news_text_{get_time()}', 'a')
|
|
||||||
# save(f'{text}' + '\n\n------------------------------\n\n', f'ABC_news_text_{get_time()}', 'a')
|
except Exception as e:
|
||||||
# print(f'Title: {title}. Link: {url}')
|
logger.error(f"处理新闻失败: {full_url}, 错误: {str(e)}")
|
||||||
msg +=f'Title: {title}. Link: {url}\n'
|
continue
|
||||||
# print(f'Files saved with {count} articles available.')
|
|
||||||
|
logger.info(f"ABC新闻获取完成,共获取{count}条")
|
||||||
return msg
|
return msg
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"获取ABC新闻失败: {str(e)}")
|
||||||
|
return "获取新闻失败,请查看日志了解详情"
|
||||||
|
|
||||||
|
|
||||||
def fox():
|
def fox():
|
||||||
|
logger.info("开始获取FOX新闻")
|
||||||
|
try:
|
||||||
head = 'https://www.foxnews.com/'
|
head = 'https://www.foxnews.com/'
|
||||||
res = requests.get(head)
|
response = safe_request(head)
|
||||||
html = etree.HTML(res.text)
|
if not response:
|
||||||
|
return "获取FOX新闻失败"
|
||||||
|
|
||||||
|
html = etree.HTML(response.text)
|
||||||
href = html.xpath('//h3[@class="title"]/a/@href')
|
href = html.xpath('//h3[@class="title"]/a/@href')
|
||||||
href = title_tidy(href)
|
href = title_tidy(href)
|
||||||
# quant = int(input(f'{len(href)} data detected. How many would you like to download:'))
|
|
||||||
# if quant > len(href) or quant < 1:
|
msg = ''
|
||||||
# print("Outnumber!")
|
|
||||||
# quit()
|
|
||||||
count = 0
|
count = 0
|
||||||
msg =''
|
|
||||||
# save('', f'FOX_news_title_{get_time()}')
|
for url in href[:NEWS_LIMIT]:
|
||||||
# save('', f'FOX_news_text_{get_time()}')
|
try:
|
||||||
for i in range(30):
|
if url[0:4] != 'http':
|
||||||
if href[i][0:4] != 'http': href[i] = 'https:' + href[i]
|
url = 'https:' + url
|
||||||
url = href[i]
|
|
||||||
sleep(0.1) # delete to speed up
|
response = safe_request(url)
|
||||||
res = requests.get(url)
|
if not response:
|
||||||
html = etree.HTML(res.text)
|
|
||||||
title = html.xpath('//h1[@itemprop="headline"]/text()')
|
|
||||||
if len(title) == 0:
|
|
||||||
print(f'Video or other news. Link: {url}')
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
html = etree.HTML(response.text)
|
||||||
|
title = html.xpath('//h1[@itemprop="headline"]/text()')
|
||||||
|
|
||||||
|
if not title:
|
||||||
|
logger.warning(f'跳过视频或其他类型新闻: {url}')
|
||||||
|
continue
|
||||||
|
|
||||||
title = title[0]
|
title = title[0]
|
||||||
author = html.xpath('//a[@rel="author"]/strong/text()')
|
msg += f'Title: {title}. Link: {url}\n'
|
||||||
author = ', '.join(author)
|
|
||||||
text = html.xpath('//div[@itemprop="articleBody"]/p/text()')
|
|
||||||
text = '\n\n'.join(text)
|
|
||||||
text = text_tidy(text)
|
|
||||||
count += 1
|
count += 1
|
||||||
# save(f'Title: {title}\nLink: {url}\n\n', f'FOX_news_title_{get_time()}', 'a') # news title
|
sleep(0.1)
|
||||||
# save(f'Title: {title}\n\nOrigin: {url}\n\nAuthor: {author}\n\n\n', f'FOX_news_text_{get_time()}', 'a')
|
|
||||||
# save(f'{text}' + '\n\n------------------------------\n\n', f'FOX_news_text_{get_time()}', 'a')
|
except Exception as e:
|
||||||
# print(f'Title: {title}. Link: {url}')
|
logger.error(f"处理新闻失败: {url}, 错误: {str(e)}")
|
||||||
msg +=f'Title: {title}. Link: {url}\n'
|
continue
|
||||||
# print(f'Files saved with {count} articles available.')
|
|
||||||
|
logger.info(f"FOX新闻获取完成,共获取{count}条")
|
||||||
return msg
|
return msg
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"获取FOX新闻失败: {str(e)}")
|
||||||
|
return "获取新闻失败,请查看日志了解详情"
|
||||||
|
|
||||||
|
|
||||||
def bbc():
|
def bbc():
|
||||||
|
logger.info("开始获取BBC新闻")
|
||||||
|
try:
|
||||||
head = 'https://www.bbc.com'
|
head = 'https://www.bbc.com'
|
||||||
res = requests.get(head + '/')
|
response = safe_request(head + '/')
|
||||||
html = etree.HTML(res.text)
|
if not response:
|
||||||
href = html.xpath('//h2[@data-testid="card-headline"]/../../../../../@href | //h2[@data-testid="card-headline"]/../../../../@href')
|
return "获取BBC新闻失败"
|
||||||
href = title_tidy(href)
|
|
||||||
# quant = int(input(f'{len(href)} data detected. How many would you like to download:'))
|
|
||||||
# if quant > len(href) or quant < 1:
|
|
||||||
# print("Outnumber!")
|
|
||||||
# quit()
|
|
||||||
count = 0
|
|
||||||
msg =''
|
|
||||||
# save('', f'BBC_news_title_{get_time()}')
|
|
||||||
# save('', f'BBC_news_text_{get_time()}')
|
|
||||||
for i in range(30):
|
|
||||||
if href[i][0:4] == 'http': continue
|
|
||||||
url = head + href[i]
|
|
||||||
sleep(0.1) # delete to speed up
|
|
||||||
print(url)
|
|
||||||
res = requests.get(url)
|
|
||||||
html = etree.HTML(res.text)
|
|
||||||
title = html.xpath('//div[@data-component="headline-block"]/h1/text()')
|
|
||||||
if len(title) == 0:
|
|
||||||
# print(f'Video or other news. Link: {url}')
|
|
||||||
continue
|
|
||||||
title = title[0]
|
|
||||||
# author = html.xpath('//div[@data-testid="byline"]/div/span[@data-testid="byline-name"]/text()')
|
|
||||||
# author = ', '.join(author)
|
|
||||||
# text = html.xpath('//div[@data-component="text-block"]/p/b/text() | //div[@data-component="text-block"]/p/text()')
|
|
||||||
# text = '\n\n'.join(text)
|
|
||||||
# text = text_tidy(text)
|
|
||||||
count += 1
|
|
||||||
# save(f'Title: {title}\nLink: {url}\n\n', f'BBC_news_title_{get_time()}', 'a') # news title
|
|
||||||
# save(f'Title: {title}\n\nOrigin: {url}\n\nAuthor: {author}\n\n\n', f'BBC_news_text_{get_time()}', 'a')
|
|
||||||
# save(f'{text}' + '\n\n------------------------------\n\n', f'BBC_news_text_{get_time()}', 'a')
|
|
||||||
# print(f'Title: {title}. Link: {url}')
|
|
||||||
|
|
||||||
msg +=f'Title: {title}. Link: {url}\n'
|
html = etree.HTML(response.text)
|
||||||
# print(f'Files saved with {count} articles available.')
|
href = html.xpath(
|
||||||
|
'//h2[@data-testid="card-headline"]/../../../../../@href | //h2[@data-testid="card-headline"]/../../../../@href')
|
||||||
|
href = title_tidy(href)
|
||||||
|
|
||||||
|
msg = ''
|
||||||
|
count = 0
|
||||||
|
|
||||||
|
for url in href[:NEWS_LIMIT]:
|
||||||
|
try:
|
||||||
|
if url[0:4] == 'http':
|
||||||
|
continue
|
||||||
|
|
||||||
|
full_url = head + url
|
||||||
|
response = safe_request(full_url)
|
||||||
|
if not response:
|
||||||
|
continue
|
||||||
|
|
||||||
|
html = etree.HTML(response.text)
|
||||||
|
title = html.xpath('//div[@data-component="headline-block"]/h1/text()')
|
||||||
|
|
||||||
|
if not title:
|
||||||
|
logger.warning(f'跳过视频或其他类型新闻: {full_url}')
|
||||||
|
continue
|
||||||
|
|
||||||
|
title = title[0]
|
||||||
|
msg += f'Title: {title}. Link: {full_url}\n'
|
||||||
|
count += 1
|
||||||
|
sleep(0.1)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"处理新闻失败: {full_url}, 错误: {str(e)}")
|
||||||
|
continue
|
||||||
|
|
||||||
|
logger.info(f"BBC新闻获取完成,共获取{count}条")
|
||||||
return msg
|
return msg
|
||||||
|
|
||||||
if __name__ == '__main__':
|
except Exception as e:
|
||||||
# Hello, World! :)
|
logger.error(f"获取BBC新闻失败: {str(e)}")
|
||||||
# news = input('Choose news site["nbc","cnn","abc","fox","bbc"]:').lower()
|
return "获取新闻失败,请查看日志了解详情"
|
||||||
# if news == 'nbc': nbc()
|
|
||||||
# elif news == 'cnn': cnn()
|
|
||||||
# elif news == 'abc': abc()
|
def all_english_news():
|
||||||
# elif news == 'fox': fox()
|
news_titles = ""
|
||||||
# elif news == 'bbc': bbc()
|
news_titles += nbc() + "\n"
|
||||||
# else:
|
news_titles += cnn() + "\n"
|
||||||
# print('Oops! It seems a wrong input. Please retry...')
|
news_titles += abc() + "\n"
|
||||||
# sleep(2)
|
news_titles += fox() + "\n"
|
||||||
print(bbc())
|
news_titles += bbc() + "\n"
|
||||||
|
markdown_news = dify_news_title_analyze(news_titles)
|
||||||
|
spath = convert_md_str_to_image(markdown_news, "news_output.png")
|
||||||
|
return spath
|
||||||
|
|||||||
@@ -11,6 +11,7 @@ import requests
|
|||||||
from lxml import etree
|
from lxml import etree
|
||||||
|
|
||||||
from base import func_english_news
|
from base import func_english_news
|
||||||
|
from utils.ai.dify_news_analyze import dify_news_title_analyze
|
||||||
|
|
||||||
|
|
||||||
class News(object):
|
class News(object):
|
||||||
@@ -92,7 +93,7 @@ class News(object):
|
|||||||
self.LOG.error(f"获取百度新闻时出错: {e}")
|
self.LOG.error(f"获取百度新闻时出错: {e}")
|
||||||
return f"获取百度新闻时出错: {e}"
|
return f"获取百度新闻时出错: {e}"
|
||||||
|
|
||||||
def get_eng_news(self,website):
|
def get_eng_news(self, website):
|
||||||
if website == 'nbc':
|
if website == 'nbc':
|
||||||
return func_english_news.nbc()
|
return func_english_news.nbc()
|
||||||
elif website == 'cnn':
|
elif website == 'cnn':
|
||||||
@@ -105,9 +106,9 @@ class News(object):
|
|||||||
return func_english_news.bbc()
|
return func_english_news.bbc()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
news = News()
|
news = News()
|
||||||
print(news.get_baidu_news())
|
|
||||||
# # msg = "@水牛-分身 今日百度新闻"
|
# # msg = "@水牛-分身 今日百度新闻"
|
||||||
# # q = re.sub(r"@.*?[\u2005|\s]", "", msg).replace(" ", "")
|
# # q = re.sub(r"@.*?[\u2005|\s]", "", msg).replace(" ", "")
|
||||||
# # print(q)
|
# # print(q)
|
||||||
|
|||||||
7
plugins/global_news/__init__.py
Normal file
7
plugins/global_news/__init__.py
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
# 从当前包的main模块导入GlobalNewsPlugin类
|
||||||
|
from .main import GlobalNewsPlugin
|
||||||
|
|
||||||
|
# 提供get_plugin函数,返回插件实例
|
||||||
|
def get_plugin():
    """Create and return a new ``GlobalNewsPlugin`` instance."""
    plugin = GlobalNewsPlugin()
    return plugin
|
||||||
6
plugins/global_news/config.toml
Normal file
6
plugins/global_news/config.toml
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
enable = true
|
||||||
|
command = ["全球新闻", "国际新闻", "环球新闻", "政经新闻", "政治经济新闻"]
|
||||||
|
command-format = """
|
||||||
|
🌍全球新闻指令:
|
||||||
|
全球新闻 - 获取最新的全球政治经济新闻
|
||||||
|
"""
|
||||||
191
plugins/global_news/main.py
Normal file
191
plugins/global_news/main.py
Normal file
@@ -0,0 +1,191 @@
|
|||||||
|
import logging
|
||||||
|
import asyncio
|
||||||
|
import threading
|
||||||
|
import time # 添加这一行
|
||||||
|
from typing import Dict, Any, List, Optional, Tuple
|
||||||
|
|
||||||
|
from wcferry import Wcf
|
||||||
|
|
||||||
|
from plugin_common.message_plugin_interface import MessagePluginInterface
|
||||||
|
from plugin_common.plugin_interface import PluginStatus
|
||||||
|
from utils.decorator.plugin_decorators import plugin_stats_decorator
|
||||||
|
from utils.robot_cmd.robot_command import Feature, PermissionStatus, GroupBotManager
|
||||||
|
from utils.decorator.points_decorator import plugin_points_cost
|
||||||
|
from utils.ai.dify_news_analyze import dify_news_title_analyze
|
||||||
|
from utils.markdown_to_image import convert_md_str_to_image
|
||||||
|
|
||||||
|
# 导入新闻抓取函数
|
||||||
|
from .news_crawler import nbc, cnn, abc, fox, bbc
|
||||||
|
|
||||||
|
|
||||||
|
class GlobalNewsPlugin(MessagePluginInterface):
    """Global political and economic news plugin.

    On a matching command the plugin replies with a "please wait" text,
    then fetches headlines from NBC, CNN, ABC, FOX and BBC on a background
    thread, passes them to ``dify_news_title_analyze``, renders the result
    with ``convert_md_str_to_image`` and sends the image to the chat.
    """

    @property
    def name(self) -> str:
        return "全球政治经济新闻"

    @property
    def version(self) -> str:
        return "1.0.0"

    @property
    def description(self) -> str:
        return "提供全球政治经济新闻,支持多个国际新闻源"

    @property
    def author(self) -> str:
        return "Trae AI"

    @property
    def command_prefix(self) -> Optional[str]:
        return ""  # no prefix needed; commands are matched directly

    @property
    def commands(self) -> List[str]:
        # ``_commands`` is only assigned in ``initialize``; fall back to an
        # empty list so reading the property earlier does not raise.
        return getattr(self, "_commands", [])

    def __init__(self):
        super().__init__()
        # Worker threads of fetch tasks currently in flight, keyed by task id.
        self._news_tasks: Dict[str, threading.Thread] = {}

    def initialize(self, context: Dict[str, Any]) -> bool:
        """Initialize the plugin from ``context`` and the plugin config.

        Args:
            context: Mapping from which the ``wcf``, ``event_system`` and
                ``message_util`` entries are read (missing keys become None).

        Returns:
            Always True.
        """
        self.LOG = logging.getLogger(f"Plugin.{self.name}")
        self.LOG.info(f"正在初始化 {self.name} 插件...")

        # Keep references to the context objects.
        self.wcf = context.get("wcf")
        self.event_system = context.get("event_system")
        self.message_util = context.get("message_util")

        # NOTE(review): ``self._config`` is presumably populated by the base
        # class; confirm it really exposes a "GlobalNews" section.
        section = self._config.get("GlobalNews", {})
        self._commands = section.get("command", ["全球新闻", "国际新闻", "环球新闻", "政经新闻"])
        self.command_format = section.get("command-format", "全球新闻 - 获取最新的全球政治经济新闻")
        self.enable = section.get("enable", True)

        self.LOG.info(f"[{self.name}] 插件初始化完成,指令:{self._commands}")
        return True

    def start(self) -> bool:
        """Mark the plugin as running. Always returns True."""
        self.LOG.info(f"[{self.name}] 插件已启动")
        self.status = PluginStatus.RUNNING
        return True

    def stop(self) -> bool:
        """Mark the plugin as stopped. Always returns True."""
        self.LOG.info(f"[{self.name}] 插件已停止")
        self.status = PluginStatus.STOPPED
        return True

    def can_process(self, message: Dict[str, Any]) -> bool:
        """Return True when the first word of the message is a known command.

        A plugin that has not been initialized yet (no ``enable`` attribute)
        is treated as disabled instead of raising ``AttributeError``.
        """
        if not getattr(self, "enable", False):
            return False

        content = str(message.get("content", "")).strip()
        command = content.split(" ")[0]

        return command in self.commands

    @plugin_stats_decorator(plugin_name="全球政治经济新闻")
    @plugin_points_cost(5, "全球新闻消耗积分", Feature.NEWS)
    def process_message(self, message: Dict[str, Any]) -> Tuple[bool, Optional[str]]:
        """Handle a news command by starting a background fetch task.

        Args:
            message: Mapping from which ``content``, ``sender``, ``roomid``,
                ``wcf`` and ``gbm`` are read.

        Returns:
            ``(False, reason)`` when the group has the NEWS feature disabled,
            otherwise ``(True, status_text)`` once the task has been started.
        """
        content = str(message.get("content", "")).strip()
        self.LOG.info(f"插件执行: {self.name}:{content}")
        sender = message.get("sender")
        roomid = message.get("roomid", "")
        wcf: Wcf = message.get("wcf")
        gbm: GroupBotManager = message.get("gbm")

        # Permission check (only applies to group chats).
        if roomid and gbm.get_group_permission(roomid, Feature.NEWS) == PermissionStatus.DISABLED:
            return False, "没有权限"

        # Build a task id from sender, room and the current second.
        task_id = f"{sender}_{roomid}_{int(time.time())}"

        # Tell the user the request is being worked on.
        wcf.send_text("🌍正在获取全球新闻,请稍候...",
                      (roomid if roomid else sender), sender)

        # Do the slow work off the message-handling thread.
        self._start_news_task(task_id, sender, roomid, wcf)

        return True, "新闻获取任务已启动"

    def _start_news_task(self, task_id: str, sender: str, roomid: str, wcf: Wcf):
        """Start a daemon thread that fetches the news for ``task_id``."""
        thread = threading.Thread(
            target=self._fetch_news_thread,
            args=(task_id, sender, roomid, wcf)
        )
        thread.daemon = True
        # Register before starting: the worker removes its own entry when it
        # finishes, so registering afterwards could leave a stale entry behind.
        self._news_tasks[task_id] = thread
        thread.start()
        self.LOG.info(f"启动新闻获取任务: {task_id}")

    def _fetch_news_thread(self, task_id: str, sender: str, roomid: str, wcf: Wcf):
        """Thread body: run the async fetch on a private event loop and reply."""
        receiver = roomid if roomid else sender
        try:
            loop = asyncio.new_event_loop()
            try:
                asyncio.set_event_loop(loop)
                news_result = loop.run_until_complete(self._fetch_news_async())
            finally:
                # Close the loop even when the fetch raises.
                loop.close()

            if news_result:
                # Send the rendered news image, then a completion notice.
                wcf.send_image(news_result, receiver)
                wcf.send_text("🌍全球新闻获取完成!", receiver, sender)
            else:
                wcf.send_text("❌获取新闻失败,请稍后再试", receiver, sender)
        except Exception as e:
            self.LOG.error(f"新闻获取任务出错: {e}")
            wcf.send_text(f"❌获取新闻出错: {str(e)}", receiver, sender)
        finally:
            # Drop the bookkeeping entry for this task.
            self._news_tasks.pop(task_id, None)

    async def _fetch_news_async(self) -> str:
        """Fetch all news sources concurrently and render them to an image.

        Returns:
            Whatever ``convert_md_str_to_image`` returns (used by the caller
            as the image path), or ``""`` when any step raises.
        """
        try:
            # One executor job per news source, run in parallel.
            tasks = [
                self._run_in_executor(nbc),
                self._run_in_executor(cnn),
                self._run_in_executor(abc),
                self._run_in_executor(fox),
                self._run_in_executor(bbc)
            ]
            results = await asyncio.gather(*tasks)

            # Merge the per-source headline text.
            news_titles = "\n".join(results)

            # Let the AI helper analyse the headlines.
            markdown_news = await self._run_in_executor(
                dify_news_title_analyze, news_titles
            )

            # Render the analysis to an image.
            image_path = await self._run_in_executor(
                convert_md_str_to_image, markdown_news, "news_output.png"
            )

            return image_path
        except Exception as e:
            self.LOG.error(f"异步获取新闻失败: {e}")
            return ""

    async def _run_in_executor(self, func, *args):
        """Run the synchronous ``func(*args)`` in the loop's default executor."""
        # Inside a coroutine the running loop is the correct one to use.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, func, *args)
|
||||||
307
plugins/global_news/news_crawler.py
Normal file
307
plugins/global_news/news_crawler.py
Normal file
@@ -0,0 +1,307 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Program: Global News Crawler
|
||||||
|
Author: Trae AI (based on MrCrawL's work)
|
||||||
|
Created Date: 2024-05-01
|
||||||
|
"""
|
||||||
|
import requests
|
||||||
|
from time import localtime, sleep
|
||||||
|
from lxml import etree
|
||||||
|
import logging
|
||||||
|
from datetime import datetime
|
||||||
|
import time
|
||||||
|
|
||||||
|
# 配置日志
|
||||||
|
logging.basicConfig(
|
||||||
|
level=logging.INFO,
|
||||||
|
format='%(asctime)s - %(levelname)s - %(message)s',
|
||||||
|
handlers=[
|
||||||
|
logging.FileHandler(f'global_news_{datetime.now().strftime("%Y%m%d")}.log'),
|
||||||
|
logging.StreamHandler()
|
||||||
|
]
|
||||||
|
)
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# 请求配置
|
||||||
|
HEADERS = {
|
||||||
|
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
|
||||||
|
}
|
||||||
|
TIMEOUT = 10
|
||||||
|
MAX_RETRIES = 3
|
||||||
|
NEWS_LIMIT = 30
|
||||||
|
|
||||||
|
|
||||||
|
def get_time():
    """Return today's local date as a ``YYYY-MM-DD`` string.

    The clock is sampled once so that year, month and day always belong to
    the same instant, even when the call straddles midnight.
    """
    now = localtime()
    return f'{now.tm_year:04d}-{now.tm_mon:02d}-{now.tm_mday:02d}'
|
||||||
|
|
||||||
|
|
||||||
|
def title_tidy(title_list):
    """Collapse runs of consecutive equal items in ``title_list``.

    Only adjacent duplicates are removed; equal items separated by a
    different item are all kept. The list is modified in place and the same
    list object is returned.

    Args:
        title_list: List of items (e.g. link strings) to tidy.

    Returns:
        ``title_list`` itself, with adjacent duplicates removed.
    """
    # Keep the first item, then every item that differs from its predecessor.
    kept = title_list[:1] + [
        current
        for previous, current in zip(title_list, title_list[1:])
        if current != previous
    ]
    # Slice assignment preserves the in-place mutation callers may rely on.
    title_list[:] = kept
    return title_list
|
||||||
|
|
||||||
|
|
||||||
|
def safe_request(url, retry_count=0):
    """GET ``url`` with the module's headers and timeout, retrying on failure.

    Args:
        url: Address to fetch.
        retry_count: Number of retries already used; retrying stops once
            this reaches ``MAX_RETRIES``.

    Returns:
        The ``requests`` response on success (HTTP error statuses count as
        failures), or ``None`` once the retries are exhausted.
    """
    attempt = retry_count
    # A loop instead of recursion: no extra stack frame per retry and no
    # retry running inside a still-active ``except`` block.
    while True:
        try:
            response = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
            response.raise_for_status()
            return response
        except requests.RequestException as e:
            if attempt >= MAX_RETRIES:
                logger.error(f"请求失败: {url}, 错误: {str(e)}")
                return None
            logger.warning(f"请求失败,正在进行第{attempt + 1}次重试: {url}")
            sleep(1)  # short pause before the next attempt
            attempt += 1
|
||||||
|
|
||||||
|
|
||||||
|
def nbc():
    """Collect headlines from the NBC News front page.

    Reads the ``//h2/a`` links on the front page, visits at most
    ``NEWS_LIMIT`` of them and takes the first ``<h1>`` text of each page as
    its title. Pages without an ``<h1>`` are skipped.

    Returns:
        One ``'Title: <title>. Link: <url>\\n'`` line per article, joined
        into a single string, or a short failure message when the front
        page cannot be fetched or an unexpected error occurs.
    """
    from urllib.parse import urljoin

    logger.info("开始获取NBC新闻")
    try:
        home_url = 'https://www.nbcnews.com/'
        response = safe_request(home_url)
        if not response:
            return "获取NBC新闻失败"

        front_page = etree.HTML(response.text)
        links = title_tidy(front_page.xpath('//h2/a/@href'))

        lines = []
        for link in links[:NEWS_LIMIT]:
            # Absolute links pass through unchanged; relative ones are
            # resolved against the site root instead of being requested as-is.
            article_url = urljoin(home_url, link)
            try:
                article = safe_request(article_url)
                if not article:
                    continue

                page = etree.HTML(article.text)
                title = page.xpath('//h1/text()')
                if not title:
                    logger.warning(f'跳过视频或其他类型新闻: {article_url}')
                    continue

                # Strip surrounding whitespace, as the CNN scraper does.
                lines.append(f'Title: {title[0].strip()}. Link: {article_url}\n')
                sleep(0.1)  # small delay between article requests
            except Exception as e:
                # One bad article must not abort the whole run.
                logger.error(f"处理新闻失败: {article_url}, 错误: {str(e)}")
                continue

        logger.info(f"NBC新闻获取完成,共获取{len(lines)}条")
        return ''.join(lines)

    except Exception as e:
        logger.error(f"获取NBC新闻失败: {str(e)}")
        return "获取新闻失败,请查看日志了解详情"
|
||||||
|
|
||||||
|
|
||||||
|
def cnn():
    """Collect headline titles and links from the CNN front page.

    Reads the front page, takes the ``href`` of every anchor marked
    ``data-link-type="article"``, passes the list through ``title_tidy``
    and visits at most ``NEWS_LIMIT`` of the links. Pages without a
    ``headlineText`` ``<h1>`` are skipped with a warning.

    Each href is resolved against the site root with ``urljoin``, so
    absolute links and links without a leading slash produce valid URLs
    instead of being blindly appended to the host name.

    Returns:
        A string with one ``Title: ... Link: ...`` line per article, or an
        error message string when the front page cannot be fetched or an
        unexpected exception occurs.
    """
    from urllib.parse import urljoin

    logger.info("开始获取CNN新闻")
    try:
        head = 'https://www.cnn.com'
        response = safe_request(head + '/')
        if not response:
            return "获取CNN新闻失败"

        html = etree.HTML(response.text)
        href = html.xpath('//a[@data-link-type="article"]/@href')
        href = title_tidy(href)

        lines = []
        for url in href[:NEWS_LIMIT]:
            # Bound before the try so the error handler can always log it.
            full_url = url
            try:
                full_url = urljoin(head + '/', url)
                response = safe_request(full_url)
                if not response:
                    continue

                html = etree.HTML(response.text)
                title = html.xpath('//h1[@data-editable="headlineText"]/text()')
                if not title:
                    logger.warning(f'跳过视频或其他类型新闻: {full_url}')
                    continue

                lines.append(f'Title: {title[0].strip()}. Link: {full_url}\n')
                sleep(0.1)  # short pause between article requests

            except Exception as e:
                # One bad article must not abort the whole crawl.
                logger.error(f"处理新闻失败: {full_url}, 错误: {str(e)}")
                continue

        logger.info(f"CNN新闻获取完成,共获取{len(lines)}条")
        return ''.join(lines)

    except Exception as e:
        logger.error(f"获取CNN新闻失败: {str(e)}")
        return "获取新闻失败,请查看日志了解详情"
|
||||||
|
|
||||||
|
|
||||||
|
def abc():
    """Collect headline titles and links from the ABC News front page.

    Gathers hrefs from four groups of front-page anchors (headline trio,
    title cards, ``target="_self"`` links and video tiles), passes the
    combined list through ``title_tidy`` and visits at most ``NEWS_LIMIT``
    of the links. Pages without a ``prism-headline`` ``<h1>`` are skipped
    with a warning.

    Returns:
        A string with one ``Title: ... Link: ...`` line per article, or an
        error message string when the front page cannot be fetched or an
        unexpected exception occurs.
    """
    logger.info("开始获取ABC新闻")
    try:
        head = 'https://abcnews.go.com'  # no trailing slash
        front_page = safe_request(head)
        if not front_page:
            return "获取ABC新闻失败"

        tree = etree.HTML(front_page.text)
        link_queries = (
            '//div[@class="HeadlinesTrio"]/a/@href',
            '//div[@class="title card"]/a[@class="AnchorLink"]/@href | //div[@class="title"]/a[@class="AnchorLink"]/@href',
            '//a[@target="_self"]/@href',
            '//a[@class="AnchorLink VideoTile"]/@href',
        )
        collected = []
        for query in link_queries:
            collected.extend(tree.xpath(query))
        links = title_tidy(collected)

        lines = []
        for url in links[:NEWS_LIMIT]:
            try:
                # Normalise the href into an absolute URL.
                if url.startswith('http'):
                    full_url = url
                elif url.startswith('//'):
                    full_url = f'https:{url}'
                elif url.startswith('/'):
                    full_url = f'{head}{url}'
                else:
                    full_url = f'{head}/{url}'

                article = safe_request(full_url)
                if not article:
                    continue

                headlines = etree.HTML(article.text).xpath(
                    '//div[@data-testid="prism-headline"]/h1/text()')
                if not headlines:
                    logger.warning(f'跳过视频或其他类型新闻: {full_url}')
                    continue

                lines.append(f'Title: {headlines[0]}. Link: {full_url}\n')
                sleep(0.1)  # short pause between article requests

            except Exception as e:
                # One bad article must not abort the whole crawl.
                logger.error(f"处理新闻失败: {full_url}, 错误: {str(e)}")
                continue

        logger.info(f"ABC新闻获取完成,共获取{len(lines)}条")
        return ''.join(lines)

    except Exception as e:
        logger.error(f"获取ABC新闻失败: {str(e)}")
        return "获取新闻失败,请查看日志了解详情"
|
||||||
|
|
||||||
|
|
||||||
|
def fox():
    """Collect headline titles and links from the Fox News front page.

    Reads the front page, takes the ``href`` of every
    ``<h3 class="title"><a>`` link, passes the list through ``title_tidy``
    and visits at most ``NEWS_LIMIT`` of the links. Pages without an
    ``itemprop="headline"`` ``<h1>`` are skipped with a warning.

    Each href is resolved against the site root with ``urljoin``. This
    keeps absolute and protocol-relative (``//host/path``) links working
    and also turns root-relative paths (``/path``) into valid URLs, which
    a plain ``'https:'`` prefix would not.

    Returns:
        A string with one ``Title: ... Link: ...`` line per article, or an
        error message string when the front page cannot be fetched or an
        unexpected exception occurs.
    """
    from urllib.parse import urljoin

    logger.info("开始获取FOX新闻")
    try:
        head = 'https://www.foxnews.com/'
        response = safe_request(head)
        if not response:
            return "获取FOX新闻失败"

        html = etree.HTML(response.text)
        href = html.xpath('//h3[@class="title"]/a/@href')
        href = title_tidy(href)

        lines = []
        for url in href[:NEWS_LIMIT]:
            # Bound before the try so the error handler can always log it.
            full_url = url
            try:
                full_url = urljoin(head, url)
                response = safe_request(full_url)
                if not response:
                    continue

                html = etree.HTML(response.text)
                title = html.xpath('//h1[@itemprop="headline"]/text()')
                if not title:
                    logger.warning(f'跳过视频或其他类型新闻: {full_url}')
                    continue

                lines.append(f'Title: {title[0]}. Link: {full_url}\n')
                sleep(0.1)  # short pause between article requests

            except Exception as e:
                # One bad article must not abort the whole crawl.
                logger.error(f"处理新闻失败: {full_url}, 错误: {str(e)}")
                continue

        logger.info(f"FOX新闻获取完成,共获取{len(lines)}条")
        return ''.join(lines)

    except Exception as e:
        logger.error(f"获取FOX新闻失败: {str(e)}")
        return "获取新闻失败,请查看日志了解详情"
|
||||||
|
|
||||||
|
|
||||||
|
def bbc():
    """Collect headline titles and links from the BBC front page.

    Reads the front page, takes the ``href`` of the ancestors four and
    five levels above each ``card-headline`` ``<h2>``, passes the list
    through ``title_tidy`` and visits at most ``NEWS_LIMIT`` of the links.
    Hrefs that already start with ``http`` are skipped; the rest are
    appended to the site root. Pages without a ``headline-block`` ``<h1>``
    are skipped with a warning.

    Returns:
        A string with one ``Title: ... Link: ...`` line per article, or an
        error message string when the front page cannot be fetched or an
        unexpected exception occurs.
    """
    logger.info("开始获取BBC新闻")
    try:
        head = 'https://www.bbc.com'
        front_page = safe_request(head + '/')
        if not front_page:
            return "获取BBC新闻失败"

        links = title_tidy(etree.HTML(front_page.text).xpath(
            '//h2[@data-testid="card-headline"]/../../../../../@href | //h2[@data-testid="card-headline"]/../../../../@href'))

        lines = []
        for url in links[:NEWS_LIMIT]:
            try:
                # Only site-relative links are followed.
                if url.startswith('http'):
                    continue

                full_url = head + url
                article = safe_request(full_url)
                if not article:
                    continue

                headlines = etree.HTML(article.text).xpath(
                    '//div[@data-component="headline-block"]/h1/text()')
                if not headlines:
                    logger.warning(f'跳过视频或其他类型新闻: {full_url}')
                    continue

                lines.append(f'Title: {headlines[0]}. Link: {full_url}\n')
                sleep(0.1)  # short pause between article requests

            except Exception as e:
                # One bad article must not abort the whole crawl.
                logger.error(f"处理新闻失败: {full_url}, 错误: {str(e)}")
                continue

        logger.info(f"BBC新闻获取完成,共获取{len(lines)}条")
        return ''.join(lines)

    except Exception as e:
        logger.error(f"获取BBC新闻失败: {str(e)}")
        return "获取新闻失败,请查看日志了解详情"
|
||||||
73
utils/ai/dify_news_analyze.py
Normal file
73
utils/ai/dify_news_analyze.py
Normal file
@@ -0,0 +1,73 @@
|
|||||||
|
#
|
||||||
|
# curl -X POST 'http://192.168.2.240/v1/chat-messages' \
|
||||||
|
# --header 'Authorization: Bearer {api_key}' \
|
||||||
|
# --header 'Content-Type: application/json' \
|
||||||
|
# --data-raw '{
|
||||||
|
# "inputs": {},
|
||||||
|
# "query": "What are the specs of the iPhone 13 Pro Max?",
|
||||||
|
# "response_mode": "streaming",
|
||||||
|
# "conversation_id": "",
|
||||||
|
# "user": "abc-123",
|
||||||
|
# "files": [
|
||||||
|
# {
|
||||||
|
# "type": "image",
|
||||||
|
# "transfer_method": "remote_url",
|
||||||
|
# "url": "https://cloud.dify.ai/logo/logo-site.png"
|
||||||
|
# }
|
||||||
|
# ]
|
||||||
|
# }'
|
||||||
|
import json
|
||||||
|
|
||||||
|
import requests
|
||||||
|
|
||||||
|
|
||||||
|
def dify_news_title_analyze(content):
    """Send ``content`` to the Dify chat-messages endpoint and return its answer.

    Posts a blocking-mode chat request with ``content`` as the query,
    prints the HTTP status code and the decoded JSON body, and passes that
    body to ``extract_content``.

    Args:
        content: Text to submit as the chat query.

    Returns:
        Whatever ``extract_content`` returns for the decoded response.

    Raises:
        requests.RequestException: If the request fails or exceeds the
            timeout.
        ValueError: If the response body is not valid JSON.
    """
    # NOTE(review): the API token and host are hard-coded in source; they
    # should be moved to configuration and the token rotated.
    authorization = "Bearer app-rhhKkbvHd2IAQoGX7xTzXZJj"  # replace with the real Authorization token
    url = 'http://192.168.2.240/v1/chat-messages'

    data = {
        "response_mode": "blocking",
        "conversation_id": "",
        "inputs": {},
        "query": content,
        "user": "a-bot"
    }

    # Request headers.
    headers = {
        "Content-Type": "application/json; charset=utf-8",
        "Authorization": authorization
    }

    # Without a timeout, requests can wait forever on an unresponsive
    # server; the limit is generous because blocking mode waits for the
    # complete answer.
    response = requests.post(url, headers=headers, data=json.dumps(data), timeout=120)
    response.encoding = 'utf-8'

    # Decode the body once and reuse it for both the debug output and the
    # answer extraction.
    result = response.json()
    print(response.status_code)
    print(result)
    return extract_content(result)
|
||||||
|
|
||||||
|
|
||||||
|
def extract_content(data):
    """Extract the ``answer`` field from an API response.

    Args:
        data: The response payload, either an already-decoded dict or a
            JSON string that decodes to one.

    Returns:
        The non-empty ``answer`` value, or ``None`` when the payload is not
        a dict, has no (or an empty) ``answer``, or is a string that is
        not valid JSON.
    """
    if isinstance(data, str):
        try:
            # Decode the JSON text; json.dumps here would only re-quote
            # the string and never produce a dict.
            data = json.loads(data)
        except ValueError as e:
            print(f"解析响应失败: {str(e)}")
            return None

    if isinstance(data, dict):
        # An empty or missing answer is reported as None.
        return data.get('answer', '') or None

    return None
|
||||||
@@ -43,7 +43,8 @@ class Feature(Enum):
|
|||||||
GROUP_ADD = 16, "加群提醒功能"
|
GROUP_ADD = 16, "加群提醒功能"
|
||||||
DOUYIN_PARSER = 17, "抖音链接转视频功能"
|
DOUYIN_PARSER = 17, "抖音链接转视频功能"
|
||||||
GROUP_MEMBER_CHANGE = 18, "群成员变更提醒功能"
|
GROUP_MEMBER_CHANGE = 18, "群成员变更提醒功能"
|
||||||
KID_PHOTO_EXTRACT =19, "儿童照片提取转发功能" # 小朋友照片提取功能
|
KID_PHOTO_EXTRACT = 19, "儿童照片提取转发功能" # 小朋友照片提取功能
|
||||||
|
NEWS = 20, "全球政治经济新闻"
|
||||||
|
|
||||||
def __new__(cls, value, description):
|
def __new__(cls, value, description):
|
||||||
obj = object.__new__(cls)
|
obj = object.__new__(cls)
|
||||||
|
|||||||
Reference in New Issue
Block a user