feature:全球政治经济新闻

This commit is contained in:
liuwei
2025-04-14 16:02:00 +08:00
parent bfdb1831d3
commit 8afd0f49d0
8 changed files with 885 additions and 220 deletions

View File

@@ -6,11 +6,35 @@ Created Date: 2024-01-21
Last Modified: 2024-03-24 Last Modified: 2024-03-24
Modified by: MrCrawL Modified by: MrCrawL
""" """
from utils.ai.dify_news_analyze import dify_news_title_analyze
from utils.markdown_to_image import convert_md_str_to_image
'''Existing problem: text with hyperlink won't be saved''' '''Existing problem: text with hyperlink won't be saved'''
import requests import requests
from time import localtime, sleep from time import localtime, sleep
from lxml import etree from lxml import etree
import logging
from datetime import datetime
# 配置日志
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s',
handlers=[
logging.FileHandler(f'news_crawler_{datetime.now().strftime("%Y%m%d")}.log'),
logging.StreamHandler()
]
)
logger = logging.getLogger(__name__)
# 请求配置
HEADERS = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
TIMEOUT = 10
MAX_RETRIES = 3
NEWS_LIMIT = 30
def get_time(): def get_time():
@@ -18,7 +42,6 @@ def get_time():
return date_ return date_
# delete duplicated
def title_tidy(title_list): def title_tidy(title_list):
t_index = [] t_index = []
for i in range(1, len(title_list)): for i in range(1, len(title_list)):
@@ -28,7 +51,6 @@ def title_tidy(title_list):
return title_list return title_list
# tidy text, seems a little bit redundant
def text_tidy(p_text): def text_tidy(p_text):
text_ = p_text.replace('', "'") text_ = p_text.replace('', "'")
text_ = text_.replace(' \n\n', ' ') text_ = text_.replace(' \n\n', ' ')
@@ -46,218 +68,275 @@ def text_tidy(p_text):
return text_ return text_
def save(text, file_name, mode='w', encoding='utf-8'): def safe_request(url, retry_count=0):
with open(f'{file_name}.txt', mode, encoding=encoding) as f: f.write(text) """安全的请求方法,包含重试机制"""
try:
response = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
response.raise_for_status()
return response
except requests.RequestException as e:
if retry_count < MAX_RETRIES:
logger.warning(f"请求失败,正在进行第{retry_count + 1}次重试: {url}")
sleep(1)
return safe_request(url, retry_count + 1)
else:
logger.error(f"请求失败: {url}, 错误: {str(e)}")
return None
def nbc(): def nbc():
url = 'https://www.nbcnews.com/' logger.info("开始获取NBC新闻")
res = requests.get(url) try:
html = etree.HTML(res.text) url = 'https://www.nbcnews.com/'
href = html.xpath('//h2/a/@href') response = safe_request(url)
href = title_tidy(href) if not response:
# quant = int(input(f'There are {len(href)} pieces detected. How many would you download:')) return "获取NBC新闻失败"
# if quant > len(href) or quant < 1:
# print("Outnumber!") html = etree.HTML(response.text)
# quit() href = html.xpath('//h2/a/@href')
count = 0 href = title_tidy(href)
# save('', f'NBC_news_title_{get_time()}')
# save('', f'NBC_news_text_{get_time()}') msg = ''
msg ='' count = 0
for i in range(30):
url = href[i] for url in href[:NEWS_LIMIT]:
sleep(0.1) # delete to speed up try:
res = requests.get(url) response = safe_request(url)
html = etree.HTML(res.text) if not response:
title = html.xpath('//h1/text()') continue
if len(title) == 0:
print(f'Video or other news. Link: {url}') html = etree.HTML(response.text)
continue title = html.xpath('//h1/text()')
title = title[0]
author = html.xpath('//span[@class="byline-name"]/a/text() | //span[@class="byline-name" and not(a)]/text()') if not title:
author = ', '.join(author) logger.warning(f'跳过视频或其他类型新闻: {url}')
text = html.xpath('//p[@class=""]/text()') continue
text = '\n\n'.join(text)
text = text_tidy(text) title = title[0]
count += 1 msg += f'Title: {title}. Link: {url}\n'
# save(f'Title: {title}\nLink: {url}\n\n', f'NBC_news_title_{get_time()}', 'a') # news title count += 1
# save(f'Title: {title}\n\nOrigin: {url}\n\nAuthor: {author}\n\n\n', f'NBC_news_text_{get_time()}', 'a') sleep(0.1)
# save(f'{text}' + '\n\n------------------------------\n\n', f'NBC_news_text_{get_time()}', 'a')
# print(f'Title: {title}. Link: {href[i]}.') except Exception as e:
msg += f'Title: {title}. Link: {href[i]}.\n' logger.error(f"处理新闻失败: {url}, 错误: {str(e)}")
return msg continue
logger.info(f"NBC新闻获取完成共获取{count}")
return msg
except Exception as e:
logger.error(f"获取NBC新闻失败: {str(e)}")
return "获取新闻失败,请查看日志了解详情"
def cnn(): def cnn():
head = 'https://www.cnn.com' logger.info("开始获取CNN新闻")
res = requests.get(head + '/') try:
html = etree.HTML(res.text) head = 'https://www.cnn.com'
href = html.xpath('//a[@data-link-type="article"]/@href') response = safe_request(head + '/')
href = title_tidy(href) if not response:
# quant = int(input(f'{len(href)} data detected. How many would you like to download:')) return "获取CNN新闻失败"
# if quant > len(href) or quant < 1:
# print("Outnumber!") html = etree.HTML(response.text)
# quit() href = html.xpath('//a[@data-link-type="article"]/@href')
count = 0 href = title_tidy(href)
msg = ''
# save('', f'CNN_news_title_{get_time()}') msg = ''
# save('', f'CNN_news_text_{get_time()}') count = 0
for i in range(30):
url = head + href[i] for url in href[:NEWS_LIMIT]:
sleep(0.1) # delete to speed up try:
res = requests.get(url) full_url = head + url
html = etree.HTML(res.text) response = safe_request(full_url)
title = html.xpath('//h1[@data-editable="headlineText"]/text()') if not response:
if len(title) == 0: continue
print(f'Video or other news. Link: {url}')
continue html = etree.HTML(response.text)
title = title[0].strip() title = html.xpath('//h1[@data-editable="headlineText"]/text()')
author = html.xpath('//span[@class="byline__name"]/text()')
author = ', '.join(author) if not title:
text = html.xpath('//p[@class="paragraph inline-placeholder"]/text()') logger.warning(f'跳过视频或其他类型新闻: {full_url}')
for k in range(len(text)): text[k].strip() continue
text = ''.join(text)
text = text_tidy(text) title = title[0].strip()
count += 1 msg += f'Title: {title}. Link: {full_url}\n'
# save(f'Title: {title}\nLink: {url}\n\n', f'CNN_news_title_{get_time()}', 'a') # news title count += 1
# save(f'Title: {title}\n\nOrigin: {url}\n\nAuthor: {author}\n\n\n', f'CNN_news_text_{get_time()}', 'a') sleep(0.1)
# save(f'{text}' + '\n\n------------------------------\n\n', f'CNN_news_text_{get_time()}', 'a')
# print(f'Title: {title}. Link: {url}') except Exception as e:
msg +=f'Title: {title}. Link: {url}\n' logger.error(f"处理新闻失败: {full_url}, 错误: {str(e)}")
# print(f'Files saved with {count} articles available.') continue
return msg
logger.info(f"CNN新闻获取完成共获取{count}")
return msg
except Exception as e:
logger.error(f"获取CNN新闻失败: {str(e)}")
return "获取新闻失败,请查看日志了解详情"
def abc(): def abc():
head = 'https://abcnews.go.com/' logger.info("开始获取ABC新闻")
res = requests.get(head) try:
html = etree.HTML(res.text) head = 'https://abcnews.go.com' # 移除末尾的斜杠
href1 = html.xpath('//div[@class="HeadlinesTrio"]/a/@href') response = safe_request(head)
href2 = html.xpath('//div[@class="title card"]/a[@class="AnchorLink"]/@href | //div[@class="title"]/a[@class="AnchorLink"]/@href') if not response:
href3 = html.xpath('//a[@target="_self"]/@href') return "获取ABC新闻失败"
href4 = html.xpath('//a[@class="AnchorLink VideoTile"]/@href')
href = href1 + href2 + href3 + href4 html = etree.HTML(response.text)
href = title_tidy(href) href1 = html.xpath('//div[@class="HeadlinesTrio"]/a/@href')
# quant = int(input(f'{len(href)} data detected. How many would you like to download:')) href2 = html.xpath(
# if quant > len(href) or quant < 1: '//div[@class="title card"]/a[@class="AnchorLink"]/@href | //div[@class="title"]/a[@class="AnchorLink"]/@href')
# print("Outnumber!") href3 = html.xpath('//a[@target="_self"]/@href')
# quit() href4 = html.xpath('//a[@class="AnchorLink VideoTile"]/@href')
count = 0 href = title_tidy(href1 + href2 + href3 + href4)
msg = ''
# save('', f'ABC_news_title_{get_time()}') msg = ''
# save('', f'ABC_news_text_{get_time()}') count = 0
for i in range(30):
url = href[i] for url in href[:NEWS_LIMIT]:
sleep(0.1) # delete to speed up try:
res = requests.get(url) # 处理URL格式
html = etree.HTML(res.text) if url.startswith('http'):
title = html.xpath('//div[@data-testid="prism-headline"]/h1/text()') full_url = url
if len(title) == 0: elif url.startswith('//'):
print(f'Video or other news. Link: {url}') full_url = 'https:' + url
continue else:
title = title[0] full_url = head + ('' if url.startswith('/') else '/') + url
author = html.xpath('//a[@data-testid="prism-linkbase"]/text()')
author = ', '.join(author) response = safe_request(full_url)
text = html.xpath('//div[@data-testid="prism-article-body"]/p/text()') if not response:
text = '\n\n'.join(text) continue
text = text_tidy(text)
count += 1 html = etree.HTML(response.text)
# save(f'Title: {title}\nLink: {url}\n\n', f'ABC_news_title_{get_time()}', 'a') # news title title = html.xpath('//div[@data-testid="prism-headline"]/h1/text()')
# save(f'Title: {title}\n\nOrigin: {url}\n\nAuthor: {author}\n\n\n', f'ABC_news_text_{get_time()}', 'a')
# save(f'{text}' + '\n\n------------------------------\n\n', f'ABC_news_text_{get_time()}', 'a') if not title:
# print(f'Title: {title}. Link: {url}') logger.warning(f'跳过视频或其他类型新闻: {full_url}')
msg +=f'Title: {title}. Link: {url}\n' continue
# print(f'Files saved with {count} articles available.')
return msg title = title[0]
msg += f'Title: {title}. Link: {full_url}\n'
count += 1
sleep(0.1)
except Exception as e:
logger.error(f"处理新闻失败: {full_url}, 错误: {str(e)}")
continue
logger.info(f"ABC新闻获取完成共获取{count}")
return msg
except Exception as e:
logger.error(f"获取ABC新闻失败: {str(e)}")
return "获取新闻失败,请查看日志了解详情"
def fox(): def fox():
head = 'https://www.foxnews.com/' logger.info("开始获取FOX新闻")
res = requests.get(head) try:
html = etree.HTML(res.text) head = 'https://www.foxnews.com/'
href = html.xpath('//h3[@class="title"]/a/@href') response = safe_request(head)
href = title_tidy(href) if not response:
# quant = int(input(f'{len(href)} data detected. How many would you like to download:')) return "获取FOX新闻失败"
# if quant > len(href) or quant < 1:
# print("Outnumber!") html = etree.HTML(response.text)
# quit() href = html.xpath('//h3[@class="title"]/a/@href')
count = 0 href = title_tidy(href)
msg =''
# save('', f'FOX_news_title_{get_time()}') msg = ''
# save('', f'FOX_news_text_{get_time()}') count = 0
for i in range(30):
if href[i][0:4] != 'http': href[i] = 'https:' + href[i] for url in href[:NEWS_LIMIT]:
url = href[i] try:
sleep(0.1) # delete to speed up if url[0:4] != 'http':
res = requests.get(url) url = 'https:' + url
html = etree.HTML(res.text)
title = html.xpath('//h1[@itemprop="headline"]/text()') response = safe_request(url)
if len(title) == 0: if not response:
print(f'Video or other news. Link: {url}') continue
continue
title = title[0] html = etree.HTML(response.text)
author = html.xpath('//a[@rel="author"]/strong/text()') title = html.xpath('//h1[@itemprop="headline"]/text()')
author = ', '.join(author)
text = html.xpath('//div[@itemprop="articleBody"]/p/text()') if not title:
text = '\n\n'.join(text) logger.warning(f'跳过视频或其他类型新闻: {url}')
text = text_tidy(text) continue
count += 1
# save(f'Title: {title}\nLink: {url}\n\n', f'FOX_news_title_{get_time()}', 'a') # news title title = title[0]
# save(f'Title: {title}\n\nOrigin: {url}\n\nAuthor: {author}\n\n\n', f'FOX_news_text_{get_time()}', 'a') msg += f'Title: {title}. Link: {url}\n'
# save(f'{text}' + '\n\n------------------------------\n\n', f'FOX_news_text_{get_time()}', 'a') count += 1
# print(f'Title: {title}. Link: {url}') sleep(0.1)
msg +=f'Title: {title}. Link: {url}\n'
# print(f'Files saved with {count} articles available.') except Exception as e:
return msg logger.error(f"处理新闻失败: {url}, 错误: {str(e)}")
continue
logger.info(f"FOX新闻获取完成共获取{count}")
return msg
except Exception as e:
logger.error(f"获取FOX新闻失败: {str(e)}")
return "获取新闻失败,请查看日志了解详情"
def bbc(): def bbc():
head = 'https://www.bbc.com' logger.info("开始获取BBC新闻")
res = requests.get(head + '/') try:
html = etree.HTML(res.text) head = 'https://www.bbc.com'
href = html.xpath('//h2[@data-testid="card-headline"]/../../../../../@href | //h2[@data-testid="card-headline"]/../../../../@href') response = safe_request(head + '/')
href = title_tidy(href) if not response:
# quant = int(input(f'{len(href)} data detected. How many would you like to download:')) return "获取BBC新闻失败"
# if quant > len(href) or quant < 1:
# print("Outnumber!")
# quit()
count = 0
msg =''
# save('', f'BBC_news_title_{get_time()}')
# save('', f'BBC_news_text_{get_time()}')
for i in range(30):
if href[i][0:4] == 'http': continue
url = head + href[i]
sleep(0.1) # delete to speed up
print(url)
res = requests.get(url)
html = etree.HTML(res.text)
title = html.xpath('//div[@data-component="headline-block"]/h1/text()')
if len(title) == 0:
# print(f'Video or other news. Link: {url}')
continue
title = title[0]
# author = html.xpath('//div[@data-testid="byline"]/div/span[@data-testid="byline-name"]/text()')
# author = ', '.join(author)
# text = html.xpath('//div[@data-component="text-block"]/p/b/text() | //div[@data-component="text-block"]/p/text()')
# text = '\n\n'.join(text)
# text = text_tidy(text)
count += 1
# save(f'Title: {title}\nLink: {url}\n\n', f'BBC_news_title_{get_time()}', 'a') # news title
# save(f'Title: {title}\n\nOrigin: {url}\n\nAuthor: {author}\n\n\n', f'BBC_news_text_{get_time()}', 'a')
# save(f'{text}' + '\n\n------------------------------\n\n', f'BBC_news_text_{get_time()}', 'a')
# print(f'Title: {title}. Link: {url}')
msg +=f'Title: {title}. Link: {url}\n' html = etree.HTML(response.text)
# print(f'Files saved with {count} articles available.') href = html.xpath(
return msg '//h2[@data-testid="card-headline"]/../../../../../@href | //h2[@data-testid="card-headline"]/../../../../@href')
href = title_tidy(href)
if __name__ == '__main__': msg = ''
# Hello, World! :) count = 0
# news = input('Choose news site["nbc","cnn","abc","fox","bbc"]:').lower()
# if news == 'nbc': nbc() for url in href[:NEWS_LIMIT]:
# elif news == 'cnn': cnn() try:
# elif news == 'abc': abc() if url[0:4] == 'http':
# elif news == 'fox': fox() continue
# elif news == 'bbc': bbc()
# else: full_url = head + url
# print('Oops! It seems a wrong input. Please retry...') response = safe_request(full_url)
# sleep(2) if not response:
print(bbc()) continue
html = etree.HTML(response.text)
title = html.xpath('//div[@data-component="headline-block"]/h1/text()')
if not title:
logger.warning(f'跳过视频或其他类型新闻: {full_url}')
continue
title = title[0]
msg += f'Title: {title}. Link: {full_url}\n'
count += 1
sleep(0.1)
except Exception as e:
logger.error(f"处理新闻失败: {full_url}, 错误: {str(e)}")
continue
logger.info(f"BBC新闻获取完成共获取{count}")
return msg
except Exception as e:
logger.error(f"获取BBC新闻失败: {str(e)}")
return "获取新闻失败,请查看日志了解详情"
def all_english_news():
    """Crawl every English news source and render the analysed digest.

    The five crawlers run one after another in the order NBC, CNN, ABC, FOX,
    BBC. Each crawler's text is followed by a newline, the combined text is
    handed to ``dify_news_title_analyze`` and the markdown it returns is
    rendered by ``convert_md_str_to_image`` with the output name
    ``news_output.png``.

    Returns:
        Whatever ``convert_md_str_to_image`` returns for the rendered digest.
    """
    crawlers = (nbc, cnn, abc, fox, bbc)
    news_titles = "".join(crawl() + "\n" for crawl in crawlers)
    markdown_news = dify_news_title_analyze(news_titles)
    return convert_md_str_to_image(markdown_news, "news_output.png")

View File

@@ -11,6 +11,7 @@ import requests
from lxml import etree from lxml import etree
from base import func_english_news from base import func_english_news
from utils.ai.dify_news_analyze import dify_news_title_analyze
class News(object): class News(object):
@@ -70,10 +71,10 @@ class News(object):
post = response.json() post = response.json()
# 提取content列表 - 避免使用str作为变量名 # 提取content列表 - 避免使用str作为变量名
content_list = post.get('data', {}).get('cards', []) content_list = post.get('data', {}).get('cards', [])
if content_list and len(content_list) > 0: if content_list and len(content_list) > 0:
news_items = content_list[0].get('content', []) news_items = content_list[0].get('content', [])
# 遍历列表并格式化每个字典的title, url然后添加到output字符串中 # 遍历列表并格式化每个字典的title, url然后添加到output字符串中
for index, article in enumerate(news_items, start=1): for index, article in enumerate(news_items, start=1):
if isinstance(article, dict) and 'word' in article: if isinstance(article, dict) and 'word' in article:
@@ -81,34 +82,34 @@ class News(object):
# url = article.get('url', '') # url = article.get('url', '')
# 使用f-string格式化字符串并添加到output中 # 使用f-string格式化字符串并添加到output中
output += f"{index} :#{title}\n" output += f"{index} :#{title}\n"
# 输出最终的字符串 # 输出最终的字符串
return output return output
else: else:
self.LOG.error(f"获取百度新闻失败,状态码: {response.status_code}") self.LOG.error(f"获取百度新闻失败,状态码: {response.status_code}")
return "获取百度新闻失败,请稍后再试" return "获取百度新闻失败,请稍后再试"
except Exception as e: except Exception as e:
self.LOG.error(f"获取百度新闻时出错: {e}") self.LOG.error(f"获取百度新闻时出错: {e}")
return f"获取百度新闻时出错: {e}" return f"获取百度新闻时出错: {e}"
def get_eng_news(self,website): def get_eng_news(self, website):
if website == 'nbc': if website == 'nbc':
return func_english_news.nbc() return func_english_news.nbc()
elif website == 'cnn': elif website == 'cnn':
return func_english_news.cnn() return func_english_news.cnn()
elif website == 'abc': elif website == 'abc':
return func_english_news.abc() return func_english_news.abc()
elif website == 'fox': elif website == 'fox':
return func_english_news.fox() return func_english_news.fox()
elif website == 'bbc': elif website == 'bbc':
return func_english_news.bbc() return func_english_news.bbc()
if __name__ == "__main__": if __name__ == "__main__":
news = News() news = News()
print(news.get_baidu_news())
# # msg = "@水牛-分身 今日百度新闻" # # msg = "@水牛-分身 今日百度新闻"
# # q = re.sub(r"@.*?[\u2005|\s]", "", msg).replace(" ", "") # # q = re.sub(r"@.*?[\u2005|\s]", "", msg).replace(" ", "")
# # print(q) # # print(q)
# print(news.get_eng_news('nbc')) # print(news.get_eng_news('nbc'))

View File

@@ -0,0 +1,7 @@
# 从当前包的main模块导入GlobalNewsPlugin类
from .main import GlobalNewsPlugin
# 提供get_plugin函数返回插件实例
def get_plugin():
    """Create a new ``GlobalNewsPlugin`` and return it."""
    plugin = GlobalNewsPlugin()
    return plugin

View File

@@ -0,0 +1,6 @@
enable = true
command = ["全球新闻", "国际新闻", "环球新闻", "政经新闻", "政治经济新闻"]
command-format = """
🌍全球新闻指令:
全球新闻 - 获取最新的全球政治经济新闻
"""

191
plugins/global_news/main.py Normal file
View File

@@ -0,0 +1,191 @@
import logging
import asyncio
import threading
import time # 添加这一行
from typing import Dict, Any, List, Optional, Tuple
from wcferry import Wcf
from plugin_common.message_plugin_interface import MessagePluginInterface
from plugin_common.plugin_interface import PluginStatus
from utils.decorator.plugin_decorators import plugin_stats_decorator
from utils.robot_cmd.robot_command import Feature, PermissionStatus, GroupBotManager
from utils.decorator.points_decorator import plugin_points_cost
from utils.ai.dify_news_analyze import dify_news_title_analyze
from utils.markdown_to_image import convert_md_str_to_image
# 导入新闻抓取函数
from .news_crawler import nbc, cnn, abc, fox, bbc
class GlobalNewsPlugin(MessagePluginInterface):
    """Message plugin that answers "global news" commands with a news image.

    When a matching command arrives the plugin sends a "please wait" text,
    then uses a background thread to crawl five news sites, pass the collected
    headlines to ``dify_news_title_analyze``, render the returned markdown
    with ``convert_md_str_to_image`` and send the resulting image via ``wcf``.
    """

    @property
    def name(self) -> str:
        """Display name of the plugin."""
        return "全球政治经济新闻"

    @property
    def version(self) -> str:
        """Plugin version string."""
        return "1.0.0"

    @property
    def description(self) -> str:
        """Short description of the plugin."""
        return "提供全球政治经济新闻,支持多个国际新闻源"

    @property
    def author(self) -> str:
        """Plugin author."""
        return "Trae AI"

    @property
    def command_prefix(self) -> Optional[str]:
        """Command prefix; empty because commands are matched directly."""
        return ""

    @property
    def commands(self) -> List[str]:
        """Commands this plugin reacts to (populated by ``initialize``)."""
        return self._commands

    def __init__(self):
        super().__init__()
        # task_id -> worker thread, for news fetches that are still running.
        self._news_tasks = {}

    def initialize(self, context: Dict[str, Any]) -> bool:
        """Store the context objects and read the ``GlobalNews`` settings.

        Args:
            context: Mapping from which ``wcf``, ``event_system`` and
                ``message_util`` are read (missing keys become ``None``).

        Returns:
            Always ``True``.
        """
        self.LOG = logging.getLogger(f"Plugin.{self.name}")
        self.LOG.info(f"正在初始化 {self.name} 插件...")

        self.wcf = context.get("wcf")
        self.event_system = context.get("event_system")
        self.message_util = context.get("message_util")

        # NOTE(review): self._config is not assigned in this class; presumably
        # it is provided by MessagePluginInterface - confirm.
        settings = self._config.get("GlobalNews", {})
        self._commands = settings.get("command", ["全球新闻", "国际新闻", "环球新闻", "政经新闻"])
        self.command_format = settings.get("command-format", "全球新闻 - 获取最新的全球政治经济新闻")
        self.enable = settings.get("enable", True)

        self.LOG.info(f"[{self.name}] 插件初始化完成,指令:{self._commands}")
        return True

    def start(self) -> bool:
        """Mark the plugin as running. Always returns ``True``."""
        self.LOG.info(f"[{self.name}] 插件已启动")
        self.status = PluginStatus.RUNNING
        return True

    def stop(self) -> bool:
        """Mark the plugin as stopped. Always returns ``True``."""
        self.LOG.info(f"[{self.name}] 插件已停止")
        self.status = PluginStatus.STOPPED
        return True

    def can_process(self, message: Dict[str, Any]) -> bool:
        """Return whether the message's first word is one of the commands.

        Always ``False`` while the plugin is disabled.
        """
        if not self.enable:
            return False

        content = str(message.get("content", "")).strip()
        command = content.split(" ")[0]
        return command in self._commands

    @plugin_stats_decorator(plugin_name="全球政治经济新闻")
    @plugin_points_cost(5, "全球新闻消耗积分", Feature.NEWS)
    def process_message(self, message: Dict[str, Any]) -> Tuple[bool, Optional[str]]:
        """Acknowledge the request and start a background news fetch.

        Args:
            message: Mapping from which ``content``, ``sender``, ``roomid``,
                ``wcf`` and ``gbm`` are read.

        Returns:
            ``(False, "没有权限")`` when the group has the NEWS feature
            disabled, otherwise ``(True, "新闻获取任务已启动")`` once the
            worker thread has been started.
        """
        content = str(message.get("content", "")).strip()
        self.LOG.info(f"插件执行: {self.name}{content}")

        sender = message.get("sender")
        roomid = message.get("roomid", "")
        wcf: Wcf = message.get("wcf")
        gbm: GroupBotManager = message.get("gbm")

        # Permission is only checked for group chats (non-empty roomid).
        if roomid and gbm.get_group_permission(roomid, Feature.NEWS) == PermissionStatus.DISABLED:
            return False, "没有权限"

        task_id = f"{sender}_{roomid}_{int(time.time())}"

        wcf.send_text("🌍正在获取全球新闻,请稍候...",
                      (roomid if roomid else sender), sender)

        self._start_news_task(task_id, sender, roomid, wcf)

        return True, "新闻获取任务已启动"

    def _start_news_task(self, task_id: str, sender: str, roomid: str, wcf: Wcf):
        """Start the daemon worker thread that fetches and sends the news."""
        thread = threading.Thread(
            target=self._fetch_news_thread,
            args=(task_id, sender, roomid, wcf),
            daemon=True,
        )
        # Register before starting: the worker removes its own entry when it
        # finishes, so registering afterwards could leave a stale entry behind
        # if the worker completed first.
        self._news_tasks[task_id] = thread
        try:
            thread.start()
        except Exception:
            # The worker never ran, so nothing else will clean this entry up.
            self._news_tasks.pop(task_id, None)
            raise
        self.LOG.info(f"启动新闻获取任务: {task_id}")

    def _fetch_news_thread(self, task_id: str, sender: str, roomid: str, wcf: Wcf):
        """Worker thread body: run the async fetch and report the outcome.

        A private event loop is created for this thread. Depending on the
        result either the image plus a completion text, or a failure text, is
        sent to the room (or to the sender when there is no room).
        """
        receiver = roomid if roomid else sender
        try:
            loop = asyncio.new_event_loop()
            try:
                asyncio.set_event_loop(loop)
                news_result = loop.run_until_complete(self._fetch_news_async())
            finally:
                # Close the loop even when the fetch raises.
                loop.close()

            if news_result:
                wcf.send_image(news_result, receiver)
                wcf.send_text("🌍全球新闻获取完成!", receiver, sender)
            else:
                wcf.send_text("❌获取新闻失败,请稍后再试", receiver, sender)
        except Exception as e:
            self.LOG.error(f"新闻获取任务出错: {e}")
            wcf.send_text(f"❌获取新闻出错: {str(e)}", receiver, sender)
        finally:
            self._news_tasks.pop(task_id, None)

    async def _fetch_news_async(self) -> str:
        """Crawl all sources concurrently, analyse the titles, render an image.

        Returns:
            The value returned by ``convert_md_str_to_image``, or ``""`` when
            any step raises.
        """
        try:
            tasks = [
                self._run_in_executor(nbc),
                self._run_in_executor(cnn),
                self._run_in_executor(abc),
                self._run_in_executor(fox),
                self._run_in_executor(bbc)
            ]

            # gather() returns the results in the same order as `tasks`.
            results = await asyncio.gather(*tasks)

            news_titles = "\n".join(results)

            markdown_news = await self._run_in_executor(
                dify_news_title_analyze, news_titles
            )

            image_path = await self._run_in_executor(
                convert_md_str_to_image, markdown_news, "news_output.png"
            )

            return image_path
        except Exception as e:
            self.LOG.error(f"异步获取新闻失败: {e}")
            return ""

    async def _run_in_executor(self, func, *args):
        """Run the synchronous ``func(*args)`` in the loop's default executor."""
        # Inside a coroutine the running loop is the one to use;
        # get_event_loop() is discouraged here.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, func, *args)

View File

@@ -0,0 +1,307 @@
# -*- coding: utf-8 -*-
"""
Program: Global News Crawler
Author: Trae AI (based on MrCrawL's work)
Created Date: 2024-05-01
"""
import requests
from time import localtime, sleep
from lxml import etree
import logging
from datetime import datetime
import time
# 配置日志
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s',
handlers=[
logging.FileHandler(f'global_news_{datetime.now().strftime("%Y%m%d")}.log'),
logging.StreamHandler()
]
)
logger = logging.getLogger(__name__)
# 请求配置
HEADERS = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
TIMEOUT = 10
MAX_RETRIES = 3
NEWS_LIMIT = 30
def get_time():
    """Return today's local date formatted as ``YYYY-MM-DD``.

    The clock is read exactly once so that year, month and day always belong
    to the same instant (three separate reads could straddle midnight).
    """
    now = localtime()
    return f'{now.tm_year:04d}-{now.tm_mon:02d}-{now.tm_mday:02d}'
def title_tidy(title_list):
    """Collapse runs of equal neighbouring entries, modifying the list in place.

    Only an entry that equals the one directly before it is removed; equal
    entries that are not adjacent are all kept.

    Args:
        title_list: List to clean up. It is mutated.

    Returns:
        The same list object that was passed in.
    """
    survivors = [
        entry
        for position, entry in enumerate(title_list)
        if position == 0 or not (entry == title_list[position - 1])
    ]
    title_list[:] = survivors
    return title_list
def safe_request(url, retry_count=0):
    """GET ``url`` with the module's headers and timeout, retrying on failure.

    Args:
        url: Address to fetch.
        retry_count: Number of retries already used; counting continues from
            this value up to ``MAX_RETRIES``.

    Returns:
        The response when a request succeeds with a non-error status,
        otherwise ``None`` once the retries are exhausted.
    """
    attempt = retry_count
    while True:
        try:
            response = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
            response.raise_for_status()
            return response
        except requests.RequestException as e:
            if attempt >= MAX_RETRIES:
                logger.error(f"请求失败: {url}, 错误: {str(e)}")
                return None
            logger.warning(f"请求失败,正在进行第{attempt + 1}次重试: {url}")
            # Pause for a second before the next attempt.
            sleep(1)
            attempt += 1
def nbc():
    """Collect article headlines linked from the NBC News front page.

    At most ``NEWS_LIMIT`` links taken from ``//h2/a/@href`` are visited.
    Pages without an ``<h1>`` text are skipped with a warning.

    Returns:
        str: One ``Title: ... Link: ...`` line per article, or a short failure
        message when the front page cannot be fetched or an unexpected error
        occurs.
    """
    from urllib.parse import urljoin  # stdlib; imported here so the block is self-contained

    logger.info("开始获取NBC新闻")
    try:
        home = 'https://www.nbcnews.com/'
        response = safe_request(home)
        if not response:
            return "获取NBC新闻失败"

        front_page = etree.HTML(response.text)
        links = title_tidy(front_page.xpath('//h2/a/@href'))

        lines = []
        for href in links[:NEWS_LIMIT]:
            # Absolute links pass through unchanged; relative or
            # protocol-relative ones are resolved so that they are not handed
            # to requests as schemeless URLs and retried pointlessly.
            article_url = urljoin(home, href)
            try:
                response = safe_request(article_url)
                if not response:
                    continue

                article = etree.HTML(response.text)
                title = article.xpath('//h1/text()')
                if not title:
                    logger.warning(f'跳过视频或其他类型新闻: {article_url}')
                    continue

                lines.append(f'Title: {title[0]}. Link: {article_url}\n')
                # Small pause between successful article fetches.
                sleep(0.1)
            except Exception as e:
                # Best effort: one broken article must not abort the crawl.
                logger.error(f"处理新闻失败: {article_url}, 错误: {str(e)}")
                continue

        count = len(lines)
        logger.info(f"NBC新闻获取完成共获取{count}")
        return ''.join(lines)
    except Exception as e:
        logger.error(f"获取NBC新闻失败: {str(e)}")
        return "获取新闻失败,请查看日志了解详情"
def cnn():
    """Collect article headlines linked from the CNN front page.

    At most ``NEWS_LIMIT`` links taken from ``a[data-link-type="article"]``
    are visited. Pages without a ``headlineText`` ``<h1>`` are skipped with a
    warning.

    Returns:
        str: One ``Title: ... Link: ...`` line per article, or a short failure
        message when the front page cannot be fetched or an unexpected error
        occurs.
    """
    from urllib.parse import urljoin  # stdlib; imported here so the block is self-contained

    logger.info("开始获取CNN新闻")
    try:
        head = 'https://www.cnn.com'
        response = safe_request(head + '/')
        if not response:
            return "获取CNN新闻失败"

        front_page = etree.HTML(response.text)
        links = title_tidy(front_page.xpath('//a[@data-link-type="article"]/@href'))

        lines = []
        for href in links[:NEWS_LIMIT]:
            # Plain concatenation would corrupt hrefs that are already
            # absolute; urljoin leaves those untouched and resolves the rest.
            full_url = urljoin(head, href)
            try:
                response = safe_request(full_url)
                if not response:
                    continue

                article = etree.HTML(response.text)
                title = article.xpath('//h1[@data-editable="headlineText"]/text()')
                if not title:
                    logger.warning(f'跳过视频或其他类型新闻: {full_url}')
                    continue

                lines.append(f'Title: {title[0].strip()}. Link: {full_url}\n')
                # Small pause between successful article fetches.
                sleep(0.1)
            except Exception as e:
                # Best effort: one broken article must not abort the crawl.
                logger.error(f"处理新闻失败: {full_url}, 错误: {str(e)}")
                continue

        count = len(lines)
        logger.info(f"CNN新闻获取完成共获取{count}")
        return ''.join(lines)
    except Exception as e:
        logger.error(f"获取CNN新闻失败: {str(e)}")
        return "获取新闻失败,请查看日志了解详情"
def abc():
    """Collect article headlines linked from the ABC News front page.

    Links are gathered with four XPath selectors, adjacent duplicates are
    removed, and at most ``NEWS_LIMIT`` of them are visited. Pages without a
    ``prism-headline`` ``<h1>`` are skipped with a warning.

    Returns:
        str: One ``Title: ... Link: ...`` line per article, or a short failure
        message when the front page cannot be fetched or an unexpected error
        occurs.
    """
    logger.info("开始获取ABC新闻")
    try:
        head = 'https://abcnews.go.com'  # no trailing slash
        response = safe_request(head)
        if not response:
            return "获取ABC新闻失败"

        front_page = etree.HTML(response.text)
        selectors = (
            '//div[@class="HeadlinesTrio"]/a/@href',
            '//div[@class="title card"]/a[@class="AnchorLink"]/@href | //div[@class="title"]/a[@class="AnchorLink"]/@href',
            '//a[@target="_self"]/@href',
            '//a[@class="AnchorLink VideoTile"]/@href',
        )
        candidates = []
        for selector in selectors:
            candidates = candidates + front_page.xpath(selector)
        links = title_tidy(candidates)

        lines = []
        for url in links[:NEWS_LIMIT]:
            try:
                # Turn the href into an absolute URL.
                if url.startswith('http'):
                    full_url = url
                elif url.startswith('//'):
                    full_url = 'https:' + url
                elif url.startswith('/'):
                    full_url = head + url
                else:
                    full_url = head + '/' + url

                response = safe_request(full_url)
                if not response:
                    continue

                article = etree.HTML(response.text)
                headline = article.xpath('//div[@data-testid="prism-headline"]/h1/text()')
                if not headline:
                    logger.warning(f'跳过视频或其他类型新闻: {full_url}')
                    continue

                lines.append(f'Title: {headline[0]}. Link: {full_url}\n')
                # Small pause between successful article fetches.
                sleep(0.1)
            except Exception as e:
                logger.error(f"处理新闻失败: {full_url}, 错误: {str(e)}")
                continue

        count = len(lines)
        logger.info(f"ABC新闻获取完成共获取{count}")
        return ''.join(lines)
    except Exception as e:
        logger.error(f"获取ABC新闻失败: {str(e)}")
        return "获取新闻失败,请查看日志了解详情"
def fox():
    """Collect article headlines linked from the Fox News front page.

    At most ``NEWS_LIMIT`` links taken from ``h3.title > a`` are visited.
    Pages without an ``itemprop="headline"`` ``<h1>`` are skipped with a
    warning.

    Returns:
        str: One ``Title: ... Link: ...`` line per article, or a short failure
        message when the front page cannot be fetched or an unexpected error
        occurs.
    """
    from urllib.parse import urljoin  # stdlib; imported here so the block is self-contained

    logger.info("开始获取FOX新闻")
    try:
        head = 'https://www.foxnews.com/'
        response = safe_request(head)
        if not response:
            return "获取FOX新闻失败"

        front_page = etree.HTML(response.text)
        links = title_tidy(front_page.xpath('//h3[@class="title"]/a/@href'))

        lines = []
        for href in links[:NEWS_LIMIT]:
            # Prefixing 'https:' only works for protocol-relative links
            # ('//host/path'); urljoin also resolves site-relative paths and
            # leaves absolute links untouched.
            article_url = urljoin(head, href)
            try:
                response = safe_request(article_url)
                if not response:
                    continue

                article = etree.HTML(response.text)
                title = article.xpath('//h1[@itemprop="headline"]/text()')
                if not title:
                    logger.warning(f'跳过视频或其他类型新闻: {article_url}')
                    continue

                lines.append(f'Title: {title[0]}. Link: {article_url}\n')
                # Small pause between successful article fetches.
                sleep(0.1)
            except Exception as e:
                # Best effort: one broken article must not abort the crawl.
                logger.error(f"处理新闻失败: {article_url}, 错误: {str(e)}")
                continue

        count = len(lines)
        logger.info(f"FOX新闻获取完成共获取{count}")
        return ''.join(lines)
    except Exception as e:
        logger.error(f"获取FOX新闻失败: {str(e)}")
        return "获取新闻失败,请查看日志了解详情"
def bbc():
    """Collect article headlines linked from the BBC front page.

    Links are read from the ancestors of ``card-headline`` ``<h2>`` elements.
    At most ``NEWS_LIMIT`` of them are considered; hrefs starting with
    ``http`` are skipped and the rest are appended to the site root. Pages
    without a ``headline-block`` ``<h1>`` are skipped with a warning.

    Returns:
        str: One ``Title: ... Link: ...`` line per article, or a short failure
        message when the front page cannot be fetched or an unexpected error
        occurs.
    """
    logger.info("开始获取BBC新闻")
    try:
        head = 'https://www.bbc.com'
        response = safe_request(head + '/')
        if not response:
            return "获取BBC新闻失败"

        front_page = etree.HTML(response.text)
        links = title_tidy(front_page.xpath(
            '//h2[@data-testid="card-headline"]/../../../../../@href | //h2[@data-testid="card-headline"]/../../../../@href'))

        lines = []
        for url in links[:NEWS_LIMIT]:
            try:
                # Only site-relative links are followed.
                if url.startswith('http'):
                    continue

                full_url = head + url
                response = safe_request(full_url)
                if not response:
                    continue

                article = etree.HTML(response.text)
                headline = article.xpath('//div[@data-component="headline-block"]/h1/text()')
                if not headline:
                    logger.warning(f'跳过视频或其他类型新闻: {full_url}')
                    continue

                lines.append(f'Title: {headline[0]}. Link: {full_url}\n')
                # Small pause between successful article fetches.
                sleep(0.1)
            except Exception as e:
                logger.error(f"处理新闻失败: {full_url}, 错误: {str(e)}")
                continue

        count = len(lines)
        logger.info(f"BBC新闻获取完成共获取{count}")
        return ''.join(lines)
    except Exception as e:
        logger.error(f"获取BBC新闻失败: {str(e)}")
        return "获取新闻失败,请查看日志了解详情"

View File

@@ -0,0 +1,73 @@
#
# curl -X POST 'http://192.168.2.240/v1/chat-messages' \
# --header 'Authorization: Bearer {api_key}' \
# --header 'Content-Type: application/json' \
# --data-raw '{
# "inputs": {},
# "query": "What are the specs of the iPhone 13 Pro Max?",
# "response_mode": "streaming",
# "conversation_id": "",
# "user": "abc-123",
# "files": [
# {
# "type": "image",
# "transfer_method": "remote_url",
# "url": "https://cloud.dify.ai/logo/logo-site.png"
# }
# ]
# }'
import json
import requests
def dify_news_title_analyze(content, timeout=300):
    """Send text to the Dify chat-messages endpoint and return its answer.

    Args:
        content: Text placed in the ``query`` field of the request.
        timeout: Seconds ``requests`` may wait on the server. Without a
            timeout a stalled endpoint would block the caller indefinitely.

    Returns:
        The ``answer`` value extracted by ``extract_content``, or ``None``
        when the response has no non-empty answer.

    Raises:
        requests.RequestException: If the request fails or times out.
        ValueError: If the response body is not valid JSON.
    """
    # NOTE(review): the credential and the endpoint address are hard-coded;
    # replace the token with the real one and consider moving both into
    # configuration instead of source control.
    authorization = "Bearer app-rhhKkbvHd2IAQoGX7xTzXZJj"
    url = 'http://192.168.2.240/v1/chat-messages'

    data = {
        "response_mode": "blocking",
        "conversation_id": "",
        "inputs": {},
        "query": content,
        "user": "a-bot"
    }

    headers = {
        "Content-Type": "application/json; charset=utf-8",
        "Authorization": authorization
    }

    response = requests.post(url, headers=headers, data=json.dumps(data), timeout=timeout)
    response.encoding = 'utf-8'

    # Decode the body once and reuse it for both the debug output and the
    # extraction.
    payload = response.json()
    print(response.status_code)
    print(payload)
    return extract_content(payload)
def extract_content(data):
    """Extract the ``answer`` field from an API response.

    Args:
        data: The response, either as a dict or as a JSON string.

    Returns:
        The ``answer`` value when it is present and non-empty, otherwise
        ``None`` (including when a string cannot be parsed as JSON or the
        parsed value is not a dict).
    """
    try:
        # A string must be *parsed* into a dict; serialising it with
        # json.dumps would only yield another string and the answer would
        # never be found.
        if isinstance(data, str):
            data = json.loads(data)

        if isinstance(data, dict):
            answer = data.get('answer', '')
            if answer:
                return answer

        return None
    except (ValueError, TypeError) as e:
        print(f"解析响应失败: {str(e)}")
        return None

View File

@@ -43,7 +43,8 @@ class Feature(Enum):
GROUP_ADD = 16, "加群提醒功能" GROUP_ADD = 16, "加群提醒功能"
DOUYIN_PARSER = 17, "抖音链接转视频功能" DOUYIN_PARSER = 17, "抖音链接转视频功能"
GROUP_MEMBER_CHANGE = 18, "群成员变更提醒功能" GROUP_MEMBER_CHANGE = 18, "群成员变更提醒功能"
KID_PHOTO_EXTRACT =19, "儿童照片提取转发功能" # 小朋友照片提取功能 KID_PHOTO_EXTRACT = 19, "儿童照片提取转发功能" # 小朋友照片提取功能
NEWS = 20, "全球政治经济新闻"
def __new__(cls, value, description): def __new__(cls, value, description):
obj = object.__new__(cls) obj = object.__new__(cls)
@@ -240,11 +241,11 @@ class GroupBotManager:
str: 格式化的已启用功能列表字符串 str: 格式化的已启用功能列表字符串
""" """
enabled_features = [] enabled_features = []
# 检查群是否在列表中 # 检查群是否在列表中
if group_id not in GroupBotManager.local_cache["group_list"]: if group_id not in GroupBotManager.local_cache["group_list"]:
return "该群未启用机器人功能" return "该群未启用机器人功能"
# 遍历所有功能,检查哪些已启用且包含指令 # 遍历所有功能,检查哪些已启用且包含指令
for feature in Feature: for feature in Feature:
status = GroupBotManager.get_group_permission(group_id, feature) status = GroupBotManager.get_group_permission(group_id, feature)
@@ -255,18 +256,18 @@ class GroupBotManager:
"name": feature.name, "name": feature.name,
"description": feature.description "description": feature.description
}) })
# 如果没有启用任何带指令的功能 # 如果没有启用任何带指令的功能
if not enabled_features: if not enabled_features:
return "该群未启用任何带指令的功能" return "该群未启用任何带指令的功能"
# 构建格式化的字符串 # 构建格式化的字符串
result = f"群功能菜单:\n" result = f"群功能菜单:\n"
for feature in enabled_features: for feature in enabled_features:
result += f"{feature['id']}.{feature['description']}\n" result += f"{feature['id']}.{feature['description']}\n"
return result return result
@staticmethod @staticmethod
def get_group_list(): def get_group_list():
"""返回所有启用了群机器人的群组清单,格式为集合""" """返回所有启用了群机器人的群组清单,格式为集合"""