diff --git a/base/func_english_news.py b/base/func_english_news.py index 02ac69c..fcf9595 100644 --- a/base/func_english_news.py +++ b/base/func_english_news.py @@ -6,11 +6,35 @@ Created Date: 2024-01-21 Last Modified: 2024-03-24 Modified by: MrCrawL """ +from utils.ai.dify_news_analyze import dify_news_title_analyze +from utils.markdown_to_image import convert_md_str_to_image + '''Existing problem: text with hyperlink won't be saved''' import requests from time import localtime, sleep from lxml import etree +import logging +from datetime import datetime + +# 配置日志 +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(levelname)s - %(message)s', + handlers=[ + logging.FileHandler(f'news_crawler_{datetime.now().strftime("%Y%m%d")}.log'), + logging.StreamHandler() + ] +) +logger = logging.getLogger(__name__) + +# 请求配置 +HEADERS = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' +} +TIMEOUT = 10 +MAX_RETRIES = 3 +NEWS_LIMIT = 30 def get_time(): @@ -18,7 +42,6 @@ def get_time(): return date_ -# delete duplicated def title_tidy(title_list): t_index = [] for i in range(1, len(title_list)): @@ -28,7 +51,6 @@ def title_tidy(title_list): return title_list -# tidy text, seems a little bit redundant def text_tidy(p_text): text_ = p_text.replace('’', "'") text_ = text_.replace(' \n\n', ' ') @@ -46,218 +68,275 @@ def text_tidy(p_text): return text_ -def save(text, file_name, mode='w', encoding='utf-8'): - with open(f'{file_name}.txt', mode, encoding=encoding) as f: f.write(text) +def safe_request(url, retry_count=0): + """安全的请求方法,包含重试机制""" + try: + response = requests.get(url, headers=HEADERS, timeout=TIMEOUT) + response.raise_for_status() + return response + except requests.RequestException as e: + if retry_count < MAX_RETRIES: + logger.warning(f"请求失败,正在进行第{retry_count + 1}次重试: {url}") + sleep(1) + return safe_request(url, retry_count + 1) + else: + logger.error(f"请求失败: {url}, 错误: {str(e)}") + return None def nbc(): - url = 'https://www.nbcnews.com/' - res = requests.get(url) - html = etree.HTML(res.text) - href = html.xpath('//h2/a/@href') - href = title_tidy(href) - # quant = int(input(f'There are {len(href)} pieces detected. How many would you download:')) - # if quant > len(href) or quant < 1: - # print("Outnumber!") - # quit() - count = 0 - # save('', f'NBC_news_title_{get_time()}') - # save('', f'NBC_news_text_{get_time()}') - msg ='' - for i in range(30): - url = href[i] - sleep(0.1) # delete to speed up - res = requests.get(url) - html = etree.HTML(res.text) - title = html.xpath('//h1/text()') - if len(title) == 0: - print(f'Video or other news. Link: {url}') - continue - title = title[0] - author = html.xpath('//span[@class="byline-name"]/a/text() | //span[@class="byline-name" and not(a)]/text()') - author = ', '.join(author) - text = html.xpath('//p[@class=""]/text()') - text = '\n\n'.join(text) - text = text_tidy(text) - count += 1 - # save(f'Title: {title}\nLink: {url}\n\n', f'NBC_news_title_{get_time()}', 'a') # news title - # save(f'Title: {title}\n\nOrigin: {url}\n\nAuthor: {author}\n\n\n', f'NBC_news_text_{get_time()}', 'a') - # save(f'{text}' + '\n\n------------------------------\n\n', f'NBC_news_text_{get_time()}', 'a') - # print(f'Title: {title}. Link: {href[i]}.') - msg += f'Title: {title}. Link: {href[i]}.\n' - return msg + logger.info("开始获取NBC新闻") + try: + url = 'https://www.nbcnews.com/' + response = safe_request(url) + if not response: + return "获取NBC新闻失败" + + html = etree.HTML(response.text) + href = html.xpath('//h2/a/@href') + href = title_tidy(href) + + msg = '' + count = 0 + + for url in href[:NEWS_LIMIT]: + try: + response = safe_request(url) + if not response: + continue + + html = etree.HTML(response.text) + title = html.xpath('//h1/text()') + + if not title: + logger.warning(f'跳过视频或其他类型新闻: {url}') + continue + + title = title[0] + msg += f'Title: {title}. Link: {url}\n' + count += 1 + sleep(0.1) + + except Exception as e: + logger.error(f"处理新闻失败: {url}, 错误: {str(e)}") + continue + + logger.info(f"NBC新闻获取完成,共获取{count}条") + return msg + + except Exception as e: + logger.error(f"获取NBC新闻失败: {str(e)}") + return "获取新闻失败,请查看日志了解详情" def cnn(): - head = 'https://www.cnn.com' - res = requests.get(head + '/') - html = etree.HTML(res.text) - href = html.xpath('//a[@data-link-type="article"]/@href') - href = title_tidy(href) - # quant = int(input(f'{len(href)} data detected. How many would you like to download:')) - # if quant > len(href) or quant < 1: - # print("Outnumber!") - # quit() - count = 0 - msg = '' - # save('', f'CNN_news_title_{get_time()}') - # save('', f'CNN_news_text_{get_time()}') - for i in range(30): - url = head + href[i] - sleep(0.1) # delete to speed up - res = requests.get(url) - html = etree.HTML(res.text) - title = html.xpath('//h1[@data-editable="headlineText"]/text()') - if len(title) == 0: - print(f'Video or other news. Link: {url}') - continue - title = title[0].strip() - author = html.xpath('//span[@class="byline__name"]/text()') - author = ', '.join(author) - text = html.xpath('//p[@class="paragraph inline-placeholder"]/text()') - for k in range(len(text)): text[k].strip() - text = ''.join(text) - text = text_tidy(text) - count += 1 - # save(f'Title: {title}\nLink: {url}\n\n', f'CNN_news_title_{get_time()}', 'a') # news title - # save(f'Title: {title}\n\nOrigin: {url}\n\nAuthor: {author}\n\n\n', f'CNN_news_text_{get_time()}', 'a') - # save(f'{text}' + '\n\n------------------------------\n\n', f'CNN_news_text_{get_time()}', 'a') - # print(f'Title: {title}. Link: {url}') - msg +=f'Title: {title}. Link: {url}\n' - # print(f'Files saved with {count} articles available.') - return msg + logger.info("开始获取CNN新闻") + try: + head = 'https://www.cnn.com' + response = safe_request(head + '/') + if not response: + return "获取CNN新闻失败" + + html = etree.HTML(response.text) + href = html.xpath('//a[@data-link-type="article"]/@href') + href = title_tidy(href) + + msg = '' + count = 0 + + for url in href[:NEWS_LIMIT]: + try: + full_url = head + url + response = safe_request(full_url) + if not response: + continue + + html = etree.HTML(response.text) + title = html.xpath('//h1[@data-editable="headlineText"]/text()') + + if not title: + logger.warning(f'跳过视频或其他类型新闻: {full_url}') + continue + + title = title[0].strip() + msg += f'Title: {title}. Link: {full_url}\n' + count += 1 + sleep(0.1) + + except Exception as e: + logger.error(f"处理新闻失败: {full_url}, 错误: {str(e)}") + continue + + logger.info(f"CNN新闻获取完成,共获取{count}条") + return msg + + except Exception as e: + logger.error(f"获取CNN新闻失败: {str(e)}") + return "获取新闻失败,请查看日志了解详情" + def abc(): - head = 'https://abcnews.go.com/' - res = requests.get(head) - html = etree.HTML(res.text) - href1 = html.xpath('//div[@class="HeadlinesTrio"]/a/@href') - href2 = html.xpath('//div[@class="title card"]/a[@class="AnchorLink"]/@href | //div[@class="title"]/a[@class="AnchorLink"]/@href') - href3 = html.xpath('//a[@target="_self"]/@href') - href4 = html.xpath('//a[@class="AnchorLink VideoTile"]/@href') - href = href1 + href2 + href3 + href4 - href = title_tidy(href) - # quant = int(input(f'{len(href)} data detected. How many would you like to download:')) - # if quant > len(href) or quant < 1: - # print("Outnumber!") - # quit() - count = 0 - msg = '' - # save('', f'ABC_news_title_{get_time()}') - # save('', f'ABC_news_text_{get_time()}') - for i in range(30): - url = href[i] - sleep(0.1) # delete to speed up - res = requests.get(url) - html = etree.HTML(res.text) - title = html.xpath('//div[@data-testid="prism-headline"]/h1/text()') - if len(title) == 0: - print(f'Video or other news. Link: {url}') - continue - title = title[0] - author = html.xpath('//a[@data-testid="prism-linkbase"]/text()') - author = ', '.join(author) - text = html.xpath('//div[@data-testid="prism-article-body"]/p/text()') - text = '\n\n'.join(text) - text = text_tidy(text) - count += 1 - # save(f'Title: {title}\nLink: {url}\n\n', f'ABC_news_title_{get_time()}', 'a') # news title - # save(f'Title: {title}\n\nOrigin: {url}\n\nAuthor: {author}\n\n\n', f'ABC_news_text_{get_time()}', 'a') - # save(f'{text}' + '\n\n------------------------------\n\n', f'ABC_news_text_{get_time()}', 'a') - # print(f'Title: {title}. Link: {url}') - msg +=f'Title: {title}. Link: {url}\n' - # print(f'Files saved with {count} articles available.') - return msg + logger.info("开始获取ABC新闻") + try: + head = 'https://abcnews.go.com' # 移除末尾的斜杠 + response = safe_request(head) + if not response: + return "获取ABC新闻失败" + + html = etree.HTML(response.text) + href1 = html.xpath('//div[@class="HeadlinesTrio"]/a/@href') + href2 = html.xpath( + '//div[@class="title card"]/a[@class="AnchorLink"]/@href | //div[@class="title"]/a[@class="AnchorLink"]/@href') + href3 = html.xpath('//a[@target="_self"]/@href') + href4 = html.xpath('//a[@class="AnchorLink VideoTile"]/@href') + href = title_tidy(href1 + href2 + href3 + href4) + + msg = '' + count = 0 + + for url in href[:NEWS_LIMIT]: + try: + # 处理URL格式 + if url.startswith('http'): + full_url = url + elif url.startswith('//'): + full_url = 'https:' + url + else: + full_url = head + ('' if url.startswith('/') else '/') + url + + response = safe_request(full_url) + if not response: + continue + + html = etree.HTML(response.text) + title = html.xpath('//div[@data-testid="prism-headline"]/h1/text()') + + if not title: + logger.warning(f'跳过视频或其他类型新闻: {full_url}') + continue + + title = title[0] + msg += f'Title: {title}. Link: {full_url}\n' + count += 1 + sleep(0.1) + + except Exception as e: + logger.error(f"处理新闻失败: {full_url}, 错误: {str(e)}") + continue + + logger.info(f"ABC新闻获取完成,共获取{count}条") + return msg + + except Exception as e: + logger.error(f"获取ABC新闻失败: {str(e)}") + return "获取新闻失败,请查看日志了解详情" + def fox(): - head = 'https://www.foxnews.com/' - res = requests.get(head) - html = etree.HTML(res.text) - href = html.xpath('//h3[@class="title"]/a/@href') - href = title_tidy(href) - # quant = int(input(f'{len(href)} data detected. How many would you like to download:')) - # if quant > len(href) or quant < 1: - # print("Outnumber!") - # quit() - count = 0 - msg ='' - # save('', f'FOX_news_title_{get_time()}') - # save('', f'FOX_news_text_{get_time()}') - for i in range(30): - if href[i][0:4] != 'http': href[i] = 'https:' + href[i] - url = href[i] - sleep(0.1) # delete to speed up - res = requests.get(url) - html = etree.HTML(res.text) - title = html.xpath('//h1[@itemprop="headline"]/text()') - if len(title) == 0: - print(f'Video or other news. Link: {url}') - continue - title = title[0] - author = html.xpath('//a[@rel="author"]/strong/text()') - author = ', '.join(author) - text = html.xpath('//div[@itemprop="articleBody"]/p/text()') - text = '\n\n'.join(text) - text = text_tidy(text) - count += 1 - # save(f'Title: {title}\nLink: {url}\n\n', f'FOX_news_title_{get_time()}', 'a') # news title - # save(f'Title: {title}\n\nOrigin: {url}\n\nAuthor: {author}\n\n\n', f'FOX_news_text_{get_time()}', 'a') - # save(f'{text}' + '\n\n------------------------------\n\n', f'FOX_news_text_{get_time()}', 'a') - # print(f'Title: {title}. Link: {url}') - msg +=f'Title: {title}. Link: {url}\n' - # print(f'Files saved with {count} articles available.') - return msg + logger.info("开始获取FOX新闻") + try: + head = 'https://www.foxnews.com/' + response = safe_request(head) + if not response: + return "获取FOX新闻失败" + + html = etree.HTML(response.text) + href = html.xpath('//h3[@class="title"]/a/@href') + href = title_tidy(href) + + msg = '' + count = 0 + + for url in href[:NEWS_LIMIT]: + try: + if url[0:4] != 'http': + url = 'https:' + url + + response = safe_request(url) + if not response: + continue + + html = etree.HTML(response.text) + title = html.xpath('//h1[@itemprop="headline"]/text()') + + if not title: + logger.warning(f'跳过视频或其他类型新闻: {url}') + continue + + title = title[0] + msg += f'Title: {title}. Link: {url}\n' + count += 1 + sleep(0.1) + + except Exception as e: + logger.error(f"处理新闻失败: {url}, 错误: {str(e)}") + continue + + logger.info(f"FOX新闻获取完成,共获取{count}条") + return msg + + except Exception as e: + logger.error(f"获取FOX新闻失败: {str(e)}") + return "获取新闻失败,请查看日志了解详情" + def bbc(): - head = 'https://www.bbc.com' - res = requests.get(head + '/') - html = etree.HTML(res.text) - href = html.xpath('//h2[@data-testid="card-headline"]/../../../../../@href | //h2[@data-testid="card-headline"]/../../../../@href') - href = title_tidy(href) - # quant = int(input(f'{len(href)} data detected. How many would you like to download:')) - # if quant > len(href) or quant < 1: - # print("Outnumber!") - # quit() - count = 0 - msg ='' - # save('', f'BBC_news_title_{get_time()}') - # save('', f'BBC_news_text_{get_time()}') - for i in range(30): - if href[i][0:4] == 'http': continue - url = head + href[i] - sleep(0.1) # delete to speed up - print(url) - res = requests.get(url) - html = etree.HTML(res.text) - title = html.xpath('//div[@data-component="headline-block"]/h1/text()') - if len(title) == 0: - # print(f'Video or other news. Link: {url}') - continue - title = title[0] - # author = html.xpath('//div[@data-testid="byline"]/div/span[@data-testid="byline-name"]/text()') - # author = ', '.join(author) - # text = html.xpath('//div[@data-component="text-block"]/p/b/text() | //div[@data-component="text-block"]/p/text()') - # text = '\n\n'.join(text) - # text = text_tidy(text) - count += 1 - # save(f'Title: {title}\nLink: {url}\n\n', f'BBC_news_title_{get_time()}', 'a') # news title - # save(f'Title: {title}\n\nOrigin: {url}\n\nAuthor: {author}\n\n\n', f'BBC_news_text_{get_time()}', 'a') - # save(f'{text}' + '\n\n------------------------------\n\n', f'BBC_news_text_{get_time()}', 'a') - # print(f'Title: {title}. Link: {url}') + logger.info("开始获取BBC新闻") + try: + head = 'https://www.bbc.com' + response = safe_request(head + '/') + if not response: + return "获取BBC新闻失败" - msg +=f'Title: {title}. Link: {url}\n' - # print(f'Files saved with {count} articles available.') - return msg + html = etree.HTML(response.text) + href = html.xpath( + '//h2[@data-testid="card-headline"]/../../../../../@href | //h2[@data-testid="card-headline"]/../../../../@href') + href = title_tidy(href) -if __name__ == '__main__': - # Hello, World! :) - # news = input('Choose news site["nbc","cnn","abc","fox","bbc"]:').lower() - # if news == 'nbc': nbc() - # elif news == 'cnn': cnn() - # elif news == 'abc': abc() - # elif news == 'fox': fox() - # elif news == 'bbc': bbc() - # else: - # print('Oops! It seems a wrong input. Please retry...') - # sleep(2) - print(bbc()) + msg = '' + count = 0 + + for url in href[:NEWS_LIMIT]: + try: + if url[0:4] == 'http': + continue + + full_url = head + url + response = safe_request(full_url) + if not response: + continue + + html = etree.HTML(response.text) + title = html.xpath('//div[@data-component="headline-block"]/h1/text()') + + if not title: + logger.warning(f'跳过视频或其他类型新闻: {full_url}') + continue + + title = title[0] + msg += f'Title: {title}. Link: {full_url}\n' + count += 1 + sleep(0.1) + + except Exception as e: + logger.error(f"处理新闻失败: {full_url}, 错误: {str(e)}") + continue + + logger.info(f"BBC新闻获取完成,共获取{count}条") + return msg + + except Exception as e: + logger.error(f"获取BBC新闻失败: {str(e)}") + return "获取新闻失败,请查看日志了解详情" + + +def all_english_news(): + news_titles = "" + news_titles += nbc() + "\n" + news_titles += cnn() + "\n" + news_titles += abc() + "\n" + news_titles += fox() + "\n" + news_titles += bbc() + "\n" + markdown_news = dify_news_title_analyze(news_titles) + spath = convert_md_str_to_image(markdown_news, "news_output.png") + return spath diff --git a/base/func_news.py b/base/func_news.py index 97d0f08..088141f 100644 --- a/base/func_news.py +++ b/base/func_news.py @@ -11,6 +11,7 @@ import requests from lxml import etree from base import func_english_news +from utils.ai.dify_news_analyze import dify_news_title_analyze class News(object): @@ -70,10 +71,10 @@ class News(object): post = response.json() # 提取content列表 - 避免使用str作为变量名 content_list = post.get('data', {}).get('cards', []) - + if content_list and len(content_list) > 0: news_items = content_list[0].get('content', []) - + # 遍历列表,并格式化每个字典的title, url,然后添加到output字符串中 for index, article in enumerate(news_items, start=1): if isinstance(article, dict) and 'word' in article: @@ -81,34 +82,34 @@ class News(object): # url = article.get('url', '') # 使用f-string格式化字符串,并添加到output中 output += f"{index} :#{title}\n" - + # 输出最终的字符串 return output else: self.LOG.error(f"获取百度新闻失败,状态码: {response.status_code}") return "获取百度新闻失败,请稍后再试" - + except Exception as e: self.LOG.error(f"获取百度新闻时出错: {e}") return f"获取百度新闻时出错: {e}" - def get_eng_news(self,website): + def get_eng_news(self, website): if website == 'nbc': - return func_english_news.nbc() + return func_english_news.nbc() elif website == 'cnn': - return func_english_news.cnn() + return func_english_news.cnn() elif website == 'abc': return func_english_news.abc() elif website == 'fox': return func_english_news.fox() elif website == 'bbc': - return func_english_news.bbc() + return func_english_news.bbc() + if __name__ == "__main__": news = News() - print(news.get_baidu_news()) # # msg = "@水牛-分身 今日百度新闻" # # q = re.sub(r"@.*?[\u2005|\s]", "", msg).replace(" ", "") # # print(q) - # print(news.get_eng_news('nbc')) \ No newline at end of file + # print(news.get_eng_news('nbc')) diff --git a/plugins/global_news/__init__.py b/plugins/global_news/__init__.py new file mode 100644 index 0000000..02d5955 --- /dev/null +++ b/plugins/global_news/__init__.py @@ -0,0 +1,7 @@ +# 从当前包的main模块导入GlobalNewsPlugin类 +from .main import GlobalNewsPlugin + +# 提供get_plugin函数,返回插件实例 +def get_plugin(): + """获取插件实例""" + return GlobalNewsPlugin() \ No newline at end of file diff --git a/plugins/global_news/config.toml b/plugins/global_news/config.toml new file mode 100644 index 0000000..d56b34c --- /dev/null +++ b/plugins/global_news/config.toml @@ -0,0 +1,6 @@ +enable = true +command = ["全球新闻", "国际新闻", "环球新闻", "政经新闻", "政治经济新闻"] +command-format = """ +🌍全球新闻指令: +全球新闻 - 获取最新的全球政治经济新闻 +""" \ No newline at end of file diff --git a/plugins/global_news/main.py b/plugins/global_news/main.py new file mode 100644 index 0000000..e6efe51 --- /dev/null +++ b/plugins/global_news/main.py @@ -0,0 +1,191 @@ +import logging +import asyncio +import threading +import time # 添加这一行 +from typing import Dict, Any, List, Optional, Tuple + +from wcferry import Wcf + +from plugin_common.message_plugin_interface import MessagePluginInterface +from plugin_common.plugin_interface import PluginStatus +from utils.decorator.plugin_decorators import plugin_stats_decorator +from utils.robot_cmd.robot_command import Feature, PermissionStatus, GroupBotManager +from utils.decorator.points_decorator import plugin_points_cost +from utils.ai.dify_news_analyze import dify_news_title_analyze +from utils.markdown_to_image import convert_md_str_to_image + +# 导入新闻抓取函数 +from .news_crawler import nbc, cnn, abc, fox, bbc + + +class GlobalNewsPlugin(MessagePluginInterface): + """全球政治经济新闻插件""" + + @property + def name(self) -> str: + return "全球政治经济新闻" + + @property + def version(self) -> str: + return "1.0.0" + + @property + def description(self) -> str: + return "提供全球政治经济新闻,支持多个国际新闻源" + + @property + def author(self) -> str: + return "Trae AI" + + @property + def command_prefix(self) -> Optional[str]: + return "" # 不需要前缀,直接匹配命令 + + @property + def commands(self) -> List[str]: + return self._commands + + def __init__(self): + super().__init__() + self._news_tasks = {} # 存储正在进行的新闻抓取任务 + + def initialize(self, context: Dict[str, Any]) -> bool: + """初始化插件""" + self.LOG = logging.getLogger(f"Plugin.{self.name}") + self.LOG.info(f"正在初始化 {self.name} 插件...") + + # 保存上下文对象 + self.wcf = context.get("wcf") + self.event_system = context.get("event_system") + self.message_util = context.get("message_util") + + self._commands = self._config.get("GlobalNews", {}).get("command", ["全球新闻", "国际新闻", "环球新闻", "政经新闻"]) + self.command_format = self._config.get("GlobalNews", {}).get("command-format", "全球新闻 - 获取最新的全球政治经济新闻") + self.enable = self._config.get("GlobalNews", {}).get("enable", True) + + self.LOG.info(f"[{self.name}] 插件初始化完成,指令:{self._commands}") + return True + + def start(self) -> bool: + """启动插件""" + self.LOG.info(f"[{self.name}] 插件已启动") + self.status = PluginStatus.RUNNING + return True + + def stop(self) -> bool: + """停止插件""" + self.LOG.info(f"[{self.name}] 插件已停止") + self.status = PluginStatus.STOPPED + return True + + def can_process(self, message: Dict[str, Any]) -> bool: + """检查是否可以处理该消息""" + if not self.enable: + return False + + content = str(message.get("content", "")).strip() + command = content.split(" ")[0] + + return command in self._commands + + @plugin_stats_decorator(plugin_name="全球政治经济新闻") + @plugin_points_cost(5, "全球新闻消耗积分", Feature.NEWS) + def process_message(self, message: Dict[str, Any]) -> Tuple[bool, Optional[str]]: + """处理消息""" + content = str(message.get("content", "")).strip() + self.LOG.info(f"插件执行: {self.name}:{content}") + sender = message.get("sender") + roomid = message.get("roomid", "") + wcf: Wcf = message.get("wcf") + gbm: GroupBotManager = message.get("gbm") + + # 检查权限 + if roomid and gbm.get_group_permission(roomid, Feature.NEWS) == PermissionStatus.DISABLED: + return False, "没有权限" + + # 生成唯一任务ID + task_id = f"{sender}_{roomid}_{int(time.time())}" + + # 发送等待消息 + wcf.send_text("🌍正在获取全球新闻,请稍候...", + (roomid if roomid else sender), sender) + + # 启动异步任务 + self._start_news_task(task_id, sender, roomid, wcf) + + return True, "新闻获取任务已启动" + + def _start_news_task(self, task_id: str, sender: str, roomid: str, wcf: Wcf): + """启动异步新闻获取任务""" + thread = threading.Thread( + target=self._fetch_news_thread, + args=(task_id, sender, roomid, wcf) + ) + thread.daemon = True + thread.start() + self._news_tasks[task_id] = thread + self.LOG.info(f"启动新闻获取任务: {task_id}") + + def _fetch_news_thread(self, task_id: str, sender: str, roomid: str, wcf: Wcf): + """在单独的线程中运行异步新闻获取任务""" + try: + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + news_result = loop.run_until_complete(self._fetch_news_async()) + loop.close() + + # 处理结果 + if news_result: + # 发送新闻图片 + receiver = roomid if roomid else sender + wcf.send_image(news_result, receiver) + wcf.send_text("🌍全球新闻获取完成!", receiver, sender) + else: + wcf.send_text("❌获取新闻失败,请稍后再试", + (roomid if roomid else sender), sender) + except Exception as e: + self.LOG.error(f"新闻获取任务出错: {e}") + wcf.send_text(f"❌获取新闻出错: {str(e)}", + (roomid if roomid else sender), sender) + finally: + # 清理任务 + if task_id in self._news_tasks: + del self._news_tasks[task_id] + + async def _fetch_news_async(self) -> str: + """异步获取所有新闻源的新闻""" + try: + # 创建所有新闻源的任务 + tasks = [ + self._run_in_executor(nbc), + self._run_in_executor(cnn), + self._run_in_executor(abc), + self._run_in_executor(fox), + self._run_in_executor(bbc) + ] + + # 并行执行所有任务 + results = await asyncio.gather(*tasks) + + # 合并结果 + news_titles = "\n".join(results) + + # 使用AI分析新闻 + markdown_news = await self._run_in_executor( + dify_news_title_analyze, news_titles + ) + + # 转换为图片 + image_path = await self._run_in_executor( + convert_md_str_to_image, markdown_news, "news_output.png" + ) + + return image_path + except Exception as e: + self.LOG.error(f"异步获取新闻失败: {e}") + return "" + + async def _run_in_executor(self, func, *args): + """在线程池中运行同步函数""" + loop = asyncio.get_event_loop() + return await loop.run_in_executor(None, func, *args) \ No newline at end of file diff --git a/plugins/global_news/news_crawler.py b/plugins/global_news/news_crawler.py new file mode 100644 index 0000000..af9863e --- /dev/null +++ b/plugins/global_news/news_crawler.py @@ -0,0 +1,307 @@ +# -*- coding: utf-8 -*- +""" +Program: Global News Crawler +Author: Trae AI (based on MrCrawL's work) +Created Date: 2024-05-01 +""" +import requests +from time import localtime, sleep +from lxml import etree +import logging +from datetime import datetime +import time + +# 配置日志 +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(levelname)s - %(message)s', + handlers=[ + logging.FileHandler(f'global_news_{datetime.now().strftime("%Y%m%d")}.log'), + logging.StreamHandler() + ] +) +logger = logging.getLogger(__name__) + +# 请求配置 +HEADERS = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' +} +TIMEOUT = 10 +MAX_RETRIES = 3 +NEWS_LIMIT = 30 + + +def get_time(): + date_ = f'{str(localtime().tm_year).zfill(4)}-{str(localtime().tm_mon).zfill(2)}-{str(localtime().tm_mday).zfill(2)}' + return date_ + + +def title_tidy(title_list): + t_index = [] + for i in range(1, len(title_list)): + if title_list[i] == title_list[i - 1]: t_index.append(i) + t_index.reverse() + for i in range(len(t_index)): title_list.pop(t_index[i]) + return title_list + + +def safe_request(url, retry_count=0): + """安全的请求方法,包含重试机制""" + try: + response = requests.get(url, headers=HEADERS, timeout=TIMEOUT) + response.raise_for_status() + return response + except requests.RequestException as e: + if retry_count < MAX_RETRIES: + logger.warning(f"请求失败,正在进行第{retry_count + 1}次重试: {url}") + sleep(1) + return safe_request(url, retry_count + 1) + else: + logger.error(f"请求失败: {url}, 错误: {str(e)}") + return None + + +def nbc(): + logger.info("开始获取NBC新闻") + try: + url = 'https://www.nbcnews.com/' + response = safe_request(url) + if not response: + return "获取NBC新闻失败" + + html = etree.HTML(response.text) + href = html.xpath('//h2/a/@href') + href = title_tidy(href) + + msg = '' + count = 0 + + for url in href[:NEWS_LIMIT]: + try: + response = safe_request(url) + if not response: + continue + + html = etree.HTML(response.text) + title = html.xpath('//h1/text()') + + if not title: + logger.warning(f'跳过视频或其他类型新闻: {url}') + continue + + title = title[0] + msg += f'Title: {title}. Link: {url}\n' + count += 1 + sleep(0.1) + + except Exception as e: + logger.error(f"处理新闻失败: {url}, 错误: {str(e)}") + continue + + logger.info(f"NBC新闻获取完成,共获取{count}条") + return msg + + except Exception as e: + logger.error(f"获取NBC新闻失败: {str(e)}") + return "获取新闻失败,请查看日志了解详情" + + +def cnn(): + logger.info("开始获取CNN新闻") + try: + head = 'https://www.cnn.com' + response = safe_request(head + '/') + if not response: + return "获取CNN新闻失败" + + html = etree.HTML(response.text) + href = html.xpath('//a[@data-link-type="article"]/@href') + href = title_tidy(href) + + msg = '' + count = 0 + + for url in href[:NEWS_LIMIT]: + try: + full_url = head + url + response = safe_request(full_url) + if not response: + continue + + html = etree.HTML(response.text) + title = html.xpath('//h1[@data-editable="headlineText"]/text()') + + if not title: + logger.warning(f'跳过视频或其他类型新闻: {full_url}') + continue + + title = title[0].strip() + msg += f'Title: {title}. Link: {full_url}\n' + count += 1 + sleep(0.1) + + except Exception as e: + logger.error(f"处理新闻失败: {full_url}, 错误: {str(e)}") + continue + + logger.info(f"CNN新闻获取完成,共获取{count}条") + return msg + + except Exception as e: + logger.error(f"获取CNN新闻失败: {str(e)}") + return "获取新闻失败,请查看日志了解详情" + + +def abc(): + logger.info("开始获取ABC新闻") + try: + head = 'https://abcnews.go.com' # 移除末尾的斜杠 + response = safe_request(head) + if not response: + return "获取ABC新闻失败" + + html = etree.HTML(response.text) + href1 = html.xpath('//div[@class="HeadlinesTrio"]/a/@href') + href2 = html.xpath( + '//div[@class="title card"]/a[@class="AnchorLink"]/@href | //div[@class="title"]/a[@class="AnchorLink"]/@href') + href3 = html.xpath('//a[@target="_self"]/@href') + href4 = html.xpath('//a[@class="AnchorLink VideoTile"]/@href') + href = title_tidy(href1 + href2 + href3 + href4) + + msg = '' + count = 0 + + for url in href[:NEWS_LIMIT]: + try: + # 处理URL格式 + if url.startswith('http'): + full_url = url + elif url.startswith('//'): + full_url = 'https:' + url + else: + full_url = head + ('' if url.startswith('/') else '/') + url + + response = safe_request(full_url) + if not response: + continue + + html = etree.HTML(response.text) + title = html.xpath('//div[@data-testid="prism-headline"]/h1/text()') + + if not title: + logger.warning(f'跳过视频或其他类型新闻: {full_url}') + continue + + title = title[0] + msg += f'Title: {title}. Link: {full_url}\n' + count += 1 + sleep(0.1) + + except Exception as e: + logger.error(f"处理新闻失败: {full_url}, 错误: {str(e)}") + continue + + logger.info(f"ABC新闻获取完成,共获取{count}条") + return msg + + except Exception as e: + logger.error(f"获取ABC新闻失败: {str(e)}") + return "获取新闻失败,请查看日志了解详情" + + +def fox(): + logger.info("开始获取FOX新闻") + try: + head = 'https://www.foxnews.com/' + response = safe_request(head) + if not response: + return "获取FOX新闻失败" + + html = etree.HTML(response.text) + href = html.xpath('//h3[@class="title"]/a/@href') + href = title_tidy(href) + + msg = '' + count = 0 + + for url in href[:NEWS_LIMIT]: + try: + if url[0:4] != 'http': + url = 'https:' + url + + response = safe_request(url) + if not response: + continue + + html = etree.HTML(response.text) + title = html.xpath('//h1[@itemprop="headline"]/text()') + + if not title: + logger.warning(f'跳过视频或其他类型新闻: {url}') + continue + + title = title[0] + msg += f'Title: {title}. Link: {url}\n' + count += 1 + sleep(0.1) + + except Exception as e: + logger.error(f"处理新闻失败: {url}, 错误: {str(e)}") + continue + + logger.info(f"FOX新闻获取完成,共获取{count}条") + return msg + + except Exception as e: + logger.error(f"获取FOX新闻失败: {str(e)}") + return "获取新闻失败,请查看日志了解详情" + + +def bbc(): + logger.info("开始获取BBC新闻") + try: + head = 'https://www.bbc.com' + response = safe_request(head + '/') + if not response: + return "获取BBC新闻失败" + + html = etree.HTML(response.text) + href = html.xpath( + '//h2[@data-testid="card-headline"]/../../../../../@href | //h2[@data-testid="card-headline"]/../../../../@href') + href = title_tidy(href) + + msg = '' + count = 0 + + for url in href[:NEWS_LIMIT]: + try: + if url[0:4] == 'http': + continue + + full_url = head + url + response = safe_request(full_url) + if not response: + continue + + html = etree.HTML(response.text) + title = html.xpath('//div[@data-component="headline-block"]/h1/text()') + + if not title: + logger.warning(f'跳过视频或其他类型新闻: {full_url}') + continue + + title = title[0] + msg += f'Title: {title}. Link: {full_url}\n' + count += 1 + sleep(0.1) + + except Exception as e: + logger.error(f"处理新闻失败: {full_url}, 错误: {str(e)}") + continue + + logger.info(f"BBC新闻获取完成,共获取{count}条") + return msg + + except Exception as e: + logger.error(f"获取BBC新闻失败: {str(e)}") + return "获取新闻失败,请查看日志了解详情" \ No newline at end of file diff --git a/utils/ai/dify_news_analyze.py b/utils/ai/dify_news_analyze.py new file mode 100644 index 0000000..07122e3 --- /dev/null +++ b/utils/ai/dify_news_analyze.py @@ -0,0 +1,73 @@ +# +# curl -X POST 'http://192.168.2.240/v1/chat-messages' \ +# --header 'Authorization: Bearer {api_key}' \ +# --header 'Content-Type: application/json' \ +# --data-raw '{ +# "inputs": {}, +# "query": "What are the specs of the iPhone 13 Pro Max?", +# "response_mode": "streaming", +# "conversation_id": "", +# "user": "abc-123", +# "files": [ +# { +# "type": "image", +# "transfer_method": "remote_url", +# "url": "https://cloud.dify.ai/logo/logo-site.png" +# } +# ] +# }' +import json + +import requests + + +def dify_news_title_analyze(content): + # 设置Authorization和URL + authorization = "Bearer app-rhhKkbvHd2IAQoGX7xTzXZJj" # 请替换为真实的Authorization token + url = 'http://192.168.2.240/v1/chat-messages' + + data = { + "response_mode": "blocking", + "conversation_id": "", + "inputs": {}, + "query": content, + "user": "a-bot" + } + + # 设置请求头 + headers = { + "Content-Type": "application/json; charset=utf-8", + "Authorization": authorization + } + + # 发送POST请求 + response = requests.post(url, headers=headers, data=json.dumps(data), ) + response.encoding = 'utf-8' + + # 输出响应内容 + print(response.status_code) + print(response.json()) + return extract_content(response.json()) + + +def extract_content(data): + """解析API响应内容 + Args: + data: API返回的响应数据,可以是字典或字符串 + Returns: + str: 提取的answer内容 + """ + try: + # 如果是字符串,尝试解析为字典 + if isinstance(data, str): + data = json.dumps(data) + # 如果是字典,直接获取answer + if isinstance(data, dict): + answer = data.get('answer', '') + if answer: + return answer + + return None + except Exception as e: + print(f"解析响应失败: {str(e)}") + return None diff --git a/utils/robot_cmd/robot_command.py b/utils/robot_cmd/robot_command.py index 8f6d64e..0d13b62 100644 --- a/utils/robot_cmd/robot_command.py +++ b/utils/robot_cmd/robot_command.py @@ -43,7 +43,8 @@ class Feature(Enum): GROUP_ADD = 16, "加群提醒功能" DOUYIN_PARSER = 17, "抖音链接转视频功能" GROUP_MEMBER_CHANGE = 18, "群成员变更提醒功能" - KID_PHOTO_EXTRACT =19, "儿童照片提取转发功能" # 小朋友照片提取功能 + KID_PHOTO_EXTRACT = 19, "儿童照片提取转发功能" # 小朋友照片提取功能 + NEWS = 20, "全球政治经济新闻" def __new__(cls, value, description): obj = object.__new__(cls) @@ -240,11 +241,11 @@ class GroupBotManager: str: 格式化的已启用功能列表字符串 """ enabled_features = [] - + # 检查群是否在列表中 if group_id not in GroupBotManager.local_cache["group_list"]: return "该群未启用机器人功能" - + # 遍历所有功能,检查哪些已启用且包含指令 for feature in Feature: status = GroupBotManager.get_group_permission(group_id, feature) @@ -255,18 +256,18 @@ class GroupBotManager: "name": feature.name, "description": feature.description }) - + # 如果没有启用任何带指令的功能 if not enabled_features: return "该群未启用任何带指令的功能" - + # 构建格式化的字符串 result = f"群功能菜单:\n" for feature in enabled_features: result += f"{feature['id']}.{feature['description']}\n" - + return result - + @staticmethod def get_group_list(): """返回所有启用了群机器人的群组清单,格式为集合"""