feature:全球政治经济新闻

2025-04-14 16:02:00 +08:00
parent bfdb1831d3
commit 8afd0f49d0
8 changed files with 885 additions and 220 deletions
--- a/base/func_english_news.py
+++ b/base/func_english_news.py
@@ -6,11 +6,35 @@ Created Date: 2024-01-21
 Last Modified: 2024-03-24
 Modified by: MrCrawL
 """
+from utils.ai.dify_news_analyze import dify_news_title_analyze
+from utils.markdown_to_image import convert_md_str_to_image
+
 '''Existing problem: text with hyperlink won't be saved'''

 import requests
 from time import localtime, sleep
 from lxml import etree
+import logging
+from datetime import datetime
+
+# Configure logging: INFO level, written to a date-stamped log file and to the console.
+# NOTE(review): this runs at import time, so the log file name is computed once,
+# when the module is first imported - confirm that suits long-running processes.
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s',
+    handlers=[
+        logging.FileHandler(f'news_crawler_{datetime.now().strftime("%Y%m%d")}.log'),
+        logging.StreamHandler()
+    ]
+)
+logger = logging.getLogger(__name__)
+
+# Request settings shared by safe_request and the per-site scrapers
+HEADERS = {
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+}
+TIMEOUT = 10  # passed as requests.get(timeout=...)
+MAX_RETRIES = 3  # retries allowed after a failed request
+NEWS_LIMIT = 30  # maximum number of home-page links followed per site


 def get_time():
@@ -18,7 +42,6 @@ def get_time():
    return date_


-# delete duplicated
 def title_tidy(title_list):
    t_index = []
    for i in range(1, len(title_list)):
@@ -28,7 +51,6 @@ def title_tidy(title_list):
    return title_list


-# tidy text, seems a little bit redundant
 def text_tidy(p_text):
    text_ = p_text.replace('’', "'")
    text_ = text_.replace(' \n\n', ' ')
@@ -46,218 +68,275 @@ def text_tidy(p_text):
    return text_


-def save(text, file_name, mode='w', encoding='utf-8'):
-    with open(f'{file_name}.txt', mode, encoding=encoding) as f: f.write(text)
+def safe_request(url, retry_count=0):
+    """Fetch ``url`` with a GET request, retrying when the request fails.
+
+    The request is sent with the module-level ``HEADERS`` and ``TIMEOUT``.
+    Any ``requests.RequestException`` (including the HTTP error raised by
+    ``raise_for_status``) counts as a failure. After a failure the function
+    waits one second and tries again, until the attempt counter has reached
+    ``MAX_RETRIES``.
+
+    Args:
+        url: Address to fetch.
+        retry_count: Number of retries already used; callers normally leave
+            this at its default of 0.
+
+    Returns:
+        The ``requests.Response`` on success, or ``None`` once the retries
+        are exhausted.
+    """
+    attempt = retry_count
+    while True:
+        try:
+            reply = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
+            reply.raise_for_status()
+        except requests.RequestException as err:
+            if attempt >= MAX_RETRIES:
+                # Out of retries: report the last error and give up.
+                logger.error(f"请求失败: {url}, 错误: {str(err)}")
+                return None
+            logger.warning(f"请求失败，正在进行第{attempt + 1}次重试: {url}")
+            sleep(1)
+            attempt += 1
+        else:
+            return reply


 def nbc():
-    url = 'https://www.nbcnews.com/'
-    res = requests.get(url)
-    html = etree.HTML(res.text)
-    href = html.xpath('//h2/a/@href')
-    href = title_tidy(href)
-    # quant = int(input(f'There are {len(href)} pieces detected. How many would you download:'))
-    # if quant > len(href) or quant < 1:
-    #     print("Outnumber!")
-    #     quit()
-    count = 0
-    # save('', f'NBC_news_title_{get_time()}')
-    # save('', f'NBC_news_text_{get_time()}')
-    msg =''
-    for i in range(30):
-        url = href[i]
-        sleep(0.1)  # delete to speed up
-        res = requests.get(url)
-        html = etree.HTML(res.text)
-        title = html.xpath('//h1/text()')
-        if len(title) == 0:
-            print(f'Video or other news. Link: {url}')
-            continue
-        title = title[0]
-        author = html.xpath('//span[@class="byline-name"]/a/text() | //span[@class="byline-name" and not(a)]/text()')
-        author = ', '.join(author)
-        text = html.xpath('//p[@class=""]/text()')
-        text = '\n\n'.join(text)
-        text = text_tidy(text)
-        count += 1
-        # save(f'Title: {title}\nLink: {url}\n\n', f'NBC_news_title_{get_time()}', 'a')  # news title
-        # save(f'Title: {title}\n\nOrigin: {url}\n\nAuthor: {author}\n\n\n', f'NBC_news_text_{get_time()}', 'a')
-        # save(f'{text}' + '\n\n------------------------------\n\n', f'NBC_news_text_{get_time()}', 'a')
-        # print(f'Title: {title}. Link: {href[i]}.')
-        msg += f'Title: {title}. Link: {href[i]}.\n'
-    return msg
+    """Collect article titles and links from the NBC News home page.
+
+    The home page is fetched with ``safe_request``, every ``//h2/a`` href is
+    read as an article link, the list is passed through ``title_tidy`` and
+    at most ``NEWS_LIMIT`` links are visited. A page whose ``<h1>`` yields
+    no text is logged and skipped.
+
+    Returns:
+        One ``Title: ... Link: ...`` line per collected article (possibly an
+        empty string), or a failure message when the home page cannot be
+        fetched or an unexpected error aborts the run.
+    """
+    logger.info("开始获取NBC新闻")
+    try:
+        front_page = safe_request('https://www.nbcnews.com/')
+        if not front_page:
+            return "获取NBC新闻失败"
+
+        links = title_tidy(etree.HTML(front_page.text).xpath('//h2/a/@href'))
+        entries = []
+
+        for link in links[:NEWS_LIMIT]:
+            try:
+                page = safe_request(link)
+                if not page:
+                    continue
+
+                headings = etree.HTML(page.text).xpath('//h1/text()')
+                if not headings:
+                    logger.warning(f'跳过视频或其他类型新闻: {link}')
+                    continue
+
+                entries.append(f'Title: {headings[0]}. Link: {link}\n')
+                # Brief pause after each collected article before the next request.
+                sleep(0.1)
+
+            except Exception as e:
+                logger.error(f"处理新闻失败: {link}, 错误: {str(e)}")
+                continue
+
+        logger.info(f"NBC新闻获取完成，共获取{len(entries)}条")
+        return ''.join(entries)
+
+    except Exception as e:
+        logger.error(f"获取NBC新闻失败: {str(e)}")
+        return "获取新闻失败，请查看日志了解详情"


 def cnn():
-    head = 'https://www.cnn.com'
-    res = requests.get(head + '/')
-    html = etree.HTML(res.text)
-    href = html.xpath('//a[@data-link-type="article"]/@href')
-    href = title_tidy(href)
-    # quant = int(input(f'{len(href)} data detected. How many would you like to download:'))
-    # if quant > len(href) or quant < 1:
-    #     print("Outnumber!")
-    #     quit()
-    count = 0
-    msg = ''
-    # save('', f'CNN_news_title_{get_time()}')
-    # save('', f'CNN_news_text_{get_time()}')
-    for i in range(30):
-        url = head + href[i]
-        sleep(0.1)  # delete to speed up
-        res = requests.get(url)
-        html = etree.HTML(res.text)
-        title = html.xpath('//h1[@data-editable="headlineText"]/text()')
-        if len(title) == 0:
-            print(f'Video or other news. Link: {url}')
-            continue
-        title = title[0].strip()
-        author = html.xpath('//span[@class="byline__name"]/text()')
-        author = ', '.join(author)
-        text = html.xpath('//p[@class="paragraph inline-placeholder"]/text()')
-        for k in range(len(text)): text[k].strip()
-        text = ''.join(text)
-        text = text_tidy(text)
-        count += 1
-        # save(f'Title: {title}\nLink: {url}\n\n', f'CNN_news_title_{get_time()}', 'a')  # news title
-        # save(f'Title: {title}\n\nOrigin: {url}\n\nAuthor: {author}\n\n\n', f'CNN_news_text_{get_time()}', 'a')
-        # save(f'{text}' + '\n\n------------------------------\n\n', f'CNN_news_text_{get_time()}', 'a')
-        # print(f'Title: {title}. Link: {url}')
-        msg +=f'Title: {title}. Link: {url}\n'
-    # print(f'Files saved with {count} articles available.')
-    return msg
+    """Collect article titles and links from the CNN home page.
+
+    The home page is fetched with ``safe_request``; hrefs of anchors marked
+    ``data-link-type="article"`` are passed through ``title_tidy`` and at
+    most ``NEWS_LIMIT`` of them are visited. Each href is resolved against
+    the site root, so root-relative, path-relative and already-absolute
+    links all produce a valid address. A page without a
+    ``headlineText`` ``<h1>`` is logged and skipped.
+
+    Returns:
+        One ``Title: ... Link: ...`` line per collected article (possibly an
+        empty string), or a failure message when the home page cannot be
+        fetched or an unexpected error aborts the run.
+    """
+    # Imported locally so the function carries its own dependency.
+    from urllib.parse import urljoin
+
+    logger.info("开始获取CNN新闻")
+    try:
+        head = 'https://www.cnn.com'
+        response = safe_request(head + '/')
+        if not response:
+            return "获取CNN新闻失败"
+
+        html = etree.HTML(response.text)
+        href = html.xpath('//a[@data-link-type="article"]/@href')
+        href = title_tidy(href)
+
+        msg = ''
+        count = 0
+
+        for url in href[:NEWS_LIMIT]:
+            # Start from the raw href so the error log below always names the
+            # link being processed, even if resolving it fails.
+            full_url = url
+            try:
+                # Plain concatenation breaks on absolute hrefs; urljoin leaves
+                # them untouched and resolves relative ones against the root.
+                full_url = urljoin(head + '/', url)
+                response = safe_request(full_url)
+                if not response:
+                    continue
+
+                html = etree.HTML(response.text)
+                title = html.xpath('//h1[@data-editable="headlineText"]/text()')
+
+                if not title:
+                    logger.warning(f'跳过视频或其他类型新闻: {full_url}')
+                    continue
+
+                title = title[0].strip()
+                msg += f'Title: {title}. Link: {full_url}\n'
+                count += 1
+                # Brief pause after each collected article before the next request.
+                sleep(0.1)
+
+            except Exception as e:
+                logger.error(f"处理新闻失败: {full_url}, 错误: {str(e)}")
+                continue
+
+        logger.info(f"CNN新闻获取完成，共获取{count}条")
+        return msg
+
+    except Exception as e:
+        logger.error(f"获取CNN新闻失败: {str(e)}")
+        return "获取新闻失败，请查看日志了解详情"
+

 def abc():
-    head = 'https://abcnews.go.com/'
-    res = requests.get(head)
-    html = etree.HTML(res.text)
-    href1 = html.xpath('//div[@class="HeadlinesTrio"]/a/@href')
-    href2 = html.xpath('//div[@class="title card"]/a[@class="AnchorLink"]/@href | //div[@class="title"]/a[@class="AnchorLink"]/@href')
-    href3 = html.xpath('//a[@target="_self"]/@href')
-    href4 = html.xpath('//a[@class="AnchorLink VideoTile"]/@href')
-    href = href1 + href2 + href3 + href4
-    href = title_tidy(href)
-    # quant = int(input(f'{len(href)} data detected. How many would you like to download:'))
-    # if quant > len(href) or quant < 1:
-    #     print("Outnumber!")
-    #     quit()
-    count = 0
-    msg = ''
-    # save('', f'ABC_news_title_{get_time()}')
-    # save('', f'ABC_news_text_{get_time()}')
-    for i in range(30):
-        url = href[i]
-        sleep(0.1)  # delete to speed up
-        res = requests.get(url)
-        html = etree.HTML(res.text)
-        title = html.xpath('//div[@data-testid="prism-headline"]/h1/text()')
-        if len(title) == 0:
-            print(f'Video or other news. Link: {url}')
-            continue
-        title = title[0]
-        author = html.xpath('//a[@data-testid="prism-linkbase"]/text()')
-        author = ', '.join(author)
-        text = html.xpath('//div[@data-testid="prism-article-body"]/p/text()')
-        text = '\n\n'.join(text)
-        text = text_tidy(text)
-        count += 1
-        # save(f'Title: {title}\nLink: {url}\n\n', f'ABC_news_title_{get_time()}', 'a')  # news title
-        # save(f'Title: {title}\n\nOrigin: {url}\n\nAuthor: {author}\n\n\n', f'ABC_news_text_{get_time()}', 'a')
-        # save(f'{text}' + '\n\n------------------------------\n\n', f'ABC_news_text_{get_time()}', 'a')
-        # print(f'Title: {title}. Link: {url}')
-        msg +=f'Title: {title}. Link: {url}\n'
-    # print(f'Files saved with {count} articles available.')
-    return msg
+    """Collect article titles and links from the ABC News home page.
+
+    Links are gathered from four XPath queries over the home page, passed
+    through ``title_tidy`` and limited to ``NEWS_LIMIT``. Each href is turned
+    into an absolute URL before the article page is fetched. A page without
+    a ``prism-headline`` ``<h1>`` is logged and skipped.
+
+    Returns:
+        One ``Title: ... Link: ...`` line per collected article (possibly an
+        empty string), or a failure message when the home page cannot be
+        fetched or an unexpected error aborts the run.
+    """
+    logger.info("开始获取ABC新闻")
+    try:
+        head = 'https://abcnews.go.com'  # no trailing slash; one is added when joining below
+        front_page = safe_request(head)
+        if not front_page:
+            return "获取ABC新闻失败"
+
+        tree = etree.HTML(front_page.text)
+        link_queries = (
+            '//div[@class="HeadlinesTrio"]/a/@href',
+            '//div[@class="title card"]/a[@class="AnchorLink"]/@href | //div[@class="title"]/a[@class="AnchorLink"]/@href',
+            '//a[@target="_self"]/@href',
+            '//a[@class="AnchorLink VideoTile"]/@href',
+        )
+        gathered = []
+        for query in link_queries:
+            gathered += tree.xpath(query)
+        links = title_tidy(gathered)
+
+        entries = []
+
+        for link in links[:NEWS_LIMIT]:
+            try:
+                # Normalise the href into an absolute URL.
+                if link.startswith('http'):
+                    full_url = link
+                elif link.startswith('//'):
+                    full_url = 'https:' + link
+                elif link.startswith('/'):
+                    full_url = head + link
+                else:
+                    full_url = head + '/' + link
+
+                page = safe_request(full_url)
+                if not page:
+                    continue
+
+                headings = etree.HTML(page.text).xpath('//div[@data-testid="prism-headline"]/h1/text()')
+                if not headings:
+                    logger.warning(f'跳过视频或其他类型新闻: {full_url}')
+                    continue
+
+                entries.append(f'Title: {headings[0]}. Link: {full_url}\n')
+                # Brief pause after each collected article before the next request.
+                sleep(0.1)
+
+            except Exception as e:
+                logger.error(f"处理新闻失败: {full_url}, 错误: {str(e)}")
+                continue
+
+        logger.info(f"ABC新闻获取完成，共获取{len(entries)}条")
+        return ''.join(entries)
+
+    except Exception as e:
+        logger.error(f"获取ABC新闻失败: {str(e)}")
+        return "获取新闻失败，请查看日志了解详情"
+

 def fox():
-    head = 'https://www.foxnews.com/'
-    res = requests.get(head)
-    html = etree.HTML(res.text)
-    href = html.xpath('//h3[@class="title"]/a/@href')
-    href = title_tidy(href)
-    # quant = int(input(f'{len(href)} data detected. How many would you like to download:'))
-    # if quant > len(href) or quant < 1:
-    #     print("Outnumber!")
-    #     quit()
-    count = 0
-    msg =''
-    # save('', f'FOX_news_title_{get_time()}')
-    # save('', f'FOX_news_text_{get_time()}')
-    for i in range(30):
-        if href[i][0:4] != 'http': href[i] = 'https:' + href[i]
-        url = href[i]
-        sleep(0.1)  # delete to speed up
-        res = requests.get(url)
-        html = etree.HTML(res.text)
-        title = html.xpath('//h1[@itemprop="headline"]/text()')
-        if len(title) == 0:
-            print(f'Video or other news. Link: {url}')
-            continue
-        title = title[0]
-        author = html.xpath('//a[@rel="author"]/strong/text()')
-        author = ', '.join(author)
-        text = html.xpath('//div[@itemprop="articleBody"]/p/text()')
-        text = '\n\n'.join(text)
-        text = text_tidy(text)
-        count += 1
-        # save(f'Title: {title}\nLink: {url}\n\n', f'FOX_news_title_{get_time()}', 'a')  # news title
-        # save(f'Title: {title}\n\nOrigin: {url}\n\nAuthor: {author}\n\n\n', f'FOX_news_text_{get_time()}', 'a')
-        # save(f'{text}' + '\n\n------------------------------\n\n', f'FOX_news_text_{get_time()}', 'a')
-        # print(f'Title: {title}. Link: {url}')
-        msg +=f'Title: {title}. Link: {url}\n'
-    # print(f'Files saved with {count} articles available.')
-    return msg
+    """Collect article titles and links from the Fox News home page.
+
+    The home page is fetched with ``safe_request``; hrefs under
+    ``<h3 class="title">`` are passed through ``title_tidy`` and at most
+    ``NEWS_LIMIT`` of them are visited. Each href is resolved against the
+    site root, so absolute, protocol-relative (``//host/path``) and
+    root-relative (``/path``) links all produce a valid address. A page
+    without an ``itemprop="headline"`` ``<h1>`` is logged and skipped.
+
+    Returns:
+        One ``Title: ... Link: ...`` line per collected article (possibly an
+        empty string), or a failure message when the home page cannot be
+        fetched or an unexpected error aborts the run.
+    """
+    # Imported locally so the function carries its own dependency.
+    from urllib.parse import urljoin
+
+    logger.info("开始获取FOX新闻")
+    try:
+        head = 'https://www.foxnews.com/'
+        response = safe_request(head)
+        if not response:
+            return "获取FOX新闻失败"
+
+        html = etree.HTML(response.text)
+        href = html.xpath('//h3[@class="title"]/a/@href')
+        href = title_tidy(href)
+
+        msg = ''
+        count = 0
+
+        for url in href[:NEWS_LIMIT]:
+            # Start from the raw href so the error log below always names the
+            # link being processed, even if resolving it fails.
+            full_url = url
+            try:
+                # Prefixing 'https:' only suits protocol-relative hrefs; a
+                # root-relative '/path' would become the invalid 'https:/path'.
+                # urljoin handles both and leaves absolute URLs unchanged.
+                full_url = urljoin(head, url)
+
+                response = safe_request(full_url)
+                if not response:
+                    continue
+
+                html = etree.HTML(response.text)
+                title = html.xpath('//h1[@itemprop="headline"]/text()')
+
+                if not title:
+                    logger.warning(f'跳过视频或其他类型新闻: {full_url}')
+                    continue
+
+                title = title[0]
+                msg += f'Title: {title}. Link: {full_url}\n'
+                count += 1
+                # Brief pause after each collected article before the next request.
+                sleep(0.1)
+
+            except Exception as e:
+                logger.error(f"处理新闻失败: {full_url}, 错误: {str(e)}")
+                continue
+
+        logger.info(f"FOX新闻获取完成，共获取{count}条")
+        return msg
+
+    except Exception as e:
+        logger.error(f"获取FOX新闻失败: {str(e)}")
+        return "获取新闻失败，请查看日志了解详情"
+

 def bbc():
-    head = 'https://www.bbc.com'
-    res = requests.get(head + '/')
-    html = etree.HTML(res.text)
-    href = html.xpath('//h2[@data-testid="card-headline"]/../../../../../@href | //h2[@data-testid="card-headline"]/../../../../@href')
-    href = title_tidy(href)
-    # quant = int(input(f'{len(href)} data detected. How many would you like to download:'))
-    # if quant > len(href) or quant < 1:
-    #     print("Outnumber!")
-    #     quit()
-    count = 0
-    msg =''
-    # save('', f'BBC_news_title_{get_time()}')
-    # save('', f'BBC_news_text_{get_time()}')
-    for i in range(30):
-        if href[i][0:4] == 'http': continue
-        url = head + href[i]
-        sleep(0.1)  # delete to speed up
-        print(url)
-        res = requests.get(url)
-        html = etree.HTML(res.text)
-        title = html.xpath('//div[@data-component="headline-block"]/h1/text()')
-        if len(title) == 0:
-            # print(f'Video or other news. Link: {url}')
-            continue
-        title = title[0]
-        # author = html.xpath('//div[@data-testid="byline"]/div/span[@data-testid="byline-name"]/text()')
-        # author = ', '.join(author)
-        # text = html.xpath('//div[@data-component="text-block"]/p/b/text() | //div[@data-component="text-block"]/p/text()')
-        # text = '\n\n'.join(text)
-        # text = text_tidy(text)
-        count += 1
-        # save(f'Title: {title}\nLink: {url}\n\n', f'BBC_news_title_{get_time()}', 'a')  # news title
-        # save(f'Title: {title}\n\nOrigin: {url}\n\nAuthor: {author}\n\n\n', f'BBC_news_text_{get_time()}', 'a')
-        # save(f'{text}' + '\n\n------------------------------\n\n', f'BBC_news_text_{get_time()}', 'a')
-        # print(f'Title: {title}. Link: {url}')
+    """Collect article titles and links from the BBC home page.
+
+    Hrefs are read from ancestors of the ``card-headline`` ``<h2>`` elements,
+    passed through ``title_tidy`` and limited to ``NEWS_LIMIT``. Absolute
+    links are skipped; the remaining hrefs are appended to the site root.
+    A page without a ``headline-block`` ``<h1>`` is logged and skipped.
+
+    Returns:
+        One ``Title: ... Link: ...`` line per collected article (possibly an
+        empty string), or a failure message when the home page cannot be
+        fetched or an unexpected error aborts the run.
+    """
+    logger.info("开始获取BBC新闻")
+    try:
+        head = 'https://www.bbc.com'
+        response = safe_request(head + '/')
+        if not response:
+            return "获取BBC新闻失败"

-        msg +=f'Title: {title}. Link: {url}\n'
-    # print(f'Files saved with {count} articles available.')
-    return  msg
+        html = etree.HTML(response.text)
+        # The href sits on an ancestor of the headline <h2>, four or five levels up.
+        href = html.xpath(
+            '//h2[@data-testid="card-headline"]/../../../../../@href | //h2[@data-testid="card-headline"]/../../../../@href')
+        href = title_tidy(href)

-if __name__ == '__main__':
-    # Hello, World! :)
-    # news = input('Choose news site["nbc","cnn","abc","fox","bbc"]:').lower()
-    # if news == 'nbc': nbc()
-    # elif news == 'cnn': cnn()
-    # elif news == 'abc': abc()
-    # elif news == 'fox': fox()
-    # elif news == 'bbc': bbc()
-    # else:
-    #     print('Oops! It seems a wrong input. Please retry...')
-    #     sleep(2)
-    print(bbc())
+        msg = ''
+        count = 0
+
+        for url in href[:NEWS_LIMIT]:
+            try:
+                # Absolute links are skipped; only hrefs without an http prefix are followed.
+                if url[0:4] == 'http':
+                    continue
+
+                full_url = head + url
+                response = safe_request(full_url)
+                if not response:
+                    continue
+
+                html = etree.HTML(response.text)
+                title = html.xpath('//div[@data-component="headline-block"]/h1/text()')
+
+                if not title:
+                    logger.warning(f'跳过视频或其他类型新闻: {full_url}')
+                    continue
+
+                title = title[0]
+                msg += f'Title: {title}. Link: {full_url}\n'
+                count += 1
+                # Pause only after a collected article; skipped links do not wait.
+                sleep(0.1)
+
+            except Exception as e:
+                logger.error(f"处理新闻失败: {full_url}, 错误: {str(e)}")
+                continue
+
+        logger.info(f"BBC新闻获取完成，共获取{count}条")
+        return msg
+
+    except Exception as e:
+        logger.error(f"获取BBC新闻失败: {str(e)}")
+        return "获取新闻失败，请查看日志了解详情"
+
+
+def all_english_news():
+    """Render the headlines of all five news sites into one image.
+
+    Runs the NBC, CNN, ABC, FOX and BBC scrapers in that order, appends a
+    newline after each result, sends the combined text to
+    ``dify_news_title_analyze`` and passes the returned Markdown to
+    ``convert_md_str_to_image`` with the output name ``news_output.png``.
+
+    Returns:
+        Whatever ``convert_md_str_to_image`` returns (presumably the path of
+        the written image - TODO confirm against that helper).
+    """
+    scrapers = (nbc, cnn, abc, fox, bbc)
+    news_titles = "".join(scraper() + "\n" for scraper in scrapers)
+    markdown_news = dify_news_title_analyze(news_titles)
+    return convert_md_str_to_image(markdown_news, "news_output.png")
--- a/base/func_news.py
+++ b/base/func_news.py
@@ -11,6 +11,7 @@ import requests
 from lxml import etree

 from base import func_english_news
+from utils.ai.dify_news_analyze import dify_news_title_analyze


 class News(object):
@@ -70,10 +71,10 @@ class News(object):
                post = response.json()
                # 提取content列表 - 避免使用str作为变量名
                content_list = post.get('data', {}).get('cards', [])
-                
+
                if content_list and len(content_list) > 0:
                    news_items = content_list[0].get('content', [])
-                    
+
                    # 遍历列表，并格式化每个字典的title, url，然后添加到output字符串中
                    for index, article in enumerate(news_items, start=1):
                        if isinstance(article, dict) and 'word' in article:
@@ -81,34 +82,34 @@ class News(object):
                            # url = article.get('url', '')
                            # 使用f-string格式化字符串，并添加到output中
                            output += f"{index} :#{title}\n"
-                
+
                # 输出最终的字符串
                return output
            else:
                self.LOG.error(f"获取百度新闻失败，状态码: {response.status_code}")
                return "获取百度新闻失败，请稍后再试"
-                
+
        except Exception as e:
            self.LOG.error(f"获取百度新闻时出错: {e}")
            return f"获取百度新闻时出错: {e}"

-    def get_eng_news(self,website):
+    def get_eng_news(self, website):
+        """Return the headline digest for one English-language news site.
+
+        ``website`` selects the scraper in ``func_english_news``: 'nbc',
+        'cnn', 'abc', 'fox' or 'bbc'. The scraper's return value is passed
+        through unchanged. Any other value matches no branch, so the method
+        returns ``None``.
+        """
        if website == 'nbc':
-           return func_english_news.nbc()
+            return func_english_news.nbc()
        elif website == 'cnn':
-            return  func_english_news.cnn()
+            return func_english_news.cnn()
        elif website == 'abc':
            return func_english_news.abc()
        elif website == 'fox':
            return func_english_news.fox()
        elif website == 'bbc':
-            return  func_english_news.bbc()
+            return func_english_news.bbc()
+


 if __name__ == "__main__":
    news = News()
-    print(news.get_baidu_news())
    # # msg = "@水牛-分身 今日百度新闻"
    # # q = re.sub(r"@.*?[\u2005|\s]", "", msg).replace(" ", "")
    # # print(q)
-    # print(news.get_eng_news('nbc'))
+    # print(news.get_eng_news('nbc'))