abot/base/func_english_news.py

# -*- coding: utf-8 -*-
"""
Program: English Daily News Downloader
Author: MrCrawL
Created Date: 2024-01-21
Last Modified: 2024-03-24
Modified by: MrCrawL
"""
from utils.ai.dify_news_analyze import dify_news_title_analyze
from utils.markdown_to_image import convert_md_str_to_image

'''Known issue: text that contains a hyperlink is not saved.'''

import requests
from time import localtime, sleep
from lxml import etree
import logging
from datetime import datetime

# Logging setup. This runs when the module is imported: it configures the
# root logger at INFO level with two handlers, a file handler writing to
# news_crawler_<YYYYMMDD>.log (relative path, date taken at import time) and
# a stream handler. `logger` is the module-level logger used by every
# function below.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(f'news_crawler_{datetime.now().strftime("%Y%m%d")}.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

# Request settings shared by safe_request and the per-site scrapers.
HEADERS = {
    # Browser-like User-Agent sent with every request issued by safe_request.
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
# Passed as the `timeout` argument of requests.get (seconds).
TIMEOUT = 10
# Number of retries safe_request makes after the first failed attempt.
MAX_RETRIES = 3
# Maximum number of front-page links each scraper follows.
NEWS_LIMIT = 30


def get_time():
    """Return today's local date formatted as ``YYYY-MM-DD``.

    The clock is read exactly once so that the year, month and day all come
    from the same instant; reading it separately for each field could mix
    two different dates when the call straddles midnight.

    Returns:
        str: the zero-padded local date, e.g. ``'2024-03-24'``.
    """
    now = localtime()
    return f'{now.tm_year:04d}-{now.tm_mon:02d}-{now.tm_mday:02d}'


def title_tidy(title_list):
    """Collapse runs of consecutive equal items in ``title_list``.

    Only *adjacent* duplicates are dropped: an item is removed when it equals
    the item directly before it. Equal items separated by a different value
    are all kept.

    Args:
        title_list: list to tidy. It is modified in place.

    Returns:
        The same list object that was passed in, after tidying.
    """
    # Keep the first item plus every item that differs from its predecessor.
    # Building the result in one pass avoids repeated O(n) ``pop`` calls.
    kept = title_list[:1] + [
        current
        for previous, current in zip(title_list, title_list[1:])
        if current != previous
    ]
    # Slice assignment preserves the in-place contract callers may rely on.
    title_list[:] = kept
    return title_list


def text_tidy(p_text):
    """Normalise apostrophes and mend paragraph breaks around punctuation.

    The right single quotation mark is replaced by a plain apostrophe, and a
    blank-line break (two newlines) that touches a space, comma, semicolon,
    colon or quote character is replaced by a single space on that side.
    Breaks between ordinary characters are left untouched.

    Args:
        p_text: the text to clean up.

    Returns:
        str: the cleaned text.
    """
    # Applied strictly in this order; each rule sees the previous rule's output.
    substitutions = (
        ('’', "'"),
        (' \n\n', ' '),
        ('\n\n ', ' '),
        ('\n\n,', ' ,'),
        (',\n\n', ', '),
        (';\n\n', '; '),
        ('\n\n;', ' ;'),
        (':\n\n', ': '),
        ('\n\n:', ' :'),
        ('"\n\n', '" '),
        ('\n\n"', ' "'),
        ("'\n\n", "' "),
        ("\n\n'", " '"),
    )
    tidied = p_text
    for old, new in substitutions:
        tidied = tidied.replace(old, new)
    return tidied


def safe_request(url, retry_count=0):
    """Fetch ``url`` with a GET request, retrying when the request fails.

    The request is sent with the module's ``HEADERS`` and ``TIMEOUT``. Any
    ``requests.RequestException`` (including the one raised by
    ``raise_for_status`` for an error status) counts as a failure. After a
    failure the function waits one second and tries again, until
    ``MAX_RETRIES`` retries have been used.

    Args:
        url: address to fetch.
        retry_count: number of retries already consumed; normally left at 0.

    Returns:
        The response object on success, or ``None`` once the retries are
        exhausted.
    """
    attempt = retry_count
    while True:
        try:
            reply = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
            reply.raise_for_status()
        except requests.RequestException as e:
            if attempt >= MAX_RETRIES:
                # Out of retries: report the last error and give up.
                logger.error(f"请求失败: {url}, 错误: {str(e)}")
                return None
            logger.warning(f"请求失败，正在进行第{attempt + 1}次重试: {url}")
            sleep(1)
            attempt += 1
        else:
            return reply


def nbc():
    """Collect headline/link pairs from the NBC News front page.

    The front page is fetched, every ``h2 > a`` link is gathered (adjacent
    repeats removed), and the first ``NEWS_LIMIT`` links are visited. For
    each visited page the first ``h1`` text node is used as the headline;
    pages without one are skipped with a warning.

    Returns:
        str: one ``Title: <headline>. Link: <url>`` line per article
        (possibly empty), or a failure message string when the front page
        cannot be fetched or an unexpected error occurs.
    """
    logger.info("开始获取NBC新闻")
    try:
        front_page = safe_request('https://www.nbcnews.com/')
        if not front_page:
            return "获取NBC新闻失败"

        links = title_tidy(etree.HTML(front_page.text).xpath('//h2/a/@href'))

        lines = []
        for url in links[:NEWS_LIMIT]:
            try:
                article = safe_request(url)
                if not article:
                    continue

                headline = etree.HTML(article.text).xpath('//h1/text()')
                if not headline:
                    logger.warning(f'跳过视频或其他类型新闻: {url}')
                    continue

                lines.append(f'Title: {headline[0]}. Link: {url}\n')
                # Brief pause between article requests.
                sleep(0.1)

            except Exception as e:
                # One bad article must not abort the whole run.
                logger.error(f"处理新闻失败: {url}, 错误: {str(e)}")
                continue

        logger.info(f"NBC新闻获取完成，共获取{len(lines)}条")
        return ''.join(lines)

    except Exception as e:
        logger.error(f"获取NBC新闻失败: {str(e)}")
        return "获取新闻失败，请查看日志了解详情"


def cnn():
    """Collect headline/link pairs from the CNN front page.

    The front page is fetched, the ``href`` of every anchor marked
    ``data-link-type="article"`` is gathered (adjacent repeats removed), and
    the first ``NEWS_LIMIT`` links are visited. Each link is resolved against
    the site root with ``urljoin`` so that site-relative, slash-less and
    already-absolute hrefs all yield a valid URL. Pages without a headline
    element are skipped with a warning.

    Returns:
        str: one ``Title: <headline>. Link: <url>`` line per article
        (possibly empty), or a failure message string when the front page
        cannot be fetched or an unexpected error occurs.
    """
    # Local import keeps this function self-contained.
    from urllib.parse import urljoin

    logger.info("开始获取CNN新闻")
    try:
        head = 'https://www.cnn.com'
        response = safe_request(head + '/')
        if not response:
            return "获取CNN新闻失败"

        html = etree.HTML(response.text)
        href = html.xpath('//a[@data-link-type="article"]/@href')
        href = title_tidy(href)

        lines = []

        for url in href[:NEWS_LIMIT]:
            # Bound before the try so the error handler can always name the link.
            full_url = url
            try:
                # Plain concatenation would corrupt hrefs that are already
                # absolute or that lack a leading slash.
                full_url = urljoin(head, url)
                response = safe_request(full_url)
                if not response:
                    continue

                html = etree.HTML(response.text)
                title = html.xpath('//h1[@data-editable="headlineText"]/text()')

                if not title:
                    logger.warning(f'跳过视频或其他类型新闻: {full_url}')
                    continue

                lines.append(f'Title: {title[0].strip()}. Link: {full_url}\n')
                # Brief pause between article requests.
                sleep(0.1)

            except Exception as e:
                # One bad article must not abort the whole run.
                logger.error(f"处理新闻失败: {full_url}, 错误: {str(e)}")
                continue

        logger.info(f"CNN新闻获取完成，共获取{len(lines)}条")
        return ''.join(lines)

    except Exception as e:
        logger.error(f"获取CNN新闻失败: {str(e)}")
        return "获取新闻失败，请查看日志了解详情"


def abc():
    """Collect headline/link pairs from the ABC News front page.

    Links are gathered from several front-page selectors, de-duplicated while
    keeping their first-seen order, and the first ``NEWS_LIMIT`` of them are
    visited. Each href is turned into an absolute URL (absolute,
    protocol-relative and site-relative forms are handled). Pages without a
    headline element are skipped with a warning.

    Returns:
        str: one ``Title: <headline>. Link: <url>`` line per article
        (possibly empty), or a failure message string when the front page
        cannot be fetched or an unexpected error occurs.
    """
    logger.info("开始获取ABC新闻")
    try:
        head = 'https://abcnews.go.com'  # no trailing slash; added when joining
        response = safe_request(head)
        if not response:
            return "获取ABC新闻失败"

        html = etree.HTML(response.text)
        href1 = html.xpath('//div[@class="HeadlinesTrio"]/a/@href')
        href2 = html.xpath(
            '//div[@class="title card"]/a[@class="AnchorLink"]/@href | //div[@class="title"]/a[@class="AnchorLink"]/@href')
        href3 = html.xpath('//a[@target="_self"]/@href')
        href4 = html.xpath('//a[@class="AnchorLink VideoTile"]/@href')
        # The selectors can match the same anchor, so a link may show up in
        # more than one list and the repeats are not necessarily adjacent.
        # dict.fromkeys drops every repeat while preserving first-seen order,
        # so no article is fetched or listed twice.
        href = list(dict.fromkeys(href1 + href2 + href3 + href4))

        lines = []

        for url in href[:NEWS_LIMIT]:
            # Bound before the try so the error handler can always name the link.
            full_url = url
            try:
                # Normalise the href into an absolute URL.
                if url.startswith('http'):
                    full_url = url
                elif url.startswith('//'):
                    full_url = 'https:' + url
                else:
                    full_url = head + ('' if url.startswith('/') else '/') + url

                response = safe_request(full_url)
                if not response:
                    continue

                html = etree.HTML(response.text)
                title = html.xpath('//div[@data-testid="prism-headline"]/h1/text()')

                if not title:
                    logger.warning(f'跳过视频或其他类型新闻: {full_url}')
                    continue

                lines.append(f'Title: {title[0]}. Link: {full_url}\n')
                # Brief pause between article requests.
                sleep(0.1)

            except Exception as e:
                # One bad article must not abort the whole run.
                logger.error(f"处理新闻失败: {full_url}, 错误: {str(e)}")
                continue

        logger.info(f"ABC新闻获取完成，共获取{len(lines)}条")
        return ''.join(lines)

    except Exception as e:
        logger.error(f"获取ABC新闻失败: {str(e)}")
        return "获取新闻失败，请查看日志了解详情"


def fox():
    """Collect headline/link pairs from the Fox News front page.

    The front page is fetched, the link inside every ``h3.title`` is gathered
    (adjacent repeats removed), and the first ``NEWS_LIMIT`` links are
    visited. Each href is resolved against the site root with ``urljoin``, so
    absolute, protocol-relative (``//host/path``) and site-relative
    (``/path``) links all become valid URLs. Pages without a headline element
    are skipped with a warning.

    Returns:
        str: one ``Title: <headline>. Link: <url>`` line per article
        (possibly empty), or a failure message string when the front page
        cannot be fetched or an unexpected error occurs.
    """
    # Local import keeps this function self-contained.
    from urllib.parse import urljoin

    logger.info("开始获取FOX新闻")
    try:
        head = 'https://www.foxnews.com/'
        response = safe_request(head)
        if not response:
            return "获取FOX新闻失败"

        html = etree.HTML(response.text)
        href = html.xpath('//h3[@class="title"]/a/@href')
        href = title_tidy(href)

        lines = []

        for url in href[:NEWS_LIMIT]:
            try:
                # Prefixing 'https:' is only correct for '//host/path' hrefs;
                # a site-relative '/path' would become 'https:/path'.
                url = urljoin(head, url)

                response = safe_request(url)
                if not response:
                    continue

                html = etree.HTML(response.text)
                title = html.xpath('//h1[@itemprop="headline"]/text()')

                if not title:
                    logger.warning(f'跳过视频或其他类型新闻: {url}')
                    continue

                lines.append(f'Title: {title[0]}. Link: {url}\n')
                # Brief pause between article requests.
                sleep(0.1)

            except Exception as e:
                # One bad article must not abort the whole run.
                logger.error(f"处理新闻失败: {url}, 错误: {str(e)}")
                continue

        logger.info(f"FOX新闻获取完成，共获取{len(lines)}条")
        return ''.join(lines)

    except Exception as e:
        logger.error(f"获取FOX新闻失败: {str(e)}")
        return "获取新闻失败，请查看日志了解详情"


def bbc():
    """Collect headline/link pairs from the BBC front page.

    The front page is fetched and the ``href`` of the ancestor element four
    or five levels above each ``card-headline`` heading is gathered (adjacent
    repeats removed). Of the first ``NEWS_LIMIT`` hrefs, those that start
    with ``http`` are ignored; the rest are appended to the site root and
    visited. Pages without a headline block are skipped with a warning.

    Returns:
        str: one ``Title: <headline>. Link: <url>`` line per article
        (possibly empty), or a failure message string when the front page
        cannot be fetched or an unexpected error occurs.
    """
    logger.info("开始获取BBC新闻")
    try:
        site = 'https://www.bbc.com'
        front_page = safe_request(site + '/')
        if not front_page:
            return "获取BBC新闻失败"

        links = title_tidy(etree.HTML(front_page.text).xpath(
            '//h2[@data-testid="card-headline"]/../../../../../@href | //h2[@data-testid="card-headline"]/../../../../@href'))

        lines = []
        for path in links[:NEWS_LIMIT]:
            try:
                # Only site-relative paths are followed.
                if path.startswith('http'):
                    continue

                article_url = site + path
                article = safe_request(article_url)
                if not article:
                    continue

                headline = etree.HTML(article.text).xpath(
                    '//div[@data-component="headline-block"]/h1/text()')
                if not headline:
                    logger.warning(f'跳过视频或其他类型新闻: {article_url}')
                    continue

                lines.append(f'Title: {headline[0]}. Link: {article_url}\n')
                # Brief pause between article requests.
                sleep(0.1)

            except Exception as e:
                # One bad article must not abort the whole run.
                logger.error(f"处理新闻失败: {article_url}, 错误: {str(e)}")
                continue

        logger.info(f"BBC新闻获取完成，共获取{len(lines)}条")
        return ''.join(lines)

    except Exception as e:
        logger.error(f"获取BBC新闻失败: {str(e)}")
        return "获取新闻失败，请查看日志了解详情"


def all_english_news():
    """Gather headlines from every source and render the analysis as an image.

    The scrapers are run in the order NBC, CNN, ABC, Fox, BBC. Each result is
    followed by a newline and the pieces are concatenated. The combined text
    is passed to ``dify_news_title_analyze`` and its result is handed to
    ``convert_md_str_to_image`` with the output name ``news_output.png``.

    Returns:
        Whatever ``convert_md_str_to_image`` returns (presumably the path of
        the generated image; confirm against that helper).
    """
    scrapers = (nbc, cnn, abc, fox, bbc)
    news_titles = "".join(scrape() + "\n" for scrape in scrapers)
    markdown_news = dify_news_title_analyze(news_titles)
    return convert_md_str_to_image(markdown_news, "news_output.png")