# -*- coding: utf-8 -*-
"""
Program: English Daily News Downloader
Author: MrCrawL
Created Date: 2024-01-21
Last Modified: 2024-03-24
Modified by: MrCrawL

Collects headline titles and links from the home pages of several English
news sites (NBC, CNN, ABC, FOX, BBC) and renders them into an image.
"""
from time import localtime, sleep
from urllib.parse import urljoin

import requests
from loguru import logger
from lxml import etree

from utils.markdown_to_image import convert_md_str_to_image

# Known limitation (from the original author): text with a hyperlink won't be
# saved -- presumably because the title XPaths only read the direct text()
# children of the headline element.

# Request configuration
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
TIMEOUT = 10
MAX_RETRIES = 3
NEWS_LIMIT = 30


def get_time():
    """Return today's local date formatted as ``YYYY-MM-DD``."""
    # Take one snapshot so year/month/day cannot disagree across midnight.
    now = localtime()
    return f'{now.tm_year:04d}-{now.tm_mon:02d}-{now.tm_mday:02d}'


def title_tidy(title_list):
    """Remove duplicate entries from ``title_list`` in place, keeping order.

    Every repeated value is dropped (not only adjacent repeats), so a link
    that shows up in several places on a page is fetched once.

    Args:
        title_list: Mutable list of hashable items (strings in this module).

    Returns:
        The same list object, now containing each value once, in order of
        first appearance.
    """
    # dict preserves insertion order, giving an O(n) order-preserving de-dup.
    title_list[:] = dict.fromkeys(title_list)
    return title_list


# Replacement pairs for text_tidy, applied strictly in this order.
_TEXT_REPLACEMENTS = (
    ('’', "'"),
    (' \n\n', ' '),
    ('\n\n ', ' '),
    ('\n\n,', ' ,'),
    (',\n\n', ', '),
    (';\n\n', '; '),
    ('\n\n;', ' ;'),
    (':\n\n', ': '),
    ('\n\n:', ' :'),
    ('"\n\n', '" '),
    ('\n\n"', ' "'),
    ("'\n\n", "' "),
    ("\n\n'", " '"),
)


def text_tidy(p_text):
    """Normalise curly apostrophes and stray blank-line breaks in ``p_text``.

    Blank-line breaks (``\\n\\n``) that sit next to a space, comma, semicolon,
    colon or quote are collapsed into a single space.

    Args:
        p_text: The text to clean.

    Returns:
        The cleaned text.
    """
    text_ = p_text
    for old, new in _TEXT_REPLACEMENTS:
        text_ = text_.replace(old, new)
    return text_


def safe_request(url, retry_count=0):
    """GET ``url`` with the module headers/timeout, retrying on failure.

    Args:
        url: The URL to fetch.
        retry_count: Number of retries already consumed; callers normally
            leave this at 0.

    Returns:
        The ``requests.Response`` on success, or ``None`` once
        ``MAX_RETRIES`` retries have been exhausted.
    """
    attempt = retry_count
    while True:
        try:
            response = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
            response.raise_for_status()
            return response
        except requests.RequestException as e:
            if attempt >= MAX_RETRIES:
                logger.error(f"请求失败: {url}, 错误: {str(e)}")
                return None
            logger.warning(f"请求失败,正在进行第{attempt + 1}次重试: {url}")
            sleep(1)
            attempt += 1


def _scrape_headlines(source, home_url, link_xpaths, title_xpath, *,
                      skip_absolute=False):
    """Collect ``Title: ... Link: ...`` lines for one news site.

    Args:
        source: Short site name used in log and error messages (e.g. "NBC").
        home_url: Home page URL; also the base for resolving article links.
        link_xpaths: XPath expressions whose results (hrefs) are concatenated
            in order and then de-duplicated.
        title_xpath: XPath evaluated on each article page to get the title.
        skip_absolute: When true, hrefs starting with ``http`` are skipped.

    Returns:
        One line per article whose title was found (at most ``NEWS_LIMIT``
        links are tried), or an error message string if the home page could
        not be fetched or processed.
    """
    logger.info(f"开始获取{source}新闻")
    try:
        response = safe_request(home_url)
        if response is None:
            return f"获取{source}新闻失败"

        home_page = etree.HTML(response.text)
        links = []
        for expression in link_xpaths:
            links.extend(home_page.xpath(expression))
        links = title_tidy(links)

        lines = []
        for href in links[:NEWS_LIMIT]:
            # Bound before the try so the error log below can always name it.
            article_url = href
            try:
                if skip_absolute and href.startswith('http'):
                    continue
                # Handles absolute, protocol-relative ("//host/x"),
                # root-relative ("/x") and plain relative links uniformly.
                article_url = urljoin(home_url, href)
                response = safe_request(article_url)
                if response is None:
                    continue

                titles = etree.HTML(response.text).xpath(title_xpath)
                if not titles:
                    logger.warning(f'跳过视频或其他类型新闻: {article_url}')
                    continue

                title = titles[0].strip()
                lines.append(f'Title: {title}. Link: {article_url}\n')
                sleep(0.1)  # brief pause between article requests
            except Exception as e:
                # Best effort: one bad article must not abort the whole run.
                logger.error(f"处理新闻失败: {article_url}, 错误: {str(e)}")
                continue

        logger.info(f"{source}新闻获取完成,共获取{len(lines)}条")
        return ''.join(lines)
    except Exception as e:
        logger.error(f"获取{source}新闻失败: {str(e)}")
        return "获取新闻失败,请查看日志了解详情"


def nbc():
    """Return headline lines scraped from nbcnews.com (or an error message)."""
    return _scrape_headlines(
        'NBC',
        'https://www.nbcnews.com/',
        ['//h2/a/@href'],
        '//h1/text()',
    )


def cnn():
    """Return headline lines scraped from cnn.com (or an error message)."""
    return _scrape_headlines(
        'CNN',
        'https://www.cnn.com/',
        ['//a[@data-link-type="article"]/@href'],
        '//h1[@data-editable="headlineText"]/text()',
    )


def abc():
    """Return headline lines scraped from abcnews.go.com (or an error message)."""
    return _scrape_headlines(
        'ABC',
        'https://abcnews.go.com',  # requested without a trailing slash
        [
            '//div[@class="HeadlinesTrio"]/a/@href',
            '//div[@class="title card"]/a[@class="AnchorLink"]/@href | //div[@class="title"]/a[@class="AnchorLink"]/@href',
            '//a[@target="_self"]/@href',
            '//a[@class="AnchorLink VideoTile"]/@href',
        ],
        '//div[@data-testid="prism-headline"]/h1/text()',
    )


def fox():
    """Return headline lines scraped from foxnews.com (or an error message)."""
    return _scrape_headlines(
        'FOX',
        'https://www.foxnews.com/',
        ['//h3[@class="title"]/a/@href'],
        '//h1[@itemprop="headline"]/text()',
    )


def bbc():
    """Return headline lines scraped from bbc.com (or an error message).

    Links that are already absolute (start with ``http``) are skipped; only
    site-relative links are followed.
    """
    return _scrape_headlines(
        'BBC',
        'https://www.bbc.com/',
        ['//h2[@data-testid="card-headline"]/../../../../../@href | //h2[@data-testid="card-headline"]/../../../../@href'],
        '//div[@data-component="headline-block"]/h1/text()',
        skip_absolute=True,
    )


def all_english_news():
    """Scrape every supported site and render the combined text to an image.

    Sites are queried in the order NBC, CNN, ABC, FOX, BBC; each site's
    output is followed by a newline.

    Returns:
        Whatever ``convert_md_str_to_image`` returns for the combined text
        and the output name ``"news_output.png"`` (presumably the saved
        image path -- confirm against that helper).
    """
    sections = (nbc(), cnn(), abc(), fox(), bbc())
    news_titles = "".join(section + "\n" for section in sections)
    # NOTE: a title-analysis step (dify_news_title_analyze) was previously
    # commented out here; the raw text is rendered as-is.
    return convert_md_str_to_image(news_titles, "news_output.png")