264 lines
10 KiB
Python
264 lines
10 KiB
Python
# -*- coding: utf-8 -*-
|
||
"""
|
||
Program: English Daily News Downloader
|
||
Author: MrCrawL
|
||
Created Date: 2024-01-21
|
||
Last Modified: 2024-03-24
|
||
Modified by: MrCrawL
|
||
"""
|
||
'''Known issue: paragraph text that sits inside a hyperlink is not saved, because the XPath queries only select the direct text() nodes of each paragraph.'''
|
||
|
||
import requests
|
||
from time import localtime, sleep
|
||
from lxml import etree
|
||
|
||
|
||
def get_time():
    """Return today's local date as a ``YYYY-MM-DD`` string.

    The local time is sampled exactly once so that the year, month and day
    all come from the same instant; three separate ``localtime()`` calls
    could straddle midnight and produce a mixed date.

    Returns:
        str: The zero-padded local date, e.g. ``2024-03-24``.
    """
    now = localtime()
    return f'{now.tm_year:04d}-{now.tm_mon:02d}-{now.tm_mday:02d}'
|
||
|
||
|
||
# Remove adjacent duplicate entries from a list (in place)
|
||
def title_tidy(title_list):
    """Collapse runs of equal neighbouring items in ``title_list``.

    Only an item equal to the one directly before it is dropped, so equal
    items that are not adjacent are both kept. The list is modified in place
    and that same list object is returned.
    """
    survivors = [
        item
        for position, item in enumerate(title_list)
        if position == 0 or not (item == title_list[position - 1])
    ]
    # Slice assignment keeps the caller's list object, as the original
    # pop-based removal did.
    title_list[:] = survivors
    return title_list
|
||
|
||
|
||
# tidy text, seems a little bit redundant
|
||
def text_tidy(p_text):
    """Normalise apostrophes and punctuation-adjacent paragraph breaks.

    Curly apostrophes are turned into straight ones first. After that, a
    double newline that touches a space, comma, semicolon, colon, double
    quote or straight apostrophe is replaced by a single space on that side.
    The substitutions are applied one after another in the order listed, so
    later rules see the output of earlier ones.
    """
    substitutions = (
        ('’', "'"),
        (' \n\n', ' '),
        ('\n\n ', ' '),
        ('\n\n,', ' ,'),
        (',\n\n', ', '),
        (';\n\n', '; '),
        ('\n\n;', ' ;'),
        (':\n\n', ': '),
        ('\n\n:', ' :'),
        ('"\n\n', '" '),
        ('\n\n"', ' "'),
        ("'\n\n", "' "),
        ("\n\n'", " '"),
    )
    cleaned = p_text
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    return cleaned
|
||
|
||
|
||
def save(text, file_name, mode='w', encoding='utf-8'):
    """Write ``text`` to ``<file_name>.txt``.

    ``mode`` and ``encoding`` are passed straight to ``open``; the default
    mode ``'w'`` overwrites the file, while ``'a'`` appends to it.
    """
    target = f'{file_name}.txt'
    with open(target, mode=mode, encoding=encoding) as handle:
        handle.write(text)
|
||
|
||
|
||
def nbc(limit=30):
    """Collect headline titles and links from the NBC News front page.

    Fetches the front page, takes the ``href`` of every ``<h2><a>`` link,
    drops adjacent duplicates, then requests up to ``limit`` of those links
    and reads the first ``<h1>`` text of each page as its title. Links whose
    page has no ``<h1>`` text are reported on stdout and skipped.

    Args:
        limit: Maximum number of front-page links to visit (default 30).
            Fewer are visited when the page yields fewer links.

    Returns:
        str: One ``Title: <title>. Link: <url>.`` line per article found,
        each terminated by a newline; an empty string when none was found.
    """
    front_page = etree.HTML(requests.get('https://www.nbcnews.com/').text)
    links = title_tidy(front_page.xpath('//h2/a/@href'))

    # Disabled file output (kept for reference):
    # save('', f'NBC_news_title_{get_time()}')
    # save('', f'NBC_news_text_{get_time()}')

    lines = []
    # Slicing instead of indexing range(30) avoids an IndexError when the
    # front page exposes fewer links than the limit.
    for url in links[:max(limit, 0)]:
        sleep(0.1)  # delete to speed up
        page = etree.HTML(requests.get(url).text)
        headline = page.xpath('//h1/text()')
        if not headline:
            print(f'Video or other news. Link: {url}')
            continue
        title = headline[0]

        # Author and body text are only consumed by the disabled save()
        # calls below.
        author = ', '.join(page.xpath(
            '//span[@class="byline-name"]/a/text() | //span[@class="byline-name" and not(a)]/text()'
        ))
        text = text_tidy('\n\n'.join(page.xpath('//p[@class=""]/text()')))

        # save(f'Title: {title}\nLink: {url}\n\n', f'NBC_news_title_{get_time()}', 'a')  # news title
        # save(f'Title: {title}\n\nOrigin: {url}\n\nAuthor: {author}\n\n\n', f'NBC_news_text_{get_time()}', 'a')
        # save(f'{text}' + '\n\n------------------------------\n\n', f'NBC_news_text_{get_time()}', 'a')
        lines.append(f'Title: {title}. Link: {url}.\n')
    return ''.join(lines)
|
||
|
||
|
||
def cnn(limit=30):
    """Collect headline titles and links from the CNN front page.

    Fetches the front page, takes the ``href`` of every anchor marked
    ``data-link-type="article"``, drops adjacent duplicates, then requests up
    to ``limit`` of those links (prefixed with the site root) and reads the
    ``headlineText`` ``<h1>`` of each page as its title. Links whose page has
    no such headline are reported on stdout and skipped.

    Args:
        limit: Maximum number of front-page links to visit (default 30).
            Fewer are visited when the page yields fewer links.

    Returns:
        str: One ``Title: <title>. Link: <url>`` line per article found,
        each terminated by a newline; an empty string when none was found.
    """
    head = 'https://www.cnn.com'
    front_page = etree.HTML(requests.get(head + '/').text)
    links = title_tidy(front_page.xpath('//a[@data-link-type="article"]/@href'))

    # Disabled file output (kept for reference):
    # save('', f'CNN_news_title_{get_time()}')
    # save('', f'CNN_news_text_{get_time()}')

    lines = []
    # Slicing instead of indexing range(30) avoids an IndexError when the
    # front page exposes fewer links than the limit.
    for path in links[:max(limit, 0)]:
        url = head + path
        sleep(0.1)  # delete to speed up
        page = etree.HTML(requests.get(url).text)
        headline = page.xpath('//h1[@data-editable="headlineText"]/text()')
        if not headline:
            print(f'Video or other news. Link: {url}')
            continue
        title = headline[0].strip()

        # Author and body text are only consumed by the disabled save()
        # calls below.
        author = ', '.join(page.xpath('//span[@class="byline__name"]/text()'))
        paragraphs = page.xpath('//p[@class="paragraph inline-placeholder"]/text()')
        # The original called .strip() and discarded the result; keep the
        # stripped parts and separate them with a blank line, as the sibling
        # scrapers do.
        text = text_tidy('\n\n'.join(part.strip() for part in paragraphs))

        # save(f'Title: {title}\nLink: {url}\n\n', f'CNN_news_title_{get_time()}', 'a')  # news title
        # save(f'Title: {title}\n\nOrigin: {url}\n\nAuthor: {author}\n\n\n', f'CNN_news_text_{get_time()}', 'a')
        # save(f'{text}' + '\n\n------------------------------\n\n', f'CNN_news_text_{get_time()}', 'a')
        lines.append(f'Title: {title}. Link: {url}\n')
    return ''.join(lines)
|
||
|
||
def abc(limit=30):
    """Collect headline titles and links from the ABC News front page.

    Fetches the front page, gathers ``href`` values from four XPath queries
    (headline trio, title cards, ``target="_self"`` anchors and video tiles),
    concatenates them in that order, drops adjacent duplicates, then requests
    up to ``limit`` of those links and reads the ``prism-headline`` ``<h1>``
    of each page as its title. Links whose page has no such headline are
    reported on stdout and skipped.

    Args:
        limit: Maximum number of front-page links to visit (default 30).
            Fewer are visited when the page yields fewer links.

    Returns:
        str: One ``Title: <title>. Link: <url>`` line per article found,
        each terminated by a newline; an empty string when none was found.
    """
    front_page = etree.HTML(requests.get('https://abcnews.go.com/').text)
    link_queries = (
        '//div[@class="HeadlinesTrio"]/a/@href',
        '//div[@class="title card"]/a[@class="AnchorLink"]/@href | //div[@class="title"]/a[@class="AnchorLink"]/@href',
        '//a[@target="_self"]/@href',
        '//a[@class="AnchorLink VideoTile"]/@href',
    )
    links = []
    for query in link_queries:
        links.extend(front_page.xpath(query))
    links = title_tidy(links)

    # Disabled file output (kept for reference):
    # save('', f'ABC_news_title_{get_time()}')
    # save('', f'ABC_news_text_{get_time()}')

    lines = []
    # Slicing instead of indexing range(30) avoids an IndexError when the
    # front page exposes fewer links than the limit.
    for url in links[:max(limit, 0)]:
        sleep(0.1)  # delete to speed up
        page = etree.HTML(requests.get(url).text)
        headline = page.xpath('//div[@data-testid="prism-headline"]/h1/text()')
        if not headline:
            print(f'Video or other news. Link: {url}')
            continue
        title = headline[0]

        # Author and body text are only consumed by the disabled save()
        # calls below.
        author = ', '.join(page.xpath('//a[@data-testid="prism-linkbase"]/text()'))
        text = text_tidy('\n\n'.join(
            page.xpath('//div[@data-testid="prism-article-body"]/p/text()')
        ))

        # save(f'Title: {title}\nLink: {url}\n\n', f'ABC_news_title_{get_time()}', 'a')  # news title
        # save(f'Title: {title}\n\nOrigin: {url}\n\nAuthor: {author}\n\n\n', f'ABC_news_text_{get_time()}', 'a')
        # save(f'{text}' + '\n\n------------------------------\n\n', f'ABC_news_text_{get_time()}', 'a')
        lines.append(f'Title: {title}. Link: {url}\n')
    return ''.join(lines)
|
||
|
||
def fox(limit=30):
    """Collect headline titles and links from the Fox News front page.

    Fetches the front page, takes the ``href`` of every ``<h3 class="title">``
    link, drops adjacent duplicates, then requests up to ``limit`` of those
    links and reads the ``itemprop="headline"`` ``<h1>`` of each page as its
    title. Links that do not start with ``http`` get ``https:`` prepended.
    Links whose page has no such headline are reported on stdout and skipped.

    Args:
        limit: Maximum number of front-page links to visit (default 30).
            Fewer are visited when the page yields fewer links.

    Returns:
        str: One ``Title: <title>. Link: <url>`` line per article found,
        each terminated by a newline; an empty string when none was found.
    """
    front_page = etree.HTML(requests.get('https://www.foxnews.com/').text)
    links = title_tidy(front_page.xpath('//h3[@class="title"]/a/@href'))

    # Disabled file output (kept for reference):
    # save('', f'FOX_news_title_{get_time()}')
    # save('', f'FOX_news_text_{get_time()}')

    lines = []
    # Slicing instead of indexing range(30) avoids an IndexError when the
    # front page exposes fewer links than the limit.
    for link in links[:max(limit, 0)]:
        # Links lacking a scheme are given one before being requested.
        url = link if link.startswith('http') else 'https:' + link
        sleep(0.1)  # delete to speed up
        page = etree.HTML(requests.get(url).text)
        headline = page.xpath('//h1[@itemprop="headline"]/text()')
        if not headline:
            print(f'Video or other news. Link: {url}')
            continue
        title = headline[0]

        # Author and body text are only consumed by the disabled save()
        # calls below.
        author = ', '.join(page.xpath('//a[@rel="author"]/strong/text()'))
        text = text_tidy('\n\n'.join(
            page.xpath('//div[@itemprop="articleBody"]/p/text()')
        ))

        # save(f'Title: {title}\nLink: {url}\n\n', f'FOX_news_title_{get_time()}', 'a')  # news title
        # save(f'Title: {title}\n\nOrigin: {url}\n\nAuthor: {author}\n\n\n', f'FOX_news_text_{get_time()}', 'a')
        # save(f'{text}' + '\n\n------------------------------\n\n', f'FOX_news_text_{get_time()}', 'a')
        lines.append(f'Title: {title}. Link: {url}\n')
    return ''.join(lines)
|
||
|
||
def bbc(limit=30):
    """Collect headline titles and links from the BBC front page.

    Fetches the front page, takes the ``href`` found four or five levels
    above each ``card-headline`` ``<h2>``, drops adjacent duplicates, then
    walks up to ``limit`` of those links. Links that already start with
    ``http`` are skipped; the rest are prefixed with the site root, printed,
    requested, and the ``headline-block`` ``<h1>`` of each page is read as
    its title. Pages without such a headline are skipped silently.

    Args:
        limit: Maximum number of front-page links to consider (default 30).
            Fewer are considered when the page yields fewer links.

    Returns:
        str: One ``Title: <title>. Link: <url>`` line per article found,
        each terminated by a newline; an empty string when none was found.
    """
    head = 'https://www.bbc.com'
    front_page = etree.HTML(requests.get(head + '/').text)
    links = title_tidy(front_page.xpath(
        '//h2[@data-testid="card-headline"]/../../../../../@href | //h2[@data-testid="card-headline"]/../../../../@href'
    ))

    # Disabled file output (kept for reference):
    # save('', f'BBC_news_title_{get_time()}')
    # save('', f'BBC_news_text_{get_time()}')

    lines = []
    # Slicing instead of indexing range(30) avoids an IndexError when the
    # front page exposes fewer links than the limit.
    for path in links[:max(limit, 0)]:
        if path.startswith('http'):
            continue
        url = head + path
        sleep(0.1)  # delete to speed up
        print(url)
        page = etree.HTML(requests.get(url).text)
        headline = page.xpath('//div[@data-component="headline-block"]/h1/text()')
        if not headline:
            # print(f'Video or other news. Link: {url}')
            continue
        title = headline[0]

        # Disabled author/text extraction and file output (kept for reference):
        # author = html.xpath('//div[@data-testid="byline"]/div/span[@data-testid="byline-name"]/text()')
        # author = ', '.join(author)
        # text = html.xpath('//div[@data-component="text-block"]/p/b/text() | //div[@data-component="text-block"]/p/text()')
        # text = '\n\n'.join(text)
        # text = text_tidy(text)
        # save(f'Title: {title}\nLink: {url}\n\n', f'BBC_news_title_{get_time()}', 'a')  # news title
        # save(f'Title: {title}\n\nOrigin: {url}\n\nAuthor: {author}\n\n\n', f'BBC_news_text_{get_time()}', 'a')
        # save(f'{text}' + '\n\n------------------------------\n\n', f'BBC_news_text_{get_time()}', 'a')
        lines.append(f'Title: {title}. Link: {url}\n')
    return ''.join(lines)
|
||
|
||
if __name__ == '__main__':
    # Script entry point: currently fetches the BBC headlines only and prints
    # the collected "Title / Link" lines.
    #
    # The interactive site chooser below is disabled:
    # news = input('Choose news site["nbc","cnn","abc","fox","bbc"]:').lower()
    # if news == 'nbc': nbc()
    # elif news == 'cnn': cnn()
    # elif news == 'abc': abc()
    # elif news == 'fox': fox()
    # elif news == 'bbc': bbc()
    # else:
    #     print('Oops! It seems a wrong input. Please retry...')
    #     sleep(2)
    headlines = bbc()
    print(headlines)
|