Add Reuters grabbers.

This commit is contained in:
Jack Hadrill 2020-01-18 19:47:27 +00:00
parent 285c0a3fc0
commit b5771bef90
9 changed files with 33 additions and 52 deletions

View File

@@ -3,11 +3,13 @@ import sqlite3
import requests
import xml.etree.ElementTree as et
from abc import abstractmethod, ABC
from datetime import datetime
from requests.exceptions import HTTPError
class Grabber(ABC):
articles = []
date_format = "%a, %d %b %Y %H:%M:%S %z"
db = None
name = ""
@@ -16,10 +18,6 @@ class Grabber(ABC):
def feed_url(self):
raise NotImplementedError
@abstractmethod
def parse(self, feed):
raise NotImplementedError
def __init__(self, db):
    """Keep a handle to the article database and name this grabber after its class."""
    self.db = db
    self.name = type(self).__name__
@@ -61,6 +59,19 @@ class Grabber(ABC):
response.raise_for_status()
return response.content
def parse(self, feed):
    """Parse an RSS feed tree into a list of (timestamp, title, description) tuples.

    Iterates every <item> element; items that are missing a field or carry a
    pubDate that does not match self.date_format are logged and skipped, so a
    single malformed article cannot abort the whole feed.
    """
    articles = []
    for article in feed.iter("item"):
        try:
            # Example pubDate: Sat, 18 Jan 2020 14:21:49 -0500
            published = datetime.strptime(article.find("pubDate").text, self.date_format)
            timestamp = int(published.timestamp())
            title = article.find("title").text
            description = article.find("description").text
            articles.append((timestamp, title, description))
        except (AttributeError, TypeError, ValueError):
            # AttributeError: element missing (find() returned None);
            # TypeError: element present but empty (.text is None);
            # ValueError: pubDate text does not match self.date_format.
            logging.error(f"Received non-parsable news article from {self.name}.")
    return articles
def process(self, articles, new_articles):
delta_articles = [article for article in new_articles if article not in articles]
if delta_articles:

4
run.py
View File

@@ -5,6 +5,8 @@ from pathlib import Path
from sources.ArsTechnica import ArsTechnica
from sources.BBCBusiness import BBCBusiness
from sources.BBCTechnology import BBCTechnology
from sources.ReutersBusiness import ReutersBusiness
from sources.ReutersTechnology import ReutersTechnology
DATABASE_PATH = Path("storage.db")
GRAB_FREQUENCY = 15
@@ -21,6 +23,8 @@ def main():
ArsTechnica(db),
BBCBusiness(db),
BBCTechnology(db),
ReutersBusiness(db),
ReutersTechnology(db),
]
while True:

View File

@@ -1,21 +1,5 @@
import logging
from datetime import datetime
from grabber import Grabber
class ArsTechnica(Grabber):
    """Grabber for the Ars Technica front-page RSS feed."""

    feed_url = "http://feeds.arstechnica.com/arstechnica/index"
    date_format = "%a, %d %b %Y %H:%M:%S %z"

    def parse(self, feed):
        """Return (timestamp, title, description) tuples for each feed <item>."""
        parsed = []
        for item in feed.iter("item"):
            try:
                # Example pubDate: Sat, 18 Jan 2020 15:41:56 +0000
                published = datetime.strptime(item.find("pubDate").text, self.date_format)
                entry = (int(published.timestamp()),
                         item.find("title").text,
                         item.find("description").text)
            except AttributeError:
                logging.error(f"Received non-parsable news article from {self.name}.")
            else:
                parsed.append(entry)
        return parsed

View File

@@ -1,21 +1,6 @@
import logging
from datetime import datetime
from grabber import Grabber
class BBCBusiness(Grabber):
    """Grabber for the BBC News business RSS feed."""

    feed_url = "http://feeds.bbci.co.uk/news/business/rss.xml"
    date_format = "%a, %d %b %Y %H:%M:%S %Z"

    def parse(self, feed):
        """Return (timestamp, title, description) tuples for each feed <item>.

        BBC pubDates end in a textual zone name (e.g. "GMT").  strptime's %Z
        accepts it but yields a *naive* datetime, whose .timestamp() would be
        interpreted in the server's local zone; we pin tzinfo to UTC so the
        epoch value is correct regardless of where this runs.
        NOTE(review): assumes the feed always reports GMT — confirm (a "BST"
        pubDate would need real offset handling).
        """
        from datetime import timezone  # local import: module only imports the datetime class

        articles = []
        for article in feed.iter("item"):
            try:
                # Example pubDate: Fri, 17 Jan 2020 19:09:40 GMT
                published = datetime.strptime(article.find("pubDate").text, self.date_format)
                timestamp = int(published.replace(tzinfo=timezone.utc).timestamp())
                title = article.find("title").text
                description = article.find("description").text
                articles.append((timestamp, title, description))
            except (AttributeError, TypeError, ValueError):
                # Missing element, empty text, or malformed pubDate: skip this item.
                logging.error(f"Received non-parsable news article from {self.name}.")
        return articles

View File

@@ -1,23 +1,6 @@
import logging
from datetime import datetime
from grabber import Grabber
class BBCTechnology(Grabber):
    """Grabber for the BBC News technology RSS feed."""

    feed_url = "http://feeds.bbci.co.uk/news/technology/rss.xml"
    date_format = "%a, %d %b %Y %H:%M:%S %Z"

    def parse(self, feed):
        """Return (timestamp, title, description) tuples for each feed <item>.

        BBC pubDates end in a textual zone name (e.g. "GMT").  strptime's %Z
        accepts it but yields a *naive* datetime, whose .timestamp() would be
        interpreted in the server's local zone; we pin tzinfo to UTC so the
        epoch value is correct regardless of where this runs.
        NOTE(review): assumes the feed always reports GMT — confirm (a "BST"
        pubDate would need real offset handling).
        """
        from datetime import timezone  # local import: module only imports the datetime class

        articles = []
        for article in feed.iter("item"):
            try:
                # Example pubDate: Fri, 17 Jan 2020 19:09:40 GMT
                published = datetime.strptime(article.find("pubDate").text, self.date_format)
                timestamp = int(published.replace(tzinfo=timezone.utc).timestamp())
                title = article.find("title").text
                description = article.find("description").text
                articles.append((timestamp, title, description))
            except (AttributeError, TypeError, ValueError):
                # Missing element, empty text, or malformed pubDate: skip this item.
                logging.error(f"Received non-parsable news article from {self.name}.")
        return articles

View File

@@ -0,0 +1,5 @@
from grabber import Grabber
class ReutersBusiness(Grabber):
    """Grabber for the Reuters business-news RSS feed.

    Defines only the feed URL; fetching and parsing fall through to the base
    Grabber, whose default date_format presumably matches Reuters pubDates —
    TODO confirm against a live feed item.
    """

    feed_url = "http://feeds.reuters.com/reuters/businessNews?format=xml"

View File

@@ -0,0 +1,5 @@
from grabber import Grabber
class ReutersTechnology(Grabber):
    """Grabber for the Reuters technology-news RSS feed.

    Defines only the feed URL; fetching and parsing fall through to the base
    Grabber, whose default date_format presumably matches Reuters pubDates —
    TODO confirm against a live feed item.
    """

    feed_url = "http://feeds.reuters.com/reuters/technologyNews?format=xml"

View File

@@ -1,11 +1,15 @@
# http://feeds.arstechnica.com/arstechnica/index
# http://feeds.bbci.co.uk/news/business/rss.xml
# http://feeds.bbci.co.uk/news/technology/rss.xml
# http://feeds.reuters.com/reuters/businessNews?format=xml
# http://feeds.reuters.com/reuters/technologyNews?format=xml
# http://rss.nytimes.com/services/xml/rss/nyt/Business.xml
# http://rss.nytimes.com/services/xml/rss/nyt/Technology.xml
# http://www.forbes.com/business/feed/
# http://www.forbes.com/technology/feed/
# http://www.ft.com/rss/home/us
# http://www.macworld.com/index.rss
# http://www.wired.com/feed
# https://www.engadget.com/rss.xml
# https://www.huffpost.com/section/business/feed
# https://www.huffpost.com/section/technology/feed

View File