From 5098ea53b35527f868288cf76ad5e1885cd0eff1 Mon Sep 17 00:00:00 2001 From: Jack Hadrill Date: Sat, 18 Jan 2020 19:47:27 +0000 Subject: [PATCH] Add Reuters grabbers. --- grabber.py | 19 +++++++++++++++---- run.py | 4 ++++ sources/ArsTechnica.py | 16 ---------------- sources/BBCBusiness.py | 15 --------------- sources/BBCTechnology.py | 17 ----------------- sources/ReutersBusiness.py | 5 +++++ sources/ReutersTechnology.py | 5 +++++ sources/sources.txt | 4 ++++ storage.db | 0 9 files changed, 33 insertions(+), 52 deletions(-) create mode 100644 sources/ReutersBusiness.py create mode 100644 sources/ReutersTechnology.py delete mode 100644 storage.db diff --git a/grabber.py b/grabber.py index 76b2764..5821875 100644 --- a/grabber.py +++ b/grabber.py @@ -3,11 +3,13 @@ import sqlite3 import requests import xml.etree.ElementTree as et from abc import abstractmethod, ABC +from datetime import datetime from requests.exceptions import HTTPError class Grabber(ABC): articles = [] + date_format = "%a, %d %b %Y %H:%M:%S %z" db = None name = "" @@ -16,10 +18,6 @@ class Grabber(ABC): def feed_url(self): raise NotImplementedError - @abstractmethod - def parse(self, feed): - raise NotImplementedError - def __init__(self, db): self.db = db self.name = self.__class__.__name__ @@ -61,6 +59,19 @@ class Grabber(ABC): response.raise_for_status() return response.content + def parse(self, feed): + articles = [] + for article in feed.iter("item"): + try: + # Sat, 18 Jan 2020 14:21:49 -0500 + timestamp = int(datetime.strptime(article.find("pubDate").text, self.date_format).timestamp()) + title = article.find("title").text + description = article.find("description").text + articles.append((timestamp, title, description)) + except AttributeError: + logging.error(f"Received non-parsable news article from {self.name}.") + return articles + def process(self, articles, new_articles): delta_articles = [article for article in new_articles if article not in articles] if delta_articles: diff --git a/run.py b/run.py index 551b8e2..5239481 100644 --- a/run.py +++ b/run.py @@ -5,6 +5,8 @@ from pathlib import Path from sources.ArsTechnica import ArsTechnica from sources.BBCBusiness import BBCBusiness from sources.BBCTechnology import BBCTechnology +from sources.ReutersBusiness import ReutersBusiness +from sources.ReutersTechnology import ReutersTechnology DATABASE_PATH = Path("storage.db") GRAB_FREQUENCY = 15 @@ -21,6 +23,8 @@ def main(): ArsTechnica(db), BBCBusiness(db), BBCTechnology(db), + ReutersBusiness(db), + ReutersTechnology(db), ] while True: diff --git a/sources/ArsTechnica.py b/sources/ArsTechnica.py index 08b68d2..3ddbcaa 100644 --- a/sources/ArsTechnica.py +++ b/sources/ArsTechnica.py @@ -1,21 +1,5 @@ -import logging -from datetime import datetime from grabber import Grabber class ArsTechnica(Grabber): feed_url = "http://feeds.arstechnica.com/arstechnica/index" - date_format = "%a, %d %b %Y %H:%M:%S %z" - - def parse(self, feed): - articles = [] - for article in feed.iter("item"): - try: - # Sat, 18 Jan 2020 15:41:56 +0000 - timestamp = int(datetime.strptime(article.find("pubDate").text, self.date_format).timestamp()) - title = article.find("title").text - description = article.find("description").text - articles.append((timestamp, title, description)) - except AttributeError: - logging.error(f"Received non-parsable news article from {self.name}.") - return articles diff --git a/sources/BBCBusiness.py b/sources/BBCBusiness.py index 1f35b04..acc166b 100644 --- a/sources/BBCBusiness.py +++ b/sources/BBCBusiness.py @@ -1,21 +1,6 @@ -import logging -from datetime import datetime from grabber import Grabber class BBCBusiness(Grabber): feed_url = "http://feeds.bbci.co.uk/news/business/rss.xml" date_format = "%a, %d %b %Y %H:%M:%S %Z" - - def parse(self, feed): - articles = [] - for article in feed.iter("item"): - try: - # Fri, 17 Jan 2020 19:09:40 GMT - timestamp = int(datetime.strptime(article.find("pubDate").text, self.date_format).timestamp()) - title = article.find("title").text - description = article.find("description").text - articles.append((timestamp, title, description)) - except AttributeError: - logging.error(f"Received non-parsable news article from {self.name}.") - return articles diff --git a/sources/BBCTechnology.py b/sources/BBCTechnology.py index 6d4137d..41f6abd 100644 --- a/sources/BBCTechnology.py +++ b/sources/BBCTechnology.py @@ -1,23 +1,6 @@ -import logging -from datetime import datetime from grabber import Grabber class BBCTechnology(Grabber): feed_url = "http://feeds.bbci.co.uk/news/technology/rss.xml" date_format = "%a, %d %b %Y %H:%M:%S %Z" - - def parse(self, feed): - articles = [] - for article in feed.iter("item"): - try: - # Fri, 17 Jan 2020 19:09:40 GMT - timestamp = int(datetime.strptime(article.find("pubDate").text, self.date_format).timestamp()) - title = article.find("title").text - description = article.find("description").text - articles.append((timestamp, title, description)) - except AttributeError: - logging.error(f"Received non-parsable news article from {self.name}.") - return articles - - diff --git a/sources/ReutersBusiness.py b/sources/ReutersBusiness.py new file mode 100644 index 0000000..96f7479 --- /dev/null +++ b/sources/ReutersBusiness.py @@ -0,0 +1,5 @@ +from grabber import Grabber + + +class ReutersBusiness(Grabber): + feed_url = "http://feeds.reuters.com/reuters/businessNews?format=xml" diff --git a/sources/ReutersTechnology.py b/sources/ReutersTechnology.py new file mode 100644 index 0000000..16f84bc --- /dev/null +++ b/sources/ReutersTechnology.py @@ -0,0 +1,5 @@ +from grabber import Grabber + + +class ReutersTechnology(Grabber): + feed_url = "http://feeds.reuters.com/reuters/technologyNews?format=xml" diff --git a/sources/sources.txt b/sources/sources.txt index 4124c7d..b637aff 100644 --- a/sources/sources.txt +++ b/sources/sources.txt @@ -1,11 +1,15 @@ # http://feeds.arstechnica.com/arstechnica/index # http://feeds.bbci.co.uk/news/business/rss.xml # http://feeds.bbci.co.uk/news/technology/rss.xml +# http://feeds.reuters.com/reuters/businessNews?format=xml # http://feeds.reuters.com/reuters/technologyNews?format=xml +# http://rss.nytimes.com/services/xml/rss/nyt/Business.xml # http://rss.nytimes.com/services/xml/rss/nyt/Technology.xml +# http://www.forbes.com/business/feed/ # http://www.forbes.com/technology/feed/ # http://www.ft.com/rss/home/us # http://www.macworld.com/index.rss # http://www.wired.com/feed # https://www.engadget.com/rss.xml +# https://www.huffpost.com/section/business/feed # https://www.huffpost.com/section/technology/feed \ No newline at end of file diff --git a/storage.db b/storage.db deleted file mode 100644 index e69de29..0000000