From 5098ea53b35527f868288cf76ad5e1885cd0eff1 Mon Sep 17 00:00:00 2001
From: Jack Hadrill <jack@hadrill.co.uk>
Date: Sat, 18 Jan 2020 19:47:27 +0000
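Subject: [PATCH] Add Reuters grabbers.

Move the shared parse() implementation and a default date_format into
Grabber, and drop the duplicated parse() methods from ArsTechnica,
BBCBusiness and BBCTechnology. Add ReutersBusiness and ReutersTechnology
grabbers, register them in run.py, list additional candidate feeds in
sources/sources.txt, and stop tracking storage.db.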

---
 grabber.py                   | 19 +++++++++++++++----
 run.py                       |  4 ++++
 sources/ArsTechnica.py       | 16 ----------------
 sources/BBCBusiness.py       | 15 ---------------
 sources/BBCTechnology.py     | 17 -----------------
 sources/ReutersBusiness.py   |  5 +++++
 sources/ReutersTechnology.py |  5 +++++
 sources/sources.txt          |  4 ++++
 storage.db                   |  0
 9 files changed, 33 insertions(+), 52 deletions(-)
 create mode 100644 sources/ReutersBusiness.py
 create mode 100644 sources/ReutersTechnology.py
 delete mode 100644 storage.db

diff --git a/grabber.py b/grabber.py
index 76b2764..5821875 100644
--- a/grabber.py
+++ b/grabber.py
@@ -3,11 +3,13 @@ import sqlite3
 import requests
 import xml.etree.ElementTree as et
 from abc import abstractmethod, ABC
+from datetime import datetime
 from requests.exceptions import HTTPError
 
 
 class Grabber(ABC):
     articles = []
+    date_format = "%a, %d %b %Y %H:%M:%S %z"
     db = None
     name = ""
 
@@ -16,10 +18,6 @@ class Grabber(ABC):
     def feed_url(self):
         raise NotImplementedError
 
-    @abstractmethod
-    def parse(self, feed):
-        raise NotImplementedError
-
     def __init__(self, db):
         self.db = db
         self.name = self.__class__.__name__
@@ -61,6 +59,19 @@ class Grabber(ABC):
         response.raise_for_status()
         return response.content
 
+    def parse(self, feed):
+        """Return a (timestamp, title, description) tuple per parsable <item> in feed; bad items are logged and skipped."""
+        articles = []
+        for article in feed.iter("item"):
+            try:
+                # e.g. "Sat, 18 Jan 2020 14:21:49 -0500"; strptime raises TypeError on empty text, ValueError on mismatch.
+                timestamp = int(datetime.strptime(article.find("pubDate").text, self.date_format).timestamp())
+                title, description = article.find("title").text, article.find("description").text
+                articles.append((timestamp, title, description))
+            except (AttributeError, TypeError, ValueError):
+                logging.error(f"Received non-parsable news article from {self.name}.")
+        return articles
+
     def process(self, articles, new_articles):
         delta_articles = [article for article in new_articles if article not in articles]
         if delta_articles:
diff --git a/run.py b/run.py
index 551b8e2..5239481 100644
--- a/run.py
+++ b/run.py
@@ -5,6 +5,8 @@ from pathlib import Path
 from sources.ArsTechnica import ArsTechnica
 from sources.BBCBusiness import BBCBusiness
 from sources.BBCTechnology import BBCTechnology
+from sources.ReutersBusiness import ReutersBusiness
+from sources.ReutersTechnology import ReutersTechnology
 
 DATABASE_PATH = Path("storage.db")
 GRAB_FREQUENCY = 15
@@ -21,6 +23,8 @@ def main():
             ArsTechnica(db),
             BBCBusiness(db),
             BBCTechnology(db),
+            ReutersBusiness(db),
+            ReutersTechnology(db),
         ]
 
         while True:
diff --git a/sources/ArsTechnica.py b/sources/ArsTechnica.py
index 08b68d2..3ddbcaa 100644
--- a/sources/ArsTechnica.py
+++ b/sources/ArsTechnica.py
@@ -1,21 +1,5 @@
-import logging
-from datetime import datetime
 from grabber import Grabber
 
 
 class ArsTechnica(Grabber):
     feed_url = "http://feeds.arstechnica.com/arstechnica/index"
-    date_format = "%a, %d %b %Y %H:%M:%S %z"
-
-    def parse(self, feed):
-        articles = []
-        for article in feed.iter("item"):
-            try:
-                # Sat, 18 Jan 2020 15:41:56 +0000
-                timestamp = int(datetime.strptime(article.find("pubDate").text, self.date_format).timestamp())
-                title = article.find("title").text
-                description = article.find("description").text
-                articles.append((timestamp, title, description))
-            except AttributeError:
-                logging.error(f"Received non-parsable news article from {self.name}.")
-        return articles
diff --git a/sources/BBCBusiness.py b/sources/BBCBusiness.py
index 1f35b04..acc166b 100644
--- a/sources/BBCBusiness.py
+++ b/sources/BBCBusiness.py
@@ -1,21 +1,6 @@
-import logging
-from datetime import datetime
 from grabber import Grabber
 
 
 class BBCBusiness(Grabber):
     feed_url = "http://feeds.bbci.co.uk/news/business/rss.xml"
     date_format = "%a, %d %b %Y %H:%M:%S %Z"
-
-    def parse(self, feed):
-        articles = []
-        for article in feed.iter("item"):
-            try:
-                # Fri, 17 Jan 2020 19:09:40 GMT
-                timestamp = int(datetime.strptime(article.find("pubDate").text, self.date_format).timestamp())
-                title = article.find("title").text
-                description = article.find("description").text
-                articles.append((timestamp, title, description))
-            except AttributeError:
-                logging.error(f"Received non-parsable news article from {self.name}.")
-        return articles
diff --git a/sources/BBCTechnology.py b/sources/BBCTechnology.py
index 6d4137d..41f6abd 100644
--- a/sources/BBCTechnology.py
+++ b/sources/BBCTechnology.py
@@ -1,23 +1,6 @@
-import logging
-from datetime import datetime
 from grabber import Grabber
 
 
 class BBCTechnology(Grabber):
     feed_url = "http://feeds.bbci.co.uk/news/technology/rss.xml"
     date_format = "%a, %d %b %Y %H:%M:%S %Z"
-
-    def parse(self, feed):
-        articles = []
-        for article in feed.iter("item"):
-            try:
-                # Fri, 17 Jan 2020 19:09:40 GMT
-                timestamp = int(datetime.strptime(article.find("pubDate").text, self.date_format).timestamp())
-                title = article.find("title").text
-                description = article.find("description").text
-                articles.append((timestamp, title, description))
-            except AttributeError:
-                logging.error(f"Received non-parsable news article from {self.name}.")
-        return articles
-
-
diff --git a/sources/ReutersBusiness.py b/sources/ReutersBusiness.py
new file mode 100644
index 0000000..96f7479
--- /dev/null
+++ b/sources/ReutersBusiness.py
@@ -0,0 +1,5 @@
+from grabber import Grabber
+
+
+class ReutersBusiness(Grabber):  # Only sets the feed URL; date_format and parse() come from Grabber.
+    feed_url = "http://feeds.reuters.com/reuters/businessNews?format=xml"
diff --git a/sources/ReutersTechnology.py b/sources/ReutersTechnology.py
new file mode 100644
index 0000000..16f84bc
--- /dev/null
+++ b/sources/ReutersTechnology.py
@@ -0,0 +1,5 @@
+from grabber import Grabber
+
+
+class ReutersTechnology(Grabber):  # Only sets the feed URL; date_format and parse() come from Grabber.
+    feed_url = "http://feeds.reuters.com/reuters/technologyNews?format=xml"
diff --git a/sources/sources.txt b/sources/sources.txt
index 4124c7d..b637aff 100644
--- a/sources/sources.txt
+++ b/sources/sources.txt
@@ -1,11 +1,15 @@
 # http://feeds.arstechnica.com/arstechnica/index
 # http://feeds.bbci.co.uk/news/business/rss.xml
 # http://feeds.bbci.co.uk/news/technology/rss.xml
+# http://feeds.reuters.com/reuters/businessNews?format=xml
 # http://feeds.reuters.com/reuters/technologyNews?format=xml
+# http://rss.nytimes.com/services/xml/rss/nyt/Business.xml
 # http://rss.nytimes.com/services/xml/rss/nyt/Technology.xml
+# http://www.forbes.com/business/feed/
 # http://www.forbes.com/technology/feed/
 # http://www.ft.com/rss/home/us
 # http://www.macworld.com/index.rss
 # http://www.wired.com/feed
 # https://www.engadget.com/rss.xml
+# https://www.huffpost.com/section/business/feed
 # https://www.huffpost.com/section/technology/feed
\ No newline at end of file
diff --git a/storage.db b/storage.db
deleted file mode 100644
index e69de29..0000000