First attempt.

Jack Hadrill 2020-01-18 15:34:40 +00:00
parent 55dd70e727
commit cd876e3a20
7 changed files with 154 additions and 0 deletions

Pipfile (new file, +11 lines)

@@ -0,0 +1,11 @@
[[source]]
name = "pypi"
url = "https://pypi.org/simple"
verify_ssl = true

[dev-packages]

[packages]
requests = "*"

[requires]
python_version = "3.7"
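With this Pipfile, the standard pipenv workflow applies: pipenv install creates a Python 3.7 virtualenv and resolves [packages], and pipenv run python run.py starts the scraper added below.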

grabber.py (new file, +47 lines)

@@ -0,0 +1,47 @@
import logging
import requests
from abc import ABC, abstractmethod
from threading import Timer


class Grabber(ABC):
    db = None
    _interval = 60
    _running = False

    def __init__(self, db, interval):
        self.db = db
        self._interval = interval

    @property
    @abstractmethod
    def feed_url(self):
        # Subclasses may satisfy this with a plain class attribute.
        raise NotImplementedError

    @abstractmethod
    def grab(self):
        raise NotImplementedError

    def request(self):
        # Fetch the raw feed body, raising on a non-2xx response.
        response = requests.get(self.feed_url)
        response.raise_for_status()
        return response.content

    def timer(self):
        # Poll once, then re-schedule. stop() clears _running, so the
        # next pending tick falls through without re-scheduling.
        if self._running:
            self.grab()
            Timer(self._interval, self.timer).start()

    def start(self):
        if not self._running:
            self._running = True
            self.timer()
        else:
            logging.error(f"Grabber for {self.__class__.__name__} already started.")

    def stop(self):
        if self._running:
            self._running = False
        else:
            logging.error(f"Grabber for {self.__class__.__name__} already stopped.")

run.py (new file, +36 lines)

@@ -0,0 +1,36 @@
import logging
import sqlite3
from pathlib import Path

from sources.ArsTechnica import ArsTechnica
from sources.BBCBusiness import BBCBusiness

DATABASE_PATH = Path("storage.db")
SCRAPE_INTERVAL = 15


def configure_logging():
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)-8s %(message)s",
        datefmt="%Y-%m-%d %H:%M"
    )


def setup_database():
    # Note: the grabbers poll from Timer threads, and sqlite3
    # connections are thread-bound by default, so writes from grabbers
    # will need check_same_thread=False or per-thread connections.
    db = sqlite3.connect(DATABASE_PATH)
    return db


def main():
    db = setup_database()
    grabbers = [
        ArsTechnica(db, SCRAPE_INTERVAL),
        BBCBusiness(db, SCRAPE_INTERVAL)
    ]
    for grabber in grabbers:
        grabber.start()
    # The non-daemon Timer threads keep the process alive after main()
    # returns.


if __name__ == "__main__":
    configure_logging()
    main()
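setup_database only opens the connection; nothing in this commit creates a table for the (timestamp, title, description) tuples BBCBusiness builds. One possible shape for it, as a sketch (the articles table and its columns are assumptions, not part of this commit):

def setup_database():
    # check_same_thread=False because the grabbers poll from Timer
    # threads; concurrent writes would still need a lock or
    # per-thread connections.
    db = sqlite3.connect(DATABASE_PATH, check_same_thread=False)
    db.execute(
        """CREATE TABLE IF NOT EXISTS articles (
               timestamp   INTEGER,
               title       TEXT,
               description TEXT,
               UNIQUE (timestamp, title)
           )"""
    )
    db.commit()
    return db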

sources/ArsTechnica.py (new file, +10 lines)

@@ -0,0 +1,10 @@
from grabber import Grabber


class ArsTechnica(Grabber):
    feed_url = "http://feeds.arstechnica.com/arstechnica/index"

    def grab(self):
        # Not yet implemented.
        pass

sources/BBCBusiness.py (new file, +39 lines)

@@ -0,0 +1,39 @@
import logging
import xml.etree.ElementTree as et
from datetime import datetime, timezone

from grabber import Grabber
from requests.exceptions import HTTPError


class BBCBusiness(Grabber):
    feed_url = "http://feeds.bbci.co.uk/news/business/rss.xml"

    def __init__(self, db, interval):
        super().__init__(db, interval)
        # Per-instance cache of seen articles; a class-level list would
        # be shared by every grabber instance.
        self.articles = []

    def grab(self):
        try:
            feed = et.fromstring(self.request())
            self.process(feed)
        except (HTTPError, et.ParseError):
            logging.error(f"Unable to get updated news from {self.__class__.__name__}.")

    def process(self, feed):
        for item in feed.iter("item"):
            article = self.parse(item)
            if article is not None and article not in self.articles:
                self.articles.append(article)
                logging.info(f"New article from {self.__class__.__name__} at {datetime.fromtimestamp(article[0])}")

    def parse(self, item):
        try:
            # e.g. "Fri, 17 Jan 2020 19:09:40 GMT"
            date = item.find("pubDate").text
            # The feed's dates are GMT, but strptime's %Z yields a naive
            # datetime; attach UTC so timestamp() does not treat the
            # value as local time.
            parsed = datetime.strptime(date, "%a, %d %b %Y %H:%M:%S %Z")
            timestamp = int(parsed.replace(tzinfo=timezone.utc).timestamp())
            title = item.find("title").text
            description = item.find("description").text
            return timestamp, title, description
        except AttributeError:
            logging.error(f"Received non-parsable news article from {self.__class__.__name__}.")
            return None
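strptime's %Z only matches a handful of zone names and produces a naive datetime. The stdlib already has a parser for RFC 822 dates like the feed's pubDate; a sketch of an alternative (a suggestion, not what this commit does):

from email.utils import parsedate_to_datetime

date = "Fri, 17 Jan 2020 19:09:40 GMT"  # sample pubDate from the feed
# parsedate_to_datetime returns a timezone-aware datetime, so
# timestamp() is unambiguous whatever the local timezone is.
timestamp = int(parsedate_to_datetime(date).timestamp())
print(timestamp)  # 1579288180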

sources/sources.txt (new file, +11 lines)

@@ -0,0 +1,11 @@
# http://feeds.arstechnica.com/arstechnica/index
# http://feeds.bbci.co.uk/news/business/rss.xml
# http://feeds.bbci.co.uk/news/technology/rss.xml
# http://feeds.reuters.com/reuters/technologyNews?format=xml
# http://rss.nytimes.com/services/xml/rss/nyt/Technology.xml
# http://www.forbes.com/technology/feed/
# http://www.ft.com/rss/home/us
# http://www.macworld.com/index.rss
# http://www.wired.com/feed
# https://www.engadget.com/rss.xml
# https://www.huffpost.com/section/technology/feed

storage.db (new file, 0 lines)