1
0
Fork 0

Merge pull request 'develop' (#3) from develop into master

Reviewed-on: #3
This commit is contained in:
Jack Hadrill 2020-09-08 11:11:19 +00:00
commit 6096bdbde4
28 changed files with 357 additions and 0 deletions

1
.gitignore vendored
View File

@ -90,6 +90,7 @@ celerybeat-schedule
# Environments
.env
.venv
.idea
env/
venv/
ENV/

12
Pipfile Normal file
View File

@ -0,0 +1,12 @@
[[source]]
name = "pypi"
url = "https://pypi.org/simple"
verify_ssl = true
[dev-packages]
[packages]
requests = "*"
[requires]
python_version = "3.7"

57
Pipfile.lock generated Normal file
View File

@ -0,0 +1,57 @@
{
"_meta": {
"hash": {
"sha256": "bb57e0d7853b45999e47c163c46b95bc2fde31c527d8d7b5b5539dc979444a6d"
},
"pipfile-spec": 6,
"requires": {
"python_version": "3.7"
},
"sources": [
{
"name": "pypi",
"url": "https://pypi.org/simple",
"verify_ssl": true
}
]
},
"default": {
"certifi": {
"hashes": [
"sha256:017c25db2a153ce562900032d5bc68e9f191e44e9a0f762f373977de9df1fbb3",
"sha256:25b64c7da4cd7479594d035c08c2d809eb4aab3a26e5a990ea98cc450c320f1f"
],
"version": "==2019.11.28"
},
"chardet": {
"hashes": [
"sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae",
"sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"
],
"version": "==3.0.4"
},
"idna": {
"hashes": [
"sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407",
"sha256:ea8b7f6188e6fa117537c3df7da9fc686d485087abf6ac197f9c46432f7e4a3c"
],
"version": "==2.8"
},
"requests": {
"hashes": [
"sha256:11e007a8a2aa0323f5a921e9e6a2d7e4e67d9877e85773fba9ba6419025cbeb4",
"sha256:9cf5292fcd0f598c671cfc1e0d7d1a7f13bb8085e9a590f48c010551dc6c4b31"
],
"index": "pypi",
"version": "==2.22.0"
},
"urllib3": {
"hashes": [
"sha256:a8a318824cc77d1fd4b2bec2ded92646630d7fe8619497b142c84a9e6f5a7293",
"sha256:f3c5fd51747d450d4dcf6f923c81f78f811aab8205fda64b0aba34a4e48b0745"
],
"version": "==1.25.7"
}
},
"develop": {}
}

90
grabber.py Normal file
View File

@ -0,0 +1,90 @@
import logging
import sqlite3
import requests
import xml.etree.ElementTree as et
from abc import abstractmethod, ABC
from datetime import datetime
from requests.exceptions import HTTPError
class Grabber(ABC):
    """Abstract base class for RSS news-feed grabbers.

    A concrete subclass supplies ``feed_url`` (and optionally overrides
    ``date_format``). Each grabber owns one table in the shared SQLite
    database, named after the subclass, holding (timestamp, title,
    description) rows for every article seen so far.
    """

    # Default RSS <pubDate> format (RFC 822 style, numeric offset).
    # Subclasses override this when their feed uses a different format.
    date_format = "%a, %d %b %Y %H:%M:%S %z"

    @property
    @abstractmethod
    def feed_url(self):
        """URL of the RSS feed to poll. Must be provided by subclasses."""
        raise NotImplementedError

    def __init__(self, db):
        """Bind to *db* (an open sqlite3 connection), create this
        grabber's table if needed, and preload previously seen articles.
        """
        # Instance state is set here rather than as class attributes so a
        # mutable list is never shared between grabber instances.
        self.db = db
        self.name = self.__class__.__name__
        self.setup_tables()
        self.articles = self.restore()

    def setup_tables(self):
        """Create this grabber's article table if it does not exist."""
        try:
            cur = self.db.cursor()
            # Table name comes from the class name, not external input.
            cur.execute(f"CREATE TABLE IF NOT EXISTS {self.name} "
                        f"(id INTEGER PRIMARY KEY AUTOINCREMENT, timestamp TIMESTAMP, title VARCHAR, description VARCHAR)")
            self.db.commit()
        except sqlite3.Error:
            logging.error(f"Could not create table in database for {self.name}.")

    def store(self, articles):
        """Persist an iterable of (timestamp, title, description) rows."""
        try:
            cur = self.db.cursor()
            cur.executemany(f"INSERT INTO {self.name} (timestamp, title, description) "
                            f"VALUES (?,?,?)", articles)
            self.db.commit()
        except sqlite3.Error:
            logging.error(f"Could not store updated news articles from {self.name}.")

    def restore(self):
        """Load previously stored articles; return [] on database error."""
        articles = []
        try:
            cur = self.db.cursor()
            cur.execute(f"SELECT timestamp, title, description from {self.name}")
            articles = cur.fetchall()
            logging.info(f"Restored {len(articles)} news articles from database for {self.name}.")
        except sqlite3.Error:
            logging.error(f"Could not restore news articles from database for {self.name}.")
        # Plain return (not `finally: return`) so unexpected exceptions
        # are no longer silently swallowed.
        return articles

    def request(self):
        """Download the raw feed body; raises HTTPError on bad status."""
        response = requests.get(self.feed_url, headers={"User-Agent": "JackNet"})
        response.raise_for_status()
        return response.content

    def parse(self, feed):
        """Extract (timestamp, title, description) tuples from a parsed
        XML tree, skipping items with missing fields."""
        articles = []
        for article in feed.iter("item"):
            try:
                timestamp = int(datetime.strptime(article.find("pubDate").text, self.date_format).timestamp())
                title = article.find("title").text
                description = article.find("description").text
                articles.append((timestamp, title, description))
            except AttributeError:
                # find() returned None for a required element.
                logging.error(f"Received non-parsable news article from {self.name}.")
        return articles

    def process(self, articles, new_articles):
        """Return the articles in *new_articles* not already in *articles*,
        preserving feed order."""
        # Set membership keeps this linear; rows are hashable tuples.
        known = set(articles)
        delta_articles = [article for article in new_articles if article not in known]
        if delta_articles:
            logging.info(f"Received {len(delta_articles)} new news articles from {self.name}.")
        return delta_articles

    def grab(self):
        """Fetch the feed once, store any unseen articles, and remember
        the current feed contents for the next delta."""
        try:
            feed = et.fromstring(self.request())
            new_articles = self.parse(feed)
            delta_articles = self.process(self.articles, new_articles)
            self.store(delta_articles)
            self.articles = new_articles
        except HTTPError:
            logging.error(f"Unable to download updated news articles from {self.name}.")
        except (et.ParseError, ValueError):
            logging.error(f"Unable to parse updated news articles from {self.name}.")

77
run.py Normal file
View File

@ -0,0 +1,77 @@
import logging
import sqlite3
import time
from pathlib import Path
from sources.ArsTechnica import ArsTechnica
from sources.BBCBusiness import BBCBusiness
from sources.BBCTechnology import BBCTechnology
from sources.Engadget import Engadget
from sources.FinancialTimes import FinancialTimes
from sources.ForbesBusiness import ForbesBusiness
from sources.ForbesFinance import ForbesFinance
from sources.ForbesTechnology import ForbesTechnology
from sources.HuffingtonPostBusiness import HuffingtonPostBusiness
from sources.HuffingtonPostEconomy import HuffingtonPostEconomy
from sources.HuffingtonPostTechnology import HuffingtonPostTechnology
from sources.IBTimesCompanies import IBTimesCompanies
from sources.IBTimesTechnology import IBTimesTechnology
from sources.MacWorld import MacWorld
from sources.NYTBusiness import NYTBusiness
from sources.NYTEconomy import NYTEconomy
from sources.NYTTechnology import NYTTechnology
from sources.ReutersBusiness import ReutersBusiness
from sources.ReutersTechnology import ReutersTechnology
from sources.TheEconomistBusiness import TheEconomistBusiness
from sources.TheEconomistFinance import TheEconomistFinance
from sources.TheEconomistTechnology import TheEconomistTechnology
from sources.Wired import Wired
DATABASE_PATH = Path("storage.db")
# Seconds for one full polling cycle across every grabber.
GRAB_FREQUENCY = 60


def main():
    """Open the article database, build one grabber per source, and poll
    every feed forever.

    Sleeps between grabs so one pass over all grabbers takes roughly
    GRAB_FREQUENCY seconds. Exits with a non-zero status if the database
    cannot be opened.
    """
    # Keep the try narrow: only a connection failure should produce the
    # "Could not connect" message. (sqlite3.connect raises on failure;
    # it never returns a falsy connection, so no extra check is needed.)
    try:
        db = sqlite3.connect(DATABASE_PATH)
    except sqlite3.Error:
        logging.error("Could not connect to database.")
        exit(-1)
    grabbers = [
        ArsTechnica(db),
        BBCBusiness(db),
        BBCTechnology(db),
        Engadget(db),
        FinancialTimes(db),
        ForbesBusiness(db),
        ForbesFinance(db),
        ForbesTechnology(db),
        HuffingtonPostBusiness(db),
        HuffingtonPostEconomy(db),
        HuffingtonPostTechnology(db),
        IBTimesCompanies(db),
        IBTimesTechnology(db),
        MacWorld(db),
        NYTBusiness(db),
        NYTEconomy(db),
        NYTTechnology(db),
        ReutersBusiness(db),
        ReutersTechnology(db),
        TheEconomistBusiness(db),
        TheEconomistFinance(db),
        TheEconomistTechnology(db),
        Wired(db),
    ]
    while True:
        for grabber in grabbers:
            grabber.grab()
            # Space grabs evenly across the cycle.
            time.sleep(GRAB_FREQUENCY / len(grabbers))


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO,
                        format="%(asctime)s %(levelname)-8s %(message)s",
                        datefmt="%Y-%m-%d %H:%M:%S")
    main()

5
sources/ArsTechnica.py Normal file
View File

@ -0,0 +1,5 @@
from grabber import Grabber
class ArsTechnica(Grabber):
    """Grabber for the Ars Technica front-page RSS feed."""

    feed_url = "http://feeds.arstechnica.com/arstechnica/index"

6
sources/BBCBusiness.py Normal file
View File

@ -0,0 +1,6 @@
from grabber import Grabber
class BBCBusiness(Grabber):
    """Grabber for the BBC News business RSS feed."""

    feed_url = "http://feeds.bbci.co.uk/news/business/rss.xml"
    # Overrides the base format: this feed's pubDate ends in a timezone
    # name (%Z) rather than a numeric offset (%z).
    # NOTE(review): strptime's %Z only matches a few names (GMT/UTC) — confirm.
    date_format = "%a, %d %b %Y %H:%M:%S %Z"

6
sources/BBCTechnology.py Normal file
View File

@ -0,0 +1,6 @@
from grabber import Grabber
class BBCTechnology(Grabber):
    """Grabber for the BBC News technology RSS feed."""

    feed_url = "http://feeds.bbci.co.uk/news/technology/rss.xml"
    # Overrides the base format: timezone name (%Z) instead of offset (%z).
    # NOTE(review): strptime's %Z only matches a few names (GMT/UTC) — confirm.
    date_format = "%a, %d %b %Y %H:%M:%S %Z"

5
sources/Engadget.py Normal file
View File

@ -0,0 +1,5 @@
from grabber import Grabber
class Engadget(Grabber):
    """Grabber for the Engadget RSS feed."""

    feed_url = "https://www.engadget.com/rss.xml"

View File

@ -0,0 +1,6 @@
from grabber import Grabber
class FinancialTimes(Grabber):
    """Grabber for the Financial Times international edition RSS feed."""

    feed_url = "https://www.ft.com/?format=rss&edition=international"
    # Overrides the base format: timezone name (%Z) instead of offset (%z).
    date_format = "%a, %d %b %Y %H:%M:%S %Z"

View File

@ -0,0 +1,5 @@
from grabber import Grabber
class ForbesBusiness(Grabber):
    """Grabber for the Forbes business RSS feed."""

    feed_url = "http://www.forbes.com/business/feed/"

5
sources/ForbesFinance.py Normal file
View File

@ -0,0 +1,5 @@
from grabber import Grabber
class ForbesFinance(Grabber):
    """Grabber for the Forbes finance RSS feed."""

    feed_url = "http://www.forbes.com/finance/feed/"

View File

@ -0,0 +1,5 @@
from grabber import Grabber
class ForbesTechnology(Grabber):
    """Grabber for the Forbes technology RSS feed."""

    feed_url = "http://www.forbes.com/technology/feed/"

View File

@ -0,0 +1,5 @@
from grabber import Grabber
class HuffingtonPostBusiness(Grabber):
    """Grabber for the HuffPost business section RSS feed."""

    feed_url = "https://www.huffpost.com/section/business/feed"

View File

@ -0,0 +1,5 @@
from grabber import Grabber
class HuffingtonPostEconomy(Grabber):
    """Grabber for the HuffPost economy section RSS feed."""

    feed_url = "https://www.huffpost.com/section/economy/feed"

View File

@ -0,0 +1,5 @@
from grabber import Grabber
class HuffingtonPostTechnology(Grabber):
    """Grabber for the HuffPost technology section RSS feed."""

    feed_url = "https://www.huffpost.com/section/technology/feed"

View File

@ -0,0 +1,6 @@
from grabber import Grabber
class IBTimesCompanies(Grabber):
    """Grabber for the International Business Times companies RSS feed."""

    feed_url = "http://www.ibtimes.co.uk/rss/companies"
    # Overrides the base format: this feed uses ISO 8601 dates.
    date_format = "%Y-%m-%dT%H:%M:%S%z"

View File

@ -0,0 +1,6 @@
from grabber import Grabber
class IBTimesTechnology(Grabber):
    """Grabber for the International Business Times technology RSS feed."""

    feed_url = "http://www.ibtimes.co.uk/rss/technology"
    # Overrides the base format: this feed uses ISO 8601 dates.
    date_format = "%Y-%m-%dT%H:%M:%S%z"

5
sources/MacWorld.py Normal file
View File

@ -0,0 +1,5 @@
from grabber import Grabber
class MacWorld(Grabber):
    """Grabber for the Macworld RSS feed."""

    feed_url = "http://www.macworld.com/index.rss"

5
sources/NYTBusiness.py Normal file
View File

@ -0,0 +1,5 @@
from grabber import Grabber
class NYTBusiness(Grabber):
    """Grabber for the New York Times business RSS feed."""

    feed_url = "http://rss.nytimes.com/services/xml/rss/nyt/Business.xml"

5
sources/NYTEconomy.py Normal file
View File

@ -0,0 +1,5 @@
from grabber import Grabber
class NYTEconomy(Grabber):
    """Grabber for the New York Times economy RSS feed."""

    feed_url = "http://rss.nytimes.com/services/xml/rss/nyt/Economy.xml"

5
sources/NYTTechnology.py Normal file
View File

@ -0,0 +1,5 @@
from grabber import Grabber
class NYTTechnology(Grabber):
    """Grabber for the New York Times technology RSS feed."""

    feed_url = "http://rss.nytimes.com/services/xml/rss/nyt/Technology.xml"

View File

@ -0,0 +1,5 @@
from grabber import Grabber
class ReutersBusiness(Grabber):
    """Grabber for the Reuters business news RSS feed."""

    feed_url = "http://feeds.reuters.com/reuters/businessNews?format=xml"

View File

@ -0,0 +1,5 @@
from grabber import Grabber
class ReutersTechnology(Grabber):
    """Grabber for the Reuters technology news RSS feed."""

    feed_url = "http://feeds.reuters.com/reuters/technologyNews?format=xml"

View File

@ -0,0 +1,5 @@
from grabber import Grabber
class TheEconomistBusiness(Grabber):
    """Grabber for The Economist business RSS feed."""

    feed_url = "https://www.economist.com/business/rss.xml"

View File

@ -0,0 +1,5 @@
from grabber import Grabber
class TheEconomistFinance(Grabber):
    """Grabber for The Economist finance and economics RSS feed."""

    feed_url = "https://www.economist.com/finance-and-economics/rss.xml"

View File

@ -0,0 +1,5 @@
from grabber import Grabber
class TheEconomistTechnology(Grabber):
    """Grabber for The Economist science and technology RSS feed."""

    feed_url = "https://www.economist.com/science-and-technology/rss.xml"

5
sources/Wired.py Normal file
View File

@ -0,0 +1,5 @@
from grabber import Grabber
class Wired(Grabber):
    """Grabber for the Wired RSS feed."""

    feed_url = "http://www.wired.com/feed"