Merge pull request 'develop' (#3) from develop into master
Reviewed-on: #3
Commit: 6096bdbde4
.gitignore (vendored): 1 addition
@@ -90,6 +90,7 @@ celerybeat-schedule
 # Environments
 .env
 .venv
+.idea
 env/
 venv/
 ENV/
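The single addition, .idea, keeps JetBrains IDE project metadata out of version control.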
Pipfile: 12 additions (new file)
@@ -0,0 +1,12 @@
[[source]]
name = "pypi"
url = "https://pypi.org/simple"
verify_ssl = true

[dev-packages]

[packages]
requests = "*"

[requires]
python_version = "3.7"
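The Pipfile pins Python 3.7 and declares requests as the only direct dependency; certifi, chardet, idna, and urllib3 in the lock file below are pulled in transitively by requests. As a usage note (standard pipenv commands, not part of the PR), pipenv install recreates this environment and pipenv run python run.py starts the grabber.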
Pipfile.lock (generated): 57 additions (new file)
@@ -0,0 +1,57 @@
{
    "_meta": {
        "hash": {
            "sha256": "bb57e0d7853b45999e47c163c46b95bc2fde31c527d8d7b5b5539dc979444a6d"
        },
        "pipfile-spec": 6,
        "requires": {
            "python_version": "3.7"
        },
        "sources": [
            {
                "name": "pypi",
                "url": "https://pypi.org/simple",
                "verify_ssl": true
            }
        ]
    },
    "default": {
        "certifi": {
            "hashes": [
                "sha256:017c25db2a153ce562900032d5bc68e9f191e44e9a0f762f373977de9df1fbb3",
                "sha256:25b64c7da4cd7479594d035c08c2d809eb4aab3a26e5a990ea98cc450c320f1f"
            ],
            "version": "==2019.11.28"
        },
        "chardet": {
            "hashes": [
                "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae",
                "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"
            ],
            "version": "==3.0.4"
        },
        "idna": {
            "hashes": [
                "sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407",
                "sha256:ea8b7f6188e6fa117537c3df7da9fc686d485087abf6ac197f9c46432f7e4a3c"
            ],
            "version": "==2.8"
        },
        "requests": {
            "hashes": [
                "sha256:11e007a8a2aa0323f5a921e9e6a2d7e4e67d9877e85773fba9ba6419025cbeb4",
                "sha256:9cf5292fcd0f598c671cfc1e0d7d1a7f13bb8085e9a590f48c010551dc6c4b31"
            ],
            "index": "pypi",
            "version": "==2.22.0"
        },
        "urllib3": {
            "hashes": [
                "sha256:a8a318824cc77d1fd4b2bec2ded92646630d7fe8619497b142c84a9e6f5a7293",
                "sha256:f3c5fd51747d450d4dcf6f923c81f78f811aab8205fda64b0aba34a4e48b0745"
            ],
            "version": "==1.25.7"
        }
    },
    "develop": {}
}
grabber.py: 90 additions (new file)
@@ -0,0 +1,90 @@
import logging
import sqlite3
import requests
import xml.etree.ElementTree as et
from abc import abstractmethod, ABC
from datetime import datetime
from requests.exceptions import HTTPError


class Grabber(ABC):
    articles = []
    date_format = "%a, %d %b %Y %H:%M:%S %z"  # RFC 822 pubDate with numeric offset; sources may override
    db = None
    name = ""

    @property
    @abstractmethod
    def feed_url(self):
        # Every concrete source must supply its RSS feed URL.
        raise NotImplementedError

    def __init__(self, db):
        self.db = db
        self.name = self.__class__.__name__  # the class name doubles as the table name
        self.setup_tables()
        self.articles = self.restore()

    def setup_tables(self):
        try:
            cur = self.db.cursor()
            cur.execute(f"CREATE TABLE IF NOT EXISTS {self.name} "
                        f"(id INTEGER PRIMARY KEY AUTOINCREMENT, timestamp TIMESTAMP, title VARCHAR, description VARCHAR)")
            self.db.commit()
        except sqlite3.Error:
            logging.error(f"Could not create table in database for {self.name}.")

    def store(self, articles):
        try:
            cur = self.db.cursor()
            cur.executemany(f"INSERT INTO {self.name} (timestamp, title, description) "
                            f"VALUES (?,?,?)", articles)
            self.db.commit()
        except sqlite3.Error:
            logging.error(f"Could not store updated news articles from {self.name}.")

    def restore(self):
        articles = []
        try:
            cur = self.db.cursor()
            cur.execute(f"SELECT timestamp, title, description from {self.name}")
            articles = cur.fetchall()
            logging.info(f"Restored {len(articles)} news articles from database for {self.name}.")
        except sqlite3.Error:
            logging.error(f"Could not restore news articles from database for {self.name}.")
        finally:
            return articles

    def request(self):
        response = requests.get(self.feed_url, headers={"User-Agent": "JackNet"})
        response.raise_for_status()
        return response.content

    def parse(self, feed):
        articles = []
        for article in feed.iter("item"):
            try:
                timestamp = int(datetime.strptime(article.find("pubDate").text, self.date_format).timestamp())
                title = article.find("title").text
                description = article.find("description").text
                articles.append((timestamp, title, description))
            except AttributeError:
                logging.error(f"Received non-parsable news article from {self.name}.")
        return articles

    def process(self, articles, new_articles):
        # An article counts as new if the previous fetch did not contain it.
        delta_articles = [article for article in new_articles if article not in articles]
        if delta_articles:
            logging.info(f"Received {len(delta_articles)} new news articles from {self.name}.")
        return delta_articles

    def grab(self):
        try:
            feed = et.fromstring(self.request())
            new_articles = self.parse(feed)
            delta_articles = self.process(self.articles, new_articles)
            self.store(delta_articles)
            self.articles = new_articles
        except HTTPError:
            logging.error(f"Unable to download updated news articles from {self.name}.")
        except (et.ParseError, ValueError):
            logging.error(f"Unable to parse updated news articles from {self.name}.")
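Every concrete source subclasses Grabber and assigns feed_url as a plain class attribute, which satisfies the abstract property. A minimal sketch of that wiring, with a hypothetical ExampleFeed class and feed URL that are not part of the PR:

import sqlite3

from grabber import Grabber


class ExampleFeed(Grabber):
    # Hypothetical source; the real ones live under sources/.
    feed_url = "https://example.com/rss.xml"


db = sqlite3.connect(":memory:")  # throwaway database for the sketch
feed = ExampleFeed(db)            # creates an ExampleFeed table and restores prior rows
feed.grab()                       # one cycle: download, parse, diff, store

Because each table is named after the subclass, every source keeps its own article history in the shared database.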
run.py: 77 additions (new file)
@@ -0,0 +1,77 @@
import logging
import sqlite3
import time

from pathlib import Path

from sources.ArsTechnica import ArsTechnica
from sources.BBCBusiness import BBCBusiness
from sources.BBCTechnology import BBCTechnology
from sources.Engadget import Engadget
from sources.FinancialTimes import FinancialTimes
from sources.ForbesBusiness import ForbesBusiness
from sources.ForbesFinance import ForbesFinance
from sources.ForbesTechnology import ForbesTechnology
from sources.HuffingtonPostBusiness import HuffingtonPostBusiness
from sources.HuffingtonPostEconomy import HuffingtonPostEconomy
from sources.HuffingtonPostTechnology import HuffingtonPostTechnology
from sources.IBTimesCompanies import IBTimesCompanies
from sources.IBTimesTechnology import IBTimesTechnology
from sources.MacWorld import MacWorld
from sources.NYTBusiness import NYTBusiness
from sources.NYTEconomy import NYTEconomy
from sources.NYTTechnology import NYTTechnology
from sources.ReutersBusiness import ReutersBusiness
from sources.ReutersTechnology import ReutersTechnology
from sources.TheEconomistBusiness import TheEconomistBusiness
from sources.TheEconomistFinance import TheEconomistFinance
from sources.TheEconomistTechnology import TheEconomistTechnology
from sources.Wired import Wired


DATABASE_PATH = Path("storage.db")
GRAB_FREQUENCY = 60  # seconds between polls of any single feed


def main():
    try:
        db = sqlite3.connect(DATABASE_PATH)
        if not db:
            raise sqlite3.DatabaseError

        grabbers = [
            ArsTechnica(db),
            BBCBusiness(db),
            BBCTechnology(db),
            Engadget(db),
            FinancialTimes(db),
            ForbesBusiness(db),
            ForbesFinance(db),
            ForbesTechnology(db),
            HuffingtonPostBusiness(db),
            HuffingtonPostEconomy(db),
            HuffingtonPostTechnology(db),
            IBTimesCompanies(db),
            IBTimesTechnology(db),
            MacWorld(db),
            NYTBusiness(db),
            NYTEconomy(db),
            NYTTechnology(db),
            ReutersBusiness(db),
            ReutersTechnology(db),
            TheEconomistBusiness(db),
            TheEconomistFinance(db),
            TheEconomistTechnology(db),
            Wired(db),
        ]

        while True:
            for grabber in grabbers:
                grabber.grab()
                # Spread requests evenly so each feed is polled once per GRAB_FREQUENCY seconds.
                time.sleep(GRAB_FREQUENCY / len(grabbers))

    except sqlite3.Error:
        logging.error("Could not connect to database.")
        exit(-1)


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO,
                        format="%(asctime)s %(levelname)-8s %(message)s",
                        datefmt="%Y-%m-%d %H:%M:%S")
    main()
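Dividing GRAB_FREQUENCY by the number of grabbers staggers the requests, so one full pass over all feeds takes roughly GRAB_FREQUENCY seconds. A hedged sketch (not part of the PR) of inspecting what has accumulated in storage.db, using the Wired table that Grabber.setup_tables names after the class:

import sqlite3

db = sqlite3.connect("storage.db")
rows = db.execute("SELECT timestamp, title FROM Wired ORDER BY timestamp DESC LIMIT 5")
for timestamp, title in rows:
    print(timestamp, title)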
sources/ArsTechnica.py: 5 additions (new file)

@@ -0,0 +1,5 @@
from grabber import Grabber


class ArsTechnica(Grabber):
    feed_url = "http://feeds.arstechnica.com/arstechnica/index"
sources/BBCBusiness.py: 6 additions (new file)

@@ -0,0 +1,6 @@
from grabber import Grabber


class BBCBusiness(Grabber):
    feed_url = "http://feeds.bbci.co.uk/news/business/rss.xml"
    date_format = "%a, %d %b %Y %H:%M:%S %Z"
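The BBC feeds (and the Financial Times below) stamp pubDate with a named time zone such as GMT rather than a numeric offset, so they override the default date_format with %Z; the IBTimes sources further down use an ISO 8601 style format instead.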
sources/BBCTechnology.py: 6 additions (new file)

@@ -0,0 +1,6 @@
from grabber import Grabber


class BBCTechnology(Grabber):
    feed_url = "http://feeds.bbci.co.uk/news/technology/rss.xml"
    date_format = "%a, %d %b %Y %H:%M:%S %Z"
sources/Engadget.py: 5 additions (new file)

@@ -0,0 +1,5 @@
from grabber import Grabber


class Engadget(Grabber):
    feed_url = "https://www.engadget.com/rss.xml"
sources/FinancialTimes.py: 6 additions (new file)

@@ -0,0 +1,6 @@
from grabber import Grabber


class FinancialTimes(Grabber):
    feed_url = "https://www.ft.com/?format=rss&edition=international"
    date_format = "%a, %d %b %Y %H:%M:%S %Z"
sources/ForbesBusiness.py: 5 additions (new file)

@@ -0,0 +1,5 @@
from grabber import Grabber


class ForbesBusiness(Grabber):
    feed_url = "http://www.forbes.com/business/feed/"
sources/ForbesFinance.py: 5 additions (new file)

@@ -0,0 +1,5 @@
from grabber import Grabber


class ForbesFinance(Grabber):
    feed_url = "http://www.forbes.com/finance/feed/"
sources/ForbesTechnology.py: 5 additions (new file)

@@ -0,0 +1,5 @@
from grabber import Grabber


class ForbesTechnology(Grabber):
    feed_url = "http://www.forbes.com/technology/feed/"
sources/HuffingtonPostBusiness.py: 5 additions (new file)

@@ -0,0 +1,5 @@
from grabber import Grabber


class HuffingtonPostBusiness(Grabber):
    feed_url = "https://www.huffpost.com/section/business/feed"
sources/HuffingtonPostEconomy.py: 5 additions (new file)

@@ -0,0 +1,5 @@
from grabber import Grabber


class HuffingtonPostEconomy(Grabber):
    feed_url = "https://www.huffpost.com/section/economy/feed"
sources/HuffingtonPostTechnology.py: 5 additions (new file)

@@ -0,0 +1,5 @@
from grabber import Grabber


class HuffingtonPostTechnology(Grabber):
    feed_url = "https://www.huffpost.com/section/technology/feed"
sources/IBTimesCompanies.py: 6 additions (new file)

@@ -0,0 +1,6 @@
from grabber import Grabber


class IBTimesCompanies(Grabber):
    feed_url = "http://www.ibtimes.co.uk/rss/companies"
    date_format = "%Y-%m-%dT%H:%M:%S%z"
sources/IBTimesTechnology.py: 6 additions (new file)

@@ -0,0 +1,6 @@
from grabber import Grabber


class IBTimesTechnology(Grabber):
    feed_url = "http://www.ibtimes.co.uk/rss/technology"
    date_format = "%Y-%m-%dT%H:%M:%S%z"
sources/MacWorld.py: 5 additions (new file)

@@ -0,0 +1,5 @@
from grabber import Grabber


class MacWorld(Grabber):
    feed_url = "http://www.macworld.com/index.rss"
sources/NYTBusiness.py: 5 additions (new file)

@@ -0,0 +1,5 @@
from grabber import Grabber


class NYTBusiness(Grabber):
    feed_url = "http://rss.nytimes.com/services/xml/rss/nyt/Business.xml"
sources/NYTEconomy.py: 5 additions (new file)

@@ -0,0 +1,5 @@
from grabber import Grabber


class NYTEconomy(Grabber):
    feed_url = "http://rss.nytimes.com/services/xml/rss/nyt/Economy.xml"
sources/NYTTechnology.py: 5 additions (new file)

@@ -0,0 +1,5 @@
from grabber import Grabber


class NYTTechnology(Grabber):
    feed_url = "http://rss.nytimes.com/services/xml/rss/nyt/Technology.xml"
sources/ReutersBusiness.py: 5 additions (new file)

@@ -0,0 +1,5 @@
from grabber import Grabber


class ReutersBusiness(Grabber):
    feed_url = "http://feeds.reuters.com/reuters/businessNews?format=xml"
sources/ReutersTechnology.py: 5 additions (new file)

@@ -0,0 +1,5 @@
from grabber import Grabber


class ReutersTechnology(Grabber):
    feed_url = "http://feeds.reuters.com/reuters/technologyNews?format=xml"
sources/TheEconomistBusiness.py: 5 additions (new file)

@@ -0,0 +1,5 @@
from grabber import Grabber


class TheEconomistBusiness(Grabber):
    feed_url = "https://www.economist.com/business/rss.xml"
sources/TheEconomistFinance.py: 5 additions (new file)

@@ -0,0 +1,5 @@
from grabber import Grabber


class TheEconomistFinance(Grabber):
    feed_url = "https://www.economist.com/finance-and-economics/rss.xml"
sources/TheEconomistTechnology.py: 5 additions (new file)

@@ -0,0 +1,5 @@
from grabber import Grabber


class TheEconomistTechnology(Grabber):
    feed_url = "https://www.economist.com/science-and-technology/rss.xml"
sources/Wired.py: 5 additions (new file)

@@ -0,0 +1,5 @@
from grabber import Grabber


class Wired(Grabber):
    feed_url = "http://www.wired.com/feed"