diff --git a/.gitignore b/.gitignore
index e61bca2..17bb22a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -90,6 +90,7 @@ celerybeat-schedule
 # Environments
 .env
 .venv
+.idea
 env/
 venv/
 ENV/
diff --git a/Pipfile b/Pipfile
index b723d01..c50dea4 100644
--- a/Pipfile
+++ b/Pipfile
@@ -6,6 +6,7 @@ verify_ssl = true
 [dev-packages]
 
 [packages]
+requests = "*"
 
 [requires]
 python_version = "3.7"
diff --git a/Pipfile.lock b/Pipfile.lock
new file mode 100644
index 0000000..71af2b8
--- /dev/null
+++ b/Pipfile.lock
@@ -0,0 +1,57 @@
+{
+    "_meta": {
+        "hash": {
+            "sha256": "bb57e0d7853b45999e47c163c46b95bc2fde31c527d8d7b5b5539dc979444a6d"
+        },
+        "pipfile-spec": 6,
+        "requires": {
+            "python_version": "3.7"
+        },
+        "sources": [
+            {
+                "name": "pypi",
+                "url": "https://pypi.org/simple",
+                "verify_ssl": true
+            }
+        ]
+    },
+    "default": {
+        "certifi": {
+            "hashes": [
+                "sha256:017c25db2a153ce562900032d5bc68e9f191e44e9a0f762f373977de9df1fbb3",
+                "sha256:25b64c7da4cd7479594d035c08c2d809eb4aab3a26e5a990ea98cc450c320f1f"
+            ],
+            "version": "==2019.11.28"
+        },
+        "chardet": {
+            "hashes": [
+                "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae",
+                "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"
+            ],
+            "version": "==3.0.4"
+        },
+        "idna": {
+            "hashes": [
+                "sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407",
+                "sha256:ea8b7f6188e6fa117537c3df7da9fc686d485087abf6ac197f9c46432f7e4a3c"
+            ],
+            "version": "==2.8"
+        },
+        "requests": {
+            "hashes": [
+                "sha256:11e007a8a2aa0323f5a921e9e6a2d7e4e67d9877e85773fba9ba6419025cbeb4",
+                "sha256:9cf5292fcd0f598c671cfc1e0d7d1a7f13bb8085e9a590f48c010551dc6c4b31"
+            ],
+            "index": "pypi",
+            "version": "==2.22.0"
+        },
+        "urllib3": {
+            "hashes": [
+                "sha256:a8a318824cc77d1fd4b2bec2ded92646630d7fe8619497b142c84a9e6f5a7293",
+                "sha256:f3c5fd51747d450d4dcf6f923c81f78f811aab8205fda64b0aba34a4e48b0745"
+            ],
+            "version": "==1.25.7"
+        }
+    },
+    "develop": {}
+}
diff --git a/grabber.py b/grabber.py
index 12059e7..76b2764 100644
--- a/grabber.py
+++ b/grabber.py
@@ -1,17 +1,15 @@
 import logging
+import sqlite3
 import requests
-from abc import *
-from threading import Timer
+import xml.etree.ElementTree as et
+from abc import abstractmethod, ABC
+from requests.exceptions import HTTPError
 
 
 class Grabber(ABC):
+    articles = []
     db = None
-    _interval = 60
-    _running = False
-
-    def __init__(self, db, interval):
-        self.db = db
-        self._interval = interval
+    name = ""
 
     @property
     @abstractmethod
@@ -19,29 +17,65 @@ class Grabber(ABC):
         raise NotImplementedError
 
     @abstractmethod
-    def grab(self):
+    def parse(self, feed):
         raise NotImplementedError
 
+    def __init__(self, db):
+        self.db = db
+        self.name = self.__class__.__name__
+        self.setup_tables()
+        self.articles = self.restore()
+
+    def setup_tables(self):
+        try:
+            cur = self.db.cursor()
+            cur.execute(f"CREATE TABLE IF NOT EXISTS {self.name} "
+                        f"(id INTEGER PRIMARY KEY AUTOINCREMENT, timestamp TIMESTAMP, title VARCHAR, description VARCHAR)")
+            self.db.commit()
+        except sqlite3.Error:
+            logging.error(f"Could not create table in database for {self.name}.")
+
+    def store(self, articles):
+        try:
+            cur = self.db.cursor()
+            cur.executemany(f"INSERT INTO {self.name} (timestamp, title, description) "
+                            f"VALUES (?,?,?)", articles)
+            self.db.commit()
+        except sqlite3.Error:
+            logging.error(f"Could not store updated news articles from {self.name}.")
+
+    def restore(self):
+        articles = []
+        try:
+            cur = self.db.cursor()
+            cur.execute(f"SELECT timestamp, title, description from {self.name}")
+            articles = cur.fetchall()
+            logging.info(f"Restored {len(articles)} news articles from database for {self.name}.")
+        except sqlite3.Error:
+            logging.error(f"Could not restore news articles from database for {self.name}.")
+        finally:
+            return articles
+
     def request(self):
         response = requests.get(self.feed_url)
         response.raise_for_status()
         return response.content
 
-    def timer(self):
-        if self._running:
-            self.grab()
-            Timer(self._interval, self.timer).start()
+    def process(self, articles, new_articles):
+        delta_articles = [article for article in new_articles if article not in articles]
+        if delta_articles:
+            logging.info(f"Received {len(delta_articles)} new news articles from {self.name}.")
+        return delta_articles
 
-    def start(self):
-        if not self._running:
-            self._running = True
-            self.timer()
-        else:
-            logging.error(f"Grabber for {self.__class__.__name__} already started.")
-
-    def stop(self):
-        if self._running:
-            self._running = False
-        else:
-            logging.error(f"Grabber for {self.__class__.__name__} already stopped.")
+    def grab(self):
+        try:
+            feed = et.fromstring(self.request())
+            new_articles = self.parse(feed)
+            delta_articles = self.process(self.articles, new_articles)
+            self.store(delta_articles)
+            self.articles = new_articles
+        except HTTPError:
+            logging.error(f"Unable to download updated news articles from {self.name}.")
+        except (et.ParseError, ValueError):
+            logging.error(f"Unable to parse updated news articles from {self.name}.")
diff --git a/run.py b/run.py
index 93ea248..551b8e2 100644
--- a/run.py
+++ b/run.py
@@ -1,36 +1,38 @@
 import logging
 import sqlite3
+import time
 from pathlib import Path
 from sources.ArsTechnica import ArsTechnica
 from sources.BBCBusiness import BBCBusiness
+from sources.BBCTechnology import BBCTechnology
 
 DATABASE_PATH = Path("storage.db")
-SCRAPE_INTERVAL = 15
-
-
-def configure_logging():
-    logging.basicConfig(
-        level=logging.INFO,
-        format="%(asctime)s %(levelname)-8s %(message)s",
-        datefmt="%Y-%m-%d %H:%M"
-    )
-
-
-def setup_database():
-    db = sqlite3.connect(DATABASE_PATH)
-    return db
+GRAB_FREQUENCY = 15
+GRAB_INTERVAL = 5
 
 
 def main():
-    db = setup_database()
-    grabbers = [
-        ArsTechnica(db, SCRAPE_INTERVAL),
-        BBCBusiness(db, SCRAPE_INTERVAL)
-    ]
-    for grabber in grabbers:
-        grabber.start()
+    try:
+        db = sqlite3.connect(DATABASE_PATH)
+        if not db:
+            raise sqlite3.DatabaseError
+
+        grabbers = [
+            ArsTechnica(db),
+            BBCBusiness(db),
+            BBCTechnology(db),
+        ]
+
+        while True:
+            for grabber in grabbers:
+                grabber.grab()
+                time.sleep(GRAB_FREQUENCY/GRAB_INTERVAL)
+
+    except sqlite3.Error:
+        logging.error("Could not connect to database.")
+        exit(-1)
 
 
 if __name__ == "__main__":
-    configure_logging()
+    logging.basicConfig(level=logging.INFO,format="%(asctime)s %(levelname)-8s %(message)s",datefmt="%Y-%m-%d %H:%M:%S")
     main()
diff --git a/sources/ArsTechnica.py b/sources/ArsTechnica.py
index ad53c92..08b68d2 100644
--- a/sources/ArsTechnica.py
+++ b/sources/ArsTechnica.py
@@ -1,10 +1,21 @@
+import logging
+from datetime import datetime
 from grabber import Grabber
 
 
 class ArsTechnica(Grabber):
     feed_url = "http://feeds.arstechnica.com/arstechnica/index"
+    date_format = "%a, %d %b %Y %H:%M:%S %z"
 
-    def grab(self):
-        pass
-
-
+    def parse(self, feed):
+        articles = []
+        for article in feed.iter("item"):
+            try:
+                # Sat, 18 Jan 2020 15:41:56 +0000
+                timestamp = int(datetime.strptime(article.find("pubDate").text, self.date_format).timestamp())
+                title = article.find("title").text
+                description = article.find("description").text
+                articles.append((timestamp, title, description))
+            except AttributeError:
+                logging.error(f"Received non-parsable news article from {self.name}.")
+        return articles
diff --git a/sources/BBCBusiness.py b/sources/BBCBusiness.py
index b9090a0..1f35b04 100644
--- a/sources/BBCBusiness.py
+++ b/sources/BBCBusiness.py
@@ -1,39 +1,21 @@
-import csv
 import logging
-import xml.etree.ElementTree as et
 from datetime import datetime
 from grabber import Grabber
-from requests.exceptions import HTTPError
 
 
 class BBCBusiness(Grabber):
-    articles = []
     feed_url = "http://feeds.bbci.co.uk/news/business/rss.xml"
+    date_format = "%a, %d %b %Y %H:%M:%S %Z"
 
-    def grab(self):
-        try:
-            feed = et.fromstring(self.request())
-            self.process(feed)
-        except (HTTPError, et.ParseError):
-            logging.error(f"Unable to get updated news from {self.__class__.__name__}.")
-
-    def process(self, feed):
-        for item in feed.iter("item"):
-            article = self.parse(item)
-            if article not in self.articles:
-                self.articles.append(article)
-                logging.info(f"New article from {self.__class__.__name__} at {datetime.fromtimestamp(article[0])}")
-
-    def parse(self, item):
-        article = None
-        try:
-            date = item.find("pubDate").text
-            # Fri, 17 Jan 2020 19:09:40 GMT
-            timestamp = int(datetime.strptime(date, "%a, %d %b %Y %H:%M:%S %Z").timestamp())
-            title = item.find("title").text
-            description = item.find("description").text
-            article = (timestamp, title, description)
-        except AttributeError:
-            logging.error(f"Received non-parsable news article from {self.__class__.__name__}.")
-        finally:
-            return article
+    def parse(self, feed):
+        articles = []
+        for article in feed.iter("item"):
+            try:
+                # Fri, 17 Jan 2020 19:09:40 GMT
+                timestamp = int(datetime.strptime(article.find("pubDate").text, self.date_format).timestamp())
+                title = article.find("title").text
+                description = article.find("description").text
+                articles.append((timestamp, title, description))
+            except AttributeError:
+                logging.error(f"Received non-parsable news article from {self.name}.")
+        return articles
diff --git a/sources/BBCTechnology.py b/sources/BBCTechnology.py
new file mode 100644
index 0000000..6d4137d
--- /dev/null
+++ b/sources/BBCTechnology.py
@@ -0,0 +1,23 @@
+import logging
+from datetime import datetime
+from grabber import Grabber
+
+
+class BBCTechnology(Grabber):
+    feed_url = "http://feeds.bbci.co.uk/news/technology/rss.xml"
+    date_format = "%a, %d %b %Y %H:%M:%S %Z"
+
+    def parse(self, feed):
+        articles = []
+        for article in feed.iter("item"):
+            try:
+                # Fri, 17 Jan 2020 19:09:40 GMT
+                timestamp = int(datetime.strptime(article.find("pubDate").text, self.date_format).timestamp())
+                title = article.find("title").text
+                description = article.find("description").text
+                articles.append((timestamp, title, description))
+            except AttributeError:
+                logging.error(f"Received non-parsable news article from {self.name}.")
+        return articles
+
+
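Usage note for this patch: with the new `Grabber` base class, adding a further RSS source only requires a `feed_url`, a `date_format` and a `parse()` implementation; table creation, persistence, de-duplication and error handling are inherited. A minimal sketch of such a source follows, where the class name and feed URL are purely illustrative and the feed is assumed to use the same RSS 2.0 `pubDate` format as the BBC feeds:

```python
# sources/ExampleFeed.py -- hypothetical source illustrating the pattern this
# patch introduces (feed_url + date_format + parse()); not part of the diff.
import logging
from datetime import datetime
from grabber import Grabber


class ExampleFeed(Grabber):
    feed_url = "http://example.com/rss.xml"        # illustrative URL only
    date_format = "%a, %d %b %Y %H:%M:%S %Z"       # e.g. Fri, 17 Jan 2020 19:09:40 GMT

    def parse(self, feed):
        # feed is the parsed RSS root element handed over by Grabber.grab()
        articles = []
        for article in feed.iter("item"):
            try:
                timestamp = int(datetime.strptime(article.find("pubDate").text, self.date_format).timestamp())
                title = article.find("title").text
                description = article.find("description").text
                articles.append((timestamp, title, description))
            except AttributeError:
                logging.error(f"Received non-parsable news article from {self.name}.")
        return articles
```

The new class would then be imported in `run.py` and appended to the `grabbers` list alongside `ArsTechnica`, `BBCBusiness` and `BBCTechnology`.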