"""Periodic RSS news grabber.

This module gathers the Python code that the patch spreads over
``grabber.py``, ``run.py``, ``sources/ArsTechnica.py`` and
``sources/BBCBusiness.py``.  Because everything lives in one module here, the
intra-project imports (``from grabber import Grabber``,
``from sources.<Name> import <Name>``) are not needed.

The patch's ``Pipfile`` pins ``python_version = "3.7"``.
NOTE(review): the code imports ``requests``, but the ``[packages]`` section of
that ``Pipfile`` is empty -- the dependency should be declared there.
"""
import csv  # noqa: F401 -- unused, kept because sources/BBCBusiness.py imported it
import logging
import sqlite3
import xml.etree.ElementTree as et
from abc import ABC, abstractmethod
from datetime import datetime, timezone
from email.utils import parsedate_to_datetime
from pathlib import Path
from threading import Timer
from typing import List, Optional, Set, Tuple

# (timestamp, title, description) as produced by ``BBCBusiness.parse``.
Article = Tuple[int, Optional[str], Optional[str]]


# --------------------------------------------------------------------------
# grabber.py
# --------------------------------------------------------------------------
class Grabber(ABC):
    """Base class for a feed poller that calls :meth:`grab` periodically.

    Subclasses supply ``feed_url`` (a plain class attribute is enough to
    satisfy the abstract property) and implement :meth:`grab`.
    """

    db = None
    # Seconds to wait for the feed server; ``requests`` applies no timeout
    # unless one is passed explicitly.
    request_timeout = 30
    _interval = 60
    _running = False
    # The ``threading.Timer`` armed for the next poll, or None.
    _timer = None

    def __init__(self, db, interval):
        """Store the database handle and the polling interval (seconds)."""
        self.db = db
        self._interval = interval

    @property
    @abstractmethod
    def feed_url(self):
        """URL of the feed to download."""
        raise NotImplementedError

    @abstractmethod
    def grab(self):
        """Fetch and handle the feed once; called on every poll."""
        raise NotImplementedError

    def request(self) -> bytes:
        """Download ``feed_url`` and return the raw response body.

        Raises:
            requests.exceptions.RequestException: on connection problems,
                timeouts, or (via ``raise_for_status``) an HTTP error status.
        """
        # Third-party dependency, imported here so that this module can be
        # imported (and subclasses exercised) without the HTTP stack.
        import requests

        response = requests.get(self.feed_url, timeout=self.request_timeout)
        response.raise_for_status()
        return response.content

    def timer(self) -> None:
        """Run one poll and arm the timer for the next one.

        Does nothing once the grabber has been stopped.
        """
        if not self._running:
            return
        try:
            self.grab()
        except Exception:
            # This is the top of the polling thread: an escaping exception
            # would end the chain for good, so log it and keep polling.
            logging.exception(
                f"Unhandled error while grabbing {self.__class__.__name__}."
            )
        if self._running:  # grab() itself may have stopped the grabber
            # Left as a regular (non-daemon) thread on purpose: main()
            # returns right after start(), so these timer threads are what
            # keep the process running.
            self._timer = Timer(self._interval, self.timer)
            self._timer.start()

    def start(self) -> None:
        """Begin polling: one poll immediately, then one every interval."""
        if self._running:
            logging.error(f"Grabber for {self.__class__.__name__} already started.")
            return
        self._running = True
        self.timer()

    def stop(self) -> None:
        """Stop polling and cancel the poll that is currently scheduled."""
        if not self._running:
            logging.error(f"Grabber for {self.__class__.__name__} already stopped.")
            return
        self._running = False
        # Cancel the pending timer; otherwise a quick stop()/start() would
        # leave the old timer alive next to the new one and poll twice.
        pending, self._timer = self._timer, None
        if pending is not None:
            pending.cancel()


# --------------------------------------------------------------------------
# sources/ArsTechnica.py
# --------------------------------------------------------------------------
class ArsTechnica(Grabber):
    """Grabber for the Ars Technica feed; ``grab`` is still a no-op."""

    feed_url = "http://feeds.arstechnica.com/arstechnica/index"

    def grab(self):
        """Do nothing (placeholder)."""
        pass


# --------------------------------------------------------------------------
# sources/BBCBusiness.py
# --------------------------------------------------------------------------
class BBCBusiness(Grabber):
    """Grabber for the BBC Business RSS feed.

    Newly seen articles are appended to ``articles`` as
    ``(timestamp, title, description)`` tuples and announced in the log.
    """

    feed_url = "http://feeds.bbci.co.uk/news/business/rss.xml"

    def __init__(self, db, interval):
        super().__init__(db, interval)
        # Per-instance state; a class-level list would be shared by every
        # instance of the class.
        self.articles: List[Article] = []
        # Same content as ``articles``, kept for O(1) duplicate checks.
        self._seen: Set[Article] = set()

    def grab(self):
        """Download the feed and process it; log and give up on failure."""
        from requests.exceptions import RequestException

        try:
            feed = et.fromstring(self.request())
        except (RequestException, et.ParseError):
            # RequestException is the base of HTTPError and also covers
            # connection errors and timeouts.
            logging.error(f"Unable to get updated news from {self.__class__.__name__}.")
            return
        self.process(feed)

    def process(self, feed):
        """Record every ``<item>`` of ``feed`` that has not been seen yet."""
        for item in feed.iter("item"):
            article = self.parse(item)
            # parse() returns None for items it cannot read; skip those
            # instead of storing None and failing on ``article[0]``.
            if article is None or article in self._seen:
                continue
            self._seen.add(article)
            self.articles.append(article)
            logging.info(f"New article from {self.__class__.__name__} at {datetime.fromtimestamp(article[0])}")

    def parse(self, item) -> Optional[Article]:
        """Turn one ``<item>`` element into an article tuple.

        Returns:
            ``(timestamp, title, description)`` where ``timestamp`` is the
            publication time in whole seconds since the epoch, or ``None``
            (after logging an error) when a field is missing or the date
            cannot be parsed.
        """
        try:
            # e.g. "Fri, 17 Jan 2020 19:09:40 GMT"
            published = parsedate_to_datetime(item.find("pubDate").text)
            title = item.find("title").text
            description = item.find("description").text
        except (AttributeError, TypeError, ValueError):
            # AttributeError: element missing (find() returned None).
            # TypeError / ValueError: empty or malformed date; which of the
            # two is raised depends on the Python version.
            logging.error(f"Received non-parsable news article from {self.__class__.__name__}.")
            return None
        if published.tzinfo is None:
            # No usable zone in the date: treat it as UTC rather than letting
            # timestamp() interpret it in the machine's local time zone.
            published = published.replace(tzinfo=timezone.utc)
        return (int(published.timestamp()), title, description)


# --------------------------------------------------------------------------
# run.py
# --------------------------------------------------------------------------
DATABASE_PATH = Path("storage.db")
SCRAPE_INTERVAL = 15  # seconds between polls (handed to threading.Timer)


def configure_logging():
    """Configure the root logger: INFO level, timestamped one-line records."""
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)-8s %(message)s",
        datefmt="%Y-%m-%d %H:%M",
    )


def setup_database():
    """Open (creating it if needed) the SQLite database at ``DATABASE_PATH``.

    NOTE(review): the connection is created on the calling thread while the
    grabbers run ``grab()`` on timer threads; with sqlite3's default
    ``check_same_thread=True`` it cannot be used from those threads. Nothing
    uses ``db`` yet -- revisit before it does.
    """
    db = sqlite3.connect(DATABASE_PATH)
    return db


def main():
    """Open the database and start one grabber per configured source."""
    db = setup_database()
    grabbers = [
        ArsTechnica(db, SCRAPE_INTERVAL),
        BBCBusiness(db, SCRAPE_INTERVAL),
    ]
    for grabber in grabbers:
        grabber.start()


if __name__ == "__main__":
    configure_logging()
    main()


# --------------------------------------------------------------------------
# sources/sources.txt -- feed list shipped with the patch
# --------------------------------------------------------------------------
# http://feeds.arstechnica.com/arstechnica/index
# http://feeds.bbci.co.uk/news/business/rss.xml
# http://feeds.bbci.co.uk/news/technology/rss.xml
# http://feeds.reuters.com/reuters/technologyNews?format=xml
# http://rss.nytimes.com/services/xml/rss/nyt/Technology.xml
# http://www.forbes.com/technology/feed/
# http://www.ft.com/rss/home/us
# http://www.macworld.com/index.rss
# http://www.wired.com/feed
# https://www.engadget.com/rss.xml
# https://www.huffpost.com/section/technology/feed