First attempt.

Jack Hadrill 2020-01-18 15:34:40 +00:00
parent 55dd70e727
commit cd876e3a20
7 changed files with 154 additions and 0 deletions

Pipfile (new file, +11 lines)

@@ -0,0 +1,11 @@
[[source]]
name = "pypi"
url = "https://pypi.org/simple"
verify_ssl = true

[dev-packages]

[packages]
requests = "*"

[requires]
python_version = "3.7"
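With this Pipfile, the standard pipenv workflow applies: pipenv install creates a Python 3.7 virtualenv and resolves [packages], and pipenv run python run.py starts the scraper added below.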

grabber.py (new file, +47 lines)

@@ -0,0 +1,47 @@
import logging
import requests
from abc import ABC, abstractmethod
from threading import Timer


class Grabber(ABC):
    db = None
    _interval = 60
    _running = False

    def __init__(self, db, interval):
        self.db = db
        self._interval = interval

    @property
    @abstractmethod
    def feed_url(self):
        # Subclasses may satisfy this with a plain class attribute.
        raise NotImplementedError

    @abstractmethod
    def grab(self):
        raise NotImplementedError

    def request(self):
        # Fetch the raw feed body, raising on a non-2xx response.
        response = requests.get(self.feed_url)
        response.raise_for_status()
        return response.content

    def timer(self):
        # Poll once, then re-schedule. stop() clears _running, so the
        # next pending tick falls through without re-scheduling.
        if self._running:
            self.grab()
            Timer(self._interval, self.timer).start()

    def start(self):
        if not self._running:
            self._running = True
            self.timer()
        else:
            logging.error(f"Grabber for {self.__class__.__name__} already started.")

    def stop(self):
        if self._running:
            self._running = False
        else:
            logging.error(f"Grabber for {self.__class__.__name__} already stopped.")

run.py (new file, +36 lines)

@@ -0,0 +1,36 @@
import logging
import sqlite3
from pathlib import Path

from sources.ArsTechnica import ArsTechnica
from sources.BBCBusiness import BBCBusiness

DATABASE_PATH = Path("storage.db")
SCRAPE_INTERVAL = 15


def configure_logging():
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)-8s %(message)s",
        datefmt="%Y-%m-%d %H:%M"
    )


def setup_database():
    # Note: the grabbers poll from Timer threads, and sqlite3
    # connections are thread-bound by default, so writes from grabbers
    # will need check_same_thread=False or per-thread connections.
    db = sqlite3.connect(DATABASE_PATH)
    return db


def main():
    db = setup_database()
    grabbers = [
        ArsTechnica(db, SCRAPE_INTERVAL),
        BBCBusiness(db, SCRAPE_INTERVAL)
    ]
    for grabber in grabbers:
        grabber.start()
    # The non-daemon Timer threads keep the process alive after main()
    # returns.


if __name__ == "__main__":
    configure_logging()
    main()
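setup_database only opens the connection; nothing in this commit creates a table for the (timestamp, title, description) tuples BBCBusiness builds. One possible shape for it, as a sketch (the articles table and its columns are assumptions, not part of this commit):

def setup_database():
    # check_same_thread=False because the grabbers poll from Timer
    # threads; concurrent writes would still need a lock or
    # per-thread connections.
    db = sqlite3.connect(DATABASE_PATH, check_same_thread=False)
    db.execute(
        """CREATE TABLE IF NOT EXISTS articles (
               timestamp   INTEGER,
               title       TEXT,
               description TEXT,
               UNIQUE (timestamp, title)
           )"""
    )
    db.commit()
    return db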

sources/ArsTechnica.py (new file, +10 lines)

@@ -0,0 +1,10 @@
from grabber import Grabber


class ArsTechnica(Grabber):
    feed_url = "http://feeds.arstechnica.com/arstechnica/index"

    def grab(self):
        # Not yet implemented.
        pass

sources/BBCBusiness.py (new file, +39 lines)

@@ -0,0 +1,39 @@
import logging
import xml.etree.ElementTree as et
from datetime import datetime, timezone

from grabber import Grabber
from requests.exceptions import HTTPError


class BBCBusiness(Grabber):
    feed_url = "http://feeds.bbci.co.uk/news/business/rss.xml"

    def __init__(self, db, interval):
        super().__init__(db, interval)
        # Per-instance cache of seen articles; a class-level list would
        # be shared by every grabber instance.
        self.articles = []

    def grab(self):
        try:
            feed = et.fromstring(self.request())
            self.process(feed)
        except (HTTPError, et.ParseError):
            logging.error(f"Unable to get updated news from {self.__class__.__name__}.")

    def process(self, feed):
        for item in feed.iter("item"):
            article = self.parse(item)
            if article is not None and article not in self.articles:
                self.articles.append(article)
                logging.info(f"New article from {self.__class__.__name__} at {datetime.fromtimestamp(article[0])}")

    def parse(self, item):
        try:
            # e.g. "Fri, 17 Jan 2020 19:09:40 GMT"
            date = item.find("pubDate").text
            # The feed's dates are GMT, but strptime's %Z yields a naive
            # datetime; attach UTC so timestamp() does not treat the
            # value as local time.
            parsed = datetime.strptime(date, "%a, %d %b %Y %H:%M:%S %Z")
            timestamp = int(parsed.replace(tzinfo=timezone.utc).timestamp())
            title = item.find("title").text
            description = item.find("description").text
            return timestamp, title, description
        except AttributeError:
            logging.error(f"Received non-parsable news article from {self.__class__.__name__}.")
            return None
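strptime's %Z only matches a handful of zone names and produces a naive datetime. The stdlib already has a parser for RFC 822 dates like the feed's pubDate; a sketch of an alternative (a suggestion, not what this commit does):

from email.utils import parsedate_to_datetime

date = "Fri, 17 Jan 2020 19:09:40 GMT"  # sample pubDate from the feed
# parsedate_to_datetime returns a timezone-aware datetime, so
# timestamp() is unambiguous whatever the local timezone is.
timestamp = int(parsedate_to_datetime(date).timestamp())
print(timestamp)  # 1579288180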

sources/sources.txt (new file, +11 lines)

@@ -0,0 +1,11 @@
# http://feeds.arstechnica.com/arstechnica/index
# http://feeds.bbci.co.uk/news/business/rss.xml
# http://feeds.bbci.co.uk/news/technology/rss.xml
# http://feeds.reuters.com/reuters/technologyNews?format=xml
# http://rss.nytimes.com/services/xml/rss/nyt/Technology.xml
# http://www.forbes.com/technology/feed/
# http://www.ft.com/rss/home/us
# http://www.macworld.com/index.rss
# http://www.wired.com/feed
# https://www.engadget.com/rss.xml
# https://www.huffpost.com/section/technology/feed

storage.db (new file, 0 lines)