Big refactor.

Author: Jack Hadrill
Date: 2020-01-18 18:36:33 +00:00
parent cd876e3a20
commit 285c0a3fc0
8 changed files with 193 additions and 82 deletions

.gitignore
@@ -90,6 +90,7 @@ celerybeat-schedule
 # Environments
 .env
 .venv
+.idea
 env/
 venv/
 ENV/

Pipfile
@@ -6,6 +6,7 @@ verify_ssl = true
 [dev-packages]
 
 [packages]
+requests = "*"
 
 [requires]
 python_version = "3.7"

Pipfile.lock (generated, new file)
@@ -0,0 +1,57 @@
+{
+    "_meta": {
+        "hash": {
+            "sha256": "bb57e0d7853b45999e47c163c46b95bc2fde31c527d8d7b5b5539dc979444a6d"
+        },
+        "pipfile-spec": 6,
+        "requires": {
+            "python_version": "3.7"
+        },
+        "sources": [
+            {
+                "name": "pypi",
+                "url": "https://pypi.org/simple",
+                "verify_ssl": true
+            }
+        ]
+    },
+    "default": {
+        "certifi": {
+            "hashes": [
+                "sha256:017c25db2a153ce562900032d5bc68e9f191e44e9a0f762f373977de9df1fbb3",
+                "sha256:25b64c7da4cd7479594d035c08c2d809eb4aab3a26e5a990ea98cc450c320f1f"
+            ],
+            "version": "==2019.11.28"
+        },
+        "chardet": {
+            "hashes": [
+                "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae",
+                "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"
+            ],
+            "version": "==3.0.4"
+        },
+        "idna": {
+            "hashes": [
+                "sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407",
+                "sha256:ea8b7f6188e6fa117537c3df7da9fc686d485087abf6ac197f9c46432f7e4a3c"
+            ],
+            "version": "==2.8"
+        },
+        "requests": {
+            "hashes": [
+                "sha256:11e007a8a2aa0323f5a921e9e6a2d7e4e67d9877e85773fba9ba6419025cbeb4",
+                "sha256:9cf5292fcd0f598c671cfc1e0d7d1a7f13bb8085e9a590f48c010551dc6c4b31"
+            ],
+            "index": "pypi",
+            "version": "==2.22.0"
+        },
+        "urllib3": {
+            "hashes": [
+                "sha256:a8a318824cc77d1fd4b2bec2ded92646630d7fe8619497b142c84a9e6f5a7293",
+                "sha256:f3c5fd51747d450d4dcf6f923c81f78f811aab8205fda64b0aba34a4e48b0745"
+            ],
+            "version": "==1.25.7"
+        }
+    },
+    "develop": {}
+}

grabber.py
@@ -1,17 +1,15 @@
 import logging
+import sqlite3
 import requests
-from abc import *
-from threading import Timer
+import xml.etree.ElementTree as et
+from abc import abstractmethod, ABC
+from requests.exceptions import HTTPError
 
 
 class Grabber(ABC):
+    articles = []
     db = None
-    _interval = 60
-    _running = False
-
-    def __init__(self, db, interval):
-        self.db = db
-        self._interval = interval
+    name = ""
 
     @property
     @abstractmethod
@@ -19,29 +17,65 @@ class Grabber(ABC):
         raise NotImplementedError
 
     @abstractmethod
-    def grab(self):
+    def parse(self, feed):
         raise NotImplementedError
 
+    def __init__(self, db):
+        self.db = db
+        self.name = self.__class__.__name__
+        self.setup_tables()
+        self.articles = self.restore()
+
+    def setup_tables(self):
+        try:
+            cur = self.db.cursor()
+            cur.execute(f"CREATE TABLE IF NOT EXISTS {self.name} "
+                        f"(id INTEGER PRIMARY KEY AUTOINCREMENT, timestamp TIMESTAMP, title VARCHAR, description VARCHAR)")
+            self.db.commit()
+        except sqlite3.Error:
+            logging.error(f"Could not create table in database for {self.name}.")
+
+    def store(self, articles):
+        try:
+            cur = self.db.cursor()
+            cur.executemany(f"INSERT INTO {self.name} (timestamp, title, description) "
+                            f"VALUES (?,?,?)", articles)
+            self.db.commit()
+        except sqlite3.Error:
+            logging.error(f"Could not store updated news articles from {self.name}.")
+
+    def restore(self):
+        articles = []
+        try:
+            cur = self.db.cursor()
+            cur.execute(f"SELECT timestamp, title, description from {self.name}")
+            articles = cur.fetchall()
+            logging.info(f"Restored {len(articles)} news articles from database for {self.name}.")
+        except sqlite3.Error:
+            logging.error(f"Could not restore news articles from database for {self.name}.")
+        finally:
+            return articles
+
     def request(self):
         response = requests.get(self.feed_url)
         response.raise_for_status()
         return response.content
 
-    def timer(self):
-        if self._running:
-            self.grab()
-            Timer(self._interval, self.timer).start()
+    def process(self, articles, new_articles):
+        delta_articles = [article for article in new_articles if article not in articles]
+        if delta_articles:
+            logging.info(f"Received {len(delta_articles)} new news articles from {self.name}.")
+        return delta_articles
 
-    def start(self):
-        if not self._running:
-            self._running = True
-            self.timer()
-        else:
-            logging.error(f"Grabber for {self.__class__.__name__} already started.")
-
-    def stop(self):
-        if self._running:
-            self._running = False
-        else:
-            logging.error(f"Grabber for {self.__class__.__name__} already stopped.")
+    def grab(self):
+        try:
+            feed = et.fromstring(self.request())
+            new_articles = self.parse(feed)
+            delta_articles = self.process(self.articles, new_articles)
+            self.store(delta_articles)
+            self.articles = new_articles
+        except HTTPError:
+            logging.error(f"Unable to download updated news articles from {self.name}.")
+        except (et.ParseError, ValueError):
+            logging.error(f"Unable to parse updated news articles from {self.name}.")

run.py
@@ -1,36 +1,38 @@
 import logging
 import sqlite3
+import time
 from pathlib import Path
 from sources.ArsTechnica import ArsTechnica
 from sources.BBCBusiness import BBCBusiness
+from sources.BBCTechnology import BBCTechnology
 
 DATABASE_PATH = Path("storage.db")
-SCRAPE_INTERVAL = 15
-
-
-def configure_logging():
-    logging.basicConfig(
-        level=logging.INFO,
-        format="%(asctime)s %(levelname)-8s %(message)s",
-        datefmt="%Y-%m-%d %H:%M"
-    )
-
-
-def setup_database():
-    db = sqlite3.connect(DATABASE_PATH)
-    return db
+GRAB_FREQUENCY = 15
+GRAB_INTERVAL = 5
 
 
 def main():
-    db = setup_database()
-    grabbers = [
-        ArsTechnica(db, SCRAPE_INTERVAL),
-        BBCBusiness(db, SCRAPE_INTERVAL)
-    ]
-
-    for grabber in grabbers:
-        grabber.start()
+    try:
+        db = sqlite3.connect(DATABASE_PATH)
+        if not db:
+            raise sqlite3.DatabaseError
+
+        grabbers = [
+            ArsTechnica(db),
+            BBCBusiness(db),
+            BBCTechnology(db),
+        ]
+
+        while True:
+            for grabber in grabbers:
+                grabber.grab()
+                time.sleep(GRAB_FREQUENCY/GRAB_INTERVAL)
+    except sqlite3.Error:
+        logging.error("Could not connect to database.")
+        exit(-1)
 
 
 if __name__ == "__main__":
-    configure_logging()
+    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)-8s %(message)s", datefmt="%Y-%m-%d %H:%M:%S")
     main()

sources/ArsTechnica.py
@@ -1,10 +1,21 @@
+import logging
+from datetime import datetime
 from grabber import Grabber
 
 
 class ArsTechnica(Grabber):
     feed_url = "http://feeds.arstechnica.com/arstechnica/index"
+    date_format = "%a, %d %b %Y %H:%M:%S %z"
 
-    def grab(self):
-        pass
+    def parse(self, feed):
+        articles = []
+        for article in feed.iter("item"):
+            try:
+                # Sat, 18 Jan 2020 15:41:56 +0000
+                timestamp = int(datetime.strptime(article.find("pubDate").text, self.date_format).timestamp())
+                title = article.find("title").text
+                description = article.find("description").text
+                articles.append((timestamp, title, description))
+            except AttributeError:
+                logging.error(f"Received non-parsable news article from {self.name}.")
+        return articles

sources/BBCBusiness.py
@@ -1,39 +1,21 @@
-import csv
 import logging
-import xml.etree.ElementTree as et
 from datetime import datetime
 from grabber import Grabber
-from requests.exceptions import HTTPError
 
 
 class BBCBusiness(Grabber):
-    articles = []
     feed_url = "http://feeds.bbci.co.uk/news/business/rss.xml"
+    date_format = "%a, %d %b %Y %H:%M:%S %Z"
 
-    def grab(self):
-        try:
-            feed = et.fromstring(self.request())
-            self.process(feed)
-        except (HTTPError, et.ParseError):
-            logging.error(f"Unable to get updated news from {self.__class__.__name__}.")
-
-    def process(self, feed):
-        for item in feed.iter("item"):
-            article = self.parse(item)
-            if article not in self.articles:
-                self.articles.append(article)
-                logging.info(f"New article from {self.__class__.__name__} at {datetime.fromtimestamp(article[0])}")
-
-    def parse(self, item):
-        article = None
-        try:
-            date = item.find("pubDate").text
-            # Fri, 17 Jan 2020 19:09:40 GMT
-            timestamp = int(datetime.strptime(date, "%a, %d %b %Y %H:%M:%S %Z").timestamp())
-            title = item.find("title").text
-            description = item.find("description").text
-            article = (timestamp, title, description)
-        except AttributeError:
-            logging.error(f"Received non-parsable news article from {self.__class__.__name__}.")
-        finally:
-            return article
+    def parse(self, feed):
+        articles = []
+        for article in feed.iter("item"):
+            try:
+                # Fri, 17 Jan 2020 19:09:40 GMT
+                timestamp = int(datetime.strptime(article.find("pubDate").text, self.date_format).timestamp())
+                title = article.find("title").text
+                description = article.find("description").text
+                articles.append((timestamp, title, description))
+            except AttributeError:
+                logging.error(f"Received non-parsable news article from {self.name}.")
+        return articles

sources/BBCTechnology.py (new file)
@@ -0,0 +1,23 @@
+import logging
+from datetime import datetime
+from grabber import Grabber
+
+
+class BBCTechnology(Grabber):
+    feed_url = "http://feeds.bbci.co.uk/news/technology/rss.xml"
+    date_format = "%a, %d %b %Y %H:%M:%S %Z"
+
+    def parse(self, feed):
+        articles = []
+        for article in feed.iter("item"):
+            try:
+                # Fri, 17 Jan 2020 19:09:40 GMT
+                timestamp = int(datetime.strptime(article.find("pubDate").text, self.date_format).timestamp())
+                title = article.find("title").text
+                description = article.find("description").text
+                articles.append((timestamp, title, description))
+            except AttributeError:
+                logging.error(f"Received non-parsable news article from {self.name}.")
+        return articles
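
For orientation, a minimal sketch of how the refactored classes in this commit fit together: a source subclass only supplies feed_url, date_format and parse(), while the Grabber base class owns the SQLite table, de-duplication and fetching, and run.py simply calls grab() in a loop. The in-memory database below is an illustrative assumption (the commit itself uses storage.db), not part of the change.

import sqlite3
from sources.BBCTechnology import BBCTechnology

db = sqlite3.connect(":memory:")   # stand-in for DATABASE_PATH, assumption for this sketch
grabber = BBCTechnology(db)        # __init__ creates the table and restores stored articles
grabber.grab()                     # request the feed, parse(), store only new articles
print(len(grabber.articles))       # articles now mirrors the latest parsed feed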