Big refactor.
parent cd876e3a20
commit 285c0a3fc0
.gitignore
@@ -90,6 +90,7 @@ celerybeat-schedule
# Environments
.env
.venv
.idea
env/
venv/
ENV/
1 Pipfile
@@ -6,6 +6,7 @@ verify_ssl = true
[dev-packages]

[packages]
requests = "*"

[requires]
python_version = "3.7"
57 Pipfile.lock
@@ -0,0 +1,57 @@
{
    "_meta": {
        "hash": {
            "sha256": "bb57e0d7853b45999e47c163c46b95bc2fde31c527d8d7b5b5539dc979444a6d"
        },
        "pipfile-spec": 6,
        "requires": {
            "python_version": "3.7"
        },
        "sources": [
            {
                "name": "pypi",
                "url": "https://pypi.org/simple",
                "verify_ssl": true
            }
        ]
    },
    "default": {
        "certifi": {
            "hashes": [
                "sha256:017c25db2a153ce562900032d5bc68e9f191e44e9a0f762f373977de9df1fbb3",
                "sha256:25b64c7da4cd7479594d035c08c2d809eb4aab3a26e5a990ea98cc450c320f1f"
            ],
            "version": "==2019.11.28"
        },
        "chardet": {
            "hashes": [
                "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae",
                "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"
            ],
            "version": "==3.0.4"
        },
        "idna": {
            "hashes": [
                "sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407",
                "sha256:ea8b7f6188e6fa117537c3df7da9fc686d485087abf6ac197f9c46432f7e4a3c"
            ],
            "version": "==2.8"
        },
        "requests": {
            "hashes": [
                "sha256:11e007a8a2aa0323f5a921e9e6a2d7e4e67d9877e85773fba9ba6419025cbeb4",
                "sha256:9cf5292fcd0f598c671cfc1e0d7d1a7f13bb8085e9a590f48c010551dc6c4b31"
            ],
            "index": "pypi",
            "version": "==2.22.0"
        },
        "urllib3": {
            "hashes": [
                "sha256:a8a318824cc77d1fd4b2bec2ded92646630d7fe8619497b142c84a9e6f5a7293",
                "sha256:f3c5fd51747d450d4dcf6f923c81f78f811aab8205fda64b0aba34a4e48b0745"
            ],
            "version": "==1.25.7"
        }
    },
    "develop": {}
}
84 grabber.py
@@ -1,17 +1,15 @@
import logging
import sqlite3
import requests
from abc import *
from threading import Timer
import xml.etree.ElementTree as et
from abc import abstractmethod, ABC
from requests.exceptions import HTTPError


class Grabber(ABC):
    articles = []
    db = None
    _interval = 60
    _running = False

    def __init__(self, db, interval):
        self.db = db
        self._interval = interval
    name = ""

    @property
    @abstractmethod
@@ -19,29 +17,65 @@ class Grabber(ABC):
        raise NotImplementedError

    @abstractmethod
    def grab(self):
    def parse(self, feed):
        raise NotImplementedError

    def __init__(self, db):
        self.db = db
        self.name = self.__class__.__name__
        self.setup_tables()
        self.articles = self.restore()

    def setup_tables(self):
        try:
            cur = self.db.cursor()
            cur.execute(f"CREATE TABLE IF NOT EXISTS {self.name} "
                        f"(id INTEGER PRIMARY KEY AUTOINCREMENT, timestamp TIMESTAMP, title VARCHAR, description VARCHAR)")
            self.db.commit()
        except sqlite3.Error:
            logging.error(f"Could not create table in database for {self.name}.")

    def store(self, articles):
        try:
            cur = self.db.cursor()
            cur.executemany(f"INSERT INTO {self.name} (timestamp, title, description) "
                            f"VALUES (?,?,?)", articles)
            self.db.commit()
        except sqlite3.Error:
            logging.error(f"Could not store updated news articles from {self.name}.")

    def restore(self):
        articles = []
        try:
            cur = self.db.cursor()
            cur.execute(f"SELECT timestamp, title, description from {self.name}")
            articles = cur.fetchall()
            logging.info(f"Restored {len(articles)} news articles from database for {self.name}.")
        except sqlite3.Error:
            logging.error(f"Could not restore news articles from database for {self.name}.")
        finally:
            return articles

    def request(self):
        response = requests.get(self.feed_url)
        response.raise_for_status()
        return response.content

    def timer(self):
        if self._running:
            self.grab()
            Timer(self._interval, self.timer).start()
    def process(self, articles, new_articles):
        delta_articles = [article for article in new_articles if article not in articles]
        if delta_articles:
            logging.info(f"Received {len(delta_articles)} new news articles from {self.name}.")
        return delta_articles

    def start(self):
        if not self._running:
            self._running = True
            self.timer()
        else:
            logging.error(f"Grabber for {self.__class__.__name__} already started.")

    def stop(self):
        if self._running:
            self._running = False
        else:
            logging.error(f"Grabber for {self.__class__.__name__} already stopped.")
    def grab(self):
        try:
            feed = et.fromstring(self.request())
            new_articles = self.parse(feed)
            delta_articles = self.process(self.articles, new_articles)
            self.store(delta_articles)
            self.articles = new_articles
        except HTTPError:
            logging.error(f"Unable to download updated news articles from {self.name}.")
        except (et.ParseError, ValueError):
            logging.error(f"Unable to parse updated news articles from {self.name}.")
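With this refactor the base Grabber owns requesting, de-duplication and SQLite storage, so a concrete source only has to supply feed_url, date_format and a parse() implementation. A rough sketch of what a hypothetical new source would look like (ExampleFeed and its URL are illustrative assumptions, not part of this commit):

# sketch only: ExampleFeed and its feed URL are made up for illustration
import logging
from datetime import datetime
from grabber import Grabber


class ExampleFeed(Grabber):
    feed_url = "https://example.com/rss.xml"            # assumed URL, not a real feed
    date_format = "%a, %d %b %Y %H:%M:%S %Z"

    def parse(self, feed):
        articles = []
        for article in feed.iter("item"):
            try:
                timestamp = int(datetime.strptime(article.find("pubDate").text, self.date_format).timestamp())
                articles.append((timestamp, article.find("title").text, article.find("description").text))
            except AttributeError:
                logging.error(f"Received non-parsable news article from {self.name}.")
        return articles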
46 run.py
@@ -1,36 +1,38 @@
import logging
import sqlite3
import time
from pathlib import Path
from sources.ArsTechnica import ArsTechnica
from sources.BBCBusiness import BBCBusiness
from sources.BBCTechnology import BBCTechnology

DATABASE_PATH = Path("storage.db")
SCRAPE_INTERVAL = 15


def configure_logging():
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)-8s %(message)s",
        datefmt="%Y-%m-%d %H:%M"
    )


def setup_database():
    db = sqlite3.connect(DATABASE_PATH)
    return db
GRAB_FREQUENCY = 15
GRAB_INTERVAL = 5


def main():
    db = setup_database()
    grabbers = [
        ArsTechnica(db, SCRAPE_INTERVAL),
        BBCBusiness(db, SCRAPE_INTERVAL)
    ]
    for grabber in grabbers:
        grabber.start()
    try:
        db = sqlite3.connect(DATABASE_PATH)
        if not db:
            raise sqlite3.DatabaseError

        grabbers = [
            ArsTechnica(db),
            BBCBusiness(db),
            BBCTechnology(db),
        ]

        while True:
            for grabber in grabbers:
                grabber.grab()
                time.sleep(GRAB_FREQUENCY/GRAB_INTERVAL)

    except sqlite3.Error:
        logging.error("Could not connect to database.")
        exit(-1)


if __name__ == "__main__":
    configure_logging()
    logging.basicConfig(level=logging.INFO,format="%(asctime)s %(levelname)-8s %(message)s",datefmt="%Y-%m-%d %H:%M:%S")
    main()
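Because the grabbers no longer run their own timers, a single source can also be driven by hand, e.g. for a quick check. A minimal sketch, assuming an in-memory SQLite database instead of storage.db (illustrative only, not part of the commit):

# illustrative only: exercise one grabber against a throwaway database
import sqlite3
from sources.ArsTechnica import ArsTechnica

db = sqlite3.connect(":memory:")  # throwaway database instead of storage.db
grabber = ArsTechnica(db)         # __init__ creates the table and restores any stored articles
grabber.grab()                    # one request/parse/store cycle
print(f"{len(grabber.articles)} articles held in memory")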
sources/ArsTechnica.py
@@ -1,10 +1,21 @@
import logging
from datetime import datetime
from grabber import Grabber


class ArsTechnica(Grabber):
    feed_url = "http://feeds.arstechnica.com/arstechnica/index"
    date_format = "%a, %d %b %Y %H:%M:%S %z"

    def grab(self):
        pass


    def parse(self, feed):
        articles = []
        for article in feed.iter("item"):
            try:
                # Sat, 18 Jan 2020 15:41:56 +0000
                timestamp = int(datetime.strptime(article.find("pubDate").text, self.date_format).timestamp())
                title = article.find("title").text
                description = article.find("description").text
                articles.append((timestamp, title, description))
            except AttributeError:
                logging.error(f"Received non-parsable news article from {self.name}.")
        return articles
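The main difference between the sources is the pubDate format: the Ars Technica feed carries a numeric UTC offset (%z), while the BBC feeds below carry a timezone name (%Z). A small sketch of how the two format strings behave with the sample dates from the comments (worth noting that %Z leaves the parsed datetime naive):

# sketch: the two pubDate formats against the sample dates quoted in the parsers
from datetime import datetime

ars = datetime.strptime("Sat, 18 Jan 2020 15:41:56 +0000", "%a, %d %b %Y %H:%M:%S %z")
bbc = datetime.strptime("Fri, 17 Jan 2020 19:09:40 GMT", "%a, %d %b %Y %H:%M:%S %Z")

print(ars.tzinfo)  # UTC: %z keeps the offset, so .timestamp() is unambiguous
print(bbc.tzinfo)  # None: %Z only matches the name, so .timestamp() assumes local time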
sources/BBCBusiness.py
@@ -1,39 +1,21 @@
import csv
import logging
import xml.etree.ElementTree as et
from datetime import datetime
from grabber import Grabber
from requests.exceptions import HTTPError


class BBCBusiness(Grabber):
    articles = []
    feed_url = "http://feeds.bbci.co.uk/news/business/rss.xml"
    date_format = "%a, %d %b %Y %H:%M:%S %Z"

    def grab(self):
        try:
            feed = et.fromstring(self.request())
            self.process(feed)
        except (HTTPError, et.ParseError):
            logging.error(f"Unable to get updated news from {self.__class__.__name__}.")

    def process(self, feed):
        for item in feed.iter("item"):
            article = self.parse(item)
            if article not in self.articles:
                self.articles.append(article)
                logging.info(f"New article from {self.__class__.__name__} at {datetime.fromtimestamp(article[0])}")

    def parse(self, item):
        article = None
        try:
            date = item.find("pubDate").text
            # Fri, 17 Jan 2020 19:09:40 GMT
            timestamp = int(datetime.strptime(date, "%a, %d %b %Y %H:%M:%S %Z").timestamp())
            title = item.find("title").text
            description = item.find("description").text
            article = (timestamp, title, description)
        except AttributeError:
            logging.error(f"Received non-parsable news article from {self.__class__.__name__}.")
        finally:
            return article
    def parse(self, feed):
        articles = []
        for article in feed.iter("item"):
            try:
                # Fri, 17 Jan 2020 19:09:40 GMT
                timestamp = int(datetime.strptime(article.find("pubDate").text, self.date_format).timestamp())
                title = article.find("title").text
                description = article.find("description").text
                articles.append((timestamp, title, description))
            except AttributeError:
                logging.error(f"Received non-parsable news article from {self.name}.")
        return articles
sources/BBCTechnology.py
@@ -0,0 +1,23 @@
import logging
from datetime import datetime
from grabber import Grabber


class BBCTechnology(Grabber):
    feed_url = "http://feeds.bbci.co.uk/news/technology/rss.xml"
    date_format = "%a, %d %b %Y %H:%M:%S %Z"

    def parse(self, feed):
        articles = []
        for article in feed.iter("item"):
            try:
                # Fri, 17 Jan 2020 19:09:40 GMT
                timestamp = int(datetime.strptime(article.find("pubDate").text, self.date_format).timestamp())
                title = article.find("title").text
                description = article.find("description").text
                articles.append((timestamp, title, description))
            except AttributeError:
                logging.error(f"Received non-parsable news article from {self.name}.")
        return articles