2020-01-18 15:34:40 +00:00
|
|
|
import logging
|
2020-01-18 18:36:33 +00:00
|
|
|
import sqlite3
|
2020-01-18 15:34:40 +00:00
|
|
|
import requests
|
2020-01-18 18:36:33 +00:00
|
|
|
import xml.etree.ElementTree as et
|
|
|
|
from abc import abstractmethod, ABC
|
2020-01-18 19:47:27 +00:00
|
|
|
from datetime import datetime
|
2020-01-18 18:36:33 +00:00
|
|
|
from requests.exceptions import HTTPError
|
2020-01-18 15:34:40 +00:00
|
|
|
|
|
|
|
|
|
|
|
class Grabber(ABC):
    """Abstract base for feed grabbers that persist articles in a database.

    A concrete subclass supplies ``feed_url``. Each instance downloads that
    feed, extracts ``(timestamp, title, description)`` tuples from its
    ``item`` elements and stores the ones not seen in the previous run in a
    table named after the subclass.
    """

    # Class-level defaults; ``__init__`` rebinds ``articles``, ``db`` and
    # ``name`` on every instance, so the shared list below is never mutated.
    articles = []
    # Example of a matching value: "Sat, 18 Jan 2020 14:21:49 -0500"
    date_format = "%a, %d %b %Y %H:%M:%S %z"
    db = None
    name = ""

    @property
    @abstractmethod
    def feed_url(self):
        """URL of the feed to download; must be provided by subclasses."""
        raise NotImplementedError

    def __init__(self, db):
        """Bind the grabber to ``db`` and load previously stored articles.

        Args:
            db: Database connection exposing ``cursor()`` and ``commit()``
                (errors are handled as ``sqlite3.Error``, so presumably a
                ``sqlite3`` connection).
        """
        self.db = db
        # The class name doubles as the table name and as the log label.
        self.name = self.__class__.__name__
        self.setup_tables()
        self.articles = self.restore()

    def setup_tables(self):
        """Create this grabber's table if it does not exist yet.

        Database errors are logged, not raised.
        """
        try:
            cur = self.db.cursor()
            # The table name is the class name (not external input), hence
            # the f-string; SQL placeholders cannot be used for identifiers.
            cur.execute(f"CREATE TABLE IF NOT EXISTS {self.name} "
                        f"(id INTEGER PRIMARY KEY AUTOINCREMENT, timestamp TIMESTAMP, title VARCHAR, description VARCHAR)")
            self.db.commit()
        except sqlite3.Error:
            logging.error(f"Could not create table in database for {self.name}.")

    def store(self, articles):
        """Insert ``articles`` into this grabber's table.

        Args:
            articles: Iterable of ``(timestamp, title, description)`` tuples.

        Database errors are logged, not raised.
        """
        try:
            cur = self.db.cursor()
            cur.executemany(f"INSERT INTO {self.name} (timestamp, title, description) "
                            f"VALUES (?,?,?)", articles)
            self.db.commit()
        except sqlite3.Error:
            logging.error(f"Could not store updated news articles from {self.name}.")

    def restore(self):
        """Load all stored articles for this grabber.

        Returns:
            A list of ``(timestamp, title, description)`` rows, or an empty
            list when the database query fails (the failure is logged).
        """
        articles = []
        try:
            cur = self.db.cursor()
            cur.execute(f"SELECT timestamp, title, description from {self.name}")
            articles = cur.fetchall()
            logging.info(f"Restored {len(articles)} news articles from database for {self.name}.")
        except sqlite3.Error:
            logging.error(f"Could not restore news articles from database for {self.name}.")
        # Returning here rather than from a ``finally`` clause: a ``return``
        # inside ``finally`` would silently discard any exception that is not
        # a ``sqlite3.Error`` (including KeyboardInterrupt).
        return articles

    def request(self, timeout=30):
        """Download the raw feed.

        Args:
            timeout: Seconds to wait for the server before giving up.
                Without a timeout a stalled server would block forever.

        Returns:
            The response body as bytes.

        Raises:
            requests.exceptions.RequestException: On connection problems,
                timeouts, or a non-success HTTP status (``HTTPError``).
        """
        response = requests.get(
            self.feed_url,
            headers={"User-Agent": "JackNet"},
            timeout=timeout,
        )
        response.raise_for_status()
        return response.content

    def parse(self, feed):
        """Extract articles from a parsed feed.

        Args:
            feed: ElementTree element whose descendant ``item`` elements
                carry ``pubDate``, ``title`` and ``description`` children.

        Returns:
            A list of ``(timestamp, title, description)`` tuples, where
            ``timestamp`` is the publication date as integer epoch seconds.
            Items that cannot be parsed are logged and skipped.
        """
        articles = []
        for item in feed.iter("item"):
            try:
                published = datetime.strptime(item.find("pubDate").text, self.date_format)
                record = (
                    int(published.timestamp()),
                    item.find("title").text,
                    item.find("description").text,
                )
            except (AttributeError, TypeError, ValueError):
                # AttributeError: a required child element is missing.
                # TypeError: <pubDate/> is present but empty (text is None).
                # ValueError: the date does not match ``date_format``.
                # One bad item must not discard the rest of the feed.
                logging.error(f"Received non-parsable news article from {self.name}.")
                continue
            articles.append(record)
        return articles

    def process(self, articles, new_articles):
        """Return the entries of ``new_articles`` that are not in ``articles``.

        Args:
            articles: Previously known articles.
            new_articles: Articles from the latest download.

        Returns:
            The new entries, in their original order.
        """
        delta_articles = [article for article in new_articles if article not in articles]
        if delta_articles:
            logging.info(f"Received {len(delta_articles)} new news articles from {self.name}.")
        return delta_articles

    def grab(self):
        """Download the feed, store unseen articles and remember the result.

        Download and parse failures are logged, not raised; in that case
        ``self.articles`` is left unchanged.
        """
        try:
            feed = et.fromstring(self.request())
            new_articles = self.parse(feed)
            delta_articles = self.process(self.articles, new_articles)
            self.store(delta_articles)
            self.articles = new_articles
        except requests.exceptions.RequestException:
            # RequestException is the parent of HTTPError, so this also
            # covers connection failures and timeouts, which previously
            # escaped as uncaught exceptions.
            logging.error(f"Unable to download updated news articles from {self.name}.")
        except (et.ParseError, ValueError):
            logging.error(f"Unable to parse updated news articles from {self.name}.")
|
2020-01-18 15:34:40 +00:00
|
|
|
|