"""RSS news-feed grabber with SQLite persistence.

``Grabber`` is an abstract base class: a concrete subclass only needs to
provide ``feed_url``; downloading, parsing, de-duplication and storage are
inherited.  Each subclass persists its articles in a SQLite table named
after the subclass.
"""

import logging
import sqlite3
import xml.etree.ElementTree as et
from abc import ABC, abstractmethod
from datetime import datetime

import requests
from requests.exceptions import HTTPError

logger = logging.getLogger(__name__)


class Grabber(ABC):
    """Download an RSS feed, parse its ``<item>`` entries, and store any
    previously-unseen articles in a per-subclass SQLite table.

    Articles are represented throughout as ``(timestamp, title, description)``
    tuples, where ``timestamp`` is an integer Unix time.
    """

    # RSS 2.0 pubDate format, e.g. "Sat, 18 Jan 2020 14:21:49 -0500"
    date_format = "%a, %d %b %Y %H:%M:%S %z"

    @property
    @abstractmethod
    def feed_url(self):
        """URL of the RSS feed to poll; must be supplied by subclasses."""
        raise NotImplementedError

    def __init__(self, db):
        """Bind to an open database, ensure this grabber's table exists,
        and load any previously stored articles into memory.

        :param db: an open ``sqlite3.Connection``
        """
        self.db = db
        # The table name doubles as the grabber's display name; it is
        # derived from the class name, never from external input, so it is
        # safe to interpolate into SQL below.
        self.name = self.__class__.__name__
        # Instance state (the original used a shared, mutable class
        # attribute ``articles = []`` — a classic aliasing hazard).
        self.articles = []
        self.setup_tables()
        self.articles = self.restore()

    def setup_tables(self):
        """Create this grabber's article table if it does not exist yet."""
        try:
            cur = self.db.cursor()
            cur.execute(
                f"CREATE TABLE IF NOT EXISTS {self.name} "
                f"(id INTEGER PRIMARY KEY AUTOINCREMENT, "
                f"timestamp TIMESTAMP, title VARCHAR, description VARCHAR)"
            )
            self.db.commit()
        except sqlite3.Error:
            logger.error("Could not create table in database for %s.", self.name)

    def store(self, articles):
        """Insert ``(timestamp, title, description)`` tuples into the table.

        :param articles: iterable of 3-tuples to persist
        """
        try:
            cur = self.db.cursor()
            # Values are bound with ? placeholders; only the table name
            # (class-derived, trusted) is interpolated.
            cur.executemany(
                f"INSERT INTO {self.name} (timestamp, title, description) "
                f"VALUES (?,?,?)",
                articles,
            )
            self.db.commit()
        except sqlite3.Error:
            logger.error("Could not store updated news articles from %s.", self.name)

    def restore(self):
        """Return all stored ``(timestamp, title, description)`` tuples,
        or an empty list if the table cannot be read.

        The original returned from a ``finally`` block, which silently
        swallowed any non-sqlite3 exception in flight; a plain return after
        ``try``/``except``/``else`` keeps the same result without masking
        unexpected errors.
        """
        articles = []
        try:
            cur = self.db.cursor()
            cur.execute(f"SELECT timestamp, title, description from {self.name}")
            articles = cur.fetchall()
        except sqlite3.Error:
            logger.error(
                "Could not restore news articles from database for %s.", self.name
            )
        else:
            logger.info(
                "Restored %d news articles from database for %s.",
                len(articles),
                self.name,
            )
        return articles

    def request(self):
        """Download the feed and return the raw response body.

        :returns: response body as ``bytes``
        :raises requests.exceptions.HTTPError: on a non-2xx response
        """
        # A timeout prevents a stalled server from hanging the grabber
        # forever (requests has no default timeout).
        response = requests.get(
            self.feed_url, headers={"User-Agent": "JackNet"}, timeout=30
        )
        response.raise_for_status()
        return response.content

    def parse(self, feed):
        """Extract ``(timestamp, title, description)`` tuples from a parsed
        RSS tree.

        Items missing any of the three required elements are logged and
        skipped (``Element.find`` returns ``None``, so ``.text`` raises
        ``AttributeError``).  A malformed ``pubDate`` raises ``ValueError``,
        which is handled by the caller (``grab``).

        :param feed: an ``xml.etree.ElementTree.Element`` feed root
        :returns: list of article tuples in feed order
        """
        articles = []
        for article in feed.iter("item"):
            try:
                # pubDate example: Sat, 18 Jan 2020 14:21:49 -0500
                timestamp = int(
                    datetime.strptime(
                        article.find("pubDate").text, self.date_format
                    ).timestamp()
                )
                title = article.find("title").text
                description = article.find("description").text
                articles.append((timestamp, title, description))
            except AttributeError:
                logger.error("Received non-parsable news article from %s.", self.name)
        return articles

    def process(self, articles, new_articles):
        """Return the entries of ``new_articles`` not already present in
        ``articles``, preserving feed order.

        :param articles: known article tuples
        :param new_articles: freshly parsed article tuples
        """
        # Article tuples are hashable, so a set gives O(1) membership tests
        # instead of the original O(n) list scans per article.
        known = set(articles)
        delta_articles = [article for article in new_articles if article not in known]
        if delta_articles:
            logger.info(
                "Received %d new news articles from %s.",
                len(delta_articles),
                self.name,
            )
        return delta_articles

    def grab(self):
        """Run one fetch cycle: download, parse, persist new articles, and
        replace the in-memory article list.

        Network failures and unparsable feeds are logged, not raised.
        """
        try:
            feed = et.fromstring(self.request())
            new_articles = self.parse(feed)
            delta_articles = self.process(self.articles, new_articles)
            self.store(delta_articles)
            self.articles = new_articles
        except HTTPError:
            logger.error(
                "Unable to download updated news articles from %s.", self.name
            )
        except (et.ParseError, ValueError):
            # ValueError: strptime failed on a pubDate inside parse().
            logger.error("Unable to parse updated news articles from %s.", self.name)