1
0
Fork 0

Merge pull request 'develop' (#3) from develop into master

Reviewed-on: #3
This commit is contained in:
Jack Hadrill 2020-09-08 11:11:19 +00:00
commit 6096bdbde4
28 changed files with 357 additions and 0 deletions

1
.gitignore vendored
View File

@ -90,6 +90,7 @@ celerybeat-schedule
# Environments
.env
.venv
.idea
env/
venv/
ENV/

12
Pipfile Normal file
View File

@ -0,0 +1,12 @@
[[source]]
name = "pypi"
url = "https://pypi.org/simple"
verify_ssl = true
[dev-packages]
[packages]
requests = "*"
[requires]
python_version = "3.7"

57
Pipfile.lock generated Normal file
View File

@ -0,0 +1,57 @@
{
"_meta": {
"hash": {
"sha256": "bb57e0d7853b45999e47c163c46b95bc2fde31c527d8d7b5b5539dc979444a6d"
},
"pipfile-spec": 6,
"requires": {
"python_version": "3.7"
},
"sources": [
{
"name": "pypi",
"url": "https://pypi.org/simple",
"verify_ssl": true
}
]
},
"default": {
"certifi": {
"hashes": [
"sha256:017c25db2a153ce562900032d5bc68e9f191e44e9a0f762f373977de9df1fbb3",
"sha256:25b64c7da4cd7479594d035c08c2d809eb4aab3a26e5a990ea98cc450c320f1f"
],
"version": "==2019.11.28"
},
"chardet": {
"hashes": [
"sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae",
"sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"
],
"version": "==3.0.4"
},
"idna": {
"hashes": [
"sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407",
"sha256:ea8b7f6188e6fa117537c3df7da9fc686d485087abf6ac197f9c46432f7e4a3c"
],
"version": "==2.8"
},
"requests": {
"hashes": [
"sha256:11e007a8a2aa0323f5a921e9e6a2d7e4e67d9877e85773fba9ba6419025cbeb4",
"sha256:9cf5292fcd0f598c671cfc1e0d7d1a7f13bb8085e9a590f48c010551dc6c4b31"
],
"index": "pypi",
"version": "==2.22.0"
},
"urllib3": {
"hashes": [
"sha256:a8a318824cc77d1fd4b2bec2ded92646630d7fe8619497b142c84a9e6f5a7293",
"sha256:f3c5fd51747d450d4dcf6f923c81f78f811aab8205fda64b0aba34a4e48b0745"
],
"version": "==1.25.7"
}
},
"develop": {}
}

90
grabber.py Normal file
View File

@ -0,0 +1,90 @@
import logging
import sqlite3
import requests
import xml.etree.ElementTree as et
from abc import abstractmethod, ABC
from datetime import datetime
from requests.exceptions import HTTPError
class Grabber(ABC):
    """Abstract base class for RSS news-feed grabbers.

    A concrete subclass supplies ``feed_url`` (and optionally overrides
    ``date_format``). Each grabber owns one table in the shared SQLite
    database, named after the subclass, holding (timestamp, title,
    description) rows for every article seen so far.
    """

    # Default RSS <pubDate> format (RFC 822 style, numeric offset).
    # Subclasses override this when their feed uses a different format.
    date_format = "%a, %d %b %Y %H:%M:%S %z"

    @property
    @abstractmethod
    def feed_url(self):
        """URL of the RSS feed to poll. Must be provided by subclasses."""
        raise NotImplementedError

    def __init__(self, db):
        """Bind to *db* (an open sqlite3 connection), create this
        grabber's table if needed, and preload previously seen articles.
        """
        # Instance state is set here rather than as class attributes so a
        # mutable list is never shared between grabber instances.
        self.db = db
        self.name = self.__class__.__name__
        self.setup_tables()
        self.articles = self.restore()

    def setup_tables(self):
        """Create this grabber's article table if it does not exist."""
        try:
            cur = self.db.cursor()
            # Table name comes from the class name, not external input.
            cur.execute(f"CREATE TABLE IF NOT EXISTS {self.name} "
                        f"(id INTEGER PRIMARY KEY AUTOINCREMENT, timestamp TIMESTAMP, title VARCHAR, description VARCHAR)")
            self.db.commit()
        except sqlite3.Error:
            logging.error(f"Could not create table in database for {self.name}.")

    def store(self, articles):
        """Persist an iterable of (timestamp, title, description) rows."""
        try:
            cur = self.db.cursor()
            cur.executemany(f"INSERT INTO {self.name} (timestamp, title, description) "
                            f"VALUES (?,?,?)", articles)
            self.db.commit()
        except sqlite3.Error:
            logging.error(f"Could not store updated news articles from {self.name}.")

    def restore(self):
        """Load previously stored articles; return [] on database error."""
        articles = []
        try:
            cur = self.db.cursor()
            cur.execute(f"SELECT timestamp, title, description from {self.name}")
            articles = cur.fetchall()
            logging.info(f"Restored {len(articles)} news articles from database for {self.name}.")
        except sqlite3.Error:
            logging.error(f"Could not restore news articles from database for {self.name}.")
        # Plain return (not `finally: return`) so unexpected exceptions
        # are no longer silently swallowed.
        return articles

    def request(self):
        """Download the raw feed body; raises HTTPError on bad status."""
        response = requests.get(self.feed_url, headers={"User-Agent": "JackNet"})
        response.raise_for_status()
        return response.content

    def parse(self, feed):
        """Extract (timestamp, title, description) tuples from a parsed
        XML tree, skipping items with missing fields."""
        articles = []
        for article in feed.iter("item"):
            try:
                timestamp = int(datetime.strptime(article.find("pubDate").text, self.date_format).timestamp())
                title = article.find("title").text
                description = article.find("description").text
                articles.append((timestamp, title, description))
            except AttributeError:
                # find() returned None for a required element.
                logging.error(f"Received non-parsable news article from {self.name}.")
        return articles

    def process(self, articles, new_articles):
        """Return the articles in *new_articles* not already in *articles*,
        preserving feed order."""
        # Set membership keeps this linear; rows are hashable tuples.
        known = set(articles)
        delta_articles = [article for article in new_articles if article not in known]
        if delta_articles:
            logging.info(f"Received {len(delta_articles)} new news articles from {self.name}.")
        return delta_articles

    def grab(self):
        """Fetch the feed once, store any unseen articles, and remember
        the current feed contents for the next delta."""
        try:
            feed = et.fromstring(self.request())
            new_articles = self.parse(feed)
            delta_articles = self.process(self.articles, new_articles)
            self.store(delta_articles)
            self.articles = new_articles
        except HTTPError:
            logging.error(f"Unable to download updated news articles from {self.name}.")
        except (et.ParseError, ValueError):
            logging.error(f"Unable to parse updated news articles from {self.name}.")

77
run.py Normal file
View File

@ -0,0 +1,77 @@
import logging
import sqlite3
import time
from pathlib import Path
from sources.ArsTechnica import ArsTechnica
from sources.BBCBusiness import BBCBusiness
from sources.BBCTechnology import BBCTechnology
from sources.Engadget import Engadget
from sources.FinancialTimes import FinancialTimes
from sources.ForbesBusiness import ForbesBusiness
from sources.ForbesFinance import ForbesFinance
from sources.ForbesTechnology import ForbesTechnology
from sources.HuffingtonPostBusiness import HuffingtonPostBusiness
from sources.HuffingtonPostEconomy import HuffingtonPostEconomy
from sources.HuffingtonPostTechnology import HuffingtonPostTechnology
from sources.IBTimesCompanies import IBTimesCompanies
from sources.IBTimesTechnology import IBTimesTechnology
from sources.MacWorld import MacWorld
from sources.NYTBusiness import NYTBusiness
from sources.NYTEconomy import NYTEconomy
from sources.NYTTechnology import NYTTechnology
from sources.ReutersBusiness import ReutersBusiness
from sources.ReutersTechnology import ReutersTechnology
from sources.TheEconomistBusiness import TheEconomistBusiness
from sources.TheEconomistFinance import TheEconomistFinance
from sources.TheEconomistTechnology import TheEconomistTechnology
from sources.Wired import Wired
DATABASE_PATH = Path("storage.db")
# Seconds for one full polling cycle across every grabber.
GRAB_FREQUENCY = 60


def main():
    """Open the article database, build one grabber per source, and poll
    every feed forever.

    Sleeps between grabs so one pass over all grabbers takes roughly
    GRAB_FREQUENCY seconds. Exits with a non-zero status if the database
    cannot be opened.
    """
    # Keep the try narrow: only a connection failure should produce the
    # "Could not connect" message. (sqlite3.connect raises on failure;
    # it never returns a falsy connection, so no extra check is needed.)
    try:
        db = sqlite3.connect(DATABASE_PATH)
    except sqlite3.Error:
        logging.error("Could not connect to database.")
        exit(-1)
    grabbers = [
        ArsTechnica(db),
        BBCBusiness(db),
        BBCTechnology(db),
        Engadget(db),
        FinancialTimes(db),
        ForbesBusiness(db),
        ForbesFinance(db),
        ForbesTechnology(db),
        HuffingtonPostBusiness(db),
        HuffingtonPostEconomy(db),
        HuffingtonPostTechnology(db),
        IBTimesCompanies(db),
        IBTimesTechnology(db),
        MacWorld(db),
        NYTBusiness(db),
        NYTEconomy(db),
        NYTTechnology(db),
        ReutersBusiness(db),
        ReutersTechnology(db),
        TheEconomistBusiness(db),
        TheEconomistFinance(db),
        TheEconomistTechnology(db),
        Wired(db),
    ]
    while True:
        for grabber in grabbers:
            grabber.grab()
            # Space grabs evenly across the cycle.
            time.sleep(GRAB_FREQUENCY / len(grabbers))


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO,
                        format="%(asctime)s %(levelname)-8s %(message)s",
                        datefmt="%Y-%m-%d %H:%M:%S")
    main()

5
sources/ArsTechnica.py Normal file
View File

@ -0,0 +1,5 @@
from grabber import Grabber
class ArsTechnica(Grabber):
    """Grabber for the Ars Technica front-page RSS feed."""

    feed_url = "http://feeds.arstechnica.com/arstechnica/index"

6
sources/BBCBusiness.py Normal file
View File

@ -0,0 +1,6 @@
from grabber import Grabber
class BBCBusiness(Grabber):
    """Grabber for the BBC News business RSS feed."""

    feed_url = "http://feeds.bbci.co.uk/news/business/rss.xml"
    # Overrides the base format: this feed's pubDate ends in a timezone
    # name (%Z) rather than a numeric offset (%z).
    # NOTE(review): strptime's %Z only matches a few names (GMT/UTC) — confirm.
    date_format = "%a, %d %b %Y %H:%M:%S %Z"

6
sources/BBCTechnology.py Normal file
View File

@ -0,0 +1,6 @@
from grabber import Grabber
class BBCTechnology(Grabber):
    """Grabber for the BBC News technology RSS feed."""

    feed_url = "http://feeds.bbci.co.uk/news/technology/rss.xml"
    # Overrides the base format: timezone name (%Z) instead of offset (%z).
    # NOTE(review): strptime's %Z only matches a few names (GMT/UTC) — confirm.
    date_format = "%a, %d %b %Y %H:%M:%S %Z"

5
sources/Engadget.py Normal file
View File

@ -0,0 +1,5 @@
from grabber import Grabber
class Engadget(Grabber):
    """Grabber for the Engadget RSS feed."""

    feed_url = "https://www.engadget.com/rss.xml"

View File

@ -0,0 +1,6 @@
from grabber import Grabber
class FinancialTimes(Grabber):
    """Grabber for the Financial Times international edition RSS feed."""

    feed_url = "https://www.ft.com/?format=rss&edition=international"
    # Overrides the base format: timezone name (%Z) instead of offset (%z).
    date_format = "%a, %d %b %Y %H:%M:%S %Z"

View File

@ -0,0 +1,5 @@
from grabber import Grabber
class ForbesBusiness(Grabber):
    """Grabber for the Forbes business RSS feed."""

    feed_url = "http://www.forbes.com/business/feed/"

5
sources/ForbesFinance.py Normal file
View File

@ -0,0 +1,5 @@
from grabber import Grabber
class ForbesFinance(Grabber):
    """Grabber for the Forbes finance RSS feed."""

    feed_url = "http://www.forbes.com/finance/feed/"

View File

@ -0,0 +1,5 @@
from grabber import Grabber
class ForbesTechnology(Grabber):
    """Grabber for the Forbes technology RSS feed."""

    feed_url = "http://www.forbes.com/technology/feed/"

View File

@ -0,0 +1,5 @@
from grabber import Grabber
class HuffingtonPostBusiness(Grabber):
    """Grabber for the HuffPost business section RSS feed."""

    feed_url = "https://www.huffpost.com/section/business/feed"

View File

@ -0,0 +1,5 @@
from grabber import Grabber
class HuffingtonPostEconomy(Grabber):
    """Grabber for the HuffPost economy section RSS feed."""

    feed_url = "https://www.huffpost.com/section/economy/feed"

View File

@ -0,0 +1,5 @@
from grabber import Grabber
class HuffingtonPostTechnology(Grabber):
    """Grabber for the HuffPost technology section RSS feed."""

    feed_url = "https://www.huffpost.com/section/technology/feed"

View File

@ -0,0 +1,6 @@
from grabber import Grabber
class IBTimesCompanies(Grabber):
    """Grabber for the International Business Times companies RSS feed."""

    feed_url = "http://www.ibtimes.co.uk/rss/companies"
    # Overrides the base format: this feed uses ISO 8601 dates.
    date_format = "%Y-%m-%dT%H:%M:%S%z"

View File

@ -0,0 +1,6 @@
from grabber import Grabber
class IBTimesTechnology(Grabber):
    """Grabber for the International Business Times technology RSS feed."""

    feed_url = "http://www.ibtimes.co.uk/rss/technology"
    # Overrides the base format: this feed uses ISO 8601 dates.
    date_format = "%Y-%m-%dT%H:%M:%S%z"

5
sources/MacWorld.py Normal file
View File

@ -0,0 +1,5 @@
from grabber import Grabber
class MacWorld(Grabber):
    """Grabber for the Macworld RSS feed."""

    feed_url = "http://www.macworld.com/index.rss"

5
sources/NYTBusiness.py Normal file
View File

@ -0,0 +1,5 @@
from grabber import Grabber
class NYTBusiness(Grabber):
    """Grabber for the New York Times business RSS feed."""

    feed_url = "http://rss.nytimes.com/services/xml/rss/nyt/Business.xml"

5
sources/NYTEconomy.py Normal file
View File

@ -0,0 +1,5 @@
from grabber import Grabber
class NYTEconomy(Grabber):
    """Grabber for the New York Times economy RSS feed."""

    feed_url = "http://rss.nytimes.com/services/xml/rss/nyt/Economy.xml"

5
sources/NYTTechnology.py Normal file
View File

@ -0,0 +1,5 @@
from grabber import Grabber
class NYTTechnology(Grabber):
    """Grabber for the New York Times technology RSS feed."""

    feed_url = "http://rss.nytimes.com/services/xml/rss/nyt/Technology.xml"

View File

@ -0,0 +1,5 @@
from grabber import Grabber
class ReutersBusiness(Grabber):
    """Grabber for the Reuters business news RSS feed."""

    feed_url = "http://feeds.reuters.com/reuters/businessNews?format=xml"

View File

@ -0,0 +1,5 @@
from grabber import Grabber
class ReutersTechnology(Grabber):
    """Grabber for the Reuters technology news RSS feed."""

    feed_url = "http://feeds.reuters.com/reuters/technologyNews?format=xml"

View File

@ -0,0 +1,5 @@
from grabber import Grabber
class TheEconomistBusiness(Grabber):
    """Grabber for The Economist business RSS feed."""

    feed_url = "https://www.economist.com/business/rss.xml"

View File

@ -0,0 +1,5 @@
from grabber import Grabber
class TheEconomistFinance(Grabber):
    """Grabber for The Economist finance and economics RSS feed."""

    feed_url = "https://www.economist.com/finance-and-economics/rss.xml"

View File

@ -0,0 +1,5 @@
from grabber import Grabber
class TheEconomistTechnology(Grabber):
    """Grabber for The Economist science and technology RSS feed."""

    feed_url = "https://www.economist.com/science-and-technology/rss.xml"

5
sources/Wired.py Normal file
View File

@ -0,0 +1,5 @@
from grabber import Grabber
class Wired(Grabber):
    """Grabber for the Wired RSS feed."""

    feed_url = "http://www.wired.com/feed"