Merge pull request 'develop' (#3) from develop into master
Reviewed-on: #3
Commit: 6096bdbde4
.gitignore (vendored): 1 addition
@@ -90,6 +90,7 @@ celerybeat-schedule
 # Environments
 .env
 .venv
+.idea
 env/
 venv/
 ENV/
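The single addition, .idea, keeps JetBrains IDE project metadata out of version control.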
Pipfile: 12 additions (new file)
@@ -0,0 +1,12 @@
[[source]]
name = "pypi"
url = "https://pypi.org/simple"
verify_ssl = true

[dev-packages]

[packages]
requests = "*"

[requires]
python_version = "3.7"
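The Pipfile pins Python 3.7 and declares requests as the only direct dependency; certifi, chardet, idna, and urllib3 in the lock file below are pulled in transitively by requests. As a usage note (standard pipenv commands, not part of the PR), pipenv install recreates this environment and pipenv run python run.py starts the grabber.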
Pipfile.lock (generated): 57 additions (new file)
@@ -0,0 +1,57 @@
{
    "_meta": {
        "hash": {
            "sha256": "bb57e0d7853b45999e47c163c46b95bc2fde31c527d8d7b5b5539dc979444a6d"
        },
        "pipfile-spec": 6,
        "requires": {
            "python_version": "3.7"
        },
        "sources": [
            {
                "name": "pypi",
                "url": "https://pypi.org/simple",
                "verify_ssl": true
            }
        ]
    },
    "default": {
        "certifi": {
            "hashes": [
                "sha256:017c25db2a153ce562900032d5bc68e9f191e44e9a0f762f373977de9df1fbb3",
                "sha256:25b64c7da4cd7479594d035c08c2d809eb4aab3a26e5a990ea98cc450c320f1f"
            ],
            "version": "==2019.11.28"
        },
        "chardet": {
            "hashes": [
                "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae",
                "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"
            ],
            "version": "==3.0.4"
        },
        "idna": {
            "hashes": [
                "sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407",
                "sha256:ea8b7f6188e6fa117537c3df7da9fc686d485087abf6ac197f9c46432f7e4a3c"
            ],
            "version": "==2.8"
        },
        "requests": {
            "hashes": [
                "sha256:11e007a8a2aa0323f5a921e9e6a2d7e4e67d9877e85773fba9ba6419025cbeb4",
                "sha256:9cf5292fcd0f598c671cfc1e0d7d1a7f13bb8085e9a590f48c010551dc6c4b31"
            ],
            "index": "pypi",
            "version": "==2.22.0"
        },
        "urllib3": {
            "hashes": [
                "sha256:a8a318824cc77d1fd4b2bec2ded92646630d7fe8619497b142c84a9e6f5a7293",
                "sha256:f3c5fd51747d450d4dcf6f923c81f78f811aab8205fda64b0aba34a4e48b0745"
            ],
            "version": "==1.25.7"
        }
    },
    "develop": {}
}
grabber.py: 90 additions (new file)
@@ -0,0 +1,90 @@
import logging
import sqlite3
import requests
import xml.etree.ElementTree as et
from abc import abstractmethod, ABC
from datetime import datetime
from requests.exceptions import HTTPError


class Grabber(ABC):
    articles = []
    date_format = "%a, %d %b %Y %H:%M:%S %z"  # RFC 822 pubDate with numeric offset; sources may override
    db = None
    name = ""

    @property
    @abstractmethod
    def feed_url(self):
        # Every concrete source must supply its RSS feed URL.
        raise NotImplementedError

    def __init__(self, db):
        self.db = db
        self.name = self.__class__.__name__  # the class name doubles as the table name
        self.setup_tables()
        self.articles = self.restore()

    def setup_tables(self):
        try:
            cur = self.db.cursor()
            cur.execute(f"CREATE TABLE IF NOT EXISTS {self.name} "
                        f"(id INTEGER PRIMARY KEY AUTOINCREMENT, timestamp TIMESTAMP, title VARCHAR, description VARCHAR)")
            self.db.commit()
        except sqlite3.Error:
            logging.error(f"Could not create table in database for {self.name}.")

    def store(self, articles):
        try:
            cur = self.db.cursor()
            cur.executemany(f"INSERT INTO {self.name} (timestamp, title, description) "
                            f"VALUES (?,?,?)", articles)
            self.db.commit()
        except sqlite3.Error:
            logging.error(f"Could not store updated news articles from {self.name}.")

    def restore(self):
        articles = []
        try:
            cur = self.db.cursor()
            cur.execute(f"SELECT timestamp, title, description from {self.name}")
            articles = cur.fetchall()
            logging.info(f"Restored {len(articles)} news articles from database for {self.name}.")
        except sqlite3.Error:
            logging.error(f"Could not restore news articles from database for {self.name}.")
        finally:
            return articles

    def request(self):
        response = requests.get(self.feed_url, headers={"User-Agent": "JackNet"})
        response.raise_for_status()
        return response.content

    def parse(self, feed):
        articles = []
        for article in feed.iter("item"):
            try:
                timestamp = int(datetime.strptime(article.find("pubDate").text, self.date_format).timestamp())
                title = article.find("title").text
                description = article.find("description").text
                articles.append((timestamp, title, description))
            except AttributeError:
                logging.error(f"Received non-parsable news article from {self.name}.")
        return articles

    def process(self, articles, new_articles):
        # An article counts as new if the previous fetch did not contain it.
        delta_articles = [article for article in new_articles if article not in articles]
        if delta_articles:
            logging.info(f"Received {len(delta_articles)} new news articles from {self.name}.")
        return delta_articles

    def grab(self):
        try:
            feed = et.fromstring(self.request())
            new_articles = self.parse(feed)
            delta_articles = self.process(self.articles, new_articles)
            self.store(delta_articles)
            self.articles = new_articles
        except HTTPError:
            logging.error(f"Unable to download updated news articles from {self.name}.")
        except (et.ParseError, ValueError):
            logging.error(f"Unable to parse updated news articles from {self.name}.")
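Every concrete source subclasses Grabber and assigns feed_url as a plain class attribute, which satisfies the abstract property. A minimal sketch of that wiring, with a hypothetical ExampleFeed class and feed URL that are not part of the PR:

import sqlite3

from grabber import Grabber


class ExampleFeed(Grabber):
    # Hypothetical source; the real ones live under sources/.
    feed_url = "https://example.com/rss.xml"


db = sqlite3.connect(":memory:")  # throwaway database for the sketch
feed = ExampleFeed(db)            # creates an ExampleFeed table and restores prior rows
feed.grab()                       # one cycle: download, parse, diff, store

Because each table is named after the subclass, every source keeps its own article history in the shared database.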
run.py: 77 additions (new file)
@@ -0,0 +1,77 @@
import logging
import sqlite3
import time

from pathlib import Path

from sources.ArsTechnica import ArsTechnica
from sources.BBCBusiness import BBCBusiness
from sources.BBCTechnology import BBCTechnology
from sources.Engadget import Engadget
from sources.FinancialTimes import FinancialTimes
from sources.ForbesBusiness import ForbesBusiness
from sources.ForbesFinance import ForbesFinance
from sources.ForbesTechnology import ForbesTechnology
from sources.HuffingtonPostBusiness import HuffingtonPostBusiness
from sources.HuffingtonPostEconomy import HuffingtonPostEconomy
from sources.HuffingtonPostTechnology import HuffingtonPostTechnology
from sources.IBTimesCompanies import IBTimesCompanies
from sources.IBTimesTechnology import IBTimesTechnology
from sources.MacWorld import MacWorld
from sources.NYTBusiness import NYTBusiness
from sources.NYTEconomy import NYTEconomy
from sources.NYTTechnology import NYTTechnology
from sources.ReutersBusiness import ReutersBusiness
from sources.ReutersTechnology import ReutersTechnology
from sources.TheEconomistBusiness import TheEconomistBusiness
from sources.TheEconomistFinance import TheEconomistFinance
from sources.TheEconomistTechnology import TheEconomistTechnology
from sources.Wired import Wired


DATABASE_PATH = Path("storage.db")
GRAB_FREQUENCY = 60  # seconds between polls of any single feed


def main():
    try:
        db = sqlite3.connect(DATABASE_PATH)
        if not db:
            raise sqlite3.DatabaseError

        grabbers = [
            ArsTechnica(db),
            BBCBusiness(db),
            BBCTechnology(db),
            Engadget(db),
            FinancialTimes(db),
            ForbesBusiness(db),
            ForbesFinance(db),
            ForbesTechnology(db),
            HuffingtonPostBusiness(db),
            HuffingtonPostEconomy(db),
            HuffingtonPostTechnology(db),
            IBTimesCompanies(db),
            IBTimesTechnology(db),
            MacWorld(db),
            NYTBusiness(db),
            NYTEconomy(db),
            NYTTechnology(db),
            ReutersBusiness(db),
            ReutersTechnology(db),
            TheEconomistBusiness(db),
            TheEconomistFinance(db),
            TheEconomistTechnology(db),
            Wired(db),
        ]

        while True:
            for grabber in grabbers:
                grabber.grab()
                # Spread requests evenly so each feed is polled once per GRAB_FREQUENCY seconds.
                time.sleep(GRAB_FREQUENCY / len(grabbers))

    except sqlite3.Error:
        logging.error("Could not connect to database.")
        exit(-1)


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO,
                        format="%(asctime)s %(levelname)-8s %(message)s",
                        datefmt="%Y-%m-%d %H:%M:%S")
    main()
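Dividing GRAB_FREQUENCY by the number of grabbers staggers the requests, so one full pass over all feeds takes roughly GRAB_FREQUENCY seconds. A hedged sketch (not part of the PR) of inspecting what has accumulated in storage.db, using the Wired table that Grabber.setup_tables names after the class:

import sqlite3

db = sqlite3.connect("storage.db")
rows = db.execute("SELECT timestamp, title FROM Wired ORDER BY timestamp DESC LIMIT 5")
for timestamp, title in rows:
    print(timestamp, title)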
sources/ArsTechnica.py: 5 additions (new file)

@@ -0,0 +1,5 @@
from grabber import Grabber


class ArsTechnica(Grabber):
    feed_url = "http://feeds.arstechnica.com/arstechnica/index"
sources/BBCBusiness.py: 6 additions (new file)

@@ -0,0 +1,6 @@
from grabber import Grabber


class BBCBusiness(Grabber):
    feed_url = "http://feeds.bbci.co.uk/news/business/rss.xml"
    date_format = "%a, %d %b %Y %H:%M:%S %Z"
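The BBC feeds (and the Financial Times below) stamp pubDate with a named time zone such as GMT rather than a numeric offset, so they override the default date_format with %Z; the IBTimes sources further down use an ISO 8601 style format instead.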
sources/BBCTechnology.py: 6 additions (new file)

@@ -0,0 +1,6 @@
from grabber import Grabber


class BBCTechnology(Grabber):
    feed_url = "http://feeds.bbci.co.uk/news/technology/rss.xml"
    date_format = "%a, %d %b %Y %H:%M:%S %Z"
sources/Engadget.py: 5 additions (new file)

@@ -0,0 +1,5 @@
from grabber import Grabber


class Engadget(Grabber):
    feed_url = "https://www.engadget.com/rss.xml"
sources/FinancialTimes.py: 6 additions (new file)

@@ -0,0 +1,6 @@
from grabber import Grabber


class FinancialTimes(Grabber):
    feed_url = "https://www.ft.com/?format=rss&edition=international"
    date_format = "%a, %d %b %Y %H:%M:%S %Z"
sources/ForbesBusiness.py: 5 additions (new file)

@@ -0,0 +1,5 @@
from grabber import Grabber


class ForbesBusiness(Grabber):
    feed_url = "http://www.forbes.com/business/feed/"
sources/ForbesFinance.py: 5 additions (new file)

@@ -0,0 +1,5 @@
from grabber import Grabber


class ForbesFinance(Grabber):
    feed_url = "http://www.forbes.com/finance/feed/"
sources/ForbesTechnology.py: 5 additions (new file)

@@ -0,0 +1,5 @@
from grabber import Grabber


class ForbesTechnology(Grabber):
    feed_url = "http://www.forbes.com/technology/feed/"
sources/HuffingtonPostBusiness.py: 5 additions (new file)

@@ -0,0 +1,5 @@
from grabber import Grabber


class HuffingtonPostBusiness(Grabber):
    feed_url = "https://www.huffpost.com/section/business/feed"
sources/HuffingtonPostEconomy.py: 5 additions (new file)

@@ -0,0 +1,5 @@
from grabber import Grabber


class HuffingtonPostEconomy(Grabber):
    feed_url = "https://www.huffpost.com/section/economy/feed"
sources/HuffingtonPostTechnology.py: 5 additions (new file)

@@ -0,0 +1,5 @@
from grabber import Grabber


class HuffingtonPostTechnology(Grabber):
    feed_url = "https://www.huffpost.com/section/technology/feed"
sources/IBTimesCompanies.py: 6 additions (new file)

@@ -0,0 +1,6 @@
from grabber import Grabber


class IBTimesCompanies(Grabber):
    feed_url = "http://www.ibtimes.co.uk/rss/companies"
    date_format = "%Y-%m-%dT%H:%M:%S%z"
sources/IBTimesTechnology.py: 6 additions (new file)

@@ -0,0 +1,6 @@
from grabber import Grabber


class IBTimesTechnology(Grabber):
    feed_url = "http://www.ibtimes.co.uk/rss/technology"
    date_format = "%Y-%m-%dT%H:%M:%S%z"
sources/MacWorld.py: 5 additions (new file)

@@ -0,0 +1,5 @@
from grabber import Grabber


class MacWorld(Grabber):
    feed_url = "http://www.macworld.com/index.rss"
sources/NYTBusiness.py: 5 additions (new file)

@@ -0,0 +1,5 @@
from grabber import Grabber


class NYTBusiness(Grabber):
    feed_url = "http://rss.nytimes.com/services/xml/rss/nyt/Business.xml"
sources/NYTEconomy.py: 5 additions (new file)

@@ -0,0 +1,5 @@
from grabber import Grabber


class NYTEconomy(Grabber):
    feed_url = "http://rss.nytimes.com/services/xml/rss/nyt/Economy.xml"
sources/NYTTechnology.py: 5 additions (new file)

@@ -0,0 +1,5 @@
from grabber import Grabber


class NYTTechnology(Grabber):
    feed_url = "http://rss.nytimes.com/services/xml/rss/nyt/Technology.xml"
sources/ReutersBusiness.py: 5 additions (new file)

@@ -0,0 +1,5 @@
from grabber import Grabber


class ReutersBusiness(Grabber):
    feed_url = "http://feeds.reuters.com/reuters/businessNews?format=xml"
sources/ReutersTechnology.py: 5 additions (new file)

@@ -0,0 +1,5 @@
from grabber import Grabber


class ReutersTechnology(Grabber):
    feed_url = "http://feeds.reuters.com/reuters/technologyNews?format=xml"
sources/TheEconomistBusiness.py: 5 additions (new file)

@@ -0,0 +1,5 @@
from grabber import Grabber


class TheEconomistBusiness(Grabber):
    feed_url = "https://www.economist.com/business/rss.xml"
sources/TheEconomistFinance.py: 5 additions (new file)

@@ -0,0 +1,5 @@
from grabber import Grabber


class TheEconomistFinance(Grabber):
    feed_url = "https://www.economist.com/finance-and-economics/rss.xml"
sources/TheEconomistTechnology.py: 5 additions (new file)

@@ -0,0 +1,5 @@
from grabber import Grabber


class TheEconomistTechnology(Grabber):
    feed_url = "https://www.economist.com/science-and-technology/rss.xml"
sources/Wired.py: 5 additions (new file)

@@ -0,0 +1,5 @@
from grabber import Grabber


class Wired(Grabber):
    feed_url = "http://www.wired.com/feed"