First attempt.
parent 55dd70e727
commit cd876e3a20
Pipfile (Normal file, 11 lines added)
@@ -0,0 +1,11 @@
[[source]]
name = "pypi"
url = "https://pypi.org/simple"
verify_ssl = true

[dev-packages]

[packages]

[requires]
python_version = "3.7"
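Note that [packages] is left empty even though grabber.py below imports requests, so a plain "pipenv install" would not pull it in. A sketch of the declared dependency (the loose "*" pin is an assumption, not part of this commit):

[packages]
requests = "*"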
grabber.py (Normal file, 47 lines added)
@@ -0,0 +1,47 @@
import logging
import requests
from abc import ABC, abstractmethod
from threading import Timer


class Grabber(ABC):
    db = None
    _interval = 60  # seconds between grabs
    _running = False

    def __init__(self, db, interval):
        self.db = db
        self._interval = interval

    @property
    @abstractmethod
    def feed_url(self):
        raise NotImplementedError

    @abstractmethod
    def grab(self):
        raise NotImplementedError

    def request(self):
        # Fetch the raw feed; raises requests.exceptions.HTTPError on 4xx/5xx.
        response = requests.get(self.feed_url)
        response.raise_for_status()
        return response.content

    def timer(self):
        # Grab once, then re-arm the timer until stop() clears _running.
        if self._running:
            self.grab()
            Timer(self._interval, self.timer).start()

    def start(self):
        if not self._running:
            self._running = True
            self.timer()
        else:
            logging.error(f"Grabber for {self.__class__.__name__} already started.")

    def stop(self):
        if self._running:
            self._running = False
        else:
            logging.error(f"Grabber for {self.__class__.__name__} already stopped.")
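Grabber is abstract, so it only runs through a subclass that supplies feed_url and grab(). A minimal sketch of the intended contract (EchoGrabber, its feed URL, and its body are illustrative, not part of this commit):

from grabber import Grabber

class EchoGrabber(Grabber):
    feed_url = "https://example.com/feed.xml"  # hypothetical feed

    def grab(self):
        # request() returns the raw feed bytes fetched from feed_url
        print(len(self.request()), "bytes fetched")

grabber = EchoGrabber(db=None, interval=30)
grabber.start()  # grabs immediately, then every 30 seconds via Timer
grabber.stop()   # the next timer fire sees _running == False and exits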
run.py (Normal file, 36 lines added)
@@ -0,0 +1,36 @@
import logging
import sqlite3
from pathlib import Path
from sources.ArsTechnica import ArsTechnica
from sources.BBCBusiness import BBCBusiness

DATABASE_PATH = Path("storage.db")
SCRAPE_INTERVAL = 15


def configure_logging():
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)-8s %(message)s",
        datefmt="%Y-%m-%d %H:%M"
    )


def setup_database():
    db = sqlite3.connect(DATABASE_PATH)
    return db


def main():
    db = setup_database()
    grabbers = [
        ArsTechnica(db, SCRAPE_INTERVAL),
        BBCBusiness(db, SCRAPE_INTERVAL)
    ]
    for grabber in grabbers:
        grabber.start()


if __name__ == "__main__":
    configure_logging()
    main()
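setup_database only opens the connection; nothing in this commit creates a table or writes to db. A sketch of where that could head, given the (timestamp, title, description) tuples BBCBusiness.parse builds below (the articles table name and the check_same_thread choice are assumptions; the latter matters because grab() runs on Timer threads, and sqlite3 connections reject cross-thread use by default):

def setup_database():
    # check_same_thread=False lets Timer threads share this connection;
    # a real version would still need locking around writes.
    db = sqlite3.connect(DATABASE_PATH, check_same_thread=False)
    db.execute(
        "CREATE TABLE IF NOT EXISTS articles "
        "(timestamp INTEGER, title TEXT, description TEXT)"
    )
    return db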
sources/ArsTechnica.py (Normal file, 10 lines added)
@@ -0,0 +1,10 @@
from grabber import Grabber


class ArsTechnica(Grabber):
    feed_url = "http://feeds.arstechnica.com/arstechnica/index"

    def grab(self):
        pass
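grab() is still a stub here. The Ars feed is also RSS, so a plausible fill-in mirrors BBCBusiness.grab below (a sketch, assuming the same <item> structure; nothing in this commit confirms it):

import logging
import xml.etree.ElementTree as et

def grab(self):
    feed = et.fromstring(self.request())
    for item in feed.iter("item"):
        logging.info(f"Ars item: {item.find('title').text}")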
sources/BBCBusiness.py (Normal file, 39 lines added)
@@ -0,0 +1,39 @@
import logging
import xml.etree.ElementTree as et
from datetime import datetime
from grabber import Grabber
from requests.exceptions import HTTPError


class BBCBusiness(Grabber):
    articles = []  # class-level cache of (timestamp, title, description) tuples
    feed_url = "http://feeds.bbci.co.uk/news/business/rss.xml"

    def grab(self):
        try:
            feed = et.fromstring(self.request())
            self.process(feed)
        except (HTTPError, et.ParseError):
            logging.error(f"Unable to get updated news from {self.__class__.__name__}.")

    def process(self, feed):
        for item in feed.iter("item"):
            article = self.parse(item)
            # parse() returns None for malformed items; skip those.
            if article is not None and article not in self.articles:
                self.articles.append(article)
                logging.info(f"New article from {self.__class__.__name__} at {datetime.fromtimestamp(article[0])}")

    def parse(self, item):
        article = None
        try:
            date = item.find("pubDate").text
            # Fri, 17 Jan 2020 19:09:40 GMT
            timestamp = int(datetime.strptime(date, "%a, %d %b %Y %H:%M:%S %Z").timestamp())
            title = item.find("title").text
            description = item.find("description").text
            article = (timestamp, title, description)
        except AttributeError:
            logging.error(f"Received non-parsable news article from {self.__class__.__name__}.")
        return article
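One fragility worth flagging: pubDate is an RFC 2822 date, and strptime's %Z happens to accept "GMT" in CPython but is locale- and platform-dependent. A sturdier stdlib alternative (a sketch, not what this commit does):

from email.utils import parsedate_to_datetime

date = "Fri, 17 Jan 2020 19:09:40 GMT"
timestamp = int(parsedate_to_datetime(date).timestamp())  # 1579288180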
sources/sources.txt (Normal file, 11 lines added)
@@ -0,0 +1,11 @@
# http://feeds.arstechnica.com/arstechnica/index
# http://feeds.bbci.co.uk/news/business/rss.xml
# http://feeds.bbci.co.uk/news/technology/rss.xml
# http://feeds.reuters.com/reuters/technologyNews?format=xml
# http://rss.nytimes.com/services/xml/rss/nyt/Technology.xml
# http://www.forbes.com/technology/feed/
# http://www.ft.com/rss/home/us
# http://www.macworld.com/index.rss
# http://www.wired.com/feed
# https://www.engadget.com/rss.xml
# https://www.huffpost.com/section/technology/feed
storage.db (Normal file, 0 lines added)