Add Reuters grabbers.

This commit is contained in:
Jack Hadrill 2020-01-18 19:47:27 +00:00
parent 285c0a3fc0
commit 5098ea53b3
9 changed files with 33 additions and 52 deletions

View File

@ -3,11 +3,13 @@ import sqlite3
import requests import requests
import xml.etree.ElementTree as et import xml.etree.ElementTree as et
from abc import abstractmethod, ABC from abc import abstractmethod, ABC
from datetime import datetime
from requests.exceptions import HTTPError from requests.exceptions import HTTPError
class Grabber(ABC): class Grabber(ABC):
articles = [] articles = []
date_format = "%a, %d %b %Y %H:%M:%S %z"
db = None db = None
name = "" name = ""
@ -16,10 +18,6 @@ class Grabber(ABC):
def feed_url(self): def feed_url(self):
raise NotImplementedError raise NotImplementedError
@abstractmethod
def parse(self, feed):
raise NotImplementedError
def __init__(self, db): def __init__(self, db):
self.db = db self.db = db
self.name = self.__class__.__name__ self.name = self.__class__.__name__
@ -61,6 +59,19 @@ class Grabber(ABC):
response.raise_for_status() response.raise_for_status()
return response.content return response.content
def parse(self, feed):
    """Extract articles from a parsed RSS feed.

    Args:
        feed: Root element (xml.etree.ElementTree.Element) of an RSS
            document; every ``<item>`` descendant is treated as one article.

    Returns:
        A list of ``(timestamp, title, description)`` tuples, where
        ``timestamp`` is an integer Unix time parsed from ``<pubDate>``
        using ``self.date_format``. Items missing a required tag or
        carrying an unparsable date are logged and skipped.
    """
    articles = []
    for article in feed.iter("item"):
        try:
            # e.g. "Sat, 18 Jan 2020 14:21:49 -0500" — format comes from self.date_format
            timestamp = int(datetime.strptime(article.find("pubDate").text, self.date_format).timestamp())
            title = article.find("title").text
            description = article.find("description").text
            articles.append((timestamp, title, description))
        except (AttributeError, ValueError):
            # AttributeError: a required tag is absent (find() returned None, .text blew up).
            # ValueError: pubDate text does not match self.date_format — previously this
            # escaped the handler and aborted the whole feed; skip the item instead.
            logging.error(f"Received non-parsable news article from {self.name}.")
    return articles
def process(self, articles, new_articles): def process(self, articles, new_articles):
delta_articles = [article for article in new_articles if article not in articles] delta_articles = [article for article in new_articles if article not in articles]
if delta_articles: if delta_articles:

4
run.py
View File

@ -5,6 +5,8 @@ from pathlib import Path
from sources.ArsTechnica import ArsTechnica from sources.ArsTechnica import ArsTechnica
from sources.BBCBusiness import BBCBusiness from sources.BBCBusiness import BBCBusiness
from sources.BBCTechnology import BBCTechnology from sources.BBCTechnology import BBCTechnology
from sources.ReutersBusiness import ReutersBusiness
from sources.ReutersTechnology import ReutersTechnology
DATABASE_PATH = Path("storage.db") DATABASE_PATH = Path("storage.db")
GRAB_FREQUENCY = 15 GRAB_FREQUENCY = 15
@ -21,6 +23,8 @@ def main():
ArsTechnica(db), ArsTechnica(db),
BBCBusiness(db), BBCBusiness(db),
BBCTechnology(db), BBCTechnology(db),
ReutersBusiness(db),
ReutersTechnology(db),
] ]
while True: while True:

View File

@ -1,21 +1,5 @@
import logging
from datetime import datetime
from grabber import Grabber from grabber import Grabber
class ArsTechnica(Grabber): class ArsTechnica(Grabber):
feed_url = "http://feeds.arstechnica.com/arstechnica/index" feed_url = "http://feeds.arstechnica.com/arstechnica/index"
date_format = "%a, %d %b %Y %H:%M:%S %z"
def parse(self, feed):
articles = []
for article in feed.iter("item"):
try:
# Sat, 18 Jan 2020 15:41:56 +0000
timestamp = int(datetime.strptime(article.find("pubDate").text, self.date_format).timestamp())
title = article.find("title").text
description = article.find("description").text
articles.append((timestamp, title, description))
except AttributeError:
logging.error(f"Received non-parsable news article from {self.name}.")
return articles

View File

@ -1,21 +1,6 @@
import logging
from datetime import datetime
from grabber import Grabber from grabber import Grabber
class BBCBusiness(Grabber): class BBCBusiness(Grabber):
feed_url = "http://feeds.bbci.co.uk/news/business/rss.xml" feed_url = "http://feeds.bbci.co.uk/news/business/rss.xml"
date_format = "%a, %d %b %Y %H:%M:%S %Z" date_format = "%a, %d %b %Y %H:%M:%S %Z"
def parse(self, feed):
articles = []
for article in feed.iter("item"):
try:
# Fri, 17 Jan 2020 19:09:40 GMT
timestamp = int(datetime.strptime(article.find("pubDate").text, self.date_format).timestamp())
title = article.find("title").text
description = article.find("description").text
articles.append((timestamp, title, description))
except AttributeError:
logging.error(f"Received non-parsable news article from {self.name}.")
return articles

View File

@ -1,23 +1,6 @@
import logging
from datetime import datetime
from grabber import Grabber from grabber import Grabber
class BBCTechnology(Grabber): class BBCTechnology(Grabber):
feed_url = "http://feeds.bbci.co.uk/news/technology/rss.xml" feed_url = "http://feeds.bbci.co.uk/news/technology/rss.xml"
date_format = "%a, %d %b %Y %H:%M:%S %Z" date_format = "%a, %d %b %Y %H:%M:%S %Z"
def parse(self, feed):
articles = []
for article in feed.iter("item"):
try:
# Fri, 17 Jan 2020 19:09:40 GMT
timestamp = int(datetime.strptime(article.find("pubDate").text, self.date_format).timestamp())
title = article.find("title").text
description = article.find("description").text
articles.append((timestamp, title, description))
except AttributeError:
logging.error(f"Received non-parsable news article from {self.name}.")
return articles

View File

@ -0,0 +1,5 @@
from grabber import Grabber
class ReutersBusiness(Grabber):
    """Grabber for the Reuters business-news RSS feed.

    Only the feed URL is defined here; fetching and parsing are inherited
    from the Grabber base class (which uses its default RFC-822-style
    date_format for <pubDate>).
    """
    feed_url = "http://feeds.reuters.com/reuters/businessNews?format=xml"

View File

@ -0,0 +1,5 @@
from grabber import Grabber
class ReutersTechnology(Grabber):
    """Grabber for the Reuters technology-news RSS feed.

    Only the feed URL is defined here; fetching and parsing are inherited
    from the Grabber base class (which uses its default RFC-822-style
    date_format for <pubDate>).
    """
    feed_url = "http://feeds.reuters.com/reuters/technologyNews?format=xml"

View File

@ -1,11 +1,15 @@
# http://feeds.arstechnica.com/arstechnica/index # http://feeds.arstechnica.com/arstechnica/index
# http://feeds.bbci.co.uk/news/business/rss.xml # http://feeds.bbci.co.uk/news/business/rss.xml
# http://feeds.bbci.co.uk/news/technology/rss.xml # http://feeds.bbci.co.uk/news/technology/rss.xml
# http://feeds.reuters.com/reuters/businessNews?format=xml
# http://feeds.reuters.com/reuters/technologyNews?format=xml # http://feeds.reuters.com/reuters/technologyNews?format=xml
# http://rss.nytimes.com/services/xml/rss/nyt/Business.xml
# http://rss.nytimes.com/services/xml/rss/nyt/Technology.xml # http://rss.nytimes.com/services/xml/rss/nyt/Technology.xml
# http://www.forbes.com/business/feed/
# http://www.forbes.com/technology/feed/ # http://www.forbes.com/technology/feed/
# http://www.ft.com/rss/home/us # http://www.ft.com/rss/home/us
# http://www.macworld.com/index.rss # http://www.macworld.com/index.rss
# http://www.wired.com/feed # http://www.wired.com/feed
# https://www.engadget.com/rss.xml # https://www.engadget.com/rss.xml
# https://www.huffpost.com/section/business/feed
# https://www.huffpost.com/section/technology/feed # https://www.huffpost.com/section/technology/feed

View File