Add Reuters grabbers.
parent 285c0a3fc0
commit b5771bef90
grabber.py (19 changes)

@@ -3,11 +3,13 @@ import sqlite3
 import requests
 import xml.etree.ElementTree as et
 from abc import abstractmethod, ABC
+from datetime import datetime
 from requests.exceptions import HTTPError


 class Grabber(ABC):
     articles = []
+    date_format = "%a, %d %b %Y %H:%M:%S %z"
     db = None
     name = ""

@@ -16,10 +18,6 @@ class Grabber(ABC):
     def feed_url(self):
         raise NotImplementedError

-    @abstractmethod
-    def parse(self, feed):
-        raise NotImplementedError
-
     def __init__(self, db):
         self.db = db
         self.name = self.__class__.__name__
@@ -61,6 +59,19 @@ class Grabber(ABC):
         response.raise_for_status()
         return response.content

+    def parse(self, feed):
+        articles = []
+        for article in feed.iter("item"):
+            try:
+                # Sat, 18 Jan 2020 14:21:49 -0500
+                timestamp = int(datetime.strptime(article.find("pubDate").text, self.date_format).timestamp())
+                title = article.find("title").text
+                description = article.find("description").text
+                articles.append((timestamp, title, description))
+            except AttributeError:
+                logging.error(f"Received non-parsable news article from {self.name}.")
+        return articles
+
     def process(self, articles, new_articles):
         delta_articles = [article for article in new_articles if article not in articles]
         if delta_articles:
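The net effect of this refactor: the duplicated per-source parse() bodies move into the base class, keyed on an overridable date_format. A minimal sketch of what a subclass now needs (ExampleSource, the RSS snippet, and db=None are illustrative assumptions, not part of the commit):

import xml.etree.ElementTree as et

from grabber import Grabber


class ExampleSource(Grabber):
    # Hypothetical source; only feed_url is required, since parse()
    # and the default %z date_format are inherited from Grabber.
    feed_url = "http://example.com/rss.xml"


rss = """<rss><channel><item>
  <title>Example title</title>
  <description>Example description</description>
  <pubDate>Sat, 18 Jan 2020 14:21:49 -0500</pubDate>
</item></channel></rss>"""

grabber = ExampleSource(db=None)  # assumes db is not needed by parse()
print(grabber.parse(et.fromstring(rss)))
# [(1579375309, 'Example title', 'Example description')]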
run.py (4 changes)

@@ -5,6 +5,8 @@ from pathlib import Path
 from sources.ArsTechnica import ArsTechnica
 from sources.BBCBusiness import BBCBusiness
 from sources.BBCTechnology import BBCTechnology
+from sources.ReutersBusiness import ReutersBusiness
+from sources.ReutersTechnology import ReutersTechnology

 DATABASE_PATH = Path("storage.db")
 GRAB_FREQUENCY = 15
@@ -21,6 +23,8 @@ def main():
         ArsTechnica(db),
         BBCBusiness(db),
         BBCTechnology(db),
+        ReutersBusiness(db),
+        ReutersTechnology(db),
     ]

     while True:
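Wiring in a further source is now a pattern of one new module plus two lines in run.py. A hedged sketch for one of the still-commented feeds from the URL list below (NYTBusiness is an assumed name, mirroring the Reuters classes):

# sources/NYTBusiness.py -- hypothetical, following this commit's pattern
from grabber import Grabber


class NYTBusiness(Grabber):
    feed_url = "http://rss.nytimes.com/services/xml/rss/nyt/Business.xml"

# run.py would then add, by analogy:
#   from sources.NYTBusiness import NYTBusiness
#   ...
#   NYTBusiness(db),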
sources/ArsTechnica.py

@@ -1,21 +1,5 @@
-import logging
-from datetime import datetime
 from grabber import Grabber


 class ArsTechnica(Grabber):
     feed_url = "http://feeds.arstechnica.com/arstechnica/index"
-    date_format = "%a, %d %b %Y %H:%M:%S %z"
-
-    def parse(self, feed):
-        articles = []
-        for article in feed.iter("item"):
-            try:
-                # Sat, 18 Jan 2020 15:41:56 +0000
-                timestamp = int(datetime.strptime(article.find("pubDate").text, self.date_format).timestamp())
-                title = article.find("title").text
-                description = article.find("description").text
-                articles.append((timestamp, title, description))
-            except AttributeError:
-                logging.error(f"Received non-parsable news article from {self.name}.")
-        return articles
sources/BBCBusiness.py

@@ -1,21 +1,6 @@
-import logging
-from datetime import datetime
 from grabber import Grabber


 class BBCBusiness(Grabber):
     feed_url = "http://feeds.bbci.co.uk/news/business/rss.xml"
     date_format = "%a, %d %b %Y %H:%M:%S %Z"
-
-    def parse(self, feed):
-        articles = []
-        for article in feed.iter("item"):
-            try:
-                # Fri, 17 Jan 2020 19:09:40 GMT
-                timestamp = int(datetime.strptime(article.find("pubDate").text, self.date_format).timestamp())
-                title = article.find("title").text
-                description = article.find("description").text
-                articles.append((timestamp, title, description))
-            except AttributeError:
-                logging.error(f"Received non-parsable news article from {self.name}.")
-        return articles
sources/BBCTechnology.py

@@ -1,23 +1,6 @@
-import logging
-from datetime import datetime
 from grabber import Grabber


 class BBCTechnology(Grabber):
     feed_url = "http://feeds.bbci.co.uk/news/technology/rss.xml"
     date_format = "%a, %d %b %Y %H:%M:%S %Z"
-
-    def parse(self, feed):
-        articles = []
-        for article in feed.iter("item"):
-            try:
-                # Fri, 17 Jan 2020 19:09:40 GMT
-                timestamp = int(datetime.strptime(article.find("pubDate").text, self.date_format).timestamp())
-                title = article.find("title").text
-                description = article.find("description").text
-                articles.append((timestamp, title, description))
-            except AttributeError:
-                logging.error(f"Received non-parsable news article from {self.name}.")
-        return articles
-
-
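Note why the BBC classes keep their date_format override while ArsTechnica's was deleted: the new base-class default %z expects a numeric UTC offset, whereas BBC feeds name the zone, which only %Z matches. A quick illustrative check:

from datetime import datetime

fmt_offset = "%a, %d %b %Y %H:%M:%S %z"  # base-class default
fmt_named = "%a, %d %b %Y %H:%M:%S %Z"   # BBC override

datetime.strptime("Sat, 18 Jan 2020 15:41:56 +0000", fmt_offset)  # ok (Ars-style)
datetime.strptime("Fri, 17 Jan 2020 19:09:40 GMT", fmt_named)     # ok (BBC-style)
# Swapping the formats raises ValueError: "+0000" is not a zone name,
# and "GMT" is not a numeric offset.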
sources/ReutersBusiness.py (new file, 5 lines)

@@ -0,0 +1,5 @@
+from grabber import Grabber
+
+
+class ReutersBusiness(Grabber):
+    feed_url = "http://feeds.reuters.com/reuters/businessNews?format=xml"
sources/ReutersTechnology.py (new file, 5 lines)

@@ -0,0 +1,5 @@
+from grabber import Grabber
+
+
+class ReutersTechnology(Grabber):
+    feed_url = "http://feeds.reuters.com/reuters/technologyNews?format=xml"
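The two new Reuters classes are where the refactor pays off: five lines each. A hedged sketch of how one presumably runs end to end; the fetch step is inferred from the raise_for_status()/response.content context visible in grabber.py, since the fetching method itself isn't shown in this diff:

import requests
import xml.etree.ElementTree as et

from sources.ReutersBusiness import ReutersBusiness

grabber = ReutersBusiness(db=None)   # db unused in this sketch
response = requests.get(grabber.feed_url)
response.raise_for_status()          # mirrors the check shown in grabber.py
feed = et.fromstring(response.content)
articles = grabber.parse(feed)       # [(timestamp, title, description), ...]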
(feed URL list; filename not shown in this view)

@@ -1,11 +1,15 @@
 # http://feeds.arstechnica.com/arstechnica/index
 # http://feeds.bbci.co.uk/news/business/rss.xml
 # http://feeds.bbci.co.uk/news/technology/rss.xml
+# http://feeds.reuters.com/reuters/businessNews?format=xml
 # http://feeds.reuters.com/reuters/technologyNews?format=xml
+# http://rss.nytimes.com/services/xml/rss/nyt/Business.xml
 # http://rss.nytimes.com/services/xml/rss/nyt/Technology.xml
+# http://www.forbes.com/business/feed/
 # http://www.forbes.com/technology/feed/
 # http://www.ft.com/rss/home/us
 # http://www.macworld.com/index.rss
 # http://www.wired.com/feed
 # https://www.engadget.com/rss.xml
+# https://www.huffpost.com/section/business/feed
 # https://www.huffpost.com/section/technology/feed