Add Reuters grabbers.
parent 285c0a3fc0
commit b5771bef90
grabber.py (19 changes)

@@ -3,11 +3,13 @@ import sqlite3
 import requests
 import xml.etree.ElementTree as et
 from abc import abstractmethod, ABC
+from datetime import datetime
 from requests.exceptions import HTTPError


 class Grabber(ABC):
     articles = []
+    date_format = "%a, %d %b %Y %H:%M:%S %z"
     db = None
     name = ""

@@ -16,10 +18,6 @@ class Grabber(ABC):
     def feed_url(self):
         raise NotImplementedError

-    @abstractmethod
-    def parse(self, feed):
-        raise NotImplementedError
-
     def __init__(self, db):
         self.db = db
         self.name = self.__class__.__name__
@@ -61,6 +59,19 @@ class Grabber(ABC):
         response.raise_for_status()
         return response.content

+    def parse(self, feed):
+        articles = []
+        for article in feed.iter("item"):
+            try:
+                # Sat, 18 Jan 2020 14:21:49 -0500
+                timestamp = int(datetime.strptime(article.find("pubDate").text, self.date_format).timestamp())
+                title = article.find("title").text
+                description = article.find("description").text
+                articles.append((timestamp, title, description))
+            except AttributeError:
+                logging.error(f"Received non-parsable news article from {self.name}.")
+        return articles
+
     def process(self, articles, new_articles):
         delta_articles = [article for article in new_articles if article not in articles]
         if delta_articles:
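The net effect of this refactor: the duplicated per-source parse() bodies move into the base class, keyed on an overridable date_format. A minimal sketch of what a subclass now needs (ExampleSource, the RSS snippet, and db=None are illustrative assumptions, not part of the commit):

import xml.etree.ElementTree as et

from grabber import Grabber


class ExampleSource(Grabber):
    # Hypothetical source; only feed_url is required, since parse()
    # and the default %z date_format are inherited from Grabber.
    feed_url = "http://example.com/rss.xml"


rss = """<rss><channel><item>
  <title>Example title</title>
  <description>Example description</description>
  <pubDate>Sat, 18 Jan 2020 14:21:49 -0500</pubDate>
</item></channel></rss>"""

grabber = ExampleSource(db=None)  # assumes db is not needed by parse()
print(grabber.parse(et.fromstring(rss)))
# [(1579375309, 'Example title', 'Example description')]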
run.py (4 changes)

@@ -5,6 +5,8 @@ from pathlib import Path
 from sources.ArsTechnica import ArsTechnica
 from sources.BBCBusiness import BBCBusiness
 from sources.BBCTechnology import BBCTechnology
+from sources.ReutersBusiness import ReutersBusiness
+from sources.ReutersTechnology import ReutersTechnology

 DATABASE_PATH = Path("storage.db")
 GRAB_FREQUENCY = 15
@@ -21,6 +23,8 @@ def main():
         ArsTechnica(db),
         BBCBusiness(db),
         BBCTechnology(db),
+        ReutersBusiness(db),
+        ReutersTechnology(db),
     ]

     while True:
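Wiring in a further source is now a pattern of one new module plus two lines in run.py. A hedged sketch for one of the still-commented feeds from the URL list below (NYTBusiness is an assumed name, mirroring the Reuters classes):

# sources/NYTBusiness.py -- hypothetical, following this commit's pattern
from grabber import Grabber


class NYTBusiness(Grabber):
    feed_url = "http://rss.nytimes.com/services/xml/rss/nyt/Business.xml"

# run.py would then add, by analogy:
#   from sources.NYTBusiness import NYTBusiness
#   ...
#   NYTBusiness(db),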
sources/ArsTechnica.py

@@ -1,21 +1,5 @@
-import logging
-from datetime import datetime
 from grabber import Grabber


 class ArsTechnica(Grabber):
     feed_url = "http://feeds.arstechnica.com/arstechnica/index"
-    date_format = "%a, %d %b %Y %H:%M:%S %z"
-
-    def parse(self, feed):
-        articles = []
-        for article in feed.iter("item"):
-            try:
-                # Sat, 18 Jan 2020 15:41:56 +0000
-                timestamp = int(datetime.strptime(article.find("pubDate").text, self.date_format).timestamp())
-                title = article.find("title").text
-                description = article.find("description").text
-                articles.append((timestamp, title, description))
-            except AttributeError:
-                logging.error(f"Received non-parsable news article from {self.name}.")
-        return articles
sources/BBCBusiness.py

@@ -1,21 +1,6 @@
-import logging
-from datetime import datetime
 from grabber import Grabber


 class BBCBusiness(Grabber):
     feed_url = "http://feeds.bbci.co.uk/news/business/rss.xml"
     date_format = "%a, %d %b %Y %H:%M:%S %Z"
-
-    def parse(self, feed):
-        articles = []
-        for article in feed.iter("item"):
-            try:
-                # Fri, 17 Jan 2020 19:09:40 GMT
-                timestamp = int(datetime.strptime(article.find("pubDate").text, self.date_format).timestamp())
-                title = article.find("title").text
-                description = article.find("description").text
-                articles.append((timestamp, title, description))
-            except AttributeError:
-                logging.error(f"Received non-parsable news article from {self.name}.")
-        return articles
sources/BBCTechnology.py

@@ -1,23 +1,6 @@
-import logging
-from datetime import datetime
 from grabber import Grabber


 class BBCTechnology(Grabber):
     feed_url = "http://feeds.bbci.co.uk/news/technology/rss.xml"
     date_format = "%a, %d %b %Y %H:%M:%S %Z"
-
-    def parse(self, feed):
-        articles = []
-        for article in feed.iter("item"):
-            try:
-                # Fri, 17 Jan 2020 19:09:40 GMT
-                timestamp = int(datetime.strptime(article.find("pubDate").text, self.date_format).timestamp())
-                title = article.find("title").text
-                description = article.find("description").text
-                articles.append((timestamp, title, description))
-            except AttributeError:
-                logging.error(f"Received non-parsable news article from {self.name}.")
-        return articles
-
-
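Note why the BBC classes keep their date_format override while ArsTechnica's was deleted: the new base-class default %z expects a numeric UTC offset, whereas BBC feeds name the zone, which only %Z matches. A quick illustrative check:

from datetime import datetime

fmt_offset = "%a, %d %b %Y %H:%M:%S %z"  # base-class default
fmt_named = "%a, %d %b %Y %H:%M:%S %Z"   # BBC override

datetime.strptime("Sat, 18 Jan 2020 15:41:56 +0000", fmt_offset)  # ok (Ars-style)
datetime.strptime("Fri, 17 Jan 2020 19:09:40 GMT", fmt_named)     # ok (BBC-style)
# Swapping the formats raises ValueError: "+0000" is not a zone name,
# and "GMT" is not a numeric offset.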
sources/ReutersBusiness.py (new file, 5 lines)

@@ -0,0 +1,5 @@
+from grabber import Grabber
+
+
+class ReutersBusiness(Grabber):
+    feed_url = "http://feeds.reuters.com/reuters/businessNews?format=xml"
sources/ReutersTechnology.py (new file, 5 lines)

@@ -0,0 +1,5 @@
+from grabber import Grabber
+
+
+class ReutersTechnology(Grabber):
+    feed_url = "http://feeds.reuters.com/reuters/technologyNews?format=xml"
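The two new Reuters classes are where the refactor pays off: five lines each. A hedged sketch of how one presumably runs end to end; the fetch step is inferred from the raise_for_status()/response.content context visible in grabber.py, since the fetching method itself isn't shown in this diff:

import requests
import xml.etree.ElementTree as et

from sources.ReutersBusiness import ReutersBusiness

grabber = ReutersBusiness(db=None)   # db unused in this sketch
response = requests.get(grabber.feed_url)
response.raise_for_status()          # mirrors the check shown in grabber.py
feed = et.fromstring(response.content)
articles = grabber.parse(feed)       # [(timestamp, title, description), ...]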
(feed URL list; filename not shown in this view)

@@ -1,11 +1,15 @@
 # http://feeds.arstechnica.com/arstechnica/index
 # http://feeds.bbci.co.uk/news/business/rss.xml
 # http://feeds.bbci.co.uk/news/technology/rss.xml
+# http://feeds.reuters.com/reuters/businessNews?format=xml
 # http://feeds.reuters.com/reuters/technologyNews?format=xml
+# http://rss.nytimes.com/services/xml/rss/nyt/Business.xml
 # http://rss.nytimes.com/services/xml/rss/nyt/Technology.xml
+# http://www.forbes.com/business/feed/
 # http://www.forbes.com/technology/feed/
 # http://www.ft.com/rss/home/us
 # http://www.macworld.com/index.rss
 # http://www.wired.com/feed
 # https://www.engadget.com/rss.xml
+# https://www.huffpost.com/section/business/feed
 # https://www.huffpost.com/section/technology/feed