Big refactor.
commit 285c0a3fc0
parent cd876e3a20

.gitignore (vendored): 1 change
@@ -90,6 +90,7 @@ celerybeat-schedule
 # Environments
 .env
 .venv
+.idea
 env/
 venv/
 ENV/

Pipfile: 1 change
@@ -6,6 +6,7 @@ verify_ssl = true
 [dev-packages]
 
 [packages]
+requests = "*"
 
 [requires]
 python_version = "3.7"

Pipfile.lock (generated, new file): 57 additions
@@ -0,0 +1,57 @@
+{
+    "_meta": {
+        "hash": {
+            "sha256": "bb57e0d7853b45999e47c163c46b95bc2fde31c527d8d7b5b5539dc979444a6d"
+        },
+        "pipfile-spec": 6,
+        "requires": {
+            "python_version": "3.7"
+        },
+        "sources": [
+            {
+                "name": "pypi",
+                "url": "https://pypi.org/simple",
+                "verify_ssl": true
+            }
+        ]
+    },
+    "default": {
+        "certifi": {
+            "hashes": [
+                "sha256:017c25db2a153ce562900032d5bc68e9f191e44e9a0f762f373977de9df1fbb3",
+                "sha256:25b64c7da4cd7479594d035c08c2d809eb4aab3a26e5a990ea98cc450c320f1f"
+            ],
+            "version": "==2019.11.28"
+        },
+        "chardet": {
+            "hashes": [
+                "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae",
+                "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"
+            ],
+            "version": "==3.0.4"
+        },
+        "idna": {
+            "hashes": [
+                "sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407",
+                "sha256:ea8b7f6188e6fa117537c3df7da9fc686d485087abf6ac197f9c46432f7e4a3c"
+            ],
+            "version": "==2.8"
+        },
+        "requests": {
+            "hashes": [
+                "sha256:11e007a8a2aa0323f5a921e9e6a2d7e4e67d9877e85773fba9ba6419025cbeb4",
+                "sha256:9cf5292fcd0f598c671cfc1e0d7d1a7f13bb8085e9a590f48c010551dc6c4b31"
+            ],
+            "index": "pypi",
+            "version": "==2.22.0"
+        },
+        "urllib3": {
+            "hashes": [
+                "sha256:a8a318824cc77d1fd4b2bec2ded92646630d7fe8619497b142c84a9e6f5a7293",
+                "sha256:f3c5fd51747d450d4dcf6f923c81f78f811aab8205fda64b0aba34a4e48b0745"
+            ],
+            "version": "==1.25.7"
+        }
+    },
+    "develop": {}
+}
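Note: the Pipfile itself only declares requests = "*", so this generated lock file is what actually pins the dependency graph: requests 2.22.0 plus its transitive dependencies (certifi, chardet, idna, urllib3), each with sha256 hashes. Assuming Pipenv is available, pipenv install reproduces exactly these versions, and pipenv run python run.py starts the grabbers against them.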
							
								
								
									
grabber.py: 84 changes
@@ -1,17 +1,15 @@
+import logging
+import sqlite3
 import requests
-from abc import *
-from threading import Timer
+import xml.etree.ElementTree as et
+from abc import abstractmethod, ABC
+from requests.exceptions import HTTPError
 
 
 class Grabber(ABC):
     articles = []
     db = None
-    _interval = 60
-    _running = False
-
-    def __init__(self, db, interval):
-        self.db = db
-        self._interval = interval
+    name = ""
 
     @property
     @abstractmethod
@@ -19,29 +17,65 @@ class Grabber(ABC):
         raise NotImplementedError
 
     @abstractmethod
-    def grab(self):
+    def parse(self, feed):
         raise NotImplementedError
 
+    def __init__(self, db):
+        self.db = db
+        self.name = self.__class__.__name__
+        self.setup_tables()
+        self.articles = self.restore()
+
+    def setup_tables(self):
+        try:
+            cur = self.db.cursor()
+            cur.execute(f"CREATE TABLE IF NOT EXISTS {self.name} "
+                    f"(id INTEGER PRIMARY KEY AUTOINCREMENT, timestamp TIMESTAMP, title VARCHAR, description VARCHAR)")
+            self.db.commit()
+        except sqlite3.Error:
+            logging.error(f"Could not create table in database for {self.name}.")
+
+    def store(self, articles):
+        try:
+            cur = self.db.cursor()
+            cur.executemany(f"INSERT INTO {self.name} (timestamp, title, description) "
+                            f"VALUES (?,?,?)", articles)
+            self.db.commit()
+        except sqlite3.Error:
+            logging.error(f"Could not store updated news articles from {self.name}.")
+
+    def restore(self):
+        articles = []
+        try:
+            cur = self.db.cursor()
+            cur.execute(f"SELECT timestamp, title, description from {self.name}")
+            articles = cur.fetchall()
+            logging.info(f"Restored {len(articles)} news articles from database for {self.name}.")
+        except sqlite3.Error:
+            logging.error(f"Could not restore news articles from database for {self.name}.")
+        finally:
+            return articles
+
     def request(self):
         response = requests.get(self.feed_url)
         response.raise_for_status()
         return response.content
 
-    def timer(self):
-        if self._running:
-            self.grab()
-            Timer(self._interval, self.timer).start()
+    def process(self, articles, new_articles):
+        delta_articles = [article for article in new_articles if article not in articles]
+        if delta_articles:
+            logging.info(f"Received {len(delta_articles)} new news articles from {self.name}.")
+        return delta_articles
 
-    def start(self):
-        if not self._running:
-            self._running = True
-            self.timer()
-        else:
-            logging.error(f"Grabber for {self.__class__.__name__} already started.")
-
-    def stop(self):
-        if self._running:
-            self._running = False
-        else:
-            logging.error(f"Grabber for {self.__class__.__name__} already stopped.")
+    def grab(self):
+        try:
+            feed = et.fromstring(self.request())
+            new_articles = self.parse(feed)
+            delta_articles = self.process(self.articles, new_articles)
+            self.store(delta_articles)
+            self.articles = new_articles
+        except HTTPError:
+            logging.error(f"Unable to download updated news articles from {self.name}.")
+        except (et.ParseError, ValueError):
+            logging.error(f"Unable to parse updated news articles from {self.name}.")
							
								
								
									
run.py: 46 changes
@@ -1,36 +1,38 @@
 import logging
 import sqlite3
+import time
 from pathlib import Path
 from sources.ArsTechnica import ArsTechnica
 from sources.BBCBusiness import BBCBusiness
+from sources.BBCTechnology import BBCTechnology
 
 DATABASE_PATH = Path("storage.db")
-SCRAPE_INTERVAL = 15
-
-
-def configure_logging():
-    logging.basicConfig(
-        level=logging.INFO,
-        format="%(asctime)s %(levelname)-8s %(message)s",
-        datefmt="%Y-%m-%d %H:%M"
-    )
-
-
-def setup_database():
-    db = sqlite3.connect(DATABASE_PATH)
-    return db
+GRAB_FREQUENCY = 15
+GRAB_INTERVAL = 5
 
 
 def main():
-    db = setup_database()
-    grabbers = [
-        ArsTechnica(db, SCRAPE_INTERVAL),
-        BBCBusiness(db, SCRAPE_INTERVAL)
-    ]
-    for grabber in grabbers:
-        grabber.start()
+    try:
+        db = sqlite3.connect(DATABASE_PATH)
+        if not db:
+            raise sqlite3.DatabaseError
+
+        grabbers = [
+            ArsTechnica(db),
+            BBCBusiness(db),
+            BBCTechnology(db),
+        ]
+
+        while True:
+            for grabber in grabbers:
+                grabber.grab()
+                time.sleep(GRAB_FREQUENCY/GRAB_INTERVAL)
+
+    except sqlite3.Error:
+        logging.error("Could not connect to database.")
+        exit(-1)
 
 
 if __name__ == "__main__":
-    configure_logging()
+    logging.basicConfig(level=logging.INFO,format="%(asctime)s %(levelname)-8s %(message)s",datefmt="%Y-%m-%d %H:%M:%S")
    main()
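Note: the rewrite drops the Timer-based start()/stop() scheduling in favour of one synchronous loop. time.sleep(GRAB_FREQUENCY / GRAB_INTERVAL) pauses 15 / 5 = 3 seconds after every individual grab(), so with three registered grabbers each feed is polled roughly every nine seconds plus request time. Deriving the pause from a frequency divided by an interval is hard to read at a glance; a single constant such as GRAB_PAUSE_SECONDS would express the same value directly (a possible follow-up, not part of this commit).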
sources/ArsTechnica.py

@@ -1,10 +1,21 @@
 import logging
 from datetime import datetime
 from grabber import Grabber
 
 
 class ArsTechnica(Grabber):
     feed_url = "http://feeds.arstechnica.com/arstechnica/index"
+    date_format = "%a, %d %b %Y %H:%M:%S %z"
 
-    def grab(self):
-        pass
+    def parse(self, feed):
+        articles = []
+        for article in feed.iter("item"):
+            try:
+                # Sat, 18 Jan 2020 15:41:56 +0000
+                timestamp = int(datetime.strptime(article.find("pubDate").text, self.date_format).timestamp())
+                title = article.find("title").text
+                description = article.find("description").text
+                articles.append((timestamp, title, description))
+            except AttributeError:
+                logging.error(f"Received non-parsable news article from {self.name}.")
+        return articles
sources/BBCBusiness.py

@@ -1,39 +1,21 @@
-import csv
 import logging
-import xml.etree.ElementTree as et
 from datetime import datetime
 from grabber import Grabber
-from requests.exceptions import HTTPError
 
 
 class BBCBusiness(Grabber):
-    articles = []
     feed_url = "http://feeds.bbci.co.uk/news/business/rss.xml"
+    date_format = "%a, %d %b %Y %H:%M:%S %Z"
 
-    def grab(self):
-        try:
-            feed = et.fromstring(self.request())
-            self.process(feed)
-        except (HTTPError, et.ParseError):
-            logging.error(f"Unable to get updated news from {self.__class__.__name__}.")
-
-    def process(self, feed):
-        for item in feed.iter("item"):
-            article = self.parse(item)
-            if article not in self.articles:
-                self.articles.append(article)
-                logging.info(f"New article from {self.__class__.__name__} at {datetime.fromtimestamp(article[0])}")
-
-    def parse(self, item):
-        article = None
-        try:
-            date = item.find("pubDate").text
-            # Fri, 17 Jan 2020 19:09:40 GMT
-            timestamp = int(datetime.strptime(date, "%a, %d %b %Y %H:%M:%S %Z").timestamp())
-            title = item.find("title").text
-            description = item.find("description").text
-            article = (timestamp, title, description)
-        except AttributeError:
-            logging.error(f"Received non-parsable news article from {self.__class__.__name__}.")
-        finally:
-            return article
+    def parse(self, feed):
+        articles = []
+        for article in feed.iter("item"):
+            try:
+                # Fri, 17 Jan 2020 19:09:40 GMT
+                timestamp = int(datetime.strptime(article.find("pubDate").text, self.date_format).timestamp())
+                title = article.find("title").text
+                description = article.find("description").text
+                articles.append((timestamp, title, description))
+            except AttributeError:
+                logging.error(f"Received non-parsable news article from {self.name}.")
+        return articles
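Note: parse() here is line-for-line identical to the one in the new sources/BBCTechnology.py below, and ArsTechnica differs only in its date_format (a %z numeric offset instead of a %Z zone name). Since Grabber already owns the rest of the pipeline, this duplicated <item> walk looks like a candidate for a default parse() implementation in the base class, with subclasses supplying only feed_url and date_format (an observation, not something this commit does).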
							
								
								
									
sources/BBCTechnology.py (new file): 23 additions
@@ -0,0 +1,23 @@
+import logging
+from datetime import datetime
+from grabber import Grabber
+
+
+class BBCTechnology(Grabber):
+    feed_url = "http://feeds.bbci.co.uk/news/technology/rss.xml"
+    date_format = "%a, %d %b %Y %H:%M:%S %Z"
+
+    def parse(self, feed):
+        articles = []
+        for article in feed.iter("item"):
+            try:
+                # Fri, 17 Jan 2020 19:09:40 GMT
+                timestamp = int(datetime.strptime(article.find("pubDate").text, self.date_format).timestamp())
+                title = article.find("title").text
+                description = article.find("description").text
+                articles.append((timestamp, title, description))
+            except AttributeError:
+                logging.error(f"Received non-parsable news article from {self.name}.")
+        return articles