Big refactor.
commit 285c0a3fc0
parent cd876e3a20

.gitignore (vendored): 1 line changed
@@ -90,6 +90,7 @@ celerybeat-schedule
 # Environments
 .env
 .venv
+.idea
 env/
 venv/
 ENV/

Pipfile: 1 line changed
@@ -6,6 +6,7 @@ verify_ssl = true
 [dev-packages]
 
 [packages]
+requests = "*"
 
 [requires]
 python_version = "3.7"

Pipfile.lock (generated, new file): 57 lines added
@@ -0,0 +1,57 @@
+{
+    "_meta": {
+        "hash": {
+            "sha256": "bb57e0d7853b45999e47c163c46b95bc2fde31c527d8d7b5b5539dc979444a6d"
+        },
+        "pipfile-spec": 6,
+        "requires": {
+            "python_version": "3.7"
+        },
+        "sources": [
+            {
+                "name": "pypi",
+                "url": "https://pypi.org/simple",
+                "verify_ssl": true
+            }
+        ]
+    },
+    "default": {
+        "certifi": {
+            "hashes": [
+                "sha256:017c25db2a153ce562900032d5bc68e9f191e44e9a0f762f373977de9df1fbb3",
+                "sha256:25b64c7da4cd7479594d035c08c2d809eb4aab3a26e5a990ea98cc450c320f1f"
+            ],
+            "version": "==2019.11.28"
+        },
+        "chardet": {
+            "hashes": [
+                "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae",
+                "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"
+            ],
+            "version": "==3.0.4"
+        },
+        "idna": {
+            "hashes": [
+                "sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407",
+                "sha256:ea8b7f6188e6fa117537c3df7da9fc686d485087abf6ac197f9c46432f7e4a3c"
+            ],
+            "version": "==2.8"
+        },
+        "requests": {
+            "hashes": [
+                "sha256:11e007a8a2aa0323f5a921e9e6a2d7e4e67d9877e85773fba9ba6419025cbeb4",
+                "sha256:9cf5292fcd0f598c671cfc1e0d7d1a7f13bb8085e9a590f48c010551dc6c4b31"
+            ],
+            "index": "pypi",
+            "version": "==2.22.0"
+        },
+        "urllib3": {
+            "hashes": [
+                "sha256:a8a318824cc77d1fd4b2bec2ded92646630d7fe8619497b142c84a9e6f5a7293",
+                "sha256:f3c5fd51747d450d4dcf6f923c81f78f811aab8205fda64b0aba34a4e48b0745"
+            ],
+            "version": "==1.25.7"
+        }
+    },
+    "develop": {}
+}
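
For context (not part of the diff): the new Pipfile and Pipfile.lock suggest the project is managed with Pipenv. Assuming that, the pinned dependencies can be installed and the grabbers started with:

    pipenv install
    pipenv run python run.py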
							
								
								
									
grabber.py: 84 lines changed
@@ -1,17 +1,15 @@
 import logging
+import sqlite3
 import requests
-from abc import *
-from threading import Timer
+import xml.etree.ElementTree as et
+from abc import abstractmethod, ABC
+from requests.exceptions import HTTPError
 
 
 class Grabber(ABC):
+    articles = []
     db = None
-    _interval = 60
-    _running = False
-
-    def __init__(self, db, interval):
-        self.db = db
-        self._interval = interval
+    name = ""
 
     @property
     @abstractmethod
@@ -19,29 +17,65 @@ class Grabber(ABC):
         raise NotImplementedError
 
     @abstractmethod
-    def grab(self):
+    def parse(self, feed):
         raise NotImplementedError
 
+    def __init__(self, db):
+        self.db = db
+        self.name = self.__class__.__name__
+        self.setup_tables()
+        self.articles = self.restore()
+
+    def setup_tables(self):
+        try:
+            cur = self.db.cursor()
+            cur.execute(f"CREATE TABLE IF NOT EXISTS {self.name} "
+                    f"(id INTEGER PRIMARY KEY AUTOINCREMENT, timestamp TIMESTAMP, title VARCHAR, description VARCHAR)")
+            self.db.commit()
+        except sqlite3.Error:
+            logging.error(f"Could not create table in database for {self.name}.")
+
+    def store(self, articles):
+        try:
+            cur = self.db.cursor()
+            cur.executemany(f"INSERT INTO {self.name} (timestamp, title, description) "
+                            f"VALUES (?,?,?)", articles)
+            self.db.commit()
+        except sqlite3.Error:
+            logging.error(f"Could not store updated news articles from {self.name}.")
+
+    def restore(self):
+        articles = []
+        try:
+            cur = self.db.cursor()
+            cur.execute(f"SELECT timestamp, title, description from {self.name}")
+            articles = cur.fetchall()
+            logging.info(f"Restored {len(articles)} news articles from database for {self.name}.")
+        except sqlite3.Error:
+            logging.error(f"Could not restore news articles from database for {self.name}.")
+        finally:
+            return articles
+
     def request(self):
         response = requests.get(self.feed_url)
         response.raise_for_status()
         return response.content
 
-    def timer(self):
-        if self._running:
-            self.grab()
-            Timer(self._interval, self.timer).start()
+    def process(self, articles, new_articles):
+        delta_articles = [article for article in new_articles if article not in articles]
+        if delta_articles:
+            logging.info(f"Received {len(delta_articles)} new news articles from {self.name}.")
+        return delta_articles
 
-    def start(self):
-        if not self._running:
-            self._running = True
-            self.timer()
-        else:
-            logging.error(f"Grabber for {self.__class__.__name__} already started.")
-
-    def stop(self):
-        if self._running:
-            self._running = False
-        else:
-            logging.error(f"Grabber for {self.__class__.__name__} already stopped.")
+    def grab(self):
+        try:
+            feed = et.fromstring(self.request())
+            new_articles = self.parse(feed)
+            delta_articles = self.process(self.articles, new_articles)
+            self.store(delta_articles)
+            self.articles = new_articles
+        except HTTPError:
+            logging.error(f"Unable to download updated news articles from {self.name}.")
+        except (et.ParseError, ValueError):
+            logging.error(f"Unable to parse updated news articles from {self.name}.")
 
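
For context, a minimal usage sketch (not from the commit itself), assuming an in-memory SQLite database for a quick test. A source subclass only supplies feed_url, date_format and parse(); the inherited grab() handles download, parsing, de-duplication and storage:

    import sqlite3
    from sources.ArsTechnica import ArsTechnica

    db = sqlite3.connect(":memory:")  # throwaway database instead of storage.db
    grabber = ArsTechnica(db)         # __init__ creates the ArsTechnica table and restores any stored articles
    grabber.grab()                    # fetch the feed, parse it, store only previously unseen articles
    print(len(grabber.articles))      # articles now holds the latest parsed feed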
							
								
								
									
run.py: 40 lines changed
@@ -1,36 +1,38 @@
 import logging
 import sqlite3
+import time
 from pathlib import Path
 from sources.ArsTechnica import ArsTechnica
 from sources.BBCBusiness import BBCBusiness
+from sources.BBCTechnology import BBCTechnology
 
 DATABASE_PATH = Path("storage.db")
-SCRAPE_INTERVAL = 15
+GRAB_FREQUENCY = 15
+GRAB_INTERVAL = 5
 
 
-def configure_logging():
-    logging.basicConfig(
-        level=logging.INFO,
-        format="%(asctime)s %(levelname)-8s %(message)s",
-        datefmt="%Y-%m-%d %H:%M"
-    )
-
-
-def setup_database():
-    db = sqlite3.connect(DATABASE_PATH)
-    return db
-
-
 def main():
-    db = setup_database()
-    grabbers = [
-        ArsTechnica(db, SCRAPE_INTERVAL),
-        BBCBusiness(db, SCRAPE_INTERVAL)
-    ]
-    for grabber in grabbers:
-        grabber.start()
+    try:
+        db = sqlite3.connect(DATABASE_PATH)
+        if not db:
+            raise sqlite3.DatabaseError
+
+        grabbers = [
+            ArsTechnica(db),
+            BBCBusiness(db),
+            BBCTechnology(db),
+        ]
+
+        while True:
+            for grabber in grabbers:
+                grabber.grab()
+                time.sleep(GRAB_FREQUENCY/GRAB_INTERVAL)
+
+    except sqlite3.Error:
+        logging.error("Could not connect to database.")
+        exit(-1)
 
 
 if __name__ == "__main__":
-    configure_logging()
+    logging.basicConfig(level=logging.INFO,format="%(asctime)s %(levelname)-8s %(message)s",datefmt="%Y-%m-%d %H:%M:%S")
     main()
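
For reference: with the constants above, time.sleep(GRAB_FREQUENCY/GRAB_INTERVAL) pauses 15/5 = 3 seconds after each grabber, so one pass over the three sources takes roughly 9 seconds plus request time.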

sources/ArsTechnica.py
@@ -1,10 +1,21 @@
+import logging
+from datetime import datetime
 from grabber import Grabber
 
 
 class ArsTechnica(Grabber):
     feed_url = "http://feeds.arstechnica.com/arstechnica/index"
+    date_format = "%a, %d %b %Y %H:%M:%S %z"
 
-    def grab(self):
-        pass
-
-
+    def parse(self, feed):
+        articles = []
+        for article in feed.iter("item"):
+            try:
+                # Sat, 18 Jan 2020 15:41:56 +0000
+                timestamp = int(datetime.strptime(article.find("pubDate").text, self.date_format).timestamp())
+                title = article.find("title").text
+                description = article.find("description").text
+                articles.append((timestamp, title, description))
+            except AttributeError:
+                logging.error(f"Received non-parsable news article from {self.name}.")
+        return articles

sources/BBCBusiness.py
@@ -1,39 +1,21 @@
-import csv
 import logging
-import xml.etree.ElementTree as et
 from datetime import datetime
 from grabber import Grabber
-from requests.exceptions import HTTPError
 
 
 class BBCBusiness(Grabber):
-    articles = []
     feed_url = "http://feeds.bbci.co.uk/news/business/rss.xml"
+    date_format = "%a, %d %b %Y %H:%M:%S %Z"
 
-    def grab(self):
-        try:
-            feed = et.fromstring(self.request())
-            self.process(feed)
-        except (HTTPError, et.ParseError):
-            logging.error(f"Unable to get updated news from {self.__class__.__name__}.")
-
-    def process(self, feed):
-        for item in feed.iter("item"):
-            article = self.parse(item)
-            if article not in self.articles:
-                self.articles.append(article)
-                logging.info(f"New article from {self.__class__.__name__} at {datetime.fromtimestamp(article[0])}")
-
-    def parse(self, item):
-        article = None
-        try:
-            date = item.find("pubDate").text
-            # Fri, 17 Jan 2020 19:09:40 GMT
-            timestamp = int(datetime.strptime(date, "%a, %d %b %Y %H:%M:%S %Z").timestamp())
-            title = item.find("title").text
-            description = item.find("description").text
-            article = (timestamp, title, description)
-        except AttributeError:
-            logging.error(f"Received non-parsable news article from {self.__class__.__name__}.")
-        finally:
-            return article
+    def parse(self, feed):
+        articles = []
+        for article in feed.iter("item"):
+            try:
+                # Fri, 17 Jan 2020 19:09:40 GMT
+                timestamp = int(datetime.strptime(article.find("pubDate").text, self.date_format).timestamp())
+                title = article.find("title").text
+                description = article.find("description").text
+                articles.append((timestamp, title, description))
+            except AttributeError:
+                logging.error(f"Received non-parsable news article from {self.name}.")
+        return articles
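
Note the differing date_format values: the BBC feeds use %Z because their pubDate ends in a timezone name ("Fri, 17 Jan 2020 19:09:40 GMT"), while ArsTechnica uses %z for a numeric offset ("Sat, 18 Jan 2020 15:41:56 +0000"), matching the sample dates in the inline comments.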
							
								
								
									
sources/BBCTechnology.py (new file): 23 lines added
@@ -0,0 +1,23 @@
+import logging
+from datetime import datetime
+from grabber import Grabber
+
+
+class BBCTechnology(Grabber):
+    feed_url = "http://feeds.bbci.co.uk/news/technology/rss.xml"
+    date_format = "%a, %d %b %Y %H:%M:%S %Z"
+
+    def parse(self, feed):
+        articles = []
+        for article in feed.iter("item"):
+            try:
+                # Fri, 17 Jan 2020 19:09:40 GMT
+                timestamp = int(datetime.strptime(article.find("pubDate").text, self.date_format).timestamp())
+                title = article.find("title").text
+                description = article.find("description").text
+                articles.append((timestamp, title, description))
+            except AttributeError:
+                logging.error(f"Received non-parsable news article from {self.name}.")
+        return articles
+
+