Create update_keywords_scores.py
This commit is contained in:
		
							parent
							
								
									4207e89665
								
							
						
					
					
						commit
						e0dff08c14
					
				
							
								
								
									
										75
									
								
								update_keywords_scores.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										75
									
								
								update_keywords_scores.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,75 @@ | ||||
| from collections import OrderedDict | ||||
| from progress.bar import IncrementalBar | ||||
| import math | ||||
| from concurrent.futures import ThreadPoolExecutor | ||||
| 
 | ||||
| from imdb_utils import IMDbUtils | ||||
| from vcinema_utils import VCinemaUtils | ||||
| 
 | ||||
| # Page ID of https://wiki.jacknet.io/books/vcinema/page/keyword-scores | ||||
| KEYWORD_SCORES_PAGE_ID = 23 | ||||
| 
 | ||||
| 
 | ||||
def get_keyword_scores(viewings, min_vcinema_count=2, min_imdb_count=4):
    """Build per-keyword statistics and scores for the given viewings.

    The viewings are grouped by keyword via ``VCinemaUtils.filter_viewings``,
    then each keyword entry is enriched in place by ``add_keyword_totals``
    and ``add_keyword_scores``.

    Args:
        viewings: Viewings to analyse, in whatever form
            ``VCinemaUtils.filter_viewings`` accepts.
        min_vcinema_count: Minimum number of VCinema films a keyword must
            have before its IMDb total is looked up and a score computed.
        min_imdb_count: Minimum IMDb total a keyword must have before a
            score is computed.

    Returns:
        The mapping returned by ``filter_viewings``, with each value replaced
        by a dict holding ``'vcinema_films'`` and, for keywords that meet the
        thresholds, ``'total'`` and ``'score'``.
    """
    keyword_data = VCinemaUtils.filter_viewings(viewings, "keywords")

    # Wrap each keyword's viewings in a dict so totals and scores can be
    # attached alongside them. Only values are replaced, so updating the
    # mapping while iterating over it is safe.
    for keyword, keyword_viewings in keyword_data.items():
        keyword_data[keyword] = {"vcinema_films": keyword_viewings}

    add_keyword_totals(keyword_data, min_vcinema_count)
    add_keyword_scores(keyword_data, min_vcinema_count, min_imdb_count)

    return keyword_data
| 
 | ||||
| 
 | ||||
def add_keyword_totals(keywords, min_vcinema_count):
    """Look up totals for every sufficiently common keyword, in parallel.

    One ``add_keyword_total`` task is submitted to a six-worker thread pool
    for each keyword with at least ``min_vcinema_count`` VCinema films. Each
    task stores its result under ``keywords[keyword]['total']`` and advances
    a shared progress bar. Leaving the executor's ``with`` block waits for
    all submitted tasks to finish.

    Args:
        keywords: Mapping of keyword to a dict containing 'vcinema_films'.
            Mutated in place by the worker tasks.
        min_vcinema_count: Keywords with fewer films than this are skipped.
    """
    eligible = [
        name
        for name, entry in keywords.items()
        if len(entry['vcinema_films']) >= min_vcinema_count
    ]
    bar_message = '%(percent).1f%% - %(eta)ds remaining'

    with IncrementalBar(message=bar_message, max=len(eligible), check_tty=False) as bar:
        with ThreadPoolExecutor(6) as executor:
            for name in eligible:
                # The returned future is discarded; results are written
                # straight into ``keywords`` by the task itself.
                executor.submit(add_keyword_total, name, keywords, bar)
| 
 | ||||
| 
 | ||||
def add_keyword_total(keyword, keywords, progress_bar=None):
    """Fetch the count for one keyword and store it on its entry.

    Args:
        keyword: Keyword to look up via ``IMDbUtils.get_keyword_count``.
        keywords: Mapping of keyword to a dict; the result is written to
            ``keywords[keyword]['total']``.
        progress_bar: Optional object with a ``next()`` method, advanced
            once after the total has been stored.
    """
    keywords[keyword]['total'] = IMDbUtils.get_keyword_count(keyword)

    if progress_bar is None:
        return

    progress_bar.next()
| 
 | ||||
| 
 | ||||
def add_keyword_scores(keyword_data, min_vcinema_count, min_imdb_count):
    """Attach a 'score' to every keyword entry that meets both thresholds.

    The score is ``vcinema_count / ln(total)``, where ``vcinema_count`` is
    the number of items in the entry's 'vcinema_films' and ``total`` is its
    'total' value. Entries without a 'total' are left untouched.

    Args:
        keyword_data: Mapping of keyword to a dict containing 'vcinema_films'
            and optionally 'total'. Mutated in place.
        min_vcinema_count: Minimum number of VCinema films required.
        min_imdb_count: Minimum 'total' required.
    """
    for data in keyword_data.values():
        if 'total' not in data:
            continue

        vcinema_count = len(data['vcinema_films'])
        total_count = data['total']

        if vcinema_count < min_vcinema_count or total_count < min_imdb_count:
            continue

        # ln(1) is 0 and ln(x) is undefined for x <= 0, so such totals cannot
        # be scored; skip them instead of raising when a caller passes a
        # min_imdb_count of 1 or less.
        if total_count <= 1:
            continue

        data['score'] = vcinema_count / math.log(total_count)
| 
 | ||||
| 
 | ||||
def build_table(keyword_data, minimum_score=1.0):
    """Render scored keywords as a pipe-delimited table, best score first.

    Only entries that have a 'score' of at least ``minimum_score`` are
    included. Entries with equal scores keep their original relative order.

    Args:
        keyword_data: Mapping of keyword to a dict containing
            'vcinema_films', 'total' and 'score'.
        minimum_score: Lowest score that is still included in the table.

    Returns:
        A string with a header row, a separator row, and one row per
        keyword giving the keyword, its number of VCinema films, its total
        and its score rounded to three decimal places.
    """
    scored = [
        (keyword, data)
        for keyword, data in keyword_data.items()
        if 'score' in data and data['score'] >= minimum_score
    ]
    scored.sort(key=lambda item: item[1]['score'], reverse=True)

    lines = [
        "| Keyword | Number of VCinema Films | Total IMDb entries | Score |",
        "| - | - | - | - |",
    ]

    for keyword, data in scored:
        row_data = [
            str(keyword),
            str(len(data['vcinema_films'])),
            # 'total' is already a count, not a collection, so it is
            # rendered directly rather than passed through len().
            str(data['total']),
            str(round(data['score'], 3)),
        ]
        lines.append(" | ".join(row_data))

    return "\n".join(lines)
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 Sarah
						Sarah