# Provenance: commit e0dff08c14b59f92f13141af991520c15c9d32d7 by Sarah,
# Fri, 8 Apr 2022 22:40:01 +0100 - "Create update_keywords_scores.py".
"""Score IMDb keywords by how over-represented they are in VCinema viewings.

For every keyword attached to at least ``min_vcinema_count`` viewed films the
total number of IMDb entries carrying that keyword is looked up, and a score
``vcinema_count / ln(total_count)`` is derived.  ``build_table`` renders the
scored keywords as a Markdown table.
"""
from collections import OrderedDict
from concurrent.futures import ThreadPoolExecutor
import math

from progress.bar import IncrementalBar

from imdb_utils import IMDbUtils
from vcinema_utils import VCinemaUtils

# Page ID of https://wiki.jacknet.io/books/vcinema/page/keyword-scores
KEYWORD_SCORES_PAGE_ID = 23


def get_keyword_scores(viewings, min_vcinema_count=2, min_imdb_count=4):
    """Build the per-keyword data (films, IMDb total, score) for *viewings*.

    Args:
        viewings: Viewings to analyse; passed straight to
            ``VCinemaUtils.filter_viewings(viewings, "keywords")``.
        min_vcinema_count: Minimum number of VCinema films a keyword must
            appear in before its IMDb total is fetched and it is scored.
        min_imdb_count: Minimum IMDb total a keyword needs to be scored.

    Returns:
        The mapping returned by ``VCinemaUtils.filter_viewings`` with each
        value replaced by a dict holding ``'vcinema_films'`` and, where they
        could be computed, ``'total'`` and ``'score'``.
    """
    # NOTE(review): assumes filter_viewings returns a {keyword: [viewings]}
    # mapping - confirm against VCinemaUtils.
    keyword_data = VCinemaUtils.filter_viewings(viewings, "keywords")

    # Only existing keys are reassigned, so mutating while iterating is safe.
    for keyword, films in keyword_data.items():
        keyword_data[keyword] = {"vcinema_films": films}

    add_keyword_totals(keyword_data, min_vcinema_count)
    add_keyword_scores(keyword_data, min_vcinema_count, min_imdb_count)

    return keyword_data


def add_keyword_totals(keywords, min_vcinema_count):
    """Fetch the IMDb total for every sufficiently common keyword.

    Lookups are dispatched to a pool of six worker threads and a progress bar
    is advanced as each one finishes.  The function returns once all
    submitted lookups have completed (the executor's ``with`` block waits).

    Args:
        keywords: Mapping of keyword -> data dict containing
            ``'vcinema_films'``; ``'total'`` is added in place.
        min_vcinema_count: Keywords with fewer VCinema films than this are
            skipped and receive no ``'total'`` entry.
    """
    eligible = [
        keyword
        for keyword, data in keywords.items()
        if len(data['vcinema_films']) >= min_vcinema_count
    ]

    with IncrementalBar(message='%(percent).1f%% - %(eta)ds remaining', max=len(eligible), check_tty=False) as bar:
        with ThreadPoolExecutor(6) as executor:
            for keyword in eligible:
                # NOTE(review): the returned Future is discarded, so an
                # exception raised by a lookup is silently dropped and that
                # keyword simply ends up without a 'total'.  Confirm this
                # best-effort behaviour is intended.
                executor.submit(add_keyword_total, keyword, keywords, bar)


def add_keyword_total(keyword, keywords, progress_bar=None):
    """Look up one keyword's IMDb total and store it under ``'total'``.

    Args:
        keyword: Keyword to look up via ``IMDbUtils.get_keyword_count``.
        keywords: Mapping of keyword -> data dict, updated in place.
        progress_bar: Optional progress bar; ``next()`` is called on it once
            the total has been stored.
    """
    keywords[keyword]['total'] = IMDbUtils.get_keyword_count(keyword)

    if progress_bar is not None:
        progress_bar.next()


def add_keyword_scores(keyword_data, min_vcinema_count, min_imdb_count):
    """Add a ``'score'`` to every keyword that meets both thresholds.

    The score is ``vcinema_count / ln(total_count)``.  Entries without a
    ``'total'`` are left untouched.

    Args:
        keyword_data: Mapping of keyword -> data dict, updated in place.
        min_vcinema_count: Minimum number of VCinema films required.
        min_imdb_count: Minimum IMDb total required.
    """
    for data in keyword_data.values():
        if 'total' not in data:
            continue

        vcinema_count = len(data['vcinema_films'])
        total_count = data['total']

        if vcinema_count < min_vcinema_count or total_count < min_imdb_count:
            continue

        # ln(1) == 0 would divide by zero and ln(x <= 0) is undefined, so a
        # permissive min_imdb_count must not be allowed to reach math.log.
        if total_count <= 1:
            continue

        data['score'] = vcinema_count / math.log(total_count)


def build_table(keyword_data, minimum_score=1.0):
    """Render the scored keywords as a Markdown table, best score first.

    Args:
        keyword_data: Mapping of keyword -> data dict as produced by
            ``get_keyword_scores``.
        minimum_score: Keywords scoring below this (or not scored at all)
            are omitted.

    Returns:
        The table as a single string: header row, separator row, then one
        row per keyword with the score rounded to three decimal places.
    """
    scored = [
        (keyword, data)
        for keyword, data in keyword_data.items()
        if 'score' in data and data['score'] >= minimum_score
    ]
    # A stable sort keeps tied keywords in their original relative order.
    scored.sort(key=lambda item: item[1]['score'], reverse=True)

    lines = ["| Keyword | Number of VCinema Films | Total IMDb entries | Score |\n| - | - | - | - |"]

    for keyword, data in scored:
        row_data = [
            str(keyword),
            str(len(data['vcinema_films'])),
            # 'total' is already a count, not a collection.
            str(data['total']),
            str(round(data['score'], 3)),
        ]
        lines.append(" | ".join(row_data))

    return "\n".join(lines)