"""Score keywords of VCinema viewings and render the scores as a Markdown table."""

from collections import OrderedDict
from concurrent.futures import ThreadPoolExecutor
import math

from progress.bar import IncrementalBar

from imdb_utils import IMDbUtils
from vcinema_utils import VCinemaUtils

# Page ID of https://wiki.jacknet.io/books/vcinema/page/keyword-scores
KEYWORD_SCORES_PAGE_ID = 23


def get_keyword_scores(viewings):
    """Build the per-keyword data (films, IMDb total, score) for *viewings*.

    Args:
        viewings: Viewings passed straight to ``VCinemaUtils.filter_viewings``.

    Returns:
        The mapping returned by ``VCinemaUtils.filter_viewings(viewings,
        "keywords")`` with every value replaced by a dict holding
        ``'vcinema_films'`` and, for keywords that pass the thresholds below,
        ``'total'`` and ``'score'``.
    """
    # NOTE(review): assumes filter_viewings returns a mutable mapping of
    # keyword -> collection of viewings -- confirm against VCinemaUtils.
    viewings_filtered_keyword = VCinemaUtils.filter_viewings(viewings, "keywords")

    # Only existing keys are reassigned, so the mapping's size is unchanged
    # while it is being iterated.
    for keyword, keyword_viewings in viewings_filtered_keyword.items():
        viewings_filtered_keyword[keyword] = {"vcinema_films": keyword_viewings}

    min_vcinema_count = 2
    min_imdb_count = 4

    add_keyword_totals(viewings_filtered_keyword, min_vcinema_count)
    add_keyword_scores(viewings_filtered_keyword, min_vcinema_count, min_imdb_count)

    return viewings_filtered_keyword


def add_keyword_totals(keywords, min_vcinema_count):
    """Fetch the ``'total'`` for every keyword with enough VCinema films.

    Lookups are submitted to a pool of 6 worker threads and a progress bar is
    advanced as each one completes. *keywords* is updated in place.

    Args:
        keywords: Mapping of keyword -> dict containing ``'vcinema_films'``.
        min_vcinema_count: Minimum number of VCinema films a keyword needs
            before its total is looked up.
    """
    eligible = [
        keyword
        for keyword, data in keywords.items()
        if len(data['vcinema_films']) >= min_vcinema_count
    ]

    # The executor is entered last so it is shut down (waiting for all
    # submitted lookups) before the progress bar is finished.
    with IncrementalBar(message='%(percent).1f%% - %(eta)ds remaining',
                        max=len(eligible), check_tty=False) as bar, \
            ThreadPoolExecutor(6) as executor:
        for keyword in eligible:
            # NOTE(review): the returned futures are discarded, so an
            # exception raised by a lookup is not reported; that keyword is
            # simply left without a 'total' and is skipped when scoring.
            executor.submit(add_keyword_total, keyword, keywords, bar)


def add_keyword_total(keyword, keywords, progress_bar=None):
    """Store the IMDb count for *keyword* under ``keywords[keyword]['total']``.

    Args:
        keyword: Keyword to look up with ``IMDbUtils.get_keyword_count``.
        keywords: Mapping of keyword -> data dict; updated in place.
        progress_bar: Optional object whose ``next()`` is called once the
            total has been stored.
    """
    keywords[keyword]['total'] = IMDbUtils.get_keyword_count(keyword)

    if progress_bar is not None:
        progress_bar.next()


def add_keyword_scores(keyword_data, min_vcinema_count, min_imdb_count):
    """Add a ``'score'`` to every keyword that meets both thresholds.

    The score is ``vcinema_count / ln(total)``, where ``vcinema_count`` is the
    number of entries in ``'vcinema_films'``. Keywords without a ``'total'``
    are skipped. *keyword_data* is updated in place.

    Args:
        keyword_data: Mapping of keyword -> dict with ``'vcinema_films'`` and
            optionally ``'total'``.
        min_vcinema_count: Minimum number of VCinema films required.
        min_imdb_count: Minimum ``'total'`` required.
    """
    for data in keyword_data.values():
        if 'total' not in data:
            continue

        vcinema_count = len(data['vcinema_films'])
        total_count = data['total']

        if vcinema_count < min_vcinema_count or total_count < min_imdb_count:
            continue

        # ln(1) == 0 would divide by zero and ln(0) is undefined, so totals
        # of 1 or less cannot be scored whatever min_imdb_count is.
        if total_count <= 1:
            continue

        data['score'] = vcinema_count / math.log(total_count)


def build_table(keyword_data, minimum_score=1.0):
    """Render the scored keywords as a Markdown table, highest score first.

    Args:
        keyword_data: Mapping of keyword -> dict with ``'vcinema_films'``,
            ``'total'`` and ``'score'``; entries without a ``'score'`` are
            ignored.
        minimum_score: Keywords scoring below this value are left out.

    Returns:
        The table as a string: a header row, a separator row, then one row
        per keyword with the score rounded to 3 decimal places.
    """
    scored = {
        keyword: data
        for keyword, data in keyword_data.items()
        if 'score' in data and data['score'] >= minimum_score
    }
    ranked = OrderedDict(
        sorted(scored.items(), key=lambda item: item[1]['score'], reverse=True)
    )

    lines = [
        "| Keyword | Number of VCinema Films | Total IMDb entries | Score |\n| - | - | - | - |"
    ]

    for keyword, data in ranked.items():
        row_data = [
            str(keyword),
            str(len(data['vcinema_films'])),
            # 'total' is already a count, not a collection, so it is printed
            # directly rather than passed through len().
            str(data['total']),
            str(round(data['score'], 3)),
        ]
        lines.append(" | ".join(row_data))

    return "\n".join(lines)