From e0dff08c14b59f92f13141af991520c15c9d32d7 Mon Sep 17 00:00:00 2001
From: Sarah <sarah.bloor-2@student.manchester.ac.uk>
Date: Fri, 8 Apr 2022 22:40:01 +0100
Subject: [PATCH] Create update_keywords_scores.py

---
 update_keywords_scores.py | 75 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)
 create mode 100644 update_keywords_scores.py

diff --git a/update_keywords_scores.py b/update_keywords_scores.py
new file mode 100644
index 0000000..7a78097
--- /dev/null
+++ b/update_keywords_scores.py
@@ -0,0 +1,75 @@
+from collections import OrderedDict
+from progress.bar import IncrementalBar
+import math
+from concurrent.futures import ThreadPoolExecutor
+
+from imdb_utils import IMDbUtils
+from vcinema_utils import VCinemaUtils
+
+# Page ID of https://wiki.jacknet.io/books/vcinema/page/keyword-scores (not read in this module; presumably used by the caller that publishes the table - confirm)
+KEYWORD_SCORES_PAGE_ID = 23
+
+
+def get_keyword_scores(viewings, min_vcinema_count=2, min_imdb_count=4):
+    """Attach an IMDb total and a score to every keyword found in `viewings`.
+
+    Returns the mapping from VCinemaUtils.filter_viewings(viewings, "keywords") (presumably keyword ->
+    viewings; confirm there) with each value wrapped as {"vcinema_films": ...}, plus "total"/"score" where
+    the thresholds are met. The thresholds default to the previously hard-coded values 2 and 4.
+    """
+    keyword_data = VCinemaUtils.filter_viewings(viewings, "keywords")
+    for keyword, films in keyword_data.items():  # `films`: the old loop variable shadowed `viewings`
+        keyword_data[keyword] = {"vcinema_films": films}
+    add_keyword_totals(keyword_data, min_vcinema_count)
+    add_keyword_scores(keyword_data, min_vcinema_count, min_imdb_count)
+    return keyword_data
+
+
+def add_keyword_totals(keywords, min_vcinema_count):
+    """Run add_keyword_total on a 6-thread pool for each keyword with at least min_vcinema_count VCinema films."""
+    eligible = [name for name, entry in keywords.items() if len(entry['vcinema_films']) >= min_vcinema_count]
+    bar_message = '%(percent).1f%% - %(eta)ds remaining'
+    with IncrementalBar(message=bar_message, max=len(eligible), check_tty=False) as bar, ThreadPoolExecutor(6) as pool:
+        for name in eligible:
+            # Futures are not kept; leaving the `with` block waits for every submitted lookup to finish.
+            pool.submit(add_keyword_total, name, keywords, bar)
+
+
+def add_keyword_total(keyword, keywords, progress_bar=None):
+    """Store IMDbUtils.get_keyword_count(keyword) in keywords[keyword]['total'], then step `progress_bar` if given."""
+    keywords[keyword]['total'] = IMDbUtils.get_keyword_count(keyword)
+
+    if progress_bar is None:
+        return
+    progress_bar.next()
+
+
+def add_keyword_scores(keyword_data, min_vcinema_count, min_imdb_count):
+    """Set 'score' = film count / ln(IMDb total) on each entry that has a 'total' and meets both minimums.
+
+    Totals of 1 or less never get a score: ln(1) == 0 and ln(0) is undefined, so they used to raise
+    ZeroDivisionError / ValueError whenever min_imdb_count let them through.
+    """
+    for entry in keyword_data.values():
+        total_count = entry.get('total', 0)  # a missing 'total' becomes 0 and is rejected by the `> 1` guard
+        if total_count > 1 and total_count >= min_imdb_count and len(entry['vcinema_films']) >= min_vcinema_count:
+            entry['score'] = len(entry['vcinema_films']) / math.log(total_count)
+
+
+def build_table(keyword_data, minimum_score=1.0):
+    """Render the keywords scoring at least `minimum_score` as a Markdown table, highest score first.
+
+    Entries without a 'score' are skipped. Each row lists the keyword, len(entry['vcinema_films']),
+    entry['total'] and the score rounded to 3 decimals. Returns the table text without a trailing newline.
+    """
+    scored = [(keyword, entry) for keyword, entry in keyword_data.items()
+              if 'score' in entry and entry['score'] >= minimum_score]
+    scored.sort(key=lambda item: item[1]['score'], reverse=True)
+
+    lines = ["| Keyword | Number of VCinema Films | Total IMDb entries | Score |", "| - | - | - | - |"]
+    for keyword, entry in scored:
+        # 'total' is already a number (add_keyword_scores passes it to math.log); len() on it raised TypeError.
+        cells = (keyword, len(entry['vcinema_films']), entry['total'], round(entry['score'], 3))
+        lines.append(" | ".join(str(cell) for cell in cells))
+
+    return "\n".join(lines)