import re

import requests
from bs4 import BeautifulSoup
from imdb import IMDb

# NOTE: get_movie() and get_movie_keywords() are also part of this module;
# only fragments of them are visible in this excerpt, so they are not
# reproduced here.

# Base URL of the IMDb keyword search page; the keyword is sent as the
# ``keywords`` query parameter.
_KEYWORD_SEARCH_URL = "https://www.imdb.com/search/keyword/"

# Captures the total in a label such as "1-50 of 1,234 titles"; any number of
# thousands separators is accepted (e.g. "1,234,567 titles").
_TITLE_COUNT_RE = re.compile(r"(\d[\d,]*) titles")

# NOTE(review): presumably the maximum number of results a single
# IMDb().get_keyword() call returns, so hitting it means the real total may
# be larger -- confirm against the installed IMDbPY version.
_API_RESULT_CAP = 50


def get_api_keyword_count(keyword):
    """Return how many results the IMDb API yields for *keyword*.

    Args:
        keyword: The keyword to look up.

    Returns:
        The length of the result list returned by ``IMDb().get_keyword``.
    """
    ia = IMDb()
    return len(ia.get_keyword(keyword))


def _parse_title_count(label):
    """Extract the total title count from a pagination label, defaulting to 1."""
    match = _TITLE_COUNT_RE.search(label)
    if match is None:
        # No "<n> titles" text in the label: keep the historical fallback.
        return 1
    return int(match.group(1).replace(",", ""))


def get_website_keyword_count(keyword, timeout=10):
    """Return the title count shown on the IMDb keyword search page.

    Args:
        keyword: The keyword to search for. It is URL-encoded before being
            sent, so it may contain spaces or reserved characters.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The number parsed from the "<n> titles" text of the first
        ``div.desc`` element, or 1 when that text is not present.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with an HTTP error status.
        IndexError: If the page contains no ``div.desc`` element.
    """
    # Passing the keyword through ``params`` lets requests escape it instead
    # of splicing raw text into the URL.
    page = requests.get(
        _KEYWORD_SEARCH_URL, params={"keywords": keyword}, timeout=timeout
    )
    # Fail loudly on 4xx/5xx rather than trying to parse an error page.
    page.raise_for_status()

    soup = BeautifulSoup(page.content, "html.parser")
    description = soup.find("div", class_="desc")
    if description is None:
        # Same exception type the former ``elements[0]`` lookup produced,
        # but with a message that explains what went wrong.
        raise IndexError(
            "no <div class=\"desc\"> element found for keyword %r" % (keyword,)
        )

    pagination_label = description.text.replace("\n", "")
    return _parse_title_count(pagination_label)


def get_keyword_count(keyword):
    """Return the number of titles tagged with *keyword*.

    The API count is used as-is unless it equals the API result cap, in which
    case the count is re-read from the IMDb website instead.

    Args:
        keyword: The keyword to count titles for.

    Returns:
        The title count as an integer.

    Raises:
        Any exception raised by ``get_api_keyword_count`` or
        ``get_website_keyword_count`` propagates unchanged.
    """
    count = get_api_keyword_count(keyword)

    if count == _API_RESULT_CAP:
        count = get_website_keyword_count(keyword)

    return count