""" Checks a list of game names (comma separated in text file) if they are already included in the database. Is fuzzy, i.e. accepts a certain similarity of names. """ import json import re from difflib import SequenceMatcher from utils.utils import * def similarity(a, b): return SequenceMatcher(None, a, b).ratio() if __name__ == "__main__": similarity_threshold = 0.7 root_path = os.path.realpath(os.path.join(os.path.dirname(__file__), os.path.pardir)) # read docs/data.json data_file = os.path.join(root_path, 'docs', 'data.json') text = read_text(data_file) data = json.loads(text) # extract game names data = data['data'] data = (x[0] for x in data) existing_names = list(re.sub(r' \([^)]*\)', '', x) for x in data) # read names to test test_file = os.path.join(root_path, 'is_already_included.txt') text = read_text(test_file) test_names = text.split(', ') # loop over all test names for test_name in test_names: matches = [] # loop over all existing names for existing_name in existing_names: s = similarity(test_name.lower(), existing_name.lower()) if s > similarity_threshold: matches.append('{} ({:.2f})'.format(existing_name, s)) # were matches found if matches: print('{} maybe included in {}'.format(test_name, ', '.join(matches))) else: print('{} not included'.format(test_name))