# opensourcegames/code/helpers/is_already_included.py
#
# 49 lines, 1.5 KiB, Python

"""
Checks whether the game names in a list (comma separated, stored in a text file) are already included in the database.
The check is fuzzy, i.e. a name counts as a possible match if it is sufficiently similar to an existing name.
"""
import json
import re
from difflib import SequenceMatcher
from utils.utils import *
def similarity(a, b):
    """
    Returns the similarity ratio of two sequences (here: strings) as a float between 0 and 1.

    Thin wrapper around difflib.SequenceMatcher(None, a, b).ratio(): 1.0 means both sequences are
    identical, 0.0 means they have nothing in common. The comparison is case sensitive, inputs are
    compared exactly as given.

    :param a: first sequence
    :param b: second sequence
    :return: similarity ratio
    """
    return SequenceMatcher(None, a, b).ratio()
def split_names(text):
    """
    Splits comma separated text into a list of clean names.

    Whitespace (including line breaks) around each name is removed and empty entries, e.g. caused
    by a trailing comma or an empty file, are dropped.

    :param text: comma separated names
    :return: list of non-empty, stripped names in their original order
    """
    return [name for name in (part.strip() for part in text.split(',')) if name]


if __name__ == "__main__":

    # imported locally so the script does not rely on the star import to provide os
    import os

    # existing names with a similarity ratio above this value are reported as possible duplicates
    similarity_threshold = 0.7

    # root path is the parent of the directory containing this script
    root_path = os.path.realpath(os.path.join(os.path.dirname(__file__), os.path.pardir))

    # read docs/data.json (read_text is presumably provided by the utils.utils star import)
    data_file = os.path.join(root_path, 'docs', 'data.json')
    text = read_text(data_file)
    data = json.loads(text)

    # extract game names: first element of every entry, with any " (...)" parts removed
    existing_names = [re.sub(r' \([^)]*\)', '', entry[0]) for entry in data['data']]
    # lower-case every existing name once here instead of once per comparison
    existing_names = [(name, name.lower()) for name in existing_names]

    # read names to test
    test_file = os.path.join(root_path, 'is_already_included.txt')
    text = read_text(test_file)
    test_names = split_names(text)

    # loop over all test names
    for test_name in test_names:
        test_name_lower = test_name.lower()
        matches = []

        # loop over all existing names (comparison is case insensitive)
        for existing_name, existing_name_lower in existing_names:
            s = similarity(test_name_lower, existing_name_lower)
            if s > similarity_threshold:
                matches.append('{} ({:.2f})'.format(existing_name, s))

        # were matches found
        if matches:
            print('{} maybe included in {}'.format(test_name, ', '.join(matches)))
        else:
            print('{} not included'.format(test_name))