opensourcegames/code/utils/osg.py

"""
Specific functions working on the games.
"""

import re
import os
from difflib import SequenceMatcher
from utils import utils, osg_parse, constants as c

# matches every run of characters that are not ASCII letters, spaces, digits, '-' or '+'
# (canonical_name removes these runs)
regex_sanitize_name = re.compile(r"[^A-Za-z 0-9-+]+")
# matches every run of one or more spaces (canonical_name replaces each run by a single underscore)
regex_sanitize_name_space_eater = re.compile(r" +")


def name_similarity(a, b):
    """
    Case-insensitive similarity of two names.

    :param a: first name
    :param b: second name
    :return: similarity ratio of the case folded names as computed by difflib.SequenceMatcher
    """
    folded_a, folded_b = map(str.casefold, (a, b))
    return SequenceMatcher(a=folded_a, b=folded_b).ratio()


def entry_iterator():
    """
    Generator over everything located directly in the entries folder (c.entries_path) that is not a directory.

    :return: yields tuples (name, full path, content as read by utils.read_text)
    """
    for name in os.listdir(c.entries_path):
        full_path = os.path.join(c.entries_path, name)

        # directories ("tocs" for example) are skipped, everything else is read and yielded
        if not os.path.isdir(full_path):
            yield name, full_path, utils.read_text(full_path)


def canonical_name(name):
    """
    Derives a canonical name from an actual name (suitable for file names, anchor names, ...)

    :param name: the actual name
    :return: the case folded name with 'ö', 'ä', 'ü' replaced by 'o', 'a', 'u', other unwanted characters removed,
             runs of spaces replaced by an underscore and dashes condensed
    """
    canonical = name.casefold()

    # replace the lower case umlauts by their base letters
    for umlaut, plain in (('ö', 'o'), ('ä', 'a'), ('ü', 'u')):
        canonical = canonical.replace(umlaut, plain)

    # remove everything unwanted, then turn each run of spaces into a single underscore
    canonical = regex_sanitize_name.sub('', canonical)
    canonical = regex_sanitize_name_space_eater.sub('_', canonical)

    # a dash surrounded by underscores becomes a plain dash, double dashes are condensed (two passes)
    canonical = canonical.replace('_-_', '-')
    for _ in range(2):
        canonical = canonical.replace('--', '-')

    return canonical


def read_developers():
    """
    Reads the developer listing (c.developer_file) with the grammar in grammar_listing.lark and validates it.

    Duplicate developer names only produce a printed warning; in the returned dictionary the later developer
    with that name replaces the earlier one.

    :return: dictionary mapping the developer name to the developer (a dictionary of its fields)
    :raises RuntimeError: if an essential field is missing, a field is not a valid developer field or an
                          item of a URL field does not start with http:// or https://
    """
    grammar_file = os.path.join(c.code_path, 'grammar_listing.lark')
    developers = osg_parse.read_and_parse(c.developer_file, grammar_file, osg_parse.ListingTransformer)

    # now developers is a list of dictionaries for every entry with some properties

    # check for duplicate names entries (single pass instead of counting every name over and over)
    seen_names = set()
    duplicate_names = set()
    for dev in developers:
        name = dev['Name']
        if name in seen_names:
            duplicate_names.add(name)
        seen_names.add(name)
    if duplicate_names:
        # sorted, so that the warning reads the same on every run
        print('Warning: duplicate developer names: {}'.format(', '.join(sorted(duplicate_names))))

    # check for essential, valid fields
    for dev in developers:
        # check that essential fields are existing
        for field in c.essential_developer_fields:
            if field not in dev:
                raise RuntimeError('Essential field "{}" missing in developer {}'.format(field, dev['Name']))
        # check that all fields are valid fields
        for field in dev:
            if field not in c.valid_developer_fields:
                raise RuntimeError('Invalid field "{}" in developer {}.'.format(field, dev['Name']))
        # url fields: every item must be a http(s) URL
        for field in c.url_developer_fields:
            if field in dev:
                for url in dev[field]:
                    if not (url.startswith('http://') or url.startswith('https://')):
                        raise RuntimeError('Invalid URL in field "{}" in developer {}.'.format(field, dev['Name']))

    # convert to dictionary
    return {dev['Name']: dev for dev in developers}


def write_developers(developers):
    """
    Writes the developers into the developer file (c.developer_file).

    Developers are ordered by name (case-insensitive). For every developer the 'Games' field comes first, the
    remaining fields follow alphabetically. List values are sorted in place (case-insensitive) and items that
    contain a comma are surrounded by quotation marks.

    :param developers: dictionary of developers, each one a dictionary of fields including 'Name' and 'Games'
    :return:
    """
    # convert dictionary to list
    developer_list = list(developers.values())

    # generic comment followed by the number of developers
    parts = ['{}\n'.format(c.generic_comment_string), '# Developer [{}]\n\n'.format(len(developer_list))]

    for dev in sorted(developer_list, key=lambda d: str.casefold(d['Name'])):
        # developer name together with the number of games
        parts.append('## {} [{}]\n\n'.format(dev['Name'], len(dev['Games'])))

        # 'Games' first, then all the remaining fields (the name was already written) in alphabetical order
        remaining = sorted(key for key in dev if key not in ('Name', 'Games'))
        for field in ['Games'] + remaining:
            value = dev[field]
            # lists get special treatment
            if isinstance(value, list):
                value.sort(key=str.casefold)
                # surround those with a comma with quotation marks
                value = ', '.join('"{}"'.format(x) if ',' in x else x for x in value)
            parts.append('- {}: {}\n'.format(field, value))
        parts.append('\n')

    # write
    utils.write_text(c.developer_file, ''.join(parts))


def read_inspirations():
    """
    Reads the info list about the games originals/inspirations from the inspirations file (c.inspirations_file)
    using the Lark parser grammar in grammar_listing.lark and validates it.

    :return: dictionary mapping the inspiration name to the inspiration (a dictionary of its fields)
    :raises RuntimeError: if inspiration names are duplicated, an essential field is missing, a field is not a
                          valid inspiration field or an item of a URL field does not start with http:// or https://
    """
    # read and parse inspirations
    grammar_file = os.path.join(c.code_path, 'grammar_listing.lark')
    inspirations = osg_parse.read_and_parse(c.inspirations_file, grammar_file, osg_parse.ListingTransformer)

    # now inspirations is a list of dictionaries for every entry with some properties

    # check for duplicate names entries (single pass instead of counting every name over and over)
    seen_names = set()
    duplicate_names = set()
    for inspiration in inspirations:
        name = inspiration['Name']
        if name in seen_names:
            duplicate_names.add(name)
        seen_names.add(name)
    if duplicate_names:
        # sorted, so that the error message reads the same on every run
        raise RuntimeError('Duplicate inspiration names: {}'.format(', '.join(sorted(duplicate_names))))

    # check for essential, valid fields
    for inspiration in inspirations:
        # check that essential fields are existing
        for field in c.essential_inspiration_fields:
            if field not in inspiration:
                raise RuntimeError('Essential field "{}" missing in inspiration {}'.format(field, inspiration['Name']))
        # check that all fields are valid fields
        for field in inspiration:
            if field not in c.valid_inspiration_fields:
                raise RuntimeError('Invalid field "{}" in inspiration {}.'.format(field, inspiration['Name']))
        # url fields: every item must be a http(s) URL
        for field in c.url_inspiration_fields:
            if field in inspiration:
                for url in inspiration[field]:
                    if not (url.startswith('http://') or url.startswith('https://')):
                        raise RuntimeError('Invalid URL in field "{}" in inspiration {}.'.format(field, inspiration['Name']))

    # convert to dictionary
    return {inspiration['Name']: inspiration for inspiration in inspirations}


def write_inspirations(inspirations):
    """
    Given an internal dictionary of inspirations, write it into the inspirations file (c.inspirations_file).

    Inspirations are ordered by name (case-insensitive). For every inspiration the 'Inspired entries' field comes
    first, the remaining fields follow alphabetically. List values are sorted in place (case-insensitive) and
    items that contain a comma are surrounded by quotation marks.

    :param inspirations: dictionary of inspirations, each one a dictionary of fields including 'Name' and
                         'Inspired entries'
    :return:
    """
    # convert dictionary to list
    inspiration_list = list(inspirations.values())

    # generic comment followed by the updated number of inspirations
    parts = ['{}\n'.format(c.generic_comment_string), '# Inspirations [{}]\n\n'.format(len(inspiration_list))]

    for inspiration in sorted(inspiration_list, key=lambda i: str.casefold(i['Name'])):
        # inspiration name together with the number of inspired entries
        parts.append('## {} [{}]\n\n'.format(inspiration['Name'], len(inspiration['Inspired entries'])))

        # "inspired entries" first, then all the remaining fields (the name was already written) alphabetically
        remaining = sorted(key for key in inspiration if key not in ('Name', 'Inspired entries'))
        for field in ['Inspired entries'] + remaining:
            value = inspiration[field]
            # lists get special treatment
            if isinstance(value, list):
                value.sort(key=str.casefold)  # sorted alphabetically
                # surround those with a comma with quotation marks
                value = ', '.join('"{}"'.format(x) if ',' in x else x for x in value)
            parts.append('- {}: {}\n'.format(field, value))
        parts.append('\n')

    # write
    utils.write_text(c.inspirations_file, ''.join(parts))


def read_entries():
    """
    Parses and checks all entries delivered by entry_iterator.

    Problems of individual entries are printed and reading continues with the next entry, so that all
    problems are reported before the function fails.

    :return: list of all entries (as returned by check_and_process_entry)
    :raises RuntimeError: if at least one entry could not be parsed or did not pass the checks
    """
    # setup parser and transformer
    grammar = utils.read_text(os.path.join(c.code_path, 'grammar_entries.lark'))
    parse = osg_parse.create(grammar, osg_parse.EntryTransformer)

    # a database of all important infos about the entries
    entries = []
    failed = False

    for file, _, content in entry_iterator():
        # the text handed to the parser always ends with a newline
        text = content if content.endswith('\n') else content + '\n'

        # parse and transform entry content
        try:
            fields = [('File', file)] + parse(text)  # file information goes to the beginning
            entry = check_and_process_entry(fields)
        except Exception as e:
            print('{} - {}'.format(file, e))
            failed = True
        else:
            entries.append(entry)

    if failed:
        raise RuntimeError('errors while reading entries')

    return entries


def read_entry(file):
    """
    Reads, parses and checks a single entry
    :param file: the entry file (without path)
    :return: the entry (as returned by check_and_process_entry)
    :raises RuntimeError: if the entry could not be parsed or did not pass the checks (the problem is printed too)
    """
    # setup parser and transformer
    grammar = utils.read_text(os.path.join(c.code_path, 'grammar_entries.lark'))
    parse = osg_parse.create(grammar, osg_parse.EntryTransformer)

    # read entry file, the text handed to the parser always ends with a newline
    text = utils.read_text(os.path.join(c.entries_path, file))
    if not text.endswith('\n'):
        text = text + '\n'

    # parse and transform entry content
    try:
        fields = [('File', file)] + parse(text)  # file information goes to the beginning
        return check_and_process_entry(fields)
    except Exception as e:
        print('{} - {}'.format(file, e))
        raise RuntimeError(e)


def _pairs_to_dict(pairs, problems):
    """
    Converts (field, value) pairs to a dictionary; fields occurring more than once are recorded in problems.
    """
    result = {}
    for field, value in pairs:
        if field in result:
            problems.append('Field "{}" appears twice'.format(field))
        result[field] = value
    return result


def check_and_process_entry(entry):
    """
    Checks a parsed entry and converts it to a dictionary.

    Checked are: validity and order of the fields, duplicate fields, essential fields, the building fields,
    the file name, the state, URL prefixes, the form of code repository URLs of known git services, validity
    and order of the platform tags, the keywords, the code languages and the code licenses.

    All problems are collected and reported together, one per line. Only if essential fields are missing the
    problems found up to then are reported immediately, because the later checks access fields directly.

    Side effect: URL values surrounded by '<' and '>' get these two characters removed.

    :param entry: list of (field, value) pairs; the value of 'Building' is again a list of (field, value) pairs
    :return: the entry as dictionary (field -> value), with 'Building' converted to a dictionary too
    :raises RuntimeError: if at least one problem was found (the message lists all of them)
    """
    problems = []

    # check that all fields are valid fields and are existing in that order
    index = 0
    for field, _ in entry:
        while index < len(c.valid_fields) and field != c.valid_fields[index]:
            index += 1
        if index == len(c.valid_fields):  # must be valid fields and must be in the right order
            problems.append('Field "{}" either misspelled or in wrong order'.format(field))

    # order is fine we can convert to dictionary
    entry = _pairs_to_dict(entry, problems)

    # check for essential fields
    missing = [field for field in c.essential_fields if field not in entry]
    problems.extend('Essential property "{}" missing'.format(field) for field in missing)
    if missing:
        # the checks below index several fields directly (presumably the essential ones); stop here with a
        # readable message instead of running into a bare KeyError that would hide everything collected so far
        raise RuntimeError(''.join(problem + '\n' for problem in problems))

    # now the same treatment for building
    building = _pairs_to_dict(entry['Building'], problems)

    # check valid fields in building TODO should also check order
    for field in building:
        if field not in c.valid_building_fields:
            problems.append('Building field "{}" invalid'.format(field))
    entry['Building'] = building

    # check canonical file name
    file = entry['File']
    canonical_file_name = canonical_name(entry['Title']) + '.md'
    # we also allow -X with X =2..9 as possible extension (because of duplicate canonical file names)
    # NOTE: this only strips the two characters in front of '.md', it does not verify that they are '-X'
    if canonical_file_name != file and canonical_file_name != file[:-5] + '.md':
        problems.append('file name should be {}'.format(canonical_file_name))

    # state must contain either beta or mature but not both
    state = entry['State']
    for t in state:
        if t != 'beta' and t != 'mature' and not t.startswith('inactive since '):
            problems.append('Unknown state "{}"'.format(t))
    # the parentheses are essential: without them Python chains the comparisons
    # ('beta' in state and state == 'mature' and 'mature' in state), which is never true for a list
    if ('beta' in state) == ('mature' in state):
        problems.append('State must be one of <"beta", "mature">')

    # check urls
    for field in c.url_fields:
        for value in entry.get(field, []):
            # URLs may be given as <URL>, the angle brackets are removed
            if value.value.startswith('<') and value.value.endswith('>'):
                value.value = value.value[1:-1]
            if not any(value.startswith(x) for x in c.extended_valid_url_prefixes):
                problems.append('URL "{}" in field "{}" does not start with a valid prefix'.format(value, field))

    # github/gitlab repositories should end on .git and should start with https
    for repo in entry['Code repository']:
        if any(repo.startswith(x) for x in ('@', '?')):
            continue
        # only the part up to the first space is regarded as the URL
        url = repo.value.split(' ')[0].strip()
        if any(x in url for x in ('github', 'gitlab', 'git.tuxfamily', 'git.savannah')):
            if not url.startswith('https://'):
                problems.append('Repo "{}" should start with https://'.format(url))
            if not url.endswith('.git'):
                problems.append('Repo "{}" should end on .git.'.format(url))

    # check that all platform tags are valid tags and are existing in that order
    if 'Platform' in entry:
        index = 0
        for platform in entry['Platform']:
            while index < len(c.valid_platforms) and platform != c.valid_platforms[index]:
                index += 1
            if index == len(c.valid_platforms):  # must be valid platforms and must be in that order
                problems.append('Platform tag "{}" either misspelled or in wrong order'.format(platform))

    # there must be at least one keyword
    keywords = entry['Keyword']
    if not keywords:
        problems.append('Need at least one keyword')

    # check for existence of at least one recommended keywords
    if not any(keyword in keywords for keyword in c.recommended_keywords):
        problems.append('Entry contains no recommended keywords')

    # languages should be known
    for language in entry['Code language']:
        if language not in c.known_languages:
            problems.append('Language "{}" is not a known code language. Misspelled or new?'.format(language))

    # licenses should be known
    for code_license in entry['Code license']:
        if code_license not in c.known_licenses:
            problems.append('License "{}" is not a known license. Misspelled or new?'.format(code_license))

    if problems:
        # every problem on its own line
        raise RuntimeError(''.join(problem + '\n' for problem in problems))

    return entry


def is_inactive(entry):
    """
    Tells if an entry is inactive, i.e. if one of its states starts with 'inactive since '.

    :param entry: the entry (needs the field 'State')
    :return: True if inactive, False otherwise
    """
    for state in entry['State']:
        if state.startswith('inactive since '):
            return True
    return False


def extract_inactive_year(entry):
    """
    Extracts the year from the state 'inactive since YEAR' of an entry.

    :param entry: the entry (needs the field 'State', at most one state may start with 'inactive since ')
    :return: the year as integer or None if there is no such state
    """
    prefix = 'inactive since '
    years = []
    for state in entry['State']:
        if state.startswith(prefix):
            # the text following the prefix
            years.append(state.value[len(prefix):])
    assert len(years) <= 1
    return int(years[0]) if years else None


def write_entries(entries):
    """
    Writes all given entries, one after the other, with write_entry.

    :param entries: iterable of entries
    :return:
    """
    for single_entry in entries:
        write_entry(single_entry)


def write_entry(entry):
    """
    Writes a single entry to its file in the entries folder (c.entries_path).

    :param entry: the entry; its field 'File' is the file name, the text comes from create_entry_content
    :return:
    """
    # TODO check entry

    # the path is determined before the content is created
    target = os.path.join(c.entries_path, entry['File'])

    # create output content and write it
    utils.write_text(target, create_entry_content(entry))


def create_entry_content(entry):
    """
    Creates the text of an entry file from an entry.

    The text consists of the title, the properties (in the order of c.valid_properties), an optional note and
    the building section with its properties (in the order of c.valid_building_properties) and optional note.

    Side effect: the fields 'Media', 'Inspiration', 'Code language' (if present) and 'Keyword' of the given
    entry are replaced by sorted lists.

    :param entry: the entry (a dictionary), needs at least the fields 'Title', 'Keyword' and 'Building'
    :return: the text of the entry
    """

    # title
    content = '# {}\n\n'.format(entry['Title'])

    # we automatically sort some fields (case-insensitive by their value)
    def sort_fun(x):
        return str.casefold(x.value)

    # the field is called 'Code language' (lower case l); with another spelling it would never be sorted
    for field in ('Media', 'Inspiration', 'Code language'):
        if field in entry:
            entry[field] = sorted(entry[field], key=sort_fun)

    # we also sort keywords, but first the recommend ones and then other ones
    keywords = entry['Keyword']
    recommended = [x for x in keywords if x in c.recommended_keywords]
    others = [x for x in keywords if x not in c.recommended_keywords]
    entry['Keyword'] = sorted(recommended, key=sort_fun) + sorted(others, key=sort_fun)

    # now all properties in the recommended order
    for field in c.valid_properties:
        if field in entry:
            # values containing a comma or ' (' are surrounded by quotation marks
            values = ['"{}"'.format(x) if any(y in x.value for y in (',', ' (')) else str(x) for x in entry[field]]
            content += '- {}: {}\n'.format(field, ', '.join(values))
    content += '\n'

    # if there is a note, insert it
    if 'Note' in entry:
        content += entry['Note']

    # building header
    content += '## Building\n'

    # building properties if present (separated from the header by an empty line)
    building = entry['Building']
    has_properties = False
    for field in c.valid_building_properties:
        if field in building:
            if not has_properties:
                has_properties = True
                content += '\n'
            # values containing a comma are surrounded by quotation marks
            values = ['"{}"'.format(x) if ',' in x else str(x) for x in building[field]]
            content += '- {}: {}\n'.format(field, ', '.join(values))

    # if there is a note, insert it
    if 'Note' in building:
        content += '\n'
        content += building['Note']

    return content


def is_url(str):
    """
    Tells if a text starts with one of the valid URL prefixes (c.valid_url_prefixes) and contains no space.

    Could be too generous. See https://stackoverflow.com/questions/7160737/how-to-validate-a-url-in-python-malformed-or-not for other possibilities.
    :param str: the text to test
    :return: True if the text is regarded as URL, False otherwise
    """
    has_valid_prefix = any(str.startswith(prefix) for prefix in c.valid_url_prefixes)
    return has_valid_prefix and ' ' not in str


def all_urls(entries):
    """
    Gets all urls of all entries in a dictionary (key=url value=list of entries (file name) with this url)

    Only the fields in c.url_fields are searched. The value and (if present) the comment of every item are
    split at spaces and each part that is_url accepts counts as url.

    :param entries: the entries
    :return: dictionary url -> list of file names
    """
    urls = {}
    for entry in entries:
        file = entry['File']
        for field in c.url_fields:  # TODO there are other fields, maybe just regex on the whole content
            for value in entry.get(field, []):
                # the comment of a value may contain urls too
                text = value.value + ' ' + value.comment if value.comment else value.value
                for candidate in text.split(' '):
                    candidate = candidate.strip()
                    if is_url(candidate):
                        urls.setdefault(candidate, []).append(file)
    return urls


def git_repo(repo):
    """
    Tests if a repo URL is a git repo, then returns the repo url.

    :param repo: the repo url
    :return: the unchanged repo url if it is recognized as git url, None otherwise
    """
    # typical url beginnings of git services
    git_services = ('https://git.tuxfamily.org/', 'http://git.pond.sub.org/', 'https://gitorious.org/',
                    'https://git.code.sf.net/p/')

    recognized = (
        # everything that starts with 'git://'
        repo.startswith('git://')
        # generic (https://*.git) or (http://*.git) ending on git
        or ((repo.startswith('https://') or repo.startswith('http://')) and repo.endswith('.git'))
        # for all others we just check if they start with the typical urls of git services
        or any(repo.startswith(service) for service in git_services)
    )

    # the rest is not recognized as a git url
    return repo if recognized else None


def svn_repo(repo):
    """
    Tests if a repo URL is a svn repo, then returns the repo url.

    :param repo: the repo url
    :return: the unchanged repo url if it starts like the url of a known svn provider, None otherwise
    """
    # we can just go for known providers of svn
    known_prefixes = (
        'svn://',
        'https://svn.code.sf.net/p/',
        'http://svn.savannah.gnu.org/svn/',
        'https://svn.icculus.org/',
        'http://svn.icculus.org/',
        'http://svn.uktrainsim.com/svn/',
        'https://rpg.hamsterrepublic.com/source/wip',
    )
    for prefix in known_prefixes:
        if repo.startswith(prefix):
            return repo

    # not svn
    return None


def hg_repo(repo):
    """
    Tests if a repo URL is a hg repo, then returns the repo url.

    :param repo: the repo url
    :return: the unchanged repo url if it is a Bitbucket url not ending on '.git' or if it starts with
             'http://hg.', None otherwise
    """
    on_bitbucket = repo.startswith('https://bitbucket.org/') and not repo.endswith('.git')
    if on_bitbucket or repo.startswith('http://hg.'):
        return repo

    # not hg
    return None