"""
Imports game details from libregamewiki by scraping the website, starting from https://libregamewiki.org/Category:Games

Also parse rejected games (https://libregamewiki.org/Libregamewiki:Rejected_games_list) and maybe https://libregamewiki.org/Libregamewiki:Suggested_games

Unique left column names in the game info boxes:
['Code license', 'Code licenses', 'Developer', 'Developers', 'Engine', 'Engines', 'Genre', 'Genres', 'Libraries', 'Library', 'Media license', 'Media licenses', 'P. language', 'P. languages', 'Platforms']

TODO there are games on LGW which are not part of the Games category but part of XXX-Games sub-categories, find them
"""
|
||
|
||
import os
|
||
import requests
|
||
import json
|
||
import re
|
||
from bs4 import BeautifulSoup
|
||
from utils import constants, utils, osg
|
||
|
||
|
||
def download_lgw_content():
    """
    Downloads the HTML page of every game listed in the LGW "Games" category.

    Walks https://libregamewiki.org/Category:Games following the "next page"
    links, collects all game page links, filters out non-game pages
    (User:/Template:/Bullet entries) and stores each game page as an HTML
    file in the lgw-import directory (which is recreated from scratch).
    """

    # parameters
    base_url = 'https://libregamewiki.org'
    destination_path = os.path.join(constants.root_path, 'code', '../lgw-import')
    utils.recreate_directory(destination_path)

    # read and process the base url (get all games page by page)
    url = base_url + '/Category:Games'
    games = []
    while True:
        # timeout so a single stalled request cannot hang the whole import forever
        text = requests.get(url, timeout=30).text
        soup = BeautifulSoup(text, 'html.parser')

        # game page links of the current listing page
        pages = soup.find('div', id='mw-pages').find_all('li')
        games.extend(((x.a['href'], x.a.string) for x in pages))

        # follow the "next page" link until there is none
        next_page = soup.find('a', string='next page')
        if not next_page:
            break
        url = base_url + next_page['href']

    # remove all those that are not actual game pages
    games = [game for game in games if not any(game[1].startswith(x) for x in ('User:', 'Template:', 'Bullet'))]

    print('current number of games in LGW {}'.format(len(games)))

    # download every game page to its own file
    for game in games:
        print(game[1])
        url = base_url + game[0]
        # game[0] starts with '/', strip it before deriving the file name
        destination_file = os.path.join(destination_path, osg.canonical_name(game[0][1:]) + '.html')

        text = requests.get(url, timeout=30).text
        utils.write_text(destination_file, text)
|
||
|
||
|
||
def parse_lgw_content():
    """
    Parses the HTML files previously downloaded by download_lgw_content into a
    list of entry dicts and writes them to lgw-import/_lgw.json.

    For every game page it extracts: title, external links, the meta
    description, the game info box rows, the "Available as package" table and
    the wiki categories.
    """

    # paths
    import_path = os.path.join(constants.root_path, 'code', '../lgw-import')
    entries_file = os.path.join(import_path, '_lgw.json')

    # iterate over all imported files
    files = os.listdir(import_path)
    entries = []
    for file in files:
        # skip our own output files (they all start with '_lgw')
        if file.startswith('_lgw'):
            continue

        text = utils.read_text(os.path.join(import_path, file))

        # parse the html
        soup = BeautifulSoup(text, 'html.parser')
        title = soup.h1.get_text()
        print(title)
        entry = {'name': title}

        # get all external links, ignoring well-known hosts that are not game specific
        ignored_external_links = ('libregamewiki.org', 'freegamedev.net', 'freegamer.blogspot.com', 'opengameart.org', 'gnu.org', 'creativecommons.org', 'freesound.org', 'freecode.com', 'freenode.net')
        links = [(x['href'], x.get_text()) for x in soup.find_all('a', href=True)]
        links = [x for x in links if x[0].startswith('http') and not any([y in x[0] for y in ignored_external_links])]
        entry['external links'] = links

        # get meta description
        # NOTE(review): assumes every page has a description meta tag; a page
        # without one would raise TypeError here — confirm against the data
        description = soup.find('meta', attrs={"name": "description"})
        entry['description'] = description['content']

        # parse gameinfobox (the table with developer/license/engine/... rows)
        infos = soup.find('div', class_='gameinfobox')
        if not infos:
            print(' no gameinfobox')
        else:
            infos = infos.find_all('tr')
            for x in infos:
                if x.th and x.td:
                    # row with header: header text is the key, cell text a comma separated list
                    key = x.th.get_text()
                    content = x.td.get_text()
                    content = content.split(',')
                    # comprehension variable does not leak in Python 3, outer x is untouched
                    content = [x.strip() for x in content]
                    entry[key] = content
                if not x.th and x.td:
                    # row without header: contribute section (list of labeled links)
                    x = x.find_all('li')
                    x = [(x.a.string, x.a['href']) for x in x if x.a]
                    for key, content in x:
                        entry[key] = content

        # parse "Available as package" table (Linux distribution packages)
        tables = soup.find_all('table', class_='wikitable')
        tables = [table for table in tables if table.caption and table.caption.string.startswith('Available as package')]
        if len(tables) > 0:
            # more than one such table is unexpected, fail loudly
            if len(tables) > 1:
                raise RuntimeError()
            table = tables[0]
            packages = table.find_all('tr')
            packages = [x.td.a['href'] for x in packages]
            entry['linux-packages'] = packages

        # categories (the normal category links box at the page bottom)
        categories = soup.find_all('div', id='mw-normal-catlinks')
        if not categories:
            print(' no categories')
            categories = []
        else:
            # more than one category box is unexpected, fail loudly
            if len(categories) > 1:
                raise RuntimeError()
            categories = categories[0]
            categories = categories.find_all('li')
            categories = [x.a.string for x in categories]
            if 'Games' not in categories:
                print(' "Games" not in categories')
            else:
                categories.remove('Games')  # should be there
            # strip " games" at the end of category names
            phrase = ' games'
            categories = [x[:-len(phrase)] if x.endswith(phrase) else x for x in categories]
            ignored_categories = ['Articles lacking reference', 'Stubs']
            categories = [x for x in categories if x not in ignored_categories]
        entry['categories'] = categories

        entries.append(entry)

    # save entries
    text = json.dumps(entries, indent=1)
    utils.write_text(entries_file, text)
|
||
|
||
|
||
def replace_content(entries, fields, replacement, search):
    """
    Replaces every value contained in search by replacement, in the given
    field(s) of all entries.

    Field values are normalized to lists; entries lacking a field are left
    untouched. Returns the (mutated) entries list.
    """
    if not isinstance(fields, tuple):
        fields = (fields,)
    for position, item in enumerate(entries):
        for name in fields:
            if name not in item:
                continue
            values = item[name]
            if not isinstance(values, list):
                values = [values]
            updated = []
            for value in values:
                updated.append(replacement if value in search else value)
            item[name] = updated
            entries[position] = item
    return entries
|
||
|
||
|
||
def ignore_content(entries, fields, ignored):
    """
    Removes every value contained in ignored from the given field(s) of all
    entries; a field that ends up empty is deleted from the entry.

    Returns the (mutated) entries list.
    """
    if not isinstance(fields, tuple):
        fields = (fields,)
    for position, item in enumerate(entries):
        for name in fields:
            if name not in item:
                continue
            values = item[name]
            if not isinstance(values, list):
                values = [values]
            kept = [value for value in values if value not in ignored]
            if kept:
                item[name] = kept
            else:
                del item[name]
            entries[position] = item
    return entries
|
||
|
||
|
||
def remove_prefix_suffix(entries, fields, prefixes, suffixes):
    """
    Strips the given prefixes and suffixes from every value of the given
    field(s), then trims surrounding whitespace.

    Returns the (mutated) entries list.
    """
    if not isinstance(fields, tuple):
        fields = (fields,)
    for position, item in enumerate(entries):
        for name in fields:
            if name not in item:
                continue
            values = item[name]
            if not isinstance(values, list):
                values = [values]
            for prefix in prefixes:
                values = [v[len(prefix):] if v.startswith(prefix) else v for v in values]
            for suffix in suffixes:
                values = [v[:-len(suffix)] if v.endswith(suffix) else v for v in values]
            item[name] = [v.strip() for v in values]
            entries[position] = item
    return entries
|
||
|
||
|
||
def lower_case_content(entries, field):
    """
    Converts every value of the given field to lower case (via casefold) in
    all entries. Returns the (mutated) entries list.
    """
    for position, item in enumerate(entries):
        if field not in item:
            continue
        values = item[field]
        if not isinstance(values, list):
            values = [values]
        item[field] = [value.casefold() for value in values]
        entries[position] = item
    return entries
|
||
|
||
|
||
def remove_parenthized_content(entries, fields):
    """
    Removes '(...)' parts from every value of the given field(s), strips
    whitespace and de-duplicates the remaining values.

    Returns the (mutated) entries list.
    """
    if not isinstance(fields, tuple):
        fields = (fields, )
    for index, entry in enumerate(entries):
        for field in fields:
            if field in entry:
                content = entry[field]
                if not isinstance(content, list):
                    content = [content]
                content = [re.sub(r'\([^)]*\)', '', c) for c in content]  # remove parentheses content
                content = [x.strip() for x in content]
                # de-duplicate while preserving order; list(set(...)) would give a
                # non-deterministic order between runs (hash randomization), which
                # needlessly churns the generated JSON
                content = list(dict.fromkeys(content))
                entry[field] = content
                entries[index] = entry
    return entries
|
||
|
||
|
||
def ignore_nonnumbers(entries, fields):
    """
    Keeps only purely numeric values (str.isdigit) in the given field(s) of
    all entries. Returns the (mutated) entries list.
    """
    if not isinstance(fields, tuple):
        fields = (fields,)
    for position, item in enumerate(entries):
        for name in fields:
            if name not in item:
                continue
            values = item[name]
            if not isinstance(values, list):
                values = [values]
            item[name] = [value for value in values if value.isdigit()]
            entries[position] = item
    return entries
|
||
|
||
|
||
def _print_field_statistics(entries, fields, line_template):
    """
    Prints, for every field, the unique values and their occurrence counts
    over all entries (helper for clean_lgw_content).

    :param line_template: format string with two slots (field name, statistics)
    """
    for field in fields:
        content = [entry[field] for entry in entries if field in entry]
        # flatten; field contents may be single values or lists of values
        flat_content = []
        for c in content:
            if isinstance(c, list):
                flat_content.extend(c)
            else:
                flat_content.append(c)
        statistics = utils.unique_elements_and_occurrences(flat_content)
        print(line_template.format(field, ', '.join(statistics)))


def clean_lgw_content():
    """
    Cleans the parsed LGW entries (lgw-import/_lgw.json): renames info box
    keys to our field names, drops ignored keys and normalizes field contents
    (license spellings, languages, genres, ...). Prints field statistics
    before and after cleaning and writes the result to
    lgw-import/_lgw.cleaned.json.
    """

    # paths
    import_path = os.path.join(constants.root_path, 'code', '../lgw-import')
    entries_file = os.path.join(import_path, '_lgw.json')
    cleaned_entries_file = os.path.join(import_path, '_lgw.cleaned.json')

    # load entries
    text = utils.read_text(entries_file)
    entries = json.loads(text)

    # rename keys: (our field name, (LGW info box names...))
    key_replacements = (('developer', ('Developer', 'Developers')), ('code license', ('Code license', 'Code licenses')), ('engine', ('Engine', 'Engines')), ('genre', ('Genre', 'Genres')),
                        ('library', ('Library', 'Libraries')), ('assets license', ('Media license', 'Media licenses')), ('code language', ('P. language', 'P. languages')), ('home', ('Homepage',)),
                        ('platform', ('Platforms', )), ('tracker', ('Bug/Feature Tracker', )), ('repo', ('Source Code', )), ('forum', ('Forum', )), ('chat', ('Chat', )), ('origin', ('Origin', )),
                        ('dev home', ('Development Project', )), ('last active', ('Release date', )))
    for index, entry in enumerate(entries):
        for new_key, old_keys in key_replacements:
            for key in old_keys:
                if key in entry:
                    entry[new_key] = entry[key]
                    del entry[key]
                    break
        entries[index] = entry

    # ignore keys we do not use
    ignored_keys = ('origin', 'Latest\xa0release')
    for index, entry in enumerate(entries):
        for key in ignored_keys:
            if key in entry:
                del entry[key]
        entries[index] = entry

    # check for unique field names
    unique_fields = set()
    for entry in entries:
        unique_fields.update(entry.keys())
    print('unique lgw fields: {}'.format(sorted(list(unique_fields))))

    # which fields are mandatory (present in every entry)
    mandatory_fields = unique_fields.copy()
    for entry in entries:
        remove_fields = [field for field in mandatory_fields if field not in entry]
        mandatory_fields -= set(remove_fields)
    print('mandatory lgw fields: {}'.format(sorted(list(mandatory_fields))))

    # fields whose content statistics we inspect (hoisted: used before and after cleaning)
    fields = sorted(list(unique_fields - {'description', 'external links', 'dev home', 'forum', 'home',
                                          'linux-packages', 'developer', 'chat', 'tracker', 'Latest release', 'name',
                                          'repo', 'Release date', 'categories'}))

    # statistics before
    print('field contents before')
    _print_field_statistics(entries, fields, '{}: {}')

    # content replacements
    entries = remove_parenthized_content(entries, ('assets license', 'code language', 'code license', 'engine', 'genre', 'last active', 'library'))
    entries = remove_prefix_suffix(entries, ('code license', 'assets license'), ('"', 'GNU'), ('"', '[3]', '[2]', '[1]', 'only', ' license'))
    entries = replace_content(entries, ('code license', 'assets license'), 'GPL', ('General Public License', ))
    entries = replace_content(entries, ('code license', 'assets license'), 'GPL-2.0', ('GPLv2', )) # for LGW GPLv2 would be the correct writing
    entries = replace_content(entries, ('code license', 'assets license'), 'GPL-2', ('GPLv2', 'GPL v2', 'GPL version 2.0', 'GPL 2.0', 'General Public License v2', 'GPL version 2', 'Gplv2', 'GPL 2'))
    entries = replace_content(entries, ('code license', 'assets license'), 'GPL-2', ('GPL v2 or later', 'GPL 2+', 'GPL v2+', 'GPLv2+', 'GPL version 2 or later'))
    entries = replace_content(entries, ('code license', 'assets license'), 'GPL-3.0', ('GPLv3', )) # for LGW GPLv3 would be the correct writing
    entries = replace_content(entries, ('code license', 'assets license'), 'GPL-3', ('GPL v3', 'GNU GPL v3', 'GPL 3', 'General Public License 3', 'General Public License v3.0'))
    entries = replace_content(entries, ('code license', 'assets license'), 'GPL-3', ('GPL v3+', 'GPLv3+', 'GPL v.3 or later', 'GPL v3 or later'))
    entries = replace_content(entries, ('code license', 'assets license'), 'AGPL-3.0', ('AGPLv3', 'AGPL', 'Affero General Public License v3.0', 'AGPL v3'))
    entries = replace_content(entries, ('code license', 'assets license'), 'Public domain', ('public domain', 'Public Domain'))
    entries = replace_content(entries, ('code license', 'assets license'), 'zlib', ('zlib/libpng license', 'Zlib License'))
    entries = replace_content(entries, ('code license', 'assets license'), 'BSD', ('Original BSD License', ))
    entries = replace_content(entries, ('code license', 'assets license'), 'CC-BY-SA-3.0', ('Creative Commons Attribution-ShareAlike 3.0 Unported License', 'CC-BY-SA 3.0', 'CC BY-SA 3.0'))
    entries = replace_content(entries, ('code license', 'assets license'), 'CC-BY-SA', ('CC BY-SA',))
    entries = replace_content(entries, ('code license', 'assets license'), 'MIT', ('MIT License', 'MIT"'))
    entries = replace_content(entries, ('assets license', ), 'no media', ('No media', 'no media?'))
    entries = replace_content(entries, 'platform', 'macOS', ('Mac', ))
    entries = remove_prefix_suffix(entries, ('code language', 'developer'), (), ('[3]', '[2]', '[1]'))
    entries = ignore_content(entries, 'code language', ('HTML5', 'HTML', 'English', 'XML', 'WML', 'CSS'))
    entries = replace_content(entries, 'code language', 'Lua', ('lua', 'LUA'))
    entries = remove_prefix_suffix(entries, 'genre', (), ('game', 'games'))
    entries = lower_case_content(entries, 'genre')
    entries = replace_content(entries, 'genre', 'platform', ('platformer', ))
    entries = replace_content(entries, 'genre', 'role playing', ('rpg', ))
    entries = replace_content(entries, 'genre', 'first person, shooter', ('fps', ))
    entries = replace_content(entries, 'genre', 'real time, strategy', ('rts',))
    entries = replace_content(entries, 'genre', 'turn based, strategy', ('tbs',))
    entries = ignore_content(entries, 'categories', ('GPL', 'C++', 'C', 'ECMAScript', 'Python', 'Java', 'CC BY-SA', 'Lua', 'LGPL', 'CC-BY', 'BSD', 'MIT', 'Qt', 'SDL', 'OpenGL', 'Pygame', 'PD', 'GLUT', 'Haskell', 'Allegro', 'Ruby', 'Zlib/libpng', 'OpenAL', 'Perl', 'Free Pascal', 'LÖVE', 'HTML5', 'Id Tech 1'))
    entries = replace_content(entries, 'library', 'pygame', ('Pygame', ))
    entries = replace_content(entries, 'library', 'Qt', ('QT', 'Qt4'))
    entries = ignore_content(entries, 'library', ('C++', 'Lua', 'Mozilla Firefox', 'DirectX', 'Boost'))
    entries = ignore_nonnumbers(entries, 'last active')
    entries = ignore_content(entries, 'last active', ('2019', ))
    entries = ignore_content(entries, 'platform', ('DOS', ))

    # statistics after
    print('\nfield contents after')
    _print_field_statistics(entries, fields, '{}: {}\n')

    # save entries
    text = json.dumps(entries, indent=1)
    utils.write_text(cleaned_entries_file, text)
|
||
|
||
|
||
if __name__ == "__main__":
    # Three stage import pipeline; un-comment the stage you want to run.
    # Each stage reads the output of the previous one from lgw-import/.

    # stage one: download all game pages from libregamewiki.org
    # download_lgw_content()

    # stage two: parse the downloaded HTML files into _lgw.json
    # parse_lgw_content()

    # stage three: normalize keys and values into _lgw.cleaned.json
    # clean_lgw_content()
|