update Github information (devs and project stars)
This commit is contained in:
381
code/synchronization/libregamewiki_import.py
Normal file
381
code/synchronization/libregamewiki_import.py
Normal file
@ -0,0 +1,381 @@
|
||||
"""
|
||||
Imports game details from libregamewiki by scraping the website, starting from https://libregamewiki.org/Category:Games
|
||||
|
||||
Also parse rejected games (https://libregamewiki.org/Libregamewiki:Rejected_games_list) and maybe https://libregamewiki.org/Libregamewiki:Suggested_games
|
||||
|
||||
Unique left column names in the game info boxes:
|
||||
['Code license', 'Code licenses', 'Developer', 'Developers', 'Engine', 'Engines', 'Genre', 'Genres', 'Libraries', 'Library', 'Media license', 'Media licenses', 'P. language', 'P. languages', 'Platforms']
|
||||
|
||||
TODO there are games on LGW which are not part of the Games category but part of XXX-Games sub-categories, find them
|
||||
"""
|
||||
|
||||
import os
|
||||
import requests
|
||||
import json
|
||||
import re
|
||||
from bs4 import BeautifulSoup
|
||||
from utils import constants, utils, osg
|
||||
|
||||
|
||||
def download_lgw_content():
    """Download all game pages of libregamewiki into the local import directory.

    Walks the paginated https://libregamewiki.org/Category:Games listing,
    collects the links of all game pages and stores each page as an .html file
    named after the canonical game name. The destination directory is recreated
    (i.e. emptied) first, so this always produces a fresh snapshot.
    """

    # parameters
    base_url = 'https://libregamewiki.org'
    destination_path = os.path.join(constants.root_path, 'code', '../lgw-import')
    utils.recreate_directory(destination_path)

    # read and process the category listing, page by page (get all games)
    url = base_url + '/Category:Games'
    games = []
    while True:
        text = requests.get(url).text
        soup = BeautifulSoup(text, 'html.parser')

        # game pages listed on this page: (relative url, page title)
        pages = soup.find('div', id='mw-pages').find_all('li')
        games.extend((x.a['href'], x.a.string) for x in pages)

        # follow the "next page" link until there is none
        next_page = soup.find('a', string='next page')
        if not next_page:
            break
        url = base_url + next_page['href']

    # remove non-game pages that appear in the category listing
    ignored_prefixes = ('User:', 'Template:', 'Bullet')
    games = [game for game in games if not any(game[1].startswith(x) for x in ignored_prefixes)]

    print('current number of games in LGW {}'.format(len(games)))

    # download every game page and store it under its canonical name
    for game in games:
        print(game[1])
        url = base_url + game[0]
        # game[0] starts with '/', strip it before deriving the file name
        destination_file = os.path.join(destination_path, osg.canonical_name(game[0][1:]) + '.html')

        text = requests.get(url).text
        utils.write_text(destination_file, text)
|
||||
|
||||
|
||||
def parse_lgw_content():
    """Parse the downloaded LGW game pages into entry dicts and save them as JSON.

    Reads every .html file in the import directory (skipping our own '_lgw*'
    output files), extracts title, external links, meta description, the game
    info box, Linux package links and categories, and writes the resulting list
    of dicts to '_lgw.json'.

    Raises RuntimeError if a page unexpectedly contains more than one package
    table or category block.
    """

    # paths
    import_path = os.path.join(constants.root_path, 'code', '../lgw-import')
    entries_file = os.path.join(import_path, '_lgw.json')

    # iterate over all imported files
    files = os.listdir(import_path)
    entries = []
    for file in files:
        if file.startswith('_lgw'):
            # skip our own output files
            continue

        text = utils.read_text(os.path.join(import_path, file))

        # parse the html
        soup = BeautifulSoup(text, 'html.parser')
        title = soup.h1.get_text()
        print(title)
        entry = {'name': title}

        # get all external links, ignoring common noise domains
        ignored_external_links = ('libregamewiki.org', 'freegamedev.net', 'freegamer.blogspot.com', 'opengameart.org', 'gnu.org', 'creativecommons.org', 'freesound.org', 'freecode.com', 'freenode.net')
        links = [(anchor['href'], anchor.get_text()) for anchor in soup.find_all('a', href=True)]
        links = [link for link in links if link[0].startswith('http') and not any(domain in link[0] for domain in ignored_external_links)]
        entry['external links'] = links

        # get meta description
        description = soup.find('meta', attrs={"name": "description"})
        entry['description'] = description['content']

        # parse gameinfobox
        infos = soup.find('div', class_='gameinfobox')
        if not infos:
            print(' no gameinfobox')
        else:
            for row in infos.find_all('tr'):
                if row.th and row.td:
                    # row with header: header text is the key, cell is a comma separated value list
                    key = row.th.get_text()
                    values = [value.strip() for value in row.td.get_text().split(',')]
                    entry[key] = values
                if not row.th and row.td:
                    # row without header: contribute section, a list of named links
                    items = row.find_all('li')
                    items = [(item.a.string, item.a['href']) for item in items if item.a]
                    for key, content in items:
                        entry[key] = content

        # parse "Available as package in" table
        tables = soup.find_all('table', class_='wikitable')
        tables = [table for table in tables if table.caption and table.caption.string.startswith('Available as package')]
        if len(tables) > 0:
            if len(tables) > 1:
                raise RuntimeError('more than one package table in {}'.format(file))
            table = tables[0]
            packages = [row.td.a['href'] for row in table.find_all('tr')]
            entry['linux-packages'] = packages

        # categories
        categories = soup.find_all('div', id='mw-normal-catlinks')
        if not categories:
            print(' no categories')
            categories = []
        else:
            if len(categories) > 1:
                raise RuntimeError('more than one category block in {}'.format(file))
            categories = [item.a.string for item in categories[0].find_all('li')]
            if 'Games' not in categories:
                print(' "Games" not in categories')
            else:
                categories.remove('Games')  # should be there
            # strip ' games' suffix from category names
            phrase = ' games'
            categories = [c[:-len(phrase)] if c.endswith(phrase) else c for c in categories]
            ignored_categories = ['Articles lacking reference', 'Stubs']
            categories = [c for c in categories if c not in ignored_categories]
        entry['categories'] = categories

        entries.append(entry)

    # save entries
    text = json.dumps(entries, indent=1)
    utils.write_text(entries_file, text)
|
||||
|
||||
|
||||
def replace_content(entries, fields, replacement, search):
    """Replace every value contained in search by replacement in the given fields.

    fields may be a single field name or a tuple of names. Field values that are
    not lists are turned into single-element lists. The entries list is modified
    in place and also returned.
    """
    if not isinstance(fields, tuple):
        fields = (fields,)
    for idx, entry in enumerate(entries):
        for field in fields:
            if field not in entry:
                continue
            values = entry[field]
            if not isinstance(values, list):
                values = [values]
            entry[field] = [replacement if value in search else value for value in values]
            entries[idx] = entry
    return entries
|
||||
|
||||
|
||||
def ignore_content(entries, fields, ignored):
    """Drop every value contained in ignored from the given fields of all entries.

    fields may be a single field name or a tuple of names. Non-list field values
    are normalized to lists first. A field that ends up empty is removed from
    the entry entirely. The entries list is modified in place and also returned.
    """
    if not isinstance(fields, tuple):
        fields = (fields,)
    for idx, entry in enumerate(entries):
        for field in fields:
            if field not in entry:
                continue
            values = entry[field]
            if not isinstance(values, list):
                values = [values]
            kept = [value for value in values if value not in ignored]
            if kept:
                entry[field] = kept
            else:
                # nothing left, drop the field completely
                del entry[field]
            entries[idx] = entry
    return entries
|
||||
|
||||
|
||||
def remove_prefix_suffix(entries, fields, prefixes, suffixes):
    """Strip the given prefixes and suffixes from all values of the given fields.

    fields may be a single field name or a tuple of names. Non-list field values
    are normalized to lists. Each prefix/suffix is applied in order, then the
    values are whitespace-stripped. Modifies entries in place and returns it.
    """
    if not isinstance(fields, tuple):
        fields = (fields,)
    for idx, entry in enumerate(entries):
        for field in fields:
            if field not in entry:
                continue
            values = entry[field]
            if not isinstance(values, list):
                values = [values]
            for prefix in prefixes:
                values = [v[len(prefix):] if v.startswith(prefix) else v for v in values]
            for suffix in suffixes:
                values = [v[:-len(suffix)] if v.endswith(suffix) else v for v in values]
            entry[field] = [v.strip() for v in values]
            entries[idx] = entry
    return entries
|
||||
|
||||
|
||||
def lower_case_content(entries, field):
    """Casefold all values of the given field in all entries.

    Non-list field values are normalized to single-element lists. Modifies the
    entries list in place and returns it.
    """
    for idx, entry in enumerate(entries):
        if field not in entry:
            continue
        values = entry[field]
        if not isinstance(values, list):
            values = [values]
        entry[field] = [value.casefold() for value in values]
        entries[idx] = entry
    return entries
|
||||
|
||||
|
||||
def remove_parenthized_content(entries, fields):
    """Remove parenthesized parts like '(...)' from all values of the given fields.

    fields may be a single field name or a tuple of names. Non-list field values
    are normalized to lists. After stripping, duplicate values are removed.
    Modifies the entries list in place and returns it.
    """
    if not isinstance(fields, tuple):
        fields = (fields,)
    for index, entry in enumerate(entries):
        for field in fields:
            if field not in entry:
                continue
            content = entry[field]
            if not isinstance(content, list):
                content = [content]
            content = [re.sub(r'\([^)]*\)', '', c) for c in content]  # remove parentheses content
            content = [x.strip() for x in content]
            # deduplicate while keeping first-seen order; list(set(...)) was
            # nondeterministic and produced unstable JSON output between runs
            entry[field] = list(dict.fromkeys(content))
            entries[index] = entry
    return entries
|
||||
|
||||
|
||||
def ignore_nonnumbers(entries, fields):
    """Keep only purely numeric values in the given fields of all entries.

    fields may be a single field name or a tuple of names. Non-list field
    values are normalized to lists. Unlike ignore_content, an empty result list
    is kept (the field is not deleted). Modifies entries in place, returns it.
    """
    if not isinstance(fields, tuple):
        fields = (fields,)
    for idx, entry in enumerate(entries):
        for field in fields:
            if field not in entry:
                continue
            values = entry[field]
            if not isinstance(values, list):
                values = [values]
            entry[field] = [value for value in values if value.isdigit()]
            entries[idx] = entry
    return entries
|
||||
|
||||
|
||||
def _statistics_fields(unique_fields):
    """Return the sorted subset of fields whose value statistics are worth printing."""
    return sorted(list(unique_fields - {'description', 'external links', 'dev home', 'forum', 'home',
                                        'linux-packages', 'developer', 'chat', 'tracker', 'Latest release', 'name',
                                        'repo', 'Release date', 'categories'}))


def _print_field_statistics(entries, fields):
    """Print 'field: value(count), ...' statistics for each given field over all entries."""
    for field in fields:
        content = [entry[field] for entry in entries if field in entry]
        # flatten: field values can be single items or lists
        flat_content = []
        for c in content:
            if isinstance(c, list):
                flat_content.extend(c)
            else:
                flat_content.append(c)
        statistics = utils.unique_elements_and_occurrences(flat_content)
        print('{}: {}'.format(field, ', '.join(statistics)))


def clean_lgw_content():
    """Clean the parsed LGW entries (normalize keys and values) and save as JSON.

    Loads '_lgw.json', renames the scraped info-box keys to our field names,
    drops unwanted keys, normalizes licenses / languages / genres / platforms
    via the replace/ignore helpers, prints before/after value statistics and
    writes the result to '_lgw.cleaned.json'.
    """

    # paths
    import_path = os.path.join(constants.root_path, 'code', '../lgw-import')
    entries_file = os.path.join(import_path, '_lgw.json')
    cleaned_entries_file = os.path.join(import_path, '_lgw.cleaned.json')

    # load entries
    text = utils.read_text(entries_file)
    entries = json.loads(text)

    # rename keys: (new key, (old keys...)); the first matching old key wins
    key_replacements = (('developer', ('Developer', 'Developers')), ('code license', ('Code license', 'Code licenses')),
                        ('engine', ('Engine', 'Engines')), ('genre', ('Genre', 'Genres')),
                        ('library', ('Library', 'Libraries')), ('assets license', ('Media license', 'Media licenses')),
                        ('code language', ('P. language', 'P. languages')), ('home', ('Homepage',)),
                        ('platform', ('Platforms',)), ('tracker', ('Bug/Feature Tracker',)), ('repo', ('Source Code',)),
                        ('forum', ('Forum',)), ('chat', ('Chat',)), ('origin', ('Origin',)),
                        ('dev home', ('Development Project',)), ('last active', ('Release date',)))
    for index, entry in enumerate(entries):
        for new_key, old_keys in key_replacements:
            for key in old_keys:
                if key in entry:
                    entry[new_key] = entry[key]
                    del entry[key]
                    break
        entries[index] = entry

    # ignore keys ('\xa0' is a non-breaking space as scraped from the wiki)
    ignored_keys = ('origin', 'Latest\xa0release')
    for index, entry in enumerate(entries):
        for key in ignored_keys:
            if key in entry:
                del entry[key]
        entries[index] = entry

    # check for unique field names
    unique_fields = set()
    for entry in entries:
        unique_fields.update(entry.keys())
    print('unique lgw fields: {}'.format(sorted(list(unique_fields))))

    # which fields are mandatory (i.e. present in every entry)
    mandatory_fields = unique_fields.copy()
    for entry in entries:
        remove_fields = [field for field in mandatory_fields if field not in entry]
        mandatory_fields -= set(remove_fields)
    print('mandatory lgw fields: {}'.format(sorted(list(mandatory_fields))))

    # statistics before cleaning
    print('field contents before')
    _print_field_statistics(entries, _statistics_fields(unique_fields))

    # content replacements
    entries = remove_parenthized_content(entries, ('assets license', 'code language', 'code license', 'engine', 'genre', 'last active', 'library'))
    entries = remove_prefix_suffix(entries, ('code license', 'assets license'), ('"', 'GNU'), ('"', '[3]', '[2]', '[1]', 'only', ' license'))
    entries = replace_content(entries, ('code license', 'assets license'), 'GPL', ('General Public License', ))
    entries = replace_content(entries, ('code license', 'assets license'), 'GPL-2.0', ('GPLv2', ))  # for LGW GPLv2 would be the correct writing
    # NOTE(review): 'GPLv2' below is dead — already rewritten to 'GPL-2.0' by the line above
    entries = replace_content(entries, ('code license', 'assets license'), 'GPL-2', ('GPLv2', 'GPL v2', 'GPL version 2.0', 'GPL 2.0', 'General Public License v2', 'GPL version 2', 'Gplv2', 'GPL 2'))
    entries = replace_content(entries, ('code license', 'assets license'), 'GPL-2', ('GPL v2 or later', 'GPL 2+', 'GPL v2+', 'GPLv2+', 'GPL version 2 or later'))
    entries = replace_content(entries, ('code license', 'assets license'), 'GPL-3.0', ('GPLv3', ))  # for LGW GPLv3 would be the correct writing
    entries = replace_content(entries, ('code license', 'assets license'), 'GPL-3', ('GPL v3', 'GNU GPL v3', 'GPL 3', 'General Public License 3', 'General Public License v3.0'))
    entries = replace_content(entries, ('code license', 'assets license'), 'GPL-3', ('GPL v3+', 'GPLv3+', 'GPL v.3 or later', 'GPL v3 or later'))
    entries = replace_content(entries, ('code license', 'assets license'), 'AGPL-3.0', ('AGPLv3', 'AGPL', 'Affero General Public License v3.0', 'AGPL v3'))
    entries = replace_content(entries, ('code license', 'assets license'), 'Public domain', ('public domain', 'Public Domain'))
    entries = replace_content(entries, ('code license', 'assets license'), 'zlib', ('zlib/libpng license', 'Zlib License'))
    entries = replace_content(entries, ('code license', 'assets license'), 'BSD', ('Original BSD License', ))
    entries = replace_content(entries, ('code license', 'assets license'), 'CC-BY-SA-3.0', ('Creative Commons Attribution-ShareAlike 3.0 Unported License', 'CC-BY-SA 3.0', 'CC BY-SA 3.0'))
    entries = replace_content(entries, ('code license', 'assets license'), 'CC-BY-SA', ('CC BY-SA',))
    entries = replace_content(entries, ('code license', 'assets license'), 'MIT', ('MIT License', 'MIT"'))
    entries = replace_content(entries, ('assets license', ), 'no media', ('No media', 'no media?'))
    entries = replace_content(entries, 'platform', 'macOS', ('Mac', ))
    entries = remove_prefix_suffix(entries, ('code language', 'developer'), (), ('[3]', '[2]', '[1]'))
    entries = ignore_content(entries, 'code language', ('HTML5', 'HTML', 'English', 'XML', 'WML', 'CSS'))
    entries = replace_content(entries, 'code language', 'Lua', ('lua', 'LUA'))
    entries = remove_prefix_suffix(entries, 'genre', (), ('game', 'games'))
    entries = lower_case_content(entries, 'genre')
    entries = replace_content(entries, 'genre', 'platform', ('platformer', ))
    entries = replace_content(entries, 'genre', 'role playing', ('rpg', ))
    entries = replace_content(entries, 'genre', 'first person, shooter', ('fps', ))
    entries = replace_content(entries, 'genre', 'real time, strategy', ('rts',))
    entries = replace_content(entries, 'genre', 'turn based, strategy', ('tbs',))
    entries = ignore_content(entries, 'categories', ('GPL', 'C++', 'C', 'ECMAScript', 'Python', 'Java', 'CC BY-SA', 'Lua', 'LGPL', 'CC-BY', 'BSD', 'MIT', 'Qt', 'SDL', 'OpenGL', 'Pygame', 'PD', 'GLUT', 'Haskell', 'Allegro', 'Ruby', 'Zlib/libpng', 'OpenAL', 'Perl', 'Free Pascal', 'LÖVE', 'HTML5', 'Id Tech 1'))
    entries = replace_content(entries, 'library', 'pygame', ('Pygame', ))
    entries = replace_content(entries, 'library', 'Qt', ('QT', 'Qt4'))
    entries = ignore_content(entries, 'library', ('C++', 'Lua', 'Mozilla Firefox', 'DirectX', 'Boost'))
    entries = ignore_nonnumbers(entries, 'last active')
    entries = ignore_content(entries, 'last active', ('2019', ))  # presumably drops the then-current year — TODO confirm intent
    entries = ignore_content(entries, 'platform', ('DOS', ))

    # statistics after cleaning
    print('\nfield contents after')
    _print_field_statistics(entries, _statistics_fields(unique_fields))

    # save entries
    text = json.dumps(entries, indent=1)
    utils.write_text(cleaned_entries_file, text)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # The three stages build on each other's output; run them one at a time
    # by uncommenting the desired stage.

    # stage one
    # download_lgw_content()

    # stage two
    # parse_lgw_content()

    # stage three
    # clean_lgw_content()

    # without at least one statement in the if-body, the module would fail to
    # parse (all stage calls above are commented out)
    pass
|
304
code/synchronization/libregamewiki_synchronization.py
Normal file
304
code/synchronization/libregamewiki_synchronization.py
Normal file
@ -0,0 +1,304 @@
|
||||
"""
|
||||
Once data from libregamewiki is imported, synchronize with our database, i.e. identify the entries both have in common,
|
||||
estimate the differences in the entries both have in common, suggest to add the entries they have not in common to each
|
||||
other.
|
||||
|
||||
unique imported fields: 'assets license', 'categories', 'code language', 'code license', 'developer', 'engine', 'genre', 'library', 'linux-packages', 'name', 'platform'
|
||||
mandatory imported fields: 'categories', 'name'
|
||||
|
||||
Mapping lgw -> ours
|
||||
assets license -> assets license
|
||||
categories -> keywords
|
||||
code language -> code language
|
||||
code license -> code license
|
||||
developer -> free text (info)
|
||||
engine -> code dependencies
|
||||
genre -> keywords
|
||||
library -> code dependencies
|
||||
linux-packages -> free text (info)
|
||||
name -> name
|
||||
platform -> platform
|
||||
|
||||
TODO also ignore our rejected entries
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
from utils import constants, utils, osg
|
||||
|
||||
# LGW game names that differ from the names in our database: LGW name -> our name
lgw_name_aliases = {'Eat the Whistle': 'Eat The Whistle', 'Scorched 3D': 'Scorched3D',
                    'Blob Wars Episode 1 : Metal Blob Solid': 'Blobwars: Metal Blob Solid',
                    'Adventure': 'Colossal Cave Adventure',
                    'Liquid War 6': 'Liquid War', 'Gusanos': 'GUSANOS', 'Corewars': 'Core War', 'FLARE': 'Flare',
                    'Vitetris': 'vitetris', 'Powder Toy': 'The Powder Toy', 'Asylum': 'SDL Asylum',
                    'Atanks': 'Atomic Tanks', 'HeXon': 'heXon', 'Unnethack': 'UnNetHack',
                    'Nova Pinball': 'NOVA PINBALL', 'Jump n Bump': "Jump'n'Bump",
                    'Blades of Exile': 'Classic Blades of Exile',
                    'Colobot': 'Colobot: Gold Edition', 'Dead Justice': 'Cat Mother Dead Justice',
                    'FreeDink': 'GNU FreeDink', 'FRaBs': 'fRaBs', 'Harmonist': 'Harmonist: Dayoriah Clan Infiltration',
                    'Iris2 3D Client - for Ultima Online': 'Iris2',
                    'Java Classic Role Playing Game': 'jClassicRPG', 'Osgg': 'OldSkool Gravity Game',
                    'PyRacerz': 'pyRacerz', 'Starfighter': 'Project: Starfighter',
                    'TORCS': 'TORCS, The Open Racing Car Simulator', 'Vertigo (game)': 'Vertigo',
                    'XInvaders3D': 'XInvaders 3D', 'LambdaRogue': 'LambdaRogue: The Book of Stars',
                    'Maniadrive': 'ManiaDrive', 'Story of Seasons': "Greentwip's Harvest Moon", 'TinyTris': 'Tiny Tris',
                    'Which Way Is Up': 'Which Way Is Up?', 'CannonSmash': 'Cannon Smash', 'UFO:Alien Invasion': 'UFO: Alien Invasion'}
# LGW entries excluded from the synchronization below (reasons are not recorded
# here — presumably rejected, duplicate or unsuitable entries; see the main
# script which reports names that could be un-ignored)
lgw_ignored_entries = ['Hetris', '8 Kingdoms', 'Antigravitaattori', 'Arena of Honour', 'Arkhart', 'Ascent of Justice',
                       'Balazar III', 'Balder3D', 'Barbie Seahorse Adventures', 'Barrage', 'Gnome Batalla Naval',
                       'Blocks',
                       'Brickshooter', 'Bweakfwu', 'Cheese Boys', 'Clippers', 'Codewars', 'CRAFT: The Vicious Vikings',
                       'DQM', 'EmMines', 'Eskimo-run', 'Farlands', 'Feuerkraft', 'Fight or Perish', 'Flatland', 'Forest patrol',
                       'Flare: Empyrean Campaign', 'Free Reign', 'GalaxyMage',
                       'Gloss', 'GRUB Invaders', 'Howitzer Skirmish', 'Imperium: Sticks', 'Interstate Outlaws',
                       'GNOME Games', 'KDE Games', 'LegacyClone', 'Memonix', 'Ninjapix', 'Neverputt', 'Militia Defense',
                       'Sudoku86', 'Look Around the Corner', 'GPSFish',
                       'Terminal Overload release history', 'Scions of Darkness', 'Sedtris', 'SilChess', 'SSTPong',
                       'Tesseract Trainer', 'TunnelWars', 'The Fortress', 'Tunnel']

# license spellings used by LGW -> our license identifiers
licenses_map = {'GPLv2': 'GPL-2.0', 'GPLv2+': 'GPL-2.0', 'GPLv3': 'GPL-3.0', 'GPLv3+': 'GPL-3.0'}
|
||||
|
||||
|
||||
def compare_sets(a, b, name, limit=None):
    """Return a textual report of the differences between two collections.

    :param a: our values (any iterable; converted to a set)
    :param b: their values (any iterable; converted to a set)
    :param name: label printed in front of each difference line
    :param limit: 'notus' suppresses the a-only line, 'notthem' the b-only line
    :return: report string, empty if there are no (reported) differences
    """
    ours = a if isinstance(a, set) else set(a)
    theirs = b if isinstance(b, set) else set(b)
    report = ''
    only_ours = sorted(ours - theirs)
    if only_ours and limit != 'notus':
        report += ' {} : us : {}\n'.format(name, ', '.join(only_ours))
    only_theirs = sorted(theirs - ours)
    if only_theirs and limit != 'notthem':
        report += ' {} : them : {}\n'.format(name, ', '.join(only_theirs))
    return report
|
||||
|
||||
|
||||
if __name__ == "__main__":

    # some parameter
    similarity_threshold = 0.8
    maximal_newly_created_entries = 40  # safety cap on files created per run

    # paths
    lgw_import_path = os.path.join(constants.root_path, 'code', '../lgw-import')
    lgw_entries_file = os.path.join(lgw_import_path, '_lgw.cleaned.json')

    # import lgw import
    text = utils.read_text(lgw_entries_file)
    lgw_entries = json.loads(text)

    # eliminate the ignored entries
    _ = [x['name'] for x in lgw_entries if x['name'] in lgw_ignored_entries]  # those that will be ignored
    _ = set(lgw_ignored_entries) - set(_)  # those that shall be ignored minus those that will be ignored
    if _:
        # names in the ignore list that no longer exist in the import — the list entry is stale
        print('Can un-ignore {}'.format(_))
    lgw_entries = [x for x in lgw_entries if x['name'] not in lgw_ignored_entries]

    # perform name and code language replacements
    _ = [x['name'] for x in lgw_entries if x['name'] in lgw_name_aliases.keys()]  # those that will be renamed
    _ = set(lgw_name_aliases.keys()) - set(_)  # those that shall be renamed minus those that will be renamed
    if _:
        print('Can un-rename {}'.format(_))
    for index, lgw_entry in enumerate(lgw_entries):
        if lgw_entry['name'] in lgw_name_aliases:
            lgw_entry['name'] = lgw_name_aliases[lgw_entry['name']]
        if 'code language' in lgw_entry:
            # split combined language values like 'C/C++' into separate items
            languages = lgw_entry['code language']
            h = []
            for l in languages:
                for g in ('/', 'and'):
                    if g in l:
                        l = l.split(g)
                        l = [x.strip() for x in l]
                # NOTE(review): splitting on the substring 'and' also hits names
                # containing it (e.g. 'Random'); confirm this is acceptable here
                if type(l) == str:
                    l = [l]
                h.extend(l)
            languages = h
            if languages:
                lgw_entry['code language'] = languages
            else:
                del lgw_entry['code language']
        lgw_entries[index] = lgw_entry

    # check for unique field names
    unique_fields = set()
    for lgw_entry in lgw_entries:
        unique_fields.update(lgw_entry.keys())
    print('unique lgw fields: {}'.format(sorted(list(unique_fields))))

    # which fields are mandatory (present in every entry)
    mandatory_fields = unique_fields.copy()
    for lgw_entry in lgw_entries:
        remove_fields = [field for field in mandatory_fields if field not in lgw_entry]
        mandatory_fields -= set(remove_fields)
    print('mandatory lgw fields: {}'.format(sorted(list(mandatory_fields))))

    # read our database
    our_entries = osg.read_entries()
    print('{} entries with us'.format(len(our_entries)))

    # just the names
    lgw_names = set([x['name'] for x in lgw_entries])
    our_names = set([x['Title'] for x in our_entries])
    common_names = lgw_names & our_names
    lgw_names -= common_names
    our_names -= common_names
    print('{} in both, {} only in LGW, {} only with us'.format(len(common_names), len(lgw_names), len(our_names)))

    # find similar names among the rest (possible aliases not yet recorded)
    # NOTE(review): the message below is missing its closing parenthesis
    print('similar names (them - us')
    for lgw_name in lgw_names:
        for our_name in our_names:
            if osg.name_similarity(lgw_name, our_name) > similarity_threshold:
                print('"{}" - "{}"'.format(lgw_name, our_name))

    newly_created_entries = 0
    # iterate over their entries
    print('\n')
    for lgw_entry in lgw_entries:
        lgw_name = lgw_entry['name']

        is_included = False
        for our_entry in our_entries:
            our_name = our_entry['Title']

            # find those entries in LGW that are also in our database and compare them
            if lgw_name == our_name:
                is_included = True
                # a match, check the fields
                name = lgw_name

                p = ''  # accumulated difference report for this entry

                # TODO key names have changed on our side

                # platform
                key = 'platform'
                p += compare_sets(lgw_entry.get(key, []), our_entry.get(key, []), key)

                # categories/keywords
                # p += compare_sets(lgw_entry.get('categories', []), our_entry.get('keywords', []), 'categories/keywords')

                # code language
                key = 'code language'
                p += compare_sets(lgw_entry.get(key, []), our_entry.get(key, []), key)

                # code license (GPLv2)
                key = 'code license'
                p += compare_sets(lgw_entry.get(key, []), our_entry.get(key, []), key)

                # engine, library: LGW splits what we call code dependencies, so
                # compare each direction only where it is meaningful
                p += compare_sets(lgw_entry.get('engine', []), our_entry.get('code dependencies', []),
                                  'code dependencies', 'notthem')
                p += compare_sets(lgw_entry.get('library', []), our_entry.get('code dependencies', []),
                                  'code dependencies', 'notthem')
                p += compare_sets(lgw_entry.get('engine', []) + lgw_entry.get('library', []),
                                  our_entry.get('code dependencies', []), 'engine/library', 'notus')

                # assets license
                key = 'assets license'
                p += compare_sets(lgw_entry.get(key, []), our_entry.get(key, []), key)

                # TODO developer (need to introduce a field with us first)

                if p:
                    print('{}\n{}'.format(name, p))

        if not is_included:
            # a new entry, that we have never seen, maybe we should make an entry of our own
            # TODO we could use the write capabilities to write the entry in our own format, the hardcoded format here might be brittle, on the other hand we can also write slightly wrong stuff here without problems

            if newly_created_entries >= maximal_newly_created_entries:
                continue

            # determine file name
            print('create new entry for {}'.format(lgw_name))
            file_name = osg.canonical_name(lgw_name) + '.md'
            target_file = os.path.join(constants.entries_path, file_name)
            if os.path.isfile(target_file):
                print('warning: file {} already existing, save under slightly different name'.format(file_name))
                target_file = os.path.join(constants.entries_path, file_name[:-3] + '-duplicate.md')
                if os.path.isfile(target_file):
                    continue  # just for safety reasons

            # add name
            entry = '# {}\n\n'.format(lgw_name)

            # empty home (mandatory on our side)
            home = lgw_entry.get('home', None)
            dev_home = lgw_entry.get('dev home', None)
            entry += '- Home: {}\n'.format(', '.join([x for x in [home, dev_home] if x]))

            # state mandatory on our side
            entry += '- State: \n'

            # platform, if existing
            if 'platform' in lgw_entry:
                entry += '- Platform: {}\n'.format(', '.join(lgw_entry['platform']))

            # keywords (genre) (also mandatory)
            keywords = lgw_entry.get('genre', [])
            if 'assets license' in lgw_entry:
                # an assets license implies open content
                keywords.append('open content')
            keywords.sort(key=str.casefold)
            if keywords:
                entry += '- Keyword: {}\n'.format(', '.join(keywords))

            # code repository (mandatory but not scraped from lgw)
            entry += '- Code repository: {}\n'.format(lgw_entry.get('repo', ''))

            # code language, mandatory on our side
            languages = lgw_entry.get('code language', [])
            languages.sort(key=str.casefold)
            entry += '- Code language: {}\n'.format(', '.join(languages))

            # code license, mandatory on our side
            licenses = lgw_entry.get('code license', [])
            licenses = [licenses_map[x] if x in licenses_map else x for x in licenses]
            licenses.sort(key=str.casefold)
            entry += '- Code license: {}\n'.format(', '.join(licenses))

            # code dependencies (only if existing)
            code_dependencies = lgw_entry.get('engine', [])
            code_dependencies.extend(lgw_entry.get('library', []))
            code_dependencies.sort(key=str.casefold)
            if code_dependencies:
                entry += '- Code dependency: {}\n'.format(', '.join(code_dependencies))

            # assets licenses (only if existing)
            if 'assets license' in lgw_entry:
                licenses = lgw_entry.get('assets license', [])
                licenses = [licenses_map[x] if x in licenses_map else x for x in licenses]
                licenses.sort(key=str.casefold)
                entry += '- Assets license: {}\n'.format(', '.join(licenses))

            # developer
            if 'developer' in lgw_entry:
                entry += '- Developer: {}\n'.format(', '.join(lgw_entry['developer']))

            # add empty description (not anymore)
            entry += '\n_{}_\n\n'.format(lgw_entry['description'])

            # external links
            ext_links = lgw_entry['external links']
            if ext_links:
                entry += '\nLinks: {}\n'.format('\n '.join(['{}: {}'.format(x[1], x[0]) for x in ext_links]))

            # linux packages
            if 'linux-packages' in lgw_entry:
                entry += '{}\n'.format(lgw_entry['linux-packages'])

            # write ## Building
            entry += '\n## Building\n'

            # finally write to file
            utils.write_text(target_file, entry)
            newly_created_entries += 1
|
542
code/synchronization/osgameclones_synchronization.py
Normal file
542
code/synchronization/osgameclones_synchronization.py
Normal file
@ -0,0 +1,542 @@
|
||||
"""
|
||||
|
||||
osgameclones has the following fields:
|
||||
'updated', 'video', 'repo', 'license', 'originals', 'status', 'multiplayer', 'info', 'lang', 'feed', 'content', 'images', 'url', 'name', 'framework', 'type', 'development'
|
||||
|
||||
mandatory fields are: 'name', 'license', 'type', 'originals'
|
||||
|
||||
possible values:
|
||||
osgc-development: active(337), complete(32), halted(330), sporadic(129), very active(6)
|
||||
osgc-multiplayer: Co-op(5), Competitive(13), Hotseat(3), LAN(17), Local(3), Matchmaking(1), Online(33), Split-screen(7)
|
||||
osgc-type: clone(171), remake(684), similar(11), tool(7)
|
||||
osgc-status: playable(274), semi-playable(34), unplayable(34)
|
||||
osgc-license: ['AFL3', 'AGPL3', 'Apache', 'Artistic', 'As-is', 'BSD', 'BSD2', 'BSD4', 'bzip2', 'CC-BY', 'CC-BY-NC', 'CC-BY-NC-ND', 'CC-BY-NC-SA', 'CC-BY-SA', 'CC0', 'Custom', 'GPL2', 'GPL3', 'IJG', 'ISC', 'JRL', 'LGPL2', 'LGPL3', 'Libpng', 'MAME', 'MIT', 'MPL', 'MS-PL', 'Multiple', 'NGPL', 'PD', 'WTFPL', 'Zlib']
|
||||
osgc-content: commercial(104), free(32), open(61), swappable(5)
|
||||
|
||||
Mapping osgameclones -> ours
|
||||
|
||||
name -> name
|
||||
type -> keywords, description
|
||||
originals -> keywords
|
||||
repo -> code repository
|
||||
url -> home
|
||||
feed (-> home)
|
||||
development -> state
|
||||
status -> state
|
||||
multiplayer -> keywords
|
||||
lang -> code language
|
||||
framework -> code dependencies
|
||||
license -> code license / assets license
|
||||
content -> keywords
|
||||
info -> after fields
|
||||
updated not used
|
||||
images not used
|
||||
video: not used
|
||||
|
||||
TODO also ignore our rejected entries
|
||||
"""
|
||||
|
||||
import ruamel.yaml as yaml
|
||||
import os
|
||||
from utils import constants, utils, osg
|
||||
|
||||
# should change on osgameclones
|
||||
osgc_name_aliases = {'4DTris': '4D-TRIS', 'fheroes2': 'Free Heroes 2', 'DrCreep': 'The Castles of Dr. Creep',
|
||||
'Duke3d_win32': 'Duke3d_w32', 'GNOME Atomix': 'Atomix', 'Head over Heels 2': 'Head over Heels',
|
||||
'mewl': 'M.E.W.L.', 'LinWarrior': 'Linwarrior 3D', 'Mice Men Remix': 'Mice Men: Remix',
|
||||
'OpenApoc': 'Open Apocalypse', 'open-cube': 'Open Cube', 'open-horizon': 'Open Horizon',
|
||||
'opengl_test_drive_clone': 'OpenGL Test Drive Remake', "Freenukum Jump'n Run": 'Freenukum',
|
||||
'Play Freeciv!': 'Freeciv-web', 'ProjectX': 'Forsaken', 'Lyon': 'Roton', 'Mafia II: Toolkit': 'Mafia: Toolkit',
|
||||
'Siege of Avalon Open Source': 'Siege of Avalon : Open Source', 'ss13remake': 'SS13 Remake',
|
||||
'shadowgrounds': 'Shadowgrounds', 'RxWars': 'Prescription Wars', 'REDRIVER2': 'REDriver2',
|
||||
'Super Mario Bros And Level Editor in C#': 'Mario Objects', 'Unitystation': 'unitystation',
|
||||
'tetris': 'Just another Tetris™ clone', 'twin-e': 'TwinEngine', 'super-methane-brothers-gx': 'Super Methane Brothers for Wii and GameCube',
|
||||
'CrossUO: Ultima Online': 'CrossUO', 'Doomsday': 'Doomsday Engine', 'OpMon': 'OPMon',
|
||||
'2048-python': '2048 Python', 'Free Heroes 2 - Enhanced': 'Free Heroes 2', 'ironseed_fpc': 'ironseed',
|
||||
'KKnD': 'OpenKrush', 'bab-be-u': 'BAB BE U', 'ironseed': 'Ironseed', 'urde': 'Metaforce'}
|
||||
|
||||
# conversion between licenses syntax them and us
|
||||
osgc_licenses_map = {'GPL2': 'GPL-2.0', 'GPL3': 'GPL-3.0', 'AGPL3': 'AGPL-3.0', 'LGPL3': 'LGPL-3.0',
|
||||
'LGPL2': 'LGPL-2.0 or 2.1?', 'MPL': 'MPL-2.0', 'Apache': 'Apache-2.0',
|
||||
'Artistic': 'Artistic License', 'Zlib': 'zlib', 'PD': 'Public domain', 'AFL3': 'AFL-3.0',
|
||||
'BSD2': '2-clause BSD', 'JRL': 'Java Research License'}
|
||||
|
||||
# ignore osgc entries (for various reasons like unclear license etc.)
|
||||
osgc_ignored_entries = ["A Mouse's Vengeance", 'achtungkurve.com', 'AdaDoom3', 'Agendaroids', 'Alien 8', 'Ard-Reil',
|
||||
'Balloon Fight', 'bladerunner (Engine within SCUMMVM)', 'Block Shooter', 'Bomb Mania Reloaded',
|
||||
'boulder-dash', 'Cannon Fodder', 'Contra_remake', 'CosmicArk-Advanced', 'Deuteros X',
|
||||
'datastorm', 'div-columns', 'div-pacman2600', 'div-pitfall', 'div-spaceinvaders2600', 'EXILE',
|
||||
'Free in the Dark', 'Prepare Carefully', 'OpenKKnD',
|
||||
'Football Manager', 'Fight Or Perish', 'EarthShakerDS', 'Entombed!', 'FreeRails 2',
|
||||
'Glest Advanced Engine', 'FreedroidClassic', 'FreeFT', 'Future Blocks', 'HeadOverHeels',
|
||||
'Herzog 3D', 'Homeworld SDL', 'imperialism-remake', 'Jumping Jack 2: Worryingly Familiar',
|
||||
'Jumping Jack: Further Adventures', 'Jumpman', 'legion', 'KZap', 'LastNinja', 'Lemmix', 'LixD',
|
||||
'luminesk5', 'Manic Miner', 'Meridian 59 Server 105', 'Meridian 59 German Server 112',
|
||||
'Mining Haze', 'OpenGeneral', 'MonoStrategy', 'New RAW', 'OpenDeathValley', 'OpenOutcast',
|
||||
'openStrato', 'OpenPop', 'pacman',
|
||||
'Phavon', 'Project: Xenocide', 'pyspaceinvaders', 'PyTouhou', 'Racer',
|
||||
'Ruby OMF 2097 Remake', 'Snipes', 'Spaceship Duel', 'Space Station 14', 'Starlane Empire',
|
||||
'Styx', 'Super Mario Bros With SFML in C#', 'thromolusng', 'Tile World 2', 'Tranzam',
|
||||
'Voxelstein 3D', 'XQuest 2',
|
||||
'xrick', 'zedragon', 'Uncharted waters 2 remake', 'Desktop Adventures Engine for ScummVM',
|
||||
'Open Sonic', 'Aladdin_DirectX', 'Alive_Reversing', 're3', 'Sonic-1-2-2013-Decompilation',
|
||||
'Sonic-CD-11-Decompilation', 'Stunt Car Racer Remake']
|
||||
|
||||
|
||||
def unique_field_contents(entries, field):
|
||||
"""
|
||||
"""
|
||||
unique_content = set()
|
||||
for entry in entries:
|
||||
if field in entry:
|
||||
field_content = entry[field]
|
||||
if type(field_content) is list:
|
||||
unique_content.update(field_content)
|
||||
else:
|
||||
unique_content.add(field_content)
|
||||
unique_content = sorted(list(unique_content), key=str.casefold)
|
||||
return unique_content
|
||||
|
||||
|
||||
def compare_sets(a, b, name, limit=None):
|
||||
"""
|
||||
|
||||
:param limit: 'notus', 'notthem'
|
||||
:param a: them
|
||||
:param b: us
|
||||
:param name: prefix in output
|
||||
:return:
|
||||
"""
|
||||
p = ''
|
||||
if not isinstance(a, set):
|
||||
a = set(a)
|
||||
if not isinstance(b, set):
|
||||
b = set(b)
|
||||
d = sorted(list(a - b))
|
||||
if d and limit != 'notus':
|
||||
p += ' {} : us : {}\n'.format(name, ', '.join(d))
|
||||
d = sorted(list(b - a))
|
||||
if d and limit != 'notthem':
|
||||
p += ' {} : them : {}\n'.format(name, ', '.join(d))
|
||||
return p
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
# some parameter
|
||||
similarity_threshold = 0.8
|
||||
maximal_newly_created_entries = 40
|
||||
check_similar_names = False
|
||||
|
||||
# paths
|
||||
root_path = os.path.realpath(os.path.join(os.path.dirname(__file__), os.path.pardir))
|
||||
|
||||
# import the osgameclones data
|
||||
osgc_path = os.path.realpath(os.path.join(root_path, os.path.pardir, 'osgameclones.git', 'games'))
|
||||
osgc_files = os.listdir(osgc_path)
|
||||
|
||||
# iterate over all yaml files in osgameclones/data folder and load contents
|
||||
osgc_entries = []
|
||||
for file in osgc_files:
|
||||
# read yaml
|
||||
with open(os.path.join(osgc_path, file), 'r', encoding='utf-8') as stream:
|
||||
try:
|
||||
_ = yaml.safe_load(stream)
|
||||
except Exception as exc:
|
||||
print(file)
|
||||
raise exc
|
||||
|
||||
# add to entries
|
||||
osgc_entries.extend(_)
|
||||
print('Currently {} entries in osgameclones'.format(len(osgc_entries)))
|
||||
|
||||
# check: print all git repos with untypical structure
|
||||
untypical_structure = ''
|
||||
for osgc_entry in osgc_entries:
|
||||
name = osgc_entry['name']
|
||||
if 'repo' in osgc_entry:
|
||||
osgc_repos = osgc_entry['repo']
|
||||
if isinstance(osgc_repos, str):
|
||||
osgc_repos = [osgc_repos]
|
||||
for repo in osgc_repos:
|
||||
if 'github' in repo and any((repo.endswith(x) for x in ('/', '.git'))):
|
||||
untypical_structure += ' {} : {}\n'.format(osgc_entry['name'], repo)
|
||||
if untypical_structure:
|
||||
print('Git repos with untypical URL\n{}'.format(untypical_structure))
|
||||
|
||||
# which fields do they have
|
||||
osgc_fields = set()
|
||||
for osgc_entry in osgc_entries:
|
||||
osgc_fields.update(osgc_entry.keys())
|
||||
osgc_fields = sorted(list(osgc_fields))
|
||||
print('Unique osgc-fields\n {}'.format(', '.join(osgc_fields)))
|
||||
|
||||
for field in osgc_fields:
|
||||
if field in ('video', 'feed', 'url', 'repo', 'info', 'updated', 'images', 'name', 'originals'):
|
||||
continue
|
||||
osgc_content = [entry[field] for entry in osgc_entries if field in entry]
|
||||
# flatten
|
||||
flat_content = []
|
||||
for c in osgc_content:
|
||||
if isinstance(c, list):
|
||||
flat_content.extend(c)
|
||||
else:
|
||||
flat_content.append(c)
|
||||
statistics = utils.unique_elements_and_occurrences(flat_content)
|
||||
statistics.sort(key=str.casefold)
|
||||
print('{}: {}'.format(field, ', '.join(statistics)))
|
||||
|
||||
# eliminate the ignored entries
|
||||
_ = [x['name'] for x in osgc_entries if x['name'] in osgc_ignored_entries] # those that will be ignored
|
||||
_ = set(osgc_ignored_entries) - set(_) # those that shall be ignored minus those that will be ignored
|
||||
if _:
|
||||
print('Can un-ignore {}'.format(_))
|
||||
osgc_entries = [x for x in osgc_entries if x['name'] not in osgc_ignored_entries]
|
||||
|
||||
# fix names and licenses (so they are not longer detected as deviations downstreams)
|
||||
_ = [x['name'] for x in osgc_entries if x['name'] in osgc_name_aliases.keys()] # those that will be renamed
|
||||
_ = set(osgc_name_aliases.keys()) - set(_) # those that shall be renamed minus those that will be renamed
|
||||
if _:
|
||||
print('Can un-rename {}'.format(_))
|
||||
for index, entry in enumerate(osgc_entries):
|
||||
name = entry['name']
|
||||
if name in osgc_name_aliases:
|
||||
entry['name'] = osgc_name_aliases[name]
|
||||
if 'license' in entry:
|
||||
osgc_licenses = entry['license']
|
||||
osgc_licenses = [osgc_licenses_map.get(x, x) for x in osgc_licenses]
|
||||
entry['license'] = osgc_licenses
|
||||
# fix content (add suffix content)
|
||||
if 'content' in entry:
|
||||
osgc_content = entry['content']
|
||||
if isinstance(osgc_content, str):
|
||||
osgc_content = [osgc_content]
|
||||
osgc_content = [x + ' content' for x in osgc_content]
|
||||
entry['content'] = osgc_content
|
||||
osgc_entries[index] = entry # TODO is this necessary or is the entry modified anyway?
|
||||
|
||||
# which fields do they have
|
||||
osgc_fields = set()
|
||||
for osgc_entry in osgc_entries:
|
||||
osgc_fields.update(osgc_entry.keys())
|
||||
print('unique osgc-fields: {}'.format(osgc_fields))
|
||||
|
||||
# which fields are mandatory
|
||||
for osgc_entry in osgc_entries:
|
||||
remove_fields = [field for field in osgc_fields if field not in osgc_entry]
|
||||
osgc_fields -= set(remove_fields)
|
||||
print('mandatory osfg-fields: {}'.format(osgc_fields))
|
||||
|
||||
# some field statistics
|
||||
print('osgc-development: {}'.format(unique_field_contents(osgc_entries, 'development')))
|
||||
print('osgc-multiplayer: {}'.format(unique_field_contents(osgc_entries, 'multiplayer')))
|
||||
print('osgc-type: {}'.format(unique_field_contents(osgc_entries, 'type')))
|
||||
print('osgc-languages: {}'.format(unique_field_contents(osgc_entries, 'lang')))
|
||||
print('osgc-licenses: {}'.format(unique_field_contents(osgc_entries, 'license')))
|
||||
print('osgc-status: {}'.format(unique_field_contents(osgc_entries, 'status')))
|
||||
print('osgc-framework: {}'.format(unique_field_contents(osgc_entries, 'framework')))
|
||||
print('osgc-content: {}'.format(unique_field_contents(osgc_entries, 'content')))
|
||||
|
||||
# read our database
|
||||
our_entries = osg.read_entries()
|
||||
print('{} entries with us'.format(len(our_entries)))
|
||||
|
||||
# just the names
|
||||
osgc_names = set([x['name'] for x in osgc_entries])
|
||||
our_names = set([x['Title'] for x in our_entries])
|
||||
common_names = osgc_names & our_names
|
||||
osgc_names -= common_names
|
||||
our_names -= common_names
|
||||
print('{} in both, {} only in osgameclones, {} only with us'.format(len(common_names), len(osgc_names),
|
||||
len(our_names)))
|
||||
# find similar names among the rest
|
||||
if check_similar_names:
|
||||
print('look for similar names (theirs - ours)')
|
||||
for osgc_name in osgc_names:
|
||||
for our_name in our_names:
|
||||
if osg.name_similarity(osgc_name, our_name) > similarity_threshold:
|
||||
print(' {} - {}'.format(osgc_name, our_name))
|
||||
|
||||
newly_created_entries = 0
|
||||
# iterate over their entries
|
||||
for osgc_entry in osgc_entries:
|
||||
osgc_name = osgc_entry['name']
|
||||
|
||||
is_included = False
|
||||
for our_entry in our_entries:
|
||||
our_name = our_entry['Title']
|
||||
|
||||
# find those that entries in osgameclones that are also in our database and compare them
|
||||
if osgc_name == our_name:
|
||||
is_included = True
|
||||
# a match, check the fields
|
||||
name = osgc_name
|
||||
|
||||
p = ''
|
||||
|
||||
# TODO key names have changed on our side
|
||||
|
||||
# compare their lang with our code language
|
||||
if 'lang' in osgc_entry:
|
||||
osgc_languages = osgc_entry['lang']
|
||||
if type(osgc_languages) == str:
|
||||
osgc_languages = [osgc_languages]
|
||||
our_languages = [x.value for x in our_entry['Code language']] # essential field
|
||||
p += compare_sets(osgc_languages, our_languages, 'code language')
|
||||
|
||||
# compare their license with our code and assets license
|
||||
if 'license' in osgc_entry:
|
||||
osgc_licenses = osgc_entry['license']
|
||||
our_code_licenses = [x.value for x in our_entry['Code license']] # essential field
|
||||
our_assets_licenses = [x.value for x in our_entry.get('Assets license', [])]
|
||||
p += compare_sets(osgc_licenses, our_code_licenses + our_assets_licenses, 'licenses', 'notthem')
|
||||
p += compare_sets(osgc_licenses, our_code_licenses, 'licenses', 'notus')
|
||||
|
||||
# compare their framework with our code dependencies (capitalization is ignored for now, only starts are compared)
|
||||
our_framework_replacements = {'allegro4': 'allegro'}
|
||||
if 'framework' in osgc_entry:
|
||||
osgc_frameworks = osgc_entry['framework']
|
||||
if type(osgc_frameworks) == str:
|
||||
osgc_frameworks = [osgc_frameworks]
|
||||
our_frameworks = [x.value for x in our_entry.get('Code dependency', [])]
|
||||
our_frameworks = [x.casefold() for x in our_frameworks]
|
||||
our_frameworks = [x if x not in our_framework_replacements else our_framework_replacements[x] for x
|
||||
in our_frameworks]
|
||||
osgc_frameworks = [x.casefold() for x in osgc_frameworks]
|
||||
p += compare_sets(osgc_frameworks, our_frameworks, 'framework/dependencies')
|
||||
|
||||
# compare their repo with our code repository and download
|
||||
if 'repo' in osgc_entry:
|
||||
osgc_repos = osgc_entry['repo']
|
||||
if type(osgc_repos) == str:
|
||||
osgc_repos = [osgc_repos]
|
||||
osgc_repos = [utils.strip_url(url) for url in osgc_repos]
|
||||
osgc_repos = [x for x in osgc_repos if not x.startswith(
|
||||
'sourceforge.net/projects/')] # we don't need the general sites there
|
||||
# osgc_repos = [x for x in osgc_repos if not x.startswith('https://sourceforge.net/projects/')] # ignore some
|
||||
our_repos = our_entry.get('Code repository', [])
|
||||
our_repos = [utils.strip_url(url.value) for url in our_repos]
|
||||
our_repos = [x for x in our_repos if not x.startswith(
|
||||
'gitlab.com/osgames/')] # we do not yet spread our own deeds (but we will some day)
|
||||
our_repos = [x for x in our_repos if
|
||||
'cvs.sourceforge.net' not in x and 'svn.code.sf.net/p/' not in x] # no cvs or svn anymore
|
||||
our_downloads = our_entry.get('Download', [])
|
||||
our_downloads = [utils.strip_url(url.value) for url in our_downloads]
|
||||
p += compare_sets(osgc_repos, our_repos + our_downloads, 'repo',
|
||||
'notthem') # if their repos are not in our downloads or repos
|
||||
p += compare_sets(osgc_repos, our_repos[:1], 'repo',
|
||||
'notus') # if our main repo is not in their repo
|
||||
|
||||
# compare their url (and feed) to our home (and strip urls)
|
||||
if 'url' in osgc_entry:
|
||||
osgc_urls = osgc_entry['url']
|
||||
if type(osgc_urls) == str:
|
||||
osgc_urls = [osgc_urls]
|
||||
osgc_urls = [utils.strip_url(url) for url in osgc_urls]
|
||||
our_urls = our_entry['Home']
|
||||
our_urls = [utils.strip_url(url.value) for url in our_urls]
|
||||
p += compare_sets(osgc_urls, our_urls, 'url/home', 'notthem') # if their urls are not in our urls
|
||||
# our_urls = [url for url in our_urls if
|
||||
# not url.startswith('github.com/')] # they don't have them as url
|
||||
p += compare_sets(osgc_urls, our_urls[:1], 'url/home',
|
||||
'notus') # if our first url is not in their urls
|
||||
|
||||
# compare their status with our state (playable can be beta/mature with us, but not playable must be beta)
|
||||
if 'status' in osgc_entry:
|
||||
osgc_status = osgc_entry['status']
|
||||
our_status = our_entry['State'] # essential field
|
||||
if osgc_status != 'playable' and 'mature' in our_status:
|
||||
p += ' status : mismatch : them {}, us mature\n'.format(osgc_status)
|
||||
|
||||
# compare their development with our state
|
||||
if 'development' in osgc_entry:
|
||||
osgc_development = osgc_entry['development']
|
||||
our_inactive = 'inactive' in our_entry
|
||||
our_status = our_entry['State'] # essential field
|
||||
if osgc_development == 'halted' and not our_inactive:
|
||||
p += ' development : mismatch : them halted - us not inactive\n'
|
||||
if osgc_development in ['very active', 'active'] and our_inactive:
|
||||
p += ' development : mismatch : them {}, us inactive\n'.format(osgc_development)
|
||||
if osgc_development == 'complete' and 'mature' not in our_status:
|
||||
p += ' development : mismatch : them complete, us not mature\n'
|
||||
|
||||
# get our keywords
|
||||
our_keywords = [x.value for x in our_entry['Keyword']] # essential
|
||||
|
||||
# compare their originals to our inspirations
|
||||
our_originals = [x.value for x in our_entry.get('Inspiration', [])]
|
||||
if 'originals' in osgc_entry:
|
||||
osgc_originals = osgc_entry['originals']
|
||||
osgc_originals = [x.replace(',', '') for x in
|
||||
osgc_originals] # we cannot have ',' or parts in parentheses in original names
|
||||
p += compare_sets(osgc_originals, our_originals, 'originals')
|
||||
|
||||
# compare their multiplayer with our keywords (multiplayer) (only lowercase comparison)
|
||||
if 'multiplayer' in osgc_entry:
|
||||
osgc_multiplayer = osgc_entry['multiplayer']
|
||||
if type(osgc_multiplayer) == str:
|
||||
osgc_multiplayer = [osgc_multiplayer]
|
||||
osgc_multiplayer = [x.casefold() for x in osgc_multiplayer]
|
||||
osgc_multiplayer = [x for x in osgc_multiplayer if x not in ['competitive']] # ignored
|
||||
our_multiplayer = [x for x in our_keywords if x.startswith('multiplayer ')]
|
||||
if our_multiplayer:
|
||||
if len(our_multiplayer) != 1:
|
||||
print(our_entry)
|
||||
raise RuntimeError()
|
||||
assert len(our_multiplayer) == 1
|
||||
our_multiplayer = our_multiplayer[0][11:].split('+')
|
||||
our_multiplayer = [x.strip().casefold() for x in our_multiplayer]
|
||||
p += compare_sets(osgc_multiplayer, our_multiplayer, 'multiplayer')
|
||||
|
||||
# compare content with keywords
|
||||
if 'content' in osgc_entry:
|
||||
osgc_content = osgc_entry['content']
|
||||
if isinstance(osgc_content, str):
|
||||
osgc_content = [osgc_content]
|
||||
p += compare_sets(osgc_content, our_keywords, 'content/keywords',
|
||||
'notthem') # only to us because we have more then them
|
||||
|
||||
# compare their type to our keywords
|
||||
if 'type' in osgc_entry:
|
||||
game_type = osgc_entry['type']
|
||||
if isinstance(game_type, str):
|
||||
game_type = [game_type]
|
||||
p += compare_sets(game_type, our_keywords, 'type/keywords',
|
||||
'notthem') # only to us because we have more then them
|
||||
|
||||
if p:
|
||||
print('{}\n{}'.format(name, p))
|
||||
|
||||
if not is_included:
|
||||
# a new entry, that we have never seen, maybe we should make an entry of our own
|
||||
# continue
|
||||
# TODO we could use the write capabilities to write the entry in our own format, the hardcoded format here might be brittle, on the other hand we can also write slightly wrong stuff here without problems
|
||||
|
||||
if newly_created_entries >= maximal_newly_created_entries:
|
||||
continue
|
||||
|
||||
game_type = osgc_entry.get('type', None)
|
||||
osgc_status = osgc_entry.get('status', None)
|
||||
|
||||
# we sort some out here (maybe we want to have a closer look at them later)
|
||||
if osgc_status == 'unplayable':
|
||||
# for now not the unplayable ones
|
||||
continue
|
||||
if 'license' not in osgc_entry or 'As-is' in osgc_entry['license']:
|
||||
# for now not the ones without license or with as-is license
|
||||
continue
|
||||
|
||||
# determine file name
|
||||
print('create new entry for {}'.format(osgc_name))
|
||||
file_name = osg.canonical_name(osgc_name) + '.md'
|
||||
target_file = os.path.join(constants.entries_path, file_name)
|
||||
if os.path.isfile(target_file):
|
||||
print('warning: file {} already existing, save under slightly different name'.format(file_name))
|
||||
target_file = os.path.join(constants.entries_path, file_name[:-3] + '-duplicate.md')
|
||||
if os.path.isfile(target_file):
|
||||
continue # just for safety reasons
|
||||
|
||||
# add name
|
||||
entry = '# {}\n\n'.format(osgc_name)
|
||||
|
||||
# home
|
||||
home = osgc_entry.get('url', None)
|
||||
entry += '- Home: {}\n'.format(home)
|
||||
|
||||
# inspiration
|
||||
if 'originals' in osgc_entry:
|
||||
osgc_originals = osgc_entry['originals']
|
||||
if type(osgc_originals) == str:
|
||||
osgc_originals = [osgc_originals]
|
||||
entry += '- Inspiration: {}\n'.format(', '.join(osgc_originals))
|
||||
|
||||
# state
|
||||
entry += '- State: {}'.format(osgc_status)
|
||||
if 'development' in osgc_entry:
|
||||
if osgc_entry['development'] == 'halted':
|
||||
entry += ', inactive since XX'
|
||||
entry += '\n'
|
||||
|
||||
# language tags
|
||||
lang = osgc_entry.get('lang', [])
|
||||
if type(lang) == str:
|
||||
lang = [lang]
|
||||
# platform 'Web' if language == JavaScript or TypeScript
|
||||
if len(lang) == 1 and lang[0] in ('JavaScript', 'TypeScript'):
|
||||
entry += '- Platform: Web\n'
|
||||
|
||||
# keywords
|
||||
keywords = []
|
||||
if game_type:
|
||||
keywords.append(game_type)
|
||||
if 'multiplayer' in osgc_entry:
|
||||
osgc_multiplayer = osgc_entry['multiplayer']
|
||||
if type(osgc_multiplayer) == str:
|
||||
osgc_multiplayer = [osgc_multiplayer]
|
||||
keywords.append('multiplayer {}'.format(' + '.join(osgc_multiplayer)))
|
||||
if 'content' in osgc_entry:
|
||||
osgc_content = osgc_entry['content'] # it's a list
|
||||
osgc_content = ', '.join(osgc_content)
|
||||
keywords.append(osgc_content)
|
||||
if keywords:
|
||||
entry += '- Keyword: {}\n'.format(', '.join(keywords))
|
||||
|
||||
# code repository (mandatory on our side)
|
||||
repo = osgc_entry.get('repo', None)
|
||||
if repo and repo.startswith('https://git') and not repo.endswith('.git'):
|
||||
# we have them with .git on github/gitlab
|
||||
repo += '.git'
|
||||
entry += '- Code repository: {}\n'.format(repo)
|
||||
|
||||
# code language (mandatory on our side)
|
||||
entry += '- Code language: {}\n'.format(', '.join(lang))
|
||||
|
||||
# code license
|
||||
entry += '- Code license: {}\n'.format(', '.join(osgc_entry['license']))
|
||||
|
||||
# code dependencies (if existing)
|
||||
if 'framework' in osgc_entry:
|
||||
osgc_frameworks = osgc_entry['framework']
|
||||
if type(osgc_frameworks) == str:
|
||||
osgc_frameworks = [osgc_frameworks]
|
||||
entry += '- Code dependency: {}\n'.format(', '.join(osgc_frameworks))
|
||||
|
||||
# add description (already put into Inspiration)
|
||||
# description = '{} of {}.'.format(game_type.capitalize(), ', '.join(osgc_entry['originals']))
|
||||
# entry += '\n{}\n\n'.format(description)
|
||||
|
||||
# write info (if existing)
|
||||
if 'info' in osgc_entry:
|
||||
entry += '\n{}\n\n'.format(osgc_entry['info'])
|
||||
|
||||
# write ## Building
|
||||
entry += '\n## Building\n'
|
||||
|
||||
# finally write to file
|
||||
utils.write_text(target_file, entry)
|
||||
newly_created_entries += 1
|
||||
|
||||
# now iterate over our entries and test if we can add anything to them
|
||||
print('entry that could be added to them')
|
||||
for our_entry in our_entries:
|
||||
our_name = our_entry['Title']
|
||||
|
||||
# only if contains a keyword starting with "inspired by" and not "tool", "framework" or "library"
|
||||
our_keywords = our_entry['Keyword']
|
||||
if not any([x.startswith('inspired by ') for x in our_keywords]):
|
||||
continue
|
||||
if any([x in ['tool', 'library', 'framework'] for x in our_keywords]):
|
||||
continue
|
||||
|
||||
is_included = False
|
||||
for osgc_entry in osgc_entries:
|
||||
osgc_name = osgc_entry['name']
|
||||
|
||||
if osgc_name == our_name:
|
||||
is_included = True
|
||||
|
||||
if not is_included:
|
||||
# that could be added to them
|
||||
print('- [{}]({})'.format(our_name,
|
||||
'https://github.com/Trilarion/opensourcegames/blob/master/entries/' + our_entry[
|
||||
'file']))
|
164
code/synchronization/sourceforge_import.py
Normal file
164
code/synchronization/sourceforge_import.py
Normal file
@ -0,0 +1,164 @@
|
||||
"""
|
||||
Scrapes Sourceforge project sites and adds (mostly developer) information to our database.
|
||||
""" # TODO sourceforge sites that are not existing anymore but we have an archive link, also scrape
|
||||
|
||||
import os
|
||||
import json
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from utils import constants as c, utils, osg, osg_parse
|
||||
|
||||
sf_entries_file = os.path.join(c.code_path, 'sourceforge_entries.txt')
|
||||
prefix = 'https://sourceforge.net/projects/'
|
||||
|
||||
# author names in SF that aren't the author names how we have them
|
||||
SF_alias_list = {'Erik Johansson (aka feneur)': 'Erik Johansson', 'Itms': 'Nicolas Auvray', 'baris yuksel': 'Baris Yuksel',
|
||||
'Wraitii': 'Lancelot de Ferrière', 'Simzer': 'Simon Laszlo', 'armin bajramovic': 'Armin Bajramovic',
|
||||
'bleu tailfly': 'bleutailfly', 'dlh': 'DLH', 'Bjorn Hansen': 'Bjørn Hansen', 'Louens Veen': 'Lourens Veen',
|
||||
'linley_henzell': 'Linley Henzell', 'Patrice DUHAMEL': 'Patrice Duhamel', 'Etienne SOBOLE': 'Etienne Sobole',
|
||||
'L. H. [Lubomír]': 'L. H. Lubomír', 'davidjoffe': 'David Joffe', 'EugeneLoza': 'Eugene Loza',
|
||||
'Kenneth Gangsto': 'Kenneth Gangstø', 'Lucas GAUTHERON': 'Lucas Gautheron', 'Per I Mathisen': 'Per Inge Mathisen',
|
||||
'wrtlprnft': 'Wrzlprnft', 'daniel_santos': 'Daniel Santos', 'Dark_Sylinc': 'darksylinc',
|
||||
'Don Llopis': 'Don E. Llopis', 'dwachs': 'Dwachs', 'Pierre-Loup Griffais': 'Pierre-Loup A. Griffais',
|
||||
'Richard Gobeille': 'Richard C. Gobeille', 'timfelgentreff': 'Tim Felgentreff',
|
||||
'Dr. Martin Brumm': 'Martin Brumm', 'Dr. Wolf-Dieter Beelitz': 'Wolf-Dieter Beelitz'}
|
||||
|
||||
SF_ignore_list = ('', 'Arianne Integration Bot')
|
||||
|
||||
|
||||
def collect_sourceforge_entries():
|
||||
"""
|
||||
Reads the entries of the database and collects all entries with sourceforge as project site
|
||||
"""
|
||||
|
||||
# read entries
|
||||
entries = osg.read_entries()
|
||||
print('{} entries read'.format(len(entries)))
|
||||
|
||||
# loop over entries
|
||||
files = []
|
||||
for entry in entries:
|
||||
urls = [x for x in entry['Home'] if x.startswith(prefix)]
|
||||
if urls:
|
||||
files.append(entry['File'])
|
||||
|
||||
# write to file
|
||||
print('{} entries with sourceforge projects'.format(len(files)))
|
||||
utils.write_text(sf_entries_file, json.dumps(files, indent=1))
|
||||
|
||||
|
||||
def sourceforge_import():
|
||||
"""
|
||||
|
||||
:return:
|
||||
"""
|
||||
files = json.loads(utils.read_text(sf_entries_file))
|
||||
|
||||
all_developers = osg.read_developers()
|
||||
print(' {} developers read'.format(len(all_developers)))
|
||||
all_developers_changed = False
|
||||
|
||||
# all exceptions that happen will be eaten (but will end the execution)
|
||||
try:
|
||||
# loop over each entry
|
||||
for index, file in enumerate(files):
|
||||
print(' process {}'.format(file))
|
||||
|
||||
# read entry
|
||||
entry = osg.read_entry(file)
|
||||
developers = entry.get('Developer', [])
|
||||
urls = [x.value for x in entry['Home'] if x.startswith('https://sourceforge.net/projects/')]
|
||||
|
||||
entry_changed = False
|
||||
|
||||
for url in urls:
|
||||
print(' sf project {}'.format(url))
|
||||
|
||||
if not url.endswith('/'):
|
||||
print('error: sf project does not end with slash')
|
||||
url += '/'
|
||||
|
||||
# members
|
||||
url_members = 'https://sourceforge.net/p/' + url[len(prefix):] + '_members/'
|
||||
response = requests.get(url_members)
|
||||
if response.status_code != 200:
|
||||
print('error: url {} not accessible, status {}'.format(url_members, response.status_code))
|
||||
raise RuntimeError()
|
||||
soup = BeautifulSoup(response.text, 'html.parser')
|
||||
authors = soup.find('div', id='content_base').find('table').find_all('tr')
|
||||
authors = [author.find_all('td') for author in authors]
|
||||
authors = [author[1].a['href'] for author in authors if len(author) == 3]
|
||||
for author in authors:
|
||||
# sometimes author already contains the full url, sometimes not
|
||||
url_author = 'https://sourceforge.net' + author if not author.startswith('http') else author
|
||||
response = requests.get(url_author)
|
||||
if response.status_code != 200 and author not in ('/u/favorito/',):
|
||||
print('error: url {} not accessible, status {}'.format(url_author, response.status_code))
|
||||
raise RuntimeError()
|
||||
url_author = response.url # could be different now
|
||||
if 'auth/?return_to' in url_author or response.status_code != 200:
|
||||
# for some reason authorisation is forbidden or page was not available (happens for example for /u/kantaros)
|
||||
author_name = author[3:-1]
|
||||
nickname = author_name
|
||||
else:
|
||||
soup = BeautifulSoup(response.text, 'html.parser')
|
||||
author_name = soup.h1.get_text()
|
||||
author_name = SF_alias_list.get(author_name, author_name) # replace by alias if possible
|
||||
nickname = soup.find('dl', class_='personal-data').find('dd').get_text()
|
||||
nickname = nickname.replace('\n', '').strip()
|
||||
nickname += '@SF' # our indication of the platform to search for
|
||||
author_name = author_name.strip() # names can still have white spaces before or after
|
||||
|
||||
if author_name in SF_ignore_list:
|
||||
continue
|
||||
|
||||
# look author up in entry developers
|
||||
if author_name not in developers:
|
||||
print(' dev "{}" added to entry {}'.format(author_name, file))
|
||||
entry['Developer'] = entry.get('Developer', []) + [osg_parse.ValueWithComment(author_name)]
|
||||
entry_changed = True
|
||||
developers = entry.get('Developer', [])
|
||||
|
||||
# look author and SF nickname up in developers data base
|
||||
if author_name in all_developers:
|
||||
dev = all_developers[author_name]
|
||||
if not nickname in dev.get('Contact', []):
|
||||
print(' existing dev "{}" added nickname ({}) to developer database'.format(author_name, nickname))
|
||||
# check that name has not already @SF contact
|
||||
if any(x.endswith('@SF') for x in dev.get('Contact', [])):
|
||||
print('warning: already SF contact')
|
||||
all_developers[author_name]['Contact'] = dev.get('Contact', []) + [nickname]
|
||||
all_developers_changed = True
|
||||
else:
|
||||
print(' dev "{}" ({}) added to developer database'.format(author_name, nickname))
|
||||
all_developers[author_name] = {'Name': author_name, 'Contact': [nickname], 'Games': [entry['Title']]}
|
||||
all_developers_changed = True
|
||||
|
||||
if entry_changed:
|
||||
# save entry
|
||||
osg.write_entry(entry)
|
||||
print(' entry updated')
|
||||
except:
|
||||
raise
|
||||
finally:
|
||||
# shorten file list
|
||||
utils.write_text(sf_entries_file, json.dumps(files[index:], indent=1))
|
||||
|
||||
# save entry
|
||||
osg.write_entry(entry)
|
||||
print(' entry updated')
|
||||
|
||||
# maybe save all developers
|
||||
if all_developers_changed:
|
||||
# save all developers
|
||||
osg.write_developers(all_developers)
|
||||
print('developers database updated')
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
# collect entries
|
||||
collect_sourceforge_entries()
|
||||
|
||||
# import information from sf
|
||||
# sourceforge_import()
|
Reference in New Issue
Block a user