maintenance scripts improved
parent 82af77b017
commit c985780dc2
@@ -1,497 +1,10 @@
"""
Runs a series of maintenance operations on the collection of entry files, updating the table of contents files for
each category as well as creating a statistics file.

Counts the number of records in each sub-folder and updates the overview.
Sorts the entries in the contents files of each sub-folder alphabetically.

This script runs with Python 3; it could probably also run with Python 2 with some minor tweaks.
"""
|
||||
|
||||
import requests
|
||||
import datetime
|
||||
import json
|
||||
import textwrap
|
||||
import os
|
||||
import re
|
||||
|
||||
import utils.constants
|
||||
from utils import constants as c, utils, osg
|
||||
|
||||
def extract_links():
|
||||
"""
|
||||
Parses all entries and extracts http(s) links from them
|
||||
"""
|
||||
|
||||
# regex for finding urls (can be in <> or in ]() or after a whitespace
|
||||
regex = re.compile(r"[\s\n]<(http.+?)>|]\((http.+?)\)|[\s\n](http[^\s\n,]+?)[\s\n,]")
|
||||
|
||||
# iterate over all entries
|
||||
urls = set()
|
||||
for _, _, content in entry_iterator():
|
||||
|
||||
# apply regex
|
||||
matches = regex.findall(content)
|
||||
|
||||
# for each match
|
||||
for match in matches:
|
||||
|
||||
# for each possible clause
|
||||
for url in match:
|
||||
|
||||
# if there was something (and not a sourceforge git url)
|
||||
if url:
|
||||
urls.add(url)
|
||||
urls = sorted(list(urls), key=str.casefold)
|
||||
return urls
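# Illustrative sketch (not part of the original script): shows what the URL-extraction regex
# above matches on a small sample string. The sample text and the expected result are
# assumptions made purely for demonstration.
def _demo_extract_links():
    sample = " See <https://example.org/a> and [docs](http://example.org/b) or visit http://example.org/c \n"
    demo_regex = re.compile(r"[\s\n]<(http.+?)>|]\((http.+?)\)|[\s\n](http[^\s\n,]+?)[\s\n,]")
    # findall() returns one tuple per match with one group per alternative; keep the non-empty group
    found = [group for match in demo_regex.findall(sample) for group in match if group]
    # expected: ['https://example.org/a', 'http://example.org/b', 'http://example.org/c']
    return found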
|
||||
|
||||
|
||||
|
||||
|
||||
def check_validity_external_links():
|
||||
"""
|
||||
Checks all external links it can find for validity. Prints those with non OK HTTP responses. Does only need to be run
|
||||
from time to time.
|
||||
"""
|
||||
|
||||
    # regex for finding urls (they can be in <>, in ](), or after whitespace)
|
||||
regex = re.compile(r"[\s\n]<(http.+?)>|\]\((http.+?)\)|[\s\n](http[^\s\n,]+?)[\s\n\)]")
|
||||
|
||||
# ignore the following patterns (they give false positives here)
|
||||
ignored_urls = ('https://git.tukaani.org/xz.git', 'https://git.code.sf.net/', 'http://hg.hedgewars.org/hedgewars/', 'https://git.xiph.org/vorbis.git', 'http://svn.uktrainsim.com/svn/openrails', 'https://www.srb2.org/', 'http://wiki.srb2.org/')
|
||||
|
||||
    # some do redirect, but we nevertheless want the original URL in the database
|
||||
redirect_okay = ('https://octaforge.org/', 'https://svn.openttd.org/', 'https://godotengine.org/download')
|
||||
|
||||
# extract all links from entries
|
||||
import urllib3
|
||||
urllib3.disable_warnings() # otherwise we cannot verify those with SSL errors without getting warnings
|
||||
urls = {}
|
||||
for entry, _, content in osg.entry_iterator():
|
||||
# apply regex
|
||||
matches = regex.findall(content)
|
||||
# for each match
|
||||
for match in matches:
|
||||
for url in match:
|
||||
if url and not any((url.startswith(x) for x in ignored_urls)):
|
||||
# ignore bzr.sourceforge, no web address found
|
||||
if 'bzr.sourceforge.net/bzrroot/' in url:
|
||||
continue
|
||||
|
||||
# add "/" at the end
|
||||
if any((url.startswith(x) for x in ('https://anongit.freedesktop.org/git', 'https://git.savannah.gnu.org/git/', 'https://git.savannah.nongnu.org/git/', 'https://git.artsoft.org/'))):
|
||||
url += '/'
|
||||
|
||||
if url.startswith('https://bitbucket.org/') and url.endswith('.git'):
|
||||
url = url[:-4] + '/commits/'
|
||||
if url.startswith('https://svn.code.sf.net/p/'):
|
||||
url = 'http' + url[5:] + '/'
|
||||
if url.startswith('http://cvs.savannah.nongnu.org:/sources/'):
|
||||
url = 'http://cvs.savannah.nongnu.org/viewvc/' + url[40:] + '/'
|
||||
if url.startswith('http://cvs.savannah.gnu.org:/sources/'):
|
||||
url = 'http://cvs.savannah.gnu.org/viewvc/' + url[37:] + '/'
|
||||
|
||||
# generally ".git" at the end is not working well, except sometimes
|
||||
if url.endswith('.git') and not any((url.startswith(x) for x in ('https://repo.or.cz', 'https://git.tuxfamily.org/fanwor/fanwor'))):
|
||||
url = url[:-4]
|
||||
|
||||
if url in urls:
|
||||
urls[url].add(entry)
|
||||
else:
|
||||
urls[url] = {entry}
|
||||
print('found {} unique links'.format(len(urls)))
|
||||
print("start checking external links (can take a while)")
|
||||
|
||||
# now iterate over all urls
|
||||
for url, names in urls.items():
|
||||
names = list(names) # was a set
|
||||
if len(names) == 1:
|
||||
names = names[0]
|
||||
try:
|
||||
verify = True
|
||||
# some have an expired certificate but otherwise still work
|
||||
if any((url.startswith(x) for x in ('https://perso.b2b2c.ca/~sarrazip/dev/', 'https://dreerally.com/', 'https://henlin.net/', 'https://www.megamek.org/', 'https://pixeldoctrine.com/', 'https://gitorious.org/', 'https://www.opmon-game.ga/'))):
|
||||
verify = False
|
||||
r = requests.head(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64)'}, timeout=20, allow_redirects=True, verify=verify)
|
||||
if r.status_code == 405: # head method not supported, try get
|
||||
r = requests.get(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64)'}, timeout=20, allow_redirects=True, verify=verify)
|
||||
# check for bad status
|
||||
if r.status_code != requests.codes.ok:
|
||||
print('{}: {} - {}'.format(names, url, r.status_code))
|
||||
# check for redirect
|
||||
if r.history and url not in redirect_okay:
|
||||
# only / added or http->https sometimes
|
||||
redirected_url = r.url
|
||||
if redirected_url == url + '/':
|
||||
output = '{}: {} -> {} - redirect "/" at end '
|
||||
elif redirected_url == 'https' + url[4:]:
|
||||
output = '{}: {} -> {} - redirect "https" at start'
|
||||
else:
|
||||
output = '{}: {} -> {} - redirect '
|
||||
print(output.format(names, url, redirected_url))
|
||||
except Exception as e:
|
||||
error_name = type(e).__name__
|
||||
if error_name == 'SSLError' and any((url.startswith(x) for x in ('https://gitorious.org/', 'https://www.freedroid.org/download/'))):
|
||||
continue # even though verify is False, these errors still get through
|
||||
print('{}: {} - exception {}'.format(names, url, error_name))
|
||||
|
||||
|
||||
def fix_entries():
|
||||
"""
|
||||
Fixes the keywords, code dependencies, build systems, .. entries, mostly by automatically sorting them.
|
||||
"""
|
||||
|
||||
keyword_synonyms = {'RTS': ('real time', 'strategy'), 'realtime': 'real time'}
|
||||
|
||||
# TODO also sort other fields, only read once and then do all, move to separate file
|
||||
# example Javascript to JavaScript and then add whenever the known languages check hits
|
||||
|
||||
print('fix entries')
|
||||
|
||||
# keywords
|
||||
regex = re.compile(r"(.*)- Keywords:([^\n]*)(.*)", re.DOTALL)
|
||||
|
||||
# iterate over all entries
|
||||
for entry, entry_path, content in osg.entry_iterator():
|
||||
|
||||
# match with regex
|
||||
matches = regex.findall(content)
|
||||
if len(matches) != 1:
|
||||
raise RuntimeError('Could not find keywords in entry "{}"'.format(entry))
|
||||
|
||||
match = matches[0]
|
||||
|
||||
# get elements out, split, strip, delete duplicates
|
||||
elements = match[1].split(',')
|
||||
elements = [x.strip() for x in elements]
|
||||
elements = list(set(elements))
|
||||
|
||||
# get category out
|
||||
for keyword in utils.constants.recommended_keywords:
|
||||
if keyword in elements:
|
||||
elements.remove(keyword)
|
||||
category = keyword
|
||||
break
|
||||
|
||||
# special treatments here
|
||||
elements = [x if x != 'TBS' and x != 'TB' else 'turn based' for x in elements]
|
||||
elements = [x if x != 'RTS' else 'real time' for x in elements]
|
||||
        elements = [x if x != 'MMO' else 'massive multiplayer online' for x in elements]
|
||||
elements = [x if x != 'SP' else 'singleplayer' for x in elements]
|
||||
elements = [x if x != 'MP' else 'multiplayer' for x in elements]
|
||||
elements = [x if x != 'engine' else 'game engine' for x in elements]
|
||||
elements = [x if x != 'rpg' else 'role playing' for x in elements]
|
||||
elements = [x if x != 'turn based' else 'turn-based' for x in elements]
|
||||
for keyword in ('browser', 'misc', 'tools'):
|
||||
if keyword in elements:
|
||||
elements.remove(keyword)
|
||||
|
||||
# sort
|
||||
elements.sort(key=str.casefold)
|
||||
|
||||
# add category
|
||||
elements.insert(0, category)
|
||||
|
||||
keywords = '- Keywords: {}'.format(', '.join(elements))
|
||||
|
||||
new_content = match[0] + keywords + match[2]
|
||||
|
||||
if new_content != content:
|
||||
# write again
|
||||
utils.write_text(entry_path, new_content)
|
||||
|
||||
# code dependencies
|
||||
regex = re.compile(r"(.*)- Code dependencies:([^\n]*)(.*)", re.DOTALL)
|
||||
|
||||
# iterate over all entries
|
||||
for entry, entry_path, content in osg.entry_iterator():
|
||||
# match with regex
|
||||
matches = regex.findall(content)
|
||||
|
||||
if not matches:
|
||||
# no code dependencies given
|
||||
continue
|
||||
|
||||
match = matches[0]
|
||||
|
||||
# get code dependencies out, split, strip, delete duplicates
|
||||
elements = match[1].split(',')
|
||||
elements = [x.strip() for x in elements]
|
||||
elements = list(set(elements))
|
||||
|
||||
# special treatments here
|
||||
elements = [x if x != 'Blender' else 'Blender game engine' for x in elements]
|
||||
elements = [x if x.lower() != 'libgdx' else 'libGDX' for x in elements]
|
||||
elements = [x if x != 'SDL 2' else 'SDL2' for x in elements]
|
||||
elements = [x if x.lower() != "ren'py" else "Ren'Py" for x in elements]
|
||||
|
||||
# sort
|
||||
elements.sort(key=str.casefold)
|
||||
|
||||
code_dependencies = '- Code dependencies: {}'.format(', '.join(elements))
|
||||
|
||||
new_content = match[0] + code_dependencies + match[2]
|
||||
|
||||
if new_content != content:
|
||||
# write again
|
||||
utils.write_text(entry_path, new_content)
|
||||
|
||||
# build systems
|
||||
regex = re.compile(r"(.*)- Build system:([^\n]*)(.*)", re.DOTALL)
|
||||
|
||||
# iterate over all entries
|
||||
for entry, entry_path, content in osg.entry_iterator():
|
||||
# match with regex
|
||||
matches = regex.findall(content)
|
||||
|
||||
if not matches:
|
||||
# no build system given
|
||||
continue
|
||||
|
||||
match = matches[0]
|
||||
|
||||
# get code dependencies out, split, strip, delete duplicates
|
||||
elements = match[1].split(',')
|
||||
elements = [x.strip() for x in elements]
|
||||
elements = list(set(elements))
|
||||
|
||||
# special treatments here
|
||||
|
||||
# sort
|
||||
elements.sort(key=str.casefold)
|
||||
|
||||
build_system = '- Build system: {}'.format(', '.join(elements))
|
||||
|
||||
new_content = match[0] + build_system + match[2]
|
||||
|
||||
if new_content != content:
|
||||
# write again
|
||||
utils.write_text(entry_path, new_content)
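# Illustrative sketch (not part of the original script): a condensed version of the keyword
# normalization performed above, applied to a single sample value. It assumes that 'strategy'
# is contained in utils.constants.recommended_keywords.
def _demo_fix_keywords():
    elements = [x.strip() for x in 'strategy, RTS, MP'.split(',')]
    category = next(x for x in elements if x in utils.constants.recommended_keywords)  # 'strategy'
    elements.remove(category)
    synonyms = {'RTS': 'real time', 'MP': 'multiplayer'}  # abbreviations expanded as above
    elements = [synonyms.get(x, x) for x in elements]
    elements.sort(key=str.casefold)
    elements.insert(0, category)  # category goes first
    # result: '- Keywords: strategy, multiplayer, real time'
    return '- Keywords: {}'.format(', '.join(elements))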
|
||||
|
||||
|
||||
def update_statistics(infos):
|
||||
"""
|
||||
Generates the statistics page.
|
||||
|
||||
Should be done every time the entries change.
|
||||
"""
|
||||
|
||||
print('update statistics')
|
||||
|
||||
# start the page
|
||||
statistics_file = os.path.join(c.root_path, 'statistics.md')
|
||||
statistics = '[comment]: # (autogenerated content, do not edit)\n# Statistics\n\n'
|
||||
|
||||
# total number
|
||||
number_entries = len(infos)
|
||||
rel = lambda x: x / number_entries * 100 # conversion to percent
|
||||
|
||||
statistics += 'analyzed {} entries on {}\n\n'.format(number_entries,
|
||||
datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
|
||||
|
||||
# State (beta, mature, inactive)
|
||||
statistics += '## State\n\n'
|
||||
|
||||
number_state_beta = sum(1 for x in infos if 'beta' in x['state'])
|
||||
number_state_mature = sum(1 for x in infos if 'mature' in x['state'])
|
||||
number_inactive = sum(1 for x in infos if 'inactive' in x)
|
||||
statistics += '- mature: {} ({:.1f}%)\n- beta: {} ({:.1f}%)\n- inactive: {} ({:.1f}%)\n\n'.format(
|
||||
number_state_mature, rel(number_state_mature), number_state_beta, rel(number_state_beta), number_inactive,
|
||||
rel(number_inactive))
|
||||
|
||||
if number_inactive > 0:
|
||||
entries_inactive = [(x['Name'], x['inactive']) for x in infos if 'inactive' in x]
|
||||
entries_inactive.sort(key=lambda x: str.casefold(x[0])) # first sort by name
|
||||
entries_inactive.sort(key=lambda x: x[1], reverse=True) # then sort by inactive year (more recently first)
|
||||
entries_inactive = ['{} ({})'.format(*x) for x in entries_inactive]
|
||||
statistics += '##### Inactive State\n\n' + ', '.join(entries_inactive) + '\n\n'
|
||||
|
||||
# Language
|
||||
statistics += '## Code Languages\n\n'
|
||||
field = 'code language'
|
||||
|
||||
# those without language tag
|
||||
# TODO the language tag is now an essential field, this cannot happen anymore
|
||||
# number_no_language = sum(1 for x in infois if field not in x)
|
||||
# if number_no_language > 0:
|
||||
# statistics += 'Without language tag: {} ({:.1f}%)\n\n'.format(number_no_language, rel(number_no_language))
|
||||
# entries_no_language = [x['Name'] for x in infois if field not in x]
|
||||
# entries_no_language.sort()
|
||||
# statistics += ', '.join(entries_no_language) + '\n\n'
|
||||
|
||||
# get all languages together
|
||||
languages = []
|
||||
for info in infos:
|
||||
if field in info:
|
||||
languages.extend(info[field])
|
||||
|
||||
unique_languages = set(languages)
|
||||
unique_languages = [(l, languages.count(l) / len(languages)) for l in unique_languages]
|
||||
unique_languages.sort(key=lambda x: str.casefold(x[0])) # first sort by name
|
||||
unique_languages.sort(key=lambda x: x[1], reverse=True) # then sort by occurrence (highest occurrence first)
|
||||
unique_languages = ['- {} ({:.1f}%)\n'.format(x[0], x[1] * 100) for x in unique_languages]
|
||||
statistics += '##### Language frequency\n\n' + ''.join(unique_languages) + '\n'
|
||||
|
||||
# Licenses
|
||||
statistics += '## Code licenses\n\n'
|
||||
field = 'code license'
|
||||
|
||||
# those without license
|
||||
number_no_license = sum(1 for x in infos if field not in x)
|
||||
if number_no_license > 0:
|
||||
statistics += 'Without license tag: {} ({:.1f}%)\n\n'.format(number_no_license, rel(number_no_license))
|
||||
entries_no_license = [x['Name'] for x in infos if field not in x]
|
||||
entries_no_license.sort()
|
||||
statistics += ', '.join(entries_no_license) + '\n\n'
|
||||
|
||||
# get all licenses together
|
||||
licenses = []
|
||||
for info in infos:
|
||||
if field in info:
|
||||
licenses.extend(info[field])
|
||||
|
||||
unique_licenses = set(licenses)
|
||||
unique_licenses = [(l, licenses.count(l) / len(licenses)) for l in unique_licenses]
|
||||
unique_licenses.sort(key=lambda x: str.casefold(x[0])) # first sort by name
|
||||
unique_licenses.sort(key=lambda x: -x[1]) # then sort by occurrence (highest occurrence first)
|
||||
unique_licenses = ['- {} ({:.1f}%)\n'.format(x[0], x[1] * 100) for x in unique_licenses]
|
||||
statistics += '##### Licenses frequency\n\n' + ''.join(unique_licenses) + '\n'
|
||||
|
||||
# Keywords
|
||||
statistics += '## Keywords\n\n'
|
||||
field = 'keywords'
|
||||
|
||||
# get all keywords together
|
||||
keywords = []
|
||||
for info in infos:
|
||||
if field in info:
|
||||
keywords.extend(info[field])
|
||||
# reduce those starting with "inspired by"
|
||||
keywords = [x if not x.startswith('inspired by') else 'inspired' for x in keywords]
|
||||
# reduce those starting with "multiplayer"
|
||||
keywords = [x if not x.startswith('multiplayer') else 'multiplayer' for x in keywords]
|
||||
|
||||
unique_keywords = set(keywords)
|
||||
unique_keywords = [(l, keywords.count(l) / len(keywords)) for l in unique_keywords]
|
||||
unique_keywords.sort(key=lambda x: str.casefold(x[0])) # first sort by name
|
||||
unique_keywords.sort(key=lambda x: -x[1]) # then sort by occurrence (highest occurrence first)
|
||||
unique_keywords = ['- {} ({:.1f}%)'.format(x[0], x[1] * 100) for x in unique_keywords]
|
||||
statistics += '##### Keywords frequency\n\n' + '\n'.join(unique_keywords) + '\n\n'
|
||||
|
||||
# no download or play field
|
||||
statistics += '## Entries without download or play fields\n\n'
|
||||
|
||||
entries = []
|
||||
for info in infos:
|
||||
if 'download' not in info and 'play' not in info:
|
||||
entries.append(info['Name'])
|
||||
entries.sort(key=str.casefold)
|
||||
statistics += '{}: '.format(len(entries)) + ', '.join(entries) + '\n\n'
|
||||
|
||||
# code hosted not on github, gitlab, bitbucket, launchpad, sourceforge
|
||||
popular_code_repositories = ('github.com', 'gitlab.com', 'bitbucket.org', 'code.sf.net', 'code.launchpad.net')
|
||||
statistics += '## Entries with a code repository not on a popular site\n\n'
|
||||
|
||||
entries = []
|
||||
field = 'code repository'
|
||||
for info in infos:
|
||||
if field in info:
|
||||
popular = False
|
||||
for repo in info[field]:
|
||||
for popular_repo in popular_code_repositories:
|
||||
if popular_repo in repo:
|
||||
popular = True
|
||||
break
|
||||
# if there were repositories, but none popular, add them to the list
|
||||
if not popular:
|
||||
entries.append(info['Name'])
|
||||
# print(info[field])
|
||||
entries.sort(key=str.casefold)
|
||||
statistics += '{}: '.format(len(entries)) + ', '.join(entries) + '\n\n'
|
||||
|
||||
# Code dependencies
|
||||
statistics += '## Code dependencies\n\n'
|
||||
field = 'code dependencies'
|
||||
|
||||
# get all code dependencies together
|
||||
code_dependencies = []
|
||||
entries_with_code_dependency = 0
|
||||
for info in infos:
|
||||
if field in info:
|
||||
code_dependencies.extend(info[field])
|
||||
entries_with_code_dependency += 1
|
||||
statistics += 'With code dependency field {} ({:.1f}%)\n\n'.format(entries_with_code_dependency,
|
||||
rel(entries_with_code_dependency))
|
||||
|
||||
unique_code_dependencies = set(code_dependencies)
|
||||
unique_code_dependencies = [(l, code_dependencies.count(l) / len(code_dependencies)) for l in
|
||||
unique_code_dependencies]
|
||||
unique_code_dependencies.sort(key=lambda x: str.casefold(x[0])) # first sort by name
|
||||
unique_code_dependencies.sort(key=lambda x: -x[1]) # then sort by occurrence (highest occurrence first)
|
||||
unique_code_dependencies = ['- {} ({:.1f}%)'.format(x[0], x[1] * 100) for x in unique_code_dependencies]
|
||||
statistics += '##### Code dependencies frequency\n\n' + '\n'.join(unique_code_dependencies) + '\n\n'
|
||||
|
||||
# Build systems:
|
||||
statistics += '## Build systems\n\n'
|
||||
field = 'build system'
|
||||
|
||||
# get all build systems together
|
||||
build_systems = []
|
||||
for info in infos:
|
||||
if field in info:
|
||||
build_systems.extend(info[field])
|
||||
|
||||
statistics += 'Build systems information available for {:.1f}% of all projects.\n\n'.format(rel(len(build_systems)))
|
||||
|
||||
unique_build_systems = set(build_systems)
|
||||
unique_build_systems = [(l, build_systems.count(l) / len(build_systems)) for l in unique_build_systems]
|
||||
unique_build_systems.sort(key=lambda x: str.casefold(x[0])) # first sort by name
|
||||
unique_build_systems.sort(key=lambda x: -x[1]) # then sort by occurrence (highest occurrence first)
|
||||
unique_build_systems = ['- {} ({:.1f}%)'.format(x[0], x[1] * 100) for x in unique_build_systems]
|
||||
statistics += '##### Build systems frequency ({})\n\n'.format(len(build_systems)) + '\n'.join(
|
||||
unique_build_systems) + '\n\n'
|
||||
|
||||
# C, C++ projects without build system information
|
||||
c_cpp_project_without_build_system = []
|
||||
for info in infos:
|
||||
if field not in info and ('C' in info['code language'] or 'C++' in info['code language']):
|
||||
c_cpp_project_without_build_system.append(info['Name'])
|
||||
c_cpp_project_without_build_system.sort(key=str.casefold)
|
||||
statistics += '##### C and C++ projects without build system information ({})\n\n'.format(
|
||||
len(c_cpp_project_without_build_system)) + ', '.join(c_cpp_project_without_build_system) + '\n\n'
|
||||
|
||||
# C, C++ projects with build system information but without CMake as build system
|
||||
c_cpp_project_not_cmake = []
|
||||
for info in infos:
|
||||
if field in info and 'CMake' in info[field] and (
|
||||
'C' in info['code language'] or 'C++' in info['code language']):
|
||||
c_cpp_project_not_cmake.append(info['Name'])
|
||||
c_cpp_project_not_cmake.sort(key=str.casefold)
|
||||
statistics += '##### C and C++ projects with a build system different from CMake ({})\n\n'.format(
|
||||
len(c_cpp_project_not_cmake)) + ', '.join(c_cpp_project_not_cmake) + '\n\n'
|
||||
|
||||
# Platform
|
||||
statistics += '## Platform\n\n'
|
||||
field = 'platform'
|
||||
|
||||
# get all platforms together
|
||||
platforms = []
|
||||
for info in infos:
|
||||
if field in info:
|
||||
platforms.extend(info[field])
|
||||
|
||||
statistics += 'Platform information available for {:.1f}% of all projects.\n\n'.format(rel(len(platforms)))
|
||||
|
||||
unique_platforms = set(platforms)
|
||||
unique_platforms = [(l, platforms.count(l) / len(platforms)) for l in unique_platforms]
|
||||
unique_platforms.sort(key=lambda x: str.casefold(x[0])) # first sort by name
|
||||
unique_platforms.sort(key=lambda x: -x[1]) # then sort by occurrence (highest occurrence first)
|
||||
unique_platforms = ['- {} ({:.1f}%)'.format(x[0], x[1] * 100) for x in unique_platforms]
|
||||
statistics += '##### Platforms frequency\n\n' + '\n'.join(unique_platforms) + '\n\n'
|
||||
|
||||
# write to statistics file
|
||||
utils.write_text(statistics_file, statistics)
|
||||
from utils import constants as c, utils
|
||||
|
||||
|
||||
def export_json(infos):
|
||||
@@ -720,18 +233,6 @@ def export_git_code_repositories_json():
|
||||
utils.write_text(json_path, text)
|
||||
|
||||
|
||||
def sort_text_file(file, name):
|
||||
"""
|
||||
    Reads a text file, splits it into lines, removes duplicates, sorts, and writes it back.
|
||||
"""
|
||||
text = utils.read_text(file)
|
||||
text = text.split('\n')
|
||||
text = sorted(list(set(text)), key=str.casefold)
|
||||
print('{} contains {} items'.format(name, len(text)))
|
||||
text = '\n'.join(text)
|
||||
utils.write_text(file, text)
|
||||
|
||||
|
||||
def check_validity_backlog():
|
||||
import requests
|
||||
|
||||
@@ -792,36 +293,6 @@ def check_code_dependencies(infos):
|
||||
print('{} ({})'.format(*dep))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
check_validity_backlog()
|
||||
|
||||
|
||||
|
||||
|
||||
# fix entries
|
||||
fix_entries()
|
||||
|
||||
# recount and write to readme and to tocs
|
||||
update_readme_and_tocs(infos)
|
||||
|
||||
# generate report
|
||||
update_statistics(infos)
|
||||
|
||||
# update database for html table
|
||||
export_json(infos)
|
||||
|
||||
# collect list of primary code repositories
|
||||
export_primary_code_repositories_json(infos)
|
||||
|
||||
# check code dependencies
|
||||
check_code_dependencies(infos)
|
||||
|
||||
# collect list of git code repositories (only one per project) for git_statistics script
|
||||
export_git_code_repositories_json()
|
||||
|
||||
# check external links (only rarely)
|
||||
# check_validity_external_links()
|
||||
|
||||
# sort rejected games list file
|
||||
sort_text_file(os.path.join(c.root_path, 'code', 'rejected.txt'), 'rejected games list')
|
||||
|
581
code/maintenance_entries.py
Normal file
@@ -0,0 +1,581 @@
|
||||
"""
|
||||
Runs a series of maintenance operations on the collection of entry files, updating the table of content files for
|
||||
each category as well as creating a statistics file.
|
||||
|
||||
Counts the number of records each sub-folder and updates the overview.
|
||||
Sorts the entries in the contents files of each sub folder alphabetically.
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import datetime
|
||||
from utils import osg, osg_ui, utils, constants as c
|
||||
import requests
|
||||
|
||||
|
||||
def create_toc(title, file, entries):
|
||||
"""
|
||||
|
||||
"""
|
||||
# file path
|
||||
toc_file = os.path.join(c.tocs_path, file)
|
||||
|
||||
# header line
|
||||
text = '[comment]: # (autogenerated content, do not edit)\n# {}\n\n'.format(title)
|
||||
|
||||
# assemble rows
|
||||
rows = []
|
||||
for entry in entries:
|
||||
info = entry['Code language'] + entry['Code license'] + entry['State']
|
||||
info = [x.value for x in info]
|
||||
rows.append('- **[{}]({})** ({})'.format(entry['Title'], '../' + entry['File'], ', '.join(info)))
|
||||
|
||||
# sort rows (by title)
|
||||
rows.sort(key=str.casefold)
|
||||
|
||||
# add to text
|
||||
text += '\n'.join(rows)
|
||||
|
||||
# write to toc file
|
||||
utils.write_text(toc_file, text)
|
||||
|
||||
print('Readme and TOCs updated')
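# Illustrative example (not part of the original script): for a hypothetical entry with
# Title 'Foo', File 'foo.md', Code language ['C++'], Code license ['MIT'] and State ['mature'],
# create_toc() would emit the row
#   - **[Foo](../foo.md)** (C++, MIT, mature)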
|
||||
|
||||
|
||||
def sort_text_file(file, name):
|
||||
"""
|
||||
    Reads a text file, splits it into lines, removes duplicates, sorts, and writes it back.
|
||||
"""
|
||||
text = utils.read_text(file)
|
||||
text = text.split('\n')
|
||||
text = sorted(list(set(text)), key=str.casefold)
|
||||
print('{} contains {} items'.format(name, len(text)))
|
||||
text = '\n'.join(text)
|
||||
utils.write_text(file, text)
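# Illustrative example (not part of the original script): given a file containing the lines
# 'banana', 'Apple' and a duplicate 'banana', sort_text_file() rewrites it as the two lines
# 'Apple' and 'banana' (duplicates removed, sorted case-insensitively) and prints the item count.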
|
||||
|
||||
|
||||
class EntriesMaintainer:
|
||||
|
||||
def __init__(self):
|
||||
self.entries = None
|
||||
|
||||
def read_entries(self):
|
||||
self.entries = osg.read_entries()
|
||||
print('{} entries read'.format(len(self.entries)))
|
||||
|
||||
def write_entries(self):
|
||||
if not self.entries:
|
||||
print('entries not yet loaded')
|
||||
return
|
||||
osg.write_entries(self.entries)
|
||||
print('entries written')
|
||||
|
||||
|
||||
def check_template_leftovers(self):
|
||||
"""
|
||||
Checks for template leftovers.
|
||||
Should be run only occasionally.
|
||||
"""
|
||||
# load template and get all lines
|
||||
text = utils.read_text(os.path.join(c.root_path, 'template.md'))
|
||||
text = text.split('\n')
|
||||
check_strings = [x for x in text if x and not x.startswith('##')]
|
||||
|
||||
# iterate over all entries
|
||||
for _, entry_path, content in osg.entry_iterator():
|
||||
|
||||
for check_string in check_strings:
|
||||
if content.find(check_string) >= 0:
|
||||
print('{}: found {}'.format(os.path.basename(entry_path), check_string))
|
||||
print('checked for template leftovers')
|
||||
|
||||
def clean_rejected(self):
|
||||
"""
|
||||
|
||||
:return:
|
||||
"""
|
||||
# sort rejected games list file
|
||||
sort_text_file(os.path.join(c.root_path, 'code', 'rejected.txt'), 'rejected games list')
|
||||
|
||||
def clean_backlog(self):
|
||||
"""
|
||||
|
||||
:return:
|
||||
"""
|
||||
if not self.entries:
|
||||
print('entries not yet loaded')
|
||||
return
|
||||
# get urls from entries
|
||||
included_urls = osg.all_urls(self.entries)
|
||||
included_urls = list(included_urls.keys()) # only need the URLs here
|
||||
|
||||
# get urls from rejected file
|
||||
text = utils.read_text(c.rejected_file)
|
||||
regex = re.compile(r"\((http.*?)\)", re.MULTILINE)
|
||||
matches = regex.findall(text)
|
||||
rejected_urls = []
|
||||
for match in matches:
|
||||
urls = match.split(',')
|
||||
urls = [x.strip() for x in urls]
|
||||
rejected_urls.extend(urls)
|
||||
included_urls.extend(rejected_urls)
|
||||
|
||||
        # for those that only have a web archive version, also add the original version
|
||||
more_urls = []
|
||||
for url in included_urls:
|
||||
if url.startswith('https://web.archive.org/web'):
|
||||
# print(url) # sometimes the http is missing in archive links (would need proper parsing)
|
||||
url = url[url.index('http', 5):]
|
||||
more_urls.append(url)
|
||||
included_urls.extend(more_urls)
|
||||
|
||||
# now we strip the urls
|
||||
stripped_urls = [utils.strip_url(x) for x in included_urls]
|
||||
stripped_urls = set(stripped_urls) # removes duplicates for performance
|
||||
|
||||
# read backlog and get urls from there
|
||||
text = utils.read_text(c.backlog_file)
|
||||
text = text.split('\n')
|
||||
|
||||
# remove those that are in stripped_game_urls
|
||||
text = [x for x in text if utils.strip_url(x) not in stripped_urls]
|
||||
|
||||
# remove duplicates and sort
|
||||
text = sorted(list(set(text)), key=str.casefold)
|
||||
print('backlog contains {} items'.format(len(text)))
|
||||
|
||||
# join and save again
|
||||
text = '\n'.join(text)
|
||||
utils.write_text(c.backlog_file, text)
|
||||
|
||||
print('backlog cleaned')
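    # Illustrative example (not part of the original script): for an archived link such as
    #   https://web.archive.org/web/20150802151352/http://www.example.org/
    # the slice url[url.index('http', 5):] used above recovers the original URL
    #   http://www.example.org/
    # so that both the archived and the original form count as already included.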
|
||||
|
||||
def check_external_links(self):
|
||||
"""
|
||||
Checks all external links it can find for validity. Prints those with non OK HTTP responses. Does only need to be run
|
||||
from time to time.
|
||||
"""
|
||||
|
||||
        # regex for finding urls (they can be in <>, in ](), or after whitespace)
|
||||
regex = re.compile(r"[\s\n]<(http.+?)>|\]\((http.+?)\)|[\s\n](http[^\s\n,]+?)[\s\n\)]")
|
||||
|
||||
# ignore the following patterns (they give false positives here)
|
||||
ignored_urls = (
|
||||
'https://git.tukaani.org/xz.git', 'https://git.code.sf.net/', 'http://hg.hedgewars.org/hedgewars/',
|
||||
'https://git.xiph.org/vorbis.git', 'http://svn.uktrainsim.com/svn/openrails', 'https://www.srb2.org/',
|
||||
'http://wiki.srb2.org/')
|
||||
|
||||
        # some do redirect, but we nevertheless want the original URL in the database
|
||||
redirect_okay = ('https://octaforge.org/', 'https://svn.openttd.org/', 'https://godotengine.org/download')
|
||||
|
||||
# extract all links from entries
|
||||
import urllib3
|
||||
urllib3.disable_warnings() # otherwise we cannot verify those with SSL errors without getting warnings
|
||||
urls = {}
|
||||
for entry, _, content in osg.entry_iterator():
|
||||
# apply regex
|
||||
matches = regex.findall(content)
|
||||
# for each match
|
||||
for match in matches:
|
||||
for url in match:
|
||||
if url and not any((url.startswith(x) for x in ignored_urls)):
|
||||
# ignore bzr.sourceforge, no web address found
|
||||
if 'bzr.sourceforge.net/bzrroot/' in url:
|
||||
continue
|
||||
|
||||
# add "/" at the end
|
||||
if any((url.startswith(x) for x in (
|
||||
'https://anongit.freedesktop.org/git', 'https://git.savannah.gnu.org/git/',
|
||||
'https://git.savannah.nongnu.org/git/', 'https://git.artsoft.org/'))):
|
||||
url += '/'
|
||||
|
||||
if url.startswith('https://bitbucket.org/') and url.endswith('.git'):
|
||||
url = url[:-4] + '/commits/'
|
||||
if url.startswith('https://svn.code.sf.net/p/'):
|
||||
url = 'http' + url[5:] + '/'
|
||||
if url.startswith('http://cvs.savannah.nongnu.org:/sources/'):
|
||||
url = 'http://cvs.savannah.nongnu.org/viewvc/' + url[40:] + '/'
|
||||
if url.startswith('http://cvs.savannah.gnu.org:/sources/'):
|
||||
url = 'http://cvs.savannah.gnu.org/viewvc/' + url[37:] + '/'
|
||||
|
||||
# generally ".git" at the end is not working well, except sometimes
|
||||
if url.endswith('.git') and not any((url.startswith(x) for x in (
|
||||
'https://repo.or.cz', 'https://git.tuxfamily.org/fanwor/fanwor'))):
|
||||
url = url[:-4]
|
||||
|
||||
if url in urls:
|
||||
urls[url].add(entry)
|
||||
else:
|
||||
urls[url] = {entry}
|
||||
print('found {} unique links'.format(len(urls)))
|
||||
print("start checking external links (can take a while)")
|
||||
|
||||
# now iterate over all urls
|
||||
for url, names in urls.items():
|
||||
names = list(names) # was a set
|
||||
if len(names) == 1:
|
||||
names = names[0]
|
||||
try:
|
||||
verify = True
|
||||
# some have an expired certificate but otherwise still work
|
||||
if any((url.startswith(x) for x in (
|
||||
'https://perso.b2b2c.ca/~sarrazip/dev/', 'https://dreerally.com/', 'https://henlin.net/',
|
||||
'https://www.megamek.org/', 'https://pixeldoctrine.com/', 'https://gitorious.org/',
|
||||
'https://www.opmon-game.ga/'))):
|
||||
verify = False
|
||||
r = requests.head(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64)'}, timeout=20,
|
||||
allow_redirects=True, verify=verify)
|
||||
if r.status_code == 405: # head method not supported, try get
|
||||
r = requests.get(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64)'},
|
||||
timeout=20, allow_redirects=True, verify=verify)
|
||||
# check for bad status
|
||||
if r.status_code != requests.codes.ok:
|
||||
print('{}: {} - {}'.format(names, url, r.status_code))
|
||||
# check for redirect
|
||||
if r.history and url not in redirect_okay:
|
||||
# only / added or http->https sometimes
|
||||
redirected_url = r.url
|
||||
if redirected_url == url + '/':
|
||||
output = '{}: {} -> {} - redirect "/" at end '
|
||||
elif redirected_url == 'https' + url[4:]:
|
||||
output = '{}: {} -> {} - redirect "https" at start'
|
||||
else:
|
||||
output = '{}: {} -> {} - redirect '
|
||||
print(output.format(names, url, redirected_url))
|
||||
except Exception as e:
|
||||
error_name = type(e).__name__
|
||||
if error_name == 'SSLError' and any((url.startswith(x) for x in (
|
||||
'https://gitorious.org/', 'https://www.freedroid.org/download/'))):
|
||||
continue # even though verify is False, these errors still get through
|
||||
print('{}: {} - exception {}'.format(names, url, error_name))
|
||||
|
||||
def update_readme_tocs(self):
|
||||
"""
|
||||
Recounts entries in sub categories and writes them to the readme.
|
||||
Also updates the _toc files in the categories directories.
|
||||
|
||||
Note: The Readme must have a specific structure at the beginning, starting with "# Open Source Games" and ending
|
||||
on "A collection.."
|
||||
|
||||
Needs to be performed regularly.
|
||||
"""
|
||||
|
||||
# completely delete content of toc path
|
||||
for file in os.listdir(c.tocs_path):
|
||||
os.remove(os.path.join(c.tocs_path, file))
|
||||
|
||||
# read readme
|
||||
readme_file = os.path.join(c.root_path, 'README.md')
|
||||
readme_text = utils.read_text(readme_file)
|
||||
|
||||
# compile regex for identifying the building blocks in the readme
|
||||
regex = re.compile(r"(.*?)(\[comment\]: # \(start.*?end of autogenerated content\))(.*)", re.DOTALL)
|
||||
|
||||
# apply regex
|
||||
matches = regex.findall(readme_text)
|
||||
if len(matches) != 1:
|
||||
raise RuntimeError('readme file has invalid structure')
|
||||
matches = matches[0]
|
||||
start = matches[0]
|
||||
end = matches[2]
|
||||
|
||||
tocs_text = ''
|
||||
|
||||
# split into games, tools, frameworks, libraries
|
||||
games = [x for x in self.entries if not any([y in x['Keywords'] for y in ('tool', 'framework', 'library')])]
|
||||
tools = [x for x in self.entries if 'tool' in x['Keywords']]
|
||||
frameworks = [x for x in self.entries if 'framework' in x['Keywords']]
|
||||
libraries = [x for x in self.entries if 'library' in x['Keywords']]
|
||||
|
||||
# create games, tools, frameworks, libraries tocs
|
||||
title = 'Games'
|
||||
file = '_games.md'
|
||||
tocs_text += '**[{}](entries/tocs/{}#{})** ({}) - '.format(title, file, title, len(games))
|
||||
create_toc(title, file, games)
|
||||
|
||||
title = 'Tools'
|
||||
file = '_tools.md'
|
||||
tocs_text += '**[{}](entries/tocs/{}#{})** ({}) - '.format(title, file, title, len(tools))
|
||||
create_toc(title, file, tools)
|
||||
|
||||
title = 'Frameworks'
|
||||
file = '_frameworks.md'
|
||||
tocs_text += '**[{}](entries/tocs/{}#{})** ({}) - '.format(title, file, title, len(frameworks))
|
||||
create_toc(title, file, frameworks)
|
||||
|
||||
title = 'Libraries'
|
||||
file = '_libraries.md'
|
||||
tocs_text += '**[{}](entries/tocs/{}#{})** ({})\n'.format(title, file, title, len(libraries))
|
||||
create_toc(title, file, libraries)
|
||||
|
||||
# create by category
|
||||
categories_text = []
|
||||
for keyword in c.recommended_keywords:
|
||||
filtered = [x for x in self.entries if keyword in x['Keywords']]
|
||||
title = keyword.capitalize()
|
||||
name = keyword.replace(' ', '-')
|
||||
file = '_{}.md'.format(name)
|
||||
categories_text.append('**[{}](entries/tocs/{}#{})** ({})'.format(title, file, name, len(filtered)))
|
||||
create_toc(title, file, filtered)
|
||||
categories_text.sort()
|
||||
tocs_text += '\nBy category: {}\n'.format(', '.join(categories_text))
|
||||
|
||||
# create by platform
|
||||
platforms_text = []
|
||||
for platform in c.valid_platforms:
|
||||
filtered = [x for x in self.entries if platform in x.get('Platform', [])]
|
||||
title = platform
|
||||
name = platform.lower()
|
||||
file = '_{}.md'.format(name)
|
||||
platforms_text.append('**[{}](entries/tocs/{}#{})** ({})'.format(title, file, name, len(filtered)))
|
||||
create_toc(title, file, filtered)
|
||||
tocs_text += '\nBy platform: {}\n'.format(', '.join(platforms_text))
|
||||
|
||||
# insert new text in the middle (the \n before the second comment is necessary, otherwise Markdown displays it as part of the bullet list)
|
||||
text = start + "[comment]: # (start of autogenerated content, do not edit)\n" + tocs_text + "\n[comment]: # (end of autogenerated content)" + end
|
||||
|
||||
# write to readme
|
||||
utils.write_text(readme_file, text)
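    # Illustrative example (not part of the original script): with 42 hypothetical game entries,
    # the generated readme fragment for the Games toc would read
    #   **[Games](entries/tocs/_games.md#Games)** (42) - ...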
|
||||
|
||||
def update_statistics(self):
|
||||
"""
|
||||
Generates the statistics page.
|
||||
|
||||
Should be done every time the entries change.
|
||||
"""
|
||||
if not self.entries:
|
||||
print('entries not yet loaded')
|
||||
return
|
||||
|
||||
# start the page
|
||||
statistics = '[comment]: # (autogenerated content, do not edit)\n# Statistics\n\n'
|
||||
|
||||
# total number
|
||||
number_entries = len(self.entries)
|
||||
rel = lambda x: x / number_entries * 100 # conversion to percent
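        # Illustrative example (not part of the original script): with e.g. 400 entries,
        # rel(100) == 25.0, i.e. rel() converts a count into a percentage of all entries.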
|
||||
|
||||
statistics += 'analyzed {} entries on {}\n\n'.format(number_entries,
|
||||
datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
|
||||
|
||||
# State (beta, mature, inactive)
|
||||
statistics += '## State\n\n'
|
||||
|
||||
number_state_beta = sum(1 for x in self.entries if 'beta' in x['State'])
|
||||
number_state_mature = sum(1 for x in self.entries if 'mature' in x['State'])
|
||||
number_inactive = sum(1 for x in self.entries if osg.is_inactive(x))
|
||||
statistics += '- mature: {} ({:.1f}%)\n- beta: {} ({:.1f}%)\n- inactive: {} ({:.1f}%)\n\n'.format(
|
||||
number_state_mature, rel(number_state_mature), number_state_beta, rel(number_state_beta), number_inactive,
|
||||
rel(number_inactive))
|
||||
|
||||
if number_inactive > 0:
|
||||
entries_inactive = [(x['Title'], osg.extract_inactive_year(x)) for x in self.entries if osg.is_inactive(x)]
|
||||
entries_inactive.sort(key=lambda x: str.casefold(x[0])) # first sort by name
|
||||
entries_inactive.sort(key=lambda x: x[1], reverse=True) # then sort by inactive year (more recently first)
|
||||
entries_inactive = ['{} ({})'.format(*x) for x in entries_inactive]
|
||||
statistics += '##### Inactive State\n\n' + ', '.join(entries_inactive) + '\n\n'
|
||||
|
||||
# Language
|
||||
statistics += '## Code Languages\n\n'
|
||||
field = 'Code language'
|
||||
|
||||
# get all languages together
|
||||
languages = []
|
||||
for entry in self.entries:
|
||||
languages.extend(entry[field])
|
||||
languages = [x.value for x in languages]
|
||||
|
||||
unique_languages = set(languages)
|
||||
unique_languages = [(l, languages.count(l) / len(languages)) for l in unique_languages]
|
||||
unique_languages.sort(key=lambda x: str.casefold(x[0])) # first sort by name
|
||||
unique_languages.sort(key=lambda x: x[1], reverse=True) # then sort by occurrence (highest occurrence first)
|
||||
unique_languages = ['- {} ({:.1f}%)\n'.format(x[0], x[1] * 100) for x in unique_languages]
|
||||
statistics += '##### Language frequency\n\n' + ''.join(unique_languages) + '\n'
|
||||
|
||||
# Licenses
|
||||
statistics += '## Code licenses\n\n'
|
||||
field = 'Code license'
|
||||
|
||||
# get all licenses together
|
||||
licenses = []
|
||||
for entry in self.entries:
|
||||
licenses.extend(entry[field])
|
||||
licenses = [x.value for x in licenses]
|
||||
|
||||
unique_licenses = set(licenses)
|
||||
unique_licenses = [(l, licenses.count(l) / len(licenses)) for l in unique_licenses]
|
||||
unique_licenses.sort(key=lambda x: str.casefold(x[0])) # first sort by name
|
||||
unique_licenses.sort(key=lambda x: -x[1]) # then sort by occurrence (highest occurrence first)
|
||||
unique_licenses = ['- {} ({:.1f}%)\n'.format(x[0], x[1] * 100) for x in unique_licenses]
|
||||
statistics += '##### Licenses frequency\n\n' + ''.join(unique_licenses) + '\n'
|
||||
|
||||
# Keywords
|
||||
statistics += '## Keywords\n\n'
|
||||
field = 'Keywords'
|
||||
|
||||
# get all keywords together
|
||||
keywords = []
|
||||
for entry in self.entries:
|
||||
keywords.extend(entry[field])
|
||||
keywords = [x.value for x in keywords]
|
||||
|
||||
# reduce those starting with "multiplayer"
|
||||
keywords = [x if not x.startswith('multiplayer') else 'multiplayer' for x in keywords]
|
||||
|
||||
unique_keywords = set(keywords)
|
||||
unique_keywords = [(l, keywords.count(l) / len(keywords)) for l in unique_keywords]
|
||||
unique_keywords.sort(key=lambda x: str.casefold(x[0])) # first sort by name
|
||||
unique_keywords.sort(key=lambda x: -x[1]) # then sort by occurrence (highest occurrence first)
|
||||
unique_keywords = ['- {} ({:.1f}%)'.format(x[0], x[1] * 100) for x in unique_keywords]
|
||||
statistics += '##### Keywords frequency\n\n' + '\n'.join(unique_keywords) + '\n\n'
|
||||
|
||||
# no download or play field
|
||||
statistics += '## Entries without download or play fields\n\n'
|
||||
|
||||
entries = []
|
||||
for entry in self.entries:
|
||||
if 'Download' not in entry and 'Play' not in entry:
|
||||
entries.append(entry['Title'])
|
||||
entries.sort(key=str.casefold)
|
||||
statistics += '{}: '.format(len(entries)) + ', '.join(entries) + '\n\n'
|
||||
|
||||
# code hosted not on github, gitlab, bitbucket, launchpad, sourceforge
|
||||
popular_code_repositories = ('github.com', 'gitlab.com', 'bitbucket.org', 'code.sf.net', 'code.launchpad.net')
|
||||
statistics += '## Entries with a code repository not on a popular site\n\n'
|
||||
|
||||
entries = []
|
||||
field = 'Code repository'
|
||||
for entry in self.entries:
|
||||
popular = False
|
||||
for repo in entry[field]:
|
||||
for popular_repo in popular_code_repositories:
|
||||
if popular_repo in repo.value:
|
||||
popular = True
|
||||
break
|
||||
# if there were repositories, but none popular, add them to the list
|
||||
if not popular:
|
||||
entries.append(entry['Title'])
|
||||
# print(info[field])
|
||||
entries.sort(key=str.casefold)
|
||||
statistics += '{}: '.format(len(entries)) + ', '.join(entries) + '\n\n'
|
||||
|
||||
# Code dependencies
|
||||
statistics += '## Code dependencies\n\n'
|
||||
field = 'Code dependencies'
|
||||
|
||||
# get all code dependencies together
|
||||
code_dependencies = []
|
||||
entries_with_code_dependency = 0
|
||||
for entry in self.entries:
|
||||
if field in entry:
|
||||
code_dependencies.extend(entry[field])
|
||||
entries_with_code_dependency += 1
|
||||
code_dependencies = [x.value for x in code_dependencies]
|
||||
statistics += 'With code dependency field {} ({:.1f}%)\n\n'.format(entries_with_code_dependency,
|
||||
rel(entries_with_code_dependency))
|
||||
|
||||
unique_code_dependencies = set(code_dependencies)
|
||||
unique_code_dependencies = [(l, code_dependencies.count(l) / len(code_dependencies)) for l in
|
||||
unique_code_dependencies]
|
||||
unique_code_dependencies.sort(key=lambda x: str.casefold(x[0])) # first sort by name
|
||||
unique_code_dependencies.sort(key=lambda x: -x[1]) # then sort by occurrence (highest occurrence first)
|
||||
unique_code_dependencies = ['- {} ({:.1f}%)'.format(x[0], x[1] * 100) for x in unique_code_dependencies]
|
||||
statistics += '##### Code dependencies frequency\n\n' + '\n'.join(unique_code_dependencies) + '\n\n'
|
||||
|
||||
# Build systems:
|
||||
statistics += '## Build systems\n\n'
|
||||
field = 'Build system'
|
||||
|
||||
# get all build systems together
|
||||
build_systems = []
|
||||
for entry in self.entries:
|
||||
if field in entry['Building']:
|
||||
build_systems.extend(entry['Building'][field])
|
||||
build_systems = [x.value for x in build_systems]
|
||||
|
||||
statistics += 'Build systems information available for {:.1f}% of all projects.\n\n'.format(
|
||||
rel(len(build_systems)))
|
||||
|
||||
unique_build_systems = set(build_systems)
|
||||
unique_build_systems = [(l, build_systems.count(l) / len(build_systems)) for l in unique_build_systems]
|
||||
unique_build_systems.sort(key=lambda x: str.casefold(x[0])) # first sort by name
|
||||
unique_build_systems.sort(key=lambda x: -x[1]) # then sort by occurrence (highest occurrence first)
|
||||
unique_build_systems = ['- {} ({:.1f}%)'.format(x[0], x[1] * 100) for x in unique_build_systems]
|
||||
statistics += '##### Build systems frequency ({})\n\n'.format(len(build_systems)) + '\n'.join(
|
||||
unique_build_systems) + '\n\n'
|
||||
|
||||
# C, C++ projects without build system information
|
||||
c_cpp_project_without_build_system = []
|
||||
for entry in self.entries:
|
||||
if field not in entry and ('C' in entry['Code language'] or 'C++' in entry['Code language']):
|
||||
c_cpp_project_without_build_system.append(entry['Title'])
|
||||
c_cpp_project_without_build_system.sort(key=str.casefold)
|
||||
statistics += '##### C and C++ projects without build system information ({})\n\n'.format(
|
||||
len(c_cpp_project_without_build_system)) + ', '.join(c_cpp_project_without_build_system) + '\n\n'
|
||||
|
||||
# C, C++ projects with build system information but without CMake as build system
|
||||
c_cpp_project_not_cmake = []
|
||||
        for entry in self.entries:
|
||||
            if field in entry and 'CMake' not in entry[field] and (
|
||||
'C' in entry['Code language'] or 'C++' in entry['Code language']):
|
||||
c_cpp_project_not_cmake.append(entry['Title'])
|
||||
c_cpp_project_not_cmake.sort(key=str.casefold)
|
||||
statistics += '##### C and C++ projects with a build system different from CMake ({})\n\n'.format(
|
||||
len(c_cpp_project_not_cmake)) + ', '.join(c_cpp_project_not_cmake) + '\n\n'
|
||||
|
||||
# Platform
|
||||
statistics += '## Platform\n\n'
|
||||
field = 'Platform'
|
||||
|
||||
# get all platforms together
|
||||
platforms = []
|
||||
for entry in self.entries:
|
||||
if field in entry:
|
||||
platforms.extend(entry[field])
|
||||
platforms = [x.value for x in platforms]
|
||||
|
||||
statistics += 'Platform information available for {:.1f}% of all projects.\n\n'.format(rel(len(platforms)))
|
||||
|
||||
unique_platforms = set(platforms)
|
||||
unique_platforms = [(l, platforms.count(l) / len(platforms)) for l in unique_platforms]
|
||||
unique_platforms.sort(key=lambda x: str.casefold(x[0])) # first sort by name
|
||||
unique_platforms.sort(key=lambda x: -x[1]) # then sort by occurrence (highest occurrence first)
|
||||
unique_platforms = ['- {} ({:.1f}%)'.format(x[0], x[1] * 100) for x in unique_platforms]
|
||||
statistics += '##### Platforms frequency\n\n' + '\n'.join(unique_platforms) + '\n\n'
|
||||
|
||||
# write to statistics file
|
||||
utils.write_text(c.statistics_file, statistics)
|
||||
|
||||
print('statistics updated')
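    # Illustrative example (not part of the original script): the generated statistics.md begins with
    #   [comment]: # (autogenerated content, do not edit)
    #   # Statistics
    #
    #   analyzed 1234 entries on 2019-01-01 12:00:00
    # where the entry count and the timestamp are placeholders here.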
|
||||
|
||||
def update_html(self):
|
||||
pass
|
||||
|
||||
def update_repos(self):
|
||||
pass
|
||||
|
||||
def complete_run(self):
|
||||
pass
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
m = EntriesMaintainer()
|
||||
|
||||
actions = {
|
||||
'Read entries': m.read_entries,
|
||||
'Write entries': m.write_entries,
|
||||
'Check template leftovers': m.check_template_leftovers,
|
||||
        'Clean rejected entries': m.clean_rejected,
|
||||
'Check external links (takes quite long)': m.check_external_links,
|
||||
'Clean backlog': m.clean_backlog,
|
||||
'Update Readme and TOCs': m.update_readme_tocs,
|
||||
'Update statistics': m.update_statistics,
|
||||
'Update HTML': m.update_html,
|
||||
'Update repository list': m.update_repos,
|
||||
'Complete run': m.complete_run
|
||||
}
|
||||
|
||||
osg_ui.run_simple_button_app('Entries developer', actions)
|
||||
|
||||
|
@@ -16,6 +16,7 @@ developer_file = os.path.join(root_path, 'developers.md')
|
||||
|
||||
backlog_file = os.path.join(code_path, 'backlog.txt')
|
||||
rejected_file = os.path.join(code_path, 'rejected.txt')
|
||||
statistics_file = os.path.join(root_path, 'statistics.md')
|
||||
|
||||
# local config
|
||||
local_config_file = os.path.join(root_path, 'local-config.ini')
|
||||
|
@@ -370,11 +370,16 @@ def check_and_process_entry(entry):
|
||||
|
||||
return entry
|
||||
|
||||
def is_inactive(entry):
|
||||
state = entry['State']
|
||||
phrase = 'inactive since '
|
||||
return any(x.startswith(phrase) for x in state)
|
||||
|
||||
|
||||
def extract_inactive_year(entry):
|
||||
state = entry['State']
|
||||
phrase = 'inactive since '
|
||||
inactive_year = [x[len(phrase):] for x in state if x.startswith(phrase)]
|
||||
inactive_year = [x.value[len(phrase):] for x in state if x.startswith(phrase)]
|
||||
assert len(inactive_year) <= 1
|
||||
if inactive_year:
|
||||
return inactive_year[0]
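# Illustrative example (not part of this change): for a simplified entry whose State values behave
# like the plain strings ['mature', 'inactive since 2013'], is_inactive(entry) returns True and
# extract_inactive_year(entry) returns '2013'.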
|
||||
|
@@ -13,7 +13,6 @@
|
||||
- Code dependencies: SDL
|
||||
|
||||
Puzzle game based on Oxyd.
|
||||
Inspired by Oxyd.
|
||||
|
||||
## Building
|
||||
|
||||
|
@@ -5,7 +5,7 @@
|
||||
- Inspirations: Iron Seed
|
||||
- State: mature, inactive since 2013
|
||||
- Download: https://web.archive.org/web/20150802151352/http://www.ironseed.com/ironseed-v1.20.0016-2013-03-17.zip
|
||||
- Keywords: remake, inspired by Iron Seed
|
||||
- Keywords: remake
|
||||
- Code repository: @see-download
|
||||
- Code language: Pascal
|
||||
- Code license: GPL-3.0 (not with the source code)
|