New entries and reorganization of Python scripts (added git archive)
tools/git archive/archives.csv (new file, 14 lines)
@@ -0,0 +1,14 @@
https://github.com/guillaume-gouchon/island
https://github.com/FreezingMoon/AncientBeast
https://github.com/godrin/antargis
https://github.com/bote-team/bote
https://github.com/Trilarion/civil
https://github.com/SWY1985/CivOne
https://github.com/colobot/colobot
https://github.com/tautvilas/epoh
https://github.com/hinogi/eternalwinterwars
https://github.com/infidel-/cult
https://github.com/Vakarias/farcolony
https://github.com/freeciv/freeciv
https://github.com/freeciv/freeciv-web
https://github.com/freeorion/freeorion
tools/git archive/update.py (new file, 62 lines)
@@ -0,0 +1,62 @@
"""
Clones and/or pulls all the git repositories listed in archives.csv.

Requires: git executable in the path

Warning: This may take a long time on the first run and may need a lot of storage space!
"""

import os
import csv
import subprocess


def derive_folder_name(url):
    github = 'https://github.com/'
    if url.startswith(github):
        parts = url[len(github):].split('/')
        return 'github.' + parts[0] + '.' + parts[1] + '.git'
    # all current entries are GitHub URLs; fail loudly on anything else
    raise RuntimeError('Cannot derive folder name from url: {}'.format(url))


def clone(url, folder):
    result = subprocess.run(["git", "clone", url, folder])
    if result.returncode:
        print(result)


def pull():
    result = subprocess.run(["git", "pull", "--all"])
    if result.returncode:
        print(result)


if __name__ == '__main__':

    # get the folder this script is in
    root_folder = os.path.realpath(os.path.dirname(__file__))

    # read archives.csv (resolved relative to this script, not the current working directory)
    archives = []
    with open(os.path.join(root_folder, 'archives.csv'), newline='') as f:
        reader = csv.reader(f)
        for row in reader:
            archives.append(row)

    # loop over archives
    for archive in archives:
        url = archive[0]
        folder = os.path.join(root_folder, derive_folder_name(url))

        # if not yet existing, do the initial clone
        if not os.path.isdir(folder):
            os.chdir(root_folder)
            clone(url, folder)

        # pull all branches
        os.chdir(folder)
        pull()
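For illustration (not part of the commit): the naming scheme maps each GitHub URL to a flat local directory name, e.g.

    derive_folder_name('https://github.com/freeciv/freeciv')  # -> 'github.freeciv.freeciv.git'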
tools/maintenance.py (new file, 470 lines)
@@ -0,0 +1,470 @@
"""
Counts the number of records in each subfolder and updates the overview. Sorts the entries in the contents file of
each subfolder alphabetically.

This script runs with Python 3. It could probably also run with Python 2 after some minor tweaks, but that's not important.

TODO get the number of games with a GitHub or Bitbucket repository and list those that have neither
TODO which C/C++ projects do not use CMake?
TODO for games with GitHub repositories, get activity, number of open issues and number of merge requests, and display them in a health monitor file
TODO search for ?? and replace with either nothing or the missing information
"""

import os
import re
import urllib.request
import urllib.error
import http.client
import datetime


def get_category_paths():
    """
    Returns all subfolders of the games path.
    """
    return [os.path.join(games_path, x) for x in os.listdir(games_path) if os.path.isdir(os.path.join(games_path, x))]


def get_entry_paths(category_path):
    """
    Returns all files of a category path, except for '_toc.md'.
    """
    return [os.path.join(category_path, x) for x in os.listdir(category_path) if x != '_toc.md' and os.path.isfile(os.path.join(category_path, x))]


def read_first_line_from_file(file):
    """
    Convenience function; we really only need the first line of a category overview.
    """
    with open(file, 'r') as f:
        line = f.readline()
    return line


def read_interesting_info_from_file(file):
    """
    Parses a file for some interesting fields and concatenates the content. To be displayed after the game name in the
    category overview.
    """
    with open(file, 'r') as f:
        text = f.read()

    output = [None, None, None]

    # language
    regex = re.compile(r"- Language\(s\): (.*)")
    matches = regex.findall(text)
    if matches:
        output[0] = matches[0]

    # license
    regex = re.compile(r"- License: (.*)")
    matches = regex.findall(text)
    if matches:
        output[1] = matches[0]

    # state
    regex = re.compile(r"- State: (.*)")
    matches = regex.findall(text)
    if matches:
        output[2] = matches[0]

    output = [x for x in output if x]  # eliminate empty entries

    output = ", ".join(output)

    return output
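
# Illustrative example (not part of the original commit): for an entry containing the lines
# '- Language(s): C++', '- License: GPL-3.0' and '- State: mature', the function above
# returns the string 'C++, GPL-3.0, mature'.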


def update_readme():
    """
    Recounts the entries in the subcategories and writes the counts to the readme. Needs to be performed regularly.
    """
    print('update readme file')

    # read readme
    with open(readme_path) as f:
        readme_text = f.read()

    # compile regex for identifying the building blocks
    regex = re.compile(r"(# Open Source Games\n\n)(.*)(\nA collection.*)", re.DOTALL)

    # apply regex
    matches = regex.findall(readme_text)
    matches = matches[0]
    start = matches[0]
    end = matches[2]

    # get subfolders
    category_paths = get_category_paths()

    # get number of files (minus 1 for the toc file) in each subfolder
    n = [len(os.listdir(path)) - 1 for path in category_paths]

    # assemble paths
    paths = [os.path.join(path, '_toc.md') for path in category_paths]

    # get titles (discarding the first two characters ("# ") and the last ("\n"))
    titles = [read_first_line_from_file(path)[2:-1] for path in paths]

    # combine titles, category names and numbers in one list
    info = zip(titles, [os.path.basename(path) for path in category_paths], n)

    # sort according to subcategory title (should be unique)
    info = sorted(info, key=lambda x: x[0])

    # assemble output
    update = ['- **[{}](games/{}/_toc.md)** ({})\n'.format(*entry) for entry in info]
    update = "{} entries\n".format(sum(n)) + "".join(update)

    # insert the new text in the middle
    text = start + "[comment]: # (start of autogenerated content, do not edit)\n" + update + "\n[comment]: # (end of autogenerated content)" + end

    # write to readme
    with open(readme_path, 'w') as f:
        f.write(text)


def update_category_tocs():
    """
    Lists all entries in all subfolders and generates the list in the toc file. Needs to be performed regularly.
    """
    # get category paths
    category_paths = get_category_paths()

    # for each category
    for category_path in category_paths:
        print('generate toc for {}'.format(os.path.basename(category_path)))

        # read toc header line
        toc_file = os.path.join(category_path, '_toc.md')
        toc_header = read_first_line_from_file(toc_file)

        # get paths of all entries in this category
        entry_paths = get_entry_paths(category_path)

        # get titles (discarding the first two characters ("# ") and the last ("\n"))
        titles = [read_first_line_from_file(path)[2:-1] for path in entry_paths]

        # get more interesting info
        more = [read_interesting_info_from_file(path) for path in entry_paths]

        # combine title, file name and info
        info = zip(titles, [os.path.basename(path) for path in entry_paths], more)

        # sort according to entry title (should be unique)
        info = sorted(info, key=lambda x: x[0])

        # assemble output
        update = ['- **[{}]({})** ({})\n'.format(*entry) for entry in info]
        update = "".join(update)

        # combine with toc header
        text = toc_header + '\n' + "[comment]: # (start of autogenerated content, do not edit)\n" + update + "\n[comment]: # (end of autogenerated content)"

        # write to toc file
        with open(toc_file, 'w') as f:
            f.write(text)
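
# Illustrative example (hypothetical entry, not part of the original commit): a generated
# toc line looks like '- **[Freeciv](freeciv.md)** (C, GPL-2.0, mature)'.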


def check_validity_external_links():
    """
    Checks all external links it can find for validity. Prints those with non-OK HTTP responses. Only needs to be run
    from time to time.
    """
    # regex for finding urls (can be in <>, in () after ], or preceded by whitespace)
    regex = re.compile(r"[\s\n]<(http.+?)>|\]\((http.+?)\)|[\s\n](http[^\s\n]+)")

    # counter
    number_checked_links = 0

    # get category paths
    category_paths = get_category_paths()

    # for each category
    for category_path in category_paths:
        print('check links for {}'.format(os.path.basename(category_path)))

        # get entry paths
        entry_paths = get_entry_paths(category_path)

        # for each entry
        for entry_path in entry_paths:
            # read entry
            with open(entry_path, 'r') as f:
                content = f.read()

            # apply regex
            matches = regex.findall(content)

            # for each match
            for match in matches:

                # for each alternative of the regex
                for url in match:

                    # if there was something
                    if url:
                        try:
                            # without a special header, frequent 403 responses occur
                            req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64)'})
                            urllib.request.urlopen(req)
                        except urllib.error.HTTPError as e:
                            print("{}: {} - {}".format(os.path.basename(entry_path), url, e.code))
                        except http.client.RemoteDisconnected:
                            print("{}: {} - disconnected without response".format(os.path.basename(entry_path), url))

                        number_checked_links += 1

                        if number_checked_links % 50 == 0:
                            print("{} links checked".format(number_checked_links))

    print("{} links checked".format(number_checked_links))
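
# Illustrative examples (not part of the original commit): the regex above captures
# 'https://example.org' from ' <https://example.org>', from '[text](https://example.org)'
# and from a bare ' https://example.org' preceded by whitespace.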


def fix_notation():
    """
    Changes the notation of a field; quite special. Only run when needed.
    """
    regex = re.compile(r"- License details:(.*)")

    # get category paths
    category_paths = get_category_paths()

    # for each category
    for category_path in category_paths:
        # get paths of all entries in this category
        entry_paths = get_entry_paths(category_path)

        for entry_path in entry_paths:
            # read the file line by line
            with open(entry_path) as f:
                content = f.readlines()

            # apply regex to every line
            matched_lines = [regex.findall(line) for line in content]

            # loop over all the lines
            for line, match in enumerate(matched_lines):
                if match:
                    match = match[0]

                    # patch the content
                    content[line] = "- Code license details:{}\n".format(match)

            # write the file line by line
            with open(entry_path, "w") as f:
                f.writelines(content)


def regular_replacements():
    """
    Replaces some content with shortcuts. Can be run regularly.
    """
    # get category paths
    category_paths = get_category_paths()

    # for each category
    for category_path in category_paths:
        # get paths of all entries in this category
        entry_paths = get_entry_paths(category_path)

        for entry_path in entry_paths:
            # read the whole file
            with open(entry_path) as f:
                content = f.read()

            # now the replacements
            content = content.replace('?source=navbar', '')  # sourceforge specific
            content = content.replace('single player', 'SP')
            content = content.replace('multi player', 'MP')

            # write the whole file
            with open(entry_path, "w") as f:
                f.write(content)


def check_template_leftovers():
    """
    Checks for leftovers from the entry template and prints them.
    """
    check_strings = ['# {NAME}', '_{One line description}_', '- Home: {URL}', '- Media: {URL}', '- Download: {URL}', '- State: beta, mature, inactive since', '- Keywords: SP, MP, RTS, TBS (if none, remove the line)', '- Code: primary repository (type if not git), other repositories (type if not git)', '- Language(s): {XX}', '- License: {XX} (if special, include link)', '{XXX}']

    # get category paths
    category_paths = get_category_paths()

    # for each category
    for category_path in category_paths:
        # get paths of all entries in this category
        entry_paths = get_entry_paths(category_path)

        for entry_path in entry_paths:
            # read the whole file
            with open(entry_path) as f:
                content = f.read()

            for check_string in check_strings:
                if content.find(check_string) >= 0:
                    print('{}: found {}'.format(os.path.basename(entry_path), check_string))


def parse_entry(content):
    """
    Returns a dictionary of the features of the content.
    """

    info = {}

    # state
    regex = re.compile(r"- State: (.*)")
    matches = regex.findall(content)
    if matches:
        # first remove everything in parentheses
        states = re.sub(r'\([^)]*\)', '', matches[0])
        states = states.split(',')
        states = [x.strip() for x in states]
        if 'beta' in states:
            info['state'] = 'beta'
        elif 'mature' in states:
            info['state'] = 'mature'
        else:
            print('Neither beta nor mature in state tag: {}'.format(content))
        inactive = next((int(x[14:]) for x in states if x.startswith('inactive since')), None)  # only the year
        if inactive:
            info['inactive'] = inactive

    # language
    regex = re.compile(r"- Language\(s\): (.*)")
    matches = regex.findall(content)
    if matches:
        # first remove everything in parentheses
        languages = re.sub(r'\([^)]*\)', '', matches[0])
        languages = languages.split(',')
        languages = [x.strip() for x in languages]
        info['language'] = languages

    # license
    regex = re.compile(r"- Code license: (.*)")
    matches = regex.findall(content)
    if matches:
        # first remove everything in parentheses
        license = re.sub(r'\([^)]*\)', '', matches[0])
        info['license'] = license

    return info
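
# Illustrative example (not part of the original commit): for content containing
# '- State: beta (playable), inactive since 2014' and '- Language(s): C++',
# parse_entry returns {'state': 'beta', 'inactive': 2014, 'language': ['C++']}.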


def generate_statistics():
    """
    Parses all entries and writes a statistics overview (states, languages, licenses) to statistics.md.
    """
    statistics_path = os.path.join(games_path, 'statistics.md')
    statistics = '[comment]: # (autogenerated content, do not edit)\n# Statistics\n\n'

    # get category paths
    category_paths = get_category_paths()

    # parse every entry in every category
    infos = []
    for category_path in category_paths:
        # get paths of all entries in this category
        entry_paths = get_entry_paths(category_path)

        for entry_path in entry_paths:
            # read the whole file
            with open(entry_path) as f:
                content = f.read()

            info = parse_entry(content)
            info['file'] = os.path.basename(entry_path)[:-3]  # [:-3] to cut off the .md
            infos.append(info)

    # total number
    number_entries = len(infos)
    rel = lambda x: x / number_entries * 100  # conversion to percent
    statistics += 'analyzed {} entries on {}\n\n'.format(number_entries, datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

    # state (beta, mature, inactive)
    statistics += '## State\n\n'

    number_state_beta = sum(1 for x in infos if 'state' in x and x['state'] == 'beta')
    number_state_mature = sum(1 for x in infos if 'state' in x and x['state'] == 'mature')
    number_inactive = sum(1 for x in infos if 'inactive' in x)
    statistics += '- mature: {} ({:.1f}%)\n- beta: {} ({:.1f}%)\n- inactive: {} ({:.1f}%)\n\n'.format(number_state_mature, rel(number_state_mature), number_state_beta, rel(number_state_beta), number_inactive, rel(number_inactive))

    if number_inactive > 0:
        entries_inactive = [(x['file'], x['inactive']) for x in infos if 'inactive' in x]
        entries_inactive.sort(key=lambda x: x[0])  # first sort by name
        entries_inactive.sort(key=lambda x: -x[1])  # then sort by inactive year (most recent first)
        entries_inactive = ['{} ({})'.format(*x) for x in entries_inactive]
        statistics += '##### Inactive State\n\n' + ', '.join(entries_inactive) + '\n\n'

    entries_no_state = [x['file'] for x in infos if 'state' not in x]
    if entries_no_state:
        entries_no_state.sort()
        statistics += '##### Without state tag ({})\n\n'.format(len(entries_no_state)) + ', '.join(entries_no_state) + '\n\n'

    # languages
    statistics += '## Languages\n\n'
    number_no_language = sum(1 for x in infos if 'language' not in x)
    if number_no_language > 0:
        statistics += 'Without language tag: {} ({:.1f}%)\n\n'.format(number_no_language, rel(number_no_language))
        entries_no_language = [x['file'] for x in infos if 'language' not in x]
        entries_no_language.sort()
        statistics += ', '.join(entries_no_language) + '\n\n'

    # get all languages together
    languages = []
    for info in infos:
        if 'language' in info:
            languages.extend(info['language'])

    unique_languages = set(languages)
    unique_languages = [(l, languages.count(l) / len(languages)) for l in unique_languages]
    unique_languages.sort(key=lambda x: x[0])  # first sort by name
    unique_languages.sort(key=lambda x: -x[1])  # then sort by occurrence (highest first)
    unique_languages = ['- {} ({:.1f}%)\n'.format(x[0], x[1] * 100) for x in unique_languages]
    statistics += '##### Language frequency\n\n' + ''.join(unique_languages) + '\n'

    # code licenses
    statistics += '## Code licenses\n\n'
    number_no_license = sum(1 for x in infos if 'license' not in x)
    if number_no_license > 0:
        statistics += 'Without license tag: {} ({:.1f}%)\n\n'.format(number_no_license, rel(number_no_license))
        entries_no_license = [x['file'] for x in infos if 'license' not in x]
        entries_no_license.sort()
        statistics += ', '.join(entries_no_license) + '\n\n'

    # get all licenses together
    licenses = []
    for info in infos:
        if 'license' in info:
            licenses.append(info['license'])

    unique_licenses = set(licenses)
    unique_licenses = [(l, licenses.count(l) / len(licenses)) for l in unique_licenses]
    unique_licenses.sort(key=lambda x: x[0])  # first sort by name
    unique_licenses.sort(key=lambda x: -x[1])  # then sort by occurrence (highest first)
    unique_licenses = ['- {} ({:.1f}%)\n'.format(x[0], x[1] * 100) for x in unique_licenses]
    statistics += '##### License frequency\n\n' + ''.join(unique_licenses) + '\n'

    with open(statistics_path, 'w') as f:
        f.write(statistics)


if __name__ == "__main__":

    # paths
    games_path = os.path.realpath(os.path.join(os.path.dirname(__file__), os.path.pardir, 'games'))
    readme_path = os.path.join(games_path, os.pardir, 'README.md')

    # recount entries and write to the readme
    update_readme()

    # generate the lists in the toc files
    update_category_tocs()

    # generate the statistics report
    generate_statistics()

    # check for unfilled template lines (run when needed)
    # check_template_leftovers()

    # check external links (only rarely)
    # check_validity_external_links()

    # special notation fix, only run when needed
    # fix_notation()

    # regular replacements (run when needed)
    # regular_replacements()
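
For reference (not part of the commit): since both scripts resolve their paths via __file__, they can be invoked from anywhere, typically as `python3 tools/maintenance.py` and `python3 "tools/git archive/update.py"` (the quotes are needed because of the space in the folder name).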