Adapted the maintenance script's generation of statistics and JSON results to the new layout

2019-07-08 13:10:05 +02:00
parent 2915f52a19
commit 59b96e0878
5 changed files with 1240 additions and 1837 deletions
--- a/tools/maintenance.py
+++ b/tools/maintenance.py
@@ -22,29 +22,25 @@ valid_fields = ('Home', 'Media', 'State', 'Play', 'Download', 'Platform', 'Keywo
 valid_platforms = ('Windows', 'Linux', 'macOS', 'Android', 'Browser')
 recommended_keywords = ('action', 'arcade', 'adventure', 'visual novel', 'sports', 'platform', 'puzzle', 'role playing', 'simulation', 'strategy', 'card game', 'board game', 'music', 'educational', 'tool', 'game engine', 'framework', 'library')

-def extract_overview_for_toc(file):
+
+def entry_iterator():
    """
-    Parses a file for some interesting fields and concatenates the content.
-    
-    To be displayed after the game name in the category TOCs.
+
    """
-    info = infos[file]

-    output = []
+    # get all entries (ignore everything starting with underscore)
+    entries = os.listdir(games_path)
+    entries = (x for x in entries if not x.startswith('_'))

-    if 'code language' in info:
-        output.extend(info['code language'])
+    # iterate over all entries
+    for entry in entries:
+        entry_path = os.path.join(games_path, entry)

-    if 'code license' in info:
-        output.extend(info['code license'])
+        # read entry
+        content = read_text(entry_path)

-    # state
-    if 'state' in info:
-        output.extend(info['state'])
-
-    output = ", ".join(output)
-
-    return output
+        # yield
+        yield entry, entry_path, content


 def update_readme_and_tocs(infos):
@@ -57,7 +53,7 @@ def update_readme_and_tocs(infos):

    Needs to be performed regularly.
    """
-    print('update readme file')
+    print('update readme and toc files')

    # delete all toc files
    entries = os.listdir(games_path)
@@ -66,6 +62,7 @@ def update_readme_and_tocs(infos):
        os.remove(os.path.join(games_path, entry))

    # read readme
+    readme_file = os.path.join(root_path, 'README.md')
    readme_text = read_text(readme_file)

    # compile regex for identifying the building blocks
@@ -82,14 +79,14 @@ def update_readme_and_tocs(infos):
    # create all toc and readme entry
    title = 'All'
    file = '_all.md'
-    update = ['- **[{}](games/{})** ({})\n'.format(title, file, len(infos))]
+    update = ['- **[{}](games/{}#{})** ({})\n'.format(title, file, title, len(infos))]
    create_toc(title, file, infos)

    for keyword in recommended_keywords:
        infos_filtered = [x for x in infos if keyword in x['keywords']]
        title = keyword.capitalize()
        file = '_{}.md'.format(keyword)
-        update.append('- **[{}](games/{})** ({})\n'.format(title, file, len(infos_filtered)))
+        update.append('- **[{}](games/{}#{})** ({})\n'.format(title, file, title, len(infos_filtered)))
        create_toc(title, file, infos_filtered)
    update = ''.join(update)

@@ -121,79 +118,26 @@ def create_toc(title, file, entries):
    # add to text
    text += '\n'.join(rows)

+    # write to toc file
    write_text(toc_file, text)


-
-def update_category_tocs():
-    """
-    Lists all entries in all sub folders and generates the list in the toc file.
-
-    Needs to be performed regularly.
-    """
-    # get category paths
-    category_paths = get_category_paths()
-
-    # for each category
-    for category_path in category_paths:
-        print('generate toc for {}'.format(os.path.basename(category_path)))
-
-        # read toc header line
-        toc_file = os.path.join(category_path, TOC)
-        toc_header = read_first_line(toc_file) # stays as is
-
-        # get paths of all entries in this category
-        entry_paths = get_entry_paths(category_path)
-
-        # get titles (discarding first two ("# ") and last ("\n") characters)
-        titles = [read_first_line(path)[2:-1] for path in entry_paths]
-
-        # get more interesting info
-        more = [extract_overview_for_toc(path) for path in entry_paths]
-
-        # combine name, file name and more info
-        info = zip(titles, [os.path.basename(path) for path in entry_paths], more)
-
-        # sort according to entry title (should be unique)
-        info = sorted(info, key=lambda x:x[0])
-
-        # assemble output
-        update = ['- **[{}]({})** ({})\n'.format(*entry) for entry in info]
-        update = "".join(update)
-
-        # combine with toc header
-        text = toc_header + '\n' + "[comment]: # (start of autogenerated content, do not edit)\n" + update + "\n[comment]: # (end of autogenerated content)"
-
-        # write to toc file
-        with open(toc_file, mode='w', encoding='utf-8') as f:
-            f.write(text)
-
-
 def check_validity_external_links():
    """
    Checks all external links it can find for validity. Prints those with non OK HTTP responses. Does only need to be run
    from time to time.
    """
+
+    print("check external links (can take a while)")
+
    # regex for finding urls (can be in <> or in () or a whitespace
    regex = re.compile(r"[\s\n]<(http.+?)>|\]\((http.+?)\)|[\s\n](http[^\s\n,]+)")

    # count
    number_checked_links = 0

-    # get category paths
-    category_paths = get_category_paths()
-
-    # for each category
-    for category_path in category_paths:
-        print('check links for {}'.format(os.path.basename(category_path)))
-
-        # get entry paths
-        entry_paths = get_entry_paths(category_path)
-
-        # for each entry
-        for entry_path in entry_paths:
-            # read entry
-            content = read_text(entry_path)
+    # iterate over all entries
+    for _, entry_path, content in entry_iterator():

            # apply regex
            matches = regex.findall(content)
@@ -232,43 +176,36 @@ def check_template_leftovers():
    Should be run only occasionally.
    """

+    print('check for template leftovers')
+
    # load template and get all lines
-    text = read_text(os.path.join(games_path, 'template.md'))
+    text = read_text(os.path.join(root_path, 'template.md'))
    text = text.split('\n')
    check_strings = [x for x in text if x and not x.startswith('##')]

-    # get category paths
-    category_paths = get_category_paths()
+    # iterate over all entries
+    for _, entry_path, content in entry_iterator():

-    # for each category
-    for category_path in category_paths:
-        # get paths of all entries in this category
-        entry_paths = get_entry_paths(category_path)
+        for check_string in check_strings:
+            if content.find(check_string) >= 0:
+                raise RuntimeError('{}: found {}'.format(os.path.basename(entry_path), check_string))

-        for entry_path in entry_paths:
-            # read it line by line
-            content = read_text(entry_path)
-
-            for check_string in check_strings:
-                if content.find(check_string) >= 0:
-                    print('{}: found {}'.format(os.path.basename(entry_path), check_string))

 def fix_keywords():
    """
    Fixes the keywords.
    """

+    print('fix keywords')
+
    regex = re.compile(r"(.*)(- Keywords:.*)(- Code repository: .*)",  re.DOTALL)

-    # get all entries
+    # get all entries (ignore everything starting with underscore)
    entries = os.listdir(games_path)
+    entries = (x for x in entries if not x.startswith('_'))

    # iterate over all entries
-    for entry in entries:
-        entry_path = os.path.join(games_path, entry)
-
-        # read entry
-        content = read_text(entry_path)
+    for entry, entry_path, content in entry_iterator():

        # match with regex
        matches = regex.findall(content)
@@ -308,6 +245,7 @@ def fix_keywords():
            # write again
            write_text(entry_path, new_content)

+
 def parse_entry(content):
    """
    Returns a dictionary of the features of the content
@@ -442,19 +380,13 @@ def assemble_infos():
    Parses all entries and assembles interesting infos about them.
    """

+    print('assemble game infos')
+
    # a database of all important infos about the entries
    infos = []

-    # get all entries (ignore everything starting with underscore)
-    entries = os.listdir(games_path)
-    entries = (x for x in entries if not x.startswith('_'))
-
    # iterate over all entries
-    for entry in entries:
-        entry_path = os.path.join(games_path, entry)
-
-        # read entry
-        content = read_text(entry_path)
+    for entry, _, content in entry_iterator():

        # parse entry
        info = parse_entry(content)
@@ -468,35 +400,35 @@ def assemble_infos():
    return infos


-def generate_statistics():
+def update_statistics(infos):
    """
    Generates the statistics page.

    Should be done every time the entries change.
    """

-    # for this function replace infos with infos.values
-    infois = infos.values()
+    print('update statistics')

    # start the page
-    statistics_path = os.path.join(games_path, 'statistics.md')
+    statistics_file = os.path.join(root_path, 'statistics.md')
    statistics = '[comment]: # (autogenerated content, do not edit)\n# Statistics\n\n'

    # total number
-    number_entries = len(infois)
+    number_entries = len(infos)
    rel = lambda x: x / number_entries * 100 # conversion to percent
+
    statistics += 'analyzed {} entries on {}\n\n'.format(number_entries, datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

    # State (beta, mature, inactive)
    statistics += '## State\n\n'

-    number_state_beta = sum(1 for x in infois if 'beta' in x['state'])
-    number_state_mature = sum(1 for x in infois if 'mature' in x['state'])
-    number_inactive = sum(1 for x in infois if 'inactive' in x)
+    number_state_beta = sum(1 for x in infos if 'beta' in x['state'])
+    number_state_mature = sum(1 for x in infos if 'mature' in x['state'])
+    number_inactive = sum(1 for x in infos if 'inactive' in x)
    statistics += '- mature: {} ({:.1f}%)\n- beta: {} ({:.1f}%)\n- inactive: {} ({:.1f}%)\n\n'.format(number_state_mature, rel(number_state_mature), number_state_beta, rel(number_state_beta), number_inactive, rel(number_inactive))

    if number_inactive > 0:
-        entries_inactive = [(x['title'], x['inactive']) for x in infois if 'inactive' in x]
+        entries_inactive = [(x['title'], x['inactive']) for x in infos if 'inactive' in x]
        entries_inactive.sort(key=lambda x: x[0])  # first sort by name
        entries_inactive.sort(key=lambda x: x[1], reverse=True) # then sort by inactive year (more recently first)
        entries_inactive = ['{} ({})'.format(*x) for x in entries_inactive]
@@ -517,7 +449,7 @@ def generate_statistics():

    # get all languages together
    languages = []
-    for info in infois:
+    for info in infos:
        if field in info:
            languages.extend(info[field])

@@ -533,16 +465,16 @@ def generate_statistics():
    field = 'code license'

    # those without license
-    number_no_license = sum(1 for x in infois if field not in x)
+    number_no_license = sum(1 for x in infos if field not in x)
    if number_no_license > 0:
        statistics += 'Without license tag: {} ({:.1f}%)\n\n'.format(number_no_license, rel(number_no_license))
-        entries_no_license = [x['title'] for x in infois if field not in x]
+        entries_no_license = [x['title'] for x in infos if field not in x]
        entries_no_license.sort()
        statistics += ', '.join(entries_no_license) + '\n\n'

    # get all licenses together
    licenses = []
-    for info in infois:
+    for info in infos:
        if field in info:
            licenses.extend(info[field])

@@ -559,7 +491,7 @@ def generate_statistics():

    # get all keywords together
    keywords = []
-    for info in infois:
+    for info in infos:
        if field in info:
            keywords.extend(info[field])

@@ -574,7 +506,7 @@ def generate_statistics():
    statistics += '## Entries without download or play fields\n\n'

    entries = []
-    for info in infois:
+    for info in infos:
        if 'download' not in info and 'play' not in info:
            entries.append(info['title'])
    entries.sort()
@@ -586,7 +518,7 @@ def generate_statistics():

    entries = []
    field = 'code repository'
-    for info in infois:
+    for info in infos:
        if field in info:
            popular = False
            for repo in info[field]:
@@ -607,11 +539,11 @@ def generate_statistics():

    # get all build systems together
    build_systems = []
-    for info in infois:
+    for info in infos:
        if field in info:
            build_systems.extend(info[field])

-    statistics += 'Build systems information available for {:.1f}% of all projects.\n\n'.format(len(build_systems) / len(infois) * 100)
+    statistics += 'Build systems information available for {:.1f}% of all projects.\n\n'.format(rel(len(build_systems)))

    unique_build_systems = set(build_systems)
    unique_build_systems = [(l, build_systems.count(l) / len(build_systems)) for l in unique_build_systems]
@@ -622,7 +554,7 @@ def generate_statistics():

    # C, C++ projects without build system information
    c_cpp_project_without_build_system = []
-    for info in infois:
+    for info in infos:
        if field not in info and ('C' in info['code language'] or 'C++' in info['code language']):
            c_cpp_project_without_build_system.append(info['title'])
    c_cpp_project_without_build_system.sort()
@@ -630,7 +562,7 @@ def generate_statistics():

    # C, C++ projects with build system information but without CMake as build system
    c_cpp_project_not_cmake = []
-    for info in infois:
+    for info in infos:
        if field in info and 'CMake' in info[field] and ('C' in info['code language'] or 'C++' in info['code language']):
            c_cpp_project_not_cmake.append(info['title'])
    c_cpp_project_not_cmake.sort()
@@ -642,11 +574,11 @@ def generate_statistics():

    # get all platforms together
    platforms = []
-    for info in infois:
+    for info in infos:
        if field in info:
            platforms.extend(info[field])

-    statistics += 'Platform information available for {:.1f}% of all projects.\n\n'.format(len(platforms) / len(infois) * 100)
+    statistics += 'Platform information available for {:.1f}% of all projects.\n\n'.format(rel(len(platforms)))

    unique_platforms = set(platforms)
    unique_platforms = [(l, platforms.count(l) / len(platforms)) for l in unique_platforms]
@@ -655,25 +587,27 @@ def generate_statistics():
    unique_platforms = ['- {} ({:.1f}%)'.format(x[0], x[1]*100) for x in unique_platforms]
    statistics += '##### Platforms frequency\n\n' + '\n'.join(unique_platforms) + '\n\n'

-    with open(statistics_path, mode='w', encoding='utf-8') as f:
-        f.write(statistics)
+    # write to statistics file
+    write_text(statistics_file, statistics)


-def export_json():
+def export_json(infos):
    """
    Parses all entries, collects interesting info and stores it in a json file suitable for displaying
    with a dynamic table in a browser.
    """

+    print('export to json for web display')
+
    # make database out of it
-    db = {'headings': ['Game', 'Description', 'Download', 'Category', 'State', 'Keywords', 'Source']}
+    db = {'headings': ['Game', 'Description', 'Download', 'State', 'Keywords', 'Source']}

    entries = []
-    for info in infos.values():
+    for info in infos:

        # game & description
        entry = ['{} (<a href="{}">home</a>, <a href="{}">entry</a>)'.format(info['title'], info['home'][0],
-            r'https://github.com/Trilarion/opensourcegames/blob/master/games/' + info['path']),
+            r'https://github.com/Trilarion/opensourcegames/blob/master/games/' + info['file']),
            textwrap.shorten(info['description'], width=60, placeholder='..')]

        # download
@@ -683,9 +617,6 @@ def export_json():
        else:
            entry.append('')

-        # category
-        entry.append(info['category'])
-
        # state (field state is essential)
        entry.append('{} / {}'.format(info['state'][0], 'inactive since {}'.format(info['inactive']) if 'inactive' in info else 'active'))

@@ -780,13 +711,13 @@ def bzr_repo(repo):
    return None


-def update_primary_code_repositories():
+def export_primary_code_repositories_json():

    primary_repos = {'git':[],'svn':[],'hg':[],'bzr':[]}
    unconsumed_entries = []

    # for every entry filter those that are known git repositories (add additional repositories)
-    for info in infos.values():
+    for info in infos:
        field = 'code repository-raw'
        # if field 'Code repository' is available
        if field in info:
@@ -837,36 +768,37 @@ def update_primary_code_repositories():
        primary_repos[k] = sorted(set(v))

    # write them to tools/git
-    json_path = os.path.join(games_path, os.path.pardir, 'tools', 'archives.json')
+    json_path = os.path.join(root_path, 'tools', 'archives.json')
    text = json.dumps(primary_repos, indent=1)
    write_text(json_path, text)

+
 if __name__ == "__main__":

    # paths
-    games_path = os.path.realpath(os.path.join(os.path.dirname(__file__), os.path.pardir, 'games'))
-    readme_file = os.path.realpath(os.path.join(games_path, os.pardir, 'README.md'))
+    root_path  = os.path.realpath(os.path.join(os.path.dirname(__file__), os.path.pardir))
+    games_path = os.path.join(root_path, 'games')
+
+    # check for unfilled template lines
+    check_template_leftovers()
+
+    # fix keywords
+    fix_keywords()

    # assemble info
    infos = assemble_infos()

-    # recount and wriite to readme and to tocs
+    # recount and write to readme and to tocs
    update_readme_and_tocs(infos)

    # generate report
-    #generate_statistics()
+    update_statistics(infos)

    # update database for html table
-    #export_json()
+    export_json(infos)

-    # check for unfilled template lines
-    #check_template_leftovers()
-
-    # fix keywords
-    # fix_keywords()
+    # collect list of primary code repositories
+    export_primary_code_repositories_json()

    # check external links (only rarely)
    # check_validity_external_links()
-
-    # collect list of primary code repositories
-    #update_primary_code_repositories()