diff --git a/code/maintenance.py b/code/maintenance.py
index 4ce42351..76c81694 100644
--- a/code/maintenance.py
+++ b/code/maintenance.py
@@ -1,497 +1,10 @@
-"""
-    Runs a series of maintenance operations on the collection of entry files, updating the table of content files for
-    each category as well as creating a statistics file.
-
-    Counts the number of records each sub-folder and updates the overview.
-    Sorts the entries in the contents files of each sub folder alphabetically.
-
-    This script runs with Python 3, it could also with Python 2 with some minor tweaks probably.
-"""
-
-import requests
-import datetime
 import json
 import textwrap
 import os
 import re
 import utils.constants
-from utils import constants as c, utils, osg
-
-
-def extract_links():
-    """
-    Parses all entries and extracts http(s) links from them
-    """
-
-    # regex for finding urls (can be in <> or in ]() or after a whitespace
-    regex = re.compile(r"[\s\n]<(http.+?)>|]\((http.+?)\)|[\s\n](http[^\s\n,]+?)[\s\n,]")
-
-    # iterate over all entries
-    urls = set()
-    for _, _, content in entry_iterator():
-
-        # apply regex
-        matches = regex.findall(content)
-
-        # for each match
-        for match in matches:
-
-            # for each possible clause
-            for url in match:
-
-                # if there was something (and not a sourceforge git url)
-                if url:
-                    urls.add(url)
-    urls = sorted(list(urls), key=str.casefold)
-    return urls
-
-
-def check_validity_external_links():
-    """
-    Checks all external links it can find for validity. Prints those with non OK HTTP responses. Does only need to be run
-    from time to time.
-    """
-
-    # regex for finding urls (can be in <> or in ]() or after a whitespace
-    regex = re.compile(r"[\s\n]<(http.+?)>|\]\((http.+?)\)|[\s\n](http[^\s\n,]+?)[\s\n\)]")
-
-    # ignore the following patterns (they give false positives here)
-    ignored_urls = ('https://git.tukaani.org/xz.git', 'https://git.code.sf.net/', 'http://hg.hedgewars.org/hedgewars/',
-                    'https://git.xiph.org/vorbis.git', 'http://svn.uktrainsim.com/svn/openrails',
-                    'https://www.srb2.org/', 'http://wiki.srb2.org/')
-
-    # some do redirect, but we nedertheless want the original URL in the database
-    redirect_okay = ('https://octaforge.org/', 'https://svn.openttd.org/', 'https://godotengine.org/download')
-
-    # extract all links from entries
-    import urllib3
-    urllib3.disable_warnings()  # otherwise we cannot verify those with SSL errors without getting warnings
-    urls = {}
-    for entry, _, content in osg.entry_iterator():
-        # apply regex
-        matches = regex.findall(content)
-        # for each match
-        for match in matches:
-            for url in match:
-                if url and not any((url.startswith(x) for x in ignored_urls)):
-                    # ignore bzr.sourceforge, no web address found
-                    if 'bzr.sourceforge.net/bzrroot/' in url:
-                        continue
-
-                    # add "/" at the end
-                    if any((url.startswith(x) for x in ('https://anongit.freedesktop.org/git',
-                                                        'https://git.savannah.gnu.org/git/',
-                                                        'https://git.savannah.nongnu.org/git/',
-                                                        'https://git.artsoft.org/'))):
-                        url += '/'
-
-                    if url.startswith('https://bitbucket.org/') and url.endswith('.git'):
-                        url = url[:-4] + '/commits/'
-                    if url.startswith('https://svn.code.sf.net/p/'):
-                        url = 'http' + url[5:] + '/'
-                    if url.startswith('http://cvs.savannah.nongnu.org:/sources/'):
-                        url = 'http://cvs.savannah.nongnu.org/viewvc/' + url[40:] + '/'
-                    if url.startswith('http://cvs.savannah.gnu.org:/sources/'):
-                        url = 'http://cvs.savannah.gnu.org/viewvc/' + url[37:] + '/'
-
-                    # generally ".git" at the end is not working well, except sometimes
-                    if url.endswith('.git') and not any((url.startswith(x) for x
-                                                         in ('https://repo.or.cz', 'https://git.tuxfamily.org/fanwor/fanwor'))):
-                        url = url[:-4]
-
-                    if url in urls:
-                        urls[url].add(entry)
-                    else:
-                        urls[url] = {entry}
-    print('found {} unique links'.format(len(urls)))
-    print("start checking external links (can take a while)")
-
-    # now iterate over all urls
-    for url, names in urls.items():
-        names = list(names)  # was a set
-        if len(names) == 1:
-            names = names[0]
-        try:
-            verify = True
-            # some have an expired certificate but otherwise still work
-            if any((url.startswith(x) for x in ('https://perso.b2b2c.ca/~sarrazip/dev/', 'https://dreerally.com/',
-                                                'https://henlin.net/', 'https://www.megamek.org/',
-                                                'https://pixeldoctrine.com/', 'https://gitorious.org/',
-                                                'https://www.opmon-game.ga/'))):
-                verify = False
-            r = requests.head(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64)'}, timeout=20, allow_redirects=True, verify=verify)
-            if r.status_code == 405:  # head method not supported, try get
-                r = requests.get(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64)'}, timeout=20, allow_redirects=True, verify=verify)
-            # check for bad status
-            if r.status_code != requests.codes.ok:
-                print('{}: {} - {}'.format(names, url, r.status_code))
-            # check for redirect
-            if r.history and url not in redirect_okay:
-                # only / added or http->https sometimes
-                redirected_url = r.url
-                if redirected_url == url + '/':
-                    output = '{}: {} -> {} - redirect "/" at end '
-                elif redirected_url == 'https' + url[4:]:
-                    output = '{}: {} -> {} - redirect "https" at start'
-                else:
-                    output = '{}: {} -> {} - redirect '
-                print(output.format(names, url, redirected_url))
-        except Exception as e:
-            error_name = type(e).__name__
-            if error_name == 'SSLError' and any((url.startswith(x) for x in ('https://gitorious.org/', 'https://www.freedroid.org/download/'))):
-                continue  # even though verify is False, these errors still get through
-            print('{}: {} - exception {}'.format(names, url, error_name))
-
-
-def fix_entries():
-    """
-    Fixes the keywords, code dependencies, build systems, .. entries, mostly by automatically sorting them.
- """ - - keyword_synonyms = {'RTS': ('real time', 'strategy'), 'realtime': 'real time'} - - # TODO also sort other fields, only read once and then do all, move to separate file - # example Javascript to JavaScript and then add whenever the known languages check hits - - print('fix entries') - - # keywords - regex = re.compile(r"(.*)- Keywords:([^\n]*)(.*)", re.DOTALL) - - # iterate over all entries - for entry, entry_path, content in osg.entry_iterator(): - - # match with regex - matches = regex.findall(content) - if len(matches) != 1: - raise RuntimeError('Could not find keywords in entry "{}"'.format(entry)) - - match = matches[0] - - # get elements out, split, strip, delete duplicates - elements = match[1].split(',') - elements = [x.strip() for x in elements] - elements = list(set(elements)) - - # get category out - for keyword in utils.constants.recommended_keywords: - if keyword in elements: - elements.remove(keyword) - category = keyword - break - - # special treatments here - elements = [x if x != 'TBS' and x != 'TB' else 'turn based' for x in elements] - elements = [x if x != 'RTS' else 'real time' for x in elements] - elements = [x if x != 'MMO' else 'massive multiplayer online' for x in elements] - elements = [x if x != 'MMO' else 'multiplayer online' for x in elements] - elements = [x if x != 'SP' else 'singleplayer' for x in elements] - elements = [x if x != 'MP' else 'multiplayer' for x in elements] - elements = [x if x != 'engine' else 'game engine' for x in elements] - elements = [x if x != 'rpg' else 'role playing' for x in elements] - elements = [x if x != 'turn based' else 'turn-based' for x in elements] - for keyword in ('browser', 'misc', 'tools'): - if keyword in elements: - elements.remove(keyword) - - # sort - elements.sort(key=str.casefold) - - # add category - elements.insert(0, category) - - keywords = '- Keywords: {}'.format(', '.join(elements)) - - new_content = match[0] + keywords + match[2] - - if new_content != content: - # write again - utils.write_text(entry_path, new_content) - - # code dependencies - regex = re.compile(r"(.*)- Code dependencies:([^\n]*)(.*)", re.DOTALL) - - # iterate over all entries - for entry, entry_path, content in osg.entry_iterator(): - # match with regex - matches = regex.findall(content) - - if not matches: - # no code dependencies given - continue - - match = matches[0] - - # get code dependencies out, split, strip, delete duplicates - elements = match[1].split(',') - elements = [x.strip() for x in elements] - elements = list(set(elements)) - - # special treatments here - elements = [x if x != 'Blender' else 'Blender game engine' for x in elements] - elements = [x if x.lower() != 'libgdx' else 'libGDX' for x in elements] - elements = [x if x != 'SDL 2' else 'SDL2' for x in elements] - elements = [x if x.lower() != "ren'py" else "Ren'Py" for x in elements] - - # sort - elements.sort(key=str.casefold) - - code_dependencies = '- Code dependencies: {}'.format(', '.join(elements)) - - new_content = match[0] + code_dependencies + match[2] - - if new_content != content: - # write again - utils.write_text(entry_path, new_content) - - # build systems - regex = re.compile(r"(.*)- Build system:([^\n]*)(.*)", re.DOTALL) - - # iterate over all entries - for entry, entry_path, content in osg.entry_iterator(): - # match with regex - matches = regex.findall(content) - - if not matches: - # no build system given - continue - - match = matches[0] - - # get code dependencies out, split, strip, delete duplicates - elements = match[1].split(',') - 
-        elements = [x.strip() for x in elements]
-        elements = list(set(elements))
-
-        # special treatments here
-
-        # sort
-        elements.sort(key=str.casefold)
-
-        build_system = '- Build system: {}'.format(', '.join(elements))
-
-        new_content = match[0] + build_system + match[2]
-
-        if new_content != content:
-            # write again
-            utils.write_text(entry_path, new_content)
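
The deleted fix_entries maps keyword synonyms with one list comprehension per synonym. A table-driven sketch of the same normalization (hypothetical helper, not code from this repository):

    # hypothetical sketch of the synonym normalization done above
    KEYWORD_SYNONYMS = {
        'TBS': 'turn-based', 'TB': 'turn-based', 'RTS': 'real time',
        'MMO': 'massive multiplayer online', 'SP': 'singleplayer',
        'MP': 'multiplayer', 'engine': 'game engine', 'rpg': 'role playing',
    }

    def canonicalize_keywords(elements):
        # map known synonyms, drop duplicates and sort case-insensitively
        elements = [KEYWORD_SYNONYMS.get(x, x) for x in elements]
        return sorted(set(elements), key=str.casefold)

    print(canonicalize_keywords(['RTS', 'rpg', 'RTS']))  # ['real time', 'role playing']
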
-
-
-def update_statistics(infos):
-    """
-    Generates the statistics page.
-
-    Should be done every time the entries change.
-    """
-
-    print('update statistics')
-
-    # start the page
-    statistics_file = os.path.join(c.root_path, 'statistics.md')
-    statistics = '[comment]: # (autogenerated content, do not edit)\n# Statistics\n\n'
-
-    # total number
-    number_entries = len(infos)
-    rel = lambda x: x / number_entries * 100  # conversion to percent
-
-    statistics += 'analyzed {} entries on {}\n\n'.format(number_entries,
-                                                         datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
-
-    # State (beta, mature, inactive)
-    statistics += '## State\n\n'
-
-    number_state_beta = sum(1 for x in infos if 'beta' in x['state'])
-    number_state_mature = sum(1 for x in infos if 'mature' in x['state'])
-    number_inactive = sum(1 for x in infos if 'inactive' in x)
-    statistics += '- mature: {} ({:.1f}%)\n- beta: {} ({:.1f}%)\n- inactive: {} ({:.1f}%)\n\n'.format(
-        number_state_mature, rel(number_state_mature), number_state_beta, rel(number_state_beta), number_inactive,
-        rel(number_inactive))
-
-    if number_inactive > 0:
-        entries_inactive = [(x['Name'], x['inactive']) for x in infos if 'inactive' in x]
-        entries_inactive.sort(key=lambda x: str.casefold(x[0]))  # first sort by name
-        entries_inactive.sort(key=lambda x: x[1], reverse=True)  # then sort by inactive year (more recently first)
-        entries_inactive = ['{} ({})'.format(*x) for x in entries_inactive]
-        statistics += '##### Inactive State\n\n' + ', '.join(entries_inactive) + '\n\n'
-
-    # Language
-    statistics += '## Code Languages\n\n'
-    field = 'code language'
-
-    # those without language tag
-    # TODO the language tag is now an essential field, this cannot happen anymore
-    # number_no_language = sum(1 for x in infois if field not in x)
-    # if number_no_language > 0:
-    #     statistics += 'Without language tag: {} ({:.1f}%)\n\n'.format(number_no_language, rel(number_no_language))
-    #     entries_no_language = [x['Name'] for x in infois if field not in x]
-    #     entries_no_language.sort()
-    #     statistics += ', '.join(entries_no_language) + '\n\n'
-
-    # get all languages together
-    languages = []
-    for info in infos:
-        if field in info:
-            languages.extend(info[field])
-
-    unique_languages = set(languages)
-    unique_languages = [(l, languages.count(l) / len(languages)) for l in unique_languages]
-    unique_languages.sort(key=lambda x: str.casefold(x[0]))  # first sort by name
-    unique_languages.sort(key=lambda x: x[1], reverse=True)  # then sort by occurrence (highest occurrence first)
-    unique_languages = ['- {} ({:.1f}%)\n'.format(x[0], x[1] * 100) for x in unique_languages]
-    statistics += '##### Language frequency\n\n' + ''.join(unique_languages) + '\n'
-
-    # Licenses
-    statistics += '## Code licenses\n\n'
-    field = 'code license'
-
-    # those without license
-    number_no_license = sum(1 for x in infos if field not in x)
-    if number_no_license > 0:
-        statistics += 'Without license tag: {} ({:.1f}%)\n\n'.format(number_no_license, rel(number_no_license))
-        entries_no_license = [x['Name'] for x in infos if field not in x]
-        entries_no_license.sort()
-        statistics += ', '.join(entries_no_license) + '\n\n'
-
-    # get all licenses together
-    licenses = []
-    for info in infos:
-        if field in info:
-            licenses.extend(info[field])
-
-    unique_licenses = set(licenses)
-    unique_licenses = [(l, licenses.count(l) / len(licenses)) for l in unique_licenses]
-    unique_licenses.sort(key=lambda x: str.casefold(x[0]))  # first sort by name
-    unique_licenses.sort(key=lambda x: -x[1])  # then sort by occurrence (highest occurrence first)
-    unique_licenses = ['- {} ({:.1f}%)\n'.format(x[0], x[1] * 100) for x in unique_licenses]
-    statistics += '##### Licenses frequency\n\n' + ''.join(unique_licenses) + '\n'
-
-    # Keywords
-    statistics += '## Keywords\n\n'
-    field = 'keywords'
-
-    # get all keywords together
-    keywords = []
-    for info in infos:
-        if field in info:
-            keywords.extend(info[field])
-    # reduce those starting with "inspired by"
-    keywords = [x if not x.startswith('inspired by') else 'inspired' for x in keywords]
-    # reduce those starting with "multiplayer"
-    keywords = [x if not x.startswith('multiplayer') else 'multiplayer' for x in keywords]
-
-    unique_keywords = set(keywords)
-    unique_keywords = [(l, keywords.count(l) / len(keywords)) for l in unique_keywords]
-    unique_keywords.sort(key=lambda x: str.casefold(x[0]))  # first sort by name
-    unique_keywords.sort(key=lambda x: -x[1])  # then sort by occurrence (highest occurrence first)
-    unique_keywords = ['- {} ({:.1f}%)'.format(x[0], x[1] * 100) for x in unique_keywords]
-    statistics += '##### Keywords frequency\n\n' + '\n'.join(unique_keywords) + '\n\n'
-
-    # no download or play field
-    statistics += '## Entries without download or play fields\n\n'
-
-    entries = []
-    for info in infos:
-        if 'download' not in info and 'play' not in info:
-            entries.append(info['Name'])
-    entries.sort(key=str.casefold)
-    statistics += '{}: '.format(len(entries)) + ', '.join(entries) + '\n\n'
-
-    # code hosted not on github, gitlab, bitbucket, launchpad, sourceforge
-    popular_code_repositories = ('github.com', 'gitlab.com', 'bitbucket.org', 'code.sf.net', 'code.launchpad.net')
-    statistics += '## Entries with a code repository not on a popular site\n\n'
-
-    entries = []
-    field = 'code repository'
-    for info in infos:
-        if field in info:
-            popular = False
-            for repo in info[field]:
-                for popular_repo in popular_code_repositories:
-                    if popular_repo in repo:
-                        popular = True
-                        break
-            # if there were repositories, but none popular, add them to the list
-            if not popular:
-                entries.append(info['Name'])
-                # print(info[field])
-    entries.sort(key=str.casefold)
-    statistics += '{}: '.format(len(entries)) + ', '.join(entries) + '\n\n'
-
-    # Code dependencies
-    statistics += '## Code dependencies\n\n'
-    field = 'code dependencies'
-
-    # get all code dependencies together
-    code_dependencies = []
-    entries_with_code_dependency = 0
-    for info in infos:
-        if field in info:
-            code_dependencies.extend(info[field])
-            entries_with_code_dependency += 1
-    statistics += 'With code dependency field {} ({:.1f}%)\n\n'.format(entries_with_code_dependency,
-                                                                       rel(entries_with_code_dependency))
-
-    unique_code_dependencies = set(code_dependencies)
-    unique_code_dependencies = [(l, code_dependencies.count(l) / len(code_dependencies)) for l in
-                                unique_code_dependencies]
-    unique_code_dependencies.sort(key=lambda x: str.casefold(x[0]))  # first sort by name
-    unique_code_dependencies.sort(key=lambda x: -x[1])  # then sort by occurrence (highest occurrence first)
-    unique_code_dependencies = ['- {} ({:.1f}%)'.format(x[0], x[1] * 100) for x in unique_code_dependencies]
-    statistics += '##### Code dependencies frequency\n\n' + '\n'.join(unique_code_dependencies) + '\n\n'
-
-    # Build systems:
-    statistics += '## Build systems\n\n'
-    field = 'build system'
-
-    # get all build systems together
-    build_systems = []
-    for info in infos:
-        if field in info:
-            build_systems.extend(info[field])
-
-    statistics += 'Build systems information available for {:.1f}% of all projects.\n\n'.format(rel(len(build_systems)))
-
-    unique_build_systems = set(build_systems)
-    unique_build_systems = [(l, build_systems.count(l) / len(build_systems)) for l in unique_build_systems]
-    unique_build_systems.sort(key=lambda x: str.casefold(x[0]))  # first sort by name
-    unique_build_systems.sort(key=lambda x: -x[1])  # then sort by occurrence (highest occurrence first)
-    unique_build_systems = ['- {} ({:.1f}%)'.format(x[0], x[1] * 100) for x in unique_build_systems]
-    statistics += '##### Build systems frequency ({})\n\n'.format(len(build_systems)) + '\n'.join(
-        unique_build_systems) + '\n\n'
-
-    # C, C++ projects without build system information
-    c_cpp_project_without_build_system = []
-    for info in infos:
-        if field not in info and ('C' in info['code language'] or 'C++' in info['code language']):
-            c_cpp_project_without_build_system.append(info['Name'])
-    c_cpp_project_without_build_system.sort(key=str.casefold)
-    statistics += '##### C and C++ projects without build system information ({})\n\n'.format(
-        len(c_cpp_project_without_build_system)) + ', '.join(c_cpp_project_without_build_system) + '\n\n'
-
-    # C, C++ projects with build system information but without CMake as build system
-    c_cpp_project_not_cmake = []
-    for info in infos:
-        if field in info and 'CMake' in info[field] and (
-                'C' in info['code language'] or 'C++' in info['code language']):
-            c_cpp_project_not_cmake.append(info['Name'])
-    c_cpp_project_not_cmake.sort(key=str.casefold)
-    statistics += '##### C and C++ projects with a build system different from CMake ({})\n\n'.format(
-        len(c_cpp_project_not_cmake)) + ', '.join(c_cpp_project_not_cmake) + '\n\n'
-
-    # Platform
-    statistics += '## Platform\n\n'
-    field = 'platform'
-
-    # get all platforms together
-    platforms = []
-    for info in infos:
-        if field in info:
-            platforms.extend(info[field])
-
-    statistics += 'Platform information available for {:.1f}% of all projects.\n\n'.format(rel(len(platforms)))
-
-    unique_platforms = set(platforms)
-    unique_platforms = [(l, platforms.count(l) / len(platforms)) for l in unique_platforms]
-    unique_platforms.sort(key=lambda x: str.casefold(x[0]))  # first sort by name
-    unique_platforms.sort(key=lambda x: -x[1])  # then sort by occurrence (highest occurrence first)
-    unique_platforms = ['- {} ({:.1f}%)'.format(x[0], x[1] * 100) for x in unique_platforms]
-    statistics += '##### Platforms frequency\n\n' + '\n'.join(unique_platforms) + '\n\n'
-
-    # write to statistics file
-    utils.write_text(statistics_file, statistics)
+from utils import constants as c, utils
 
 
 def export_json(infos):
@@ -720,18 +233,6 @@ def export_git_code_repositories_json():
     utils.write_text(json_path, text)
 
 
- """ - text = utils.read_text(file) - text = text.split('\n') - text = sorted(list(set(text)), key=str.casefold) - print('{} contains {} items'.format(name, len(text))) - text = '\n'.join(text) - utils.write_text(file, text) - - def check_validity_backlog(): import requests @@ -792,36 +293,6 @@ def check_code_dependencies(infos): print('{} ({})'.format(*dep)) -if __name__ == "__main__": - - check_validity_backlog() - - # fix entries - fix_entries() - - # recount and write to readme and to tocs - update_readme_and_tocs(infos) - - # generate report - update_statistics(infos) - - # update database for html table - export_json(infos) - - # collect list of primary code repositories - export_primary_code_repositories_json(infos) - - # check code dependencies - check_code_dependencies(infos) - - # collect list of git code repositories (only one per project) for git_statistics script - export_git_code_repositories_json() - - # check external links (only rarely) - # check_validity_external_links() - - # sort rejected games list file - sort_text_file(os.path.join(c.root_path, 'code', 'rejected.txt'), 'rejected games list') diff --git a/code/maintenance_entries.py b/code/maintenance_entries.py new file mode 100644 index 00000000..4ccb235d --- /dev/null +++ b/code/maintenance_entries.py @@ -0,0 +1,581 @@ +""" + Runs a series of maintenance operations on the collection of entry files, updating the table of content files for + each category as well as creating a statistics file. + + Counts the number of records each sub-folder and updates the overview. + Sorts the entries in the contents files of each sub folder alphabetically. +""" + +import os +import re +import datetime +from utils import osg, osg_ui, utils, constants as c +import requests + + +def create_toc(title, file, entries): + """ + + """ + # file path + toc_file = os.path.join(c.tocs_path, file) + + # header line + text = '[comment]: # (autogenerated content, do not edit)\n# {}\n\n'.format(title) + + # assemble rows + rows = [] + for entry in entries: + info = entry['Code language'] + entry['Code license'] + entry['State'] + info = [x.value for x in info] + rows.append('- **[{}]({})** ({})'.format(entry['Title'], '../' + entry['File'], ', '.join(info))) + + # sort rows (by title) + rows.sort(key=str.casefold) + + # add to text + text += '\n'.join(rows) + + # write to toc file + utils.write_text(toc_file, text) + + print('Readme and TOCs updated') + + +def sort_text_file(file, name): + """ + Reads a text file, splits in lines, removes duplicates, sort, writes back. + """ + text = utils.read_text(file) + text = text.split('\n') + text = sorted(list(set(text)), key=str.casefold) + print('{} contains {} items'.format(name, len(text))) + text = '\n'.join(text) + utils.write_text(file, text) + + +class EntriesMaintainer: + + def __init__(self): + self.entries = None + + def read_entries(self): + self.entries = osg.read_entries() + print('{} entries read'.format(len(self.entries))) + + def write_entries(self): + if not self.entries: + print('entries not yet loaded') + return + osg.write_entries(self.entries) + print('entries written') + + + def check_template_leftovers(self): + """ + Checks for template leftovers. + Should be run only occasionally. 
+ """ + # load template and get all lines + text = utils.read_text(os.path.join(c.root_path, 'template.md')) + text = text.split('\n') + check_strings = [x for x in text if x and not x.startswith('##')] + + # iterate over all entries + for _, entry_path, content in osg.entry_iterator(): + + for check_string in check_strings: + if content.find(check_string) >= 0: + print('{}: found {}'.format(os.path.basename(entry_path), check_string)) + print('checked for template leftovers') + + def clean_rejected(self): + """ + + :return: + """ + # sort rejected games list file + sort_text_file(os.path.join(c.root_path, 'code', 'rejected.txt'), 'rejected games list') + + def clean_backlog(self): + """ + + :return: + """ + if not self.entries: + print('entries not yet loaded') + return + # get urls from entries + included_urls = osg.all_urls(self.entries) + included_urls = list(included_urls.keys()) # only need the URLs here + + # get urls from rejected file + text = utils.read_text(c.rejected_file) + regex = re.compile(r"\((http.*?)\)", re.MULTILINE) + matches = regex.findall(text) + rejected_urls = [] + for match in matches: + urls = match.split(',') + urls = [x.strip() for x in urls] + rejected_urls.extend(urls) + included_urls.extend(rejected_urls) + + # those that only have a web archive version, also get the original version + more_urls = [] + for url in included_urls: + if url.startswith('https://web.archive.org/web'): + # print(url) # sometimes the http is missing in archive links (would need proper parsing) + url = url[url.index('http', 5):] + more_urls.append(url) + included_urls.extend(more_urls) + + # now we strip the urls + stripped_urls = [utils.strip_url(x) for x in included_urls] + stripped_urls = set(stripped_urls) # removes duplicates for performance + + # read backlog and get urls from there + text = utils.read_text(c.backlog_file) + text = text.split('\n') + + # remove those that are in stripped_game_urls + text = [x for x in text if utils.strip_url(x) not in stripped_urls] + + # remove duplicates and sort + text = sorted(list(set(text)), key=str.casefold) + print('backlog contains {} items'.format(len(text))) + + # join and save again + text = '\n'.join(text) + utils.write_text(c.backlog_file, text) + + print('backlog cleaned') + + def check_external_links(self): + """ + Checks all external links it can find for validity. Prints those with non OK HTTP responses. Does only need to be run + from time to time. 
+ """ + + # regex for finding urls (can be in <> or in ]() or after a whitespace + regex = re.compile(r"[\s\n]<(http.+?)>|\]\((http.+?)\)|[\s\n](http[^\s\n,]+?)[\s\n\)]") + + # ignore the following patterns (they give false positives here) + ignored_urls = ( + 'https://git.tukaani.org/xz.git', 'https://git.code.sf.net/', 'http://hg.hedgewars.org/hedgewars/', + 'https://git.xiph.org/vorbis.git', 'http://svn.uktrainsim.com/svn/openrails', 'https://www.srb2.org/', + 'http://wiki.srb2.org/') + + # some do redirect, but we nedertheless want the original URL in the database + redirect_okay = ('https://octaforge.org/', 'https://svn.openttd.org/', 'https://godotengine.org/download') + + # extract all links from entries + import urllib3 + urllib3.disable_warnings() # otherwise we cannot verify those with SSL errors without getting warnings + urls = {} + for entry, _, content in osg.entry_iterator(): + # apply regex + matches = regex.findall(content) + # for each match + for match in matches: + for url in match: + if url and not any((url.startswith(x) for x in ignored_urls)): + # ignore bzr.sourceforge, no web address found + if 'bzr.sourceforge.net/bzrroot/' in url: + continue + + # add "/" at the end + if any((url.startswith(x) for x in ( + 'https://anongit.freedesktop.org/git', 'https://git.savannah.gnu.org/git/', + 'https://git.savannah.nongnu.org/git/', 'https://git.artsoft.org/'))): + url += '/' + + if url.startswith('https://bitbucket.org/') and url.endswith('.git'): + url = url[:-4] + '/commits/' + if url.startswith('https://svn.code.sf.net/p/'): + url = 'http' + url[5:] + '/' + if url.startswith('http://cvs.savannah.nongnu.org:/sources/'): + url = 'http://cvs.savannah.nongnu.org/viewvc/' + url[40:] + '/' + if url.startswith('http://cvs.savannah.gnu.org:/sources/'): + url = 'http://cvs.savannah.gnu.org/viewvc/' + url[37:] + '/' + + # generally ".git" at the end is not working well, except sometimes + if url.endswith('.git') and not any((url.startswith(x) for x in ( + 'https://repo.or.cz', 'https://git.tuxfamily.org/fanwor/fanwor'))): + url = url[:-4] + + if url in urls: + urls[url].add(entry) + else: + urls[url] = {entry} + print('found {} unique links'.format(len(urls))) + print("start checking external links (can take a while)") + + # now iterate over all urls + for url, names in urls.items(): + names = list(names) # was a set + if len(names) == 1: + names = names[0] + try: + verify = True + # some have an expired certificate but otherwise still work + if any((url.startswith(x) for x in ( + 'https://perso.b2b2c.ca/~sarrazip/dev/', 'https://dreerally.com/', 'https://henlin.net/', + 'https://www.megamek.org/', 'https://pixeldoctrine.com/', 'https://gitorious.org/', + 'https://www.opmon-game.ga/'))): + verify = False + r = requests.head(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64)'}, timeout=20, + allow_redirects=True, verify=verify) + if r.status_code == 405: # head method not supported, try get + r = requests.get(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64)'}, + timeout=20, allow_redirects=True, verify=verify) + # check for bad status + if r.status_code != requests.codes.ok: + print('{}: {} - {}'.format(names, url, r.status_code)) + # check for redirect + if r.history and url not in redirect_okay: + # only / added or http->https sometimes + redirected_url = r.url + if redirected_url == url + '/': + output = '{}: {} -> {} - redirect "/" at end ' + elif redirected_url == 'https' + url[4:]: + output = '{}: {} -> {} - redirect "https" at start' + else: 
+                        output = '{}: {} -> {} - redirect '
+                    print(output.format(names, url, redirected_url))
+            except Exception as e:
+                error_name = type(e).__name__
+                if error_name == 'SSLError' and any((url.startswith(x) for x in (
+                        'https://gitorious.org/', 'https://www.freedroid.org/download/'))):
+                    continue  # even though verify is False, these errors still get through
+                print('{}: {} - exception {}'.format(names, url, error_name))
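
The URL regex above has one alternative per link form (<...>, ](...) and bare), so findall returns 3-tuples with exactly one non-empty slot; a small demonstration on made-up input:

    import re

    regex = re.compile(r"[\s\n]<(http.+?)>|\]\((http.+?)\)|[\s\n](http[^\s\n,]+?)[\s\n\)]")
    sample = 'See <https://example.org> and [docs](https://example.org/docs) or https://example.com/a \n'
    for match in regex.findall(sample):
        print([url for url in match if url])
    # ['https://example.org'] / ['https://example.org/docs'] / ['https://example.com/a']
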
+
+    def update_readme_tocs(self):
+        """
+        Recounts entries in sub categories and writes them to the readme.
+        Also updates the _toc files in the categories directories.
+
+        Note: The Readme must have a specific structure at the beginning, starting with "# Open Source Games" and ending
+        on "A collection.."
+
+        Needs to be performed regularly.
+        """
+
+        # completely delete content of toc path
+        for file in os.listdir(c.tocs_path):
+            os.remove(os.path.join(c.tocs_path, file))
+
+        # read readme
+        readme_file = os.path.join(c.root_path, 'README.md')
+        readme_text = utils.read_text(readme_file)
+
+        # compile regex for identifying the building blocks in the readme
+        regex = re.compile(r"(.*?)(\[comment\]: # \(start.*?end of autogenerated content\))(.*)", re.DOTALL)
+
+        # apply regex
+        matches = regex.findall(readme_text)
+        if len(matches) != 1:
+            raise RuntimeError('readme file has invalid structure')
+        matches = matches[0]
+        start = matches[0]
+        end = matches[2]
+
+        tocs_text = ''
+
+        # split into games, tools, frameworks, libraries
+        games = [x for x in self.entries if not any([y in x['Keywords'] for y in ('tool', 'framework', 'library')])]
+        tools = [x for x in self.entries if 'tool' in x['Keywords']]
+        frameworks = [x for x in self.entries if 'framework' in x['Keywords']]
+        libraries = [x for x in self.entries if 'library' in x['Keywords']]
+
+        # create games, tools, frameworks, libraries tocs
+        title = 'Games'
+        file = '_games.md'
+        tocs_text += '**[{}](entries/tocs/{}#{})** ({}) - '.format(title, file, title, len(games))
+        create_toc(title, file, games)
+
+        title = 'Tools'
+        file = '_tools.md'
+        tocs_text += '**[{}](entries/tocs/{}#{})** ({}) - '.format(title, file, title, len(tools))
+        create_toc(title, file, tools)
+
+        title = 'Frameworks'
+        file = '_frameworks.md'
+        tocs_text += '**[{}](entries/tocs/{}#{})** ({}) - '.format(title, file, title, len(frameworks))
+        create_toc(title, file, frameworks)
+
+        title = 'Libraries'
+        file = '_libraries.md'
+        tocs_text += '**[{}](entries/tocs/{}#{})** ({})\n'.format(title, file, title, len(libraries))
+        create_toc(title, file, libraries)
+
+        # create by category
+        categories_text = []
+        for keyword in c.recommended_keywords:
+            filtered = [x for x in self.entries if keyword in x['Keywords']]
+            title = keyword.capitalize()
+            name = keyword.replace(' ', '-')
+            file = '_{}.md'.format(name)
+            categories_text.append('**[{}](entries/tocs/{}#{})** ({})'.format(title, file, name, len(filtered)))
+            create_toc(title, file, filtered)
+        categories_text.sort()
+        tocs_text += '\nBy category: {}\n'.format(', '.join(categories_text))
+
+        # create by platform
+        platforms_text = []
+        for platform in c.valid_platforms:
+            filtered = [x for x in self.entries if platform in x.get('Platform', [])]
+            title = platform
+            name = platform.lower()
+            file = '_{}.md'.format(name)
+            platforms_text.append('**[{}](entries/tocs/{}#{})** ({})'.format(title, file, name, len(filtered)))
+            create_toc(title, file, filtered)
+        tocs_text += '\nBy platform: {}\n'.format(', '.join(platforms_text))
+
+        # insert new text in the middle (the \n before the second comment is necessary, otherwise Markdown displays it as part of the bullet list)
+        text = start + "[comment]: # (start of autogenerated content, do not edit)\n" + tocs_text + "\n[comment]: # (end of autogenerated content)" + end
+
+        # write to readme
+        utils.write_text(readme_file, text)
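
update_readme_tocs requires exactly one autogenerated block in the readme; a sketch of how the regex splits a minimal, made-up readme into the parts before and after that block:

    import re

    regex = re.compile(r"(.*?)(\[comment\]: # \(start.*?end of autogenerated content\))(.*)", re.DOTALL)
    readme = ('# Open Source Games\n'
              '[comment]: # (start of autogenerated content, do not edit)\n'
              'old tocs\n'
              '[comment]: # (end of autogenerated content)\n'
              'A collection...')
    start, _, end = regex.findall(readme)[0]
    print(repr(start), repr(end))  # '# Open Source Games\n' '\nA collection...'
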
+
+    def update_statistics(self):
+        """
+        Generates the statistics page.
+
+        Should be done every time the entries change.
+        """
+        if not self.entries:
+            print('entries not yet loaded')
+            return
+
+        # start the page
+        statistics = '[comment]: # (autogenerated content, do not edit)\n# Statistics\n\n'
+
+        # total number
+        number_entries = len(self.entries)
+        rel = lambda x: x / number_entries * 100  # conversion to percent
+
+        statistics += 'analyzed {} entries on {}\n\n'.format(number_entries,
+                                                             datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
+
+        # State (beta, mature, inactive)
+        statistics += '## State\n\n'
+
+        number_state_beta = sum(1 for x in self.entries if 'beta' in x['State'])
+        number_state_mature = sum(1 for x in self.entries if 'mature' in x['State'])
+        number_inactive = sum(1 for x in self.entries if osg.is_inactive(x))
+        statistics += '- mature: {} ({:.1f}%)\n- beta: {} ({:.1f}%)\n- inactive: {} ({:.1f}%)\n\n'.format(
+            number_state_mature, rel(number_state_mature), number_state_beta, rel(number_state_beta), number_inactive,
+            rel(number_inactive))
+
+        if number_inactive > 0:
+            entries_inactive = [(x['Title'], osg.extract_inactive_year(x)) for x in self.entries if osg.is_inactive(x)]
+            entries_inactive.sort(key=lambda x: str.casefold(x[0]))  # first sort by name
+            entries_inactive.sort(key=lambda x: x[1], reverse=True)  # then sort by inactive year (more recently first)
+            entries_inactive = ['{} ({})'.format(*x) for x in entries_inactive]
+            statistics += '##### Inactive State\n\n' + ', '.join(entries_inactive) + '\n\n'
+
+        # Language
+        statistics += '## Code Languages\n\n'
+        field = 'Code language'
+
+        # get all languages together
+        languages = []
+        for entry in self.entries:
+            languages.extend(entry[field])
+        languages = [x.value for x in languages]
+
+        unique_languages = set(languages)
+        unique_languages = [(l, languages.count(l) / len(languages)) for l in unique_languages]
+        unique_languages.sort(key=lambda x: str.casefold(x[0]))  # first sort by name
+        unique_languages.sort(key=lambda x: x[1], reverse=True)  # then sort by occurrence (highest occurrence first)
+        unique_languages = ['- {} ({:.1f}%)\n'.format(x[0], x[1] * 100) for x in unique_languages]
+        statistics += '##### Language frequency\n\n' + ''.join(unique_languages) + '\n'
+
+        # Licenses
+        statistics += '## Code licenses\n\n'
+        field = 'Code license'
+
+        # get all licenses together
+        licenses = []
+        for entry in self.entries:
+            licenses.extend(entry[field])
+        licenses = [x.value for x in licenses]
+
+        unique_licenses = set(licenses)
+        unique_licenses = [(l, licenses.count(l) / len(licenses)) for l in unique_licenses]
+        unique_licenses.sort(key=lambda x: str.casefold(x[0]))  # first sort by name
+        unique_licenses.sort(key=lambda x: -x[1])  # then sort by occurrence (highest occurrence first)
+        unique_licenses = ['- {} ({:.1f}%)\n'.format(x[0], x[1] * 100) for x in unique_licenses]
+        statistics += '##### Licenses frequency\n\n' + ''.join(unique_licenses) + '\n'
+
+        # Keywords
+        statistics += '## Keywords\n\n'
+        field = 'Keywords'
+
+        # get all keywords together
+        keywords = []
+        for entry in self.entries:
+            keywords.extend(entry[field])
+        keywords = [x.value for x in keywords]
+
+        # reduce those starting with "multiplayer"
+        keywords = [x if not x.startswith('multiplayer') else 'multiplayer' for x in keywords]
+
+        unique_keywords = set(keywords)
+        unique_keywords = [(l, keywords.count(l) / len(keywords)) for l in unique_keywords]
+        unique_keywords.sort(key=lambda x: str.casefold(x[0]))  # first sort by name
+        unique_keywords.sort(key=lambda x: -x[1])  # then sort by occurrence (highest occurrence first)
+        unique_keywords = ['- {} ({:.1f}%)'.format(x[0], x[1] * 100) for x in unique_keywords]
+        statistics += '##### Keywords frequency\n\n' + '\n'.join(unique_keywords) + '\n\n'
+
+        # no download or play field
+        statistics += '## Entries without download or play fields\n\n'
+
+        entries = []
+        for entry in self.entries:
+            if 'Download' not in entry and 'Play' not in entry:
+                entries.append(entry['Title'])
+        entries.sort(key=str.casefold)
+        statistics += '{}: '.format(len(entries)) + ', '.join(entries) + '\n\n'
+
+        # code hosted not on github, gitlab, bitbucket, launchpad, sourceforge
+        popular_code_repositories = ('github.com', 'gitlab.com', 'bitbucket.org', 'code.sf.net', 'code.launchpad.net')
+        statistics += '## Entries with a code repository not on a popular site\n\n'
+
+        entries = []
+        field = 'Code repository'
+        for entry in self.entries:
+            popular = False
+            for repo in entry[field]:
+                for popular_repo in popular_code_repositories:
+                    if popular_repo in repo.value:
+                        popular = True
+                        break
+            # if there were repositories, but none popular, add them to the list
+            if not popular:
+                entries.append(entry['Title'])
+                # print(entry[field])
+        entries.sort(key=str.casefold)
+        statistics += '{}: '.format(len(entries)) + ', '.join(entries) + '\n\n'
+
+        # Code dependencies
+        statistics += '## Code dependencies\n\n'
+        field = 'Code dependencies'
+
+        # get all code dependencies together
+        code_dependencies = []
+        entries_with_code_dependency = 0
+        for entry in self.entries:
+            if field in entry:
+                code_dependencies.extend(entry[field])
+                entries_with_code_dependency += 1
+        code_dependencies = [x.value for x in code_dependencies]
+        statistics += 'With code dependency field {} ({:.1f}%)\n\n'.format(entries_with_code_dependency,
+                                                                           rel(entries_with_code_dependency))
+
+        unique_code_dependencies = set(code_dependencies)
+        unique_code_dependencies = [(l, code_dependencies.count(l) / len(code_dependencies)) for l in
+                                    unique_code_dependencies]
+        unique_code_dependencies.sort(key=lambda x: str.casefold(x[0]))  # first sort by name
+        unique_code_dependencies.sort(key=lambda x: -x[1])  # then sort by occurrence (highest occurrence first)
+        unique_code_dependencies = ['- {} ({:.1f}%)'.format(x[0], x[1] * 100) for x in unique_code_dependencies]
+        statistics += '##### Code dependencies frequency\n\n' + '\n'.join(unique_code_dependencies) + '\n\n'
+
+        # Build systems:
+        statistics += '## Build systems\n\n'
+        field = 'Build system'
+
+        # get all build systems together
+        build_systems = []
+        for entry in self.entries:
+            if field in entry['Building']:
+                build_systems.extend(entry['Building'][field])
+        build_systems = [x.value for x in build_systems]
+
+        statistics += 'Build systems information available for {:.1f}% of all projects.\n\n'.format(
+            rel(len(build_systems)))
+
+        unique_build_systems = set(build_systems)
+        unique_build_systems = [(l, build_systems.count(l) / len(build_systems)) for l in unique_build_systems]
+        unique_build_systems.sort(key=lambda x: str.casefold(x[0]))  # first sort by name
+        unique_build_systems.sort(key=lambda x: -x[1])  # then sort by occurrence (highest occurrence first)
+        unique_build_systems = ['- {} ({:.1f}%)'.format(x[0], x[1] * 100) for x in unique_build_systems]
+        statistics += '##### Build systems frequency ({})\n\n'.format(len(build_systems)) + '\n'.join(
+            unique_build_systems) + '\n\n'
+
+        # C, C++ projects without build system information
+        c_cpp_project_without_build_system = []
+        for entry in self.entries:
+            if field not in entry['Building'] and ('C' in entry['Code language'] or 'C++' in entry['Code language']):
+                c_cpp_project_without_build_system.append(entry['Title'])
+        c_cpp_project_without_build_system.sort(key=str.casefold)
+        statistics += '##### C and C++ projects without build system information ({})\n\n'.format(
+            len(c_cpp_project_without_build_system)) + ', '.join(c_cpp_project_without_build_system) + '\n\n'
+
+        # C, C++ projects with build system information but without CMake as build system
+        c_cpp_project_not_cmake = []
+        for entry in self.entries:
+            if field in entry['Building'] and 'CMake' not in entry['Building'][field] and (
+                    'C' in entry['Code language'] or 'C++' in entry['Code language']):
+                c_cpp_project_not_cmake.append(entry['Title'])
+        c_cpp_project_not_cmake.sort(key=str.casefold)
+        statistics += '##### C and C++ projects with a build system different from CMake ({})\n\n'.format(
+            len(c_cpp_project_not_cmake)) + ', '.join(c_cpp_project_not_cmake) + '\n\n'
+
+        # Platform
+        statistics += '## Platform\n\n'
+        field = 'Platform'
+
+        # get all platforms together
+        platforms = []
+        for entry in self.entries:
+            if field in entry:
+                platforms.extend(entry[field])
+        platforms = [x.value for x in platforms]
+
+        statistics += 'Platform information available for {:.1f}% of all projects.\n\n'.format(rel(len(platforms)))
+
+        unique_platforms = set(platforms)
+        unique_platforms = [(l, platforms.count(l) / len(platforms)) for l in unique_platforms]
+        unique_platforms.sort(key=lambda x: str.casefold(x[0]))  # first sort by name
+        unique_platforms.sort(key=lambda x: -x[1])  # then sort by occurrence (highest occurrence first)
+        unique_platforms = ['- {} ({:.1f}%)'.format(x[0], x[1] * 100) for x in unique_platforms]
+        statistics += '##### Platforms frequency\n\n' + '\n'.join(unique_platforms) + '\n\n'
+
+        # write to statistics file
+        utils.write_text(c.statistics_file, statistics)
+
+        print('statistics updated')
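
Each frequency table above repeats the same count, normalize and two-pass sort; collections.Counter expresses that pattern more compactly (a sketch, not the repository's code):

    from collections import Counter

    def frequency_lines(values):
        # count, sort by name first, then by descending frequency (stable sort)
        counts = Counter(values)
        total = sum(counts.values())
        items = sorted(counts.items(), key=lambda x: str.casefold(x[0]))
        items.sort(key=lambda x: -x[1])
        return ['- {} ({:.1f}%)'.format(name, count / total * 100) for name, count in items]

    print(frequency_lines(['C++', 'C', 'C++', 'Python']))
    # ['- C++ (50.0%)', '- C (25.0%)', '- Python (25.0%)']
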
+
+    def update_html(self):
+        pass
+
+    def update_repos(self):
+        pass
+
+    def complete_run(self):
+        pass
+
+
+if __name__ == "__main__":
+
+    m = EntriesMaintainer()
+
+    actions = {
+        'Read entries': m.read_entries,
+        'Write entries': m.write_entries,
+        'Check template leftovers': m.check_template_leftovers,
+        'Check rejected entries': m.clean_rejected,
+        'Check external links (takes quite long)': m.check_external_links,
+        'Clean backlog': m.clean_backlog,
+        'Update Readme and TOCs': m.update_readme_tocs,
+        'Update statistics': m.update_statistics,
+        'Update HTML': m.update_html,
+        'Update repository list': m.update_repos,
+        'Complete run': m.complete_run
+    }
+
+    osg_ui.run_simple_button_app('Entries developer', actions)
diff --git a/code/utils/constants.py b/code/utils/constants.py
index 8396459c..3529434d 100644
--- a/code/utils/constants.py
+++ b/code/utils/constants.py
@@ -16,6 +16,7 @@
 developer_file = os.path.join(root_path, 'developers.md')
 backlog_file = os.path.join(code_path, 'backlog.txt')
 rejected_file = os.path.join(code_path, 'rejected.txt')
+statistics_file = os.path.join(root_path, 'statistics.md')
 
 # local config
 local_config_file = os.path.join(root_path, 'local-config.ini')
diff --git a/code/utils/osg.py b/code/utils/osg.py
index 8ad8bfbb..46bf2cbe 100644
--- a/code/utils/osg.py
+++ b/code/utils/osg.py
@@ -370,11 +370,16 @@ def check_and_process_entry(entry):
     return entry
 
 
+def is_inactive(entry):
+    state = entry['State']
+    phrase = 'inactive since '
+    return any(x.startswith(phrase) for x in state)
+
+
 def extract_inactive_year(entry):
     state = entry['State']
     phrase = 'inactive since '
-    inactive_year = [x[len(phrase):] for x in state if x.startswith(phrase)]
+    inactive_year = [x.value[len(phrase):] for x in state if x.startswith(phrase)]
     assert len(inactive_year) <= 1
     if inactive_year:
        return inactive_year[0]
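
The contract of the two helpers, assuming plain string state values for the example (in the repository they are value objects, hence the .value access in extract_inactive_year):

    def is_inactive(entry):
        return any(x.startswith('inactive since ') for x in entry['State'])

    entry = {'State': ['mature', 'inactive since 2013']}
    print(is_inactive(entry))  # True
    print([x[len('inactive since '):] for x in entry['State'] if x.startswith('inactive since ')])  # ['2013']
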
diff --git a/entries/enigma.md b/entries/enigma.md
index 6f5065a9..6ed29d24 100644
--- a/entries/enigma.md
+++ b/entries/enigma.md
@@ -13,7 +13,6 @@
 - Code dependencies: SDL
 
 Puzzle game based on Oxyd.
-Inspired by Oxyd.
 
 ## Building
diff --git a/entries/iron_seed.md b/entries/iron_seed.md
index 876a653b..930c3946 100644
--- a/entries/iron_seed.md
+++ b/entries/iron_seed.md
@@ -5,7 +5,7 @@
 - Inspirations: Iron Seed
 - State: mature, inactive since 2013
 - Download: https://web.archive.org/web/20150802151352/http://www.ironseed.com/ironseed-v1.20.0016-2013-03-17.zip
-- Keywords: remake, inspired by Iron Seed
+- Keywords: remake
 - Code repository: @see-download
 - Code language: Pascal
 - Code license: GPL-3.0 (not with the source code)