rename fields to improve understanding, adapting maintenance.py (intermediate)

Trilarion
2018-06-07 17:44:44 +02:00
parent 381a62784a
commit 6070bd5360
417 changed files with 1607 additions and 1606 deletions

maintenance.py

@@ -2,8 +2,8 @@
Runs a series of maintenance operations on the collection of entry files, updating the table of content files for
each category as well as creating a statistics file.
Counts the number of records each subfolder and updates the overview. Sorts the entries in the contents files of
each sub folder alphabetically.
Counts the number of records in each sub-folder and updates the overview.
Sorts the entries in the contents files of each sub-folder alphabetically.
This script runs with Python 3; it could probably also run with Python 2 with some minor tweaks.
"""
@@ -14,19 +14,19 @@ import urllib.request
import http.client
import datetime
def get_category_paths():
"""
Returns all sub folders of the games path.
"""
return [os.path.join(games_path, x) for x in os.listdir(games_path) if os.path.isdir(os.path.join(games_path, x))]
TOC = '_toc.md'
def get_entry_paths(category_path):
"""
Returns all files of a category path, except for '_toc.md'.
"""
return [os.path.join(category_path, x) for x in os.listdir(category_path) if x != '_toc.md' and os.path.isfile(os.path.join(category_path, x))]
def read_first_line_from_file(file):
def read_text(file):
"""
Reads a whole text file (UTF-8 encoded).
"""
with open(file, mode='r', encoding='utf-8') as f:
text = f.read()
return text
def read_first_line(file):
"""
Convenience function; we really only need the first line of a category overview.
"""
@@ -34,16 +34,42 @@ def read_first_line_from_file(file):
line = f.readline()
return line
def read_interesting_info_from_file(file):
def write_text(file, text):
"""
Parses a file for some interesting fields and concatenates the content. To be displayed after the game name in the
category overview.
Writes a whole text file (UTF-8 encoded).
"""
with open(file, mode='w', encoding='utf-8') as f:
f.write(text)
def get_category_paths():
"""
Returns all sub folders of the games path.
"""
return [os.path.join(games_path, x) for x in os.listdir(games_path) if os.path.isdir(os.path.join(games_path, x))]
def get_entry_paths(category_path):
"""
Returns all files of a category path, except for '_toc.md'.
"""
return [os.path.join(category_path, x) for x in os.listdir(category_path) if x != TOC and os.path.isfile(os.path.join(category_path, x))]
def extract_overview_for_toc(file):
"""
Parses a file for some interesting fields and concatenates the content.
To be displayed after the game name in the category TOCs.
"""
with open(file, mode='r', encoding='utf-8') as f:
text = f.read()
output = [None, None, None]
# TODO unify this with the statistics
# language
regex = re.compile(r"- Language\(s\): (.*)")
matches = regex.findall(text)
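
As an illustration of the field extraction above, a minimal sketch of what the language regex returns for an invented entry line (the sample text is made up; the later steps of the function are not shown in this hunk):

    import re

    sample = "- Language(s): C++ (mostly), Lua\n"
    matches = re.findall(r"- Language\(s\): (.*)", sample)
    # matches == ['C++ (mostly), Lua']
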
@@ -71,19 +97,25 @@ def read_interesting_info_from_file(file):
def update_readme():
"""
Recounts entries in sub categories and writes them to the readme. Needs to be performed regularly.
Recounts entries in sub categories and writes them to the readme.
Also updates the _toc files in the categories directories.
Note: The Readme must have a specific structure at the beginning, starting with "# Open Source Games" and ending
on "A collection.."
Needs to be performed regularly.
"""
print('update readme file')
# read readme
with open(readme_path, mode='r', encoding='utf-8') as f:
readme_text = f.read()
readme_text = read_text(readme_file)
# compile regex for identifying the building blocks
regex = re.compile(r"(# Open Source Games\n\n)(.*)(\nA collection.*)", re.DOTALL)
# apply regex
matches = regex.findall(readme_text)
assert len(matches) == 1
matches = matches[0]
start = matches[0]
end = matches[2]
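
To illustrate how this regex splits the readme into its building blocks, a small sketch with an invented README snippet (re.DOTALL lets the middle group span multiple lines):

    import re

    sample = ("# Open Source Games\n\n"
              "old autogenerated list\n"
              "\nA collection of open source games.\n")
    regex = re.compile(r"(# Open Source Games\n\n)(.*)(\nA collection.*)", re.DOTALL)
    start, middle, end = regex.findall(sample)[0]
    # start == '# Open Source Games\n\n'
    # middle == 'old autogenerated list\n'   (the part that gets regenerated)
    # end == '\nA collection of open source games.\n'
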
@@ -91,35 +123,37 @@ def update_readme():
# get sub folders
category_paths = get_category_paths()
# get number of files (minus 1) in each sub folder
n = [len(os.listdir(path)) - 1 for path in category_paths]
# assemble paths
paths = [os.path.join(path, '_toc.md') for path in category_paths]
toc_paths = [os.path.join(path, TOC) for path in category_paths]
# get titles (discarding first two ("# ") and last ("\n") characters)
titles = [read_first_line_from_file(path)[2:-1] for path in paths]
category_titles = [read_first_line(path)[2:-1] for path in toc_paths]
# get number of files (minus 1 for the already existing TOC file) in each sub folder
n_entries = [len(os.listdir(path)) - 1 for path in category_paths]
# combine titles, category names, numbers in one list
info = zip(titles, [os.path.basename(path) for path in category_paths], n)
info = zip(category_titles, [os.path.basename(path) for path in category_paths], n_entries)
# sort according to sub category title (should be unique)
info = sorted(info, key=lambda x:x[0])
# assemble output
update = ['- **[{}](games/{}/_toc.md)** ({})\n'.format(*entry) for entry in info]
update = "{} entries\n".format(sum(n)) + "".join(update)
update = ['- **[{}](games/{}/{})** ({})\n'.format(entry[0], entry[1], TOC, entry[2]) for entry in info]
update = "{} entries\n".format(sum(n_entries)) + "".join(update)
# insert new text in the middle
text = start + "[comment]: # (start of autogenerated content, do not edit)\n" + update + "\n[comment]: # (end of autogenerated content)" + end
text = start + "[comment]: # (start of autogenerated content, do not edit)\n" + update + "[comment]: # (end of autogenerated content)" + end
# write to readme
with open(readme_path, mode='w', encoding='utf-8') as f:
f.write(text)
write_text(readme_file, text)
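
For orientation, one line of the generated category list would look roughly like this (category name and count are invented):

    TOC = '_toc.md'
    line = '- **[{}](games/{}/{})** ({})\n'.format('Action', 'action', TOC, 42)
    # line == '- **[Action](games/action/_toc.md)** (42)\n'
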
def update_category_tocs():
"""
Lists all entries in all sub folders and generates the list in the toc file. Needs to be performed regularly.
Lists all entries in all sub folders and generates the list in the toc file.
Needs to be performed regularly.
"""
# get category paths
category_paths = get_category_paths()
@@ -129,19 +163,19 @@ def update_category_tocs():
print('generate toc for {}'.format(os.path.basename(category_path)))
# read toc header line
toc_file = os.path.join(category_path, '_toc.md')
toc_header = read_first_line_from_file(toc_file)
toc_file = os.path.join(category_path, TOC)
toc_header = read_first_line(toc_file) # stays as is
# get paths of all entries in this category
entry_paths = get_entry_paths(category_path)
# get titles (discarding first two ("# ") and last ("\n") characters)
titles = [read_first_line_from_file(path)[2:-1] for path in entry_paths]
titles = [read_first_line(path)[2:-1] for path in entry_paths]
# get more interesting info
more = [read_interesting_info_from_file(path) for path in entry_paths]
more = [extract_overview_for_toc(path) for path in entry_paths]
# combine name and file name
# combine name, file name and more info
info = zip(titles, [os.path.basename(path) for path in entry_paths], more)
# sort according to entry title (should be unique)
@@ -151,13 +185,14 @@ def update_category_tocs():
update = ['- **[{}]({})** ({})\n'.format(*entry) for entry in info]
update = "".join(update)
# combine toc header
text = toc_header + '\n' + "[comment]: # (start of autogenerated content, do not edit)\n" + update + "\n[comment]: # (end of autogenerated content)"
# combine with toc header
text = toc_header + '\n' + "[comment]: # (start of autogenerated content, do not edit)\n" + update + "[comment]: # (end of autogenerated content)"
# write to toc file
with open(toc_file, mode='w', encoding='utf-8') as f:
f.write(text)
def check_validity_external_links():
"""
Checks all external links it can find for validity. Prints those with non-OK HTTP responses. Only needs to be run
@@ -212,71 +247,18 @@ def check_validity_external_links():
print("{} links checked".format(number_checked_links))
def fix_notation():
"""
Changes notation, quite special. Only run when needed.
"""
regex = re.compile(r"- License details:(.*)")
# get category paths
category_paths = get_category_paths()
# for each category
for category_path in category_paths:
# get paths of all entries in this category
entry_paths = get_entry_paths(category_path)
for entry_path in entry_paths:
# read it line by line
with open(entry_path, 'r', 'utf-8') as f:
content = f.readlines()
# apply regex on every line
matched_lines = [regex.findall(line) for line in content]
# loop over all the lines
for line, match in enumerate(matched_lines):
if match:
match = match[0]
# patch content
content[line] = "- Code license details:{}\n".format(match)
# write it line by line
with open(entry_path, "w", 'utf-8') as f:
f.writelines(content)
def regular_replacements():
"""
Replacing some stuff by shortcuts. Can be run regularly
"""
# get category paths
category_paths = get_category_paths()
# for each category
for category_path in category_paths:
# get paths of all entries in this category
entry_paths = get_entry_paths(category_path)
for entry_path in entry_paths:
# read it line by line
with open(entry_path, 'r', 'utf-8') as f:
content = f.read()
# now the replacements
content = content.replace('?source=navbar', '') # sourceforge specific
content = content.replace('single player', 'SP')
content = content.replace('multi player', 'MP')
# write it line by line
with open(entry_path, "w", 'utf-8') as f:
f.write(content)
def check_template_leftovers():
"""
Checks for template leftovers.
Should be run only occasionally.
"""
check_strings = ['# {NAME}', '_{One line description}_', '- Home: {URL}', '- Media: {URL}', '- Download: {URL}', '- State: beta, mature, inactive since', '- Keywords: SP, MP, RTS, TBS (if none, remove the line)', '- Code: primary repository (type if not git), other repositories (type if not git)', '- Language(s): {XX}', '- License: {XX} (if special, include link)', '{XXX}']
# load template and get all lines
text = read_text(os.path.join(games_path, 'template.md'))
text = text.split('\n')
check_strings = [x for x in text if x and not x.startswith('##')]
# get category paths
category_paths = get_category_paths()
@@ -288,13 +270,13 @@ def check_template_leftovers():
for entry_path in entry_paths:
# read it line by line
with open(entry_path, 'r', 'utf-8') as f:
content = f.read()
content = read_text(entry_path)
for check_string in check_strings:
if content.find(check_string) >= 0:
print('{}: found {}'.format(os.path.basename(entry_path), check_string))
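
Assuming template.md still contains the usual placeholder lines, the derived check_strings would hold entries similar to the removed hard-coded list; a small invented example:

    # hypothetical template contents; the real template.md may differ
    template = "# {NAME}\n\n_{One line description}_\n\n## Description\n\n- Home: {URL}\n"
    check_strings = [x for x in template.split('\n') if x and not x.startswith('##')]
    # check_strings == ['# {NAME}', '_{One line description}_', '- Home: {URL}']
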
def parse_entry(content):
"""
Returns a dictionary of the features of the content
@@ -302,48 +284,67 @@ def parse_entry(content):
info = {}
# state
regex = re.compile(r"- State: (.*)")
# read title
regex = re.compile(r"^# (.*)")
matches = regex.findall(content)
if matches:
# first remove everything in parenthesis
states = re.sub(r'\([^)]*\)', '', matches[0])
states = states.split(',')
states = [x.strip() for x in states]
if 'beta' in states:
info['state'] = 'beta'
elif 'mature' in states:
info['state'] = 'mature'
else:
print('Neither beta nor mature in state tag: {}'.format(content))
inactive = next((int(x[14:]) for x in states if x.startswith('inactive since')), None) # only the year
if inactive:
info['inactive'] = inactive
assert len(matches) == 1
info['title'] = matches[0]
# language
regex = re.compile(r"- Language\(s\): (.*)")
matches = regex.findall(content)
if matches:
# first remove everything in parenthesis
languages = re.sub(r'\([^)]*\)', '', matches[0])
languages = languages.split(',')
languages = [x.strip() for x in languages]
info['language'] = languages
# first read all field names
regex = re.compile(r"- (.*): ")
fields = regex.findall(content)
# license
regex = re.compile(r"- Code license: (.*)")
matches = regex.findall(content)
if matches:
# first remove everything in parenthesis
license = re.sub(r'\([^)]*\)', '', matches[0])
info['license'] = license
# iterate over found field
for field in fields:
regex = re.compile(r"- {}: (.*)".format(field))
matches = regex.findall(content)
assert len(matches) == 1 # every field should only be present once
v = matches[0]
# first store as is
info[field.lower()+'-raw'] = v
# remove parenthesis
v = re.sub(r'\([^)]*\)', '', v)
# split on ','
v = v.split(',')
# finally strip
v = [x.strip() for x in v]
# store in info
info[field.lower()] = v
# checks
# essential fields
essential_fields = ['home', 'state']
for field in essential_fields:
if field not in info:
print('Essential field "{}" missing in entry {}'.format(field, info['title']))
return {}
# state must contain either beta or mature but not both
v = info['state']
if ('beta' in v) == ('mature' in v):
print('State must be one of <beta, mature> in entry {}'.format(info['title']))
return {}
# extract inactive
phrase = 'inactive since '
inactive_year = [x[len(phrase):] for x in info['state'] if x.startswith(phrase)]
if inactive_year:
info['inactive'] = inactive_year
return info
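
A rough example of what parse_entry would return for a small invented entry (field values are illustrative only):

    sample = ("# Some Game\n\n"
              "- Home: https://example.org\n"
              "- State: beta, inactive since 2010\n")
    info = parse_entry(sample)
    # info would roughly be:
    # {'title': 'Some Game',
    #  'home-raw': 'https://example.org', 'home': ['https://example.org'],
    #  'state-raw': 'beta, inactive since 2010', 'state': ['beta', 'inactive since 2010'],
    #  'inactive': ['2010']}
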
def generate_statistics():
"""
Generates the statistics page.
Should be done every time the entries change.
"""
statistics_path = os.path.join(games_path, 'statistics.md')
statistics = '[comment]: # (autogenerated content, do not edit)\n# Statistics\n\n'
@@ -351,31 +352,37 @@ def generate_statistics():
# get category paths
category_paths = get_category_paths()
# for each category
# a database of all important infos about the entries
infos = []
# for each category
for category_path in category_paths:
# get paths of all entries in this category
entry_paths = get_entry_paths(category_path)
for entry_path in entry_paths:
# read it line by line
with open(entry_path, mode='r', encoding='utf-8') as f:
content = f.read()
# read entry
content = read_text(entry_path)
# parse entry
info = parse_entry(content)
# add file information
info['file'] = os.path.basename(entry_path)[:-3] # [:-3] to cut off the .md
# add to list
infos.append(info)
# total number
number_entries = len(infos)
rel = lambda x: x / number_entries * 100 # converion to percent
rel = lambda x: x / number_entries * 100 # conversion to percent
statistics += 'analyzed {} entries on {}\n\n'.format(number_entries, datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
# State (beta, mature, inactive)
statistics += '## State\n\n'
number_state_beta = sum(1 for x in infos if 'state' in x and x['state'] == 'beta')
number_state_mature = sum(1 for x in infos if 'state' in x and x['state'] == 'mature')
number_state_beta = sum(1 for x in infos if 'beta' in x['state'])
number_state_mature = sum(1 for x in infos if 'mature' in x['state'])
number_inactive = sum(1 for x in infos if 'inactive' in x)
statistics += '- mature: {} ({:.1f}%)\n- beta: {} ({:.1f}%)\n- inactive: {} ({:.1f}%)\n\n'.format(number_state_mature, rel(number_state_mature), number_state_beta, rel(number_state_beta), number_inactive, rel(number_inactive))
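
As a quick illustration of the percentage formatting above (counts invented):

    number_entries = 200
    rel = lambda x: x / number_entries * 100  # conversion to percent
    print('- mature: {} ({:.1f}%)'.format(120, rel(120)))
    # prints: - mature: 120 (60.0%)
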
@@ -439,18 +446,17 @@ def generate_statistics():
f.write(statistics)
if __name__ == "__main__":
# paths
games_path = os.path.realpath(os.path.join(os.path.dirname(__file__), os.path.pardir, 'games'))
readme_path = os.path.join(games_path, os.pardir, 'README.md')
readme_file = os.path.realpath(os.path.join(games_path, os.pardir, 'README.md'))
# recount and write to readme
update_readme()
# update_readme()
# generate list in toc files
update_category_tocs()
# update_category_tocs()
# generate report
generate_statistics()
@ -459,10 +465,4 @@ if __name__ == "__main__":
# check_template_leftovers()
# check external links (only rarely)
#check_validity_external_links()
# special, only run when needed
# fix_notation()
# regular replacements
#regular_replacements()
# check_validity_external_links()