update Github information (devs and project stars)

This commit is contained in:
Trilarion
2021-09-27 14:51:43 +02:00
parent 4f151766bb
commit 6b6ca69a88
989 changed files with 10736 additions and 2086 deletions

View File

@ -0,0 +1,381 @@
"""
Imports game details from libregamewiki by scraping the website, starting from https://libregamewiki.org/Category:Games
Also parse rejected games (https://libregamewiki.org/Libregamewiki:Rejected_games_list) and maybe https://libregamewiki.org/Libregamewiki:Suggested_games
Unique left column names in the game info boxes:
['Code license', 'Code licenses', 'Developer', 'Developers', 'Engine', 'Engines', 'Genre', 'Genres', 'Libraries', 'Library', 'Media license', 'Media licenses', 'P. language', 'P. languages', 'Platforms']
TODO there are games on LGW which are not part of the Games category but part of XXX-Games sub-categories, find them
"""
import os
import requests
import json
import re
from bs4 import BeautifulSoup
from utils import constants, utils, osg
def download_lgw_content():
    """
    Downloads the html pages of all games listed in the LGW "Games" category.

    Walks through the (paginated) category listing starting at /Category:Games, collects the links of all listed
    pages, drops those whose title starts with 'User:', 'Template:' or 'Bullet' and stores the html of every
    remaining page as a separate file in the (freshly recreated) lgw-import folder.

    :return: None
    :raises requests.HTTPError: if the server answers any request with an error status
    """

    # parameters
    base_url = 'https://libregamewiki.org'
    destination_path = os.path.join(constants.root_path, 'code', '../lgw-import')
    request_timeout = 30  # seconds, without a timeout a stalled connection would block forever
    ignored_title_prefixes = ('User:', 'Template:', 'Bullet')
    utils.recreate_directory(destination_path)

    # one session for all requests, so the connection to the server is reused
    with requests.Session() as session:

        def fetch(address):
            # error pages must not end up silently among the imported game pages
            response = session.get(address, timeout=request_timeout)
            response.raise_for_status()
            return response.text

        # read and process the base url (get all games and categories)
        url = base_url + '/Category:Games'
        games = []
        while True:
            soup = BeautifulSoup(fetch(url), 'html.parser')
            # categories = soup.find('div', id='mw-subcategories').find_all('li')
            # categories = [(x.a['href'], x.a.string) for x in categories]

            # game pages
            pages = soup.find('div', id='mw-pages').find_all('li')
            games.extend((x.a['href'], x.a.string) for x in pages)

            # next page
            next_page = soup.find('a', string='next page')
            if not next_page:
                break
            url = base_url + next_page['href']

        # remove all those that start with user
        games = [game for game in games if not game[1].startswith(ignored_title_prefixes)]

        print('current number of games in LGW {}'.format(len(games)))

        for href, title in games:
            print(title)
            # href starts with a slash, which is not part of the file name
            destination_file = os.path.join(destination_path, osg.canonical_name(href[1:]) + '.html')
            utils.write_text(destination_file, fetch(base_url + href))
def parse_lgw_content():
    """
    Parses the downloaded LGW pages and stores the extracted information of all games in a single JSON file.

    Every file in the lgw-import folder (except those starting with '_lgw') is read and the title, the external
    links, the meta description, the rows of the game info box, the links of the "Available as package" table and
    the categories are extracted. The list of all entries is written to '_lgw.json' in the same folder.

    :return: None
    :raises RuntimeError: if a page contains more than one package table or more than one category block
    """

    # paths
    import_path = os.path.join(constants.root_path, 'code', '../lgw-import')
    entries_file = os.path.join(import_path, '_lgw.json')

    # external links containing one of these are not stored
    ignored_external_links = ('libregamewiki.org', 'freegamedev.net', 'freegamer.blogspot.com', 'opengameart.org',
                              'gnu.org', 'creativecommons.org', 'freesound.org', 'freecode.com', 'freenode.net')
    # categories that are not stored
    ignored_categories = ['Articles lacking reference', 'Stubs']

    # iterate over all imported files (sorted, because os.listdir gives no guaranteed order)
    entries = []
    for file in sorted(os.listdir(import_path)):
        if file.startswith('_lgw'):
            continue

        text = utils.read_text(os.path.join(import_path, file))

        # parse the html
        soup = BeautifulSoup(text, 'html.parser')
        title = soup.h1.get_text()
        print(title)
        entry = {'name': title}

        # get all external links
        links = [(x['href'], x.get_text()) for x in soup.find_all('a', href=True)]
        links = [x for x in links if x[0].startswith('http') and not any(y in x[0] for y in ignored_external_links)]
        entry['external links'] = links

        # get meta description (the key is always written, empty if the page has no description)
        description = soup.find('meta', attrs={"name": "description"})
        entry['description'] = description.get('content', '') if description else ''

        # parse gameinfobox
        infobox = soup.find('div', class_='gameinfobox')
        if not infobox:
            print(' no gameinfobox')
        else:
            for row in infobox.find_all('tr'):
                if row.th and row.td:
                    # row with header: comma separated values
                    entry[row.th.get_text()] = [part.strip() for part in row.td.get_text().split(',')]
                if not row.th and row.td:
                    # row without header: contribute section, a list of named links
                    for item in row.find_all('li'):
                        if item.a:
                            entry[item.a.string] = item.a['href']

        # parse "for available as package in"
        # (caption.string is None if the caption contains more than a single text node)
        tables = [table for table in soup.find_all('table', class_='wikitable')
                  if table.caption and (table.caption.string or '').startswith('Available as package')]
        if tables:
            if len(tables) > 1:
                raise RuntimeError('more than one package table in {}'.format(file))
            rows = tables[0].find_all('tr')
            # rows without a linked cell (e.g. pure header rows) do not contain a package link
            entry['linux-packages'] = [row.td.a['href'] for row in rows
                                       if row.td and row.td.a and row.td.a.has_attr('href')]

        # categories
        categories = soup.find_all('div', id='mw-normal-catlinks')
        if not categories:
            print(' no categories')
            categories = []
        else:
            if len(categories) > 1:
                raise RuntimeError('more than one category block in {}'.format(file))
            categories = [x.a.string for x in categories[0].find_all('li')]
            if 'Games' not in categories:
                print(' "Games" not in categories')
            else:
                categories.remove('Games')  # should be there
            # strip games at the end
            phrase = ' games'
            categories = [x[:-len(phrase)] if x.endswith(phrase) else x for x in categories]
            categories = [x for x in categories if x not in ignored_categories]
        entry['categories'] = categories

        entries.append(entry)

    # save entries
    text = json.dumps(entries, indent=1)
    utils.write_text(entries_file, text)
def replace_content(entries, fields, replacement, search):
    """
    Replaces values of the given fields by a replacement value.

    :param entries: list of entry dictionaries, modified in place
    :param fields: single field name or tuple of field names
    :param replacement: value that is written instead of every matching value
    :param search: container of values that shall be replaced (tested with "in")
    :return: the entries
    """
    field_names = fields if isinstance(fields, tuple) else (fields, )
    for position, record in enumerate(entries):
        for name in field_names:
            if name not in record:
                continue
            values = record[name]
            if not isinstance(values, list):
                # a single value is treated like a list with one element
                values = [values]
            record[name] = [replacement if value in search else value for value in values]
        entries[position] = record
    return entries
def ignore_content(entries, fields, ignored):
    """
    Removes ignored values from the given fields, a field without remaining values is deleted.

    :param entries: list of entry dictionaries, modified in place
    :param fields: single field name or tuple of field names
    :param ignored: container of values that shall be removed (tested with "in")
    :return: the entries
    """
    field_names = fields if isinstance(fields, tuple) else (fields, )
    for position, record in enumerate(entries):
        for name in field_names:
            if name not in record:
                continue
            values = record[name]
            if not isinstance(values, list):
                values = [values]
            kept = [value for value in values if value not in ignored]
            if kept:
                record[name] = kept
            else:
                # nothing left, the field is dropped completely
                del record[name]
        entries[position] = record
    return entries
def remove_prefix_suffix(entries, fields, prefixes, suffixes):
    """
    Removes prefixes and suffixes from the values of the given fields and strips surrounding whitespace.

    All prefixes are tried one after another (in the given order), then all suffixes, each on the result of the
    previous removal. Whitespace is stripped only once at the very end.

    :param entries: list of entry dictionaries, modified in place
    :param fields: single field name or tuple of field names
    :param prefixes: iterable of strings removed from the start of a value
    :param suffixes: iterable of strings removed from the end of a value
    :return: the entries
    """
    if not isinstance(fields, tuple):
        fields = (fields, )
    for entry in entries:
        for field in fields:
            if field not in entry:
                continue
            content = entry[field]
            if not isinstance(content, list):
                content = [content]
            for prefix in prefixes:
                content = [x[len(prefix):] if x.startswith(prefix) else x for x in content]
            for suffix in suffixes:
                # explicit end index, because x[:-len(suffix)] would be '' for an empty suffix
                content = [x[:len(x) - len(suffix)] if x.endswith(suffix) else x for x in content]
            entry[field] = [x.strip() for x in content]
    return entries
def lower_case_content(entries, field):
    """
    Converts all values of a single field to their case-folded form.

    :param entries: list of entry dictionaries, modified in place
    :param field: name of the field
    :return: the entries
    """
    for position, record in enumerate(entries):
        if field in record:
            values = record[field]
            if not isinstance(values, list):
                values = [values]
            record[field] = [value.casefold() for value in values]
        entries[position] = record
    return entries
def remove_parenthized_content(entries, fields):
    """
    Removes everything within parentheses (including the parentheses) from the values of the given fields.

    The values are stripped afterwards and duplicates are removed. The order of first occurrence is kept, so the
    result is the same on every run.

    :param entries: list of entry dictionaries, modified in place
    :param fields: single field name or tuple of field names
    :return: the entries
    """
    if not isinstance(fields, tuple):
        fields = (fields, )
    for entry in entries:
        for field in fields:
            if field not in entry:
                continue
            content = entry[field]
            if not isinstance(content, list):
                content = [content]
            # remove parentheses content
            content = [re.sub(r'\([^)]*\)', '', c).strip() for c in content]
            # unique values, but unlike list(set(..)) in a stable order
            entry[field] = list(dict.fromkeys(content))
    return entries
def ignore_nonnumbers(entries, fields):
    """
    Keeps only those values of the given fields that consist of digits only.

    Other than in ignore_content, a field is kept (as empty list) if none of its values remains.

    :param entries: list of entry dictionaries, modified in place
    :param fields: single field name or tuple of field names
    :return: the entries
    """
    field_names = fields if isinstance(fields, tuple) else (fields, )
    for position, record in enumerate(entries):
        for name in field_names:
            if name not in record:
                continue
            values = record[name]
            if not isinstance(values, list):
                values = [values]
            record[name] = [value for value in values if value.isdigit()]
        entries[position] = record
    return entries
def clean_lgw_content():
    """
    Cleans the parsed LGW entries and stores the result in a second JSON file.

    Loads '_lgw.json' from the lgw-import folder, renames the keys of the entries, deletes ignored keys, prints the
    unique and the mandatory field names as well as statistics of the field contents before and after the cleaning,
    applies a fixed sequence of content replacements/removals and writes the result to '_lgw.cleaned.json'.

    The order of the content replacements matters, because later steps work on the output of earlier steps.
    """
    # paths
    import_path = os.path.join(constants.root_path, 'code', '../lgw-import')
    entries_file = os.path.join(import_path, '_lgw.json')
    cleaned_entries_file = os.path.join(import_path, '_lgw.cleaned.json')

    # load entries
    text = utils.read_text(entries_file)
    entries = json.loads(text)

    # rename keys (pairs of new key and the old keys it replaces, only the first old key found in an entry is used)
    key_replacements = (('developer', ('Developer', 'Developers')), ('code license', ('Code license', 'Code licenses')), ('engine', ('Engine', 'Engines')), ('genre', ('Genre', 'Genres')),
                        ('library', ('Library', 'Libraries')), ('assets license', ('Media license', 'Media licenses')), ('code language', ('P. language', 'P. languages')), ('home', ('Homepage',)),
                        ('platform', ('Platforms', )), ('tracker', ('Bug/Feature Tracker', )), ('repo', ('Source Code', )), ('forum', ('Forum', )), ('chat', ('Chat', )), ('origin', ('Origin', )),
                        ('dev home', ('Development Project', )), ('last active', ('Release date', )))
    for index, entry in enumerate(entries):
        for new_key, old_keys in key_replacements:
            for key in old_keys:
                if key in entry:
                    entry[new_key] = entry[key]
                    del entry[key]
                    break
        entries[index] = entry

    # ignore keys (these are deleted from all entries, '\xa0' is a non-breaking space)
    ignored_keys = ('origin', 'Latest\xa0release')
    for index, entry in enumerate(entries):
        for key in ignored_keys:
            if key in entry:
                del entry[key]
        entries[index] = entry

    # check for unique field names
    unique_fields = set()
    for entry in entries:
        unique_fields.update(entry.keys())
    print('unique lgw fields: {}'.format(sorted(list(unique_fields))))

    # which fields are mandatory (those that are present in every entry)
    mandatory_fields = unique_fields.copy()
    for entry in entries:
        remove_fields = [field for field in mandatory_fields if field not in entry]
        mandatory_fields -= set(remove_fields)
    print('mandatory lgw fields: {}'.format(sorted(list(mandatory_fields))))

    # statistics before (only for fields not in the exclusion set below)
    # NOTE(review): 'Latest release' is written with a normal space here but with '\xa0' in ignored_keys and
    # 'Release date' has already been renamed to 'last active' above - confirm that the exclusion set is as intended
    print('field contents before')
    fields = sorted(list(unique_fields - {'description', 'external links', 'dev home', 'forum', 'home',
                                          'linux-packages', 'developer', 'chat', 'tracker', 'Latest release', 'name',
                                          'repo', 'Release date', 'categories'}))
    for field in fields:
        content = [entry[field] for entry in entries if field in entry]
        # flatten
        flat_content = []
        for c in content:
            if isinstance(c, list):
                flat_content.extend(c)
            else:
                flat_content.append(c)
        # presumably returns a list of strings (it is joined below) - see utils for the exact format
        statistics = utils.unique_elements_and_occurrences(flat_content)
        print('{}: {}'.format(field, ', '.join(statistics)))

    # content replacements
    entries = remove_parenthized_content(entries, ('assets license', 'code language', 'code license', 'engine', 'genre', 'last active', 'library'))
    entries = remove_prefix_suffix(entries, ('code license', 'assets license'), ('"', 'GNU'), ('"', '[3]', '[2]', '[1]', 'only', ' license'))
    entries = replace_content(entries, ('code license', 'assets license'), 'GPL', ('General Public License', ))
    # NOTE(review): 'GPLv2' is replaced by 'GPL-2.0' here, so the 'GPLv2' in the search values of the next line
    # cannot match anymore; the other spellings become 'GPL-2' (same pattern for 'GPL-3.0'/'GPL-3') - confirm
    entries = replace_content(entries, ('code license', 'assets license'), 'GPL-2.0', ('GPLv2', ))  # for LGW GPLv2 would be the correct writing
    entries = replace_content(entries, ('code license', 'assets license'), 'GPL-2', ('GPLv2', 'GPL v2', 'GPL version 2.0', 'GPL 2.0', 'General Public License v2', 'GPL version 2', 'Gplv2', 'GPL 2'))
    entries = replace_content(entries, ('code license', 'assets license'), 'GPL-2', ('GPL v2 or later', 'GPL 2+', 'GPL v2+', 'GPLv2+', 'GPL version 2 or later'))
    entries = replace_content(entries, ('code license', 'assets license'), 'GPL-3.0', ('GPLv3', ))  # for LGW GPLv3 would be the correct writing
    entries = replace_content(entries, ('code license', 'assets license'), 'GPL-3', ('GPL v3', 'GNU GPL v3', 'GPL 3', 'General Public License 3', 'General Public License v3.0'))
    entries = replace_content(entries, ('code license', 'assets license'), 'GPL-3', ('GPL v3+', 'GPLv3+', 'GPL v.3 or later', 'GPL v3 or later'))
    entries = replace_content(entries, ('code license', 'assets license'), 'AGPL-3.0', ('AGPLv3', 'AGPL', 'Affero General Public License v3.0', 'AGPL v3'))
    entries = replace_content(entries, ('code license', 'assets license'), 'Public domain', ('public domain', 'Public Domain'))
    entries = replace_content(entries, ('code license', 'assets license'), 'zlib', ('zlib/libpng license', 'Zlib License'))
    entries = replace_content(entries, ('code license', 'assets license'), 'BSD', ('Original BSD License', ))
    entries = replace_content(entries, ('code license', 'assets license'), 'CC-BY-SA-3.0', ('Creative Commons Attribution-ShareAlike 3.0 Unported License', 'CC-BY-SA 3.0', 'CC BY-SA 3.0'))
    entries = replace_content(entries, ('code license', 'assets license'), 'CC-BY-SA', ('CC BY-SA',))
    entries = replace_content(entries, ('code license', 'assets license'), 'MIT', ('MIT License', 'MIT"'))
    entries = replace_content(entries, ('assets license', ), 'no media', ('No media', 'no media?'))
    entries = replace_content(entries, 'platform', 'macOS', ('Mac', ))
    entries = remove_prefix_suffix(entries, ('code language', 'developer'), (), ('[3]', '[2]', '[1]'))
    entries = ignore_content(entries, 'code language', ('HTML5', 'HTML', 'English', 'XML', 'WML', 'CSS'))
    entries = replace_content(entries, 'code language', 'Lua', ('lua', 'LUA'))
    entries = remove_prefix_suffix(entries, 'genre', (), ('game', 'games'))
    # genres are case-folded first, therefore the genre search values below are all lower case
    entries = lower_case_content(entries, 'genre')
    entries = replace_content(entries, 'genre', 'platform', ('platformer', ))
    entries = replace_content(entries, 'genre', 'role playing', ('rpg', ))
    entries = replace_content(entries, 'genre', 'first person, shooter', ('fps', ))
    entries = replace_content(entries, 'genre', 'real time, strategy', ('rts',))
    entries = replace_content(entries, 'genre', 'turn based, strategy', ('tbs',))
    entries = ignore_content(entries, 'categories', ('GPL', 'C++', 'C', 'ECMAScript', 'Python', 'Java', 'CC BY-SA', 'Lua', 'LGPL', 'CC-BY', 'BSD', 'MIT', 'Qt', 'SDL', 'OpenGL', 'Pygame', 'PD', 'GLUT', 'Haskell', 'Allegro', 'Ruby', 'Zlib/libpng', 'OpenAL', 'Perl', 'Free Pascal', 'LÖVE', 'HTML5', 'Id Tech 1'))
    entries = replace_content(entries, 'library', 'pygame', ('Pygame', ))
    entries = replace_content(entries, 'library', 'Qt', ('QT', 'Qt4'))
    entries = ignore_content(entries, 'library', ('C++', 'Lua', 'Mozilla Firefox', 'DirectX', 'Boost'))
    # ignore_nonnumbers may leave an empty list, the following ignore_content deletes fields without content
    entries = ignore_nonnumbers(entries, 'last active')
    entries = ignore_content(entries, 'last active', ('2019', ))
    entries = ignore_content(entries, 'platform', ('DOS', ))

    # list for every unique field (same statistics as before the replacements)
    print('\nfield contents after')
    fields = sorted(list(unique_fields - {'description', 'external links', 'dev home', 'forum', 'home',
                                          'linux-packages', 'developer', 'chat', 'tracker', 'Latest release', 'name',
                                          'repo', 'Release date', 'categories'}))
    for field in fields:
        content = [entry[field] for entry in entries if field in entry]
        # flatten
        flat_content = []
        for c in content:
            if isinstance(c, list):
                flat_content.extend(c)
            else:
                flat_content.append(c)
        statistics = utils.unique_elements_and_occurrences(flat_content)
        print('{}: {}\n'.format(field, ', '.join(statistics)))

    # save entries
    text = json.dumps(entries, indent=1)
    utils.write_text(cleaned_entries_file, text)
if __name__ == "__main__":
    # The import consists of three stages (download, parse, clean); uncomment the stage(s) that shall be executed.

    # stage one
    # download_lgw_content()

    # stage two
    # parse_lgw_content()

    # stage three
    # clean_lgw_content()

    # with all stages commented out the block needs at least one statement to be valid Python
    pass

View File

@ -0,0 +1,304 @@
"""
Once data from libregamewiki is imported, synchronize with our database, i.e. identify the entries both have in common,
estimate the differences in the entries both have in common, suggest to add the entries they have not in common to each
other.
unique imported fields: 'assets license', 'categories', 'code language', 'code license', 'developer', 'engine', 'genre', 'library', 'linux-packages', 'name', 'platform'
mandatory imported fields: 'categories', 'name'
Mapping lgw -> ours
assets license -> assets license
categories -> keywords
code language -> code language
code license -> code license
developer -> free text (info)
engine -> code dependencies
genre -> keywords
library -> code dependencies
linux-packages -> free text (info)
name -> name
platform -> platform
TODO also ignore our rejected entries
"""
import json
import os
from utils import constants, utils, osg
# Names of LGW entries (keys) that are replaced by the given name (values) before the names are compared with the
# titles of our entries.
lgw_name_aliases = {'Eat the Whistle': 'Eat The Whistle', 'Scorched 3D': 'Scorched3D',
                    'Blob Wars Episode 1 : Metal Blob Solid': 'Blobwars: Metal Blob Solid',
                    'Adventure': 'Colossal Cave Adventure',
                    'Liquid War 6': 'Liquid War', 'Gusanos': 'GUSANOS', 'Corewars': 'Core War', 'FLARE': 'Flare',
                    'Vitetris': 'vitetris', 'Powder Toy': 'The Powder Toy', 'Asylum': 'SDL Asylum',
                    'Atanks': 'Atomic Tanks', 'HeXon': 'heXon', 'Unnethack': 'UnNetHack',
                    'Nova Pinball': 'NOVA PINBALL', 'Jump n Bump': "Jump'n'Bump",
                    'Blades of Exile': 'Classic Blades of Exile',
                    'Colobot': 'Colobot: Gold Edition', 'Dead Justice': 'Cat Mother Dead Justice',
                    'FreeDink': 'GNU FreeDink', 'FRaBs': 'fRaBs', 'Harmonist': 'Harmonist: Dayoriah Clan Infiltration',
                    'Iris2 3D Client - for Ultima Online': 'Iris2',
                    'Java Classic Role Playing Game': 'jClassicRPG', 'Osgg': 'OldSkool Gravity Game',
                    'PyRacerz': 'pyRacerz', 'Starfighter': 'Project: Starfighter',
                    'TORCS': 'TORCS, The Open Racing Car Simulator', 'Vertigo (game)': 'Vertigo',
                    'XInvaders3D': 'XInvaders 3D', 'LambdaRogue': 'LambdaRogue: The Book of Stars',
                    'Maniadrive': 'ManiaDrive', 'Story of Seasons': "Greentwip's Harvest Moon", 'TinyTris': 'Tiny Tris',
                    'Which Way Is Up': 'Which Way Is Up?', 'CannonSmash': 'Cannon Smash', 'UFO:Alien Invasion': 'UFO: Alien Invasion'}

# Names of LGW entries that are removed from the imported entries before the synchronization.
lgw_ignored_entries = ['Hetris', '8 Kingdoms', 'Antigravitaattori', 'Arena of Honour', 'Arkhart', 'Ascent of Justice',
                       'Balazar III', 'Balder3D', 'Barbie Seahorse Adventures', 'Barrage', 'Gnome Batalla Naval',
                       'Blocks',
                       'Brickshooter', 'Bweakfwu', 'Cheese Boys', 'Clippers', 'Codewars', 'CRAFT: The Vicious Vikings',
                       'DQM', 'EmMines', 'Eskimo-run', 'Farlands', 'Feuerkraft', 'Fight or Perish', 'Flatland', 'Forest patrol',
                       'Flare: Empyrean Campaign', 'Free Reign', 'GalaxyMage',
                       'Gloss', 'GRUB Invaders', 'Howitzer Skirmish', 'Imperium: Sticks', 'Interstate Outlaws',
                       'GNOME Games', 'KDE Games', 'LegacyClone', 'Memonix', 'Ninjapix', 'Neverputt', 'Militia Defense',
                       'Sudoku86', 'Look Around the Corner', 'GPSFish',
                       'Terminal Overload release history', 'Scions of Darkness', 'Sedtris', 'SilChess', 'SSTPong',
                       'Tesseract Trainer', 'TunnelWars', 'The Fortress', 'Tunnel']

# Translation of license names (keys are replaced by values) applied when a new entry is written.
licenses_map = {'GPLv2': 'GPL-2.0', 'GPLv2+': 'GPL-2.0', 'GPLv3': 'GPL-3.0', 'GPLv3+': 'GPL-3.0'}
def compare_sets(a, b, name, limit=None):
    """
    Describes the differences between two collections as text.

    :param limit: 'notus' suppresses the "us" line, 'notthem' suppresses the "them" line, otherwise both are shown
    :param a: first collection (converted to a set if it is not one already)
    :param b: second collection (converted to a set if it is not one already)
    :param name: label that is put in front of every line
    :return: text with one line for the sorted elements only in a ("us" line) and one line for the sorted
             elements only in b ("them" line), empty if there is nothing to show
    """
    first = a if isinstance(a, set) else set(a)
    second = b if isinstance(b, set) else set(b)
    lines = []

    only_first = sorted(first - second)
    if only_first and limit != 'notus':
        lines.append(' {} : us : {}\n'.format(name, ', '.join(only_first)))

    only_second = sorted(second - first)
    if only_second and limit != 'notthem':
        lines.append(' {} : them : {}\n'.format(name, ', '.join(only_second)))

    return ''.join(lines)
if __name__ == "__main__":
    # Synchronization script: loads the cleaned LGW entries, removes ignored entries, applies the name aliases,
    # prints the differences to our entries with the same name and writes new entry files (Markdown) for LGW
    # entries we do not have yet.

    # some parameter
    similarity_threshold = 0.8  # names with a higher similarity are printed as possible matches
    maximal_newly_created_entries = 40  # no more than this number of new entry files is written per run

    # paths
    lgw_import_path = os.path.join(constants.root_path, 'code', '../lgw-import')
    lgw_entries_file = os.path.join(lgw_import_path, '_lgw.cleaned.json')

    # import lgw import
    text = utils.read_text(lgw_entries_file)
    lgw_entries = json.loads(text)

    # eliminate the ignored entries
    _ = [x['name'] for x in lgw_entries if x['name'] in lgw_ignored_entries]  # those that will be ignored
    _ = set(lgw_ignored_entries) - set(_)  # those that shall be ignored minus those that will be ignored
    if _:
        print('Can un-ignore {}'.format(_))
    lgw_entries = [x for x in lgw_entries if x['name'] not in lgw_ignored_entries]

    # perform name and code language replacements
    _ = [x['name'] for x in lgw_entries if x['name'] in lgw_name_aliases.keys()]  # those that will be renamed
    _ = set(lgw_name_aliases.keys()) - set(_)  # those that shall be renamed minus those that will be renamed
    if _:
        print('Can un-rename {}'.format(_))
    for index, lgw_entry in enumerate(lgw_entries):
        if lgw_entry['name'] in lgw_name_aliases:
            lgw_entry['name'] = lgw_name_aliases[lgw_entry['name']]
        if 'code language' in lgw_entry:
            # split combined language values (separated by '/' or 'and') into single languages
            languages = lgw_entry['code language']
            h = []
            for l in languages:
                # NOTE(review): after the first split l is a list, so for the second separator "g in l" is a list
                # membership test and l.split would fail if it were true; also 'and' is matched as substring
                # within words - confirm whether this can happen with the actual data
                for g in ('/', 'and'):
                    if g in l:
                        l = l.split(g)
                        l = [x.strip() for x in l]
                if type(l) == str:
                    l = [l]
                h.extend(l)
            languages = h
            if languages:
                lgw_entry['code language'] = languages
            else:
                del lgw_entry['code language']
        lgw_entries[index] = lgw_entry

    # check for unique field names
    unique_fields = set()
    for lgw_entry in lgw_entries:
        unique_fields.update(lgw_entry.keys())
    print('unique lgw fields: {}'.format(sorted(list(unique_fields))))

    # which fields are mandatory (those that are present in every entry)
    mandatory_fields = unique_fields.copy()
    for lgw_entry in lgw_entries:
        remove_fields = [field for field in mandatory_fields if field not in lgw_entry]
        mandatory_fields -= set(remove_fields)
    print('mandatory lgw fields: {}'.format(sorted(list(mandatory_fields))))

    # read our database
    our_entries = osg.read_entries()
    print('{} entries with us'.format(len(our_entries)))

    # just the names (after this, lgw_names and our_names contain only the names that are not in common)
    lgw_names = set([x['name'] for x in lgw_entries])
    our_names = set([x['Title'] for x in our_entries])
    common_names = lgw_names & our_names
    lgw_names -= common_names
    our_names -= common_names
    print('{} in both, {} only in LGW, {} only with us'.format(len(common_names), len(lgw_names), len(our_names)))

    # find similar names among the rest
    print('similar names (them - us')
    for lgw_name in lgw_names:
        for our_name in our_names:
            if osg.name_similarity(lgw_name, our_name) > similarity_threshold:
                print('"{}" - "{}"'.format(lgw_name, our_name))

    newly_created_entries = 0
    # iterate over their entries
    print('\n')
    for lgw_entry in lgw_entries:
        lgw_name = lgw_entry['name']

        is_included = False
        for our_entry in our_entries:
            our_name = our_entry['Title']

            # find those entries in LGW that are also in our database and compare them
            if lgw_name == our_name:
                is_included = True
                # a match, check the fields
                name = lgw_name

                p = ''

                # TODO key names have changed on our side

                # platform
                key = 'platform'
                p += compare_sets(lgw_entry.get(key, []), our_entry.get(key, []), key)

                # categories/keywords
                # p += compare_sets(lgw_entry.get('categories', []), our_entry.get('keywords', []), 'categories/keywords')

                # code language
                key = 'code language'
                p += compare_sets(lgw_entry.get(key, []), our_entry.get(key, []), key)

                # code license (GPLv2)
                key = 'code license'
                p += compare_sets(lgw_entry.get(key, []), our_entry.get(key, []), key)

                # engine, library (both are compared with our code dependencies, each direction only once)
                p += compare_sets(lgw_entry.get('engine', []), our_entry.get('code dependencies', []),
                                  'code dependencies', 'notthem')
                p += compare_sets(lgw_entry.get('library', []), our_entry.get('code dependencies', []),
                                  'code dependencies', 'notthem')
                p += compare_sets(lgw_entry.get('engine', []) + lgw_entry.get('library', []),
                                  our_entry.get('code dependencies', []), 'engine/library', 'notus')

                # assets license
                key = 'assets license'
                p += compare_sets(lgw_entry.get(key, []), our_entry.get(key, []), key)

                # TODO developer (need to introduce a field with us first)

                # only entries with differences are printed
                if p:
                    print('{}\n{}'.format(name, p))

        if not is_included:
            # a new entry, that we have never seen, maybe we should make an entry of our own
            # TODO we could use the write capabilities to write the entry in our own format, the hardcoded format here might be brittle, on the other hand we can also write slightly wrong stuff here without problems

            if newly_created_entries >= maximal_newly_created_entries:
                continue

            # determine file name
            print('create new entry for {}'.format(lgw_name))
            file_name = osg.canonical_name(lgw_name) + '.md'
            target_file = os.path.join(constants.entries_path, file_name)
            if os.path.isfile(target_file):
                print('warning: file {} already existing, save under slightly different name'.format(file_name))
                target_file = os.path.join(constants.entries_path, file_name[:-3] + '-duplicate.md')
                if os.path.isfile(target_file):
                    continue  # just for safety reasons

            # add name
            entry = '# {}\n\n'.format(lgw_name)

            # empty home (mandatory on our side)
            home = lgw_entry.get('home', None)
            dev_home = lgw_entry.get('dev home', None)
            entry += '- Home: {}\n'.format(', '.join([x for x in [home, dev_home] if x]))

            # state mandatory on our side
            entry += '- State: \n'

            # platform, if existing
            if 'platform' in lgw_entry:
                entry += '- Platform: {}\n'.format(', '.join(lgw_entry['platform']))

            # keywords (genre) (also mandatory)
            # NOTE(review): if 'genre' exists, keywords is the list stored in lgw_entry, so append/sort modify
            # the entry itself (same for languages and code_dependencies below)
            keywords = lgw_entry.get('genre', [])
            if 'assets license' in lgw_entry:
                keywords.append('open content')
            keywords.sort(key=str.casefold)
            if keywords:
                entry += '- Keyword: {}\n'.format(', '.join(keywords))

            # code repository (mandatory but not scraped from lgw)
            entry += '- Code repository: {}\n'.format(lgw_entry.get('repo', ''))

            # code language, mandatory on our side
            languages = lgw_entry.get('code language', [])
            languages.sort(key=str.casefold)
            entry += '- Code language: {}\n'.format(', '.join(languages))

            # code license, mandatory on our side
            licenses = lgw_entry.get('code license', [])
            licenses = [licenses_map[x] if x in licenses_map else x for x in licenses]
            licenses.sort(key=str.casefold)
            entry += '- Code license: {}\n'.format(', '.join(licenses))

            # code dependencies (only if existing)
            code_dependencies = lgw_entry.get('engine', [])
            code_dependencies.extend(lgw_entry.get('library', []))
            code_dependencies.sort(key=str.casefold)
            if code_dependencies:
                entry += '- Code dependency: {}\n'.format(', '.join(code_dependencies))

            # assets licenses (only if existing)
            if 'assets license' in lgw_entry:
                licenses = lgw_entry.get('assets license', [])
                licenses = [licenses_map[x] if x in licenses_map else x for x in licenses]
                licenses.sort(key=str.casefold)
                entry += '- Assets license: {}\n'.format(', '.join(licenses))

            # developer
            if 'developer' in lgw_entry:
                entry += '- Developer: {}\n'.format(', '.join(lgw_entry['developer']))

            # add the description imported from LGW
            entry += '\n_{}_\n\n'.format(lgw_entry['description'])

            # external links (pairs of url and link text, written as "text: url")
            ext_links = lgw_entry['external links']
            if ext_links:
                entry += '\nLinks: {}\n'.format('\n '.join(['{}: {}'.format(x[1], x[0]) for x in ext_links]))

            # linux packages (written as the string representation of the stored value)
            if 'linux-packages' in lgw_entry:
                entry += '{}\n'.format(lgw_entry['linux-packages'])

            # write ## Building
            entry += '\n## Building\n'

            # finally write to file
            utils.write_text(target_file, entry)
            newly_created_entries += 1

View File

@ -0,0 +1,542 @@
"""
osgameclones has the following fields:
'updated', 'video', 'repo', 'license', 'originals', 'status', 'multiplayer', 'info', 'lang', 'feed', 'content', 'images', 'url', 'name', 'framework', 'type', 'development'
mandatory fields are: 'name', 'license', 'type', 'originals'
possible values:
osgc-development: active(337), complete(32), halted(330), sporadic(129), very active(6)
osgc-multiplayer: Co-op(5), Competitive(13), Hotseat(3), LAN(17), Local(3), Matchmaking(1), Online(33), Split-screen(7)
osgc-type: clone(171), remake(684), similar(11), tool(7)
osgc-status: playable(274), semi-playable(34), unplayable(34)
osgc-license: ['AFL3', 'AGPL3', 'Apache', 'Artistic', 'As-is', 'BSD', 'BSD2', 'BSD4', 'bzip2', 'CC-BY', 'CC-BY-NC', 'CC-BY-NC-ND', 'CC-BY-NC-SA', 'CC-BY-SA', 'CC0', 'Custom', 'GPL2', 'GPL3', 'IJG', 'ISC', 'JRL', 'LGPL2', 'LGPL3', 'Libpng', 'MAME', 'MIT', 'MPL', 'MS-PL', 'Multiple', 'NGPL', 'PD', 'WTFPL', 'Zlib']
osgc-content: commercial(104), free(32), open(61), swappable(5)
Mapping osgameclones -> ours
name -> name
type -> keywords, description
originals -> keywords
repo -> code repository
url -> home
feed (-> home)
development -> state
status -> state
multiplayer -> keywords
lang -> code language
framework -> code dependencies
license -> code license / assets license
content -> keywords
info -> after fields
updated not used
images not used
video: not used
TODO also ignore our rejected entries
"""
import ruamel.yaml as yaml
import os
from utils import constants, utils, osg
# should change on osgameclones
# (presumably maps names used by osgameclones to the names used by us; the code using it is not part of this
# section - verify there)
osgc_name_aliases = {'4DTris': '4D-TRIS', 'fheroes2': 'Free Heroes 2', 'DrCreep': 'The Castles of Dr. Creep',
                     'Duke3d_win32': 'Duke3d_w32', 'GNOME Atomix': 'Atomix', 'Head over Heels 2': 'Head over Heels',
                     'mewl': 'M.E.W.L.', 'LinWarrior': 'Linwarrior 3D', 'Mice Men Remix': 'Mice Men: Remix',
                     'OpenApoc': 'Open Apocalypse', 'open-cube': 'Open Cube', 'open-horizon': 'Open Horizon',
                     'opengl_test_drive_clone': 'OpenGL Test Drive Remake', "Freenukum Jump'n Run": 'Freenukum',
                     'Play Freeciv!': 'Freeciv-web', 'ProjectX': 'Forsaken', 'Lyon': 'Roton', 'Mafia II: Toolkit': 'Mafia: Toolkit',
                     'Siege of Avalon Open Source': 'Siege of Avalon : Open Source', 'ss13remake': 'SS13 Remake',
                     'shadowgrounds': 'Shadowgrounds', 'RxWars': 'Prescription Wars', 'REDRIVER2': 'REDriver2',
                     'Super Mario Bros And Level Editor in C#': 'Mario Objects', 'Unitystation': 'unitystation',
                     'tetris': 'Just another Tetris™ clone', 'twin-e': 'TwinEngine', 'super-methane-brothers-gx': 'Super Methane Brothers for Wii and GameCube',
                     'CrossUO: Ultima Online': 'CrossUO', 'Doomsday': 'Doomsday Engine', 'OpMon': 'OPMon',
                     '2048-python': '2048 Python', 'Free Heroes 2 - Enhanced': 'Free Heroes 2', 'ironseed_fpc': 'ironseed',
                     'KKnD': 'OpenKrush', 'bab-be-u': 'BAB BE U', 'ironseed': 'Ironseed', 'urde': 'Metaforce'}

# conversion between licenses syntax them and us
# (keys are the license identifiers of osgameclones, values the corresponding license names with us)
osgc_licenses_map = {'GPL2': 'GPL-2.0', 'GPL3': 'GPL-3.0', 'AGPL3': 'AGPL-3.0', 'LGPL3': 'LGPL-3.0',
                     'LGPL2': 'LGPL-2.0 or 2.1?', 'MPL': 'MPL-2.0', 'Apache': 'Apache-2.0',
                     'Artistic': 'Artistic License', 'Zlib': 'zlib', 'PD': 'Public domain', 'AFL3': 'AFL-3.0',
                     'BSD2': '2-clause BSD', 'JRL': 'Java Research License'}

# ignore osgc entries (for various reasons like unclear license etc.)
osgc_ignored_entries = ["A Mouse's Vengeance", 'achtungkurve.com', 'AdaDoom3', 'Agendaroids', 'Alien 8', 'Ard-Reil',
                        'Balloon Fight', 'bladerunner (Engine within SCUMMVM)', 'Block Shooter', 'Bomb Mania Reloaded',
                        'boulder-dash', 'Cannon Fodder', 'Contra_remake', 'CosmicArk-Advanced', 'Deuteros X',
                        'datastorm', 'div-columns', 'div-pacman2600', 'div-pitfall', 'div-spaceinvaders2600', 'EXILE',
                        'Free in the Dark', 'Prepare Carefully', 'OpenKKnD',
                        'Football Manager', 'Fight Or Perish', 'EarthShakerDS', 'Entombed!', 'FreeRails 2',
                        'Glest Advanced Engine', 'FreedroidClassic', 'FreeFT', 'Future Blocks', 'HeadOverHeels',
                        'Herzog 3D', 'Homeworld SDL', 'imperialism-remake', 'Jumping Jack 2: Worryingly Familiar',
                        'Jumping Jack: Further Adventures', 'Jumpman', 'legion', 'KZap', 'LastNinja', 'Lemmix', 'LixD',
                        'luminesk5', 'Manic Miner', 'Meridian 59 Server 105', 'Meridian 59 German Server 112',
                        'Mining Haze', 'OpenGeneral', 'MonoStrategy', 'New RAW', 'OpenDeathValley', 'OpenOutcast',
                        'openStrato', 'OpenPop', 'pacman',
                        'Phavon', 'Project: Xenocide', 'pyspaceinvaders', 'PyTouhou', 'Racer',
                        'Ruby OMF 2097 Remake', 'Snipes', 'Spaceship Duel', 'Space Station 14', 'Starlane Empire',
                        'Styx', 'Super Mario Bros With SFML in C#', 'thromolusng', 'Tile World 2', 'Tranzam',
                        'Voxelstein 3D', 'XQuest 2',
                        'xrick', 'zedragon', 'Uncharted waters 2 remake', 'Desktop Adventures Engine for ScummVM',
                        'Open Sonic', 'Aladdin_DirectX', 'Alive_Reversing', 're3', 'Sonic-1-2-2013-Decompilation',
                        'Sonic-CD-11-Decompilation', 'Stunt Car Racer Remake']
def unique_field_contents(entries, field):
    """
    Collects the unique values of a field over all entries.

    :param entries: iterable of entry dictionaries
    :param field: name of the field
    :return: list of the unique values (all string values), sorted case-insensitively; the content of a field can
             be a single value or a list of values (list subclasses as created by some YAML loaders included)
    """
    unique_content = set()
    for entry in entries:
        if field not in entry:
            continue
        field_content = entry[field]
        # isinstance instead of an exact type check, list subclasses are not hashable either
        if isinstance(field_content, list):
            unique_content.update(field_content)
        else:
            unique_content.add(field_content)
    return sorted(unique_content, key=str.casefold)
def compare_sets(a, b, name, limit=None):
    """
    Builds a short text report about the elements in which two collections differ.

    :param a: them (collection of strings, turned into a set if it is not one already)
    :param b: us (collection of strings, turned into a set if it is not one already)
    :param name: prefix in output
    :param limit: 'notus' leaves out the line about elements found only in a,
                  'notthem' leaves out the line about elements found only in b
    :return: the report, one line per reported difference direction, empty string if there is nothing to report
    """
    theirs = a if isinstance(a, set) else set(a)
    ours = b if isinstance(b, set) else set(b)

    lines = []

    # elements they have and we do not have
    only_theirs = sorted(theirs - ours)
    if limit != 'notus' and only_theirs:
        lines.append(' {} : us : {}\n'.format(name, ', '.join(only_theirs)))

    # elements we have and they do not have
    only_ours = sorted(ours - theirs)
    if limit != 'notthem' and only_ours:
        lines.append(' {} : them : {}\n'.format(name, ', '.join(only_ours)))

    return ''.join(lines)
def _as_list(value):
    """
    Wraps a single string in a list and returns every other value unchanged.

    Fields of osgameclones entries are handled in this script as either a single string or a list of strings,
    this helper gives both cases the same shape.

    :param value: a string, a list or None
    :return: [value] if value is a string, otherwise value itself
    """
    if isinstance(value, str):
        return [value]
    return value


if __name__ == "__main__":
    # Outline of the script:
    # 1. load all osgameclones entries from their yaml files and print some statistics about them
    # 2. remove ignored entries, apply name aliases and license mappings
    # 3. compare every osgameclones entry with our entry of the same name and print the deviations
    # 4. write new entry files for osgameclones entries that we do not have (up to a maximal number)
    # 5. list our entries that are inspired by another game but are not part of osgameclones

    # some parameter
    similarity_threshold = 0.8
    maximal_newly_created_entries = 40
    check_similar_names = False

    # paths
    root_path = os.path.realpath(os.path.join(os.path.dirname(__file__), os.path.pardir))

    # import the osgameclones data
    osgc_path = os.path.realpath(os.path.join(root_path, os.path.pardir, 'osgameclones.git', 'games'))
    osgc_files = os.listdir(osgc_path)

    # iterate over all yaml files in osgameclones/data folder and load contents
    osgc_entries = []
    for file in osgc_files:
        # read yaml
        with open(os.path.join(osgc_path, file), 'r', encoding='utf-8') as stream:
            try:
                content = yaml.safe_load(stream)
            except Exception:
                # name the offending file, then re-raise with the original traceback
                print(file)
                raise

        # add to entries (guards against a file without content, which presumably loads as None)
        osgc_entries.extend(content or [])
    print('Currently {} entries in osgameclones'.format(len(osgc_entries)))

    # check: print all git repos with untypical structure
    untypical_structure = ''
    for osgc_entry in osgc_entries:
        if 'repo' in osgc_entry:
            for repo in _as_list(osgc_entry['repo']):
                if 'github' in repo and repo.endswith(('/', '.git')):
                    untypical_structure += ' {} : {}\n'.format(osgc_entry['name'], repo)

    if untypical_structure:
        print('Git repos with untypical URL\n{}'.format(untypical_structure))

    # which fields do they have
    osgc_fields = set()
    for osgc_entry in osgc_entries:
        osgc_fields.update(osgc_entry.keys())
    osgc_fields = sorted(osgc_fields)
    print('Unique osgc-fields\n {}'.format(', '.join(osgc_fields)))

    # statistics of the content of every field (except for those with mostly unique content)
    for field in osgc_fields:
        if field in ('video', 'feed', 'url', 'repo', 'info', 'updated', 'images', 'name', 'originals'):
            continue
        osgc_content = [entry[field] for entry in osgc_entries if field in entry]
        # flatten
        flat_content = []
        for c in osgc_content:
            if isinstance(c, list):
                flat_content.extend(c)
            else:
                flat_content.append(c)
        statistics = utils.unique_elements_and_occurrences(flat_content)
        statistics.sort(key=str.casefold)
        print('{}: {}'.format(field, ', '.join(statistics)))

    # eliminate the ignored entries
    all_osgc_names = {x['name'] for x in osgc_entries}
    unused_ignores = set(osgc_ignored_entries) - all_osgc_names  # shall be ignored but do not exist (anymore)
    if unused_ignores:
        print('Can un-ignore {}'.format(unused_ignores))
    osgc_entries = [x for x in osgc_entries if x['name'] not in osgc_ignored_entries]

    # fix names and licenses (so they are not longer detected as deviations downstreams)
    all_osgc_names = {x['name'] for x in osgc_entries}
    unused_aliases = set(osgc_name_aliases.keys()) - all_osgc_names  # shall be renamed but do not exist (anymore)
    if unused_aliases:
        print('Can un-rename {}'.format(unused_aliases))
    for entry in osgc_entries:
        # the entries are dictionaries that are modified in place, they need not be stored back into the list
        name = entry['name']
        if name in osgc_name_aliases:
            entry['name'] = osgc_name_aliases[name]
        if 'license' in entry:
            # _as_list: a single license given as string must not be mapped character by character
            entry['license'] = [osgc_licenses_map.get(x, x) for x in _as_list(entry['license'])]
        # fix content (add suffix content)
        if 'content' in entry:
            entry['content'] = [x + ' content' for x in _as_list(entry['content'])]

    # which fields do they have
    osgc_fields = set()
    for osgc_entry in osgc_entries:
        osgc_fields.update(osgc_entry.keys())
    print('unique osgc-fields: {}'.format(osgc_fields))

    # which fields are mandatory (those that are contained in every entry)
    for osgc_entry in osgc_entries:
        osgc_fields.intersection_update(osgc_entry.keys())
    print('mandatory osfg-fields: {}'.format(osgc_fields))

    # some field statistics
    print('osgc-development: {}'.format(unique_field_contents(osgc_entries, 'development')))
    print('osgc-multiplayer: {}'.format(unique_field_contents(osgc_entries, 'multiplayer')))
    print('osgc-type: {}'.format(unique_field_contents(osgc_entries, 'type')))
    print('osgc-languages: {}'.format(unique_field_contents(osgc_entries, 'lang')))
    print('osgc-licenses: {}'.format(unique_field_contents(osgc_entries, 'license')))
    print('osgc-status: {}'.format(unique_field_contents(osgc_entries, 'status')))
    print('osgc-framework: {}'.format(unique_field_contents(osgc_entries, 'framework')))
    print('osgc-content: {}'.format(unique_field_contents(osgc_entries, 'content')))

    # read our database
    our_entries = osg.read_entries()
    print('{} entries with us'.format(len(our_entries)))

    # just the names
    osgc_names = {x['name'] for x in osgc_entries}
    our_names = {x['Title'] for x in our_entries}
    common_names = osgc_names & our_names
    osgc_names -= common_names
    our_names -= common_names
    print('{} in both, {} only in osgameclones, {} only with us'.format(len(common_names), len(osgc_names),
                                                                        len(our_names)))

    # find similar names among the rest
    if check_similar_names:
        print('look for similar names (theirs - ours)')
        for osgc_name in osgc_names:
            for our_name in our_names:
                if osg.name_similarity(osgc_name, our_name) > similarity_threshold:
                    print(' {} - {}'.format(osgc_name, our_name))

    newly_created_entries = 0
    # iterate over their entries
    for osgc_entry in osgc_entries:
        osgc_name = osgc_entry['name']

        is_included = False
        for our_entry in our_entries:
            our_name = our_entry['Title']

            # find those entries in osgameclones that are also in our database and compare them
            if osgc_name == our_name:
                is_included = True
                # a match, check the fields

                p = ''  # collects the report lines of all deviations of this entry

                # TODO key names have changed on our side

                # compare their lang with our code language
                if 'lang' in osgc_entry:
                    osgc_languages = _as_list(osgc_entry['lang'])
                    our_languages = [x.value for x in our_entry['Code language']]  # essential field
                    p += compare_sets(osgc_languages, our_languages, 'code language')

                # compare their license with our code and assets license
                if 'license' in osgc_entry:
                    osgc_licenses = osgc_entry['license']
                    our_code_licenses = [x.value for x in our_entry['Code license']]  # essential field
                    our_assets_licenses = [x.value for x in our_entry.get('Assets license', [])]
                    p += compare_sets(osgc_licenses, our_code_licenses + our_assets_licenses, 'licenses', 'notthem')
                    p += compare_sets(osgc_licenses, our_code_licenses, 'licenses', 'notus')

                # compare their framework with our code dependencies (capitalization is ignored for now, only starts are compared)
                our_framework_replacements = {'allegro4': 'allegro'}
                if 'framework' in osgc_entry:
                    osgc_frameworks = _as_list(osgc_entry['framework'])
                    our_frameworks = [x.value for x in our_entry.get('Code dependency', [])]
                    our_frameworks = [x.casefold() for x in our_frameworks]
                    our_frameworks = [our_framework_replacements.get(x, x) for x in our_frameworks]
                    osgc_frameworks = [x.casefold() for x in osgc_frameworks]
                    p += compare_sets(osgc_frameworks, our_frameworks, 'framework/dependencies')

                # compare their repo with our code repository and download
                if 'repo' in osgc_entry:
                    osgc_repos = _as_list(osgc_entry['repo'])
                    osgc_repos = [utils.strip_url(url) for url in osgc_repos]
                    osgc_repos = [x for x in osgc_repos if not x.startswith(
                        'sourceforge.net/projects/')]  # we don't need the general sites there
                    # osgc_repos = [x for x in osgc_repos if not x.startswith('https://sourceforge.net/projects/')] # ignore some
                    our_repos = our_entry.get('Code repository', [])
                    our_repos = [utils.strip_url(url.value) for url in our_repos]
                    our_repos = [x for x in our_repos if not x.startswith(
                        'gitlab.com/osgames/')]  # we do not yet spread our own deeds (but we will some day)
                    our_repos = [x for x in our_repos if
                                 'cvs.sourceforge.net' not in x and 'svn.code.sf.net/p/' not in x]  # no cvs or svn anymore
                    our_downloads = our_entry.get('Download', [])
                    our_downloads = [utils.strip_url(url.value) for url in our_downloads]
                    p += compare_sets(osgc_repos, our_repos + our_downloads, 'repo',
                                      'notthem')  # if their repos are not in our downloads or repos
                    p += compare_sets(osgc_repos, our_repos[:1], 'repo',
                                      'notus')  # if our main repo is not in their repo

                # compare their url (and feed) to our home (and strip urls)
                if 'url' in osgc_entry:
                    osgc_urls = _as_list(osgc_entry['url'])
                    osgc_urls = [utils.strip_url(url) for url in osgc_urls]
                    our_urls = our_entry['Home']
                    our_urls = [utils.strip_url(url.value) for url in our_urls]
                    p += compare_sets(osgc_urls, our_urls, 'url/home', 'notthem')  # if their urls are not in our urls
                    # our_urls = [url for url in our_urls if
                    #            not url.startswith('github.com/')]  # they don't have them as url
                    p += compare_sets(osgc_urls, our_urls[:1], 'url/home',
                                      'notus')  # if our first url is not in their urls

                # compare their status with our state (playable can be beta/mature with us, but not playable must be beta)
                if 'status' in osgc_entry:
                    osgc_status = osgc_entry['status']
                    our_status = our_entry['State']  # essential field
                    if osgc_status != 'playable' and 'mature' in our_status:
                        p += ' status : mismatch : them {}, us mature\n'.format(osgc_status)

                # compare their development with our state
                if 'development' in osgc_entry:
                    osgc_development = osgc_entry['development']
                    # NOTE(review): this tests for a key named 'inactive' in our entry - confirm that such a key
                    # exists (see the TODO about changed key names above)
                    our_inactive = 'inactive' in our_entry
                    our_status = our_entry['State']  # essential field
                    if osgc_development == 'halted' and not our_inactive:
                        p += ' development : mismatch : them halted - us not inactive\n'
                    if osgc_development in ['very active', 'active'] and our_inactive:
                        p += ' development : mismatch : them {}, us inactive\n'.format(osgc_development)
                    if osgc_development == 'complete' and 'mature' not in our_status:
                        p += ' development : mismatch : them complete, us not mature\n'

                # get our keywords
                our_keywords = [x.value for x in our_entry['Keyword']]  # essential

                # compare their originals to our inspirations
                our_originals = [x.value for x in our_entry.get('Inspiration', [])]
                if 'originals' in osgc_entry:
                    osgc_originals = _as_list(osgc_entry['originals'])
                    osgc_originals = [x.replace(',', '') for x in
                                      osgc_originals]  # we cannot have ',' or parts in parentheses in original names
                    p += compare_sets(osgc_originals, our_originals, 'originals')

                # compare their multiplayer with our keywords (multiplayer) (only lowercase comparison)
                if 'multiplayer' in osgc_entry:
                    osgc_multiplayer = _as_list(osgc_entry['multiplayer'])
                    osgc_multiplayer = [x.casefold() for x in osgc_multiplayer]
                    osgc_multiplayer = [x for x in osgc_multiplayer if x not in ['competitive']]  # ignored

                    our_multiplayer = [x for x in our_keywords if x.startswith('multiplayer ')]
                    if our_multiplayer:
                        if len(our_multiplayer) != 1:
                            print(our_entry)
                            raise RuntimeError('more than one multiplayer keyword in entry {}'.format(our_name))
                        # cut off the leading 'multiplayer', the modes are separated by '+'
                        our_multiplayer = our_multiplayer[0][11:].split('+')
                        our_multiplayer = [x.strip().casefold() for x in our_multiplayer]

                    p += compare_sets(osgc_multiplayer, our_multiplayer, 'multiplayer')

                # compare content with keywords
                if 'content' in osgc_entry:
                    osgc_content = _as_list(osgc_entry['content'])
                    p += compare_sets(osgc_content, our_keywords, 'content/keywords',
                                      'notthem')  # only to us because we have more then them

                # compare their type to our keywords
                if 'type' in osgc_entry:
                    game_type = _as_list(osgc_entry['type'])
                    p += compare_sets(game_type, our_keywords, 'type/keywords',
                                      'notthem')  # only to us because we have more then them

                if p:
                    print('{}\n{}'.format(osgc_name, p))

        if not is_included:
            # a new entry, that we have never seen, maybe we should make an entry of our own
            # continue
            # TODO we could use the write capabilities to write the entry in our own format, the hardcoded format here might be brittle, on the other hand we can also write slightly wrong stuff here without problems
            if newly_created_entries >= maximal_newly_created_entries:
                continue

            game_type = osgc_entry.get('type', None)
            osgc_status = osgc_entry.get('status', None)

            # we sort some out here (maybe we want to have a closer look at them later)
            if osgc_status == 'unplayable':
                # for now not the unplayable ones
                continue
            if 'license' not in osgc_entry or 'As-is' in osgc_entry['license']:
                # for now not the ones without license or with as-is license
                continue

            # determine file name
            print('create new entry for {}'.format(osgc_name))
            file_name = osg.canonical_name(osgc_name) + '.md'
            target_file = os.path.join(constants.entries_path, file_name)
            if os.path.isfile(target_file):
                print('warning: file {} already existing, save under slightly different name'.format(file_name))
                target_file = os.path.join(constants.entries_path, file_name[:-3] + '-duplicate.md')
                if os.path.isfile(target_file):
                    continue  # just for safety reasons

            # add name
            entry = '# {}\n\n'.format(osgc_name)

            # home (their url can be a single string or a list, a missing url is written as None)
            home = _as_list(osgc_entry.get('url', None))
            entry += '- Home: {}\n'.format(', '.join(home) if home else home)

            # inspiration
            if 'originals' in osgc_entry:
                osgc_originals = _as_list(osgc_entry['originals'])
                entry += '- Inspiration: {}\n'.format(', '.join(osgc_originals))

            # state
            entry += '- State: {}'.format(osgc_status)
            if osgc_entry.get('development') == 'halted':
                entry += ', inactive since XX'
            entry += '\n'

            # language tags
            lang = _as_list(osgc_entry.get('lang', []))

            # platform 'Web' if language == JavaScript or TypeScript
            if len(lang) == 1 and lang[0] in ('JavaScript', 'TypeScript'):
                entry += '- Platform: Web\n'

            # keywords
            keywords = []
            if game_type:
                # their type can be a single string or a list (as handled in the comparison above)
                keywords.extend(_as_list(game_type))
            if 'multiplayer' in osgc_entry:
                osgc_multiplayer = _as_list(osgc_entry['multiplayer'])
                keywords.append('multiplayer {}'.format(' + '.join(osgc_multiplayer)))
            if 'content' in osgc_entry:
                keywords.extend(osgc_entry['content'])  # it's a list (made one when the suffix was added)
            if keywords:
                entry += '- Keyword: {}\n'.format(', '.join(keywords))

            # code repository (mandatory on our side), their repo can be a single string or a list
            repos = _as_list(osgc_entry.get('repo', None))
            if repos:
                # we have them with .git on github/gitlab
                repos = [x + '.git' if x.startswith('https://git') and not x.endswith('.git') else x for x in repos]
                repos = ', '.join(repos)
            entry += '- Code repository: {}\n'.format(repos)

            # code language (mandatory on our side)
            entry += '- Code language: {}\n'.format(', '.join(lang))

            # code license
            entry += '- Code license: {}\n'.format(', '.join(osgc_entry['license']))

            # code dependencies (if existing)
            if 'framework' in osgc_entry:
                osgc_frameworks = _as_list(osgc_entry['framework'])
                entry += '- Code dependency: {}\n'.format(', '.join(osgc_frameworks))

            # add description (already put into Inspiration)
            # description = '{} of {}.'.format(game_type.capitalize(), ', '.join(osgc_entry['originals']))
            # entry += '\n{}\n\n'.format(description)

            # write info (if existing)
            if 'info' in osgc_entry:
                entry += '\n{}\n\n'.format(osgc_entry['info'])

            # write ## Building
            entry += '\n## Building\n'

            # finally write to file
            utils.write_text(target_file, entry)
            newly_created_entries += 1

    # now iterate over our entries and test if we can add anything to them
    all_osgc_names = {x['name'] for x in osgc_entries}  # for a fast membership test
    print('entry that could be added to them')
    for our_entry in our_entries:
        our_name = our_entry['Title']

        # only if contains a keyword starting with "inspired by" and not "tool", "framework" or "library"
        our_keywords = our_entry['Keyword']
        if not any(x.startswith('inspired by ') for x in our_keywords):
            continue
        if any(x in ['tool', 'library', 'framework'] for x in our_keywords):
            continue

        if our_name not in all_osgc_names:
            # that could be added to them
            # NOTE(review): the key 'file' is lower case here - confirm that our entries really use this key
            print('- [{}]({})'.format(our_name,
                                      'https://github.com/Trilarion/opensourcegames/blob/master/entries/' + our_entry[
                                          'file']))

View File

@ -0,0 +1,164 @@
"""
Scrapes Sourceforge project sites and adds (mostly developer) information to our database.
"""

# TODO also scrape sourceforge sites that do not exist anymore but for which we have an archive link
import os
import json
import requests
from bs4 import BeautifulSoup
from utils import constants as c, utils, osg, osg_parse
# text file holding (as JSON list) the files of the entries that still have to be processed; it is written by
# collect_sourceforge_entries() and read and shortened by sourceforge_import()
sf_entries_file = os.path.join(c.code_path, 'sourceforge_entries.txt')
# start of the url of a project page on Sourceforge
prefix = 'https://sourceforge.net/projects/'

# author names in SF that aren't the author names how we have them
# (maps the name as displayed on Sourceforge to the name used in our database)
SF_alias_list = {'Erik Johansson (aka feneur)': 'Erik Johansson', 'Itms': 'Nicolas Auvray', 'baris yuksel': 'Baris Yuksel',
                 'Wraitii': 'Lancelot de Ferrière', 'Simzer': 'Simon Laszlo', 'armin bajramovic': 'Armin Bajramovic',
                 'bleu tailfly': 'bleutailfly', 'dlh': 'DLH', 'Bjorn Hansen': 'Bjørn Hansen', 'Louens Veen': 'Lourens Veen',
                 'linley_henzell': 'Linley Henzell', 'Patrice DUHAMEL': 'Patrice Duhamel', 'Etienne SOBOLE': 'Etienne Sobole',
                 'L. H. [Lubomír]': 'L. H. Lubomír', 'davidjoffe': 'David Joffe', 'EugeneLoza': 'Eugene Loza',
                 'Kenneth Gangsto': 'Kenneth Gangstø', 'Lucas GAUTHERON': 'Lucas Gautheron', 'Per I Mathisen': 'Per Inge Mathisen',
                 'wrtlprnft': 'Wrzlprnft', 'daniel_santos': 'Daniel Santos', 'Dark_Sylinc': 'darksylinc',
                 'Don Llopis': 'Don E. Llopis', 'dwachs': 'Dwachs', 'Pierre-Loup Griffais': 'Pierre-Loup A. Griffais',
                 'Richard Gobeille': 'Richard C. Gobeille', 'timfelgentreff': 'Tim Felgentreff',
                 'Dr. Martin Brumm': 'Martin Brumm', 'Dr. Wolf-Dieter Beelitz': 'Wolf-Dieter Beelitz'}

# author names (after stripping white space) that are never added as developers
SF_ignore_list = ('', 'Arianne Integration Bot')
def collect_sourceforge_entries():
    """
    Determines which entries of the database have a Sourceforge project page among their home urls.

    The files of these entries are stored as JSON list in sf_entries_file.
    """
    # read entries
    all_entries = osg.read_entries()
    print('{} entries read'.format(len(all_entries)))

    # keep the file of every entry with at least one home url that points to a sourceforge project
    sf_files = [item['File'] for item in all_entries
                if any([home.startswith(prefix) for home in item['Home']])]

    # write to file
    print('{} entries with sourceforge projects'.format(len(sf_files)))
    serialized = json.dumps(sf_files, indent=1)
    utils.write_text(sf_entries_file, serialized)
def sourceforge_import():
    """
    Scrapes the member pages of the Sourceforge projects of all collected entries and adds the found developers to
    the entries and to the developer database.

    The entries to process are read from sf_entries_file (written by collect_sourceforge_entries()). Errors are not
    swallowed, they end the execution, but the progress made so far is stored before: the file list is shortened by
    the completely processed entries (so that a later run continues with the entry that failed), unsaved changes of
    the entry in progress are written and the developer database is written if it was changed.

    :return: None
    :raises RuntimeError: if a members page or an author page cannot be retrieved
    """
    files = json.loads(utils.read_text(sf_entries_file))

    all_developers = osg.read_developers()
    print(' {} developers read'.format(len(all_developers)))
    all_developers_changed = False

    # state used by the finally clause, defined here so that it exists even if the loop fails in (or never enters)
    # its first iteration
    next_index = 0  # index of the first file that is not completely processed yet
    entry = None  # entry in progress
    entry_changed = False  # True if the entry in progress has changes that are not saved yet

    try:
        # loop over each entry
        for index, file in enumerate(files):
            next_index = index
            entry = None
            print(' process {}'.format(file))

            # read entry
            entry = osg.read_entry(file)
            developers = entry.get('Developer', [])
            urls = [x.value for x in entry['Home'] if x.startswith(prefix)]

            for url in urls:
                print(' sf project {}'.format(url))

                if not url.endswith('/'):
                    print('error: sf project does not end with slash')
                    url += '/'

                # members
                url_members = 'https://sourceforge.net/p/' + url[len(prefix):] + '_members/'
                response = requests.get(url_members)
                if response.status_code != 200:
                    print('error: url {} not accessible, status {}'.format(url_members, response.status_code))
                    raise RuntimeError()
                soup = BeautifulSoup(response.text, 'html.parser')
                # only table rows with exactly three cells are used, the link in the second cell leads to the author
                authors = soup.find('div', id='content_base').find('table').find_all('tr')
                authors = [author.find_all('td') for author in authors]
                authors = [author[1].a['href'] for author in authors if len(author) == 3]
                for author in authors:
                    # sometimes author already contains the full url, sometimes not
                    url_author = 'https://sourceforge.net' + author if not author.startswith('http') else author
                    response = requests.get(url_author)
                    if response.status_code != 200 and author not in ('/u/favorito/',):
                        print('error: url {} not accessible, status {}'.format(url_author, response.status_code))
                        raise RuntimeError()
                    url_author = response.url  # could be different now
                    if 'auth/?return_to' in url_author or response.status_code != 200:
                        # for some reason authorisation is forbidden or page was not available (happens for example for /u/kantaros)
                        # the name is cut out of the link (presumably of the form /u/name/)
                        author_name = author[3:-1]
                        nickname = author_name
                    else:
                        soup = BeautifulSoup(response.text, 'html.parser')
                        author_name = soup.h1.get_text()
                        author_name = SF_alias_list.get(author_name, author_name)  # replace by alias if possible
                        nickname = soup.find('dl', class_='personal-data').find('dd').get_text()
                        nickname = nickname.replace('\n', '').strip()
                    nickname += '@SF'  # our indication of the platform to search for
                    author_name = author_name.strip()  # names can still have white spaces before or after

                    if author_name in SF_ignore_list:
                        continue

                    # look author up in entry developers
                    if author_name not in developers:
                        print(' dev "{}" added to entry {}'.format(author_name, file))
                        entry['Developer'] = entry.get('Developer', []) + [osg_parse.ValueWithComment(author_name)]
                        entry_changed = True
                        developers = entry.get('Developer', [])

                    # look author and SF nickname up in developers data base
                    if author_name in all_developers:
                        dev = all_developers[author_name]
                        if nickname not in dev.get('Contact', []):
                            print(' existing dev "{}" added nickname ({}) to developer database'.format(author_name, nickname))
                            # check that name has not already @SF contact
                            if any(x.endswith('@SF') for x in dev.get('Contact', [])):
                                print('warning: already SF contact')
                            all_developers[author_name]['Contact'] = dev.get('Contact', []) + [nickname]
                            all_developers_changed = True
                    else:
                        print(' dev "{}" ({}) added to developer database'.format(author_name, nickname))
                        all_developers[author_name] = {'Name': author_name, 'Contact': [nickname], 'Games': [entry['Title']]}
                        all_developers_changed = True

            if entry_changed:
                # save entry
                osg.write_entry(entry)
                entry_changed = False
                print(' entry updated')

            # this entry is done and need not be processed again
            next_index = index + 1
    finally:
        # runs after success as well as after an error (which is passed on afterwards)

        # shorten file list
        utils.write_text(sf_entries_file, json.dumps(files[next_index:], indent=1))

        # save the entry in progress if an error occurred after it was changed
        if entry is not None and entry_changed:
            osg.write_entry(entry)
            print(' entry updated')

        # maybe save all developers
        if all_developers_changed:
            # save all developers
            osg.write_developers(all_developers)
            print('developers database updated')
if __name__ == "__main__":
    # Two steps that are run separately: first collect the entries that have a Sourceforge project page,
    # then (currently disabled) scrape these pages for developer information.

    # collect entries
    collect_sourceforge_entries()

    # import information from sf
    # sourceforge_import()