imports from osgameclones

This commit is contained in:
Trilarion
2019-08-23 22:17:51 +02:00
parent 42dee8c7e8
commit 39a9f55cae
49 changed files with 1199 additions and 169 deletions

View File

@ -21,6 +21,8 @@
"https://git.code.sf.net/p/chromium-bsu/code",
"https://git.code.sf.net/p/dangerdeep/git",
"https://git.code.sf.net/p/dnt/code",
"https://git.code.sf.net/p/doomlegacy/legacy2",
"https://git.code.sf.net/p/doomlegacy/masterserver",
"https://git.code.sf.net/p/dunedynasty/dunedynasty",
"https://git.code.sf.net/p/dunelegacy/code",
"https://git.code.sf.net/p/epicheroes/code",
@ -103,6 +105,8 @@
"https://github.com/Illarion-eV/Illarion-Content.git",
"https://github.com/Illarion-eV/Illarion-Java.git",
"https://github.com/Illarion-eV/Illarion-Server.git",
"https://github.com/Interkarma/daggerfall-unity.git",
"https://github.com/Interrupt/delverengine.git",
"https://github.com/Kromster80/kam_remake.git",
"https://github.com/LWJGL/lwjgl3.git",
"https://github.com/Leejjon/Battleround.git",
@ -190,6 +194,7 @@
"https://github.com/angband/angband.git",
"https://github.com/antionio/game-off-2013.git",
"https://github.com/anttisalonen/kingdoms.git",
"https://github.com/aperture-software/colditz-escape.git",
"https://github.com/arescentral/antares.git",
"https://github.com/arx/ArxLibertatis.git",
"https://github.com/atrinik/atrinik.git",
@ -217,9 +222,13 @@
"https://github.com/craftworkgames/infiniminer.git",
"https://github.com/crawl/crawl.git",
"https://github.com/cthielen/Epiar.git",
"https://github.com/cubosphere/cubosphere-code.git",
"https://github.com/cxong/cdogs-sdl.git",
"https://github.com/darklegion/tremulous.git",
"https://github.com/davidjoffe/dave_gnukem.git",
"https://github.com/delight-im/OpenSoccer.git",
"https://github.com/dgengin/DGEngine.git",
"https://github.com/dhewm/dhewm3.git",
"https://github.com/djyt/cannonball.git",
"https://github.com/dmecke/OpenSoccerStar.git",
"https://github.com/doxygen/doxygen.git",
@ -229,6 +238,8 @@
"https://github.com/dungeons-of-moria/umoria.git",
"https://github.com/ec429/harris.git",
"https://github.com/egoboo/egoboo.git",
"https://github.com/ellisonleao/clumsy-bird.git",
"https://github.com/emezeske/digbuild.git",
"https://github.com/endless-sky/endless-sky.git",
"https://github.com/enigma-dev/enigma-dev.git",
"https://github.com/exult/exult.git",
@ -237,6 +248,7 @@
"https://github.com/fariazz/World-of-Heroes.git",
"https://github.com/farmboy0/slashem.git",
"https://github.com/fastrgv/AdaVenture.git",
"https://github.com/fogleman/Craft.git",
"https://github.com/freeciv/freeciv-web.git",
"https://github.com/freeciv/freeciv.git",
"https://github.com/freedoom/freedoom.git",
@ -245,6 +257,7 @@
"https://github.com/freeors/War-Of-Kingdom.git",
"https://github.com/freeserf/freeserf.git",
"https://github.com/gabrielecirulli/2048.git",
"https://github.com/galaxyhaxz/devilution.git",
"https://github.com/gemrb/gemrb.git",
"https://github.com/glennrp/libpng.git",
"https://github.com/goblinhack/goblinhack.git",
@ -258,7 +271,9 @@
"https://github.com/guillaume-gouchon/dungeonquest.git",
"https://github.com/guillaume-gouchon/smash.js.git",
"https://github.com/hackcraft-de/linwarrior.git",
"https://github.com/haleymt/CrystalQuest.git",
"https://github.com/harfbuzz/harfbuzz.git",
"https://github.com/haroldo-ok/datastorm.git",
"https://github.com/henkboom/pax-britannica.git",
"https://github.com/hhirsch/ardentryst.git",
"https://github.com/hinogi/eternalwinterwars.git",
@ -341,12 +356,15 @@
"https://github.com/red-eclipse/base.git",
"https://github.com/richardjs/Maelstrom.git",
"https://github.com/riksweeney/edgar.git",
"https://github.com/rohit-n/Clonepoint.git",
"https://github.com/sabetts/bratwurst.git",
"https://github.com/sago007/annchienta.git",
"https://github.com/samcv/brainworkshop.git",
"https://github.com/scottschiller/ArmorAlley.git",
"https://github.com/scummvm/scummvm.git",
"https://github.com/shinyquagsire23/DesktopAdventures.git",
"https://github.com/silverweed/lifish.git",
"https://github.com/simeonpilgrim/coab.git",
"https://github.com/singularity/singularity.git",
"https://github.com/snauts/game-lv.git",
"https://github.com/spring/spring.git",
@ -357,6 +375,7 @@
"https://github.com/superpowers/superpowers-core.git",
"https://github.com/supertuxkart/stk-code.git",
"https://github.com/suprafun/aiwars.git",
"https://github.com/svkaiser/Doom64EX.git",
"https://github.com/swig/swig.git",
"https://github.com/tales/sourceoftales.git",
"https://github.com/tales/tales-client.git",
@ -375,6 +394,7 @@
"https://github.com/unnethack/unnethack.git",
"https://github.com/urho3d/Urho3D.git",
"https://github.com/valeriansaliou/boulder-dash.git",
"https://github.com/varunpant/CrappyBird.git",
"https://github.com/vcmi/vcmi.git",
"https://github.com/vcosta/derclou.git",
"https://github.com/vegastrike/Vega-Strike-Engine-Source.git",
@ -394,6 +414,7 @@
"https://github.com/zaki/irrlicht.git",
"https://github.com/zenorogue/hyperrogue.git",
"https://github.com/zombieman1041/BlakedAwesomenaughts.git",
"https://gitlab.com/Dringgstein/Commander-Genius.git",
"https://gitlab.com/KilgoreTroutMaskReplicant/1oom.git",
"https://gitlab.com/drummyfish/Bombman.git",
"https://gitlab.com/evol/evol-all.git",

View File

@ -2,16 +2,203 @@
Imports game details from libregamewiki by scraping the website, starting from https://libregamewiki.org/Category:Games
Also parse rejected games (https://libregamewiki.org/Libregamewiki:Rejected_games_list) and maybe https://libregamewiki.org/Libregamewiki:Suggested_games
Unique left column names in the game info boxes:
['Code license', 'Code licenses', 'Developer', 'Developers', 'Engine', 'Engines', 'Genre', 'Genres', 'Libraries', 'Library', 'Media license', 'Media licenses', 'P. language', 'P. languages', 'Platforms']
"""
import requests
import re
import json
from bs4 import BeautifulSoup, NavigableString
from utils.utils import *
def key_selection_gameinfobox(a, b):
    """
    Determines which one of the two candidate keys in a (e.g. singular and plural form of a field name) is in b.

    :param a: sequence of exactly two candidate keys
    :param b: container of the keys that are present (anything supporting the "in" operator)
    :return: tuple (key, index) of the candidate contained in b, where index is its position in a,
             or (None, None) if neither candidate is contained in b
    :raises RuntimeError: if a does not consist of exactly two elements or if both candidates are contained in b
    """
    if len(a) != 2:
        raise RuntimeError('expected exactly two candidate keys, got {}'.format(len(a)))
    # keep the position in a, callers use it to tell the singular (0) from the plural (1) form
    present = [(key, position) for position, key in enumerate(a) if key in b]
    if len(present) == 2:
        raise RuntimeError('both keys {!r} and {!r} are present, expected at most one'.format(a[0], a[1]))
    if not present:
        return None, None
    return present[0]
def extract_field_content(key, idx, info):
    """
    Extracts the comma separated values of one field of a game info box.

    :param key: name of the field, must be a key of info
    :param idx: 0 if key is the singular form of the field name, 1 if it is the plural form
                (only used to warn about a singular/plural mismatch)
    :param info: mapping of field names to objects offering get_text()
    :return: list of the stripped, non-empty values with trailing footnote markers like [1] removed
    :raises RuntimeError: if the field does not contain any value
    """
    text = info[key].get_text()
    content = []
    for item in text.split(','):
        # remove trailing footnote markers ([1], [2], [13], ...) and the whitespace in front of them
        item = re.sub(r'(\[\d+\])+$', '', item.strip()).strip()
        # str.split always returns at least one (possibly empty) item, so empty items must be dropped
        # explicitly, otherwise the emptiness check below can never trigger
        if item:
            content.append(item)
    if not content:
        raise RuntimeError('game info field {} is empty'.format(key))
    if (len(content) > 1 and idx == 0) or (len(content) == 1 and idx == 1):
        print(' warning: {} Sg./Pl. mismatch'.format(key))
    return content
if __name__ == "__main__":
    # Crawls the "Games" category of libregamewiki, parses every game page (game info box, package table,
    # categories) and stores the collected entries as JSON in lgw_import.json next to this file.

    regex_games = re.compile(r"<li><a href=\"\/(.+?)\".*?>(.+?)<\/a><\/li>")  # url part, name (not used below)

    # parameters
    base_url = 'https://libregamewiki.org'
    ignored_gameinfos = ['Contribute', 'Origin', 'Release date', 'Latest release']

    # read and process the base url (get all games and categories)
    url = base_url + '/Category:Games'
    games = []
    while True:
        text = requests.get(url).text
        soup = BeautifulSoup(text, 'html.parser')
        #categories = soup.find('div', id='mw-subcategories').find_all('li')
        #categories = [(x.a['href'], x.a.string) for x in categories]

        # game pages
        pages = soup.find('div', id='mw-pages').find_all('li')
        games.extend(((x.a['href'], x.a.string) for x in pages))

        # next page (follow the "next page" link until there is none anymore)
        next_page = soup.find('a', string='next page')
        if not next_page:
            break
        url = base_url + next_page['href']
    print('current number of games in LGW {}'.format(len(games)))

    # parse games
    counter = 0
    unique_gameinfo_fields = set()
    entries = []
    for game in games:
        url = base_url + game[0]
        text = requests.get(url).text
        soup = BeautifulSoup(text, 'html.parser')
        title = soup.h1.string
        print(title)
        entry = {'name': title}

        # parse gameinfobox
        info = soup.find('div', class_='gameinfobox')
        if not info:
            print(' no gameinfobox')
        else:
            # map the header cell text of each row to the data cell of that row
            info = info.find_all('tr')
            info = [(x.th.string, x.td) for x in info if x.th and x.th.string]
            info = [x for x in info if x[0] not in ignored_gameinfos]
            info = dict(info)
            unique_gameinfo_fields.update(info.keys())

            # consume fields of gameinfobox (every consumed key is deleted, leftovers are an error)

            # genre
            key, idx = key_selection_gameinfobox(('Genre', 'Genres'), info.keys())
            if key:
                genres = extract_field_content(key, idx, info)
                # fix: this was a bare entry['genre'] lookup which raised a KeyError and never stored the genres
                entry['genre'] = genres
                del info[key]

            # platforms
            key = 'Platforms'
            if key in info:
                platforms = extract_field_content(key, 1, info)
                # platforms = [x if x != 'Mac' else 'macOS' for x in platforms] # replace Mac with macOS
                entry['platform'] = platforms
                del info[key]

            # developer
            key, idx = key_selection_gameinfobox(('Developer', 'Developers'), info.keys())
            if key:
                entry['developer'] = extract_field_content(key, idx, info)
                del info[key]

            # code license
            key, idx = key_selection_gameinfobox(('Code license', 'Code licenses'), info.keys())
            if key:
                entry['code license'] = extract_field_content(key, idx, info)
                del info[key]

            # media license
            key, idx = key_selection_gameinfobox(('Media license', 'Media licenses'), info.keys())
            if key:
                entry['assets license'] = extract_field_content(key, idx, info)
                del info[key]

            # engine
            key, idx = key_selection_gameinfobox(('Engine', 'Engines'), info.keys())
            if key:
                entry['engine'] = extract_field_content(key, idx, info)
                del info[key]

            # library
            key, idx = key_selection_gameinfobox(('Library', 'Libraries'), info.keys())
            if key:
                entry['library'] = extract_field_content(key, idx, info)
                del info[key]

            # programming language
            key, idx = key_selection_gameinfobox(('P. language', 'P. languages'), info.keys())
            if key:
                languages = extract_field_content(key, idx, info)
                languages = [x for x in languages if x != 'HTML5']  # ignore HTML5
                entry['code language'] = languages
                del info[key]

            # unconsumed
            if info:
                print('unconsumed gameinfo keys {}'.format(info.keys()))
                raise RuntimeError()

        # parse "for available as package in"
        tables = soup.find_all('table', class_='wikitable')
        tables = [table for table in tables if table.caption and table.caption.string.startswith('Available as package')]
        if len(tables) > 0:
            if len(tables) > 1:
                raise RuntimeError()
            table = tables[0]
            packages = table.find_all('tr')
            packages = [x.td.a['href'] for x in packages]
            entry['linux-packages'] = packages

        # categories
        categories = soup.find_all('div', id='mw-normal-catlinks')
        if not categories:
            print(' no categories')
            categories = []
        else:
            if len(categories) > 1:
                raise RuntimeError()
            categories = categories[0]
            categories = categories.find_all('li')
            categories = [x.a.string for x in categories]
            if 'Games' not in categories:
                print(' "Games" not in categories')
            else:
                categories.remove('Games')  # should be there
            # strip games at the end
            phrase = ' games'
            categories = [x[:-len(phrase)] if x.endswith(phrase) else x for x in categories]
            ignored_categories = ['Articles lacking reference', 'Stubs']
            categories = [x for x in categories if x not in ignored_categories]
        entry['categories'] = categories

        entries.append(entry)
        # print(entry)

        # debugging aid: enable the break to stop after the first games
        counter += 1
        if counter > 20:
            # break
            pass

    unique_gameinfo_fields = sorted(unique_gameinfo_fields)
    print('unique gameinfo fields: {}'.format(unique_gameinfo_fields))

    # save entries
    # NOTE(review): os and write_text are not imported explicitly here, presumably they are provided by the
    # star import of utils.utils - confirm
    json_path = os.path.join(os.path.dirname(__file__), 'lgw_import.json')
    text = json.dumps(entries, indent=1)
    write_text(json_path, text)
# NOTE(review): the statements below fetch the category page once more and print its raw HTML; they overlap
# with the crawl above and look like the pre-change script body shown as removed lines of the diff -
# confirm before keeping them.
# read base url
base_url = 'https://libregamewiki.org/Category:Games'
text = requests.get(base_url).text
print(text)

View File

@ -2,4 +2,46 @@
Once data from libregamewiki is imported, synchronize it with our database, i.e. identify the entries both have in common,
estimate the differences between the entries both have in common, and suggest adding the entries that only one of them
has to the other.
"""
unique imported fields: 'assets license', 'categories', 'code language', 'code license', 'developer', 'engine', 'genre', 'library', 'linux-packages', 'name', 'platform'
"""
import json
from utils.utils import *
def get_unique_field_content(field, entries):
    """
    Collects the distinct values of one field over all entries.

    :param field: name of the field to look up in each entry
    :param entries: iterable of dict-like entries, entries without the field are skipped
    :return: sorted list of the unique values found for the field
    """
    unique_content = set()
    for entry in entries:
        if field not in entry:
            continue
        value = entry[field]
        if isinstance(value, str):
            # a plain string (like the 'name' field) is a single value, set.update would split it into characters
            unique_content.add(value)
        else:
            unique_content.update(value)
    return sorted(unique_content)
# Replacement table for platform names ('Mac' is replaced by 'macOS').
# NOTE(review): not referenced in the visible code, presumably meant for the imported 'platform' values - confirm
platform_replacements = {'Mac': 'macOS'}
if __name__ == "__main__":
    # Loads the entries written by the libregamewiki import and prints the field names and, for the
    # list valued fields, the distinct values that occur in them.

    # import lgw import
    json_path = os.path.join(os.path.dirname(__file__), 'lgw_import.json')
    text = read_text(json_path)
    lgw_entries = json.loads(text)

    # check for unique field names
    unique_fields = set()
    for lgw_entry in lgw_entries:
        unique_fields.update(lgw_entry.keys())
    unique_fields = sorted(list(unique_fields))
    print('unique lgw fields: {}'.format(unique_fields))

    # unique contents, one line per field in this fixed order
    reported_fields = ('platform', 'code language', 'categories', 'genre', 'library', 'code license',
                       'assets license', 'engine')
    for reported_field in reported_fields:
        print('{}: {}'.format(reported_field, get_unique_field_content(reported_field, lgw_entries)))

View File

@ -45,7 +45,8 @@ osgc_name_aliases = {}
# maps license names (keys) to the license identifiers they are replaced with (values)
# NOTE(review): presumably the keys are the spellings used by osgc and the values the ones used here - confirm
osgc_licenses_map = {'GPL2': 'GPL-2.0', 'GPL3': 'GPL-3.0', 'AGPL3': 'AGPL-3.0', 'LGPL3': 'LGPL-3.0', 'LGPL2': 'LGPL-2.1', 'MPL': 'MPL-2.0', 'Apache': 'Apache-2.0', 'Artistic': 'Artistic License'}
# ignore osgc entries (for various reasons like unclear license etc.)
# NOTE(review): the first assignment below is overwritten by the longer list following it; it looks like the
# pre-change line of the diff - confirm before keeping it
osgc_ignored_entries = ["A Mouse's Vengeance", 'achtungkurve.com', 'AdaDoom3', 'Agendaroids', 'Alien 8', 'Ard-Reil', 'Balloon Fight', 'bladerunner (Engine within SCUMMVM)', 'Block Shooter', 'Bomb Mania Reloaded', 'boulder-dash', 'Cannon Fodder']
osgc_ignored_entries = ["A Mouse's Vengeance", 'achtungkurve.com', 'AdaDoom3', 'Agendaroids', 'Alien 8', 'Ard-Reil', 'Balloon Fight', 'bladerunner (Engine within SCUMMVM)', 'Block Shooter', 'Bomb Mania Reloaded', 'boulder-dash', 'Cannon Fodder', 'Contra_remake', 'CosmicArk-Advanced', 'Deuteros X', 'datastorm'
,'div-columns', 'div-pacman2600', 'div-pitfall', 'div-spaceinvaders2600']
def similarity(a, b):
    """
    Returns the SequenceMatcher ratio of the two strings after case folding both of them.
    """
    folded_a = str.casefold(a)
    folded_b = str.casefold(b)
    matcher = SequenceMatcher(None, folded_a, folded_b)
    return matcher.ratio()
@ -325,12 +326,12 @@ if __name__ == "__main__":
originals = osgc_entry['originals']
if type(originals) == str:
originals = [originals]
keywords.append('inspired by {}'.format(' + '.join(original)))
keywords.append('inspired by {}'.format(' + '.join(originals)))
if 'multiplayer' in osgc_entry:
multiplayer = osgc_entry['multiplayer']
if type(multiplayer) == str:
multiplayer = [multiplayer]
keywords.extend('multiplayer {}'.format(' + '.join(multiplayer)))
keywords.append('multiplayer {}'.format(' + '.join(multiplayer)))
if 'content' in osgc_entry:
content = osgc_entry['content']
keywords.append('{} content'.format(content))