# opensourcegames/tools/libregamewiki_import.py
# 2019-08-23 22:17:51 +02:00
#
# 205 lines, 7.3 KiB, Python
#
# Note from the hosting web interface: this file contains invisible Unicode
# characters that are indistinguishable to humans but may be processed
# differently by a computer. If this is intentional, the warning can be ignored.

"""
Imports game details from libregamewiki by scraping the website, starting from https://libregamewiki.org/Category:Games
Also parse rejected games (https://libregamewiki.org/Libregamewiki:Rejected_games_list) and maybe https://libregamewiki.org/Libregamewiki:Suggested_games
Unique left column names in the game info boxes:
['Code license', 'Code licenses', 'Developer', 'Developers', 'Engine', 'Engines', 'Genre', 'Genres', 'Libraries', 'Library', 'Media license', 'Media licenses', 'P. language', 'P. languages', 'Platforms']
"""
import requests
import json
from bs4 import BeautifulSoup, NavigableString
from utils.utils import *
def key_selection_gameinfobox(a, b):
    """
    Determines which one of two alternative keys (e.g. singular/plural spelling) is present.

    :param a: Sequence of exactly two candidate keys.
    :param b: Container supporting ``in`` (the callers pass the keys of the info box dict).
    :return: Tuple ``(key, index)`` of the candidate that is contained in b, where index is the
             position of the key in a, or ``(None, None)`` if neither candidate is contained.
    :raises RuntimeError: If a does not hold exactly two elements or if both candidates are in b.
    """
    if len(a) != 2:
        raise RuntimeError('expected exactly two candidate keys, got {}'.format(len(a)))

    # collect every candidate that is present together with its position in a
    present = [(key, idx) for idx, key in enumerate(a) if key in b]

    if len(present) == 2:
        # both spellings at once is ambiguous, refuse to choose
        raise RuntimeError('both alternative keys {} are present'.format(tuple(a)))
    if not present:
        return None, None
    return present[0]
def extract_field_content(key, idx, info):
    """
    Extracts the comma separated values of a single game info box field.

    :param key: Name of the field to read from info.
    :param idx: 0 if key is the singular spelling of the field name, 1 if it is the plural
                spelling. Only used to warn about a singular/plural mismatch.
    :param info: Mapping from field name to an object offering ``get_text()`` (the script
                 passes the ``td`` tags of the info box).
    :return: List of the non-empty, stripped values with a trailing '[1]' or '[2]' marker removed.
    :raises RuntimeError: If the field does not contain any value.
    """
    values = []
    for item in info[key].get_text().split(','):
        item = item.strip()
        if item.endswith(('[1]', '[2]')):
            # remove trailing [1] or [2] marker
            item = item[:-3].strip()
        # str.split never returns an empty list, so empty items (empty field, trailing or
        # double commas) have to be dropped here for the emptiness check below to be effective
        if item:
            values.append(item)

    if not values:
        raise RuntimeError('game info field "{}" is empty'.format(key))

    if (len(values) > 1 and idx == 0) or (len(values) == 1 and idx == 1):
        print(' warning: {} Sg./Pl. mismatch'.format(key))
    return values
if __name__ == "__main__":
    # Scrapes all game pages listed in the LGW "Games" category, extracts the info box,
    # the "Available as package" table and the categories of every game and stores the
    # result as JSON next to this script.

    # parameters
    base_url = 'https://libregamewiki.org'
    ignored_gameinfos = ['Contribute', 'Origin', 'Release date', 'Latest release']

    # read and process the base url (get all games), following the "next page" links
    url = base_url + '/Category:Games'
    games = []
    while True:
        text = requests.get(url).text
        soup = BeautifulSoup(text, 'html.parser')

        # game pages (relative link, title)
        pages = soup.find('div', id='mw-pages').find_all('li')
        games.extend((x.a['href'], x.a.string) for x in pages)

        # next page
        next_page = soup.find('a', string='next page')
        if not next_page:
            break
        url = base_url + next_page['href']
    print('current number of games in LGW {}'.format(len(games)))

    # parse games
    counter = 0
    unique_gameinfo_fields = set()
    entries = []
    for game in games:
        url = base_url + game[0]
        text = requests.get(url).text
        soup = BeautifulSoup(text, 'html.parser')
        title = soup.h1.string
        print(title)
        entry = {'name': title}

        # parse gameinfobox
        info = soup.find('div', class_='gameinfobox')
        if not info:
            print(' no gameinfobox')
        else:
            # map the left column text (th) to the right column cell (td)
            info = info.find_all('tr')
            info = [(x.th.string, x.td) for x in info if x.th and x.th.string]
            info = [x for x in info if x[0] not in ignored_gameinfos]
            info = dict(info)
            unique_gameinfo_fields.update(info.keys())

            # consume fields of gameinfobox (every handled key is deleted from info)

            # genre
            key, idx = key_selection_gameinfobox(('Genre', 'Genres'), info.keys())
            if key:
                genres = extract_field_content(key, idx, info)
                # store the parsed genres (a bare entry['genre'] lookup would raise a KeyError)
                entry['genre'] = genres
                del info[key]

            # platforms (only the plural spelling exists)
            key = 'Platforms'
            if key in info:
                platforms = extract_field_content(key, 1, info)
                entry['platform'] = platforms
                del info[key]

            # developer
            key, idx = key_selection_gameinfobox(('Developer', 'Developers'), info.keys())
            if key:
                entry['developer'] = extract_field_content(key, idx, info)
                del info[key]

            # code license
            key, idx = key_selection_gameinfobox(('Code license', 'Code licenses'), info.keys())
            if key:
                entry['code license'] = extract_field_content(key, idx, info)
                del info[key]

            # media license
            key, idx = key_selection_gameinfobox(('Media license', 'Media licenses'), info.keys())
            if key:
                entry['assets license'] = extract_field_content(key, idx, info)
                del info[key]

            # engine
            key, idx = key_selection_gameinfobox(('Engine', 'Engines'), info.keys())
            if key:
                entry['engine'] = extract_field_content(key, idx, info)
                del info[key]

            # library
            key, idx = key_selection_gameinfobox(('Library', 'Libraries'), info.keys())
            if key:
                entry['library'] = extract_field_content(key, idx, info)
                del info[key]

            # programming language
            key, idx = key_selection_gameinfobox(('P. language', 'P. languages'), info.keys())
            if key:
                languages = extract_field_content(key, idx, info)
                languages = [x for x in languages if x != 'HTML5']  # ignore HTML5
                entry['code language'] = languages
                del info[key]

            # unconsumed (an unknown field name means this script needs to be extended)
            if info:
                print('unconsumed gameinfo keys {}'.format(info.keys()))
                raise RuntimeError()

        # parse "Available as package in" table
        tables = soup.find_all('table', class_='wikitable')
        # caption.string is None if the caption contains nested markup, treat that as no match
        tables = [table for table in tables
                  if table.caption and (table.caption.string or '').startswith('Available as package')]
        if len(tables) > 0:
            if len(tables) > 1:
                raise RuntimeError()
            table = tables[0]
            packages = table.find_all('tr')
            # rows without a linked data cell (e.g. header rows) carry no package link
            packages = [x.td.a['href'] for x in packages if x.td and x.td.a]
            entry['linux-packages'] = packages

        # categories
        categories = soup.find_all('div', id='mw-normal-catlinks')
        if not categories:
            print(' no categories')
            categories = []
        else:
            if len(categories) > 1:
                raise RuntimeError()
            categories = categories[0]
            categories = categories.find_all('li')
            categories = [x.a.string for x in categories]
            if 'Games' not in categories:
                print(' "Games" not in categories')
            else:
                categories.remove('Games')  # should be there
            # strip games at the end
            phrase = ' games'
            categories = [x[:-len(phrase)] if x.endswith(phrase) else x for x in categories]
            ignored_categories = ['Articles lacking reference', 'Stubs']
            categories = [x for x in categories if x not in ignored_categories]
        entry['categories'] = categories

        entries.append(entry)

        # debugging aid: replace "pass" by "break" to stop after a few games
        counter += 1
        if counter > 20:
            pass

    unique_gameinfo_fields = sorted(unique_gameinfo_fields)
    print('unique gameinfo fields: {}'.format(unique_gameinfo_fields))

    # save entries
    json_path = os.path.join(os.path.dirname(__file__), 'lgw_import.json')
    text = json.dumps(entries, indent=1)
    write_text(json_path, text)