"""
|
||
Imports game details from libregamewiki by scraping the website, starting from https://libregamewiki.org/Category:Games
|
||
|
||
Also parse rejected games (https://libregamewiki.org/Libregamewiki:Rejected_games_list) and maybe https://libregamewiki.org/Libregamewiki:Suggested_games
|
||
|
||
Unique left column names in the game info boxes:
|
||
['Code license', 'Code licenses', 'Developer', 'Developers', 'Engine', 'Engines', 'Genre', 'Genres', 'Libraries', 'Library', 'Media license', 'Media licenses', 'P. language', 'P. languages', 'Platforms']
|
||
"""

import os  # used for os.path below (may also be provided by the wildcard utils import)
import json

import requests
from bs4 import BeautifulSoup, NavigableString

from utils.utils import *


def key_selection_gameinfobox(a, b):
    """
    Checks which of the two keys in a (a singular/plural pair) is contained in b.

    Returns (key, index) for the variant that is present, (None, None) if neither
    is present, and raises if both are present (or if a is not a pair).
    """
    if len(a) != 2:
        raise RuntimeError('expected a pair of alternative keys, got {}'.format(a))
    c = [x in b for x in a]
    if all(c):
        raise RuntimeError('both variants of {} are present'.format(a))
    if not any(c):
        return None, None
    d = [(k, i) for (i, k) in enumerate(a) if c[i]]
    return d[0]
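
# Hypothetical illustration: with info box keys {'Genres', 'Developer'},
# key_selection_gameinfobox(('Genre', 'Genres'), keys) returns ('Genres', 1),
# and key_selection_gameinfobox(('Engine', 'Engines'), keys) returns (None, None).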


def extract_field_content(key, idx, info):
    """
    Extracts the content of a game info box field as a list of comma-separated,
    stripped values. Warns if the grammatical number of the key (idx 0 = singular,
    1 = plural) does not match the number of values.
    """
    content = info[key].get_text()
    content = content.split(',')
    content = [x.strip() for x in content]
    content = [x if not (x.endswith('[1]') or x.endswith('[2]')) else x[:-3] for x in content]  # remove trailing footnote markers [1], [2]
    content = [x.strip() for x in content]
    if not content:
        raise RuntimeError('empty content for field {}'.format(key))
    if (len(content) > 1 and idx == 0) or (len(content) == 1 and idx == 1):
        print(' warning: {} Sg./Pl. mismatch'.format(key))
    return content
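
# Hypothetical illustration: for a 'Genres' cell whose text renders as
# 'Action, Puzzle[1]', extract_field_content('Genres', 1, info) yields
# ['Action', 'Puzzle'].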


if __name__ == "__main__":

    # parameters
    base_url = 'https://libregamewiki.org'
    ignored_gameinfos = ['Contribute', 'Origin', 'Release date', 'Latest release']

    # read and process the base url (get all games and categories)
    url = base_url + '/Category:Games'
    games = []
    while True:
        text = requests.get(url).text
        soup = BeautifulSoup(text, 'html.parser')
        # categories = soup.find('div', id='mw-subcategories').find_all('li')
        # categories = [(x.a['href'], x.a.string) for x in categories]

        # game pages
        pages = soup.find('div', id='mw-pages').find_all('li')
        games.extend(((x.a['href'], x.a.string) for x in pages))

        # next page
        next_page = soup.find('a', string='next page')
        if not next_page:
            break
        url = base_url + next_page['href']

    print('current number of games in LGW {}'.format(len(games)))

    # parse games
    counter = 0
    unique_gameinfo_fields = set()
    entries = []
    for game in games:
        url = base_url + game[0]
        text = requests.get(url).text
        soup = BeautifulSoup(text, 'html.parser')
        title = soup.h1.string
        print(title)
        entry = {'name': title}

        # parse gameinfobox
        info = soup.find('div', class_='gameinfobox')
        if not info:
            print(' no gameinfobox')
        else:
            info = info.find_all('tr')
            info = [(x.th.string, x.td) for x in info if x.th and x.th.string]
            info = [x for x in info if x[0] not in ignored_gameinfos]
            info = dict(info)
            unique_gameinfo_fields.update(info.keys())

            # consume fields of gameinfobox
            # genre
            key, idx = key_selection_gameinfobox(('Genre', 'Genres'), info.keys())
            if key:
                genres = extract_field_content(key, idx, info)
                entry['genre'] = genres
                del info[key]

            # platforms
            key = 'Platforms'
            if key in info:
                platforms = extract_field_content(key, 1, info)
                # platforms = [x if x != 'Mac' else 'macOS' for x in platforms]  # replace Mac with macOS
                entry['platform'] = platforms
                del info[key]

            # developer
            key, idx = key_selection_gameinfobox(('Developer', 'Developers'), info.keys())
            if key:
                entry['developer'] = extract_field_content(key, idx, info)
                del info[key]

            # code license
            key, idx = key_selection_gameinfobox(('Code license', 'Code licenses'), info.keys())
            if key:
                entry['code license'] = extract_field_content(key, idx, info)
                del info[key]

            # media license
            key, idx = key_selection_gameinfobox(('Media license', 'Media licenses'), info.keys())
            if key:
                entry['assets license'] = extract_field_content(key, idx, info)
                del info[key]

            # engine
            key, idx = key_selection_gameinfobox(('Engine', 'Engines'), info.keys())
            if key:
                entry['engine'] = extract_field_content(key, idx, info)
                del info[key]

            # library
            key, idx = key_selection_gameinfobox(('Library', 'Libraries'), info.keys())
            if key:
                entry['library'] = extract_field_content(key, idx, info)
                del info[key]

            # programming language
            key, idx = key_selection_gameinfobox(('P. language', 'P. languages'), info.keys())
            if key:
                languages = extract_field_content(key, idx, info)
                languages = [x for x in languages if x != 'HTML5']  # ignore HTML5
                entry['code language'] = languages
                del info[key]

            # unconsumed
            if info:
                print('unconsumed gameinfo keys {}'.format(info.keys()))
                raise RuntimeError()

        # parse "Available as package" table
        tables = soup.find_all('table', class_='wikitable')
        # get_text() instead of .string, which is None when the caption contains nested markup
        tables = [table for table in tables if table.caption and table.caption.get_text().startswith('Available as package')]
        if len(tables) > 0:
            if len(tables) > 1:
                raise RuntimeError()
            table = tables[0]
            packages = table.find_all('tr')
            packages = [x.td.a['href'] for x in packages]
            entry['linux-packages'] = packages

        # categories
        categories = soup.find_all('div', id='mw-normal-catlinks')
        if not categories:
            print(' no categories')
            categories = []
        else:
            if len(categories) > 1:
                raise RuntimeError()
            categories = categories[0]
            categories = categories.find_all('li')
            categories = [x.a.string for x in categories]
            if 'Games' not in categories:
                print(' "Games" not in categories')
            else:
                categories.remove('Games')  # should be there
            # strip ' games' at the end
            phrase = ' games'
            categories = [x[:-len(phrase)] if x.endswith(phrase) else x for x in categories]
            ignored_categories = ['Articles lacking reference', 'Stubs']
            categories = [x for x in categories if x not in ignored_categories]
        entry['categories'] = categories

        entries.append(entry)
        # print(entry)

        counter += 1
        if counter > 20:
            # break  # uncomment to only parse the first 20 games while debugging
            pass

    unique_gameinfo_fields = sorted(list(unique_gameinfo_fields))
    print('unique gameinfo fields: {}'.format(unique_gameinfo_fields))

    # save entries
    json_path = os.path.join(os.path.dirname(__file__), 'lgw_import.json')
    text = json.dumps(entries, indent=1)
    write_text(json_path, text)
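
# The resulting lgw_import.json is a list of entry dicts. A single entry might
# look like this (hypothetical values; keys as assigned above):
# {
#  "name": "Example Game",
#  "genre": ["Puzzle"],
#  "platform": ["Linux"],
#  "developer": ["Jane Doe"],
#  "code license": ["GPL v3"],
#  "code language": ["Python"],
#  "categories": ["Puzzle"]
# }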