""" Imports game details from libregamewiki by scraping the website, starting from https://libregamewiki.org/Category:Games Also parse rejected games (https://libregamewiki.org/Libregamewiki:Rejected_games_list) and maybe https://libregamewiki.org/Libregamewiki:Suggested_games Unique left column names in the game info boxes: ['Code license', 'Code licenses', 'Developer', 'Developers', 'Engine', 'Engines', 'Genre', 'Genres', 'Libraries', 'Library', 'Media license', 'Media licenses', 'P. language', 'P. languages', 'Platforms'] """ import requests import json from bs4 import BeautifulSoup, NavigableString from utils.utils import * def key_selection_gameinfobox(a, b): """ Checks which of the two elements in a is in b or none but not both """ if len(a) != 2: raise RuntimeError() c = [x in b for x in a] if all(c): raise RuntimeError if not any(c): return None, None d = [(k, i) for (i, k) in enumerate(a) if c[i]] return d[0] def extract_field_content(key, idx, info): """ From a game info field. """ content = info[key].get_text() content = content.split(',') content = [x.strip() for x in content] content = [x if not (x.endswith('[1]') or x.endswith('[2]')) else x[:-3] for x in content] # remove trailing [1,2] content = [x.strip() for x in content] if not content: raise RuntimeError if (len(content) > 1 and idx == 0) or (len(content) == 1 and idx == 1): print(' warning: {} Sg./Pl. mismatch'.format(key)) return content if __name__ == "__main__": # parameters base_url = 'https://libregamewiki.org' ignored_gameinfos = ['Contribute', 'Origin', 'Release date', 'Latest release'] # read and process the base url (get all games and categories) url = base_url + '/Category:Games' games = [] while True: text = requests.get(url).text soup = BeautifulSoup(text, 'html.parser') #categories = soup.find('div', id='mw-subcategories').find_all('li') #categories = [(x.a['href'], x.a.string) for x in categories] # game pages pages = soup.find('div', id='mw-pages').find_all('li') games.extend(((x.a['href'], x.a.string) for x in pages)) # next page next_page = soup.find('a', string='next page') if not next_page: break url = base_url + next_page['href'] print('current number of games in LGW {}'.format(len(games))) # parse games counter = 0 unique_gameinfo_fields = set() entries = [] for game in games: url = base_url + game[0] text = requests.get(url).text soup = BeautifulSoup(text, 'html.parser') title = soup.h1.string print(title) entry = {'name': title} # parse gameinfobox info = soup.find('div', class_='gameinfobox') if not info: print(' no gameinfobox') else: info = info.find_all('tr') info = [(x.th.string, x.td) for x in info if x.th and x.th.string] info = [x for x in info if x[0] not in ignored_gameinfos] info = dict(info) unique_gameinfo_fields.update(info.keys()) # consume fields of gameinfobox # genre key, idx = key_selection_gameinfobox(('Genre', 'Genres'), info.keys()) if key: genres = extract_field_content(key, idx, info) entry['genre'] del info[key] # platforms key = 'Platforms' if key in info: platforms = extract_field_content(key, 1, info) # platforms = [x if x != 'Mac' else 'macOS' for x in platforms] # replace Mac with macOS entry['platform'] = platforms del info[key] # developer key, idx = key_selection_gameinfobox(('Developer', 'Developers'), info.keys()) if key: entry['developer'] = extract_field_content(key, idx, info) del info[key] # code license key, idx = key_selection_gameinfobox(('Code license', 'Code licenses'), info.keys()) if key: entry['code license'] = extract_field_content(key, idx, info) 
del info[key] # media license key, idx = key_selection_gameinfobox(('Media license', 'Media licenses'), info.keys()) if key: entry['assets license'] = extract_field_content(key, idx, info) del info[key] # engine key, idx = key_selection_gameinfobox(('Engine', 'Engines'), info.keys()) if key: entry['engine'] = extract_field_content(key, idx, info) del info[key] # library key, idx = key_selection_gameinfobox(('Library', 'Libraries'), info.keys()) if key: entry['library'] = extract_field_content(key, idx, info) del info[key] # programming language key, idx = key_selection_gameinfobox(('P. language', 'P. languages'), info.keys()) if key: languages = extract_field_content(key, idx, info) languages = [x for x in languages if x != 'HTML5'] # ignore HTML5 entry['code language'] = languages del info[key] # unconsumed if info: print('unconsumed gameinfo keys {}'.format(info.keys())) raise RuntimeError() # parse "for available as package in" tables = soup.find_all('table', class_='wikitable') tables = [table for table in tables if table.caption and table.caption.string.startswith('Available as package')] if len(tables) > 0: if len(tables) > 1: raise RuntimeError() table = tables[0] packages = table.find_all('tr') packages = [x.td.a['href'] for x in packages] entry['linux-packages'] = packages # categories categories = soup.find_all('div', id='mw-normal-catlinks') if not categories: print(' no categories') categories = [] else: if len(categories) > 1: raise RuntimeError() categories = categories[0] categories = categories.find_all('li') categories = [x.a.string for x in categories] if 'Games' not in categories: print(' "Games" not in categories') else: categories.remove('Games') # should be there # strip games at the end phrase = ' games' categories = [x[:-len(phrase)] if x.endswith(phrase) else x for x in categories] ignored_categories = ['Articles lacking reference', 'Stubs'] categories = [x for x in categories if x not in ignored_categories] entry['categories'] = categories entries.append(entry) # print(entry) counter += 1 if counter > 20: # break pass unique_gameinfo_fields = sorted(list(unique_gameinfo_fields)) print('unique gameinfo fields: {}'.format(unique_gameinfo_fields)) # save entries json_path = os.path.join(os.path.dirname(__file__), 'lgw_import.json') text = json.dumps(entries, indent=1) write_text(json_path, text)
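
# A minimal sketch of the shape of one entry in lgw_import.json, assuming a game
# page with a fully populated gameinfobox. The keys below are the ones this
# script sets; the game name and all field values are invented for illustration
# and do not come from the wiki:
#
# {
#  "name": "Example Game",
#  "genre": ["Strategy"],
#  "platform": ["Linux"],
#  "developer": ["Example Developer"],
#  "code license": ["GPL-3.0"],
#  "assets license": ["CC-BY-SA 3.0"],
#  "engine": ["Example Engine"],
#  "library": ["SDL2"],
#  "code language": ["C++"],
#  "linux-packages": ["/Some_package_page"],
#  "categories": ["Strategy"]
# }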