""" Imports game details from libregamewiki by scraping the website, starting from https://libregamewiki.org/Category:Games Also parse rejected games (https://libregamewiki.org/Libregamewiki:Rejected_games_list) and maybe https://libregamewiki.org/Libregamewiki:Suggested_games Unique left column names in the game info boxes: ['Code license', 'Code licenses', 'Developer', 'Developers', 'Engine', 'Engines', 'Genre', 'Genres', 'Libraries', 'Library', 'Media license', 'Media licenses', 'P. language', 'P. languages', 'Platforms'] """ import os import requests import json from bs4 import BeautifulSoup from utils import constants, utils, osg def download_lgw_content(): """ :return: """ # parameters base_url = 'https://libregamewiki.org' destination_path = os.path.join(constants.root_path, 'tools', 'lgw-import') utils.recreate_directory(destination_path) # read and process the base url (get all games and categories) url = base_url + '/Category:Games' games = [] while True: text = requests.get(url).text soup = BeautifulSoup(text, 'html.parser') #categories = soup.find('div', id='mw-subcategories').find_all('li') #categories = [(x.a['href'], x.a.string) for x in categories] # game pages pages = soup.find('div', id='mw-pages').find_all('li') games.extend(((x.a['href'], x.a.string) for x in pages)) # next page next_page = soup.find('a', string='next page') if not next_page: break url = base_url + next_page['href'] # remove all those that start with user games = [game for game in games if not any(game[1].startswith(x) for x in ('User:', 'Template:', 'Bullet'))] print('current number of games in LGW {}'.format(len(games))) for game in games: print(game[1]) url = base_url + game[0] destination_file = os.path.join(destination_path, osg.canonical_game_name(game[0][1:]) + '.html') text = requests.get(url).text utils.write_text(destination_file, text) def parse_lgw_content(): # paths import_path = os.path.join(constants.root_path, 'tools', 'lgw-import') entries_file = os.path.join(import_path, '_lgw.json') # iterate over all imported files files = os.listdir(import_path) entries = [] for file in files: if file == '_lgw.json': continue text = utils.read_text(os.path.join(import_path, file)) # parse the html soup = BeautifulSoup(text, 'html.parser') title = soup.h1.get_text() print(title) entry = {'name': title} # get all external links links = [(x['href'], x.get_text()) for x in soup.find_all('a', href=True)] links = [x for x in links if x[0].startswith('http') and not x[0].startswith('https://libregamewiki.org/')] entry['external links'] = links # get meta description description = soup.find('meta', attrs={"name":"description"}) entry['description'] = description['content'] # parse gameinfobox infos = soup.find('div', class_='gameinfobox') if not infos: print(' no gameinfobox') else: infos = infos.find_all('tr') for x in infos: if x.th and x.td: # row with header key = x.th.get_text() content = x.td.get_text() content = content.split(',') content = [x.strip() for x in content] entry[key] = content if not x.th and x.td: # row without header: contribute section x = x.find_all('li') x = [(x.a.string, x.a['href']) for x in x if x.a] for key, content in x: entry[key] = content # parse "for available as package in" tables = soup.find_all('table', class_='wikitable') tables = [table for table in tables if table.caption and table.caption.string.startswith('Available as package')] if len(tables) > 0: if len(tables) > 1: raise RuntimeError() table = tables[0] packages = table.find_all('tr') packages = [x.td.a['href'] 
for x in packages] entry['linux-packages'] = packages # categories categories = soup.find_all('div', id='mw-normal-catlinks') if not categories: print(' no categories') categories = [] else: if len(categories) > 1: raise RuntimeError() categories = categories[0] categories = categories.find_all('li') categories = [x.a.string for x in categories] if 'Games' not in categories: print(' "Games" not in categories') else: categories.remove('Games') # should be there # strip games at the end phrase = ' games' categories = [x[:-len(phrase)] if x.endswith(phrase) else x for x in categories] ignored_categories = ['Articles lacking reference', 'Stubs'] categories = [x for x in categories if x not in ignored_categories] entry['categories'] = categories entries.append(entry) # save entries text = json.dumps(entries, indent=1) utils.write_text(entries_file, text) def clean_lgw_content(): # paths import_path = os.path.join(constants.root_path, 'tools', 'lgw-import') entries_file = os.path.join(import_path, '_lgw.json') cleaned_entries_file = os.path.join(import_path, '_lgw.cleaned.json') # load entries text = utils.read_text(entries_file) entries = json.loads(text) # rename keys key_replacements = (('developer', ('Developer', 'Developers')), ('code license', ('Code license', 'Code licenses')), ('engine', ('Engine', 'Engines')), ('genre', ('Genre', 'Genres'))) for index, entry in enumerate(entries): for new_key, old_keys in key_replacements: for key in old_keys: if key in entry: entry[new_key] = entry[key] del entry[key] break entries[index] = entry # check for unique field names unique_fields = set() for entry in entries: unique_fields.update(entry.keys()) print('unique lgw fields: {}'.format(sorted(list(unique_fields)))) # which fields are mandatory for entry in entries: remove_fields = [field for field in unique_fields if field not in entry] unique_fields -= set(remove_fields) print('mandatory lgw fields: {}'.format(sorted(list(unique_fields)))) if __name__ == "__main__": # stage one # download_lgw_content() # stage two # parse_lgw_content() # stage three clean_lgw_content()
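

# The module docstring mentions the rejected games list
# (https://libregamewiki.org/Libregamewiki:Rejected_games_list), which nothing above handles yet.
# Below is a minimal sketch of how that page could be fetched, under assumptions: the page keeps
# rejected games as <li> items inside the standard MediaWiki content div, and the helper name
# download_lgw_rejected_games is hypothetical, not part of the existing staged workflow. If it
# were wired into the stages above, it should be defined before the __main__ block.
def download_lgw_rejected_games():
    base_url = 'https://libregamewiki.org'
    url = base_url + '/Libregamewiki:Rejected_games_list'
    text = requests.get(url).text
    soup = BeautifulSoup(text, 'html.parser')
    # assumption: every <li> in the content area is one rejected game entry
    content = soup.find('div', id='mw-content-text')
    names = [li.get_text().strip() for li in content.find_all('li')]
    print('rejected games in LGW: {}'.format(len(names)))
    return names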