"""
|
|
Imports game details from libregamewiki by scraping the website, starting from https://libregamewiki.org/Category:Games
|
|
|
|
Also parse rejected games (https://libregamewiki.org/Libregamewiki:Rejected_games_list) and maybe https://libregamewiki.org/Libregamewiki:Suggested_games
|
|
|
|
Unique left column names in the game info boxes:
|
|
['Code license', 'Code licenses', 'Developer', 'Developers', 'Engine', 'Engines', 'Genre', 'Genres', 'Libraries', 'Library', 'Media license', 'Media licenses', 'P. language', 'P. languages', 'Platforms']
|
|
"""
|
|
|
|
import os
import json

import requests
from bs4 import BeautifulSoup

from utils import constants, utils, osg


def download_lgw_content():
    """
    Downloads the HTML page of every game listed under the LGW games category
    into the tools/lgw-import folder, one file per game.
    """

    # parameters
    base_url = 'https://libregamewiki.org'
    destination_path = os.path.join(constants.root_path, 'tools', 'lgw-import')
    utils.recreate_directory(destination_path)

    # read and process the base url (get all games and categories)
    url = base_url + '/Category:Games'
    games = []
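    # the category listing is paginated; follow the "next page" links until none is left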
    while True:
        text = requests.get(url).text
        soup = BeautifulSoup(text, 'html.parser')
        # categories = soup.find('div', id='mw-subcategories').find_all('li')
        # categories = [(x.a['href'], x.a.string) for x in categories]

        # game pages
        pages = soup.find('div', id='mw-pages').find_all('li')
        games.extend((x.a['href'], x.a.string) for x in pages)

        # next page
        next_page = soup.find('a', string='next page')
        if not next_page:
            break
        url = base_url + next_page['href']

    # remove user pages, templates and other non-game entries
    games = [game for game in games if not any(game[1].startswith(x) for x in ('User:', 'Template:', 'Bullet'))]

    print('current number of games in LGW {}'.format(len(games)))

    # download and store every game page
    for game in games:
        print(game[1])
        url = base_url + game[0]
        destination_file = os.path.join(destination_path, osg.canonical_game_name(game[0][1:]) + '.html')

        text = requests.get(url).text
        utils.write_text(destination_file, text)


def parse_lgw_content():
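    """
    Parses all game pages previously downloaded into tools/lgw-import and
    collects the extracted fields in _lgw.json. Each entry contains at least
    'name', 'description', 'external links' and 'categories'.
    """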

    # paths
    import_path = os.path.join(constants.root_path, 'tools', 'lgw-import')
    entries_file = os.path.join(import_path, '_lgw.json')

    # iterate over all imported files
    files = os.listdir(import_path)
    entries = []
    for file in files:
        if file == '_lgw.json':
            continue

        text = utils.read_text(os.path.join(import_path, file))

        # parse the html
        soup = BeautifulSoup(text, 'html.parser')
        title = soup.h1.get_text()
        print(title)
        entry = {'name': title}

        # get all external links (http(s) links that do not point back to LGW)
        links = [(x['href'], x.get_text()) for x in soup.find_all('a', href=True)]
        links = [x for x in links if x[0].startswith('http') and not x[0].startswith('https://libregamewiki.org/')]
        entry['external links'] = links

        # get the meta description (guard against pages without one)
        description = soup.find('meta', attrs={'name': 'description'})
        entry['description'] = description['content'] if description else ''

        # parse the gameinfobox
        infos = soup.find('div', class_='gameinfobox')
        if not infos:
            print(' no gameinfobox')
        else:
            for row in infos.find_all('tr'):
                if row.th and row.td:
                    # row with header: comma separated values
                    key = row.th.get_text()
                    content = [c.strip() for c in row.td.get_text().split(',')]
                    entry[key] = content
                elif row.td:
                    # row without header: contribute section
                    items = row.find_all('li')
                    items = [(item.a.string, item.a['href']) for item in items if item.a]
                    for key, content in items:
                        entry[key] = content

        # parse the "Available as package" table
        tables = soup.find_all('table', class_='wikitable')
        tables = [table for table in tables if table.caption and table.caption.string.startswith('Available as package')]
        if len(tables) > 1:
            raise RuntimeError('more than one "Available as package" table in {}'.format(file))
        if tables:
            packages = tables[0].find_all('tr')
            packages = [row.td.a['href'] for row in packages if row.td and row.td.a]  # skip rows without a package link
            entry['linux-packages'] = packages

        # categories
        categories = soup.find_all('div', id='mw-normal-catlinks')
        if not categories:
            print(' no categories')
            categories = []
        else:
            if len(categories) > 1:
                raise RuntimeError('more than one category block in {}'.format(file))
            categories = categories[0].find_all('li')
            categories = [x.a.string for x in categories]
            if 'Games' not in categories:
                print(' "Games" not in categories')
            else:
                categories.remove('Games')  # should always be there
            # strip " games" at the end of category names
            phrase = ' games'
            categories = [x[:-len(phrase)] if x.endswith(phrase) else x for x in categories]
            ignored_categories = ['Articles lacking reference', 'Stubs']
            categories = [x for x in categories if x not in ignored_categories]
        entry['categories'] = categories

        entries.append(entry)

    # save entries
    text = json.dumps(entries, indent=1)
    utils.write_text(entries_file, text)


def clean_lgw_content():
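    """
    Cleans the entries parsed into _lgw.json: unifies field names and reports
    the unique and the mandatory fields.
    """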

    # paths
    import_path = os.path.join(constants.root_path, 'tools', 'lgw-import')
    entries_file = os.path.join(import_path, '_lgw.json')
    cleaned_entries_file = os.path.join(import_path, '_lgw.cleaned.json')

    # load entries
    text = utils.read_text(entries_file)
    entries = json.loads(text)

    # rename keys
    key_replacements = (
        ('developer', ('Developer', 'Developers')),
        ('code license', ('Code license', 'Code licenses')),
        ('engine', ('Engine', 'Engines')),
        ('genre', ('Genre', 'Genres')),
    )
    for index, entry in enumerate(entries):
        for new_key, old_keys in key_replacements:
            for key in old_keys:
                if key in entry:
                    entry[new_key] = entry[key]
                    del entry[key]
                    break
        entries[index] = entry

    # collect the set of all field names
    unique_fields = set()
    for entry in entries:
        unique_fields.update(entry.keys())
    print('unique lgw fields: {}'.format(sorted(unique_fields)))

    # determine which fields are mandatory, i.e. appear in every entry
    for entry in entries:
        remove_fields = [field for field in unique_fields if field not in entry]
        unique_fields -= set(remove_fields)
    print('mandatory lgw fields: {}'.format(sorted(unique_fields)))
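

# The three stages build on each other: stage one downloads the game pages,
# stage two parses them into _lgw.json, stage three cleans the parsed entries.
# Comment the stages in or out as needed.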
if __name__ == "__main__":

    # stage one
    # download_lgw_content()

    # stage two
    # parse_lgw_content()

    # stage three
    clean_lgw_content()