imports from osgameclones

2019-08-23 22:17:51 +02:00
parent 42dee8c7e8
commit 39a9f55cae
49 changed files with 1199 additions and 169 deletions
--- a/tools/libregamewiki_import.py
+++ b/tools/libregamewiki_import.py
@@ -2,16 +2,203 @@
 Imports game details from libregamewiki by scraping the website, starting from https://libregamewiki.org/Category:Games

 Also parse rejected games (https://libregamewiki.org/Libregamewiki:Rejected_games_list) and maybe https://libregamewiki.org/Libregamewiki:Suggested_games
+
+Unique left column names in the game info boxes:
+['Code license', 'Code licenses', 'Developer', 'Developers', 'Engine', 'Engines', 'Genre', 'Genres', 'Libraries', 'Library', 'Media license', 'Media licenses', 'P. language', 'P. languages', 'Platforms']
 """

 import requests
-import re
+import json
+from bs4 import BeautifulSoup, NavigableString
+from utils.utils import *
+
+
+def key_selection_gameinfobox(a, b):
+    """
+    Returns (key, index) for the one element of the pair a that is contained in b, (None, None) if neither is (both: error).
+    """
+    if len(a) != 2:
+        raise RuntimeError()
+    first, second = a
+    has_first, has_second = first in b, second in b
+    if has_first and has_second:
+        raise RuntimeError
+    if has_first:
+        return first, 0
+    return (second, 1) if has_second else (None, None)
+
+
+def extract_field_content(key, idx, info):
+    """
+    Returns the non-empty comma separated values of field key, footnote marks [1]/[2] removed (RuntimeError if none).
+    """
+    content = [x.strip() for x in info[key].get_text().split(',')]
+    content = [x[:-3].strip() if x.endswith(('[1]', '[2]')) else x for x in content]  # remove trailing [1,2]
+    # str.split never returns an empty list, so empty items must be dropped for the check below to be reachable
+    content = [x for x in content if x]
+    if not content:
+        raise RuntimeError('game info field {} is empty'.format(key))
+    # idx is 0 for the singular and 1 for the plural field name
+    if (len(content) > 1 and idx == 0) or (len(content) == 1 and idx == 1):
+        print(' warning: {} Sg./Pl. mismatch'.format(key))
+    return content
+

 if __name__ == "__main__":

-    regex_games = re.compile(r"<li><a href=\"\/(.+?)\".*?>(.+?)<\/a><\/li>") # url part, name
+    # scrapes every game page of libregamewiki and hands the extracted details as JSON to write_text (lgw_import.json)
+    # parameters
+    base_url = 'https://libregamewiki.org'
+    # rows of the game info box that are not imported
+    ignored_gameinfos = ['Contribute', 'Origin', 'Release date', 'Latest release']
+
+    # read and process the base url (get all games)
+    url = base_url + '/Category:Games'
+    games = []
+    while True:
+        text = requests.get(url).text
+        soup = BeautifulSoup(text, 'html.parser')
+
+        # game pages (url part, name)
+        pages = soup.find('div', id='mw-pages').find_all('li')
+        games.extend((x.a['href'], x.a.string) for x in pages)
+
+        # next page (the category listing is paginated, stop when there is no further link)
+        next_page = soup.find('a', string='next page')
+        if not next_page:
+            break
+        url = base_url + next_page['href']
+
+    print('current number of games in LGW {}'.format(len(games)))
+
+    # parse games
+    unique_gameinfo_fields = set()
+    entries = []
+    for game in games:
+        url = base_url + game[0]
+        text = requests.get(url).text
+        soup = BeautifulSoup(text, 'html.parser')
+        # the first h1 of the page is used as name of the game
+        title = soup.h1.string
+        print(title)
+        entry = {'name': title}
+
+        # parse gameinfobox
+        info = soup.find('div', class_='gameinfobox')
+        if not info:
+            print(' no gameinfobox')
+        else:
+            info = info.find_all('tr')
+            # map header text -> value cell, rows without a plain text header are skipped
+            info = [(x.th.string, x.td) for x in info if x.th and x.th.string]
+            info = [x for x in info if x[0] not in ignored_gameinfos]
+            info = dict(info)
+            unique_gameinfo_fields.update(info.keys())
+
+            # consume fields of gameinfobox (every handled key is deleted, so leftovers are detected below)
+            # genre
+            key, idx = key_selection_gameinfobox(('Genre', 'Genres'), info.keys())
+            if key:
+                entry['genre'] = extract_field_content(key, idx, info)
+                del info[key]
+
+            # platforms (only the plural name is handled, hence the fixed index 1)
+            key = 'Platforms'
+            if key in info:
+                platforms = extract_field_content(key, 1, info)
+                # platforms = [x if x != 'Mac' else 'macOS' for x in platforms] # replace Mac with macOS
+                entry['platform'] = platforms
+                del info[key]
+
+            # developer
+            key, idx = key_selection_gameinfobox(('Developer', 'Developers'), info.keys())
+            if key:
+                entry['developer'] = extract_field_content(key, idx, info)
+                del info[key]
+
+            # code license
+            key, idx = key_selection_gameinfobox(('Code license', 'Code licenses'), info.keys())
+            if key:
+                entry['code license'] = extract_field_content(key, idx, info)
+                del info[key]
+
+            # media license (stored under the key 'assets license')
+            key, idx = key_selection_gameinfobox(('Media license', 'Media licenses'), info.keys())
+            if key:
+                entry['assets license'] = extract_field_content(key, idx, info)
+                del info[key]
+
+            # engine
+            key, idx = key_selection_gameinfobox(('Engine', 'Engines'), info.keys())
+            if key:
+                entry['engine'] = extract_field_content(key, idx, info)
+                del info[key]
+
+            # library
+            key, idx = key_selection_gameinfobox(('Library', 'Libraries'), info.keys())
+            if key:
+                entry['library'] = extract_field_content(key, idx, info)
+                del info[key]
+
+            # programming language
+            key, idx = key_selection_gameinfobox(('P. language', 'P. languages'), info.keys())
+            if key:
+                languages = extract_field_content(key, idx, info)
+                languages = [x for x in languages if x != 'HTML5'] # ignore HTML5
+                entry['code language'] = languages
+                del info[key]
+
+            # unconsumed (an unknown field is treated as an error so that no data is dropped silently)
+            if info:
+                print('unconsumed gameinfo keys {}'.format(info.keys()))
+                raise RuntimeError()
+
+        # parse "for available as package in"
+        tables = soup.find_all('table', class_='wikitable')
+        # caption.string is None for captions with nested tags, such captions are skipped instead of raising
+        tables = [
+            table for table in tables
+            if table.caption and table.caption.string and table.caption.string.startswith('Available as package')
+        ]
+        if len(tables) > 0:
+            if len(tables) > 1:
+                raise RuntimeError()
+            table = tables[0]
+            packages = table.find_all('tr')
+            # rows without a linked data cell (e.g. header rows) carry no package url
+            packages = [x.td.a['href'] for x in packages if x.td and x.td.a]
+            entry['linux-packages'] = packages
+
+        # categories
+        categories = soup.find_all('div', id='mw-normal-catlinks')
+        if not categories:
+            print(' no categories')
+            categories = []
+        else:
+            if len(categories) > 1:
+                raise RuntimeError()
+            categories = categories[0]
+            categories = categories.find_all('li')
+            categories = [x.a.string for x in categories]
+            if 'Games' not in categories:
+                print(' "Games" not in categories')
+            else:
+                categories.remove('Games') # should be there
+            # strip games at the end
+            phrase = ' games'
+            categories = [x[:-len(phrase)] if x.endswith(phrase) else x for x in categories]
+            ignored_categories = ['Articles lacking reference', 'Stubs']
+            categories = [x for x in categories if x not in ignored_categories]
+        entry['categories'] = categories
+
+        entries.append(entry)
+
+    unique_gameinfo_fields = sorted(unique_gameinfo_fields)
+    print('unique gameinfo fields: {}'.format(unique_gameinfo_fields))
+
+    # save entries
+    # NOTE(review): os and write_text are not imported explicitly, presumably both come from the utils.utils star import
+    json_path = os.path.join(os.path.dirname(__file__), 'lgw_import.json')
+    text = json.dumps(entries, indent=1)
+    write_text(json_path, text)

-    # read base url
-    base_url = 'https://libregamewiki.org/Category:Games'
-    text = requests.get(base_url).text
-    print(text)