additions from backlog

This commit is contained in:
Trilarion
2019-10-04 15:18:09 +02:00
parent a4529af611
commit 99b3eb5e74
38 changed files with 459 additions and 194 deletions

View File

@ -5,11 +5,14 @@ Also parse rejected games (https://libregamewiki.org/Libregamewiki:Rejected_game
Unique left column names in the game info boxes:
['Code license', 'Code licenses', 'Developer', 'Developers', 'Engine', 'Engines', 'Genre', 'Genres', 'Libraries', 'Library', 'Media license', 'Media licenses', 'P. language', 'P. languages', 'Platforms']
TODO there are games on LGW which are not part of the Games category but part of XXX-Games sub-categories
"""
import os
import requests
import json
import re
from bs4 import BeautifulSoup
from utils import constants, utils, osg
@ -210,6 +213,39 @@ def lower_case_content(entries, field):
entries[index] = entry
return entries
def remove_parenthized_content(entries, fields):
    """
    Removes every "(...)" group from the values of the given fields.

    For each entry (a dict) that contains one of the fields, the value is
    turned into a list (a single value becomes a one-element list), every
    parenthesized group is deleted from each item, the items are stripped of
    surrounding whitespace and duplicates are dropped.

    :param entries: sequence of dicts; the dicts are modified in place
    :param fields: a single field name or a tuple of field names
    :return: the same entries object
    """
    if not isinstance(fields, tuple):
        fields = (fields, )
    # compiled once, used for every item of every entry
    parenthesized = re.compile(r'\([^)]*\)')
    for entry in entries:
        for field in fields:
            if field in entry:
                content = entry[field]
                if not isinstance(content, list):
                    content = [content]
                content = [parenthesized.sub('', c).strip() for c in content]  # remove parentheses content
                # remove duplicates while keeping the order of first occurrence
                # (a set would give a different order from run to run)
                entry[field] = list(dict.fromkeys(content))
    return entries
def ignore_nonnumbers(entries, fields):
    """
    Keeps only those values of the given fields that are plain decimal numbers.

    For each entry (a dict) that contains one of the fields, the value is
    turned into a list (a single value becomes a one-element list) and every
    item that does not consist solely of decimal digits is dropped. The
    result can be an empty list.

    :param entries: sequence of dicts; the dicts are modified in place
    :param fields: a single field name or a tuple of field names
    :return: the same entries object
    """
    if not isinstance(fields, tuple):
        fields = (fields, )
    for entry in entries:
        for field in fields:
            if field in entry:
                content = entry[field]
                if not isinstance(content, list):
                    content = [content]
                # isdecimal() instead of isdigit(): isdigit() also accepts characters
                # like superscript digits, which are not valid base-10 numbers
                entry[field] = [x for x in content if x.isdecimal()]
    return entries
def clean_lgw_content():
# paths
@ -257,6 +293,7 @@ def clean_lgw_content():
print('mandatory lgw fields: {}'.format(sorted(list(mandatory_fields))))
# content replacements
entries = remove_parenthized_content(entries, ('assets license', 'code language', 'code license', 'engine', 'genre', 'last active', 'library'))
entries = remove_prefix_suffix(entries, ('code license', 'assets license'), ('"', 'GNU', ), ('"', '[3]', '[2]', '[1]', 'only'))
entries = replace_content(entries, ('code license', 'assets license'), 'GPL', ('General Public License', ))
entries = replace_content(entries, ('code license', 'assets license'), 'GPLv2', ('GPL v2', 'GPL version 2.0', 'GPL 2.0'))
@ -267,6 +304,8 @@ def clean_lgw_content():
entries = replace_content(entries, ('code license', 'assets license'), 'zlib', ('zlib/libpng license', ))
entries = replace_content(entries, ('code license', 'assets license'), 'BSD', ('Original BSD License', ))
entries = replace_content(entries, ('code license', 'assets license'), 'CC-BY-SA-3.0', ('Creative Commons Attribution-ShareAlike 3.0 Unported License', 'CC-BY-SA 3.0', 'CC BY-SA 3.0'))
entries = replace_content(entries, ('code license', 'assets license'), 'CC-BY-SA', ('CC BY-SA',))
entries = replace_content(entries, ('code license', 'assets license'), 'MIT', ('MIT License',))
entries = replace_content(entries, 'platform', 'macOS', ('Mac', ))
entries = remove_prefix_suffix(entries, 'code language', (), ('[3]', '[2]', '[1]'))
entries = ignore_content(entries, 'code language', ('HTML5', 'HTML', 'English', 'XML', 'WML'))
@ -282,6 +321,9 @@ def clean_lgw_content():
entries = replace_content(entries, 'library', 'pygame', ('Pygame', ))
entries = replace_content(entries, 'library', 'Qt', ('QT', ))
entries = ignore_content(entries, 'library', ('C++', 'Lua', 'Mozilla Firefox'))
entries = ignore_nonnumbers(entries, 'last active')
entries = ignore_content(entries, 'last active', ('2019', ))
entries = ignore_content(entries, 'platform', ('DOS', ))
# list for every unique field
# fields = sorted(list(unique_fields))