synchronized awesome list (https://github.com/radek-sprta/awesome-game-remakes) and SourceForge member pages, plus some inspirational Wikipedia links

This commit is contained in:
Trilarion
2021-09-28 13:42:58 +02:00
parent 0daac9c31e
commit e7ea8fc6ab
67 changed files with 607 additions and 129 deletions

View File

@@ -1,6 +1,8 @@
"""
Scrapes Sourceforge project sites and adds (mostly developer) information to our database.
""" # TODO sourceforge sites that are not existing anymore but we have an archive link, also scrape
"""
# TODO sourceforge sites that are not existing anymore but we have an archive link, also scrape
import os
import json
@@ -12,17 +14,22 @@ sf_entries_file = os.path.join(c.code_path, 'sourceforge_entries.txt')
prefix = 'https://sourceforge.net/projects/'
# author names on SF that differ from the author names as we have them
SF_alias_list = {'Erik Johansson (aka feneur)': 'Erik Johansson', 'Itms': 'Nicolas Auvray', 'baris yuksel': 'Baris Yuksel',
SF_alias_list = {'Erik Johansson (aka feneur)': 'Erik Johansson', 'Itms': 'Nicolas Auvray',
'baris yuksel': 'Baris Yuksel',
'Wraitii': 'Lancelot de Ferrière', 'Simzer': 'Simon Laszlo', 'armin bajramovic': 'Armin Bajramovic',
'bleu tailfly': 'bleutailfly', 'dlh': 'DLH', 'Bjorn Hansen': 'Bjørn Hansen', 'Louens Veen': 'Lourens Veen',
'linley_henzell': 'Linley Henzell', 'Patrice DUHAMEL': 'Patrice Duhamel', 'Etienne SOBOLE': 'Etienne Sobole',
'bleu tailfly': 'bleutailfly', 'dlh': 'DLH', 'Bjorn Hansen': 'Bjørn Hansen',
'Louens Veen': 'Lourens Veen',
'linley_henzell': 'Linley Henzell', 'Patrice DUHAMEL': 'Patrice Duhamel',
'Etienne SOBOLE': 'Etienne Sobole',
'L. H. [Lubomír]': 'L. H. Lubomír', 'davidjoffe': 'David Joffe', 'EugeneLoza': 'Eugene Loza',
'Kenneth Gangsto': 'Kenneth Gangstø', 'Lucas GAUTHERON': 'Lucas Gautheron', 'Per I Mathisen': 'Per Inge Mathisen',
'Kenneth Gangsto': 'Kenneth Gangstø', 'Lucas GAUTHERON': 'Lucas Gautheron',
'Per I Mathisen': 'Per Inge Mathisen',
'wrtlprnft': 'Wrzlprnft', 'daniel_santos': 'Daniel Santos', 'Dark_Sylinc': 'darksylinc',
'Don Llopis': 'Don E. Llopis', 'dwachs': 'Dwachs', 'Pierre-Loup Griffais': 'Pierre-Loup A. Griffais',
'Richard Gobeille': 'Richard C. Gobeille', 'timfelgentreff': 'Tim Felgentreff',
'Dr. Martin Brumm': 'Martin Brumm', 'Dr. Wolf-Dieter Beelitz': 'Wolf-Dieter Beelitz'}
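# usage sketch: SF_alias_list.get(author_name, author_name) maps e.g. 'Itms'
# to 'Nicolas Auvray' and leaves names without an alias unchanged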
# authors to be ignored
SF_ignore_list = ('', 'Arianne Integration Bot')
@@ -49,28 +56,32 @@ def collect_sourceforge_entries():
def sourceforge_import():
"""
:return:
Scrapes Sourceforge project sites and adds developer information to the entries
"""
# read entries that have sourceforge projects
files = json.loads(utils.read_text(sf_entries_file))
# read developer information
all_developers = osg.read_developers()
print(' {} developers read'.format(len(all_developers)))
all_developers_changed = False
# all exceptions that happen will be caught (but will still end the execution)
try:
# loop over each entry
# loop over each entry with a sourceforge project
for index, file in enumerate(files):
print(' process {}'.format(file))
print(' process {} ({})'.format(file, index))
# read entry
# read full entry
entry = osg.read_entry(file)
developers = entry.get('Developer', [])
urls = [x.value for x in entry['Home'] if x.startswith('https://sourceforge.net/projects/')]
# do we need to save it again
entry_changed = False
# for all sourceforge project urls in this entry
for url in urls:
print(' sf project {}'.format(url))
@@ -78,8 +89,11 @@ def sourceforge_import():
print('error: sf project does not end with slash')
url += '/'
# members
url_members = 'https://sourceforge.net/p/' + url[len(prefix):] + '_members/'
# read and parse members page
project_name = url[len(prefix):-1]
if 'berlios' in project_name: # berlios projects never have member pages
continue
url_members = 'https://sourceforge.net/p/' + project_name + '/_members/'
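# e.g. a (hypothetical) project url 'https://sourceforge.net/projects/foo/' maps
# to the members page 'https://sourceforge.net/p/foo/_members/'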
response = requests.get(url_members)
if response.status_code != 200:
print('error: url {} not accessible, status {}'.format(url_members, response.status_code))
@@ -88,48 +102,54 @@ def sourceforge_import():
authors = soup.find('div', id='content_base').find('table').find_all('tr')
authors = [author.find_all('td') for author in authors]
authors = [author[1].a['href'] for author in authors if len(author) == 3]
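# i.e. every table row with exactly three cells is a member; the link in the
# second cell is the member's profile path, e.g. '/u/somenick/' (hypothetical)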
# for every author in the list of scraped authors
for author in authors:
# sometimes author already contains the full url, sometimes not
url_author = 'https://sourceforge.net' + author if not author.startswith('http') else author
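# e.g. a (hypothetical) relative path '/u/somenick/' becomes 'https://sourceforge.net/u/somenick/'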
# get the personal author page from sourceforge
response = requests.get(url_author)
if response.status_code != 200 and author not in ('/u/favorito/',):
print('error: url {} not accessible, status {}'.format(url_author, response.status_code))
raise RuntimeError()
url_author = response.url # could be different now
url_author = response.url # could be different now (redirect)
if 'auth/?return_to' in url_author or response.status_code != 200:
# for some reason authorisation is forbidden or page was not available (happens for example for /u/kantaros)
author_name = author[3:-1]
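# e.g. for a (hypothetical) path, '/u/somenick/'[3:-1] yields 'somenick'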
nickname = author_name
else:
# this is the typical case
soup = BeautifulSoup(response.text, 'html.parser')
author_name = soup.h1.get_text()
author_name = soup.h1.get_text().strip() # lately they have a newline at the end, need to strip that
author_name = SF_alias_list.get(author_name, author_name) # replace by alias if possible
nickname = soup.find('dl', class_='personal-data').find('dd').get_text()
nickname = nickname.replace('\n', '').strip()
nickname += '@SF' # our indication of the platform to search for
author_name = author_name.strip() # names can still have white spaces before or after
nickname += '@SF' # our indication of the platform to search for
author_name = author_name.strip() # names could still have white spaces before or after
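# e.g. a (hypothetical) SF nickname 'jdoe' is stored as 'jdoe@SF'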
# some authors we ignore
if author_name in SF_ignore_list:
continue
# look author up in entry developers
# look the author up in the entry's developers field; add if not yet present
if author_name not in developers:
print(' dev "{}" added to entry {}'.format(author_name, file))
entry['Developer'] = entry.get('Developer', []) + [osg_parse.ValueWithComment(author_name)]
entry_changed = True
developers = entry.get('Developer', [])
developers = entry.get('Developer', []) # update developers
# look the author and SF nickname up in the developers database
if author_name in all_developers:
# get existing developer information
dev = all_developers[author_name]
if not nickname in dev.get('Contact', []):
if nickname not in dev.get('Contact', []):
print(' existing dev "{}" added nickname ({}) to developer database'.format(author_name, nickname))
# check that the name does not already have another @SF contact
if any(x.endswith('@SF') for x in dev.get('Contact', [])):
print('warning: already SF contact')
print('warning: a different SF contact already exists')
all_developers[author_name]['Contact'] = dev.get('Contact', []) + [nickname]
all_developers_changed = True
else:
# new developer entry in the developers database
print(' dev "{}" ({}) added to developer database'.format(author_name, nickname))
all_developers[author_name] = {'Name': author_name, 'Contact': [nickname], 'Games': [entry['Title']]}
all_developers_changed = True
@@ -156,9 +176,8 @@ def sourceforge_import():
if __name__ == "__main__":
# collect entries
collect_sourceforge_entries()
# collect_sourceforge_entries()
# import information from sf
# sourceforge_import()
sourceforge_import()

View File

@@ -0,0 +1,76 @@
"""
Synchronizes with awesome lists from GitHub (currently https://github.com/radek-sprta/awesome-game-remakes).
"""
import re
import requests
from utils import osg, osg_rejected
AWESOME_LIST = 'https://raw.githubusercontent.com/radek-sprta/awesome-game-remakes/master/README.md'
# ignored entries; these could probably be fixed upstream in the awesome list
IGNORED = ('2006rebotted', 'raw(gl)', 'fheroes2', 'FS2OPEN', 'Barbarian', 'Hexen II: Hammer of Thyrion')
matcher = re.compile(r'\[(.*)?\]\((.*?)\) - (.*)') # general structure: - [title](link) - description
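# sketch of the expected match, with a hypothetical list item:
# matcher.findall('- [OpenFoo](https://example.org) - A remake of Foo.')
# returns [('OpenFoo', 'https://example.org', 'A remake of Foo.')]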
if __name__ == "__main__":
# read rejected
rejected = osg_rejected.read_rejected_file()
# read awesome list
print('read {}'.format(AWESOME_LIST))
r = requests.get(AWESOME_LIST, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64)'}, timeout=20, allow_redirects=True)
if r.status_code != requests.codes.ok:
raise RuntimeError('Cannot download awesome list.')
text = r.text
text = text.split('\n##')[2:]
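# splitting on '\n##' yields one chunk per section heading of the README;
# presumably the first two chunks (preamble and table of contents) are skipped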
entries = []
for items in text:
items = items.split('\n')
category = items[0].strip()
items = [item for item in items[1:] if item.startswith('-')]
for item in items:
matches = matcher.findall(item)[0] # we know it will be exactly one
title = matches[0]
url = matches[1]
description = matches[2]
entries.append({'Title': title, 'URL': url, 'Description': description, 'Category': category})
# remove those that are on the ignored list
entries = [entry for entry in entries if not any(entry['Title'] == x for x in IGNORED)]
# remove those that are in our rejected list
rejected_titles = [x['Title'] for x in rejected]
entries = [entry for entry in entries if entry['Title'] not in rejected_titles]
print('{} entries left after filtering out rejected ones'.format(len(entries)))
# a bit of statistics about this awesome list
print('contains {} entries in {} categories'.format(len(entries), len(text)))
n = [0, 0]
for entry in entries:
if entry['URL'].startswith('https://github.com/'):
n[0] += 1
else:
n[1] += 1
print('{} links to GitHub, {} links to other sites'.format(*n))
# read our database
our_entries = osg.read_entries()
print('{} entries read (osgl)'.format(len(our_entries)))
# go through this awesome list entries one by one and compare to our list
for entry in entries:
title = entry['Title']
url = entry['URL']
# go through our entries
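# an awesome-list entry counts as similar if the titles are equal or its url
# appears in one of our 'Home' or 'Code repository' urls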
similar_entries = []
for our_entry in our_entries:
title_equal = title == our_entry['Title']
url_present = any(url in x for x in our_entry['Home']) or any(url in x for x in our_entry.get('Code repository', []))
if title_equal or url_present:
similar_entries.append(our_entry)
if not similar_entries:
print('Unknown entry "{}" {} - {} - {}'.format(entry['Title'], entry['URL'], entry['Category'], entry['Description']))