developer import from sourceforge

This commit is contained in:
Trilarion
2021-01-06 16:13:17 +01:00
parent 489adf0f88
commit 32ae77c7da
99 changed files with 2873 additions and 189 deletions

View File

@@ -191,6 +191,13 @@ https://gamejolt.com/ (search there)
https://games.kde.org/ (all of them)
https://games.kde.org/old/kde_arcade.php
https://gdevelop-app.com/
https://github.com/FaronBracy/RogueSharp
https://github.com/jmorton06/Lumos
https://github.com/codenamecpp/carnage3d
https://github.com/zhangdoa/InnocenceEngine
https://github.com/marukrap/RoguelikeDevResources
http://www.gjt.org/ (all there)
https://github.blog/2014-01-06-github-game-off-ii-winners/
https://github.com/00-Evan/shattered-pixel-dungeon
https://github.com/00-Evan/shattered-pixel-dungeon-gdx
https://github.com/acedogblast/Project-Uranium-Godot

code/github_import.py (new file, 28 lines)

@@ -0,0 +1,28 @@
"""
Uses the Github API to learn more about the Github projects.
"""
# Github
urls = [x for x in repos if x.startswith('https://github.com/')]
urls = []
for url in urls:
print(' github repo: {}'.format(url))
github_info = osg_github.retrieve_repo_info(url)
for contributor in github_info['contributors']:
name = contributor.name
dev = developer_info_lookup(name)
in_devs = dev and 'contact' in dev and contributor.login + '@GH' in dev['contact']
in_entry = name in entry_developer
if in_devs and in_entry:
continue # already existing in entry and devs
content += ' {}: {}@GH'.format(name, contributor.login)
if contributor.blog:
content += ' url: {}'.format(contributor.blog)
if not in_devs:
content += ' (not in devs)'
if not in_entry:
content += ' (not in entry)'
content += '\n'
if __name__ == "__main__":

View File

@@ -3,119 +3,8 @@ Checks the entries and tries to detect additional developer content, by retrieving it from the
stored Git repositories.
"""
import os
import sys
import requests
from utils import osg, osg_ui
from bs4 import BeautifulSoup
from utils import constants as c, utils, osg, osg_github
# author names in SF that aren't the author names how we have them
SF_alias_list = {'Erik Johansson (aka feneur)': 'Erik Johansson', 'Itms': 'Nicolas Auvray',
'Wraitii': 'Lancelot de Ferrière', 'Simzer': 'Simon Laszlo', 'armin bajramovic': 'Armin Bajramovic'}
def test():
    # loop over infos
    developers = ''
    try:
        i = 0
        # active = False
        for entry in entries:
            # if entry['Name'] == 'Aleph One':
            #     active = True
            # if not active:
            #     continue

            # for testing purposes
            i += 1
            if i > 40:
                break

            # print
            entry_name = '{} - {}'.format(entry['file'], entry['Name'])
            print(entry_name)
            content = ''
            entry_developer = entry.get('developer', [])

            # parse home
            home = entry['home']

            # sourceforge project site
            prefix = 'https://sourceforge.net/projects/'
            url = [x for x in home if x.startswith(prefix)]
            if len(url) == 1:
                url = url[0]
                print(' sourceforge project site: {}'.format(url))
                url = 'https://sourceforge.net/p/' + url[len(prefix):] + '_members/'
                response = requests.get(url)
                soup = BeautifulSoup(response.text, 'html.parser')
                authors = soup.find('div', id='content_base').find('table').find_all('tr')
                authors = [author.find_all('td') for author in authors]
                authors = [author[1].a['href'] for author in authors if len(author) == 3]
                for author in authors:
                    # sometimes author already contains the full url, sometimes not
                    url = 'https://sourceforge.net' + author if not author.startswith('http') else author
                    response = requests.get(url)
                    url = response.url  # could be different now
                    if 'auth/?return_to' in url:
                        # for some reason authorisation is forbidden
                        author_name = author
                        nickname = author
                    else:
                        soup = BeautifulSoup(response.text, 'html.parser')
                        author_name = soup.h1.get_text()
                        author_name = SF_alias_list.get(author_name, author_name)  # replace by alias if possible
                        nickname = soup.find('dl', class_='personal-data').find('dd').get_text()
                        nickname = nickname.replace('\n', '').strip()
                    dev = developer_info_lookup(author_name)
                    in_devs = dev and 'contact' in dev and nickname + '@SF' in dev['contact']
                    in_entry = author_name in entry_developer
                    if in_devs and in_entry:
                        continue  # already existing in entry and devs
                    content += ' {} : {}@SF'.format(author_name, nickname)
                    if not in_devs:
                        content += ' (not in devs)'
                    if not in_entry:
                        content += ' (not in entry)'
                    content += '\n'

            # parse source repository
            repos = entry.get('code repository', [])

            # Github
            urls = [x for x in repos if x.startswith('https://github.com/')]
            urls = []
            for url in urls:
                print(' github repo: {}'.format(url))
                github_info = osg_github.retrieve_repo_info(url)
                for contributor in github_info['contributors']:
                    name = contributor.name
                    dev = developer_info_lookup(name)
                    in_devs = dev and 'contact' in dev and contributor.login + '@GH' in dev['contact']
                    in_entry = name in entry_developer
                    if in_devs and in_entry:
                        continue  # already existing in entry and devs
                    content += ' {}: {}@GH'.format(name, contributor.login)
                    if contributor.blog:
                        content += ' url: {}'.format(contributor.blog)
                    if not in_devs:
                        content += ' (not in devs)'
                    if not in_entry:
                        content += ' (not in entry)'
                    content += '\n'

            if content:
                developers += '{}\n\n{}\n'.format(entry_name, content)
    except RuntimeError as e:
        raise e
        # pass
    finally:
        # store developer info
        utils.write_text(os.path.join(c.root_path, 'collected_developer_info.txt'), developers)
from utils import osg_ui
from utils import osg
class DevelopersMaintainer:
@@ -202,6 +91,16 @@ class DevelopersMaintainer:
        self.entries = osg.read_entries()
        print('{} entries read'.format(len(self.entries)))

    def special_ops(self):
        # need entries loaded
        if not self.entries:
            print('entries not yet loaded')
            return
        for entry in self.entries:
            for developer in entry.get('Developer', []):
                if developer.comment:
                    print('{:<25} - {:<25} - {}'.format(entry['File'], developer.value, developer.comment))


if __name__ == "__main__":
@@ -214,6 +113,7 @@ if __name__ == "__main__":
        'Check for orphans': m.check_for_orphans,
        'Check for games in developers not listed': m.check_for_missing_developers_in_entries,
        'Update developers from entries': m.update_developers_from_entries,
        'Special': m.special_ops,
        'Read entries': m.read_entries
    }

View File

@@ -859,21 +859,29 @@ class EntriesMaintainer:
            print('entries not yet loaded')
            return
        # combine content keywords
        n = len('content ')
        # cvs without any git
        for entry in self.entries:
            keywords = entry['Keyword']
            content = [keyword for keyword in keywords if keyword.startswith('content')]
            if len(content) > 1:
                # remove from keywords
                keywords = [keyword for keyword in keywords if keyword not in content]
                # remove prefix
                content = [str(keyword)[n:].strip() for keyword in content]
                # join with +
                content = 'content {}'.format(' + '.join(content))
                keywords.append(osg_parse.ValueWithComment(content))
                entry['Keyword'] = keywords
                print('fixed "{}"'.format(entry['File']))
            repos = entry['Code repository']
            cvs = [repo for repo in repos if 'cvs' in repo]
            git = [repo for repo in repos if 'git' in repo]
            if len(cvs) > 0 and len(git) == 0:
                print('Entry "{}" with repos: {}'.format(entry['File'], repos))
        # # combine content keywords
        # n = len('content ')
        # for entry in self.entries:
        #     keywords = entry['Keyword']
        #     content = [keyword for keyword in keywords if keyword.startswith('content')]
        #     if len(content) > 1:
        #         # remove from keywords
        #         keywords = [keyword for keyword in keywords if keyword not in content]
        #         # remove prefix
        #         content = [str(keyword)[n:].strip() for keyword in content]
        #         # join with +
        #         content = 'content {}'.format(' + '.join(content))
        #         keywords.append(osg_parse.ValueWithComment(content))
        #         entry['Keyword'] = keywords
        #         print('fixed "{}"'.format(entry['File']))
        print('special ops finished')

code/sourceforge_import.py (new file, 152 lines)

@@ -0,0 +1,152 @@
"""
Scrapes Sourceforge project sites and adds (mostly developer) information to our database.
""" # TODO sourceforge sites that are not existing anymore but we have an archive link, also scrape
import os
import json
import requests
from bs4 import BeautifulSoup
from utils import constants as c, utils, osg, osg_parse
sf_entries_file = os.path.join(c.code_path, 'sourceforge_entries.txt')
prefix = 'https://sourceforge.net/projects/'
# author names in SF that aren't the author names how we have them
SF_alias_list = {'Erik Johansson (aka feneur)': 'Erik Johansson', 'Itms': 'Nicolas Auvray', 'baris yuksel': 'Baris Yuksel',
'Wraitii': 'Lancelot de Ferrière', 'Simzer': 'Simon Laszlo', 'armin bajramovic': 'Armin Bajramovic',
'bleu tailfly': 'bleutailfly', 'dlh': 'DLH', 'Bjorn Hansen': 'Bjørn Hansen'}
SF_ignore_list = ('', 'Arianne Integration Bot')
def collect_sourceforge_entries():
"""
Reads the entries of the database and collects all entries with sourceforge as project site
"""
# read entries
entries = osg.read_entries()
print('{} entries read'.format(len(entries)))
# loop over entries
files = []
for entry in entries:
urls = [x for x in entry['Home'] if x.startswith(prefix)]
if urls:
files.append(entry['File'])
# write to file
print('{} entries with sourceforge projects'.format(len(files)))
utils.write_text(sf_entries_file, json.dumps(files, indent=1))
def sourceforge_import():
"""
:return:
"""
files = json.loads(utils.read_text(sf_entries_file))
all_developers = osg.read_developers()
print(' {} developers read'.format(len(all_developers)))
all_developers_changed = False
# all exceptions that happen will be eaten (but will end the execution)
try:
# loop over each entry
for index, file in enumerate(files):
print(' process {}'.format(file))
# read entry
entry = osg.read_entry(file)
developers = entry.get('Developer', [])
urls = [x.value for x in entry['Home'] if x.startswith('https://sourceforge.net/projects/')]
entry_changed = False
for url in urls:
print(' sf project {}'.format(url))
if not url.endswith('/'):
print('error: sf project does not end with slash')
url += '/'
# members
url_members = 'https://sourceforge.net/p/' + url[len(prefix):] + '_members/'
response = requests.get(url_members)
if response.status_code != 200:
raise RuntimeError('url {} not accessible'.format(url_members))
soup = BeautifulSoup(response.text, 'html.parser')
authors = soup.find('div', id='content_base').find('table').find_all('tr')
authors = [author.find_all('td') for author in authors]
authors = [author[1].a['href'] for author in authors if len(author) == 3]
for author in authors:
# sometimes author already contains the full url, sometimes not
url_author = 'https://sourceforge.net' + author if not author.startswith('http') else author
response = requests.get(url_author)
url_author = response.url # could be different now
if 'auth/?return_to' in url_author:
# for some reason authorisation is forbidden or page was not available (happens for example for /u/kantaros)
author_name = author[3:-1]
nickname = author_name
else:
soup = BeautifulSoup(response.text, 'html.parser')
author_name = soup.h1.get_text()
author_name = SF_alias_list.get(author_name, author_name) # replace by alias if possible
nickname = soup.find('dl', class_='personal-data').find('dd').get_text()
nickname = nickname.replace('\n', '').strip()
nickname += '@SF' # our indication of the platform to search for
if author_name in SF_ignore_list:
continue
# look author up in entry developers
if author_name not in developers:
print(' dev "{}" added to entry {}'.format(author_name, file))
entry['Developer'] = entry.get('Developer', []) + [osg_parse.ValueWithComment(author_name)]
entry_changed = True
developers = entry.get('Developer', [])
# look author and SF nickname up in developers data base
if author_name in all_developers:
dev = all_developers[author_name]
if not nickname in dev.get('Contact', []):
print(' existing dev "{}" added nickname ({}) to developer database'.format(author_name, nickname))
# check that name has not already @SF contact
if any(x.endswith('@SF') for x in dev.get('Contact', [])):
print('warning: already SF contact')
all_developers[author_name]['Contact'] = dev.get('Contact', []) + [nickname]
all_developers_changed = True
else:
print(' dev "{}" ({}) added to developer database'.format(author_name, nickname))
all_developers[author_name] = {'Name': author_name, 'Contact': nickname, 'Games': [entry['Title']]}
all_developers_changed = True
if entry_changed:
# save entry
osg.write_entry(entry)
print(' entry updated')
except:
raise
finally:
# shorten file list
utils.write_text(sf_entries_file, json.dumps(files[index:], indent=1))
# save entry
osg.write_entry(entry)
print(' entry updated')
# maybe save all developers
if all_developers_changed:
# save all developers
osg.write_developers(all_developers)
print('developers database updated')
if __name__ == "__main__":
# collect entries
# collect_sourceforge_entries()
# import information from sf
sourceforge_import()
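
The import is deliberately resumable: the finally block rewrites sourceforge_entries.txt with the files that have not been completed yet, so an aborted run can simply be started again. A rough illustration of that behaviour (a hypothetical driver, not part of the commit; the entry file names are made-up examples and it is assumed to run from the code directory):

# Hypothetical illustration of resuming an aborted run
import json
from utils import utils
from sourceforge_import import collect_sourceforge_entries, sourceforge_import, sf_entries_file

collect_sourceforge_entries()   # writes e.g. ["0ad.md", "warzone_2100.md", ...]
try:
    sourceforge_import()        # may stop midway, e.g. when a members page is not reachable
except Exception:
    remaining = json.loads(utils.read_text(sf_entries_file))
    print('{} entries left, rerun sourceforge_import() to continue'.format(len(remaining)))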

View File

@@ -260,6 +260,35 @@ def read_entries():
    return entries


def read_entry(file):
    """
    Reads a single entry
    :param file: the entry file (without path)
    :return: the entry
    """
    # setup parser and transformer
    grammar_file = os.path.join(c.code_path, 'grammar_entries.lark')
    grammar = utils.read_text(grammar_file)
    parse = osg_parse.create(grammar, osg_parse.EntryTransformer)

    # read entry file
    content = utils.read_text(os.path.join(c.entries_path, file))
    if not content.endswith('\n'):
        content += '\n'

    # parse and transform entry content
    try:
        entry = parse(content)
        entry = [('File', file),] + entry  # add file information to the beginning
        entry = check_and_process_entry(entry)
    except Exception as e:
        print('{} - {}'.format(file, e))
        raise RuntimeError(e)

    return entry


def check_and_process_entry(entry):
    message = ''
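
osg_parse.create itself is not shown in this commit. Assuming it is a thin wrapper around lark (which the grammar file and the EntryTransformer suggest), it could look roughly like the sketch below; this is an assumption, not necessarily the project's actual implementation:

# Hypothetical sketch of osg_parse.create built on lark
import lark

def create(grammar, transformer_class):
    # compile the grammar once and return a callable that parses text and transforms the tree
    parser = lark.Lark(grammar)

    def parse(text):
        tree = parser.parse(text)
        return transformer_class().transform(tree)

    return parse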

View File

@@ -21,11 +21,11 @@ class ListingTransformer(lark.Transformer):
    def property(self, x):
        """
        The key of a property will be converted to lower case and the value part is the second part
        Key is first part, values are following.
        :param x:
        :return:
        """
        return x[0], x[1:]
        return x[0].value, x[1:]

    def name(self, x):
        """