opensourcegames/code/maintenance_collect_developer_infos.py

"""
Checks the entries and tries to detect additional developer content, by retrieving websites or logging information from
stored Git repositories.
"""
import os
import requests
from bs4 import BeautifulSoup
from utils import constants as c, utils, osg, osg_github


def developer_info_lookup(name):
    """Return the developer entry with the given name, or None if it is not known."""
    for dev in developer_info:
        if name == dev['name']:
            return dev
    return None
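
# Note: developer_info_lookup scans the whole list on every call. If the number of developers
# grows large, a one-time index would be faster (sketch only, not used below):
#
#   developer_by_name = {dev['name']: dev for dev in developer_info}
#   dev = developer_by_name.get(author_name)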

# SourceForge author names that differ from the names we store for the same person
SF_alias_list = {'Erik Johansson (aka feneur)': 'Erik Johansson', 'Itms': 'Nicolas Auvray',
                 'Wraitii': 'Lancelot de Ferrière', 'Simzer': 'Simon Laszlo',
                 'armin bajramovic': 'Armin Bajramovic'}


if __name__ == "__main__":

    # read developer info
    developer_info = osg.read_developer_info()
    osg.write_developer_info(developer_info)  # write again just to normalize the formatting

    # assemble info
    entries = osg.assemble_infos()

    # cross-check
    osg.compare_entries_developers(entries, developer_info)

    # loop over infos
    developers = ''
    try:
        i = 0
        # active = False
        for entry in entries:

            # if entry['name'] == 'Aleph One':
            #     active = True
            # if not active:
            #     continue

            # only look at the first 40 entries (for testing purposes)
            i += 1
            if i > 40:
                break

            # print progress
            entry_name = '{} - {}'.format(entry['file'], entry['name'])
            print(entry_name)
            content = ''

            entry_developer = entry.get('developer', [])

            # parse home
            home = entry['home']

            # sourceforge project site
            prefix = 'https://sourceforge.net/projects/'
            url = [x for x in home if x.startswith(prefix)]
            if len(url) == 1:
                url = url[0]
                print(' sourceforge project site: {}'.format(url))
                url = 'https://sourceforge.net/p/' + url[len(prefix):] + '_members/'
                response = requests.get(url)
                soup = BeautifulSoup(response.text, 'html.parser')
                authors = soup.find('div', id='content_base').find('table').find_all('tr')
                authors = [author.find_all('td') for author in authors]
                authors = [author[1].a['href'] for author in authors if len(author) == 3]
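
                # The selectors above assume the members page markup is roughly
                # (inferred from the code itself; SourceForge may change it at any time):
                #   <div id="content_base"><table>
                #     <tr> <td>...</td> <td><a href="/u/<nick>/">Name</a></td> <td>...</td> </tr>
                #   </table></div>
                # Rows that do not have exactly 3 cells (e.g. header rows) are filtered out.
                # Note also that requests.get is called without a timeout; one stalled request
                # would stall the entire run.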

                for author in authors:
                    # sometimes author already contains the full url, sometimes not
                    url = 'https://sourceforge.net' + author if not author.startswith('http') else author
                    response = requests.get(url)
                    url = response.url  # could be different now (redirects)
                    if 'auth/?return_to' in url:
                        # the profile redirects to a login page; fall back to the raw href
                        author_name = author
                        nickname = author
                    else:
                        soup = BeautifulSoup(response.text, 'html.parser')
                        author_name = soup.h1.get_text()
                        author_name = SF_alias_list.get(author_name, author_name)  # replace by alias if known
                        nickname = soup.find('dl', class_='personal-data').find('dd').get_text()
                        nickname = nickname.replace('\n', '').strip()
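
                        # 'personal-data' is assumed to be a definition list on the profile page,
                        # roughly of the form (inferred from the selectors, not from any SF docs):
                        #   <dl class="personal-data"><dt>Username:</dt><dd>\n  nick \n</dd> ... </dl>
                        # which is why the first <dd> is taken and newlines/whitespace are stripped.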
                    dev = developer_info_lookup(author_name)
                    in_devs = dev and 'contact' in dev and nickname + '@SF' in dev['contact']
                    in_entry = author_name in entry_developer
                    if in_devs and in_entry:
                        continue  # already known in both the entry and the developer info
                    content += ' {} : {}@SF'.format(author_name, nickname)
                    if not in_devs:
                        content += ' (not in devs)'
                    if not in_entry:
                        content += ' (not in entry)'
                    content += '\n'

            # parse source repositories
            repos = entry.get('code repository', [])

            # Github
            urls = [x for x in repos if x.startswith('https://github.com/')]
            # urls = []  # this line overwrote urls and disabled the GitHub lookup; kept here commented out
            for url in urls:
                print(' github repo: {}'.format(url))
                github_info = osg_github.retrieve_repo_info(url)
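
                # retrieve_repo_info is assumed to return a dict whose 'contributors' entries are
                # PyGithub-style user objects exposing .name, .login and .blog (inferred from the
                # attribute access below). Since this presumably calls the GitHub API, which
                # rate-limits unauthenticated clients, long runs may need an API token in osg_github.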
                for contributor in github_info['contributors']:
                    name = contributor.name
                    dev = developer_info_lookup(name)
                    in_devs = dev and 'contact' in dev and contributor.login + '@GH' in dev['contact']
                    in_entry = name in entry_developer
                    if in_devs and in_entry:
                        continue  # already known in both the entry and the developer info
                    content += ' {}: {}@GH'.format(name, contributor.login)
                    if contributor.blog:
                        content += ' url: {}'.format(contributor.blog)
                    if not in_devs:
                        content += ' (not in devs)'
                    if not in_entry:
                        content += ' (not in entry)'
                    content += '\n'

            if content:
                developers += '{}\n\n{}\n'.format(entry_name, content)
    except RuntimeError:
        raise
        # pass  # alternative: swallow the error and keep what was collected so far
    finally:
        # store the developer info collected so far
        utils.write_text(os.path.join(c.root_path, 'collected_developer_info.txt'), developers)
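
# The result ends up in collected_developer_info.txt in the repository root. Authors that are
# already present in both the entry and the developer info are skipped above, so the file only
# lists candidates that still need manual merging.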