opensourcegames/code/synchronization/synchronize_awesome_lists.py

"""
Synchronizes with awesome lists from
"""

import re
import requests
from utils import osg, osg_rejected

# TODO Probably could fix some of the ignored cases within the awesome lists (or fix the small deviations in structure)
# TODO not all of them are awesome actually

# AWESOME_LIST = 'https://raw.githubusercontent.com/radek-sprta/awesome-game-remakes/master/README.md'
# IGNORED = ('2006rebotted', 'raw(gl)', 'fheroes2', 'FS2OPEN', 'Barbarian', 'Hexen II: Hammer of Thyrion')

AWESOME_LIST = 'https://raw.githubusercontent.com/leereilly/games/master/README.md'
IGNORED = ('Warsow',)

# two different - signs are used sometimes
matcher = re.compile(r'\[(.*)?\]\((.*?)\) [-– ]*(.*)')  # general structure: - [title](link) - description

if __name__ == "__main__":

    # read rejected
    rejected = osg_rejected.read_rejected_file()

    # read awesome list
    print('read {}'.format(AWESOME_LIST))
    r = requests.get(AWESOME_LIST, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64)'}, timeout=20, allow_redirects=True)
    if r.status_code != requests.codes.ok:
        raise RuntimeError('Cannot download awesome list.')
    text = r.text
    text = text.split('\n##')[2:]
    entries = []
    for items in text:
        items = items.split('\n')
        category = items[0].strip()
        items = [item for item in items[1:] if item.startswith('- ') or item.startswith('* ')]
        for item in items:
            # print(item)
            # print(matcher.findall(item))
            matches = matcher.findall(item)[0]  # we know it will be exactly one
            title = matches[0]
            url = matches[1]
            description = matches[2]
            entries.append({'Title': title, 'URL': url, 'Description': description, 'Category': category})
    print('contains {} entries'.format(len(entries)))

    # remove those from the ignored list
    entries = [entry for entry in entries if not any(entry['Title'] == x for x in IGNORED)]

    # remove those that are in our rejected list
    rejected_titles = [x['Title'] for x in rejected]
    entries = [entry for entry in entries if entry['Title'] not in rejected_titles]
    print('after filtering for rejected and ignored entries {}'.format(len(entries)))

    # a bit of statistics about this awesome list
    print('contains {} entries in {} categories'.format(len(entries), len(text)))
    n = [0, 0]
    for entry in entries:
        if entry['URL'].startswith('https://github.com/'):
            n[0] += 1
        else:
            n[1] += 1
    print('{} links to Github, {} links not to Github'.format(*n))

    # read our database
    our_entries = osg.read_entries()
    print('{} entries read (osgl)'.format(len(our_entries)))

    # go through this awesome list entries one by one and compare to our list
    index = 1
    for entry in entries:
        title = entry['Title']
        url = entry['URL']
        # go through our entries
        similar_entries = []
        for our_entry in our_entries:
            title_equal = title == our_entry['Title']
            url_present = any(url in x for x in our_entry['Home']) or any(url in x for x in our_entry.get('Code repository', []))
            if title_equal or url_present:
                similar_entries.append(our_entry)
        if not similar_entries:
            print('Unknown entry ({}) "{}" {} - {} - {}'.format(index, entry['Title'], entry['URL'], entry['Category'], entry['Description']))
            index += 1