finished checking external links

This commit is contained in:
Trilarion
2020-08-21 08:39:20 +02:00
parent cbb621e41c
commit e8fff46e2c
49 changed files with 125 additions and 117 deletions

View File

@ -3,7 +3,6 @@
"git://git.blender.org/blender.git",
"git://git.colm.net/ragel.git",
"git://opensimulator.org/git/opensim",
"http://git.artsoft.org/rocksndiamonds.git",
"http://git.pond.sub.org/empserver",
"https://anongit.freedesktop.org/git/pkg-config.git",
"https://anongit.kde.org/ksudoku.git",
@ -16,6 +15,7 @@
"https://bitbucket.org/ecwolf/ecwolf.git",
"https://bitbucket.org/fade0ff/lemmini.git",
"https://bitbucket.org/rbv/ohrrpgce-svn.git",
"https://git.artsoft.org/rocksndiamonds.git",
"https://git.code.sf.net/p/allegator/alex4",
"https://git.code.sf.net/p/arianne/marauroa",
"https://git.code.sf.net/p/arianne/stendhal",
@ -702,6 +702,7 @@
"https://github.com/lksj/einstein-puzzle.git",
"https://github.com/llopisdon/monsters_and_mushrooms.git",
"https://github.com/lo-th/3d.city.git",
"https://github.com/love2d/love.git",
"https://github.com/lua/lua.git",
"https://github.com/mackers/xultris.git",
"https://github.com/madler/zlib.git",
@ -1344,7 +1345,6 @@
"https://bitbucket.org/gopostal/postal-1-open-source",
"https://bitbucket.org/mstrobel/supremacy/src",
"https://bitbucket.org/mzeilfelder/trunk_hc1",
"https://bitbucket.org/rude/love",
"https://bitbucket.org/ryzom/ryzomcore",
"https://bitbucket.org/sumwars/sumwars-code",
"https://bitbucket.org/thesheep/fujo/src"

View File

@ -141,6 +141,7 @@ https://arcade.academy/
https://archive.codeplex.com/?p=turnota
https://archive.codeplex.com/?p=voxeliq
https://archive.org/details/Gna_code_hosting (all of them)
https://awesomeopensource.com/categories/games
https://blenderartists.org/t/devils-pinball/552785
https://blends.debian.org/games/tasks/
https://blends.debian.org/games/tasks/racing

View File

@ -138,8 +138,6 @@ def check_validity_external_links():
from time to time.
"""
# TODO Gitorious works in principle but only without SSL verify (requests probably can do that)
# regex for finding urls (can be in <> or in ]() or after a whitespace)
regex = re.compile(r"[\s\n]<(http.+?)>|\]\((http.+?)\)|[\s\n](http[^\s\n,]+?)[\s\n\)]")
@ -147,9 +145,11 @@ def check_validity_external_links():
ignored_urls = ('https://git.tukaani.org/xz.git', 'https://git.code.sf.net/', 'http://hg.hedgewars.org/hedgewars/', 'https://git.xiph.org/vorbis.git', 'http://svn.uktrainsim.com/svn/openrails')
# some do redirect, but we nevertheless want the original URL in the database
redirect_okay = ('https://octaforge.org/', 'https://svn.openttd.org/')
redirect_okay = ('https://octaforge.org/', 'https://svn.openttd.org/', 'https://godotengine.org/download')
# extract all links from entries
import urllib3
urllib3.disable_warnings() # otherwise we cannot verify those with SSL errors without getting warnings
urls = {}
for entry, _, content in osg.entry_iterator():
# apply regex
@ -158,55 +158,66 @@ def check_validity_external_links():
for match in matches:
for url in match:
if url and not any((url.startswith(x) for x in ignored_urls)):
# github and gitlab git URLs are shortened to not contain .git
if any((url.startswith(x) for x in ('https://github.com/', 'https://gitlab.com/', 'https://salsa.debian.org/', 'https://src.fedoraproject.org/', 'https://gitlab.gnome.org/GNOME/'))) and url.endswith('.git'):
url = url[:-4]
if (url.startswith('https://svn.code.sf.net/p/') or url.startswith('http://svn.code.sf.net/p/')) and url.endswith('code'):
url = url + '/'
if url.startswith('https://bitbucket.org/') and url.endswith('.git'):
url = url[:-4] + '/commits/'
if url.startswith('https://svn.code.sf.net/p/') or url.endswith('.cvs.sourceforge.net'):
url = 'http' + url[5:]
if url.startswith('https://git.savannah.gnu.org/git/') or url.startswith('https://git.savannah.nongnu.org/git/') or url.startswith('http://git.artsoft.org/'):
url = url + '/'
if url.startswith('https://anongit.freedesktop.org/git'):
url = url + '/'
if url.startswith('http://cvs.savannah.nongnu.org:/sources/'):
url = 'http://cvs.savannah.nongnu.org/viewvc/' + url[40:]
if url.startswith('http://cvs.savannah.gnu.org:/sources/'):
url = 'http://cvs.savannah.gnu.org/viewvc/' + url[37:]
# ignore bzr.sourceforge, no web address found
if 'bzr.sourceforge.net/bzrroot/' in url:
continue
if url.endswith('.git'):
# add "/" at the end
if any((url.startswith(x) for x in ('https://anongit.freedesktop.org/git', 'https://git.savannah.gnu.org/git/', 'https://git.savannah.nongnu.org/git/', 'https://git.artsoft.org/'))):
url += '/'
if url.startswith('https://bitbucket.org/') and url.endswith('.git'):
url = url[:-4] + '/commits/'
if url.startswith('https://svn.code.sf.net/p/'):
url = 'http' + url[5:] + '/'
if url.startswith('http://cvs.savannah.nongnu.org:/sources/'):
url = 'http://cvs.savannah.nongnu.org/viewvc/' + url[40:] + '/'
if url.startswith('http://cvs.savannah.gnu.org:/sources/'):
url = 'http://cvs.savannah.gnu.org/viewvc/' + url[37:] + '/'
# generally ".git" at the end is not working well, except sometimes
if url.endswith('.git') and not any((url.startswith(x) for x in ('https://repo.or.cz', 'https://git.tuxfamily.org/fanwor/fanwor'))):
url = url[:-4]
if url in urls:
urls[url].add(entry)
else:
urls[url] = {entry}
print('found {} unique links'.format(len(urls)))
print("start checking external links (can take a while)")
print('found {} unique links'.format(len(urls)))
print("start checking external links (can take a while)")
# now iterate over all urls
for index, url in enumerate(urls.keys()):
for url, names in urls.items():
names = list(names) # was a set
if len(names) == 1:
names = names[0]
try:
verify = True
# some have an expired certificate but otherwise still work
if any((url.startswith(x) for x in ('https://perso.b2b2c.ca/~sarrazip/dev/', 'https://dreerally.com/', 'https://henlin.net/', 'https://www.megamek.org/', 'https://pixeldoctrine.com/', 'https://gitorious.org/'))):
if any((url.startswith(x) for x in ('https://perso.b2b2c.ca/~sarrazip/dev/', 'https://dreerally.com/', 'https://henlin.net/', 'https://www.megamek.org/', 'https://pixeldoctrine.com/', 'https://gitorious.org/', 'https://www.opmon-game.ga/'))):
verify = False
r = requests.head(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64)'}, timeout=10, allow_redirects=True, verify=verify)
r = requests.head(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64)'}, timeout=20, allow_redirects=True, verify=verify)
if r.status_code == 405: # head method not supported, try get
r = requests.get(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64)'}, timeout=20, allow_redirects=True, verify=verify)
# check for bad status
if r.status_code != requests.codes.ok:
print('{}: URL {} in entry {} has status {}'.format(index, url, urls[url], r.status_code))
print('{}: {} - {}'.format(names, url, r.status_code))
# check for redirect
if r.history and url not in redirect_okay:
# only / added or http->https
print('{}: URL {} in entry {} was redirected to {}'.format(index, url, urls[url], r.url))
# only / added or http->https sometimes
redirected_url = r.url
if redirected_url == url + '/':
output = '{}: {} -> {} - redirect "/" at end '
elif redirected_url == 'https' + url[4:]:
output = '{}: {} -> {} - redirect "https" at start'
else:
output = '{}: {} -> {} - redirect '
print(output.format(names, url, redirected_url))
except Exception as e:
print('{}: URL {} in entry {} gave error {}'.format(index, url, urls[url], e))
# print regular updates
if index > 0 and index % 100 == 0:
print('{} / {}'.format(index, len(urls)))
error_name = type(e).__name__
if error_name == 'SSLError' and url.startswith('https://gitorious.org/'):
continue # even though verify is False, these errors still get through
print('{}: {} - exception {}'.format(names, url, error_name))
def check_template_leftovers():
@ -1030,7 +1041,7 @@ if __name__ == "__main__":
export_git_code_repositories_json()
# check external links (only rarely)
# check_validity_external_links()
check_validity_external_links()
# sort rejected games list file
sort_text_file(os.path.join(c.root_path, 'code', 'rejected.txt'), 'rejected games list')