fixed broken external links, imported some Hg repositories of inactive projects to Git

Trilarion
2019-07-15 13:33:08 +02:00
parent 1672312a0b
commit e673aa8317
51 changed files with 163 additions and 143 deletions
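The commit message mentions importing Mercurial repositories, but the diff below only covers the link checker; the conversion step itself is not recorded here. For reference, a minimal sketch of one common Hg-to-Git import route, assuming the hg-fast-export.sh script from the frej/fast-export project is on PATH (paths and the helper name are illustrative only, not taken from this commit):

import subprocess
from pathlib import Path

def import_hg_to_git(hg_repo: Path, git_repo: Path) -> None:
    """Replay a local Mercurial clone into a fresh Git repository.

    Illustrative only: assumes hg-fast-export.sh (frej/fast-export) is
    installed; the commit does not record which tool was actually used.
    """
    # create and initialise the target Git repository
    subprocess.run(['git', 'init', str(git_repo)], check=True)
    # import the Mercurial history into the new Git repository
    subprocess.run(['hg-fast-export.sh', '-r', str(hg_repo)], cwd=git_repo, check=True)
    # check out a working tree from the imported history
    subprocess.run(['git', 'checkout', 'HEAD'], cwd=git_repo, check=True)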

@@ -131,12 +131,16 @@ def check_validity_external_links():
print("check external links (can take a while)")
# regex for finding urls (can be in <> or in () or a whitespace)
regex = re.compile(r"[\s\n]<(http.+?)>|\]\((http.+?)\)|[\s\n](http[^\s\n,]+)")
# regex for finding urls (can be in <> or in ]() or after a whitespace)
#regex = re.compile(r"[\s\n]<(http.+?)>|\]\((http.+?)\)|[\s\n](http[^\s\n,]+?)[\s\n\)]")
regex = re.compile(r"[\s\n<(](http://.*?)[\s\n>)]")
# count
number_checked_links = 0
# ignore the following urls (they give false positives here)
ignored_urls = ('https://git.tukaani.org/xz.git',)
# iterate over all entries
for _, entry_path, content in entry_iterator():
@@ -150,7 +154,7 @@ def check_validity_external_links():
for url in match:
# if there was something (and not a sourceforge git url)
if url and not url.startswith('https://git.code.sf.net/p/'):
if url and not url.startswith('https://git.code.sf.net/p/') and url not in ignored_urls:
try:
# without a special header, frequent 403 responses occur
req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64)'})
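Putting the changed pieces together, the check now extracts http links with the tightened pattern, skips the known false positives, and probes each link with a browser-like User-Agent. A minimal, self-contained sketch of that flow on made-up sample text (the example.org URLs are placeholders, not entries from the repository):

import re
import urllib.error
import urllib.request

# pattern from the change above: an http URL delimited by whitespace, <> or ()
regex = re.compile(r"[\s\n<(](http://.*?)[\s\n>)]")
# the trailing comma makes this a one-element tuple, so "url not in ignored_urls"
# is a membership test rather than a substring test on a plain string
ignored_urls = ('https://git.tukaani.org/xz.git',)

sample = "home <http://example.org/a> docs [here](http://example.org/b) mirror http://example.org/c \n"
for url in regex.findall(sample):
    if url and not url.startswith('https://git.code.sf.net/p/') and url not in ignored_urls:
        # same workaround as in the diff: a browser-like User-Agent avoids frequent 403 responses
        req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64)'})
        try:
            with urllib.request.urlopen(req, timeout=10) as response:
                print(url, response.status)
        except urllib.error.URLError as error:
            print(url, 'failed:', error)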