#!/usr/bin/env python3
"""Check the links of a Markdown file for duplicates and dead targets.

Usage: validate_links.py <file.md>

The script exits with status 1 when no file is passed, when a link is listed
more than once, or when any link fails validation.
"""

import re
import socket
import sys


# URL matcher. Written as a raw string so that regex escapes such as ``\d`` and
# ``\s`` are not interpreted (and warned about) as string escapes.
_URL_PATTERN = re.compile(
    r'((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'\".,<>?«»“”‘’]))'
)

# Heading that marks where the list of links starts.
_INDEX_HEADING = '## Index'


def parse_links(filename):
    """Return the list of URLs found in a text file.

    Only the text from the ``## Index`` heading onwards is scanned. When the
    heading is absent the whole file is scanned instead.

    Args:
        filename: Path of the UTF-8 encoded file to read.

    Returns:
        The URLs in order of appearance, each with trailing ``/`` characters
        removed. Repeated URLs are kept so that they can be reported later.
    """
    with open(filename, mode='r', encoding='utf-8') as fp:
        readme = fp.read()

    index_section = readme.find(_INDEX_HEADING)
    # str.find() returns -1 when the heading is missing; slicing from -1 would
    # keep only the last character and silently drop every link.
    content = readme[index_section:] if index_section != -1 else readme

    # findall() yields one tuple of groups per match; group 0 of the tuple is
    # the outermost group, i.e. the complete URL.
    return [str(groups[0]).rstrip('/') for groups in _URL_PATTERN.findall(content)]


def dup_links(links):
    """Check for duplicated links.

    Args:
        links: Iterable of URL strings.

    Returns:
        True when at least one link occurs more than once, False otherwise.
        Each duplicated link is reported once, however often it repeats.
    """
    print('Checking for duplicated links...')
    counts = {}
    dupes = []

    for link in links:
        counts[link] = counts.get(link, 0) + 1
        # Record the link on its second occurrence only, so that a link
        # repeated three or more times is not listed several times.
        if counts[link] == 2:
            dupes.append(link)

    if not dupes:
        print("No duplicate links")
        return False

    print(f"Found duplicate links: {dupes}")
    return True


def _request_host(link):
    """Return the host part of *link* without a leading ``www.``.

    Raises:
        IndexError: If *link* contains no ``//`` separator.
    """
    host = link.split('//', 1)[1].split('/', 1)[0]
    # Strip the literal "www." prefix only; hosts such as "www2.example.com"
    # must be left untouched.
    if host.startswith('www.'):
        host = host[4:]
    return host


def validate_links(links):
    """Request every link and report the ones that fail.

    A link fails when the response status is 400 or above, or when the
    request raises. Each failure is printed with a short tag (``ERR:CLT``,
    ``ERR:TMO``, ``ERR:SOC``, ``ERR:SSL``, ``ERR:GZP``, ``ERR:SRV`` or
    ``ERR:UKN``).

    Args:
        links: Sequence of URL strings without a trailing ``/``.

    Returns:
        True when at least one link failed, False otherwise.
    """
    # Imported here rather than at module level so that parsing and duplicate
    # detection stay usable when the third-party httplib2 package is absent.
    import httplib2

    print(f'Validating {len(links)} links...')
    has_error = False
    for link in links:
        h = httplib2.Http(disable_ssl_certificate_validation=True, timeout=25)
        try:
            host = _request_host(link)
            resp = h.request(link + "/", headers={
                # Faking user agent as some hosting services block not-whitelisted UA
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36',
                # setting host because Cloudflare returns 403 asking for captcha if host is missing
                'host': host
            })
            code = int(resp[0]['status'])
            if code >= 400:
                has_error = True
                print(f"ERR:CLT:{code} : {link}")
        except (TimeoutError, socket.timeout):
            # socket.timeout is only an alias of TimeoutError from Python 3.10;
            # naming both keeps timeouts tagged as TMO on older versions.
            has_error = True
            print(f"ERR:TMO: {link}")
        except socket.error as socketerror:
            has_error = True
            print(f"ERR:SOC: {socketerror} : {link}")
        except Exception as e:
            has_error = True
            # Tag well-known failure messages so the log line is easier to
            # read. Every case below still counts as an error.
            # The list below should be extended with other exceptions in the future if needed
            message = str(e)
            if "[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:852)" in message:
                print(f"ERR:SSL: {e} : {link}")
            elif "Content purported to be compressed with gzip but failed to decompress." in message:
                print(f"ERR:GZP: {e} : {link}")
            elif "Unable to find the server at" in message:
                print(f"ERR:SRV: {e} : {link}")
            else:
                print(f"ERR:UKN: {e} : {link}")
    return has_error


def main(argv):
    """Run the duplicate check and, if it passes, the link validation.

    Args:
        argv: Command-line arguments; ``argv[1]`` is the file to check.

    Returns:
        The process exit status: 1 on any error, 0 otherwise.
    """
    if len(argv) < 2:
        print("No .md file passed")
        return 1
    links = parse_links(argv[1])
    has_error = dup_links(links)
    # Links are only requested when the list contains no duplicates.
    if not has_error:
        has_error = validate_links(links)
    return 1 if has_error else 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))