#!/usr/bin/env python3
"""Check a text/Markdown file for duplicated and unreachable links.

Usage: pass the path of the file to scan as the first command-line argument.
The process exits with status 1 when no file is given, when duplicated links
are found, or when any link fails validation.
"""

import re
import socket
import sys

import httplib2

# Links that are allowed to appear more than once in the scanned file.
ignored_links = [
    'https://github.com/public-apis/public-apis/actions?query=workflow%3A%22Run+tests%22',
    'https://github.com/public-apis/public-apis/workflows/Validate%20links/badge.svg?branch=master',
    'https://github.com/public-apis/public-apis/actions?query=workflow%3A%22Validate+links%22',
    'https://github.com/davemachado/public-api',
]

# URL matcher. Group 1 (the outermost group) is the whole URL; the inner
# groups only exist to balance parentheses inside the URL.
# Kept as a raw string so sequences such as \d and \s reach the regex engine
# untouched instead of being treated as (invalid) string escapes.
_URL_PATTERN = re.compile(
    r'((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'\".,<>?«»“”‘’]))'
)

# Substrings of exception messages mapped to the label printed for them.
# Used only to pick the log label: every entry is still reported as an error.
# Extend this table when other recurring failure messages need their own label.
_KNOWN_FAILURES = (
    # The original marker ended with "(_ssl.c:852)", a source line number that
    # differs between Python builds, so only the stable prefix is matched.
    ("[SSL: CERTIFICATE_VERIFY_FAILED]", "SSL"),
    ("Content purported to be compressed with gzip but failed to decompress.", "GZP"),
    ("Unable to find the server at", "SRV"),
)


def parse_links(filename):
    """Return every URL found in the text file *filename*, in file order.

    Duplicates are kept; use dup_links() to detect them.
    """
    # NOTE(review): assumes the input is UTF-8 (the URL pattern itself contains
    # non-ASCII punctuation); an explicit encoding avoids locale-dependent reads.
    with open(filename, encoding='utf-8') as fp:
        data = fp.read()
    # findall() yields one tuple per match (one item per group); item 0 is the URL.
    return [groups[0] for groups in _URL_PATTERN.findall(data)]


def dup_links(links):
    """Check *links* for duplicates and print the result.

    Trailing slashes are stripped before comparing, and entries listed in
    ``ignored_links`` are skipped. Each duplicated link is reported once, no
    matter how many times it occurs.

    Returns True when at least one duplicate was found, False otherwise.
    """
    print('Checking for duplicated links...')
    counts = {}
    dupes = []

    for link in links:
        link = link.rstrip('/')
        if link in ignored_links:
            continue
        counts[link] = counts.get(link, 0) + 1
        # Record the link on its second sighting only, so a link that occurs
        # three or more times is not listed repeatedly.
        if counts[link] == 2:
            dupes.append(link)

    if not dupes:
        print("No duplicate links")
        return False

    print(f"Found duplicate links: {dupes}")
    return True


def validate_links(links):
    """Request every URL in *links* and print one line per failure.

    A link fails when the final status code is 300 or higher, or when the
    request raises. Failures are printed with an ``ERR:<label>`` prefix.

    Returns True when at least one link failed, False otherwise.
    """
    print(f'Validating {len(links)} links...')
    has_error = False

    for link in links:
        h = httplib2.Http(disable_ssl_certificate_validation=True, timeout=25)
        try:
            response, _content = h.request(link, headers={
                # Faking user agent as some hosting services block not-whitelisted UA
                'user-agent': 'Mozilla/5.0'
            })
            code = int(response['status'])
            # Checking status code errors
            if code >= 300:
                has_error = True
                print(f"ERR:CLT:{code} : {link}")
        except (TimeoutError, socket.timeout):
            # socket.timeout is only an alias of TimeoutError from Python 3.10;
            # naming both keeps timeouts out of the generic socket branch on
            # older interpreters.
            has_error = True
            print(f"ERR:TMO: {link}")
        except socket.error as socketerror:
            has_error = True
            print(f"ERR:SOC: {socketerror} : {link}")
        except Exception as e:
            # Any other failure still counts as an error; the table lookup
            # only chooses a more specific label than "UKN" for the log line.
            has_error = True
            message = str(e)
            label = "UKN"
            for marker, known_label in _KNOWN_FAILURES:
                if marker in message:
                    label = known_label
                    break
            print(f"ERR:{label}: {e} : {link}")

    return has_error


def main(argv):
    """Run the duplicate check, then the live-link check; return the exit code."""
    if len(argv) < 2:
        print("No .md file passed")
        return 1

    links = parse_links(argv[1])
    # Links are only requested when the duplicate check passed.
    has_error = dup_links(links)
    if not has_error:
        has_error = validate_links(links)
    return 1 if has_error else 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))