#!/usr/bin/env python3
"""Check that every URL found in a text (e.g. Markdown) file is reachable.

Usage: pass the path of the file to scan as the first command-line argument.
The process exits with status 1 when no file is given or when at least one
link fails validation, and with status 0 otherwise.
"""

import re
import socket
import sys

import httplib2

# URL matcher. The pattern has several capture groups, so ``findall`` yields
# one tuple per match; group 1 (the outermost group) holds the complete URL.
# Written as raw strings so that ``\d``, ``\s``, ``\(`` ... reach the regex
# engine untouched instead of being treated as (invalid) string escapes.
_LINK_PATTERN = re.compile(
    r'((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)'
    r'(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+'
    r'(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'\".,<>?«»“”‘’]))'
)

# Substrings of exception messages that get a dedicated report tag instead of
# the generic "UKN". Extend this table when new recurring failures show up.
# The SSL entry deliberately omits the "(_ssl.c:NNN)" suffix: that number is a
# source line inside the interpreter and differs between Python versions.
_KNOWN_ERRORS = (
    ("[SSL: CERTIFICATE_VERIFY_FAILED]", "SSL"),
    ("Content purported to be compressed with gzip but failed to decompress.", "GZP"),
    ("Unable to find the server at", "SRV"),
)


def parse_links(filename):
    """Return the list of URLs found in the text file *filename*.

    Args:
        filename: Path of the file to scan. It is read as UTF-8.

    Returns:
        A list of URL strings in order of appearance (duplicates are kept).

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Explicit encoding: do not depend on the platform's locale default.
    with open(filename, encoding="utf-8") as fp:
        data = fp.read()
    return [groups[0] for groups in _LINK_PATTERN.findall(data)]


def _error_tag(message):
    """Return the three-letter report tag for an exception message."""
    for needle, tag in _KNOWN_ERRORS:
        if needle in message:
            return tag
    return "UKN"


def validate_links(links):
    """Request every URL in *links* and report the ones that fail.

    A link counts as failed when the final HTTP status is 300 or greater, or
    when the request raises any exception. One ``ERR:...`` line is printed
    per failed link.

    Args:
        links: A sized iterable (e.g. list) of URL strings.

    Returns:
        True if at least one link failed, False otherwise.
    """
    print(f'Validating {len(links)} links...')
    has_error = False
    for link in links:
        client = httplib2.Http(disable_ssl_certificate_validation=True, timeout=25)
        try:
            response, _content = client.request(link, headers={
                # Faking user agent as some hosting services block
                # not-whitelisted UA
                'user-agent': 'Mozilla/5.0'
            })
            code = int(response['status'])
            # Any status of 300 or above is reported as a failure.
            if code >= 300:
                has_error = True
                print(f"ERR:CLT:{code} : {link}")
        except (TimeoutError, socket.timeout):
            # socket.timeout is only an alias of TimeoutError from Python
            # 3.10 on; naming both keeps timeouts out of the generic socket
            # branch below on older interpreters.
            has_error = True
            print(f"ERR:TMO: {link}")
        except socket.error as socket_error:
            has_error = True
            print(f"ERR:SOC: {socket_error} : {link}")
        except Exception as exc:
            # Known messages only get a more specific tag in the report;
            # every exception still marks the run as failed.
            has_error = True
            print(f"ERR:{_error_tag(str(exc))}: {exc} : {link}")
    return has_error


def main(argv=None):
    """Run the link check and return the process exit status.

    Args:
        argv: Argument vector including the program name; defaults to
            ``sys.argv``.

    Returns:
        1 if no file was passed or any link failed, otherwise 0.
    """
    args = sys.argv if argv is None else argv
    if len(args) < 2:
        print("No .md file passed")
        return 1
    return 1 if validate_links(parse_links(args[1])) else 0


if __name__ == "__main__":
    sys.exit(main())