#!/usr/bin/env python3 import httplib2 import re import socket import sys def parse_links(filename): """Returns a list of URLs from text file""" with open(filename) as fp: data = fp.read() raw_links = re.findall( '((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'\".,<>?«»“”‘’]))', data) links = [raw_link[0] for raw_link in raw_links] return links def validate_links(links): """Checks each entry in JSON file for live link""" print('Validating {} links...'.format(len(links))) errors = [] for link in links: h = httplib2.Http(disable_ssl_certificate_validation=True, timeout=25) try: resp = h.request(link, headers={'user-agent': 'python-httplib2/0.18.0'}) code = int(resp[0]['status']) # check if status code is a client or server error if code >= 404: errors.append('{}: {}'.format(code, link)) except TimeoutError: errors.append("TMO: " + link) except socket.error as socketerror: errors.append("SOC: {} : {}".format(socketerror, link)) except Exception as e: # Ignore some exceptions which are not actually errors. # The list below should be extended with other exceptions in the future if needed if ((-1 != str(e).find("Content purported to be compressed with gzip but failed to decompress.")) and (-1 != str(e).find("[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:852)"))) : errors.append("ERR: {} : {}".format(e, link)) return errors if __name__ == "__main__": num_args = len(sys.argv) if num_args < 2: print("No .md file passed") sys.exit(1) errors = validate_links(parse_links(sys.argv[1])) if len(errors) > 0: for err in errors: print(err) sys.exit(1)