|
|
|
|
|
#!/usr/bin/env python3 |
|
|
|
|
|
|
|
import httplib2 |
|
|
|
import re |
|
|
|
import socket |
|
|
|
import sys |
|
|
|
|
|
|
|
|
|
|
|
def parse_links(filename):
    """Return the list of URLs found in the '## Index' section of a text file.

    Args:
        filename: Path to the markdown (README) file to scan.

    Returns:
        List of matched URL strings, each with any trailing '/' stripped.
    """
    with open(filename, mode='r', encoding='utf-8') as fp:
        readme = fp.read()
    # Only scan from the '## Index' heading onward.
    # NOTE(review): if the heading is missing, find() returns -1 and the slice
    # keeps only the final character, so effectively nothing is scanned —
    # confirm that silent fallback is intended.
    index_section = readme.find('## Index')
    content = readme[index_section:]

    # Raw string: the pattern contains \d, \s, \-, \( etc., which are invalid
    # escape sequences in a plain string literal (SyntaxWarning on 3.12+).
    raw_links = re.findall(
        r'((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'\".,<>?«»“”‘’]))',
        content)

    # findall yields one tuple per match (a slot per capture group);
    # group 0 is the whole URL.
    links = [
        str(raw_link[0]).rstrip('/') for raw_link in raw_links
    ]

    return links
|
|
|
|
|
|
|
def dup_links(links):
    """Report duplicated links.

    Args:
        links: List of URL strings to check.

    Returns:
        True if at least one link occurs more than once, False otherwise.
    """
    print('Checking for duplicated links...')
    hasError = False
    seen = set()
    dupes = []

    for link in links:
        if link in seen:
            # Record each duplicated link exactly once; the previous version
            # appended it again on every repeat past the first, so a link
            # occurring N times showed up N-1 times in the report.
            if link not in dupes:
                dupes.append(link)
        else:
            seen.add(link)

    if not dupes:
        print("No duplicate links")
    else:
        print(f"Found duplicate links: {dupes}")
        hasError = True
    return hasError
|
|
|
|
|
|
|
def validate_links(links):
    """Check each link for liveness by issuing an HTTP request.

    Args:
        links: Iterable of URL strings (scheme included, no trailing '/').

    Returns:
        True if at least one link failed (HTTP status >= 400, timeout,
        socket error, or other request error), False otherwise. Failures
        are printed, never raised.
    """
    print(f'Validating {len(links)} links...')
    hasError = False
    for link in links:
        # Certificate validation is disabled on purpose: expired/self-signed
        # certs should not count as dead links.
        h = httplib2.Http(disable_ssl_certificate_validation=True, timeout=25)
        try:
            # Host name: strip scheme, then path, then a leading 'www.'.
            # (Previously 'host[:3] == "www"' stripped 4 chars from ANY host
            # starting with 'www' — e.g. 'wwwx.example.com' -> '.example.com'.)
            host = link.split('//', 1)[1].split('/', 1)[0]
            if host.startswith('www.'):
                host = host[4:]

            resp = h.request(link + "/", headers={
                # Faking user agent as some hosting services block not-whitelisted UA
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36',
                # setting host because Cloudflare returns 403 asking for captcha if host is missing
                'host': host
            })
            # httplib2 returns (response, content); the status lives on the
            # response headers object.
            code = int(resp[0]['status'])
            # Checking status code errors
            if (code >= 400):
                hasError = True
                print(f"ERR:CLT:{code} : {link}")
        except TimeoutError:
            hasError = True
            print(f"ERR:TMO: {link}")
        except socket.error as socketerror:
            hasError = True
            print(f"ERR:SOC: {socketerror} : {link}")
        except Exception as e:
            hasError = True
            # Ignore some exceptions which are not actually errors.
            # The list below should be extended with other exceptions in the future if needed
            if (-1 != str(e).find("[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:852)")):
                print(f"ERR:SSL: {e} : {link}")
            elif (-1 != str(e).find("Content purported to be compressed with gzip but failed to decompress.")):
                print(f"ERR:GZP: {e} : {link}")
            elif (-1 != str(e).find("Unable to find the server at")):
                print(f"ERR:SRV: {e} : {link}")
            else:
                print(f"ERR:UKN: {e} : {link}")
    return hasError
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Usage: <script> <README.md> — the markdown file to check.
    if len(sys.argv) < 2:
        print("No .md file passed")
        sys.exit(1)

    links = parse_links(sys.argv[1])

    # Duplicate check first; live-link validation runs only when it passes
    # ('or' short-circuits, matching the original two-step control flow).
    hasError = dup_links(links) or validate_links(links)

    if hasError:
        sys.exit(1)
|
|
|
|
|
|
|