Tanmay Goregaonkar 1 year ago
committed by GitHub
parent
commit
fe6abcc41f
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 102 additions and 0 deletions
  1. +102
    -0
      build/validate_links.py

+ 102
- 0
build/validate_links.py View File

@@ -0,0 +1,102 @@
#!/usr/bin/env python3

import httplib2
import re
import socket
import sys


def parse_links(filename):
    """Return the URLs found in the index part of a text file.

    The file is read as UTF-8. Only the text from the first ``## Index``
    heading onward is scanned; when that heading is absent the whole file
    is scanned instead.

    Args:
        filename: Path of the text (Markdown) file to scan.

    Returns:
        A list of URL strings in order of appearance, each with trailing
        ``/`` characters stripped. Repeated URLs are kept.
    """
    with open(filename, mode='r', encoding='utf-8') as fp:
        readme = fp.read()

    # str.find() returns -1 when the heading is missing; slicing from -1
    # would keep only the last character and silently yield no links.
    index_section = readme.find('## Index')
    content = readme[index_section:] if index_section != -1 else readme

    # Raw string so regex escapes such as \d and \s are not parsed as
    # (invalid) string escapes.
    url_pattern = re.compile(
        r'((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'\".,<>?«»“”‘’]))')

    # Group 1 wraps the complete URL; the inner groups only exist to match
    # balanced parentheses inside it.
    return [
        match.group(1).rstrip('/') for match in url_pattern.finditer(content)
    ]

def dup_links(links):
    """Report links that occur more than once.

    Prints a summary line to stdout.

    Args:
        links: Iterable of link strings to inspect.

    Returns:
        True when at least one link is duplicated, otherwise False.
    """
    print('Checking for duplicated links...')
    seen = set()
    reported = set()
    dupes = []

    for link in links:
        if link not in seen:
            seen.add(link)
        elif link not in reported:
            # Record each duplicated link once, however many times it
            # repeats, in the order the first repeat is encountered.
            reported.add(link)
            dupes.append(link)

    if not dupes:
        print("No duplicate links")
        return False

    print(f"Found duplicate links: {dupes}")
    return True

def validate_links(links):
    """Request every link and report the ones that fail.

    Each link is requested with ``httplib2`` (25 second timeout, SSL
    certificate validation disabled). A line is printed to stdout for every
    link that answers with an HTTP status >= 400 or raises an exception.

    Args:
        links: Sized iterable of URL strings; each is requested with a
            trailing ``/`` appended.

    Returns:
        True when at least one link produced an error, otherwise False.
    """
    print(f'Validating {len(links)} links...')
    has_error = False
    for link in links:
        h = httplib2.Http(disable_ssl_certificate_validation=True, timeout=25)
        try:
            # Fetching host name. A link without '//' raises IndexError
            # here, which the generic handler below reports as ERR:UKN.
            host = link.split('//', 1)[1].split('/', 1)[0]
            # Remove only a literal leading "www."; hosts such as
            # "www2.example.com" must be sent unchanged.
            if host.startswith('www.'):
                host = host[4:]
            resp = h.request(link + "/", headers={
                # Faking user agent as some hosting services block not-whitelisted UA
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36',
                # setting host because Cloudflare returns 403 asking for captcha if host is missing
                'host': host
            })
            code = int(resp[0]['status'])
            # Checking status code errors
            if code >= 400:
                has_error = True
                print(f"ERR:CLT:{code} : {link}")
        except TimeoutError:
            has_error = True
            print(f"ERR:TMO: {link}")
        except socket.error as socketerror:
            has_error = True
            print(f"ERR:SOC: {socketerror} : {link}")
        except Exception as e:
            has_error = True
            # Known failure modes get their own tag; every one of them still
            # counts as an error. Extend the list below if needed.
            message = str(e)
            if "[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:852)" in message:
                print(f"ERR:SSL: {e} : {link}")
            elif "Content purported to be compressed with gzip but failed to decompress." in message:
                print(f"ERR:GZP: {e} : {link}")
            elif "Unable to find the server at" in message:
                print(f"ERR:SRV: {e} : {link}")
            else:
                print(f"ERR:UKN: {e} : {link}")
    return has_error

if __name__ == "__main__":
    # Usage: validate_links.py <file.md>
    if len(sys.argv) < 2:
        print("No .md file passed")
        sys.exit(1)

    links = parse_links(sys.argv[1])

    # Short-circuit: the links are only requested when the duplicate check
    # passed; either check failing makes the script exit with status 1.
    if dup_links(links) or validate_links(links):
        sys.exit(1)

#test

Loading…
Cancel
Save