Non puoi selezionare più di 25 argomenti. Gli argomenti devono iniziare con una lettera o un numero, possono includere trattini ('-') e possono essere lunghi fino a 35 caratteri.

validate_links.py 2.3 KiB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263
  1. #!/usr/bin/env python3
  2. import httplib2
  3. import re
  4. import socket
  5. import sys
  6. def parse_links(filename):
  7. """Returns a list of URLs from text file"""
  8. with open(filename) as fp:
  9. data = fp.read()
  10. raw_links = re.findall(
  11. '((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'\".,<>?«»“”‘’]))',
  12. data)
  13. links = [raw_link[0] for raw_link in raw_links]
  14. return links
  15. def validate_links(links):
  16. """Checks each entry in JSON file for live link"""
  17. print(f'Validating {len(links)} links...')
  18. hasError = False
  19. for link in links:
  20. h = httplib2.Http(disable_ssl_certificate_validation=True, timeout=25)
  21. try:
  22. resp = h.request(link, headers={
  23. # Faking user agent as some hosting services block not-whitelisted UA
  24. 'user-agent': 'Mozilla/5.0'
  25. })
  26. code = int(resp[0]['status'])
  27. # Checking status code errors
  28. if (code >= 300):
  29. hasError = True
  30. print(f"ERR:CLT:{code} : {link}")
  31. except TimeoutError:
  32. hasError = True
  33. print(f"ERR:TMO: {link}")
  34. except socket.error as socketerror:
  35. hasError = True
  36. print(f"ERR:SOC: {socketerror} : {link}")
  37. except Exception as e:
  38. hasError = True
  39. # Ignore some exceptions which are not actually errors.
  40. # The list below should be extended with other exceptions in the future if needed
  41. if (-1 != str(e).find("[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:852)")):
  42. print(f"ERR:SSL: {e} : {link}")
  43. elif (-1 != str(e).find("Content purported to be compressed with gzip but failed to decompress.")):
  44. print(f"ERR:GZP: {e} : {link}")
  45. elif (-1 != str(e).find("Unable to find the server at")):
  46. print(f"ERR:SRV: {e} : {link}")
  47. else:
  48. print(f"ERR:UKN: {e} : {link}")
  49. return hasError
  50. if __name__ == "__main__":
  51. num_args = len(sys.argv)
  52. if num_args < 2:
  53. print("No .md file passed")
  54. sys.exit(1)
  55. hasError = validate_links(parse_links(sys.argv[1]))
  56. if hasError:
  57. sys.exit(1)