You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

95 lines
3.2 KiB

  1. #!/usr/bin/env python3
  2. import httplib2
  3. import re
  4. import socket
  5. import sys
  6. ignored_links = [
  7. 'https://github.com/public-apis/public-apis/actions?query=workflow%3A%22Run+tests%22',
  8. 'https://github.com/public-apis/public-apis/workflows/Validate%20links/badge.svg?branch=master',
  9. 'https://github.com/public-apis/public-apis/actions?query=workflow%3A%22Validate+links%22',
  10. ]
  11. def parse_links(filename):
  12. """Returns a list of URLs from text file"""
  13. with open(filename) as fp:
  14. data = fp.read()
  15. raw_links = re.findall(
  16. '((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'\".,<>?«»“”‘’]))',
  17. data)
  18. links = [raw_link[0] for raw_link in raw_links]
  19. return links
  20. def dup_links(links):
  21. """Check for duplicated links"""
  22. print(f'Checking for duplicated links...')
  23. hasError = False
  24. seen = {}
  25. dupes = []
  26. for x in links:
  27. if x in ignored_links:
  28. continue
  29. if x not in seen:
  30. seen[x] = 1
  31. else:
  32. if seen[x] == 1:
  33. dupes.append(x)
  34. if not dupes:
  35. print(f"No duplicated links")
  36. else:
  37. print(f"Found duplicated links: {dupes}")
  38. hasError = True
  39. return hasError
  40. def validate_links(links):
  41. """Checks each entry in JSON file for live link"""
  42. print(f'Validating {len(links)} links...')
  43. hasError = False
  44. for link in links:
  45. h = httplib2.Http(disable_ssl_certificate_validation=True, timeout=25)
  46. try:
  47. resp = h.request(link, headers={
  48. # Faking user agent as some hosting services block not-whitelisted UA
  49. 'user-agent': 'Mozilla/5.0'
  50. })
  51. code = int(resp[0]['status'])
  52. # Checking status code errors
  53. if (code >= 300):
  54. hasError = True
  55. print(f"ERR:CLT:{code} : {link}")
  56. except TimeoutError:
  57. hasError = True
  58. print(f"ERR:TMO: {link}")
  59. except socket.error as socketerror:
  60. hasError = True
  61. print(f"ERR:SOC: {socketerror} : {link}")
  62. except Exception as e:
  63. hasError = True
  64. # Ignore some exceptions which are not actually errors.
  65. # The list below should be extended with other exceptions in the future if needed
  66. if (-1 != str(e).find("[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:852)")):
  67. print(f"ERR:SSL: {e} : {link}")
  68. elif (-1 != str(e).find("Content purported to be compressed with gzip but failed to decompress.")):
  69. print(f"ERR:GZP: {e} : {link}")
  70. elif (-1 != str(e).find("Unable to find the server at")):
  71. print(f"ERR:SRV: {e} : {link}")
  72. else:
  73. print(f"ERR:UKN: {e} : {link}")
  74. return hasError
  75. if __name__ == "__main__":
  76. num_args = len(sys.argv)
  77. if num_args < 2:
  78. print("No .md file passed")
  79. sys.exit(1)
  80. links = parse_links(sys.argv[1])
  81. hasError = dup_links(links)
  82. if not hasError:
  83. hasError = validate_links(links)
  84. if hasError:
  85. sys.exit(1)