You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

97 lines
3.3 KiB

  1. #!/usr/bin/env python3
  2. import httplib2
  3. import re
  4. import socket
  5. import sys
  6. ignored_links = [
  7. 'https://github.com/public-apis/public-apis/actions?query=workflow%3A%22Run+tests%22',
  8. 'https://github.com/public-apis/public-apis/workflows/Validate%20links/badge.svg?branch=master',
  9. 'https://github.com/public-apis/public-apis/actions?query=workflow%3A%22Validate+links%22',
  10. 'https://github.com/davemachado/public-api',
  11. ]
  12. def parse_links(filename):
  13. """Returns a list of URLs from text file"""
  14. with open(filename) as fp:
  15. data = fp.read()
  16. raw_links = re.findall(
  17. '((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'\".,<>?«»“”‘’]))',
  18. data)
  19. links = [raw_link[0] for raw_link in raw_links]
  20. return links
  21. def dup_links(links):
  22. """Check for duplicated links"""
  23. print(f'Checking for duplicated links...')
  24. hasError = False
  25. seen = {}
  26. dupes = []
  27. for link in links:
  28. link = link.rstrip('/')
  29. if link in ignored_links:
  30. continue
  31. if link not in seen:
  32. seen[link] = 1
  33. else:
  34. if seen[link] == 1:
  35. dupes.append(link)
  36. if not dupes:
  37. print(f"No duplicate links")
  38. else:
  39. print(f"Found duplicate links: {dupes}")
  40. hasError = True
  41. return hasError
  42. def validate_links(links):
  43. """Checks each entry in JSON file for live link"""
  44. print(f'Validating {len(links)} links...')
  45. hasError = False
  46. for link in links:
  47. h = httplib2.Http(disable_ssl_certificate_validation=True, timeout=25)
  48. try:
  49. resp = h.request(link, headers={
  50. # Faking user agent as some hosting services block not-whitelisted UA
  51. 'user-agent': 'Mozilla/5.0'
  52. })
  53. code = int(resp[0]['status'])
  54. # Checking status code errors
  55. if (code >= 300):
  56. hasError = True
  57. print(f"ERR:CLT:{code} : {link}")
  58. except TimeoutError:
  59. hasError = True
  60. print(f"ERR:TMO: {link}")
  61. except socket.error as socketerror:
  62. hasError = True
  63. print(f"ERR:SOC: {socketerror} : {link}")
  64. except Exception as e:
  65. hasError = True
  66. # Ignore some exceptions which are not actually errors.
  67. # The list below should be extended with other exceptions in the future if needed
  68. if (-1 != str(e).find("[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:852)")):
  69. print(f"ERR:SSL: {e} : {link}")
  70. elif (-1 != str(e).find("Content purported to be compressed with gzip but failed to decompress.")):
  71. print(f"ERR:GZP: {e} : {link}")
  72. elif (-1 != str(e).find("Unable to find the server at")):
  73. print(f"ERR:SRV: {e} : {link}")
  74. else:
  75. print(f"ERR:UKN: {e} : {link}")
  76. return hasError
  77. if __name__ == "__main__":
  78. num_args = len(sys.argv)
  79. if num_args < 2:
  80. print("No .md file passed")
  81. sys.exit(1)
  82. links = parse_links(sys.argv[1])
  83. hasError = dup_links(links)
  84. if not hasError:
  85. hasError = validate_links(links)
  86. if hasError:
  87. sys.exit(1)