You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

103 lines
3.4 KiB

#!/usr/bin/env python3
import httplib2
import re
import socket
import sys

# URLs exempt from the duplicate check in dup_links() below — presumably
# badge/workflow links that recur in the README by design (confirm against
# the README this script is run on).
ignored_links = [
    'https://github.com/public-apis/public-apis/actions?query=workflow%3A%22Run+tests%22',
    'https://github.com/public-apis/public-apis/workflows/Validate%20links/badge.svg?branch=master',
    'https://github.com/public-apis/public-apis/actions?query=workflow%3A%22Validate+links%22',
    'https://github.com/davemachado/public-api',
]
  12. def parse_links(filename):
  13. """Returns a list of URLs from text file"""
  14. with open(filename, mode='r', encoding='utf-8') as fp:
  15. readme = fp.read()
  16. index_section = readme.find('## Index')
  17. content = readme[index_section:]
  18. raw_links = re.findall(
  19. '((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'\".,<>?«»“”‘’]))',
  20. content)
  21. links = [
  22. str(raw_link[0]).rstrip('/') for raw_link in raw_links
  23. ]
  24. return links
  25. def dup_links(links):
  26. """Check for duplicated links"""
  27. print(f'Checking for duplicated links...')
  28. hasError = False
  29. seen = {}
  30. dupes = []
  31. for link in links:
  32. if link in ignored_links:
  33. continue
  34. if link not in seen:
  35. seen[link] = 1
  36. else:
  37. if seen[link] == 1:
  38. dupes.append(link)
  39. if not dupes:
  40. print(f"No duplicate links")
  41. else:
  42. print(f"Found duplicate links: {dupes}")
  43. hasError = True
  44. return hasError
  45. def validate_links(links):
  46. """Checks each entry in JSON file for live link"""
  47. print(f'Validating {len(links)} links...')
  48. hasError = False
  49. for link in links:
  50. h = httplib2.Http(disable_ssl_certificate_validation=True, timeout=25)
  51. try:
  52. resp = h.request(link, headers={
  53. # Faking user agent as some hosting services block not-whitelisted UA
  54. 'user-agent': 'Mozilla/5.0'
  55. })
  56. code = int(resp[0]['status'])
  57. # Checking status code errors
  58. if (code >= 300):
  59. hasError = True
  60. print(f"ERR:CLT:{code} : {link}")
  61. except TimeoutError:
  62. hasError = True
  63. print(f"ERR:TMO: {link}")
  64. except socket.error as socketerror:
  65. hasError = True
  66. print(f"ERR:SOC: {socketerror} : {link}")
  67. except Exception as e:
  68. hasError = True
  69. # Ignore some exceptions which are not actually errors.
  70. # The list below should be extended with other exceptions in the future if needed
  71. if (-1 != str(e).find("[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:852)")):
  72. print(f"ERR:SSL: {e} : {link}")
  73. elif (-1 != str(e).find("Content purported to be compressed with gzip but failed to decompress.")):
  74. print(f"ERR:GZP: {e} : {link}")
  75. elif (-1 != str(e).find("Unable to find the server at")):
  76. print(f"ERR:SRV: {e} : {link}")
  77. else:
  78. print(f"ERR:UKN: {e} : {link}")
  79. return hasError
  80. if __name__ == "__main__":
  81. num_args = len(sys.argv)
  82. if num_args < 2:
  83. print("No .md file passed")
  84. sys.exit(1)
  85. links = parse_links(sys.argv[1])
  86. hasError = dup_links(links)
  87. if not hasError:
  88. hasError = validate_links(links)
  89. if hasError:
  90. sys.exit(1)