
validate_links.py

#!/usr/bin/env python3

import httplib2
import re
import socket
import sys


def parse_links(filename):
    """Returns a list of URLs found below the '## Index' section of a Markdown file"""
    with open(filename, mode='r', encoding='utf-8') as fp:
        readme = fp.read()
    index_section = readme.find('## Index')
    content = readme[index_section:]
    # Raw strings avoid invalid-escape warnings for the regex backslashes
    raw_links = re.findall(
        r'((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)'
        r'(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+'
        r'(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'\".,<>?«»“”‘’]))',
        content)
    links = [
        str(raw_link[0]).rstrip('/') for raw_link in raw_links
    ]
    return links
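

# A usage sketch (not part of the original script): given a Markdown file whose
# '## Index' section contains [Project](https://example.com/project/), parse_links
# should return ['https://example.com/project'] with the trailing slash stripped.
# The file name 'README.md' below is a hypothetical example.
#
#   links = parse_links('README.md')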


def dup_links(links):
    """Check for duplicated links"""
    print('Checking for duplicated links...')
    hasError = False
    seen = {}
    dupes = []
    for link in links:
        if link not in seen:
            seen[link] = 1
        else:
            # Report each duplicated URL only once, however often it repeats
            if seen[link] == 1:
                dupes.append(link)
            seen[link] += 1
    if not dupes:
        print("No duplicate links")
    else:
        print(f"Found duplicate links: {dupes}")
        hasError = True
    return hasError
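

# Assumed behaviour of dup_links (a sketch, not from the source): a URL that
# appears twice is reported once and the function returns True, e.g.
#   dup_links(['https://a.example', 'https://a.example'])
# prints "Found duplicate links: ['https://a.example']" and returns True.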


def validate_links(links):
    """Checks that each link in the list responds without an error status"""
    print(f'Validating {len(links)} links...')
    hasError = False
    for link in links:
        h = httplib2.Http(disable_ssl_certificate_validation=True, timeout=25)
        try:
            # Fetch the host name, removing a leading 'www.'
            host = link.split('//', 1)[1].split('/', 1)[0]
            if host.startswith('www.'):
                host = host[4:]
            resp = h.request(link, headers={
                # Fake the user agent: some hosting services block non-whitelisted UAs
                'user-agent': 'Mozilla/5.0',
                # Set the host explicitly: Cloudflare returns 403 asking for a captcha if it is missing
                'host': host
            })
            code = int(resp[0]['status'])
            # Flag client and server error status codes
            if code >= 400:
                hasError = True
                print(f"ERR:CLT:{code} : {link}")
        except TimeoutError:
            hasError = True
            print(f"ERR:TMO: {link}")
        except socket.error as socketerror:
            hasError = True
            print(f"ERR:SOC: {socketerror} : {link}")
        except Exception as e:
            hasError = True
            # Categorize known exceptions so the output is easier to scan.
            # Extend the checks below with other exceptions in the future if needed.
            if "[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed" in str(e):
                print(f"ERR:SSL: {e} : {link}")
            elif "Content purported to be compressed with gzip but failed to decompress." in str(e):
                print(f"ERR:GZP: {e} : {link}")
            elif "Unable to find the server at" in str(e):
                print(f"ERR:SRV: {e} : {link}")
            else:
                print(f"ERR:UKN: {e} : {link}")
    return hasError


if __name__ == "__main__":
    num_args = len(sys.argv)
    if num_args < 2:
        print("No .md file passed")
        sys.exit(1)
    links = parse_links(sys.argv[1])
    hasError = dup_links(links)
    if not hasError:
        hasError = validate_links(links)
    if hasError:
        sys.exit(1)
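
# Example invocation (a sketch; the file name is hypothetical):
#   python3 validate_links.py README.md
# The script exits with status 1 when duplicate or dead links are found, which
# makes it usable as a CI check.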