You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

172 lines
4.6 KiB

  1. # -*- coding: utf-8 -*-
  2. import re
  3. import sys
  4. import random
  5. from typing import List, Tuple
  6. import requests
  def find_links_in_text(text: str) -> List[str]:
      """Extract every URL that appears in ``text``.

      Trailing slashes are stripped from each match, so an address written
      with and without a final ``/`` produces the same string.

      Returns:
          The matched URLs in order of appearance (duplicates are kept).
      """
      url_regex = re.compile(r'((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'\".,<>?«»“”‘’]))')

      # Group 1 wraps the entire pattern, so it always holds the full URL.
      return [match.group(1).rstrip('/') for match in url_regex.finditer(text)]
  def find_links_in_file(filename: str) -> List[str]:
      """Find links in a text file and return them as a list of URLs.

      Only the part of the file starting at the ``## Index`` heading is
      scanned. When that heading is absent, the whole file is scanned.

      Args:
          filename: Path of the UTF-8 encoded text file to read.

      Returns:
          The URLs found by ``find_links_in_text`` in the scanned content.
      """
      with open(filename, mode='r', encoding='utf-8') as file:
          readme = file.read()

      # str.find() returns -1 when the marker is missing; slicing from -1
      # would keep only the last character and silently drop every link.
      index_section = readme.find('## Index')
      content = readme if index_section == -1 else readme[index_section:]

      return find_links_in_text(content)
  def check_duplicate_links(links: List[str]) -> Tuple[bool, List]:
      """Check for duplicated links.

      Args:
          links: The links to inspect.

      Returns:
          A tuple ``(has_duplicate, duplicates)``. ``has_duplicate`` is True
          when at least one link occurs more than once. ``duplicates`` lists
          each repeated link exactly once, in the order its second occurrence
          is encountered.
      """
      occurrences = {}
      duplicates = []

      for link in links:
          count = occurrences.get(link, 0) + 1
          occurrences[link] = count
          # Report on the second sighting only, so a link that appears three
          # or more times is not listed repeatedly.
          if count == 2:
              duplicates.append(link)

      return (bool(duplicates), duplicates)
  def fake_user_agent() -> str:
      """Return a randomly chosen browser User-Agent string.

      A browser-like value is used because some hosting services block
      requests whose User-Agent is not whitelisted.
      """
      candidates = (
          'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36',
          'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko)',
          'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
          'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
      )
      picked = random.choice(candidates)
      return picked
  def get_host_from_link(link: str) -> str:
      """Return the host part of ``link``.

      The scheme (everything up to and including ``://``) is dropped when
      present, then the result is cut at the first ``/``, ``?`` or ``#`` so
      that routes, arguments and anchors are removed. A port, if any, is kept.

      Args:
          link: A URL, with or without a scheme.

      Returns:
          The host portion of the URL.
      """
      host = link.split('://', 1)[1] if '://' in link else link

      # Cut at each separator in turn. The result is the text before the
      # earliest one, whichever order they appear in, so a '/' inside a query
      # string or a '?' inside a fragment cannot leak into the host.
      for separator in ('/', '?', '#'):
          host = host.split(separator, 1)[0]

      return host
  def check_if_link_is_working(link: str) -> Tuple[bool, str]:
      """Check whether a link is working.

      A GET request is sent to ``link`` with a ``/`` appended, using a
      browser-like User-Agent and an explicit ``host`` header.

      Args:
          link: The URL to request.

      Returns:
          ``(True, message)`` when the request fails or the response status
          code is 400 or higher, where ``message`` describes the error.
          ``(False, '')`` when no error is identified.

      Error message prefixes:
          ``ERR:CLT`` status code >= 400, ``ERR:TMO`` timeout,
          ``ERR:SSL`` SSL failure, ``ERR:TMR`` too many redirects,
          ``ERR:UKN`` any other exception.
      """
      has_error = False
      error_message = ''

      try:
          resp = requests.get(link + '/', timeout=25, headers={
              'User-Agent': fake_user_agent(),
              'host': get_host_from_link(link)
          })

          code = resp.status_code

          if code >= 400:
              has_error = True
              error_message = f'ERR:CLT: {code} : {link}'

      except (TimeoutError, requests.exceptions.Timeout):
          # requests.exceptions.Timeout is the base class of both
          # ConnectTimeout and ReadTimeout, so a slow response is reported as
          # a timeout instead of falling through to the unknown-error branch.
          has_error = True
          error_message = f'ERR:TMO: {link}'

      except requests.exceptions.SSLError as error:
          has_error = True
          error_message = f'ERR:SSL: {error} : {link}'

      except requests.exceptions.TooManyRedirects as error:
          has_error = True
          error_message = f'ERR:TMR: {error} : {link}'

      except Exception as error:
          # Best-effort checker: any other failure is reported, not raised.
          has_error = True
          error_message = f'ERR:UKN: {error} : {link}'

      return (has_error, error_message)
  def check_if_list_of_links_are_working(list_of_links: List[str]) -> List[str]:
      """Check each link in turn and collect the error messages.

      Returns:
          The error message of every link for which
          ``check_if_link_is_working`` reported an error, in input order.
          The list is empty when no link reported one.
      """
      # The inner generator checks the links lazily, one at a time, in order.
      outcomes = (check_if_link_is_working(url) for url in list_of_links)
      return [message for failed, message in outcomes if failed]
  if __name__ == '__main__':
      # Command-line entry point: the first argument is the file to check.
      num_args = len(sys.argv)
      if num_args < 2:
          print('No .md file passed')
          sys.exit(1)

      links = find_links_in_file(sys.argv[1])

      # Step 1: stop with a failure status as soon as duplicates are found.
      print('Checking for duplicate links...')
      has_duplicate_link, duplicates_links = check_duplicate_links(links)
      if has_duplicate_link:
          print(f'Found duplicate links: {duplicates_links}')
          sys.exit(1)
      print('No duplicate links.')

      # Step 2: request every link, print each error, and fail if any occurred.
      print(f'Checking if {len(links)} links are working...')
      errors = check_if_list_of_links_are_working(links)
      for error_message in errors:
          print(error_message)
      if errors:
          sys.exit(1)