You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

links.py 7.8 KiB

1 year ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275
  1. # -*- coding: utf-8 -*-
  2. import re
  3. import sys
  4. import random
  5. import ssl
  6. from typing import List, Tuple
  7. import requests
  8. from requests.models import Response
  9. ctx = ssl.create_default_context()
  10. ctx.check_hostname = False
  11. ctx.verify_mode = ssl.CERT_NONE
  12. def find_links_in_text(text: str) -> List[str]:
  13. """Find links in a text and return a list of URLs."""
  14. link_pattern = re.compile(r'((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'\".,<>?«»“”‘’]))')
  15. raw_links = re.findall(link_pattern, text)
  16. links = [
  17. str(raw_link[0]) for raw_link in raw_links
  18. ]
  19. return links
  20. def find_links_in_file(filename: str) -> List[str]:
  21. """Find links in a file and return a list of URLs from text file."""
  22. with open(filename, mode='r', encoding='utf-8') as file:
  23. readme = file.read()
  24. index_section = readme.find('## Index')
  25. if index_section == -1:
  26. index_section = 0
  27. content = readme[index_section:]
  28. links = find_links_in_text(content)
  29. return links
  30. def check_duplicate_links(links: List[str]) -> Tuple[bool, List]:
  31. """Check for duplicated links.
  32. Returns a tuple with True or False and duplicate list.
  33. """
  34. seen = {}
  35. duplicates = []
  36. has_duplicate = False
  37. for link in links:
  38. link = link.rstrip('/')
  39. if link not in seen:
  40. seen[link] = 1
  41. else:
  42. if seen[link] == 1:
  43. duplicates.append(link)
  44. if duplicates:
  45. has_duplicate = True
  46. return (has_duplicate, duplicates)
  47. def fake_user_agent() -> str:
  48. """Faking user agent as some hosting services block not-whitelisted UA."""
  49. user_agents = [
  50. 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36',
  51. 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko)',
  52. 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
  53. 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
  54. ]
  55. return random.choice(user_agents)
  56. def get_host_from_link(link: str) -> str:
  57. host = link.split('://', 1)[1] if '://' in link else link
  58. # Remove routes, arguments and anchors
  59. if '/' in host:
  60. host = host.split('/', 1)[0]
  61. elif '?' in host:
  62. host = host.split('?', 1)[0]
  63. elif '#' in host:
  64. host = host.split('#', 1)[0]
  65. return host
  66. def has_cloudflare_protection(resp: Response) -> bool:
  67. """Checks if there is any cloudflare protection in the response.
  68. Cloudflare implements multiple network protections on a given link,
  69. this script tries to detect if any of them exist in the response from request.
  70. Common protections have the following HTTP code as a response:
  71. - 403: When host header is missing or incorrect (and more)
  72. - 503: When DDOS protection exists
  73. See more about it at:
  74. - https://support.cloudflare.com/hc/en-us/articles/115003014512-4xx-Client-Error
  75. - https://support.cloudflare.com/hc/en-us/articles/115003011431-Troubleshooting-Cloudflare-5XX-errors
  76. - https://www.cloudflare.com/ddos/
  77. - https://superuser.com/a/888526
  78. Discussions in issues and pull requests:
  79. - https://github.com/public-apis/public-apis/pull/2409
  80. - https://github.com/public-apis/public-apis/issues/2960
  81. """
  82. code = resp.status_code
  83. server = resp.headers.get('Server') or resp.headers.get('server')
  84. cloudflare_flags = [
  85. '403 Forbidden',
  86. 'cloudflare',
  87. 'Cloudflare',
  88. 'Security check',
  89. 'Please Wait... | Cloudflare',
  90. 'We are checking your browser...',
  91. 'Please stand by, while we are checking your browser...',
  92. 'Checking your browser before accessing',
  93. 'This process is automatic.',
  94. 'Your browser will redirect to your requested content shortly.',
  95. 'Please allow up to 5 seconds',
  96. 'DDoS protection by',
  97. 'Ray ID:',
  98. 'Cloudflare Ray ID:',
  99. '_cf_chl',
  100. '_cf_chl_opt',
  101. '__cf_chl_rt_tk',
  102. 'cf-spinner-please-wait',
  103. 'cf-spinner-redirecting'
  104. ]
  105. if code in [403, 503] and server == 'cloudflare':
  106. html = resp.text
  107. flags_found = [flag in html for flag in cloudflare_flags]
  108. any_flag_found = any(flags_found)
  109. if any_flag_found:
  110. return True
  111. return False
  112. def check_if_link_is_working(link: str) -> Tuple[bool, str]:
  113. """Checks if a link is working.
  114. If an error is identified when the request for the link occurs,
  115. the return will be a tuple with the first value True and the second
  116. value a string containing the error message.
  117. If no errors are identified, the return will be a tuple with the
  118. first value False and the second an empty string.
  119. """
  120. has_error = False
  121. error_message = ''
  122. try:
  123. resp = requests.get(link, timeout=25, headers={
  124. 'User-Agent': fake_user_agent(),
  125. 'host': get_host_from_link(link)
  126. })
  127. code = resp.status_code
  128. if code >= 400 and not has_cloudflare_protection(resp):
  129. has_error = True
  130. error_message = f'ERR:CLT: {code} : {link}'
  131. except requests.exceptions.ConnectionError as error:
  132. has_error = True
  133. error_message = f'ERR:CNT: {error} : {link}'
  134. except (TimeoutError, requests.exceptions.ConnectTimeout):
  135. has_error = True
  136. error_message = f'ERR:TMO: {link}'
  137. except requests.exceptions.TooManyRedirects as error:
  138. has_error = True
  139. error_message = f'ERR:TMR: {error} : {link}'
  140. except (Exception, requests.exceptions.RequestException) as error:
  141. has_error = True
  142. error_message = f'ERR:UKN: {error} : {link}'
  143. return (has_error, error_message)
  144. def check_if_list_of_links_are_working(list_of_links: List[str]) -> List[str]:
  145. error_messages = []
  146. for link in list_of_links:
  147. has_error, error_message = check_if_link_is_working(link)
  148. if has_error:
  149. error_messages.append(error_message)
  150. return error_messages
  151. def start_duplicate_links_checker(links: List[str]) -> None:
  152. print('Checking for duplicate links...')
  153. has_duplicate_link, duplicates_links = check_duplicate_links(links)
  154. if has_duplicate_link:
  155. print(f'Found duplicate links:')
  156. for duplicate_link in duplicates_links:
  157. print(duplicate_link)
  158. sys.exit(1)
  159. else:
  160. print('No duplicate links.')
  161. def start_links_working_checker(links: List[str]) -> None:
  162. print(f'Checking if {len(links)} links are working...')
  163. errors = check_if_list_of_links_are_working(links)
  164. if errors:
  165. num_errors = len(errors)
  166. print(f'Apparently {num_errors} links are not working properly. See in:')
  167. for error_message in errors:
  168. print(error_message)
  169. sys.exit(1)
  170. def main(filename: str, only_duplicate_links_checker: bool) -> None:
  171. links = find_links_in_file(filename)
  172. start_duplicate_links_checker(links)
  173. if not only_duplicate_links_checker:
  174. start_links_working_checker(links)
  175. if __name__ == '__main__':
  176. num_args = len(sys.argv)
  177. only_duplicate_links_checker = False
  178. if num_args < 2:
  179. print('No .md file passed')
  180. sys.exit(1)
  181. elif num_args == 3:
  182. third_arg = sys.argv[2].lower()
  183. if third_arg == '-odlc' or third_arg == '--only_duplicate_links_checker':
  184. only_duplicate_links_checker = True
  185. else:
  186. print(f'Third invalid argument. Usage: python {__file__} [-odlc | --only_duplicate_links_checker]')
  187. sys.exit(1)
  188. filename = sys.argv[1]
  189. main(filename, only_duplicate_links_checker)