From 7be0512b5438edf4d434f325181e2e586ee65009 Mon Sep 17 00:00:00 2001
From: Matheus Felipe <50463866+matheusfelipeog@users.noreply.github.com>
Date: Wed, 12 Jan 2022 00:55:23 -0300
Subject: [PATCH] Check if a link is working

---
Review notes (not part of the commit message):
- get_host_from_link: strip route, arguments and anchor in turn instead of
  only the first one matched, and accept links without a scheme.
- check_if_link_is_working: request the link as written (no appended '/')
  and report read timeouts as ERR:TMO as well.

 scripts/validate/links.py | 80 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 79 insertions(+), 1 deletion(-)

diff --git a/scripts/validate/links.py b/scripts/validate/links.py
index bb84aac8..2dfbf7db 100644
--- a/scripts/validate/links.py
+++ b/scripts/validate/links.py
@@ -1,9 +1,12 @@
 # -*- coding: utf-8 -*-
 
-import sys
 import re
+import sys
+import random
 from typing import List, Tuple
 
+import requests
+
 
 def find_links_in_text(text: str) -> List[str]:
     """Find links in a text and return a list of URLs."""
@@ -55,6 +58,81 @@ def check_duplicate_links(links: List[str]) -> Tuple[bool, List]:
     return (has_duplicate, duplicates)
 
 
+def fake_user_agent() -> str:
+    """Faking user agent as some hosting services block not-whitelisted UA."""
+
+    user_agents = [
+        'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36',
+        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko)',
+        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
+        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
+    ]
+
+    return random.choice(user_agents)
+
+
+def get_host_from_link(link: str) -> str:
+    """Return the host of a link (no scheme, route, arguments or anchor)."""
+
+    # Tolerate links without a scheme instead of raising IndexError.
+    host = link.split('://', 1)[1] if '://' in link else link
+
+    # The host ends at the first '/', '?' or '#'. Every delimiter is
+    # applied in turn because a link may carry arguments or an anchor
+    # without a route, and arguments may themselves contain a '/'.
+    for delimiter in ('/', '?', '#'):
+        host = host.split(delimiter, 1)[0]
+
+    return host
+
+
+def check_if_link_is_working(link: str) -> Tuple[bool, str]:
+    """Checks if a link is working.
+
+    If an error is identified when the request for the link occurs,
+    the return will be a tuple with the first value True and the second
+    value a string containing the error message.
+
+    If no errors are identified, the return will be a tuple with the
+    first value False and the second an empty string.
+    """
+
+    has_error = False
+    error_message = ''
+
+    try:
+        # Request the link exactly as written: appending '/' would corrupt
+        # links that end in a file name, arguments or an anchor.
+        resp = requests.get(link, timeout=25, headers={
+            'User-Agent': fake_user_agent(),
+            'host': get_host_from_link(link)
+        })
+
+        code = resp.status_code
+        if code >= 400:
+            has_error = True
+            error_message = f'ERR:CLT: {code} : {link}'
+
+    except (TimeoutError, requests.exceptions.Timeout):
+        # Timeout covers both ConnectTimeout and ReadTimeout.
+        has_error = True
+        error_message = f'ERR:TMO: {link}'
+
+    except requests.exceptions.SSLError as error:
+        has_error = True
+        error_message = f'ERR:SSL: {error} : {link}'
+
+    except requests.exceptions.TooManyRedirects as error:
+        has_error = True
+        error_message = f'ERR:TMR: {error} : {link}'
+
+    except Exception as error:
+        has_error = True
+        error_message = f'ERR:UKN: {error} : {link}'
+
+    return (has_error, error_message)
+
+
 if __name__ == '__main__':
     num_args = len(sys.argv)
 