# -*- coding: utf-8 -*-

import random
import re
import sys
from typing import List, Tuple

import requests

def find_links_in_text(text: str) -> List[str]:
    """Find links in a text and return a list of URLs."""

    link_pattern = re.compile(r'((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'\".,<>?«»“”‘’]))')

    raw_links = re.findall(link_pattern, text)

    # findall() returns one tuple per match; the full URL is the first
    # group. Trailing slashes are stripped so duplicates differing only
    # by a final '/' compare equal.
    links = [raw_link[0].rstrip('/') for raw_link in raw_links]

    return links

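# Example (illustrative, not part of the original script):
#     find_links_in_text('See https://example.com/docs/ and www.python.org.')
#     -> ['https://example.com/docs', 'www.python.org']
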
def find_links_in_file(filename: str) -> List[str]:
    """Find links in a text file and return them as a list of URLs."""

    with open(filename, mode='r', encoding='utf-8') as file:
        readme = file.read()

    # Only scan below the '## Index' heading. If the heading is
    # missing, find() returns -1, so fall back to the whole file
    # instead of silently scanning only the last character.
    index_section = readme.find('## Index')
    if index_section == -1:
        index_section = 0
    content = readme[index_section:]

    links = find_links_in_text(content)

    return links

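# Example (illustrative): links = find_links_in_file('README.md')
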
def check_duplicate_links(links: List[str]) -> Tuple[bool, List[str]]:
    """Check for duplicated links.

    Returns a tuple of a boolean (True if any duplicate was found)
    and the list of duplicated links.
    """

    seen = {}
    duplicates = []

    for link in links:
        if link not in seen:
            seen[link] = 1
        else:
            # Report a link only the first time it reappears, so a
            # link occurring three or more times is listed just once.
            if seen[link] == 1:
                duplicates.append(link)
            seen[link] += 1

    return (bool(duplicates), duplicates)

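# Example (illustrative, not part of the original script):
#     check_duplicate_links(['a', 'b', 'a', 'a'])
#     -> (True, ['a'])
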
def fake_user_agent() -> str:
    """Return a fake user agent, as some hosts block non-whitelisted UAs."""

    user_agents = [
        'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko)',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
    ]

    return random.choice(user_agents)

def get_host_from_link(link: str) -> str:
    """Extract the host part of a URL, e.g. for use as a Host header."""

    # [-1] (rather than [1]) keeps this from crashing on schemeless
    # 'www.' links, which the regex above also matches.
    host = link.split('://', 1)[-1]

    # Remove route, query string and anchor. Independent 'if's (not
    # 'elif') so each remaining separator is stripped in turn, even
    # for oddly ordered URLs.
    if '/' in host:
        host = host.split('/', 1)[0]

    if '?' in host:
        host = host.split('?', 1)[0]

    if '#' in host:
        host = host.split('#', 1)[0]

    return host

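# Example (illustrative, not part of the original script):
#     get_host_from_link('https://example.com/path?q=1#top')
#     -> 'example.com'
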
def check_if_link_is_working(link: str) -> Tuple[bool, str]:
    """Check whether a link is working.

    If the request for the link fails, returns a tuple of True and a
    string describing the error; otherwise returns (False, '').
    """

    has_error = False
    error_message = ''

    try:
        resp = requests.get(link + '/', timeout=25, headers={
            'User-Agent': fake_user_agent(),
            'Host': get_host_from_link(link)
        })

        code = resp.status_code
        if code >= 400:
            has_error = True
            error_message = f'ERR:CLT: {code} : {link}'

    # requests.exceptions.Timeout covers both connect and read
    # timeouts; the built-in TimeoutError covers OS-level ones.
    except (TimeoutError, requests.exceptions.Timeout):
        has_error = True
        error_message = f'ERR:TMO: {link}'

    except requests.exceptions.SSLError as error:
        has_error = True
        error_message = f'ERR:SSL: {error} : {link}'

    except requests.exceptions.TooManyRedirects as error:
        has_error = True
        error_message = f'ERR:TMR: {error} : {link}'

    except Exception as error:
        has_error = True
        error_message = f'ERR:UKN: {error} : {link}'

    return (has_error, error_message)

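# Example (illustrative; performs a real HTTP request):
#     check_if_link_is_working('https://example.com')
#     -> (False, '') if the host responds without an error status
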
if __name__ == '__main__':
    num_args = len(sys.argv)

    if num_args < 2:
        print('No .md file passed')
        sys.exit(1)

    links = find_links_in_file(sys.argv[1])

    print('Checking for duplicate links...')

    has_duplicate_link, duplicate_links = check_duplicate_links(links)

    if has_duplicate_link:
        print(f'Found duplicate links: {duplicate_links}')
    else:
        print('No duplicate links.')
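
    # Sketch (assumption, not in the original script): the
    # check_if_link_is_working() helper above is never called, so a
    # minimal pass that reports broken links might look like this.
    print('Checking whether each link is working...')

    for link in links:
        has_error, error_message = check_if_link_is_working(link)
        if has_error:
            print(error_message)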