# -*- coding: utf-8 -*-
#
# Reconstructed from a git patch:
#   commit:   4808d633a170a7732a45016da47390a9d5e3638d
#   author:   Matheus Felipe <50463866+matheusfelipeog@users.noreply.github.com>
#   date:     Tue, 11 Jan 2022 03:59:18 -0300
#   subject:  [PATCH] Implement functions to find links in a text/file
#   new file: scripts/validate/links.py (42 insertions)

import sys
import re
from typing import List


# Compiled once at import time so repeated calls do not rebuild the pattern.
# A link starts with "http(s)://", "www<digits>." or "<domain>.<tld>/", then
# continues over non-space characters (with up to two levels of balanced
# parentheses) and must not end on trailing punctuation such as "." or ",".
_LINK_PATTERN = re.compile(r'((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'\".,<>?«»“”‘’]))')

# Heading that marks where link collection starts inside the file.
_INDEX_HEADING = '## Index'


def find_links_in_text(text: str) -> List[str]:
    """Find links in a text and return a list of URLs.

    Args:
        text: Arbitrary text to scan.

    Returns:
        The matched URLs in order of appearance, each with any trailing
        ``/`` characters stripped. Duplicates are kept.
    """

    # The pattern has several groups, so ``findall`` yields one tuple per
    # match; group 1 (index 0) wraps the whole URL.
    return [groups[0].rstrip('/') for groups in _LINK_PATTERN.findall(text)]


def find_links_in_file(filename: str) -> List[str]:
    """Find links in a file and return a list of URLs from text file.

    Only the part of the file starting at the first ``## Index`` heading is
    scanned. If the heading is absent, the whole file is scanned.

    Args:
        filename: Path of a UTF-8 encoded text file.

    Returns:
        The URLs found, as produced by ``find_links_in_text``.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """

    with open(filename, mode='r', encoding='utf-8') as file:
        readme = file.read()

    index_section = readme.find(_INDEX_HEADING)
    if index_section == -1:
        # ``str.find`` signals "not found" with -1; slicing with it would keep
        # only the last character of the file and silently drop every link.
        index_section = 0

    return find_links_in_text(readme[index_section:])


if __name__ == '__main__':
    num_args = len(sys.argv)

    if num_args < 2:
        print('No .md file passed')
        sys.exit(1)

    # NOTE(review): the collected links are not consumed by anything visible
    # in this patch - presumably later checks build on them; confirm.
    links = find_links_in_file(sys.argv[1])