You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

links.py 1.8 KiB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374
  1. # -*- coding: utf-8 -*-
  2. import sys
  3. import re
  4. from typing import List, Tuple
  5. def find_links_in_text(text: str) -> List[str]:
  6. """Find links in a text and return a list of URLs."""
  7. link_pattern = re.compile(r'((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'\".,<>?«»“”‘’]))')
  8. raw_links = re.findall(link_pattern, text)
  9. links = [
  10. str(raw_link[0]).rstrip('/') for raw_link in raw_links
  11. ]
  12. return links
  13. def find_links_in_file(filename: str) -> List[str]:
  14. """Find links in a file and return a list of URLs from text file."""
  15. with open(filename, mode='r', encoding='utf-8') as file:
  16. readme = file.read()
  17. index_section = readme.find('## Index')
  18. content = readme[index_section:]
  19. links = find_links_in_text(content)
  20. return links
  21. def check_duplicate_links(links: List[str]) -> Tuple[bool, List]:
  22. """Check for duplicated links.
  23. Returns a tuple with True or False and duplicate list.
  24. """
  25. seen = {}
  26. duplicates = []
  27. has_duplicate = False
  28. for link in links:
  29. if link not in seen:
  30. seen[link] = 1
  31. else:
  32. if seen[link] == 1:
  33. duplicates.append(link)
  34. if duplicates:
  35. has_duplicate = True
  36. return (has_duplicate, duplicates)
  37. if __name__ == '__main__':
  38. num_args = len(sys.argv)
  39. if num_args < 2:
  40. print('No .md file passed')
  41. sys.exit(1)
  42. links = find_links_in_file(sys.argv[1])
  43. print('Checking for duplicate links...')
  44. has_duplicate_link, duplicates_links = check_duplicate_links(links)
  45. if has_duplicate_link:
  46. print(f'Found duplicate links: {duplicates_links}')
  47. else:
  48. print('No duplicate links.')