Improve scripts and more (#3011)

2 years ago · d214297c53
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -1,29 +0,0 @@
 # Validates README.md on a daily schedule, on pushes to master and on pull requests targeting master.
 name: 'Run tests'

 on:
  schedule:
    # Minute 0, hour 0, every day.
    - cron: '0 0 * * *'
  push:
    branches: [master]
  pull_request:
    branches: [master]

 env:
  # File passed to the validation scripts below.
  FORMAT_FILE: 'README.md'

 jobs:
  test:
    name: "Validate README.md"
    runs-on: ubuntu-latest
    steps:
      - name: 'Checkout repository'
        uses: actions/checkout@v2

      - name: 'Validate Markdown format'
        run: 'build/validate_format.py ${FORMAT_FILE}'

      # Runs only for pull_request events.
      - name: 'Validate pull request changes'
        if: "github.event_name == 'pull_request'"
        run: >-
          build/github-pull.sh
          ${{ github.repository }}
          ${{ github.event.pull_request.number }}
          ${FORMAT_FILE}
--- a/.github/workflows/test_of_push_and_pull.yml
+++ b/.github/workflows/test_of_push_and_pull.yml
@@ -0,0 +1,37 @@
 # Validates README.md for pushes to master and for pull requests targeting master.
 name: 'Tests of push & pull'

 on:
  push:
    branches:
      - master
  pull_request:
    branches:
      - master

 env:
  # File handed to each validation command below.
  FILENAME: 'README.md'

 jobs:
  tests:
    name: "Validate README.md changes"
    runs-on: ubuntu-latest
    steps:
      - name: 'Checkout repository'
        uses: actions/checkout@v2

      - name: 'Set up Python'
        uses: actions/setup-python@v2
        with:
          # Quoted so the version stays a string.
          python-version: "3.8"

      - name: 'Install dependencies'
        run: 'python -m pip install -r scripts/requirements.txt'

      - name: 'Validate Markdown format'
        run: 'python scripts/validate/format.py ${FILENAME}'

      # Runs only for pull_request events.
      - name: 'Validate pull request changes'
        if: "github.event_name == 'pull_request'"
        run: >-
          scripts/github_pull_request.sh
          ${{ github.repository }}
          ${{ github.event.pull_request.number }}
          ${FILENAME}

      # Runs only for push events.
      - name: 'Checking if push changes are duplicated'
        if: "github.event_name == 'push'"
        run: 'python scripts/validate/links.py ${FILENAME} --only_duplicate_links_checker'
--- a/.github/workflows/test_of_validate_package.yml
+++ b/.github/workflows/test_of_validate_package.yml
@@ -0,0 +1,29 @@
 # Runs the unit tests under scripts/tests for pushes to master and pull requests targeting master.
 name: 'Tests of validate package'

 on:
  push:
    branches:
      - master
  pull_request:
    branches:
      - master

 jobs:
  unittest:
    name: "Run tests of validate package"
    runs-on: ubuntu-latest

    steps:
      - name: 'Checkout repository'
        uses: actions/checkout@v2

      - name: 'Set up Python'
        uses: actions/setup-python@v2
        with:
          # Quoted so the version stays a string.
          python-version: "3.8"

      - name: 'Install dependencies'
        run: 'python -m pip install -r scripts/requirements.txt'

      # Discovery is started from inside scripts/.
      - name: 'Run Unittest'
        run: |
          cd scripts
          python -m unittest discover tests/ --verbose
--- a/.github/workflows/validate_links.yml
+++ b/.github/workflows/validate_links.yml
@@ -3,20 +3,25 @@ name: "Validate links"
 on:
  schedule:
    - cron:  '0 0 * * *'
  push:
    branches:
      - master

 env:
  FORMAT_FILE: README.md
  FILENAME: README.md

 jobs:
  test:
    name: 'Validate links'
  validate_links:
    name: 'Check all links are working'
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v2

      - name: Set up Python
        uses: actions/setup-python@v2
        with:
          python-version: '3.8'

      - name: Install dependencies
        run: python -m pip install -r scripts/requirements.txt

      - name: Validate all links from README.md
        run: build/validate_links.py ${FORMAT_FILE}
        run: python scripts/validate/links.py ${FILENAME}
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,130 @@
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
 *$py.class

 # C extensions
 *.so

 # Distribution / packaging
 .Python
 build/
 develop-eggs/
 dist/
 downloads/
 eggs/
 .eggs/
 lib/
 lib64/
 parts/
 sdist/
 var/
 wheels/
 pip-wheel-metadata/
 share/python-wheels/
 *.egg-info/
 .installed.cfg
 *.egg
 MANIFEST
 .pypirc

 # PyInstaller
 #  Usually these files are written by a python script from a template
 #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 *.manifest
 *.spec

 # Installer logs
 pip-log.txt
 pip-delete-this-directory.txt

 # Unit test / coverage reports
 htmlcov/
 .tox/
 .nox/
 .coverage
 .coverage.*
 .cache
 nosetests.xml
 coverage.xml
 *.cover
 *.py,cover
 .hypothesis/
 .pytest_cache/

 # Translations
 *.mo
 *.pot

 # Django stuff:
 *.log
 local_settings.py
 db.sqlite3
 db.sqlite3-journal

 # Flask stuff:
 instance/
 .webassets-cache

 # Scrapy stuff:
 .scrapy

 # Sphinx documentation
 docs/_build/

 # PyBuilder
 target/

 # Jupyter Notebook
 .ipynb_checkpoints

 # IPython
 profile_default/
 ipython_config.py

 # pyenv
 .python-version

 # pipenv
 #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 #   install all needed dependencies.
 #Pipfile.lock

 # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 __pypackages__/

 # Celery stuff
 celerybeat-schedule
 celerybeat.pid

 # SageMath parsed files
 *.sage.py

 # Environments
 .env
 .venv
 env/
 venv/
 ENV/
 env.bak/
 venv.bak/

 # Spyder project settings
 .spyderproject
 .spyproject

 # Rope project settings
 .ropeproject

 # mkdocs documentation
 /site

 # mypy
 .mypy_cache/
 .dmypy.json
 dmypy.json

 # Pyre type checker
 .pyre/
--- a/build/requirements.txt
+++ b/build/requirements.txt
@@ -1,2 +0,0 @@
 flake8>=3.5.0
 httplib2==0.19.0
--- a/build/validate_format.py
+++ b/build/validate_format.py
@@ -1,168 +0,0 @@
 #!/usr/bin/env python3

 import re
 import sys

 # Markdown prefix that starts a section (category) heading line.
 anchor = '###'
 # Smallest number of table rows a section may contain.
 min_entries_per_section = 3
 # Accepted values for the Auth column once backticks are removed.
 auth_keys = ['apiKey', 'OAuth', 'X-Mashape-Key', 'No', 'User-Agent']
 # Characters a description must not end with.
 punctuation = ['.', '?', '!']
 # Accepted values for the HTTPS column.
 https_keys = ['Yes', 'No']
 # Accepted values for the CORS column.
 cors_keys = ['Yes', 'No', 'Unknown']

 # Positions of the cells inside one table row.
 index_title = 0
 index_desc = 1
 index_auth = 2
 index_https = 3
 index_cors = 4
 index_link = 5
 # Minimum number of cells a row must have.
 num_segments = 5

 # Collected error messages and the section names listed in the index.
 errors = []
 title_links = []
 # Raw strings keep the regex escapes (\s, \*, \[ ...) from being read as
 # invalid Python string escapes, which newer interpreters warn about.
 anchor_re = re.compile(anchor + r'\s(.+)')
 section_title_re = re.compile(r'\*\s\[(.*)\]')
 link_re = re.compile(r'\[(.+)\]\((http.*)\)')


 def add_error(line_num, message):
    """Record a validation failure for a zero-based line index.

    The stored text starts with the one-based line number, zero-padded to at
    least three digits, followed by the message, e.g. ``(L004) message``.
    """
    errors.append(f'(L{line_num + 1:03d}) {message}')


 def check_alphabetical(lines):
    """
    checks if all entries per section are in alphabetical order based in entry title

    lines: the file content as a list of lines.
    Titles are compared upper-cased; one error is recorded per unsorted
    section, attributed to the line of the section heading.
    """
    sections = {}
    section_line_num = {}
    # No section is open until the first heading has been seen.
    category = None
    for line_num, line in enumerate(lines):
        if line.startswith(anchor):
            category = line.split(anchor)[1].strip()
            sections[category] = []
            section_line_num[category] = line_num
            continue
        if not line.startswith('|') or line.startswith('|---'):
            continue
        if category is None:
            # A table row before any heading belongs to no section, so there
            # is nothing to order it against (previously an unbound-name crash).
            continue
        cells = line.split('|')[1:-1]
        if not cells:
            # A line such as "|" or "|text" has no cell between two pipes.
            continue
        raw_title = cells[0].strip()
        title_re_match = link_re.match(raw_title)
        if title_re_match:
            sections[category].append(title_re_match.group(1).upper())

    for category, entries in sections.items():
        if sorted(entries) != entries:
            add_error(section_line_num[category], "{} section is not in alphabetical order".format(category))


 def check_entry(line_num, segments):
    """
    validates the cells of one table row, recording every problem via add_error

    line_num: zero-based index of the row, forwarded to add_error.
    segments: cell texts of the row; read at index_title, index_desc,
        index_auth, index_https and index_cors.
    """
    # START Title
    raw_title = segments[index_title]
    title_re_match = link_re.match(raw_title)
    # url should be wrapped in '[TITLE](LINK)' Markdown syntax
    if not title_re_match:
        add_error(line_num, 'Title syntax should be "[TITLE](LINK)"')
    else:
        # do not allow "... API" in the entry title
        title = title_re_match.group(1)
        if title.upper().endswith(' API'):
            add_error(line_num, 'Title should not end with "... API". Every entry is an API here!')
    # END Title
    # START Description
    description = segments[index_desc]
    if not description:
        # An empty cell has no first or last character to inspect; report it
        # as an error instead of failing with IndexError.
        add_error(line_num, "description should not be empty")
    else:
        # first character should be capitalized
        char = description[0]
        if char.upper() != char:
            add_error(line_num, "first character of description is not capitalized")
        # last character should not punctuation
        char = description[-1]
        if char in punctuation:
            add_error(line_num, "description should not end with {}".format(char))
        desc_length = len(description)
        if desc_length > 100:
            add_error(line_num, "description should not exceed 100 characters (currently {})".format(desc_length))
    # END Description
    # START Auth
    # values should conform to valid options only
    auth = segments[index_auth]
    # 'No' is the only value allowed without surrounding backticks
    if auth != 'No' and (not auth.startswith('`') or not auth.endswith('`')):
        add_error(line_num, "auth value is not enclosed with `backticks`")
    if auth.replace('`', '') not in auth_keys:
        add_error(line_num, "{} is not a valid Auth option".format(auth))
    # END Auth
    # START HTTPS
    # values should conform to valid options only
    https = segments[index_https]
    if https not in https_keys:
        add_error(line_num, "{} is not a valid HTTPS option".format(https))
    # END HTTPS
    # START CORS
    # values should conform to valid options only
    cors = segments[index_cors]
    if cors not in cors_keys:
        add_error(line_num, "{} is not a valid CORS option".format(cors))
    # END CORS


 def _check_min_entries(category, category_line, num_in_category):
    """adds an error when a section holds fewer entries than the required minimum"""
    if num_in_category < min_entries_per_section:
        add_error(category_line, "{} section does not have the minimum {} entries (only has {})".format(
            category, min_entries_per_section, num_in_category))


 def check_format(filename):
    """
    validates that each line is formatted correctly,
    appending to error list as needed

    filename: path of the Markdown file to validate.
    Checks, in order: alphabetical order per section, that every section
    heading is listed as a title link, that every section (including the
    last one) has the minimum number of entries, and the format of each row.
    """
    # Explicit UTF-8 so reading does not depend on the platform default encoding.
    with open(filename, encoding='utf-8') as fp:
        lines = [line.rstrip() for line in fp]
    check_alphabetical(lines)
    # START Check Entries
    # Starts above the minimum so nothing is reported before the first section.
    num_in_category = min_entries_per_section + 1
    category = ""
    category_line = 0
    for line_num, line in enumerate(lines):
        title_link_match = section_title_re.match(line)
        if title_link_match:
            title_links.append(title_link_match.group(1))
        # check each section for the minimum number of entries
        if line.startswith(anchor):
            match = anchor_re.match(line)
            if match:
                if match.group(1) not in title_links:
                    add_error(line_num, "section header ({}) not added as a title link".format(match.group(1)))
                # Full heading text, so multi-word section names are not cut short.
                next_category = match.group(1)
            else:
                add_error(line_num, "section header is not formatted correctly")
                # Best-effort name for a malformed heading such as a bare "###".
                next_category = line[len(anchor):].strip()
            # The previous section ends here, so its size can be judged now.
            _check_min_entries(category, category_line, num_in_category)
            category = next_category
            category_line = line_num
            num_in_category = 0
            continue
        # skips lines that we do not care about
        if not line.startswith('|') or line.startswith('|---'):
            continue
        num_in_category += 1
        segments = line.split('|')[1:-1]
        if len(segments) < num_segments:
            add_error(line_num, "entry does not have all the required sections (have {}, need {})".format(
                len(segments), num_segments))
            continue
        # START Global
        for segment in segments:
            # every line segment should start and end with exactly 1 space
            if len(segment) - len(segment.lstrip()) != 1 or len(segment) - len(segment.rstrip()) != 1:
                add_error(line_num, "each segment must start and end with exactly 1 space")
        # END Global
        segments = [seg.strip() for seg in segments]
        check_entry(line_num, segments)
    # The last section is not followed by another heading, so check it here.
    _check_min_entries(category, category_line, num_in_category)
    # END Check Entries


 def main():
    """Validate the file named by the first command-line argument.

    Exits with status 1 when no argument is given or when any error was
    collected; collected errors are printed one per line before exiting.
    """
    cli_args = sys.argv
    if len(cli_args) < 2:
        print("No file passed (file should contain Markdown table syntax)")
        sys.exit(1)
    check_format(cli_args[1])
    if not errors:
        return
    for err in errors:
        print(err)
    sys.exit(1)


 if __name__ == "__main__":
    main()
--- a/build/validate_links.py
+++ b/build/validate_links.py
@@ -1,100 +0,0 @@
 #!/usr/bin/env python3

 import httplib2
 import re
 import socket
 import sys


 def parse_links(filename):
    """Returns a list of URLs from text file

    filename: path of a UTF-8 text file.
    Only the text from the first '## Index' marker onwards is scanned; when
    the marker is absent the whole file is scanned. Trailing '/' characters
    are removed from every returned URL.
    """
    with open(filename, mode='r', encoding='utf-8') as fp:
        readme = fp.read()

    index_section = readme.find('## Index')
    # str.find returns -1 when the marker is missing; slicing from -1 would
    # keep only the last character and silently yield no links.
    content = readme[index_section:] if index_section != -1 else readme

    # Raw string: the pattern is unchanged, but its backslashes are no longer
    # interpreted (and warned about) as Python string escapes.
    raw_links = re.findall(
        r'((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'\".,<>?«»“”‘’]))',
        content)

    # findall returns one tuple per match; element 0 is the whole URL.
    links = [
        str(raw_link[0]).rstrip('/') for raw_link in raw_links
    ]

    return links

 def dup_links(links):
    """Check for duplicated links

    links: iterable of hashable link values (URL strings).
    Prints the outcome and returns True when at least one link occurs more
    than once, False otherwise. Each duplicated link is reported once, no
    matter how many times it repeats.
    """
    print('Checking for duplicated links...')
    seen = set()
    reported = set()
    dupes = []

    for link in links:
        if link not in seen:
            seen.add(link)
        elif link not in reported:
            # Second occurrence: report it once and ignore further repeats.
            reported.add(link)
            dupes.append(link)

    if not dupes:
        print("No duplicate links")
        return False

    print(f"Found duplicate links: {dupes}")
    return True

 def validate_links(links):
    """Requests every link and reports the ones that fail

    links: list of URL strings; each is requested with a trailing "/" appended.
    Returns True when at least one link answered with a status code >= 400 or
    raised an exception, False otherwise. One line is printed per failing link.
    """
    print(f'Validating {len(links)} links...')
    hasError = False
    for link in links:
        # NOTE(review): certificate validation is disabled for every request - confirm this is intended.
        h = httplib2.Http(disable_ssl_certificate_validation=True, timeout=25)
        try:
            # fetching host name, removing leading www
            host = link.split('//', 1)[1].split('/', 1)[0]
            # Only strip an exact "www." prefix; a bare 'www' test would also
            # cut four characters from hosts such as "www2.example.com".
            if host.startswith('www.'):
                host = host[4:]

            resp = h.request(link + "/", headers={
                # Faking user agent as some hosting services block not-whitelisted UA
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36',
                # setting host because Cloudflare returns 403 asking for captcha if host is missing
                'host': host
            })
            code = int(resp[0]['status'])
            # Checking status code errors
            if (code >= 400):
                hasError = True
                print(f"ERR:CLT:{code} : {link}")
        except (TimeoutError, socket.timeout):
            # socket.timeout is listed explicitly because it is only an alias
            # of TimeoutError on newer interpreters; it must precede the
            # socket.error clause to be reported as a timeout.
            hasError = True
            print(f"ERR:TMO: {link}")
        except socket.error as socketerror:
            hasError = True
            print(f"ERR:SOC: {socketerror} : {link}")
        except Exception as e:
            hasError = True
            # Classify well-known failures so the report is easier to read;
            # every one of them still counts as an error.
            # The list below should be extended with other exceptions in the future if needed
            if (-1 != str(e).find("[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:852)")):
                print(f"ERR:SSL: {e} : {link}")
            elif (-1 != str(e).find("Content purported to be compressed with gzip but failed to decompress.")):
                print(f"ERR:GZP: {e} : {link}")
            elif (-1 != str(e).find("Unable to find the server at")):
                print(f"ERR:SRV: {e} : {link}")
            else:
                print(f"ERR:UKN: {e} : {link}")
    return hasError

 if __name__ == "__main__":
    # The Markdown file to check is the first command-line argument.
    if len(sys.argv) < 2:
        print("No .md file passed")
        sys.exit(1)
    links = parse_links(sys.argv[1])
    # Short-circuit: links are only requested when no duplicates were found.
    if dup_links(links) or validate_links(links):
        sys.exit(1)
--- a/scripts/README.md
+++ b/scripts/README.md
@@ -0,0 +1,75 @@
 # Public APIs Scripts

 This directory contains all validation and testing scripts used by Public APIs.

 ```bash
 scripts
 │   github_pull_request.sh  # used to validate changes of a pull request
 │   requirements.txt  # contains dependencies of validate package
 │
 ├───tests  # contains all unit tests from the validate package
 │       test_validate_format.py
 │       test_validate_links.py
 │
 └───validate  # validate package
        format.py
        links.py
 ```

 ## Install dependencies

 You must have [python](https://www.python.org/) installed to use these scripts.

 It is also necessary to install the validation package dependencies; use the [pip package manager](https://pypi.org/project/pip/) for this:

 ```bash
 $ python -m pip install -r scripts/requirements.txt
 ```

 ## Run validations

 To run format validation on the `README.md` file, run the following from the root directory of public-apis:

 ```bash
 $ python scripts/validate/format.py README.md
 ```

 To run link validation on the `README.md` file, run the following from the root directory of public-apis:

 ```bash
 $ python scripts/validate/links.py README.md
 ```

 As there are many links to check, this process can take some time. If you do not need to check whether the links are working, you can check only for duplicate links. Run:

 ```bash
 $ python scripts/validate/links.py README.md -odlc
 ```

 *`-odlc` is an abbreviation of `--only_duplicate_links_checker`*

 ## Running Tests

 To run all tests it is necessary to change to the scripts directory:

 ```bash
 $ cd scripts
 ```

 then run:

 ```bash
 $ python -m unittest discover tests/ --verbose
 ```

 To run only the format tests, run:

 ```bash
 $ python -m unittest discover tests/ --verbose --pattern "test_validate_format.py"
 ```

 To run only the links tests, run:

 ```bash
 $ python -m unittest discover tests/ --verbose --pattern "test_validate_links.py"
 ```
--- a/scripts/github_pull_request.sh
+++ b/scripts/github_pull_request.sh
@@ -4,20 +4,20 @@ set -e

 # Argument validation
 if [ $# -ne 3 ]; then
    echo "Usage: $0 <github-repo> <pull-number> <format-file>"
    echo "Usage: $0 <github-repo> <pull-number> <filename>"
    exit 1
 fi

 # Assign variables
 GITHUB_REPOSITORY="$1"
 GITHUB_PULL_REQUEST="$2"
 FORMAT_FILE="$3"
 FILENAME="$3"

 # Move to root of project
 cd "$GITHUB_WORKSPACE"

 # Determine files
 FORMAT_FILE="$( realpath "${FORMAT_FILE}" )"
 FILENAME="$( realpath "${FILENAME}" )"

 # Skip if build number could not be determined
 if [ -z "$GITHUB_REPOSITORY" -o -z "$GITHUB_PULL_REQUEST" ]; then
@@ -31,7 +31,7 @@ echo "running on Pull Request #$GITHUB_PULL_REQUEST"
 # Trick the URL validator python script into not seeing this as a URL
 DUMMY_SCHEME="https"
 DIFF_URL="$DUMMY_SCHEME://patch-diff.githubusercontent.com/raw/$GITHUB_REPOSITORY/pull/$GITHUB_PULL_REQUEST.diff"
 curl -L -o diff.txt "$DIFF_URL"
 curl -L "$DIFF_URL" -o diff.txt

 # Construct diff
 echo "------- BEGIN DIFF -------"
@@ -45,13 +45,13 @@ echo "------- END ADDITIONS ------"
 LINK_FILE=additions.txt

 # Validate links
 echo "Running link validation..."
 ./build/validate_links.py "$LINK_FILE"
 echo "Running link validation on additions..."
 python scripts/validate/links.py "$LINK_FILE"

 # Verbosity
 if [[ $? != 0 ]]; then
    echo "link validation failed!"
    echo "link validation failed on additions!"
    exit 1
 else
    echo "link validation passed!"
    echo "link validation passed on additions!"
 fi
--- a/scripts/requirements.txt
+++ b/scripts/requirements.txt
@@ -0,0 +1,5 @@
 certifi==2021.10.8
 charset-normalizer==2.0.10
 idna==3.3
 requests==2.27.1
 urllib3==1.26.8
--- a/scripts/tests/init.py
+++ b/scripts/tests/init.py
@@ -0,0 +1 @@
 # -*- coding: utf-8 -*-
--- a/scripts/tests/test_validate_format.py
+++ b/scripts/tests/test_validate_format.py
@@ -0,0 +1,466 @@
 # -*- coding: utf-8 -*-

 import unittest

 from validate.format import error_message
 from validate.format import get_categories_content
 from validate.format import check_alphabetical_order
 from validate.format import check_title
 from validate.format import check_description, max_description_length
 from validate.format import check_auth, auth_keys
 from validate.format import check_https, https_keys
 from validate.format import check_cors, cors_keys
 from validate.format import check_entry
 from validate.format import check_file_format, min_entries_per_category, num_segments


 class TestValidadeFormat(unittest.TestCase):
    
    def test_error_message_return_and_return_type(self):
        # error_message must return a str that starts with the line number
        # plus one, zero-padded to at least three digits.
        msg = 'This is a unit test'
        expected_by_line_num = {
            1: '(L002) This is a unit test',
            10: '(L011) This is a unit test',
            100: '(L101) This is a unit test',
            1000: '(L1001) This is a unit test',
        }

        results = {
            line_num: error_message(line_num, msg)
            for line_num in expected_by_line_num
        }

        for result in results.values():
            self.assertIsInstance(result, str)

        for line_num, expected in expected_by_line_num.items():
            self.assertEqual(results[line_num], expected)

    def test_if_get_categories_content_return_correct_data_of_categories(self):
        # get_categories_content must return a tuple of two dicts: entry
        # titles per category and the heading line index per category.
        fake_contents = [
            '### A',
            'API | Description | Auth | HTTPS | CORS |',
            '|---|---|---|---|---|',
            '| [AA](https://www.ex.com) | Desc | `apiKey` | Yes | Yes |',
            '| [AB](https://www.ex.com) | Desc | `apiKey` | Yes | Yes |',
            '',
            '### B',
            'API | Description | Auth | HTTPS | CORS |',
            '|---|---|---|---|---|',
            '| [BA](https://www.ex.com) | Desc | `apiKey` | Yes | Yes |',
            '| [BB](https://www.ex.com) | Desc | `apiKey` | Yes | Yes |'
        ]

        result = get_categories_content(fake_contents)
        self.assertIsInstance(result, tuple)

        categories, category_line_num = result
        self.assertIsInstance(categories, dict)
        self.assertIsInstance(category_line_num, dict)

        expected_result = ({'A': ['AA', 'AB'], 'B': ['BA', 'BB']}, {'A': 0, 'B': 6})

        # Label each subtest so a failure names the element that differed.
        for position, (res, ex_res) in enumerate(zip(result, expected_result)):
            with self.subTest(position=position, expected=ex_res):
                self.assertEqual(res, ex_res)

    def test_if_check_alphabetical_order_return_correct_msg_error(self):
        # Sorted categories must produce no message; each unsorted category
        # must produce one message attributed to its heading line.
        correct_lines = [
            '### A',
            'API | Description | Auth | HTTPS | CORS |',
            '|---|---|---|---|---|',
            '| [AA](https://www.ex.com) | Desc | `apiKey` | Yes | Yes |',
            '| [AB](https://www.ex.com) | Desc | `apiKey` | Yes | Yes |',
            '',
            '### B',
            'API | Description | Auth | HTTPS | CORS |',
            '|---|---|---|---|---|',
            '| [BA](https://www.ex.com) | Desc | `apiKey` | Yes | Yes |',
            '| [BB](https://www.ex.com) | Desc | `apiKey` | Yes | Yes |'
        ]

        incorrect_lines = [
            '### A',
            'API | Description | Auth | HTTPS | CORS |',
            '|---|---|---|---|---|',
            '| [AB](https://www.ex.com) | Desc | `apiKey` | Yes | Yes |',
            '| [AA](https://www.ex.com) | Desc | `apiKey` | Yes | Yes |',
            '',
            '### B',
            'API | Description | Auth | HTTPS | CORS |',
            '|---|---|---|---|---|',
            '| [BB](https://www.ex.com) | Desc | `apiKey` | Yes | Yes |',
            '| [BA](https://www.ex.com) | Desc | `apiKey` | Yes | Yes |'
        ]

        err_msgs_1 = check_alphabetical_order(correct_lines)
        err_msgs_2 = check_alphabetical_order(incorrect_lines)

        self.assertIsInstance(err_msgs_1, list)
        self.assertIsInstance(err_msgs_2, list)

        self.assertEqual(len(err_msgs_1), 0)
        self.assertEqual(len(err_msgs_2), 2)

        expected_err_msgs = [
            '(L001) A category is not alphabetical order',
            '(L007) B category is not alphabetical order'
        ]

        # Label each subtest so a failure names the message that was expected.
        for err_msg, ex_err_msg in zip(err_msgs_2, expected_err_msgs):
            with self.subTest(expected=ex_err_msg):
                self.assertEqual(err_msg, ex_err_msg)
    
    def test_check_title_with_correct_title(self):
        # A well-formed "[TITLE](LINK)" title must yield an empty list.
        result = check_title(0, '[A](https://www.ex.com)')

        self.assertIsInstance(result, list)
        self.assertEqual(len(result), 0)
        self.assertEqual(result, [])

    def test_check_title_with_markdown_syntax_incorrect(self):
        # The closing bracket of the title is missing on purpose.
        result = check_title(0, '[A(https://www.ex.com)')

        self.assertIsInstance(result, list)
        self.assertEqual(len(result), 1)

        self.assertEqual(result[0], '(L001) Title syntax should be "[TITLE](LINK)"')

    def test_check_title_with_api_at_the_end_of_the_title(self):
        # A title ending in " API" must be rejected with exactly one message.
        result = check_title(0, '[A API](https://www.ex.com)')

        self.assertIsInstance(result, list)
        self.assertEqual(len(result), 1)

        self.assertEqual(
            result[0],
            '(L001) Title should not end with "... API". Every entry is an API here!'
        )

    def test_check_description_with_correct_description(self):
        # A capitalized description without trailing punctuation must yield an empty list.
        result = check_description(0, 'This is a fake description')

        self.assertIsInstance(result, list)
        self.assertEqual(len(result), 0)
        self.assertEqual(result, [])
    
    def test_check_description_with_first_char_is_not_capitalized(self):
        # The description starts with a lower-case letter on purpose.
        result = check_description(0, 'this is a fake description')

        self.assertIsInstance(result, list)
        self.assertEqual(len(result), 1)

        only_msg = result[0]
        self.assertIsInstance(only_msg, str)
        self.assertEqual(only_msg, '(L001) first character of description is not capitalized')
    
    def test_check_description_with_punctuation_in_the_end(self):
        # Every description ending in one of the characters below must yield
        # exactly one message that names the offending character.
        base_desc = 'This is a fake description'
        punctuation = r"""!"#$%&'*+,-./:;<=>?@[\]^_`{|}~"""

        for punc in punctuation:
            desc = base_desc + punc

            # Name the character under test so a failing subtest is identifiable.
            with self.subTest(punctuation=punc):
                err_msgs = check_description(0, desc)

                self.assertIsInstance(err_msgs, list)
                self.assertEqual(len(err_msgs), 1)

                err_msg = err_msgs[0]
                expected_err_msg = f'(L001) description should not end with {punc}'

                self.assertIsInstance(err_msg, str)
                self.assertEqual(err_msg, expected_err_msg)

    def test_check_description_that_exceeds_the_character_limit(self):
        # Four characters repeated max_description_length times is longer than the limit.
        too_long = 'Desc' * max_description_length

        result = check_description(0, too_long)

        self.assertIsInstance(result, list)
        self.assertEqual(len(result), 1)

        only_msg = result[0]
        wanted = f'(L001) description should not exceed {max_description_length} characters (currently {len(too_long)})'

        self.assertIsInstance(only_msg, str)
        self.assertEqual(only_msg, wanted)

    def test_check_auth_with_valid_auth(self):
        # Every key except 'No' is wrapped in backticks; 'No' is passed bare.
        auth_valid = [f'`{auth}`' for auth in auth_keys if auth != 'No']
        auth_valid.append('No')

        for auth in auth_valid:
            # Name the value under test so a failing subtest is identifiable.
            with self.subTest(auth=auth):
                err_msgs = check_auth(0, auth)
                self.assertIsInstance(err_msgs, list)
                self.assertEqual(len(err_msgs), 0)
                self.assertEqual(err_msgs, [])

    def test_check_auth_without_backtick(self):
        # Valid keys passed without backticks must yield only the backtick message.
        auth_without_backtick = [auth for auth in auth_keys if auth != 'No']

        for auth in auth_without_backtick:
            # Name the value under test so a failing subtest is identifiable.
            with self.subTest(auth=auth):
                err_msgs = check_auth(0, auth)
                self.assertIsInstance(err_msgs, list)
                self.assertEqual(len(err_msgs), 1)

                err_msg = err_msgs[0]
                expected_err_msg = '(L001) auth value is not enclosed with `backticks`'

                self.assertIsInstance(err_msg, str)
                self.assertEqual(err_msg, expected_err_msg)

    def test_check_auth_with_invalid_auth(self):
        # Unknown values without backticks must yield two messages (backticks
        # and invalid option); with backticks only the invalid-option message.
        auth_invalid_without_backtick = ['Yes', 'yes', 'no', 'random', 'Unknown']
        auth_invalid_with_backtick = ['`Yes`', '`yes`', '`no`', '`random`', '`Unknown`']

        for auth in auth_invalid_without_backtick:
            # Name the value under test so a failing subtest is identifiable.
            with self.subTest(auth=auth):
                err_msgs = check_auth(0, auth)
                self.assertIsInstance(err_msgs, list)
                self.assertEqual(len(err_msgs), 2)

                err_msg_1 = err_msgs[0]
                err_msg_2 = err_msgs[1]

                expected_err_msg_1 = '(L001) auth value is not enclosed with `backticks`'
                expected_err_msg_2 = f'(L001) {auth} is not a valid Auth option'

                self.assertIsInstance(err_msg_1, str)
                self.assertIsInstance(err_msg_2, str)
                self.assertEqual(err_msg_1, expected_err_msg_1)
                self.assertEqual(err_msg_2, expected_err_msg_2)

        for auth in auth_invalid_with_backtick:
            with self.subTest(auth=auth):
                err_msgs = check_auth(0, auth)
                self.assertIsInstance(err_msgs, list)
                self.assertEqual(len(err_msgs), 1)

                err_msg = err_msgs[0]
                expected_err_msg = f'(L001) {auth} is not a valid Auth option'

                self.assertIsInstance(err_msg, str)
                self.assertEqual(err_msg, expected_err_msg)

    def test_check_https_with_valid_https(self):
        # Every value in https_keys must be accepted without any message.
        for https in https_keys:
            # Name the value under test so a failing subtest is identifiable.
            with self.subTest(https=https):
                err_msgs = check_https(0, https)
                self.assertIsInstance(err_msgs, list)
                self.assertEqual(len(err_msgs), 0)
                self.assertEqual(err_msgs, [])

    def test_check_https_with_invalid_https(self):
        # Each value below must be rejected with exactly one message.
        invalid_https_keys = ['yes', 'no', 'Unknown', 'https', 'http']

        for https in invalid_https_keys:
            # Name the value under test so a failing subtest is identifiable.
            with self.subTest(https=https):
                err_msgs = check_https(0, https)
                self.assertIsInstance(err_msgs, list)
                self.assertEqual(len(err_msgs), 1)

                err_msg = err_msgs[0]
                expected_err_msg = f'(L001) {https} is not a valid HTTPS option'

                self.assertIsInstance(err_msg, str)
                self.assertEqual(err_msg, expected_err_msg)

    def test_check_cors_with_valid_cors(self):
        # Every value in cors_keys must be accepted without any message.
        for cors in cors_keys:
            # Name the value under test so a failing subtest is identifiable.
            with self.subTest(cors=cors):
                err_msgs = check_cors(0, cors)
                self.assertIsInstance(err_msgs, list)
                self.assertEqual(len(err_msgs), 0)
                self.assertEqual(err_msgs, [])

    def test_check_cors_with_invalid_cors(self):
        # Each value below must be rejected with exactly one message.
        invalid_cors_keys = ['yes', 'no', 'unknown', 'cors']

        for cors in invalid_cors_keys:
            # Name the value under test so a failing subtest is identifiable.
            with self.subTest(cors=cors):
                err_msgs = check_cors(0, cors)
                self.assertIsInstance(err_msgs, list)
                self.assertEqual(len(err_msgs), 1)

                err_msg = err_msgs[0]
                expected_err_msg = f'(L001) {cors} is not a valid CORS option'

                self.assertIsInstance(err_msg, str)
                self.assertEqual(err_msg, expected_err_msg)

    def test_check_entry_with_correct_segments(self):
        # A row whose five cells are all valid must yield an empty list.
        valid_row = ['[A](https://www.ex.com)', 'Desc', '`apiKey`', 'Yes', 'Yes']

        result = check_entry(0, valid_row)

        self.assertIsInstance(result, list)
        self.assertEqual(len(result), 0)
        self.assertEqual(result, [])

    def test_check_entry_with_incorrect_segments(self):
        # Every cell of the row is invalid; the messages must come back in
        # the order title, description, auth, https, cors.
        incorrect_segments = ['[A API](https://www.ex.com)', 'desc.', 'yes', 'yes', 'yes']

        err_msgs = check_entry(0, incorrect_segments)
        expected_err_msgs = [
            '(L001) Title should not end with "... API". Every entry is an API here!',
            '(L001) first character of description is not capitalized',
            '(L001) description should not end with .',
            '(L001) auth value is not enclosed with `backticks`',
            '(L001) yes is not a valid Auth option',
            '(L001) yes is not a valid HTTPS option',
            '(L001) yes is not a valid CORS option'
        ]

        self.assertIsInstance(err_msgs, list)
        self.assertEqual(len(err_msgs), 7)
        for position, err_msg in enumerate(err_msgs):
            # Label each subtest so a failure names the offending element.
            with self.subTest(position=position, err_msg=err_msg):
                self.assertIsInstance(err_msg, str)
        self.assertEqual(err_msgs, expected_err_msgs)

    def test_check_file_format_with_correct_format(self):
        """An index plus two complete, indexed categories yields no errors."""
        # Header and separator rows are identical for every category table
        table_head = [
            'API | Description | Auth | HTTPS | CORS |',
            '|---|---|---|---|---|',
        ]
        document = [
            '## Index',
            '* [A](#a)',
            '* [B](#b)',
            '',
            '### A',
            *table_head,
            '| [AA](https://www.ex.com) | Desc | `apiKey` | Yes | Yes |',
            '| [AB](https://www.ex.com) | Desc | `apiKey` | Yes | Yes |',
            '| [AC](https://www.ex.com) | Desc | `apiKey` | Yes | Yes |',
            '',
            '### B',
            *table_head,
            '| [BA](https://www.ex.com) | Desc | `apiKey` | Yes | Yes |',
            '| [BB](https://www.ex.com) | Desc | `apiKey` | Yes | Yes |',
            '| [BC](https://www.ex.com) | Desc | `apiKey` | Yes | Yes |'
        ]

        reported = check_file_format(lines=document)

        self.assertIsInstance(reported, list)
        self.assertEqual(len(reported), 0)
        self.assertEqual(reported, [])

    def test_check_file_format_with_category_header_not_added_to_index(self):
        """A category missing from the Index section is reported on its header line."""
        entry_rows = [
            '| [AA](https://www.ex.com) | Desc | `apiKey` | Yes | Yes |',
            '| [AB](https://www.ex.com) | Desc | `apiKey` | Yes | Yes |',
            '| [AC](https://www.ex.com) | Desc | `apiKey` | Yes | Yes |',
        ]
        # The index is left empty on purpose: category A is never listed in it
        document = [
            '## Index',
            '',
            '### A',
            'API | Description | Auth | HTTPS | CORS |',
            '|---|---|---|---|---|',
            *entry_rows,
        ]

        expected = '(L003) category header (A) not added to Index section'
        reported = check_file_format(lines=document)

        self.assertIsInstance(reported, list)
        self.assertEqual(len(reported), 1)
        self.assertEqual(reported[0], expected)

    def test_check_file_format_with_category_without_min_entries(self):
        """A category holding a single entry is reported as below the minimum."""
        table_head = [
            'API | Description | Auth | HTTPS | CORS |',
            '|---|---|---|---|---|',
        ]
        # Category A has one entry only, category B has three
        document = [
            '## Index',
            '* [A](#a)',
            '* [B](#b)',
            '',
            '### A',
            *table_head,
            '| [AA](https://www.ex.com) | Desc | `apiKey` | Yes | Yes |',
            '',
            '### B',
            *table_head,
            '| [BA](https://www.ex.com) | Desc | `apiKey` | Yes | Yes |',
            '| [BB](https://www.ex.com) | Desc | `apiKey` | Yes | Yes |',
            '| [BC](https://www.ex.com) | Desc | `apiKey` | Yes | Yes |'
        ]

        short_category = 'A'
        entries_found = 1
        expected = f'(L005) {short_category} category does not have the minimum {min_entries_per_category} entries (only has {entries_found})'

        reported = check_file_format(lines=document)

        self.assertIsInstance(reported, list)
        self.assertEqual(len(reported), 1)
        self.assertEqual(reported[0], expected)

    def test_check_file_format_entry_without_all_necessary_columns(self):
        """A row with too few columns is reported with the found and required counts."""
        truncated_row = '| [AB](https://www.ex.com) | Desc | `apiKey` |'  # missing https and cors
        document = [
            '## Index',
            '* [A](#a)',
            '',
            '### A',
            'API | Description | Auth | HTTPS | CORS |',
            '|---|---|---|---|---|',
            '| [AA](https://www.ex.com) | Desc | `apiKey` | Yes | Yes |',
            truncated_row,
            '| [AC](https://www.ex.com) | Desc | `apiKey` | Yes | Yes |',
        ]

        columns_found = 3
        expected = f'(L008) entry does not have all the required columns (have {columns_found}, need {num_segments})'

        reported = check_file_format(lines=document)

        self.assertIsInstance(reported, list)
        self.assertEqual(len(reported), 1)
        self.assertEqual(reported[0], expected)

    def test_check_file_format_without_1_space_between_the_segments(self):
        """A cell that is not padded by one space on each side is reported once."""
        unpadded_row = '| [AA](https://www.ex.com) | Desc |`apiKey`| Yes | Yes |'  # auth cell has no surrounding spaces
        document = [
            '## Index',
            '* [A](#a)',
            '',
            '### A',
            'API | Description | Auth | HTTPS | CORS |',
            '|---|---|---|---|---|',
            unpadded_row,
            '| [AB](https://www.ex.com) | Desc | `apiKey` | Yes | Yes |',
            '| [AC](https://www.ex.com) | Desc | `apiKey` | Yes | Yes |',
        ]

        expected = '(L007) each segment must start and end with exactly 1 space'
        reported = check_file_format(lines=document)

        self.assertIsInstance(reported, list)
        self.assertEqual(len(reported), 1)
        self.assertEqual(reported[0], expected)
--- a/scripts/tests/test_validate_links.py
+++ b/scripts/tests/test_validate_links.py
@@ -0,0 +1,172 @@
 # -*- coding: utf-8 -*-

 import unittest

 from validate.links import find_links_in_text
 from validate.links import check_duplicate_links
 from validate.links import fake_user_agent
 from validate.links import get_host_from_link
 from validate.links import has_cloudflare_protection


 class FakeResponse:
    """Bare object carrying a status code, headers and body text for the tests."""

    def __init__(self, code: int, headers: dict, text: str) -> None:
        # ``code`` is exposed under the attribute name ``status_code``
        self.text = text
        self.headers = headers
        self.status_code = code


 class TestValidateLinks(unittest.TestCase):
    """Unit tests for the helpers imported from ``validate.links``."""

    def setUp(self):
        # Three copies of the same link plus one unique link
        self.duplicate_links = [
            'https://www.example.com',
            'https://www.example.com',
            'https://www.example.com',
            'https://www.anotherexample.com',
        ]
        self.no_duplicate_links = [
            'https://www.firstexample.com',
            'https://www.secondexample.com',
            'https://www.anotherexample.com',
        ]

        self.code_200 = 200
        self.code_403 = 403
        self.code_503 = 503

        self.cloudflare_headers = {'Server': 'cloudflare'}
        self.no_cloudflare_headers = {'Server': 'google'}

        self.text_with_cloudflare_flags = '403 Forbidden Cloudflare We are checking your browser...'
        self.text_without_cloudflare_flags = 'Lorem Ipsum'

    def test_find_link_in_text(self):
        """Only the seven well-formed URLs of the sample text are extracted."""
        text = """
            # this is valid

            http://example.com?param1=1&param2=2#anchor
            https://www.example.com?param1=1&param2=2#anchor
            https://www.example.com.br
            https://www.example.com.gov.br
            [Example](https://www.example.com?param1=1&param2=2#anchor)
            lorem ipsum https://www.example.com?param1=1&param2=2#anchor
            https://www.example.com?param1=1&param2=2#anchor lorem ipsum

            # this not is valid

            example.com
            https:example.com
            https:/example.com
            https//example.com
            https//.com
        """

        links = find_links_in_text(text)

        self.assertIsInstance(links, list)
        self.assertEqual(len(links), 7)

        for link in links:
            with self.subTest(link=link):
                self.assertIsInstance(link, str)

    def test_find_link_in_text_with_invalid_argument(self):
        """A missing or non-text argument raises ``TypeError``."""
        # Each call needs its own assertRaises block: the block is left as
        # soon as the first exception is raised, so calls sharing one block
        # after the first would never be executed.
        with self.assertRaises(TypeError):
            find_links_in_text()

        for invalid_argument in (1, True):
            with self.subTest(invalid_argument=invalid_argument):
                with self.assertRaises(TypeError):
                    find_links_in_text(invalid_argument)

    def test_if_check_duplicate_links_has_the_correct_return(self):
        """The result is a ``(flag, duplicates)`` tuple for both kinds of input."""
        result_1 = check_duplicate_links(self.duplicate_links)
        result_2 = check_duplicate_links(self.no_duplicate_links)

        self.assertIsInstance(result_1, tuple)
        self.assertIsInstance(result_2, tuple)

        has_duplicate_links, links = result_1
        no_duplicate_links, no_links = result_2

        self.assertTrue(has_duplicate_links)
        self.assertFalse(no_duplicate_links)

        self.assertIsInstance(links, list)
        self.assertIsInstance(no_links, list)

        # Every repeated occurrence is listed: three copies give two duplicates
        self.assertEqual(len(links), 2)
        self.assertEqual(len(no_links), 0)

    def test_if_fake_user_agent_has_a_str_as_return(self):
        user_agent = fake_user_agent()
        self.assertIsInstance(user_agent, str)

    def test_get_host_from_link(self):
        """The host never keeps the scheme, route, query string or anchor."""
        links = [
            'example.com',
            'https://example.com',
            'https://www.example.com',
            'https://www.example.com.br',
            'https://www.example.com/route',
            'https://www.example.com?p=1&q=2',
            'https://www.example.com#anchor'
        ]

        for link in links:
            with self.subTest(link=link):
                host = get_host_from_link(link)

                self.assertIsInstance(host, str)

                self.assertNotIn('://', host)
                self.assertNotIn('/', host)
                self.assertNotIn('?', host)
                self.assertNotIn('#', host)

        with self.assertRaises(TypeError):
            get_host_from_link()

    def test_has_cloudflare_protection_with_code_403_and_503_in_response(self):
        """403 and 503 responses served by cloudflare with flag text are detected."""
        resp_with_cloudflare_protection_code_403 = FakeResponse(
            code=self.code_403,
            headers=self.cloudflare_headers,
            text=self.text_with_cloudflare_flags
        )

        resp_with_cloudflare_protection_code_503 = FakeResponse(
            code=self.code_503,
            headers=self.cloudflare_headers,
            text=self.text_with_cloudflare_flags
        )

        result1 = has_cloudflare_protection(resp_with_cloudflare_protection_code_403)
        result2 = has_cloudflare_protection(resp_with_cloudflare_protection_code_503)

        self.assertTrue(result1)
        self.assertTrue(result2)

    def test_has_cloudflare_protection_when_there_is_no_protection(self):
        """Responses from another server are never flagged, whatever the code."""
        resp_without_cloudflare_protection1 = FakeResponse(
            code=self.code_200,
            headers=self.no_cloudflare_headers,
            text=self.text_without_cloudflare_flags
        )

        resp_without_cloudflare_protection2 = FakeResponse(
            code=self.code_403,
            headers=self.no_cloudflare_headers,
            text=self.text_without_cloudflare_flags
        )

        resp_without_cloudflare_protection3 = FakeResponse(
            code=self.code_503,
            headers=self.no_cloudflare_headers,
            text=self.text_without_cloudflare_flags
        )

        result1 = has_cloudflare_protection(resp_without_cloudflare_protection1)
        result2 = has_cloudflare_protection(resp_without_cloudflare_protection2)
        result3 = has_cloudflare_protection(resp_without_cloudflare_protection3)

        self.assertFalse(result1)
        self.assertFalse(result2)
        self.assertFalse(result3)
--- a/scripts/validate/init.py
+++ b/scripts/validate/init.py
@@ -0,0 +1,4 @@
 # -*- coding: utf-8 -*-

 from validate import format
 from validate import links
--- a/scripts/validate/format.py
+++ b/scripts/validate/format.py
@@ -0,0 +1,277 @@
 # -*- coding: utf-8 -*-

 import re
 import sys
 from string import punctuation
 from typing import List, Tuple, Dict

 # Temporary replacement
 # The descriptions that contain () at the end must adapt to the new policy later
 punctuation = punctuation.replace('()', '')

 # Marker that starts a category header line
 anchor = '###'
 auth_keys = ['apiKey', 'OAuth', 'X-Mashape-Key', 'User-Agent', 'No']
 https_keys = ['Yes', 'No']
 cors_keys = ['Yes', 'No', 'Unknown']

 # Position of each column inside a table row
 index_title = 0
 index_desc = 1
 index_auth = 2
 index_https = 3
 index_cors = 4

 num_segments = 5
 min_entries_per_category = 3
 max_description_length = 100

 # Raw strings keep the regex escapes (\s, \*, \[, \() from being read as
 # invalid Python string escape sequences.
 anchor_re = re.compile(anchor + r'\s(.+)')
 category_title_in_index_re = re.compile(r'\*\s\[(.*)\]')
 link_re = re.compile(r'\[(.+)\]\((http.*)\)')

 # Type aliases
 APIList = List[str]
 Categories = Dict[str, APIList]
 CategoriesLineNumber = Dict[str, int]


 def error_message(line_number: int, message: str) -> str:
    """Prefix ``message`` with a 1-based, zero-padded line tag such as ``(L001)``."""
    # Callers pass 0-based indexes; the report shows 1-based line numbers
    return '(L{:03d}) {}'.format(line_number + 1, message)


 def get_categories_content(contents: List[str]) -> Tuple[Categories, CategoriesLineNumber]:
    """Group the entry titles of a document by category.

    Returns a tuple with two dictionaries keyed by category name: the first
    maps to the upper-cased titles found in that category's table rows, the
    second maps to the (0-based) line index of the category header.

    Table rows that appear before any category header, and rows that
    contain no cell at all, are ignored.
    """

    categories = {}
    category_line_num = {}
    category = None  # no category header seen so far

    for line_num, line_content in enumerate(contents):

        if line_content.startswith(anchor):
            category = line_content.split(anchor)[1].strip()
            categories[category] = []
            category_line_num[category] = line_num
            continue

        if not line_content.startswith('|') or line_content.startswith('|---'):
            continue

        # A row outside of any category has no title list to be added to
        if category is None:
            continue

        # Drop the empty pieces before the first and after the last pipe
        cells = line_content.split('|')[1:-1]
        if not cells:
            # e.g. a line made of a single pipe
            continue

        title_match = link_re.match(cells[0].strip())
        if title_match:
            # Upper-cased so that later comparisons ignore letter case
            categories[category].append(title_match.group(1).upper())

    return (categories, category_line_num)


 def check_alphabetical_order(lines: List[str]) -> List[str]:
    """Return one error per category whose titles are not in sorted order.

    Each error is tagged with the line of the category header.
    """

    categories, category_line_num = get_categories_content(contents=lines)

    return [
        error_message(
            category_line_num[name],
            f'{name} category is not alphabetical order'
        )
        for name, titles in categories.items()
        if titles != sorted(titles)
    ]


 def check_title(line_num: int, raw_title: str) -> List[str]:
    """Validate the title cell of an entry.

    The cell must use the "[TITLE](LINK)" Markdown syntax and the title
    itself must not end with " API" (compared case-insensitively).
    """

    title_match = link_re.match(raw_title)

    # Without the link syntax there is no title text to inspect any further
    if title_match is None:
        return [error_message(line_num, 'Title syntax should be "[TITLE](LINK)"')]

    if title_match.group(1).upper().endswith(' API'):
        return [error_message(line_num, 'Title should not end with "... API". Every entry is an API here!')]

    return []


 def check_description(line_num: int, description: str) -> List[str]:
    """Validate the description cell of an entry.

    Reports a description that is empty, does not start with a capitalized
    character, ends with a punctuation character or is longer than
    ``max_description_length`` characters.
    """

    err_msgs = []

    # An empty cell has no first or last character to inspect; report it
    # instead of failing with an IndexError on the lookups below.
    if not description:
        err_msg = error_message(line_num, 'description should not be empty')
        err_msgs.append(err_msg)
        return err_msgs

    first_char = description[0]
    if first_char.upper() != first_char:
        err_msg = error_message(line_num, 'first character of description is not capitalized')
        err_msgs.append(err_msg)

    last_char = description[-1]
    if last_char in punctuation:
        err_msg = error_message(line_num, f'description should not end with {last_char}')
        err_msgs.append(err_msg)

    desc_length = len(description)
    if desc_length > max_description_length:
        err_msg = error_message(line_num, f'description should not exceed {max_description_length} characters (currently {desc_length})')
        err_msgs.append(err_msg)

    return err_msgs


 def check_auth(line_num: int, auth: str) -> List[str]:
    """Validate the auth cell of an entry.

    Any value other than 'No' must be wrapped in backticks, and the value
    without its backticks must be one of ``auth_keys``.
    """

    backtick = '`'
    err_msgs = []

    is_enclosed = auth.startswith(backtick) and auth.endswith(backtick)
    if auth != 'No' and not is_enclosed:
        err_msgs.append(error_message(line_num, 'auth value is not enclosed with `backticks`'))

    bare_value = auth.replace(backtick, '')
    if bare_value not in auth_keys:
        err_msgs.append(error_message(line_num, f'{auth} is not a valid Auth option'))

    return err_msgs


 def check_https(line_num: int, https: str) -> List[str]:
    """Return an error when ``https`` is not one of ``https_keys``."""

    if https in https_keys:
        return []

    return [error_message(line_num, f'{https} is not a valid HTTPS option')]


 def check_cors(line_num: int, cors: str) -> List[str]:
    """Return an error when ``cors`` is not one of ``cors_keys``."""

    if cors in cors_keys:
        return []

    return [error_message(line_num, f'{cors} is not a valid CORS option')]


 def check_entry(line_num: int, segments: List[str]) -> List[str]:
    """Run every per-column check on one table row.

    ``segments`` holds the cells of the row; the errors are returned in
    column order: title, description, auth, https and cors.
    """

    # Read all five cells first so that a short row fails before any check runs
    raw_title, description, auth, https, cors = (
        segments[index]
        for index in (index_title, index_desc, index_auth, index_https, index_cors)
    )

    return (
        check_title(line_num, raw_title)
        + check_description(line_num, description)
        + check_auth(line_num, auth)
        + check_https(line_num, https)
        + check_cors(line_num, cors)
    )


 def check_file_format(lines: List[str]) -> List[str]:
    """Run every format check over the document and collect the errors.

    Alphabetical-order errors are listed first. They are followed, in
    document order, by the per-line findings: category headers missing
    from the index or badly formatted, categories with fewer than
    ``min_entries_per_category`` entries, rows with too few columns, cells
    without exactly one space of padding and the per-entry checks.

    Returns the list of error messages, empty when nothing was found.
    """

    err_msgs = []
    category_title_in_index = []

    alphabetical_err_msgs = check_alphabetical_order(lines)
    err_msgs.extend(alphabetical_err_msgs)

    # Starts above the minimum so that nothing is reported for the
    # (non-existent) category preceding the first header
    num_in_category = min_entries_per_category + 1
    category = ''
    category_line = 0

    def min_entries_err_msg() -> str:
        # Reads the category being closed at the time of the call
        return error_message(category_line, f'{category} category does not have the minimum {min_entries_per_category} entries (only has {num_in_category})')

    for line_num, line_content in enumerate(lines):

        category_title_match = category_title_in_index_re.match(line_content)
        if category_title_match:
            category_title_in_index.append(category_title_match.group(1))

        if line_content.startswith(anchor):
            category_match = anchor_re.match(line_content)
            if category_match:
                if category_match.group(1) not in category_title_in_index:
                    err_msg = error_message(line_num, f'category header ({category_match.group(1)}) not added to Index section')
                    err_msgs.append(err_msg)
            else:
                err_msg = error_message(line_num, 'category header is not formatted correctly')
                err_msgs.append(err_msg)

            # A new header closes the previous category: check its entry count
            if num_in_category < min_entries_per_category:
                err_msgs.append(min_entries_err_msg())

            # Take everything after the marker: this keeps multi-word names
            # whole and cannot fail on a header written without a space
            category = line_content.split(anchor)[1].strip()
            category_line = line_num
            num_in_category = 0
            continue

        # skips lines that we do not care about
        if not line_content.startswith('|') or line_content.startswith('|---'):
            continue

        num_in_category += 1
        segments = line_content.split('|')[1:-1]
        if len(segments) < num_segments:
            err_msg = error_message(line_num, f'entry does not have all the required columns (have {len(segments)}, need {num_segments})')
            err_msgs.append(err_msg)
            continue

        for segment in segments:
            # every line segment should start and end with exactly 1 space
            if len(segment) - len(segment.lstrip()) != 1 or len(segment) - len(segment.rstrip()) != 1:
                err_msg = error_message(line_num, 'each segment must start and end with exactly 1 space')
                err_msgs.append(err_msg)

        segments = [segment.strip() for segment in segments]
        entry_err_msgs = check_entry(line_num, segments)
        err_msgs.extend(entry_err_msgs)

    # No header follows the last category, so it is checked once the
    # whole document has been read
    if num_in_category < min_entries_per_category:
        err_msgs.append(min_entries_err_msg())

    return err_msgs


 def main(filename: str) -> None:
    """Check the format of ``filename``; print the errors and exit with 1 if any."""

    with open(filename, mode='r', encoding='utf-8') as file:
        # Trailing whitespace (including the newline) is dropped from every line
        lines = [line.rstrip() for line in file]

    file_format_err_msgs = check_file_format(lines)
    if not file_format_err_msgs:
        return

    # One message per output line
    print(*file_format_err_msgs, sep='\n')
    sys.exit(1)


 if __name__ == '__main__':

    # sys.argv[0] is the script itself; the file to check comes right after it
    if len(sys.argv) < 2:
        print('No .md file passed (file should contain Markdown table syntax)')
        sys.exit(1)

    filename = sys.argv[1]
    main(filename)
--- a/scripts/validate/links.py
+++ b/scripts/validate/links.py
@@ -0,0 +1,272 @@
 # -*- coding: utf-8 -*-

 import re
 import sys
 import random
 from typing import List, Tuple

 import requests
 from requests.models import Response


 def find_links_in_text(text: str) -> List[str]:
    """Return the URLs found in ``text``, each without trailing slashes."""

    link_pattern = re.compile(r'((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'\".,<>?«»“”‘’]))')

    # The pattern has several groups, so findall yields one tuple per match;
    # the first item of the tuple is the outermost group, i.e. the whole URL.
    return [
        str(groups[0]).rstrip('/') for groups in link_pattern.findall(text)
    ]


 def find_links_in_file(filename: str) -> List[str]:
    """Return the URLs found in a text file, starting at its '## Index' marker.

    When the marker is absent the whole file is searched.
    """

    with open(filename, mode='r', encoding='utf-8') as file:
        readme = file.read()

    # str.find gives -1 when the marker is missing: fall back to the file start
    start = max(readme.find('## Index'), 0)

    return find_links_in_text(readme[start:])


 def check_duplicate_links(links: List[str]) -> Tuple[bool, List]:
    """Look for links that occur more than once.

    Returns a tuple whose first item tells whether any duplicate exists and
    whose second item lists every repeated occurrence, in input order (a
    link present three times is listed twice).
    """

    already_seen = set()
    repeated = []

    for link in links:
        if link in already_seen:
            repeated.append(link)
        else:
            already_seen.add(link)

    return (bool(repeated), repeated)


 def fake_user_agent() -> str:
    """Pick a browser-like User-Agent at random.

    Some hosting services block user agents that are not whitelisted.
    """

    return random.choice((
        'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko)',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
    ))


 def get_host_from_link(link: str) -> str:
    """Return the host part of ``link``.

    The scheme (everything up to '://') is dropped when present, then
    routes, arguments and anchors are removed, whichever combination of
    them the link contains.
    """

    scheme, separator, rest = link.partition('://')

    # A '://' found after a route, argument or anchor delimiter belongs to
    # that part of the link (e.g. a URL passed as an argument), not to a scheme
    if separator and not any(delimiter in scheme for delimiter in '/?#'):
        host = rest
    else:
        host = link

    # Cut at every delimiter in turn: a link may hold several of them, such
    # as an argument whose value contains a slash
    for delimiter in ('/', '?', '#'):
        host = host.split(delimiter, 1)[0]

    return host


 def has_cloudflare_protection(resp: Response) -> bool:
    """Tell whether the response looks like a cloudflare protection page.

    The answer is True only when the status code is 403 or 503, the
    'Server' header is exactly 'cloudflare' and the body contains at least
    one of the known cloudflare texts listed below.

    HTTP codes commonly returned by the protections:
        - 403: When host header is missing or incorrect (and more)
        - 503: When DDOS protection exists

    See more about it at:
        - https://support.cloudflare.com/hc/en-us/articles/115003014512-4xx-Client-Error
        - https://support.cloudflare.com/hc/en-us/articles/115003011431-Troubleshooting-Cloudflare-5XX-errors
        - https://www.cloudflare.com/ddos/
        - https://superuser.com/a/888526

    Discussions in issues and pull requests:
        - https://github.com/public-apis/public-apis/pull/2409
        - https://github.com/public-apis/public-apis/issues/2960
    """

    cloudflare_flags = [
        '403 Forbidden',
        'cloudflare',
        'Cloudflare',
        'Security check',
        'Please Wait... | Cloudflare',
        'We are checking your browser...',
        'Please stand by, while we are checking your browser...',
        'Checking your browser before accessing',
        'This process is automatic.',
        'Your browser will redirect to your requested content shortly.',
        'Please allow up to 5 seconds',
        'DDoS protection by',
        'Ray ID:',
        'Cloudflare Ray ID:',
        '_cf_chl',
        '_cf_chl_opt',
        '__cf_chl_rt_tk',
        'cf-spinner-please-wait',
        'cf-spinner-redirecting'
    ]

    status = resp.status_code
    # Try both spellings of the header name, for plain-dict headers
    server = resp.headers.get('Server') or resp.headers.get('server')

    served_by_cloudflare = status in [403, 503] and server == 'cloudflare'
    if not served_by_cloudflare:
        return False

    html = resp.text
    return any(flag in html for flag in cloudflare_flags)


 def check_if_link_is_working(link: str) -> Tuple[bool, str]:
    """Checks if a link is working.

    If an error is identified when the request for the link occurs,
    the return will be a tuple with the first value True and the second
    value a string containing the error message.

    If no errors are identified, the return will be a tuple with the
    first value False and the second an empty string.

    Error messages start with a tag naming the kind of failure:
    ERR:CLT (HTTP status of 400 or above), ERR:SSL, ERR:TMO (timeout),
    ERR:CNT (connection), ERR:TMR (too many redirects) and ERR:UKN
    (anything else).
    """

    has_error = False
    error_message = ''

    try:
        resp = requests.get(link + '/', timeout=25, headers={
            'User-Agent': fake_user_agent(),
            'host': get_host_from_link(link)
        })

        code = resp.status_code

        # A cloudflare protection page is not counted as a broken link
        if code >= 400 and not has_cloudflare_protection(resp):
            has_error = True
            error_message = f'ERR:CLT: {code} : {link}'

    except requests.exceptions.SSLError as error:
        has_error = True
        error_message = f'ERR:SSL: {error} : {link}'

    # Timeouts are handled before ConnectionError: a connect timeout is also
    # a ConnectionError and would otherwise be reported as ERR:CNT. Catching
    # the Timeout base class covers read timeouts as well.
    except (TimeoutError, requests.exceptions.Timeout):
        has_error = True
        error_message = f'ERR:TMO: {link}'

    except requests.exceptions.ConnectionError as error:
        has_error = True
        error_message = f'ERR:CNT: {error} : {link}'

    except requests.exceptions.TooManyRedirects as error:
        has_error = True
        error_message = f'ERR:TMR: {error} : {link}'

    # Exception also covers every remaining requests exception
    except Exception as error:
        has_error = True
        error_message = f'ERR:UKN: {error} : {link}'

    return (has_error, error_message)


 def check_if_list_of_links_are_working(list_of_links: List[str]) -> List[str]:
    """Check every link in order and return the messages of those that failed."""

    # Lazily evaluated: each link is requested as the list below is built
    outcomes = (check_if_link_is_working(link) for link in list_of_links)

    return [message for failed, message in outcomes if failed]


 def start_duplicate_links_checker(links: List[str]) -> None:
    """Print the duplicate links, if any, and exit with status 1 when found."""

    print('Checking for duplicate links...')

    has_duplicate_link, duplicates_links = check_duplicate_links(links)

    if not has_duplicate_link:
        print('No duplicate links.')
        return

    print('Found duplicate links:')
    for duplicate_link in duplicates_links:
        print(duplicate_link)

    sys.exit(1)


 def start_links_working_checker(links: List[str]) -> None:
    """Request every link, print those that failed and exit with 1 if any did."""

    print(f'Checking if {len(links)} links are working...')

    errors = check_if_list_of_links_are_working(links)
    if not errors:
        return

    print(f'Apparently {len(errors)} links are not working properly. See in:')
    for error_message in errors:
        print(error_message)

    sys.exit(1)


 def main(filename: str, only_duplicate_links_checker: bool) -> None:
    """Run the duplicate check on the links of ``filename``, then the working check.

    The working check is skipped when ``only_duplicate_links_checker`` is true.
    """

    links = find_links_in_file(filename)

    start_duplicate_links_checker(links)

    if only_duplicate_links_checker:
        return

    start_links_working_checker(links)


 if __name__ == '__main__':
    cli_args = sys.argv

    # cli_args[0] is the script itself; the file to check comes right after it
    if len(cli_args) < 2:
        print('No .md file passed')
        sys.exit(1)

    only_duplicate_links_checker = False

    # The optional flag is only looked at when exactly one extra argument is given
    if len(cli_args) == 3:
        if cli_args[2].lower() in ('-odlc', '--only_duplicate_links_checker'):
            only_duplicate_links_checker = True
        else:
            print(f'Third invalid argument. Usage: python {__file__} [-odlc | --only_duplicate_links_checker]')
            sys.exit(1)

    filename = cli_args[1]

    main(filename, only_duplicate_links_checker)