Merge pull request #569 from davemachado/update-md2json

Update md2json, validate_format, and validate_links
6 years ago · e9e6a71d50
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,10 +1,7 @@
 language: node_js
 language: python
 python: "3.6"
 notifications:
  email: true
 before_install:
  - rvm install 2.4.0
 install:
  - gem install httparty ruby-progressbar
 before_script:
  - cd build
 script:
--- a/build/build.sh
+++ b/build/build.sh
@@ -3,7 +3,6 @@
 # create json directory if not already present
 mkdir -p ../json
 # parse API README and print (minified) JSON to stdout, redirect to /json
 node condenseMd.js ../README.md > single_table.md
 node md2json.js single_table.md > ../json/entries.min.json
 ./md2json.py ../README.md > ../json/entries.min.json
 # beautify the previously created JSON file, redirect to /json
 python -m json.tool ../json/entries.min.json > ../json/entries.json
--- a/build/condenseMd.js
+++ b/build/condenseMd.js
@@ -1,63 +0,0 @@
 fs = require('fs')

 /**
  * Collapse every Markdown table that follows an anchor heading into one
  * six-column table and print it to stdout.
  *
  * For each line containing `anchor`, the text after the anchor becomes the
  * category. The two lines after the heading are skipped (presumably the
  * table header and separator row), and every following line that starts
  * with "|" is emitted with the category appended, until a line shorter
  * than two characters or the final line of the file is reached.
  *
  * @param {string} filename - path of the Markdown file to read (as UTF-8)
  * @param {string} anchor - marker that identifies a category heading
  */
 function setupMd(filename, anchor) {
     fs.readFile(filename, 'utf8', function (err, text) {
         if (err) {
             return console.log(err);
         }
         var lines = text.split("\n");
         var cur_line = 0;
         var line = "";
         var table_name = "";
         var arr = [];

         // Returns the next line, or undefined once the input is exhausted.
         function read_line() {
             return lines[cur_line++];
         }

         while (true) {
             // Advance to the next line that contains the anchor.
             while (line.indexOf(anchor) == -1 && cur_line < lines.length) {
                 line = read_line();
             }
             if (cur_line >= lines.length) {
                 break;
             }
             table_name = line.split(anchor)[1];
             // Skip the two lines that follow the heading.
             read_line();
             read_line();
             while (true) {
                 line = read_line();
                 if (line === undefined) {
                     // The heading was within the last few lines of the file,
                     // so there is no table body; finish instead of throwing
                     // on `undefined.length`.
                     line = "";
                     break;
                 }
                 if (line.length < 2 || cur_line == lines.length) {
                     break;
                 }
                 if (line.indexOf("|") == 0) {
                     arr.push(line + table_name);
                 }
             }
         }
         console.log(anchor + " entries");
         console.log("API | Description | Auth | HTTPS | Link | Category");
         console.log("|---|---|---|---|---|---|");
         for (var i = 0; i < arr.length; i++) {
             console.log(arr[i]);
         }
     });
 }

 if (process.argv.length < 3) {
    console.log("No .md file passed!");
    return;
 }
 if (process.argv.length < 4) {
  anchorText = "###";
 } else {
  anchorText = process.argv[3];
 }
 setupMd(process.argv[2].toString(), anchorText);
--- a/build/main.sh
+++ b/build/main.sh
@@ -30,7 +30,7 @@ else
 fi

 echo "running format validation..."
 ./validate_format.rb $FORMAT_FILE
 ./validate_format.py $FORMAT_FILE
 if [[ $? != 0 ]]; then
  echo "format validation failed!"
  exit 1
--- a/build/md2json.js
+++ b/build/md2json.js
@@ -1,134 +0,0 @@
 fs = require('fs')

 /**
  * Trim a Markdown table cell and normalise it according to its column.
  *
  * @param {string} str - raw cell text
  * @param {number|string} context - column index: 1 name, 2 description,
  *     3 auth, 4 HTTPS, 5 link; any other value only trims whitespace.
  *     Callers pass `for...in` keys, so numeric strings are accepted.
  * @returns {string|boolean|null} the trimmed string; `null` for an auth
  *     value of "No"; a boolean for the HTTPS column
  */
 function md_trim(str, context) {
     var value = str.trim();

     switch (Number(context)) {
     case 2: // Description
         // Drop only a trailing period. Replacing the first "." anywhere
         // mangled values such as "2.5 million" and "Regulations.gov".
         return value.replace(/\.$/, "");
     case 3: // Auth
         if (value.toUpperCase() == "NO") {
             return null;
         }
         // Remove the pair of backticks wrapping the auth type.
         return value.replace("`", "").replace("`", "");
     case 4: // HTTPS
         return value.toUpperCase() == "YES";
     case 5: // Link
         // "[Go!](url)" -> "(url)" -> "url"
         return value.replace("[Go!]", "").slice(1, -1);
     default: // Name and every other column: trimmed text unchanged
         return value;
     }
 }

 /**
  * Read a Markdown file, convert every table that follows an anchor heading
  * into an array of objects, and print the result as one JSON string.
  *
  * The printed object has one key per heading (the text after the anchor)
  * holding that table's rows, plus a "count" key holding the number of data
  * rows parsed across all tables.
  *
  * @param {string} filename - path of the Markdown file to read (as UTF-8)
  * @param {string} anchor - marker that identifies a table heading line
  */
 function handle(filename, anchor) {
    fs.readFile(filename, 'utf8', function (err,text) {
      if (err) {
        return console.log(err);
      }
    var lines = text.split("\n");
    var cur_line = 0;
    var line = ""
    var table_name = "";
    // col_num is declared but never used in this function.
    var col_num = 0;
    var cols = [];
    var rows = [];
    // Running total of data rows across every table.
    var entry_count = 0;

    // Returns the line at the cursor and advances it; yields undefined once
    // the cursor is past the last line.
    function read_line() {
        return lines[cur_line++];
    }
    var root = {};
    while (true) {
        // Re-declared per table, which resets both lists for each heading.
        var cols = [];
        var rows = [];
        // Advance to the next line that contains the anchor.
        while (line.indexOf(anchor) == -1 && cur_line != lines.length) {
            line = read_line();
        }
        if (cur_line == lines.length) {
            break;
        }
        // The table name is the text after the first anchor occurrence.
        table_name = line.split(anchor)[1];
        table_name = md_trim(table_name, 0)

        // The line right after the heading is the column-title row.
        line = read_line()

        if (line) {
            line = line.split("|")
            for (var j in line) {

                line[j] = md_trim(line[j], 0)
                // Empty first/last cells (from leading/trailing pipes) are
                // left out of cols.
                if ((j == 0 || j == line.length - 1) && line[j] === "") {

                } else {
                    cols.push(line[j]);
                }
            }
            // NOTE(review): `line` is an array from split(), so its length is
            // always at least 1 and the else branch looks unreachable. The
            // assignment also replaces the filtered `cols` with the unfiltered
            // cell list, so empty edge cells stay in the title row — confirm
            // whether that is intended.
            if (line.length) {
                cols = line;
                rows.push(cols)
            } else {
                console.error("markdown expect column title")
                break;
            }
        } else {
            console.error("markdown expect table content")
            break;
        }

        // The next line is the separator row; only its presence is checked.
        line = read_line()

        if (!line) {
            console.error("markdown expect table spliter")
            break;
        }
        line = read_line()
        // NOTE(review): read_line() yields undefined past the end of input,
        // in which case line.indexOf below would throw — confirm callers
        // always supply a file with content after each separator row.
        while (line.indexOf("|") != -1 && cur_line != lines.length) {

            var line_this = line.split("|")
            var row = []
            for (var j in line_this) {
                // The cell's index (a string key from for...in) is passed as
                // the md_trim context, selecting per-column formatting.
                line_this[j] = md_trim(line_this[j], j)
                if ((j == 0 || j == line_this.length - 1) && line_this[j] === "") {

                } else {
                    row.push(line_this[j]);
                }

            }
            rows.push(row);
            entry_count++;
            line = read_line()
        }

        // Turn each data row into an object keyed by the title row (rows[0]),
        // matching cells by position.
        var data=[];
        for (var j in rows) {
            if (j != 0) {
                var ele = {};
                for (var k in rows[j]) {
                    ele[rows[0][k]] = rows[j][k];
                }
                data.push(ele);
            }
        }
        // "count" is overwritten after every table with the running total.
        root["count"] = entry_count;
        root[table_name] = data;
    }
    console.log(JSON.stringify(root));
  });
 }

 if (process.argv.length < 3) {
    console.log("No .md file passed!");
    return;
 }
 if (process.argv.length < 4) {
  anchorText = "###";
 } else {
  anchorText = process.argv[3];
 }
 handle(process.argv[2].toString(), anchorText);
--- a/build/md2json.py
+++ b/build/md2json.py
@@ -0,0 +1,48 @@
 #!/usr/bin/env python3

 import json
 import sys


 def markdown_to_json(filename, anchor):
    """Convert a Markdown file into a JSON string.

    Lines starting with ``anchor`` set the current category (the text after
    the anchor). Lines starting with ``'| '`` are table rows whose first five
    cells become the API, Description, Auth, HTTPS and Link fields of an
    entry. All other lines are ignored.

    Args:
        filename: path of the Markdown file, read as UTF-8.
        anchor: prefix that marks a category heading line.

    Returns:
        A JSON string of the form ``{"count": N, "entries": [...]}``.

    Raises:
        IndexError: if a table row has fewer than five cells.
    """
    category = ""
    entries = []
    # Decode explicitly as UTF-8 so the result does not depend on the
    # locale of the machine running the build.
    with open(filename, encoding='utf-8') as fp:
        lines = [line.rstrip() for line in fp]
    for line in lines:
        if line.startswith(anchor):
            category = line.split(anchor)[1].strip()
            continue
        if not line.startswith('| '):
            # Not a table row (blank line, prose, header or separator row).
            continue
        # Drop the empty strings produced by the leading and trailing pipes.
        chunks = [x.strip() for x in line.split('|')[1:-1]]
        entry = {
            'API': chunks[0],
            'Description': chunks[1],
            'Auth': None if chunks[2].upper() == 'NO' else chunks[2].strip('`'),
            'HTTPS': chunks[3].upper() == 'YES',
            # "[Go!](url)" -> "(url)" -> "url"
            'Link': chunks[4].replace('[Go!]', '')[1:-1],
            'Category': category,
        }
        entries.append(entry)
    final = {
        'count': len(entries),
        'entries': entries,
    }
    return json.dumps(final)


 def main():
    """Command-line entry point: md2json.py <file.md> [anchor].

    Prints the JSON conversion of the given Markdown file to stdout. Exits
    with status 1 when no file argument is supplied; the anchor defaults
    to '###'.
    """
    args = sys.argv[1:]
    if not args:
        print("No .md file passed")
        sys.exit(1)
    anchor = args[1] if len(args) > 1 else '###'
    print(markdown_to_json(args[0], anchor))

 if __name__ == "__main__":
    main()
--- a/build/validate_format.py
+++ b/build/validate_format.py
@@ -0,0 +1,111 @@
 #!/usr/bin/env python3

 import json
 import string
 import sys

 # Prefix that marks a category heading line in the Markdown file.
 anchor = '###'
 # Values accepted in the Auth column once backticks are removed.
 auth_keys = ['apiKey', 'OAuth', 'X-Mashape-Key', 'No']
 # Characters a Description must not end with.
 punctuation = ['.', '?', '!']
 # Values accepted in the HTTPS column.
 https_keys = ['Yes', 'No']

 # Zero-based positions of each column within a table row's segments
 # (the row split on '|' with the first and last pieces dropped).
 index_title = 0
 index_desc = 1
 index_auth = 2
 index_https = 3
 index_link = 4

 # Messages accumulated by add_error() and printed by main().
 errors = []


 def add_error(line_num, message):
    """Append a formatted message to the module-level error list.

    ``line_num`` is zero-based; the stored message shows it one-based and
    zero-padded to three digits, e.g. ``(L007) ...``.
    """
    display_line = line_num + 1
    errors.append('(L{:03d}) {}'.format(display_line, message))


 def check_format(filename):
    """
    Validate the table rows of a Markdown file, recording every problem via
    add_error() instead of raising.

    Two passes are made over the file:
      1. each category (a line starting with ``anchor``) must list its rows
         in alphabetical order of the upper-cased first column;
      2. each row must have every column, with exactly one space of padding
         per cell, a capitalised Description that does not end in
         punctuation, a known Auth value, a known HTTPS value, and a
         ``[Go!](http...)`` link.

    Rows are lines starting with '|' that do not start with '|---'.
    The file is read as UTF-8.
    """
    with open(filename, encoding='utf-8') as fp:
        lines = [line.rstrip() for line in fp]

    def is_entry(text):
        """True for table rows, False for separator rows and other lines."""
        return text.startswith('|') and not text.startswith('|---')

    # START Alphabetical Order
    category = ""
    sections = {}
    section_line_num = {}
    for line_num, line in enumerate(lines):
        if line.startswith(anchor):
            category = line.split(anchor)[1].strip()
            sections[category] = []
            section_line_num[category] = line_num
            continue
        if not is_entry(line):
            continue
        cells = [x.strip() for x in line.split('|')[1:-1]]
        if not cells:
            # Row with no cells at all; the entry checks below report it.
            continue
        # setdefault keeps rows that appear before any heading from raising
        # KeyError; they are grouped under the empty category.
        sections.setdefault(category, []).append(cells[0].upper())
        section_line_num.setdefault(category, line_num)

    for category, entries in sections.items():
        if sorted(entries) != entries:
            add_error(section_line_num[category], '{} section is not in alphabetical order'.format(category))
    # END Alphabetical Order

    # START Check Entries
    expected_columns = index_link + 1
    for line_num, line in enumerate(lines):
        if not is_entry(line):
            continue
        segments = line.split('|')[1:-1]
        # START Global
        for segment in segments:
            # every line segment should start and end with exactly 1 space
            if len(segment) - len(segment.lstrip()) != 1 or len(segment) - len(segment.rstrip()) != 1:
                add_error(line_num, "each segment must start and end with exactly 1 space")
        # END Global
        segments = [seg.strip() for seg in segments]
        if len(segments) < expected_columns:
            # Report the short row instead of failing with IndexError below.
            add_error(line_num, "expected {} columns but found {}".format(expected_columns, len(segments)))
            continue
        # START Description
        description = segments[index_desc]
        if not description:
            add_error(line_num, "Description is empty")
        else:
            # first character should be capitalized
            char = description[0]
            if char.upper() != char:
                add_error(line_num, "first char of Description is not capitalized")
            # last character should not be punctuation
            char = description[-1]
            if char in punctuation:
                add_error(line_num, "description should not end with {}".format(char))
        # END Description
        # START Auth
        # values should conform to valid options only
        auth = segments[index_auth].replace('`', '')
        if auth not in auth_keys:
            add_error(line_num, "{} is not a valid Auth option".format(auth))
        # END Auth
        # START HTTPS
        # values should conform to valid options only
        https = segments[index_https]
        if https not in https_keys:
            add_error(line_num, "{} is not a valid HTTPS option".format(https))
        # END HTTPS
        # START Link
        # url should be wrapped in '[Go!]()' Markdown syntax
        link = segments[index_link]
        if not link.startswith('[Go!](http') or not link.endswith(')'):
            add_error(line_num, 'link format should be "[Go!](LINK)"')
        # END Link
    # END Check Entries

 def main():
    """Command-line entry point: validate_format.py <file.md>.

    Runs check_format() on the given file, prints each collected error on
    its own line, and exits with status 1 if there were any (or if no file
    argument was supplied).
    """
    if len(sys.argv) < 2:
        print("No .md file passed")
        sys.exit(1)

    check_format(sys.argv[1])
    if errors:
        print('\n'.join(errors))
        sys.exit(1)


 if __name__ == "__main__":
    main()
--- a/build/validate_format.rb
+++ b/build/validate_format.rb
@@ -1,111 +0,0 @@
 #!/usr/bin/env ruby

 # Values accepted in the Auth column once backticks are removed.
 auth_keys = ['apiKey', 'OAuth', 'X-Mashape-Key', 'No']
 # Characters a Description must not end with.
 punctuation = ['.', '?', '!']
 # Values accepted in the HTTPS column.
 https_keys = ['Yes', 'No']

 # Positions of each column within line.split("|"). They start at 1 because
 # a row begins with '|', which makes element 0 the empty string before it.
 INDEX_TITLE = 1
 INDEX_DESCRIPTION = 2
 INDEX_AUTH = 3
 INDEX_HTTPS = 4
 INDEX_LINK = 5
 # Path of the Markdown file to validate (first command-line argument).
 filename = ARGV[0]
 # Error messages collected by add_error.
 $errors = []

 # Records a validation error in $errors.
 #
 # line_num  - line number shown in the "(Lnnn)" prefix
 # val_index - one of the INDEX_* constants; selects the column label. Any
 #             other value leaves the label blank.
 # message   - human-readable description of the problem
 def add_error(line_num, val_index, message)
    segment = case val_index
              when INDEX_TITLE then "Title"
              when INDEX_DESCRIPTION then "Description"
              when INDEX_AUTH then "Auth"
              when INDEX_HTTPS then "HTTPS"
              when INDEX_LINK then "Link"
              end
    # The message is passed as an argument rather than interpolated into the
    # format string, so a '%' inside it (e.g. a percent-encoded URL) is
    # printed literally instead of being parsed as a format directive.
    $errors.push(format("(L%03d) %-14.14s %s", line_num, segment, message))
 end

 ################### CHECK ALPHABETICAL ORDER ###################
 # First pass over the file: collect the upper-cased first column of every
 # table row under the most recent '###' heading, then report each section
 # whose collected entries differ from their sorted order.
 section = ''
 sections = []
 section_to_line_num = {}
 section_to_entries = Hash.new { |hash, key| hash[key] = [] }
 File.foreach(filename).with_index do |line, line_num|
     if line.start_with?('###')
         section = line.sub('###', '').lstrip.chop
         sections << section
         section_to_line_num[section] = line_num + 1
     end
     # Skip non-markdown table lines and table schema lines
     next unless line.start_with?('|')
     next if line.eql?("|---|---|---|---|---|\n")
     # value to compare is the first column
     section_to_entries[section] << line.split("|")[1].strip.upcase
 end
 sections.each do |sect|
     entries = section_to_entries[sect]
     next if entries == entries.sort
     add_error(section_to_line_num[sect], INDEX_TITLE, "#{sect} section is not in alphabetical order")
 end

 #################### CHECK LINE ENTRIES ########################
 # Second pass over the file: validate the cells of every table row and
 # record each problem with add_error.
 File.foreach(filename).with_index do | line, line_num |
    line_num += 1

    # Skip non-markdown table lines and table schema lines
    if !line.start_with?('|') || line.eql?("|---|---|---|---|---|\n")
        next
    end

    values = line.split("|")

    # A row with missing columns would make the lookups below return nil and
    # raise NoMethodError; report it as a validation error instead.
    if values.length <= INDEX_LINK
        found = [values.length - 1, 0].max
        add_error(line_num, INDEX_TITLE, "expected #{INDEX_LINK} columns but found #{found}")
        next
    end

    ################### GLOBAL ###################
    (INDEX_TITLE..INDEX_LINK).each do |val_index|
        val = values[val_index]
        # every line segment should start and end with exactly 1 space
        if val[/\A */].size != 1 || val[/ *\z/].size != 1
            add_error(line_num, val_index, "string should start and end with exactly 1 space")
        end
    end
    # The checks below use strip rather than lstrip.chop so that a cell with
    # wrong padding (already reported above) is still judged on its real text.
    ################# DESCRIPTION ################
    # First character should be capitalized
    desc_val = values[INDEX_DESCRIPTION].strip
    if !/[[:upper:]]/.match(desc_val[0])
        add_error(line_num, INDEX_DESCRIPTION, "first char not uppercase")
    end
    # value should not be punctuated
    last_char = desc_val[desc_val.length-1]
    if punctuation.include?(last_char)
        add_error(line_num, INDEX_DESCRIPTION, "description should not end with \"#{last_char}\"")
    end
    #################### AUTH ####################
    # Values should conform to valid options only
    auth_val = values[INDEX_AUTH].strip.delete('`')
    if !auth_keys.include?(auth_val)
        add_error(line_num, INDEX_AUTH, "not a valid option: #{auth_val}")
    end
    #################### HTTPS ###################
    # Values should be either "Yes" or "No"
    https_val = values[INDEX_HTTPS].strip
    if !https_keys.include?(https_val)
        add_error(line_num, INDEX_HTTPS, "must use \"Yes\" or \"No\": #{https_val}")
    end
    #################### LINK ####################
    # Url should be wrapped in "[Go!]" view
    link_val = values[INDEX_LINK].strip
    if !link_val.start_with?("[Go!](") || !link_val.end_with?(')')
        add_error(line_num, INDEX_LINK, "format should be \"[Go!](<LINK>)\": #{link_val}")
    end
 end
 # Print every collected error, then signal failure through the exit status.
 $errors.each do | e |
    puts e
 end
 # Exit statuses are truncated to 8 bits, so exiting with the raw error count
 # would report success for exactly 256 errors. Use a fixed status instead;
 # callers only test for non-zero.
 exit($errors.empty? ? 0 : 1)
--- a/build/validate_links.py
+++ b/build/validate_links.py
@@ -0,0 +1,53 @@
 #!/usr/bin/env python3

 import httplib2
 import json
 import socket
 import sys


 def parse_links(filename):
    """Return the links listed in a JSON entries file.

    The file must hold an object with an ``entries`` list whose items each
    have a ``Link`` string.

    Returns:
        A list of ``{'link': str, 'https': bool}`` dicts, one per entry, in
        file order. ``https`` is True when the link starts with 'https'.
    """
    # The context manager closes the file even if parsing fails.
    with open(filename) as fp:
        data = json.load(fp)
    return [
        {'link': entry['Link'], 'https': entry['Link'].startswith('https')}
        for entry in data['entries']
    ]


 def validate_links(links):
    """Send a HEAD request to each link and collect the failures.

    Args:
        links: list of dicts with a ``link`` key, as built by parse_links().

    Returns:
        A list of error strings; empty when every link responded acceptably.
        Prefixes: the HTTP status code, ``TMO`` (timeout), ``SOC`` (socket
        level failure) or ``LIB`` (any other httplib2 failure).
    """
    print('Validating {} links...'.format(len(links)))
    errors = []
    for each in links:
        link = each['link']
        h = httplib2.Http(disable_ssl_certificate_validation=True, timeout=5)
        try:
            resp = h.request(link, 'HEAD')
            code = int(resp[0]['status'])
            # Only 404 and above count as failures; 400-403 are let through.
            # NOTE(review): presumably because auth-protected endpoints
            # answer 401/403 — confirm the threshold is intentional.
            if code >= 404:
                errors.append('{}: {}'.format(code, link))
        except TimeoutError:
            errors.append("TMO: " + link)
        except socket.error as socketerror:
            errors.append("SOC: {} : {}".format(socketerror, link))
        except httplib2.HttpLib2Error as liberror:
            # e.g. an unresolvable host or a redirect loop; record it rather
            # than letting one bad link abort the whole run.
            errors.append("LIB: {} : {}".format(liberror, link))
    return errors

 if __name__ == "__main__":
    # Usage: validate_links.py <entries.json>
    # Exits with status 1 when no file is given or any link fails.
    if len(sys.argv) < 2:
        print("No .json file passed")
        sys.exit(1)
    errors = validate_links(parse_links(sys.argv[1]))
    if errors:
        print('\n'.join(errors))
        sys.exit(1)

--- a/build/validate_links.rb
+++ b/build/validate_links.rb
@@ -1,81 +0,0 @@
 #!/usr/bin/env ruby
 require 'httparty'
 require 'ruby-progressbar'
 require 'uri'
 # HTTP status codes that count as a live link.
 allowed_codes = [200, 302, 403, 429]
 # Links that are counted but never requested.
 allowed_links = ["https://www.yelp.com/developers/documentation/v3"]
 args = ARGV
 filename = args[0]
 contents = File.open(filename, 'rb') { |f| f.read }
 raw_links = URI.extract(contents, ['http', 'https'])
 # Remove trailing ')' from entry URLs
 links = raw_links.map { |link| link.end_with?(')') ? link[0...-1] : link }
 if links.empty?
    puts "no links to check"
    exit(0)
 end
 fails = []
 # Fail on any duplicate elements. Grouping once is linear in the number of
 # links, unlike calling links.count for every element.
 links.group_by { |link| link }.each do |link, occurrences|
    fails.push("(DUP): #{link}") if occurrences.length > 1
 end
 # Remove any duplicates from array
 links = links.uniq
 count = 0
 total = links.length
 progressbar = ProgressBar.create(:total => total,
    :format => "%a %P% | Processed: %c from %C")
 # GET each link and check for valid response code from allowed_codes
 links.each do |link|
    begin
        count += 1
        next if allowed_links.include?(link)
        res = HTTParty.get(link, timeout: 10)
        if res.code.nil?
            fails.push("(NIL): #{link}")
            next
        end
        if !allowed_codes.include?(res.code)
            fails.push("(#{res.code}): #{link}")
        end
    rescue HTTParty::RedirectionTooDeep
        fails.push("(RTD): #{link}")
    rescue Net::ReadTimeout
        fails.push("(TMO): #{link}")
    rescue Net::OpenTimeout
        fails.push("(TMO): #{link}")
    rescue OpenSSL::SSL::SSLError
        fails.push("(SSL): #{link}")
    rescue SocketError
        fails.push("(SOK): #{link}")
    rescue Errno::ECONNREFUSED
        fails.push("(CON): #{link}")
    rescue Errno::ECONNRESET
        # Connection resets are tolerated and not reported.
        next
    ensure
        # Runs on every path, including `next`, so the bar always reaches
        # the total instead of stalling on skipped links.
        progressbar.increment
    end
 end
 puts "#{count}/#{total} links checked"
 if fails.empty?
    puts "all links valid"
    exit(0)
 else
    puts "-- RESULTS --"
    fails.sort!
    fails.each do |e|
        puts e
    end
    exit(1)
 end
--- a/json/entries.json
+++ b/json/entries.json
@@ -197,7 +197,7 @@
            "API": "BookNomads",
            "Auth": null,
            "Category": "Books",
            "Description": "Books published in the Netherlands and Flanders (about 25 million), book covers, and related data",
            "Description": "Books published in the Netherlands and Flanders (about 2.5 million), book covers, and related data",
            "HTTPS": true,
            "Link": "https://www.booknomads.com/dev"
        },
@@ -765,7 +765,7 @@
            "API": "JSONbin.io",
            "Auth": "apiKey",
            "Category": "Development",
            "Description": "Free JSON storage service Ideal for small scale Web apps, Websites and Mobile apps",
            "Description": "Free JSON storage service. Ideal for small scale Web apps, Websites and Mobile apps",
            "HTTPS": true,
            "Link": "https://jsonbin.io"
        },
@@ -1741,7 +1741,7 @@
            "API": "BusinessUSA",
            "Auth": "apiKey",
            "Category": "Government",
            "Description": "BusinessUSA gives developers access to authoritative information on US. programs, events, services and more",
            "Description": "BusinessUSA gives developers access to authoritative information on U.S. programs, events, services and more",
            "HTTPS": true,
            "Link": "https://business.usa.gov/developer"
        },
@@ -1781,7 +1781,7 @@
            "API": "Regulations.gov",
            "Auth": "apiKey",
            "Category": "Government",
            "Description": "Regulationsgov provides access to Federal regulatory materials and increases public participation and their understanding of the Federal rule making process",
            "Description": "Regulations.gov provides access to Federal regulatory materials and increases public participation and their understanding of the Federal rule making process",
            "HTTPS": true,
            "Link": "https://regulationsgov.github.io/developers/"
        },
@@ -1837,7 +1837,7 @@
            "API": "Medicare",
            "Auth": null,
            "Category": "Health",
            "Description": "Access to the data from the CMS - medicaregov",
            "Description": "Access to the data from the CMS - medicare.gov",
            "HTTPS": true,
            "Link": "https://data.medicare.gov/developers"
        },
@@ -2477,7 +2477,7 @@
            "API": "UPC database",
            "Auth": "apiKey",
            "Category": "Open Data",
            "Description": "More than 15 million barcode numbers from all around the world",
            "Description": "More than 1.5 million barcode numbers from all around the world",
            "HTTPS": true,
            "Link": "https://upcdatabase.org/api"
        },
@@ -2517,7 +2517,7 @@
            "API": "Drupal.org",
            "Auth": null,
            "Category": "Open Source Projects",
            "Description": "Drupalorg",
            "Description": "Drupal.org",
            "HTTPS": true,
            "Link": "https://www.drupal.org/drupalorg/docs/api"
        },
@@ -2733,7 +2733,7 @@
            "API": "inspirehep.net",
            "Auth": null,
            "Category": "Science",
            "Description": "High Energy Physics info system",
            "Description": "High Energy Physics info. system",
            "HTTPS": true,
            "Link": "https://inspirehep.net/info/hep/api?ln=en"
        },
@@ -2749,7 +2749,7 @@
            "API": "Minor Planet Center",
            "Auth": null,
            "Category": "Science",
            "Description": "Asterankcom Information",
            "Description": "Asterank.com Information",
            "HTTPS": false,
            "Link": "http://www.asterank.com/mpc"
        },
@@ -2949,7 +2949,7 @@
            "API": "Meetup.com",
            "Auth": "apiKey",
            "Category": "Social",
            "Description": "Data about Meetups from Meetupcom",
            "Description": "Data about Meetups from Meetup.com",
            "HTTPS": true,
            "Link": "https://www.meetup.com/meetup_api/"
        },