tokenize.js (4246B)
/**
* @license Apache-2.0
*
* Copyright (c) 2018 The Stdlib Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*    http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

'use strict';

// MODULES //

var isBoolean = require( '@stdlib/assert/is-boolean' ).isPrimitive;
var isString = require( '@stdlib/assert/is-string' ).isPrimitive;
var hasOwnProp = require( '@stdlib/assert/has-own-property' );
var ABBRS = require( './abbreviations.json' );
var EMOJIS = require( './emojis.json' );
var CONTRACT = require( './contractions.json' );


// VARIABLES //

// Matches a single leading punctuation character (captured so that `split` returns it):
var REGEXP_PREFIXES = /^([,([{*<"“'`‘.])/gi;

// Matches a single trailing punctuation character (captured so that `split` returns it):
var REGEXP_SUFFIXES = /([,.!?%*>:;"'”`)\]}])$/gi;


// FUNCTIONS //

/**
* Extends an array by the elements of another array.
*
* @private
* @param {Array} arr - input array
* @param {Array} ext - array to extend `arr` with
* @returns {Array} mutated input array
*
* @example
* var arr = [ 1, 2, 3 ];
* var out = extend( arr, [ 4, 5 ] );
* // returns [ 1, 2, 3, 4, 5 ]
*/
function extend( arr, ext ) {
	var i;
	for ( i = 0; i < ext.length; i++ ) {
		arr.push( ext[ i ] );
	}
	return arr;
}

/**
* Tokenizes a substring.
*
* ## Notes
*
* -   Leading and trailing punctuation characters are peeled off one at a time and returned as separate tokens, in the order in which they appear in the input.
* -   Before each peeling step, the remaining substring is looked up in the emoji, abbreviation, and contraction tables. If it has a truthy entry in any of them, it is kept intact (this is what preserves the trailing period of, e.g., `'Mrs.'`).
* -   If nothing remains after peeling (e.g., the input is empty or consists solely of punctuation), no empty-string token is emitted.
*
* @private
* @param {string} substr - input string
* @returns {Array} token array
*
* @example
* var str = '(never)';
* var out = tokenizeSubstring( str );
* // returns [ '(', 'never', ')' ]
*
* @example
* var str = '(never).';
* var out = tokenizeSubstring( str );
* // returns [ '(', 'never', ')', '.' ]
*/
function tokenizeSubstring( substr ) {
	var prefixes = [];
	var suffixes = [];
	var match;
	var done;
	var res;

	done = false;
	do {
		if (
			!EMOJIS[ substr ] &&
			!ABBRS[ substr ] &&
			!CONTRACT[ substr ]
		) {
			// With a capture group, `split` yields `[ '', <prefix>, <rest> ]` on a match and a single-element array otherwise:
			match = substr.split( REGEXP_PREFIXES );
			if ( match.length > 1 ) {
				prefixes.push( match[ 1 ] );
				substr = match[ 2 ];
			}
			else {
				// On a match, `split` yields `[ <rest>, <suffix>, '' ]`:
				match = substr.split( REGEXP_SUFFIXES );
				if ( match.length > 1 ) {
					substr = match[ 0 ];

					// Suffixes are discovered from the outside in, so prepend in order to preserve their original left-to-right order:
					suffixes.unshift( match[ 1 ] );
				} else {
					done = true;
				}
			}
		}
		else {
			done = true;
		}
	} while ( !done );

	res = prefixes;

	// Avoid emitting an empty token when the substring was empty or consisted solely of punctuation:
	if ( substr ) {
		res.push( substr );
	}
	extend( res, suffixes );
	return res;
}


// MAIN //

/**
* Tokenize a string.
*
* ## Notes
*
* -   The input string is first split on runs of whitespace. Each resulting piece is then split further into leading punctuation, a core token, and trailing punctuation.
* -   Empty pieces produced by leading or trailing whitespace do not yield tokens.
*
* @param {string} str - input string
* @param {boolean} [keepWhitespace=false] - boolean indicating whether whitespace characters should be returned as part of the token array
* @throws {TypeError} first argument must be a string primitive
* @throws {TypeError} second argument must be a boolean primitive
* @returns {Array} array of tokens
*
* @example
* var str = 'Hello World!';
* var out = tokenize( str );
* // returns [ 'Hello', 'World', '!' ]
*
* @example
* var str = '';
* var out = tokenize( str );
* // returns []
*
* @example
* var str = 'Hello Mrs. Maple, could you call me back?';
* var out = tokenize( str );
* // returns [ 'Hello', 'Mrs.', 'Maple', ',', 'could', 'you', 'call', 'me', 'back', '?' ]
*/
function tokenize( str, keepWhitespace ) {
	var subtkns;
	var substrs;
	var tokens;
	var substr;
	var cache;
	var i;
	if ( !isString( str ) ) {
		throw new TypeError( 'invalid argument. First argument must be a string primitive. Value: `' + str + '`.' );
	}
	if ( arguments.length > 1 ) {
		if ( !isBoolean( keepWhitespace ) ) {
			throw new TypeError( 'invalid argument. Second argument must be a boolean primitive. Value: `' + keepWhitespace + '`.' );
		}
	}
	if ( !str ) {
		return [];
	}

	// Split on whitespace (the capture group keeps the whitespace runs as separate pieces):
	if ( keepWhitespace ) {
		substrs = str.split( /(\s+)/ );
	} else {
		substrs = str.split( /\s+/ );
	}

	// Set up cache to hold tokens for substring matches:
	cache = {};

	// Initialize token array:
	tokens = [];

	for ( i = 0; i < substrs.length; i++ ) {
		substr = substrs[ i ];
		if ( hasOwnProp( cache, substr ) ) {
			extend( tokens, cache[ substr ] );
		}
		else {
			subtkns = tokenizeSubstring( substr );

			// Copy the sub-tokens into the result so that the cached array is never mutated:
			extend( tokens, subtkns );
			cache[ substr ] = subtkns;
		}
	}
	return tokens;
}


// EXPORTS //

module.exports = tokenize;