time-to-botec

Benchmark sampling in different programming languages
Log | Files | Refs | README

tokenize.js (4246B)


      1 /**
      2 * @license Apache-2.0
      3 *
      4 * Copyright (c) 2018 The Stdlib Authors.
      5 *
      6 * Licensed under the Apache License, Version 2.0 (the "License");
      7 * you may not use this file except in compliance with the License.
      8 * You may obtain a copy of the License at
      9 *
     10 *    http://www.apache.org/licenses/LICENSE-2.0
     11 *
     12 * Unless required by applicable law or agreed to in writing, software
     13 * distributed under the License is distributed on an "AS IS" BASIS,
     14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     15 * See the License for the specific language governing permissions and
     16 * limitations under the License.
     17 */
     18 
     19 'use strict';
     20 
     21 // MODULES //
     22 
     23 var isBoolean = require( '@stdlib/assert/is-boolean' ).isPrimitive;
     24 var isString = require( '@stdlib/assert/is-string' ).isPrimitive;
     25 var hasOwnProp = require( '@stdlib/assert/has-own-property' );
     26 var ABBRS = require( './abbreviations.json' );
     27 var EMOJIS = require( './emojis.json' );
     28 var CONTRACT = require( './contractions.json' );
     29 
     30 
     31 // VARIABLES //
     32 
     33 var REGEXP_PREFIXES = /^([,([{*<"“'`‘.])/gi;
     34 var REGEXP_SUFFIXES = /([,.!?%*>:;"'”`)\]}])$/gi;
     35 
     36 
     37 // FUNCTIONS //
     38 
     39 /**
     40 * Extends an array by the elements of another array.
     41 *
     42 * @private
     43 * @param {Array} arr - input array
     44 * @param {Array} ext - array to extend `arr` with
     45 * @returns {Array} mutated input array
     46 *
     47 * @example
     48 * var arr = [ 1, 2, 3 ];
     49 * var out = extend( arr, [ 4, 5 ] );
     50 * // returns [ 1, 2, 3, 4, 5 ]
     51 */
     52 function extend( arr, ext ) {
     53 	var i;
     54 	for ( i = 0; i < ext.length; i++ ) {
     55 		arr.push( ext[ i ] );
     56 	}
     57 	return arr;
     58 }
     59 
     60 /**
     61 * Tokenizes a substring.
     62 *
     63 * @private
     64 * @param {string} substr - input string
     65 * @returns {Array} token array
     66 *
     67 * @example
     68 * var str = '(never)';
     69 * var out = tokenizeSubstring( str );
     70 * // returns [ '(', 'never', ')' ]
     71 */
     72 function tokenizeSubstring( substr ) {
     73 	var prefixes = [];
     74 	var suffixes = [];
     75 	var match;
     76 	var done;
     77 	var res;
     78 
     79 	do {
     80 		if (
     81 			!EMOJIS[ substr ] &&
     82 			!ABBRS[ substr ] &&
     83 			!CONTRACT[ substr ]
     84 		) {
     85 			match = substr.split( REGEXP_PREFIXES );
     86 			if ( match.length > 1 ) {
     87 				prefixes.push( match[ 1 ] );
     88 				substr = match[ 2 ];
     89 			}
     90 			else {
     91 				match = substr.split( REGEXP_SUFFIXES );
     92 				if ( match.length > 1 ) {
     93 					substr = match[ 0 ];
     94 					suffixes.push( match[ 1 ] );
     95 				} else {
     96 					done = true;
     97 				}
     98 			}
     99 		}
    100 		else {
    101 			done = true;
    102 		}
    103 	} while ( !done );
    104 
    105 	res = prefixes;
    106 	res.push( substr );
    107 	extend( res, suffixes );
    108 	return res;
    109 }
    110 
    111 
    112 // MAIN //
    113 
    114 /**
    115 * Tokenize a string.
    116 *
    117 * @param {string} str - input string
    118 * @param {boolean} [keepWhitespace=false] - boolean indicating whether whitespace characters should be returned as part of the token array
    119 * @throws {TypeError} first argument must be a string primitive
    120 * @throws {TypeError} second argument must be a boolean primitive
    121 * @returns {Array} array of tokens
    122 *
    123 * @example
    124 * var str = 'Hello World!';
    125 * var out = tokenize( str );
    126 * // returns [ 'Hello', 'World', '!' ]
    127 *
    128 * @example
    129 * var str = '';
    130 * var out = tokenize( str );
    131 * // returns []
    132 *
    133 * @example
    134 * var str = 'Hello Mrs. Maple, could you call me back?';
    135 * var out = tokenize( str );
    136 * // returns [ 'Hello', 'Mrs.', 'Maple', ',', 'could', 'you', 'call', 'me', 'back', '?' ]
    137 */
    138 function tokenize( str, keepWhitespace ) {
    139 	var subtkns;
    140 	var substrs;
    141 	var tokens;
    142 	var substr;
    143 	var cache;
    144 	var i;
    145 	if ( !isString( str ) ) {
    146 		throw new TypeError( 'invalid argument. First argument must be a string primitive. Value: `' + str + '`.' );
    147 	}
    148 	if ( arguments.length > 1 ) {
    149 		if ( !isBoolean( keepWhitespace ) ) {
    150 			throw new TypeError( 'invalid argument. Second argument must be a boolean primitive. Value: `' + keepWhitespace + '`.' );
    151 		}
    152 	}
    153 	if ( !str ) {
    154 		return [];
    155 	}
    156 
    157 	// Split on whitespace:
    158 	if ( keepWhitespace ) {
    159 		substrs = str.split( /(\s+)/ );
    160 	} else {
    161 		substrs = str.split( /\s+/ );
    162 	}
    163 
    164 	// Set up cache to hold tokens for substring matches:
    165 	cache = {};
    166 
    167 	// Initialize token array:
    168 	tokens = [];
    169 
    170 	for ( i = 0; i < substrs.length; i++ ) {
    171 		substr = substrs[ i ];
    172 		if ( hasOwnProp( cache, substr ) ) {
    173 			extend( tokens, cache[ substr ] );
    174 		}
    175 		else {
    176 			subtkns = tokenizeSubstring( substr );
    177 			extend( tokens, subtkns );
    178 			cache[ substr ] = subtkns;
    179 		}
    180 	}
    181 	return tokens;
    182 }
    183 
    184 
    185 // EXPORTS //
    186 
    187 module.exports = tokenize;