time-to-botec

Benchmark sampling in different programming languages
Log | Files | Refs | README

main.js (6110B)


      1 /**
      2 * @license Apache-2.0
      3 *
      4 * Copyright (c) 2019 The Stdlib Authors.
      5 *
      6 * Licensed under the Apache License, Version 2.0 (the "License");
      7 * you may not use this file except in compliance with the License.
      8 * You may obtain a copy of the License at
      9 *
     10 *    http://www.apache.org/licenses/LICENSE-2.0
     11 *
     12 * Unless required by applicable law or agreed to in writing, software
     13 * distributed under the License is distributed on an "AS IS" BASIS,
     14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     15 * See the License for the specific language governing permissions and
     16 * limitations under the License.
     17 *
     18 *
     19 * ## Notice
     20 *
     21 * This code is a modification of an existing JavaScript implementation of the [Porter stemming algorithm]{@link https://tartarus.org/martin/PorterStemmer/}.
     22 *
     23 * ```text
     24 * Release 1 by 'andargor', Jul 2004
     25 * Release 2 (substantially revised) by Christopher McKenzie, Aug 2009
     26 * ```
     27 */
     28 
     29 'use strict';
     30 
     31 // MODULES //
     32 
     33 var isString = require( '@stdlib/assert/is-string' ).isPrimitive;
     34 var endsWith = require( '@stdlib/string/ends-with' );
     35 var lowercase = require( '@stdlib/string/lowercase' );
     36 var replace = require( '@stdlib/string/replace' );
     37 
     38 
     39 // VARIABLES //
     40 
     41 var step2list = {
     42 	'ational': 'ate',
     43 	'tional': 'tion',
     44 	'enci': 'ence',
     45 	'anci': 'ance',
     46 	'izer': 'ize',
     47 	'bli': 'ble',
     48 	'alli': 'al',
     49 	'entli': 'ent',
     50 	'eli': 'e',
     51 	'ousli': 'ous',
     52 	'ization': 'ize',
     53 	'ation': 'ate',
     54 	'ator': 'ate',
     55 	'alism': 'al',
     56 	'iveness': 'ive',
     57 	'fulness': 'ful',
     58 	'ousness': 'ous',
     59 	'aliti': 'al',
     60 	'iviti': 'ive',
     61 	'biliti': 'ble',
     62 	'logi': 'log'
     63 };
     64 var step3list = {
     65 	'icate': 'ic',
     66 	'ative': '',
     67 	'alize': 'al',
     68 	'iciti': 'ic',
     69 	'ical': 'ic',
     70 	'ful': '',
     71 	'ness': ''
     72 };
     73 var c = '[^aeiou]'; // consonant
     74 var v = '[aeiouy]'; // vowel
     75 var C = c + '[^aeiouy]*'; // consonant sequence
     76 var V = v + '[aeiou]*'; // vowel sequence
     77 var RE_CV = new RegExp( '^' + C + v + '[^aeiouwxy]$' );
     78 var mgr0 = '^(' + C + ')?' + V + C; // [C]VC... is m>0
     79 var RE_MGR0 = new RegExp( mgr0 );
     80 var meq1 = '^(' + C + ')?' + V + C + '(' + V + ')?$'; // [C]VC[V] is m=1
     81 var RE_MEQ1 = new RegExp( meq1 );
     82 var mgr1 = '^(' + C + ')?' + V + C + V + C; // [C]VCVC... is m>1
     83 var RE_MGR1 = new RegExp( mgr1 );
     84 var sV = '^(' + C + ')?' + v; // vowel in stem
     85 var RE_SV = new RegExp( sV );
     86 var RE_STEP1A = /^(.+?)(ss|i)es$/;
     87 var RE2_STEP1A = /^(.+?)([^s])s$/;
     88 var RE_STEP1B = /^(.+?)eed$/;
     89 var RE2_STEP1B = /^(.+?)(ed|ing)$/;
     90 var RE_STEP1C = /^(.+?)y$/;
     91 var RE_STEP2 = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/;
     92 var RE_STEP3 = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/;
     93 var RE_STEP4 = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/;
     94 var RE2_STEP4 = /^(.+?)(s|t)(ion)$/;
     95 var RE_STEP5 = /^(.+?)e$/;
     96 var RE_LAST = /.$/;
     97 var RE_ATBLIZ = /(at|bl|iz)$/;
     98 var RE_DOUBLE = new RegExp( '([^aeiouylsz])\\1$' );
     99 
    100 
    101 // MAIN //
    102 
    103 /**
    104 * Extracts the stem of a given word using the Porter stemming algorithm.
    105 *
    106 * ## References
    107 *
    108 * -   Porter, Michael F. 1980. "An algorithm for suffix stripping." _Program_ 13 (3): 130–37. doi:[10.1108/eb046814][@porter:1980].
    109 *
    110 * [@porter:1980]: https://doi.org/10.1108/eb046814
    111 *
    112 * @param {string} word - input word
    113 * @throws {TypeError} first argument must be a string primitive
    114 * @returns {string} word stem
    115 *
    116 * @example
    117 * var out = porterStemmer( 'walking' );
    118 * // returns 'walk'
    119 *
    120 * @example
    121 * var out = porterStemmer( 'walked' );
    122 * // returns 'walk'
    123 *
    124 * @example
    125 * var out = porterStemmer( 'walks' );
    126 * // returns 'walk'
    127 *
    128 * @example
    129 * var out = porterStemmer( 'worldwide' );
    130 * // returns 'worldwid'
    131 *
    132 * @example
    133 * var out = porterStemmer( '' );
    134 * // returns ''
    135 */
    136 function porterStemmer( word ) {
    137 	var firstch;
    138 	var suffix;
    139 	var stem;
    140 	var fp;
    141 
    142 	if ( !isString( word ) ) {
    143 		throw new TypeError( 'invalid argument. First argument must be a string primitive. Value: `' + word + '`.' );
    144 	}
    145 	if ( word.length < 3 ) {
    146 		return word;
    147 	}
    148 	word = lowercase( word );
    149 	firstch = word[ 0 ];
    150 	if ( firstch === 'y' ) {
    151 		word = firstch.toUpperCase() + word.substr( 1 );
    152 	}
    153 
    154 	// Step 1a:
    155 	if ( RE_STEP1A.test( word ) ) {
    156 		word = replace( word, RE_STEP1A, '$1$2' );
    157 	} else if ( RE2_STEP1A.test( word ) ) {
    158 		word = replace( word, RE2_STEP1A, '$1$2' );
    159 	}
    160 
    161 	// Step 1b:
    162 	if ( RE_STEP1B.test( word ) ) {
    163 		fp = RE_STEP1B.exec( word );
    164 		if ( RE_MGR0.test( fp[ 1 ] ) ) {
    165 			word = replace( word, RE_LAST, '' );
    166 		}
    167 	} else if ( RE2_STEP1B.test( word ) ) {
    168 		fp = RE2_STEP1B.exec( word );
    169 		stem = fp[ 1 ];
    170 		if ( RE_SV.test( stem ) ) {
    171 			word = stem;
    172 			if ( RE_ATBLIZ.test( word ) ) {
    173 				word += 'e';
    174 			} else if ( RE_DOUBLE.test( word ) ) {
    175 				word = replace( word, RE_LAST, '' );
    176 			} else if ( RE_CV.test( word ) ) {
    177 				word += 'e';
    178 			}
    179 		}
    180 	}
    181 
    182 	// Step 1c:
    183 	if ( RE_STEP1C.test( word ) ) {
    184 		fp = RE_STEP1C.exec( word );
    185 		stem = fp[ 1 ];
    186 		if ( RE_SV.test( stem ) ) {
    187 			word = stem + 'i';
    188 		}
    189 	}
    190 
    191 	// Step 2:
    192 	if ( RE_STEP2.test( word ) ) {
    193 		fp = RE_STEP2.exec( word );
    194 		stem = fp[ 1 ];
    195 		suffix = fp[ 2 ];
    196 		if ( RE_MGR0.test( stem ) ) {
    197 			word = stem + step2list[ suffix ];
    198 		}
    199 	}
    200 
    201 	// Step 3:
    202 	if ( RE_STEP3.test( word ) ) {
    203 		fp = RE_STEP3.exec( word );
    204 		stem = fp[ 1 ];
    205 		suffix = fp[ 2 ];
    206 		if ( RE_MGR0.test( stem ) ) {
    207 			word = stem + step3list[ suffix ];
    208 		}
    209 	}
    210 
    211 	// Step 4:
    212 	if ( RE_STEP4.test( word ) ) {
    213 		fp = RE_STEP4.exec( word );
    214 		stem = fp[ 1 ];
    215 		if ( RE_MGR1.test( stem ) ) {
    216 			word = stem;
    217 		}
    218 	} else if ( RE2_STEP4.test( word ) ) {
    219 		fp = RE2_STEP4.exec( word );
    220 		stem = fp[ 1 ] + fp[ 2 ];
    221 		if ( RE_MGR1.test( stem ) ) {
    222 			word = stem;
    223 		}
    224 	}
    225 
    226 	// Step 5:
    227 	if ( RE_STEP5.test( word ) ) {
    228 		fp = RE_STEP5.exec( word );
    229 		stem = fp[ 1 ];
    230 		if (
    231 			RE_MGR1.test( stem ) ||
    232 			( RE_MEQ1.test( stem ) && !( RE_CV.test( stem ) ) )
    233 		) {
    234 			word = stem;
    235 		}
    236 	}
    237 	if ( endsWith( word, 'll' ) && RE_MGR1.test( word ) ) {
    238 		word = replace( word, RE_LAST, '' );
    239 	}
    240 
    241 	// Turn initial Y back to y:
    242 	if ( firstch === 'y' ) {
    243 		word = firstch.toLowerCase() + word.substr( 1 );
    244 	}
    245 	return word;
    246 }
    247 
    248 
    249 // EXPORTS //
    250 
    251 module.exports = porterStemmer;