main.js (6110B)
/**
* @license Apache-2.0
*
* Copyright (c) 2019 The Stdlib Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*    http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*
* ## Notice
*
* This code is a modification of an existing JavaScript implementation of the [Porter stemming algorithm]{@link https://tartarus.org/martin/PorterStemmer/}.
*
* ```text
* Release 1 be 'andargor', Jul 2004
* Release 2 (substantially revised) by Christopher McKenzie, Aug 2009
* ```
*/

'use strict';

// MODULES //

var isString = require( '@stdlib/assert/is-string' ).isPrimitive;
var endsWith = require( '@stdlib/string/ends-with' );
var lowercase = require( '@stdlib/string/lowercase' );
var replace = require( '@stdlib/string/replace' );


// VARIABLES //

// Step 2 lookup table (matched suffix => replacement):
var step2list = {
	'ational': 'ate',
	'tional': 'tion',
	'enci': 'ence',
	'anci': 'ance',
	'izer': 'ize',
	'bli': 'ble',
	'alli': 'al',
	'entli': 'ent',
	'eli': 'e',
	'ousli': 'ous',
	'ization': 'ize',
	'ation': 'ate',
	'ator': 'ate',
	'alism': 'al',
	'iveness': 'ive',
	'fulness': 'ful',
	'ousness': 'ous',
	'aliti': 'al',
	'iviti': 'ive',
	'biliti': 'ble',
	'logi': 'log'
};

// Step 3 lookup table (matched suffix => replacement; an empty string removes the suffix):
var step3list = {
	'icate': 'ic',
	'ative': '',
	'alize': 'al',
	'iciti': 'ic',
	'ical': 'ic',
	'ful': '',
	'ness': ''
};

// Regular expression building blocks:
var c = '[^aeiou]'; // consonant
var v = '[aeiouy]'; // vowel
var C = c + '[^aeiouy]*'; // consonant sequence
var V = v + '[aeiou]*'; // vowel sequence

// Stem consisting of a consonant sequence, a vowel, and a final character which is none of `a`, `e`, `i`, `o`, `u`, `w`, `x`, or `y`:
var RE_CV = new RegExp( '^' + C + v + '[^aeiouwxy]$' );

// Measure (`m`) conditions on a stem:
var mgr0 = '^(' + C + ')?' + V + C; // [C]VC... is m>0
var RE_MGR0 = new RegExp( mgr0 );
var meq1 = '^(' + C + ')?' + V + C + '(' + V + ')?$'; // [C]VC[V] is m=1
var RE_MEQ1 = new RegExp( meq1 );
var mgr1 = '^(' + C + ')?' + V + C + V + C; // [C]VCVC... is m>1
var RE_MGR1 = new RegExp( mgr1 );
var sV = '^(' + C + ')?' + v; // vowel in stem
var RE_SV = new RegExp( sV );

// Suffix patterns for each step (the lazy first capture group is the candidate stem):
var RE_STEP1A = /^(.+?)(ss|i)es$/;
var RE2_STEP1A = /^(.+?)([^s])s$/;
var RE_STEP1B = /^(.+?)eed$/;
var RE2_STEP1B = /^(.+?)(ed|ing)$/;
var RE_STEP1C = /^(.+?)y$/;
var RE_STEP2 = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/;
var RE_STEP3 = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/;
var RE_STEP4 = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/;
var RE2_STEP4 = /^(.+?)(s|t)(ion)$/;
var RE_STEP5 = /^(.+?)e$/;

// Matches the last character of a string (used to drop a single trailing character):
var RE_LAST = /.$/;

// Step 1b clean-up patterns:
var RE_ATBLIZ = /(at|bl|iz)$/;
var RE_DOUBLE = new RegExp( '([^aeiouylsz])\\1$' ); // trailing doubled character other than a vowel, `y`, `l`, `s`, or `z`


// MAIN //

/**
* Extracts the stem of a given word using the Porter stemming algorithm.
*
* ## Notes
*
* -   Words having fewer than three characters are returned unchanged (i.e., without being lowercased).
* -   All other words are converted to lowercase before suffix stripping.
*
* ## References
*
* -   Porter, Michael F. 1980. "An algorithm for suffix stripping." _Program_ 13 (3): 130–37. doi:[10.1108/eb046814][@porter:1980].
*
* [@porter:1980]: https://doi.org/10.1108/eb046814
*
* @param {string} word - input word
* @throws {TypeError} first argument must be a string primitive
* @returns {string} word stem
*
* @example
* var out = porterStemmer( 'walking' );
* // returns 'walk'
*
* @example
* var out = porterStemmer( 'walked' );
* // returns 'walk'
*
* @example
* var out = porterStemmer( 'walks' );
* // returns 'walk'
*
* @example
* var out = porterStemmer( 'worldwide' );
* // returns 'worldwid'
*
* @example
* var out = porterStemmer( '' );
* // returns ''
*/
function porterStemmer( word ) {
	var firstch;
	var stem;
	var fp;

	if ( !isString( word ) ) {
		throw new TypeError( 'invalid argument. First argument must be a string primitive. Value: `' + word + '`.' );
	}
	if ( word.length < 3 ) {
		return word;
	}
	word = lowercase( word );

	// Temporarily uppercase an initial `y` so that the vowel character class (which contains a lowercase `y`) does not match it:
	firstch = word[ 0 ];
	if ( firstch === 'y' ) {
		word = 'Y' + word.slice( 1 );
	}

	// Step 1a (`sses` => `ss`, `ies` => `i`, and removal of a trailing `s` which is not preceded by another `s`):
	if ( RE_STEP1A.test( word ) ) {
		word = replace( word, RE_STEP1A, '$1$2' );
	} else if ( RE2_STEP1A.test( word ) ) {
		word = replace( word, RE2_STEP1A, '$1$2' );
	}

	// Step 1b:
	fp = RE_STEP1B.exec( word );
	if ( fp ) {
		// `eed` => `ee`, but only when the stem has m>0:
		if ( RE_MGR0.test( fp[ 1 ] ) ) {
			word = replace( word, RE_LAST, '' );
		}
	} else {
		fp = RE2_STEP1B.exec( word );
		if ( fp ) {
			stem = fp[ 1 ];

			// Only strip `ed`/`ing` when the stem contains a vowel:
			if ( RE_SV.test( stem ) ) {
				word = stem;
				if ( RE_ATBLIZ.test( word ) ) {
					word += 'e';
				} else if ( RE_DOUBLE.test( word ) ) {
					// Collapse the trailing doubled character:
					word = replace( word, RE_LAST, '' );
				} else if ( RE_CV.test( word ) ) {
					word += 'e';
				}
			}
		}
	}

	// Step 1c (trailing `y` => `i` when the stem contains a vowel):
	fp = RE_STEP1C.exec( word );
	if ( fp ) {
		stem = fp[ 1 ];
		if ( RE_SV.test( stem ) ) {
			word = stem + 'i';
		}
	}

	// Step 2 (suffix replacement when the stem has m>0):
	fp = RE_STEP2.exec( word );
	if ( fp ) {
		stem = fp[ 1 ];
		if ( RE_MGR0.test( stem ) ) {
			word = stem + step2list[ fp[ 2 ] ];
		}
	}

	// Step 3 (suffix replacement when the stem has m>0):
	fp = RE_STEP3.exec( word );
	if ( fp ) {
		stem = fp[ 1 ];
		if ( RE_MGR0.test( stem ) ) {
			word = stem + step3list[ fp[ 2 ] ];
		}
	}

	// Step 4 (suffix removal when the stem has m>1):
	fp = RE_STEP4.exec( word );
	if ( fp ) {
		stem = fp[ 1 ];
		if ( RE_MGR1.test( stem ) ) {
			word = stem;
		}
	} else {
		fp = RE2_STEP4.exec( word );
		if ( fp ) {
			// For `sion`/`tion`, only `ion` is removed; the preceding `s` or `t` belongs to the stem:
			stem = fp[ 1 ] + fp[ 2 ];
			if ( RE_MGR1.test( stem ) ) {
				word = stem;
			}
		}
	}

	// Step 5 (remove a trailing `e` when the stem has m>1, or when the stem has m=1 and does not match `RE_CV`):
	fp = RE_STEP5.exec( word );
	if ( fp ) {
		stem = fp[ 1 ];
		if (
			RE_MGR1.test( stem ) ||
			( RE_MEQ1.test( stem ) && !RE_CV.test( stem ) )
		) {
			word = stem;
		}
	}

	// Trailing `ll` => `l` when the word has m>1:
	if ( endsWith( word, 'll' ) && RE_MGR1.test( word ) ) {
		word = replace( word, RE_LAST, '' );
	}

	// Turn initial Y back to y:
	if ( firstch === 'y' ) {
		word = firstch + word.slice( 1 );
	}
	return word;
}


// EXPORTS //

module.exports = porterStemmer;