time-to-botec

Benchmark sampling in different programming languages
Log | Files | Refs | README

_unicodeWords.js (3060B)


      1 /** Used to compose unicode character classes. */
      2 var rsAstralRange = '\\ud800-\\udfff',
      3     rsComboMarksRange = '\\u0300-\\u036f',
      4     reComboHalfMarksRange = '\\ufe20-\\ufe2f',
      5     rsComboSymbolsRange = '\\u20d0-\\u20ff',
      6     rsComboRange = rsComboMarksRange + reComboHalfMarksRange + rsComboSymbolsRange,
      7     rsDingbatRange = '\\u2700-\\u27bf',
      8     rsLowerRange = 'a-z\\xdf-\\xf6\\xf8-\\xff',
      9     rsMathOpRange = '\\xac\\xb1\\xd7\\xf7',
     10     rsNonCharRange = '\\x00-\\x2f\\x3a-\\x40\\x5b-\\x60\\x7b-\\xbf',
     11     rsPunctuationRange = '\\u2000-\\u206f',
     12     rsSpaceRange = ' \\t\\x0b\\f\\xa0\\ufeff\\n\\r\\u2028\\u2029\\u1680\\u180e\\u2000\\u2001\\u2002\\u2003\\u2004\\u2005\\u2006\\u2007\\u2008\\u2009\\u200a\\u202f\\u205f\\u3000',
     13     rsUpperRange = 'A-Z\\xc0-\\xd6\\xd8-\\xde',
     14     rsVarRange = '\\ufe0e\\ufe0f',
     15     rsBreakRange = rsMathOpRange + rsNonCharRange + rsPunctuationRange + rsSpaceRange;
     16 
     17 /** Used to compose unicode capture groups. */
     18 var rsApos = "['\u2019]",
     19     rsBreak = '[' + rsBreakRange + ']',
     20     rsCombo = '[' + rsComboRange + ']',
     21     rsDigits = '\\d+',
     22     rsDingbat = '[' + rsDingbatRange + ']',
     23     rsLower = '[' + rsLowerRange + ']',
     24     rsMisc = '[^' + rsAstralRange + rsBreakRange + rsDigits + rsDingbatRange + rsLowerRange + rsUpperRange + ']',
     25     rsFitz = '\\ud83c[\\udffb-\\udfff]',
     26     rsModifier = '(?:' + rsCombo + '|' + rsFitz + ')',
     27     rsNonAstral = '[^' + rsAstralRange + ']',
     28     rsRegional = '(?:\\ud83c[\\udde6-\\uddff]){2}',
     29     rsSurrPair = '[\\ud800-\\udbff][\\udc00-\\udfff]',
     30     rsUpper = '[' + rsUpperRange + ']',
     31     rsZWJ = '\\u200d';
     32 
     33 /** Used to compose unicode regexes. */
     34 var rsMiscLower = '(?:' + rsLower + '|' + rsMisc + ')',
     35     rsMiscUpper = '(?:' + rsUpper + '|' + rsMisc + ')',
     36     rsOptContrLower = '(?:' + rsApos + '(?:d|ll|m|re|s|t|ve))?',
     37     rsOptContrUpper = '(?:' + rsApos + '(?:D|LL|M|RE|S|T|VE))?',
     38     reOptMod = rsModifier + '?',
     39     rsOptVar = '[' + rsVarRange + ']?',
     40     rsOptJoin = '(?:' + rsZWJ + '(?:' + [rsNonAstral, rsRegional, rsSurrPair].join('|') + ')' + rsOptVar + reOptMod + ')*',
     41     rsOrdLower = '\\d*(?:1st|2nd|3rd|(?![123])\\dth)(?=\\b|[A-Z_])',
     42     rsOrdUpper = '\\d*(?:1ST|2ND|3RD|(?![123])\\dTH)(?=\\b|[a-z_])',
     43     rsSeq = rsOptVar + reOptMod + rsOptJoin,
     44     rsEmoji = '(?:' + [rsDingbat, rsRegional, rsSurrPair].join('|') + ')' + rsSeq;
     45 
     46 /** Used to match complex or compound words. */
     47 var reUnicodeWord = RegExp([
     48   rsUpper + '?' + rsLower + '+' + rsOptContrLower + '(?=' + [rsBreak, rsUpper, '$'].join('|') + ')',
     49   rsMiscUpper + '+' + rsOptContrUpper + '(?=' + [rsBreak, rsUpper + rsMiscLower, '$'].join('|') + ')',
     50   rsUpper + '?' + rsMiscLower + '+' + rsOptContrLower,
     51   rsUpper + '+' + rsOptContrUpper,
     52   rsOrdUpper,
     53   rsOrdLower,
     54   rsDigits,
     55   rsEmoji
     56 ].join('|'), 'g');
     57 
     58 /**
     59  * Splits a Unicode `string` into an array of its words.
     60  *
     61  * @private
     62  * @param {string} The string to inspect.
     63  * @returns {Array} Returns the words of `string`.
     64  */
     65 function unicodeWords(string) {
     66   return string.match(reUnicodeWord) || [];
     67 }
     68 
     69 module.exports = unicodeWords;