time-to-botec

Benchmark sampling in different programming languages
Log | Files | Refs | README

main.js (4236B)


      1 /**
      2 * @license Apache-2.0
      3 *
      4 * Copyright (c) 2018 The Stdlib Authors.
      5 *
      6 * Licensed under the Apache License, Version 2.0 (the "License");
      7 * you may not use this file except in compliance with the License.
      8 * You may obtain a copy of the License at
      9 *
     10 *    http://www.apache.org/licenses/LICENSE-2.0
     11 *
     12 * Unless required by applicable law or agreed to in writing, software
     13 * distributed under the License is distributed on an "AS IS" BASIS,
     14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     15 * See the License for the specific language governing permissions and
     16 * limitations under the License.
     17 */
     18 
     19 'use strict';
     20 
     21 // MODULES //
     22 
     23 var isString = require( '@stdlib/assert/is-string' ).isPrimitive;
     24 var format = require( './../../format' );
     25 
     26 
     27 // VARIABLES //
     28 
     29 // 2^6-1 = 63 => 0x3f => 00111111
     30 var Ox3F = 63|0;
     31 
     32 // 2^7 = 128 => 0x80 => 10000000
     33 var Ox80 = 128|0;
     34 
     35 // 192 => 0xc0 => 11000000
     36 var OxC0 = 192|0;
     37 
     38 // 224 => 0xe0 => 11100000
     39 var OxE0 = 224|0;
     40 
     41 // 240 => 0xf0 => 11110000
     42 var OxF0 = 240|0;
     43 
     44 // 2^10-1 = 1023 => 0x3ff => 00000011 11111111
     45 var Ox3FF = 1023|0;
     46 
     47 // 2^11 = 2048 => 0x800 => 00001000 00000000
     48 var Ox800 = 2048|0;
     49 
     50 // 55296 => 11011000 00000000
     51 var OxD800 = 55296|0;
     52 
     53 // 57344 => 11100000 00000000
     54 var OxE000 = 57344|0;
     55 
     56 // 2^16 = 65536 => 00000000 00000001 00000000 00000000
     57 var Ox10000 = 65536|0;
     58 
     59 
     60 // MAIN //
     61 
     62 /**
     63 * Converts a UTF-16 encoded string to an array of integers using UTF-8 encoding.
     64 *
     65 * ## Method
     66 *
     67 * -   UTF-8 is defined to encode code points in one to four bytes, depending on the number of significant bits in the numerical value of the code point.
     68 *
     69 * -   UTF-16 encoding uses one 16-bit unit for non-surrogates (U+0000 to U+D7FF and U+E000 to U+FFFF).
     70 *
     71 * -   UTF-16 encoding uses two 16-bit units (surrogate pairs) for U+10000 to U+10FFFF and encodes U+10000-U+10FFFF by subtracting 0x10000 from the code point, expressing the result as a 20-bit binary, and splitting the 20 bits of 0x0-0xFFFFF as upper and lower 10-bits. The respective 10-bits are stored in two 16-bit words.
     72 *
     73 * -   Let `N` be the number of significant bits.
     74 *
     75 * -   If `N <= 7` (i.e., U+0000 to U+007F), a code point is encoded in a single byte.
     76 *
     77 *     ```text
     78 *     0xxxxxxx
     79 *     ```
     80 *
     81 *     where an `x` refers to a code point bit.
     82 *
     83 * -   If `N <= 11` (i.e., U+0080 to U+07FF; ASCII characters), a code point is encoded in two bytes (5+6 bits).
     84 *
     85 *     ```text
     86 *     110xxxxx 10xxxxxx
     87 *     ```
     88 *
     89 * -   If `N <= 16` (i.e., U+0800 to U+FFFF), a code point is encoded in three bytes (4+6+6 bits).
     90 *
     91 *     ```text
     92 *     1110xxxx 10xxxxxx 10xxxxxx
     93 *     ```
     94 *
     95 * -   If `N <= 21` (i.e., U+10000 to U+10FFFF), a code point is encoded in four bytes (3+6+6+6 bits).
     96 *
     97 *     ```text
     98 *     11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
     99 *     ```
    100 *
    101 *
    102 * @param {string} str - string to convert
    103 * @throws {TypeError} must provide a string
    104 * @returns {Array} array of integers
    105 * @see [UTF-8]{@link https://en.wikipedia.org/wiki/UTF-8}
    106 * @see [Stack Overflow]{@link https://stackoverflow.com/questions/6240055/manually-converting-unicode-codepoints-into-utf-8-and-utf-16}
    107 *
    108 * @example
    109 * var str = '☃';
    110 * var out = utf16ToUTF8Array( str );
    111 * // returns [ 226, 152, 131 ]
    112 */
    113 function utf16ToUTF8Array( str ) {
    114 	var code;
    115 	var out;
    116 	var len;
    117 	var i;
    118 
    119 	if ( !isString( str ) ) {
    120 		throw new TypeError( format( 'invalid argument. Must provide a string. Value: `%s`.', str ) );
    121 	}
    122 	len = str.length;
    123 	out = [];
    124 	for ( i = 0; i < len; i++ ) {
    125 		code = str.charCodeAt( i );
    126 
    127 		// ASCII...
    128 		if ( code < Ox80 ) {
    129 			out.push( code );
    130 		}
    131 		// UTF-16 non-surrogate pair...
    132 		else if ( code < Ox800 ) {
    133 			out.push( OxC0 | (code>>6) );
    134 			out.push( Ox80 | (code & Ox3F) );
    135 		}
    136 		else if ( code < OxD800 || code >= OxE000 ) {
    137 			out.push( OxE0 | (code>>12) );
    138 			out.push( Ox80 | ((code>>6) & Ox3F) );
    139 			out.push( Ox80 | (code & Ox3F) );
    140 		}
    141 		// UTF-16 surrogate pair...
    142 		else {
    143 			i += 1;
    144 
    145 			// eslint-disable-next-line max-len
    146 			code = Ox10000 + (((code & Ox3FF)<<10) | (str.charCodeAt(i) & Ox3FF));
    147 
    148 			out.push( OxF0 | (code>>18) );
    149 			out.push( Ox80 | ((code>>12) & Ox3F) );
    150 			out.push( Ox80 | ((code>>6) & Ox3F) );
    151 			out.push( Ox80 | (code & Ox3F) );
    152 		}
    153 	}
    154 	return out;
    155 }
    156 
    157 
    158 // EXPORTS //
    159 
    160 module.exports = utf16ToUTF8Array;