main.js (4236B)
1 /** 2 * @license Apache-2.0 3 * 4 * Copyright (c) 2018 The Stdlib Authors. 5 * 6 * Licensed under the Apache License, Version 2.0 (the "License"); 7 * you may not use this file except in compliance with the License. 8 * You may obtain a copy of the License at 9 * 10 * http://www.apache.org/licenses/LICENSE-2.0 11 * 12 * Unless required by applicable law or agreed to in writing, software 13 * distributed under the License is distributed on an "AS IS" BASIS, 14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 * See the License for the specific language governing permissions and 16 * limitations under the License. 17 */ 18 19 'use strict'; 20 21 // MODULES // 22 23 var isString = require( '@stdlib/assert/is-string' ).isPrimitive; 24 var format = require( './../../format' ); 25 26 27 // VARIABLES // 28 29 // 2^6-1 = 63 => 0x3f => 00111111 30 var Ox3F = 63|0; 31 32 // 2^7 = 128 => 0x80 => 10000000 33 var Ox80 = 128|0; 34 35 // 192 => 0xc0 => 11000000 36 var OxC0 = 192|0; 37 38 // 224 => 0xe0 => 11100000 39 var OxE0 = 224|0; 40 41 // 240 => 0xf0 => 11110000 42 var OxF0 = 240|0; 43 44 // 2^10-1 = 1023 => 0x3ff => 00000011 11111111 45 var Ox3FF = 1023|0; 46 47 // 2^11 = 2048 => 0x800 => 00001000 00000000 48 var Ox800 = 2048|0; 49 50 // 55296 => 11011000 00000000 51 var OxD800 = 55296|0; 52 53 // 57344 => 11100000 00000000 54 var OxE000 = 57344|0; 55 56 // 2^16 = 65536 => 00000000 00000001 00000000 00000000 57 var Ox10000 = 65536|0; 58 59 60 // MAIN // 61 62 /** 63 * Converts a UTF-16 encoded string to an array of integers using UTF-8 encoding. 64 * 65 * ## Method 66 * 67 * - UTF-8 is defined to encode code points in one to four bytes, depending on the number of significant bits in the numerical value of the code point. 68 * 69 * - UTF-16 encoding uses one 16-bit unit for non-surrogates (U+0000 to U+D7FF and U+E000 to U+FFFF). 70 * 71 * - UTF-16 encoding uses two 16-bit units (surrogate pairs) for U+10000 to U+10FFFF and encodes U+10000-U+10FFFF by subtracting 0x10000 from the code point, expressing the result as a 20-bit binary, and splitting the 20 bits of 0x0-0xFFFFF as upper and lower 10-bits. The respective 10-bits are stored in two 16-bit words. 72 * 73 * - Let `N` be the number of significant bits. 74 * 75 * - If `N <= 7` (i.e., U+0000 to U+007F), a code point is encoded in a single byte. 76 * 77 * ```text 78 * 0xxxxxxx 79 * ``` 80 * 81 * where an `x` refers to a code point bit. 82 * 83 * - If `N <= 11` (i.e., U+0080 to U+07FF; ASCII characters), a code point is encoded in two bytes (5+6 bits). 84 * 85 * ```text 86 * 110xxxxx 10xxxxxx 87 * ``` 88 * 89 * - If `N <= 16` (i.e., U+0800 to U+FFFF), a code point is encoded in three bytes (4+6+6 bits). 90 * 91 * ```text 92 * 1110xxxx 10xxxxxx 10xxxxxx 93 * ``` 94 * 95 * - If `N <= 21` (i.e., U+10000 to U+10FFFF), a code point is encoded in four bytes (3+6+6+6 bits). 96 * 97 * ```text 98 * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 99 * ``` 100 * 101 * 102 * @param {string} str - string to convert 103 * @throws {TypeError} must provide a string 104 * @returns {Array} array of integers 105 * @see [UTF-8]{@link https://en.wikipedia.org/wiki/UTF-8} 106 * @see [Stack Overflow]{@link https://stackoverflow.com/questions/6240055/manually-converting-unicode-codepoints-into-utf-8-and-utf-16} 107 * 108 * @example 109 * var str = '☃'; 110 * var out = utf16ToUTF8Array( str ); 111 * // returns [ 226, 152, 131 ] 112 */ 113 function utf16ToUTF8Array( str ) { 114 var code; 115 var out; 116 var len; 117 var i; 118 119 if ( !isString( str ) ) { 120 throw new TypeError( format( 'invalid argument. Must provide a string. Value: `%s`.', str ) ); 121 } 122 len = str.length; 123 out = []; 124 for ( i = 0; i < len; i++ ) { 125 code = str.charCodeAt( i ); 126 127 // ASCII... 128 if ( code < Ox80 ) { 129 out.push( code ); 130 } 131 // UTF-16 non-surrogate pair... 132 else if ( code < Ox800 ) { 133 out.push( OxC0 | (code>>6) ); 134 out.push( Ox80 | (code & Ox3F) ); 135 } 136 else if ( code < OxD800 || code >= OxE000 ) { 137 out.push( OxE0 | (code>>12) ); 138 out.push( Ox80 | ((code>>6) & Ox3F) ); 139 out.push( Ox80 | (code & Ox3F) ); 140 } 141 // UTF-16 surrogate pair... 142 else { 143 i += 1; 144 145 // eslint-disable-next-line max-len 146 code = Ox10000 + (((code & Ox3FF)<<10) | (str.charCodeAt(i) & Ox3FF)); 147 148 out.push( OxF0 | (code>>18) ); 149 out.push( Ox80 | ((code>>12) & Ox3F) ); 150 out.push( Ox80 | ((code>>6) & Ox3F) ); 151 out.push( Ox80 | (code & Ox3F) ); 152 } 153 } 154 return out; 155 } 156 157 158 // EXPORTS // 159 160 module.exports = utf16ToUTF8Array;