string_decoder.js (9465B)
// Copyright Joyent, Inc. and other Node contributors.
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the
// "Software"), to deal in the Software without restriction, including
// without limitation the rights to use, copy, modify, merge, publish,
// distribute, sublicense, and/or sell copies of the Software, and to permit
// persons to whom the Software is furnished to do so, subject to the
// following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
// NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
// OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
// USE OR OTHER DEALINGS IN THE SOFTWARE.
21 22 'use strict'; 23 24 /*<replacement>*/ 25 26 var Buffer = require('safe-buffer').Buffer; 27 /*</replacement>*/ 28 29 var isEncoding = Buffer.isEncoding || function (encoding) { 30 encoding = '' + encoding; 31 switch (encoding && encoding.toLowerCase()) { 32 case 'hex':case 'utf8':case 'utf-8':case 'ascii':case 'binary':case 'base64':case 'ucs2':case 'ucs-2':case 'utf16le':case 'utf-16le':case 'raw': 33 return true; 34 default: 35 return false; 36 } 37 }; 38 39 function _normalizeEncoding(enc) { 40 if (!enc) return 'utf8'; 41 var retried; 42 while (true) { 43 switch (enc) { 44 case 'utf8': 45 case 'utf-8': 46 return 'utf8'; 47 case 'ucs2': 48 case 'ucs-2': 49 case 'utf16le': 50 case 'utf-16le': 51 return 'utf16le'; 52 case 'latin1': 53 case 'binary': 54 return 'latin1'; 55 case 'base64': 56 case 'ascii': 57 case 'hex': 58 return enc; 59 default: 60 if (retried) return; // undefined 61 enc = ('' + enc).toLowerCase(); 62 retried = true; 63 } 64 } 65 }; 66 67 // Do not cache `Buffer.isEncoding` when checking encoding names as some 68 // modules monkey-patch it to support additional encodings 69 function normalizeEncoding(enc) { 70 var nenc = _normalizeEncoding(enc); 71 if (typeof nenc !== 'string' && (Buffer.isEncoding === isEncoding || !isEncoding(enc))) throw new Error('Unknown encoding: ' + enc); 72 return nenc || enc; 73 } 74 75 // StringDecoder provides an interface for efficiently splitting a series of 76 // buffers into a series of JS strings without breaking apart multi-byte 77 // characters. 
exports.StringDecoder = StringDecoder;

// Constructor. Normalizes `encoding` and installs the per-encoding
// `text`/`end`/`fillLast`/`write` implementations on the instance.
// utf16le, utf8 and base64 get a small scratch buffer (`lastChar`) plus the
// `lastNeed`/`lastTotal` counters used to carry a partial character across
// writes; every other encoding uses the stateless simpleWrite/simpleEnd pair
// and returns before that state is created.
function StringDecoder(encoding) {
  this.encoding = normalizeEncoding(encoding);
  var nb;
  switch (this.encoding) {
    case 'utf16le':
      this.text = utf16Text;
      this.end = utf16End;
      nb = 4;
      break;
    case 'utf8':
      this.fillLast = utf8FillLast;
      nb = 4;
      break;
    case 'base64':
      this.text = base64Text;
      this.end = base64End;
      nb = 3;
      break;
    default:
      this.write = simpleWrite;
      this.end = simpleEnd;
      return;
  }
  this.lastNeed = 0;
  this.lastTotal = 0;
  this.lastChar = Buffer.allocUnsafe(nb);
}

// Decodes `buf` and returns the resulting string. If a partial character is
// pending (`lastNeed` is non-zero) it is completed first via `fillLast`; the
// remainder of `buf` is then decoded by `text`.
StringDecoder.prototype.write = function (buf) {
  // A string is already decoded: hand it back untouched instead of pushing
  // it through the byte-oriented paths, which would store garbage in
  // `lastChar` and corrupt `lastNeed` for utf16le/base64.
  if (typeof buf === 'string') return buf;
  if (buf.length === 0) return '';
  var r;
  var i;
  if (this.lastNeed) {
    r = this.fillLast(buf);
    // undefined means `buf` was consumed without completing the character.
    if (r === undefined) return '';
    // `lastNeed` bytes of `buf` were used up by fillLast; resume after them.
    i = this.lastNeed;
    this.lastNeed = 0;
  } else {
    i = 0;
  }
  if (i < buf.length) return r ? r + this.text(buf, i) : this.text(buf, i);
  return r || '';
};

StringDecoder.prototype.end = utf8End;

// Returns only complete characters in a Buffer
StringDecoder.prototype.text = utf8Text;

// Attempts to complete a partial non-UTF-8 character using bytes from a Buffer
StringDecoder.prototype.fillLast = function (buf) {
  if (this.lastNeed <= buf.length) {
    buf.copy(this.lastChar, this.lastTotal - this.lastNeed, 0, this.lastNeed);
    return this.lastChar.toString(this.encoding, 0, this.lastTotal);
  }
  // Not enough bytes yet: stash what we have and keep waiting.
  buf.copy(this.lastChar, this.lastTotal - this.lastNeed, 0, buf.length);
  this.lastNeed -= buf.length;
};

// Checks the type of a UTF-8 byte, whether it's ASCII, a leading byte, or a
// continuation byte. Returns 0 for ASCII, 2/3/4 for a leading byte of a
// sequence of that length, -1 for a continuation byte, and -2 for an invalid
// byte.
function utf8CheckByte(byte) {
  if (byte <= 0x7F) return 0;
  else if (byte >> 5 === 0x06) return 2;
  else if (byte >> 4 === 0x0E) return 3;
  else if (byte >> 3 === 0x1E) return 4;
  return byte >> 6 === 0x02 ? -1 : -2;
}

// Checks at most 3 bytes at the end of a Buffer in order to detect an
// incomplete multi-byte UTF-8 character. The total number of bytes (2, 3, or 4)
// needed to complete the UTF-8 character (if applicable) are returned, and
// `self.lastNeed` is set to the number of bytes still missing.
function utf8CheckIncomplete(self, buf, i) {
  var j = buf.length - 1;
  if (j < i) return 0;
  var nb = utf8CheckByte(buf[j]);
  if (nb >= 0) {
    if (nb > 0) self.lastNeed = nb - 1;
    return nb;
  }
  if (--j < i || nb === -2) return 0;
  nb = utf8CheckByte(buf[j]);
  if (nb >= 0) {
    if (nb > 0) self.lastNeed = nb - 2;
    return nb;
  }
  if (--j < i || nb === -2) return 0;
  nb = utf8CheckByte(buf[j]);
  if (nb >= 0) {
    if (nb > 0) {
      // A 2-byte lead followed by two continuation bytes is already complete
      // (and over-long), so nothing is pending.
      if (nb === 2) nb = 0;
      else self.lastNeed = nb - 3;
    }
    return nb;
  }
  return 0;
}

// Validates as many continuation bytes for a multi-byte UTF-8 character as
// needed or are available. If we see a non-continuation byte where we expect
// one, we "replace" the validated continuation bytes we've seen so far with
// a single UTF-8 replacement character ('\ufffd'), to match v8's UTF-8 decoding
// behavior. The continuation byte check is included three times in the case
// where all of the continuation bytes for a character exist in the same buffer.
// It is also done this way as a slight performance increase instead of using a
// loop. Returns undefined when every inspected byte is a continuation byte.
// The `p` argument is accepted but not used here.
function utf8CheckExtraBytes(self, buf, p) {
  if ((buf[0] & 0xC0) !== 0x80) {
    self.lastNeed = 0;
    return '\ufffd';
  }
  if (self.lastNeed > 1 && buf.length > 1) {
    if ((buf[1] & 0xC0) !== 0x80) {
      self.lastNeed = 1;
      return '\ufffd';
    }
    if (self.lastNeed > 2 && buf.length > 2) {
      if ((buf[2] & 0xC0) !== 0x80) {
        self.lastNeed = 2;
        return '\ufffd';
      }
    }
  }
}

// Attempts to complete a multi-byte UTF-8 character using bytes from a Buffer.
function utf8FillLast(buf) {
  // Offset in `lastChar` where the next missing byte belongs.
  var p = this.lastTotal - this.lastNeed;
  // Bail out with a replacement character if `buf` does not start with the
  // continuation bytes the pending character is waiting for.
  var r = utf8CheckExtraBytes(this, buf, p);
  if (r !== undefined) return r;
  if (this.lastNeed <= buf.length) {
    // Enough bytes to finish the character: complete it and decode it.
    buf.copy(this.lastChar, p, 0, this.lastNeed);
    return this.lastChar.toString(this.encoding, 0, this.lastTotal);
  }
  // Still short: keep all of `buf` and wait for more (returns undefined).
  buf.copy(this.lastChar, p, 0, buf.length);
  this.lastNeed -= buf.length;
}

// Returns all complete UTF-8 characters in a Buffer. If the Buffer ended on a
// partial character, the character's bytes are buffered until the required
// number of bytes are available.
function utf8Text(buf, i) {
  var total = utf8CheckIncomplete(this, buf, i);
  if (!this.lastNeed) return buf.toString('utf8', i);
  this.lastTotal = total;
  // `total - lastNeed` bytes of the partial character are already in `buf`.
  var end = buf.length - (total - this.lastNeed);
  buf.copy(this.lastChar, 0, end);
  return buf.toString('utf8', i, end);
}

// For UTF-8, a replacement character is added when ending on a partial
// character. The pending-character state is cleared afterwards so that the
// decoder can be reused for new input once `end()` has been called.
function utf8End(buf) {
  var r = buf && buf.length ? this.write(buf) : '';
  if (this.lastNeed) {
    this.lastNeed = 0;
    return r + '\ufffd';
  }
  return r;
}

// UTF-16LE typically needs two bytes per character, but even if we have an even
// number of bytes available, we need to check if we end on a leading/high
// surrogate. In that case, we need to wait for the next two bytes in order to
// decode the last character properly.
function utf16Text(buf, i) {
  if ((buf.length - i) % 2 === 0) {
    var r = buf.toString('utf16le', i);
    if (r) {
      var c = r.charCodeAt(r.length - 1);
      if (c >= 0xD800 && c <= 0xDBFF) {
        // Ends on a high surrogate: hold its two bytes back and wait for the
        // two bytes of the low surrogate.
        this.lastNeed = 2;
        this.lastTotal = 4;
        this.lastChar[0] = buf[buf.length - 2];
        this.lastChar[1] = buf[buf.length - 1];
        return r.slice(0, -1);
      }
    }
    return r;
  }
  // Odd number of bytes: the last byte is half of a code unit.
  this.lastNeed = 1;
  this.lastTotal = 2;
  this.lastChar[0] = buf[buf.length - 1];
  return buf.toString('utf16le', i, buf.length - 1);
}

// For UTF-16LE we do not explicitly append special replacement characters if we
// end on a partial character, we simply let v8 handle that. The pending state
// is cleared afterwards so that the decoder can be reused after `end()`.
function utf16End(buf) {
  var r = buf && buf.length ? this.write(buf) : '';
  if (this.lastNeed) {
    // Number of buffered bytes; must be read before `lastNeed` is reset.
    var end = this.lastTotal - this.lastNeed;
    this.lastNeed = 0;
    return r + this.lastChar.toString('utf16le', 0, end);
  }
  return r;
}

// Encodes the bytes of `buf` from offset `i` as base64 in whole 3-byte
// groups; the 1 or 2 leftover bytes are buffered in `lastChar` until the
// group can be completed.
function base64Text(buf, i) {
  var n = (buf.length - i) % 3;
  if (n === 0) return buf.toString('base64', i);
  this.lastNeed = 3 - n;
  this.lastTotal = 3;
  if (n === 1) {
    this.lastChar[0] = buf[buf.length - 1];
  } else {
    this.lastChar[0] = buf[buf.length - 2];
    this.lastChar[1] = buf[buf.length - 1];
  }
  return buf.toString('base64', i, buf.length - n);
}

// Flushes any buffered leftover bytes as a final base64 group, then clears
// the pending state so that the decoder can be reused after `end()`.
function base64End(buf) {
  var r = buf && buf.length ? this.write(buf) : '';
  if (this.lastNeed) {
    // Number of buffered bytes; must be read before `lastNeed` is reset.
    var end = 3 - this.lastNeed;
    this.lastNeed = 0;
    return r + this.lastChar.toString('base64', 0, end);
  }
  return r;
}

// Pass bytes on through for single-byte encodings (e.g. ascii, latin1, hex)
function simpleWrite(buf) {
  return buf.toString(this.encoding);
}

// Decodes an optional final chunk; there is never any buffered state to flush.
function simpleEnd(buf) {
  return buf && buf.length ? this.write(buf) : '';
}