time-to-botec

Benchmark sampling in different programming languages
Log | Files | Refs | README

string_decoder.js (9465B)


      1 // Copyright Joyent, Inc. and other Node contributors.
      2 //
      3 // Permission is hereby granted, free of charge, to any person obtaining a
      4 // copy of this software and associated documentation files (the
      5 // "Software"), to deal in the Software without restriction, including
      6 // without limitation the rights to use, copy, modify, merge, publish,
      7 // distribute, sublicense, and/or sell copies of the Software, and to permit
      8 // persons to whom the Software is furnished to do so, subject to the
      9 // following conditions:
     10 //
     11 // The above copyright notice and this permission notice shall be included
     12 // in all copies or substantial portions of the Software.
     13 //
     14 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
     15 // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
     16 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
     17 // NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
     18 // DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
     19 // OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
     20 // USE OR OTHER DEALINGS IN THE SOFTWARE.
     21 
     22 'use strict';
     23 
     24 /*<replacement>*/
     25 
     26 var Buffer = require('safe-buffer').Buffer;
     27 /*</replacement>*/
     28 
     29 var isEncoding = Buffer.isEncoding || function (encoding) {
     30   encoding = '' + encoding;
     31   switch (encoding && encoding.toLowerCase()) {
     32     case 'hex':case 'utf8':case 'utf-8':case 'ascii':case 'binary':case 'base64':case 'ucs2':case 'ucs-2':case 'utf16le':case 'utf-16le':case 'raw':
     33       return true;
     34     default:
     35       return false;
     36   }
     37 };
     38 
     39 function _normalizeEncoding(enc) {
     40   if (!enc) return 'utf8';
     41   var retried;
     42   while (true) {
     43     switch (enc) {
     44       case 'utf8':
     45       case 'utf-8':
     46         return 'utf8';
     47       case 'ucs2':
     48       case 'ucs-2':
     49       case 'utf16le':
     50       case 'utf-16le':
     51         return 'utf16le';
     52       case 'latin1':
     53       case 'binary':
     54         return 'latin1';
     55       case 'base64':
     56       case 'ascii':
     57       case 'hex':
     58         return enc;
     59       default:
     60         if (retried) return; // undefined
     61         enc = ('' + enc).toLowerCase();
     62         retried = true;
     63     }
     64   }
     65 };
     66 
     67 // Do not cache `Buffer.isEncoding` when checking encoding names as some
     68 // modules monkey-patch it to support additional encodings
     69 function normalizeEncoding(enc) {
     70   var nenc = _normalizeEncoding(enc);
     71   if (typeof nenc !== 'string' && (Buffer.isEncoding === isEncoding || !isEncoding(enc))) throw new Error('Unknown encoding: ' + enc);
     72   return nenc || enc;
     73 }
     74 
     75 // StringDecoder provides an interface for efficiently splitting a series of
     76 // buffers into a series of JS strings without breaking apart multi-byte
     77 // characters.
     78 exports.StringDecoder = StringDecoder;
     79 function StringDecoder(encoding) {
     80   this.encoding = normalizeEncoding(encoding);
     81   var nb;
     82   switch (this.encoding) {
     83     case 'utf16le':
     84       this.text = utf16Text;
     85       this.end = utf16End;
     86       nb = 4;
     87       break;
     88     case 'utf8':
     89       this.fillLast = utf8FillLast;
     90       nb = 4;
     91       break;
     92     case 'base64':
     93       this.text = base64Text;
     94       this.end = base64End;
     95       nb = 3;
     96       break;
     97     default:
     98       this.write = simpleWrite;
     99       this.end = simpleEnd;
    100       return;
    101   }
    102   this.lastNeed = 0;
    103   this.lastTotal = 0;
    104   this.lastChar = Buffer.allocUnsafe(nb);
    105 }
    106 
    107 StringDecoder.prototype.write = function (buf) {
    108   if (buf.length === 0) return '';
    109   var r;
    110   var i;
    111   if (this.lastNeed) {
    112     r = this.fillLast(buf);
    113     if (r === undefined) return '';
    114     i = this.lastNeed;
    115     this.lastNeed = 0;
    116   } else {
    117     i = 0;
    118   }
    119   if (i < buf.length) return r ? r + this.text(buf, i) : this.text(buf, i);
    120   return r || '';
    121 };
    122 
    123 StringDecoder.prototype.end = utf8End;
    124 
    125 // Returns only complete characters in a Buffer
    126 StringDecoder.prototype.text = utf8Text;
    127 
    128 // Attempts to complete a partial non-UTF-8 character using bytes from a Buffer
    129 StringDecoder.prototype.fillLast = function (buf) {
    130   if (this.lastNeed <= buf.length) {
    131     buf.copy(this.lastChar, this.lastTotal - this.lastNeed, 0, this.lastNeed);
    132     return this.lastChar.toString(this.encoding, 0, this.lastTotal);
    133   }
    134   buf.copy(this.lastChar, this.lastTotal - this.lastNeed, 0, buf.length);
    135   this.lastNeed -= buf.length;
    136 };
    137 
    138 // Checks the type of a UTF-8 byte, whether it's ASCII, a leading byte, or a
    139 // continuation byte. If an invalid byte is detected, -2 is returned.
    140 function utf8CheckByte(byte) {
    141   if (byte <= 0x7F) return 0;else if (byte >> 5 === 0x06) return 2;else if (byte >> 4 === 0x0E) return 3;else if (byte >> 3 === 0x1E) return 4;
    142   return byte >> 6 === 0x02 ? -1 : -2;
    143 }
    144 
    145 // Checks at most 3 bytes at the end of a Buffer in order to detect an
    146 // incomplete multi-byte UTF-8 character. The total number of bytes (2, 3, or 4)
    147 // needed to complete the UTF-8 character (if applicable) are returned.
    148 function utf8CheckIncomplete(self, buf, i) {
    149   var j = buf.length - 1;
    150   if (j < i) return 0;
    151   var nb = utf8CheckByte(buf[j]);
    152   if (nb >= 0) {
    153     if (nb > 0) self.lastNeed = nb - 1;
    154     return nb;
    155   }
    156   if (--j < i || nb === -2) return 0;
    157   nb = utf8CheckByte(buf[j]);
    158   if (nb >= 0) {
    159     if (nb > 0) self.lastNeed = nb - 2;
    160     return nb;
    161   }
    162   if (--j < i || nb === -2) return 0;
    163   nb = utf8CheckByte(buf[j]);
    164   if (nb >= 0) {
    165     if (nb > 0) {
    166       if (nb === 2) nb = 0;else self.lastNeed = nb - 3;
    167     }
    168     return nb;
    169   }
    170   return 0;
    171 }
    172 
    173 // Validates as many continuation bytes for a multi-byte UTF-8 character as
    174 // needed or are available. If we see a non-continuation byte where we expect
    175 // one, we "replace" the validated continuation bytes we've seen so far with
    176 // a single UTF-8 replacement character ('\ufffd'), to match v8's UTF-8 decoding
    177 // behavior. The continuation byte check is included three times in the case
    178 // where all of the continuation bytes for a character exist in the same buffer.
    179 // It is also done this way as a slight performance increase instead of using a
    180 // loop.
    181 function utf8CheckExtraBytes(self, buf, p) {
    182   if ((buf[0] & 0xC0) !== 0x80) {
    183     self.lastNeed = 0;
    184     return '\ufffd';
    185   }
    186   if (self.lastNeed > 1 && buf.length > 1) {
    187     if ((buf[1] & 0xC0) !== 0x80) {
    188       self.lastNeed = 1;
    189       return '\ufffd';
    190     }
    191     if (self.lastNeed > 2 && buf.length > 2) {
    192       if ((buf[2] & 0xC0) !== 0x80) {
    193         self.lastNeed = 2;
    194         return '\ufffd';
    195       }
    196     }
    197   }
    198 }
    199 
    200 // Attempts to complete a multi-byte UTF-8 character using bytes from a Buffer.
    201 function utf8FillLast(buf) {
    202   var p = this.lastTotal - this.lastNeed;
    203   var r = utf8CheckExtraBytes(this, buf, p);
    204   if (r !== undefined) return r;
    205   if (this.lastNeed <= buf.length) {
    206     buf.copy(this.lastChar, p, 0, this.lastNeed);
    207     return this.lastChar.toString(this.encoding, 0, this.lastTotal);
    208   }
    209   buf.copy(this.lastChar, p, 0, buf.length);
    210   this.lastNeed -= buf.length;
    211 }
    212 
    213 // Returns all complete UTF-8 characters in a Buffer. If the Buffer ended on a
    214 // partial character, the character's bytes are buffered until the required
    215 // number of bytes are available.
    216 function utf8Text(buf, i) {
    217   var total = utf8CheckIncomplete(this, buf, i);
    218   if (!this.lastNeed) return buf.toString('utf8', i);
    219   this.lastTotal = total;
    220   var end = buf.length - (total - this.lastNeed);
    221   buf.copy(this.lastChar, 0, end);
    222   return buf.toString('utf8', i, end);
    223 }
    224 
    225 // For UTF-8, a replacement character is added when ending on a partial
    226 // character.
    227 function utf8End(buf) {
    228   var r = buf && buf.length ? this.write(buf) : '';
    229   if (this.lastNeed) return r + '\ufffd';
    230   return r;
    231 }
    232 
    233 // UTF-16LE typically needs two bytes per character, but even if we have an even
    234 // number of bytes available, we need to check if we end on a leading/high
    235 // surrogate. In that case, we need to wait for the next two bytes in order to
    236 // decode the last character properly.
    237 function utf16Text(buf, i) {
    238   if ((buf.length - i) % 2 === 0) {
    239     var r = buf.toString('utf16le', i);
    240     if (r) {
    241       var c = r.charCodeAt(r.length - 1);
    242       if (c >= 0xD800 && c <= 0xDBFF) {
    243         this.lastNeed = 2;
    244         this.lastTotal = 4;
    245         this.lastChar[0] = buf[buf.length - 2];
    246         this.lastChar[1] = buf[buf.length - 1];
    247         return r.slice(0, -1);
    248       }
    249     }
    250     return r;
    251   }
    252   this.lastNeed = 1;
    253   this.lastTotal = 2;
    254   this.lastChar[0] = buf[buf.length - 1];
    255   return buf.toString('utf16le', i, buf.length - 1);
    256 }
    257 
    258 // For UTF-16LE we do not explicitly append special replacement characters if we
    259 // end on a partial character, we simply let v8 handle that.
    260 function utf16End(buf) {
    261   var r = buf && buf.length ? this.write(buf) : '';
    262   if (this.lastNeed) {
    263     var end = this.lastTotal - this.lastNeed;
    264     return r + this.lastChar.toString('utf16le', 0, end);
    265   }
    266   return r;
    267 }
    268 
    269 function base64Text(buf, i) {
    270   var n = (buf.length - i) % 3;
    271   if (n === 0) return buf.toString('base64', i);
    272   this.lastNeed = 3 - n;
    273   this.lastTotal = 3;
    274   if (n === 1) {
    275     this.lastChar[0] = buf[buf.length - 1];
    276   } else {
    277     this.lastChar[0] = buf[buf.length - 2];
    278     this.lastChar[1] = buf[buf.length - 1];
    279   }
    280   return buf.toString('base64', i, buf.length - n);
    281 }
    282 
    283 function base64End(buf) {
    284   var r = buf && buf.length ? this.write(buf) : '';
    285   if (this.lastNeed) return r + this.lastChar.toString('base64', 0, 3 - this.lastNeed);
    286   return r;
    287 }
    288 
    289 // Pass bytes on through for single-byte encodings (e.g. ascii, latin1, hex)
    290 function simpleWrite(buf) {
    291   return buf.toString(this.encoding);
    292 }
    293 
    294 function simpleEnd(buf) {
    295   return buf && buf.length ? this.write(buf) : '';
    296 }