Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -73,17 +73,21 @@ utf8.encode('\uD800\uDC01');
// → '\xF0\x90\x80\x81'
```

### `utf8.decode(byteString)`
### `utf8.decode(byteString, options)`

Decodes any given UTF-8-encoded string (`byteString`) as UTF-8, and returns the UTF-8-decoded version of the string. It throws an error when malformed UTF-8 is detected. (If you need to be able to decode encoded non-scalar values as well, use [WTF-8](https://mths.be/wtf8) instead.)

You can use the `allowTruncatedEnd` option to ignore the last symbol if it was truncated at the end of the input.
```js
utf8.decode('\xC2\xA9');
// → '\xA9'

utf8.decode('\xF0\x90\x80\x81');
// → '\uD800\uDC01'
// → U+10001 LINEAR B SYLLABLE B038 E

utf8.decode('\xC2\xA9\xC2', { allowTruncatedEnd: true });
// → '\xA9'
```

### `utf8.version`
Expand Down
90 changes: 90 additions & 0 deletions tests/tests.js
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,54 @@
}
});

// Broken ending
equal(
'\x61\x62',
utf8.decode('\x61\x62\xD7', { allowTruncatedEnd: true }), // \xD7\x8A
'Decoding: 2-byte ending is broken'
);
equal(
'\x61\x62',
utf8.decode('\x61\x62\xE2', { allowTruncatedEnd: true }), // \xE2\xB0\xBC
'Decoding: 3-byte ending is broken'
);
equal(
'\x61\x62',
utf8.decode('\x61\x62\xE2\xB0', { allowTruncatedEnd: true }), // \xE2\xB0\xBC
'Decoding: 3-byte ending is broken'
);
equal(
'\x61\x62',
utf8.decode('\x61\x62\xF0', { allowTruncatedEnd: true }), // \xF0\x9D\x8C\x86
'Decoding: 4-byte ending is broken'
);
equal(
'\x61\x62',
utf8.decode('\x61\x62\xF0\x9D', { allowTruncatedEnd: true }), // \xF0\x9D\x8C\x86
'Decoding: 4-byte ending is broken'
);
equal(
'\x61\x62',
utf8.decode('\x61\x62\xF0\x9D\x8C', { allowTruncatedEnd: true }), // \xF0\x9D\x8C\x86
'Decoding: 4-byte ending is broken'
);
// The only symbol is broken
equal(
'',
utf8.decode('\xD7', { allowTruncatedEnd: true }), // \xD7\x8A
'Decoding: The only 2-byte is broken'
);
equal(
'',
utf8.decode('\xE2\xB0', { allowTruncatedEnd: true }), // \xE2\xB0\xBC
'Decoding: The only 3-byte is broken'
);
equal(
'',
utf8.decode('\xF0\x9D\x8C', { allowTruncatedEnd: true }), // \xF0\x9D\x8C\x86
'Decoding: The only 4-byte is broken'
);

// Error handling
raises(
function() {
Expand Down Expand Up @@ -263,6 +311,48 @@
Error,
'Error: invalid byte index'
);
raises(
function () {
utf8.decode('\xD7\x00', { allowTruncatedEnd: true });
},
Error,
'Error: invalid continuation byte (2-byte sequence expected)'
);
raises(
function () {
utf8.decode('\xE2\x00', { allowTruncatedEnd: true });
},
Error,
'Error: invalid continuation byte (3-byte sequence expected)'
);
raises(
function () {
utf8.decode('\xE2\xB0\x00', { allowTruncatedEnd: true });
},
Error,
'Error: invalid continuation byte (3-byte sequence expected)'
);
raises(
function () {
utf8.decode('\xF0\x00', { allowTruncatedEnd: true });
},
Error,
'Error: invalid continuation byte (4-byte sequence expected)'
);
raises(
function () {
utf8.decode('\xF0\x9D\x00', { allowTruncatedEnd: true });
},
Error,
'Error: invalid continuation byte (4-byte sequence expected)'
);
raises(
function () {
utf8.decode('\xF0\x9D\x8C\x00', { allowTruncatedEnd: true });
},
Error,
'Error: invalid continuation byte (4-byte sequence expected)'
);
});

/*--------------------------------------------------------------------------*/
Expand Down
17 changes: 16 additions & 1 deletion utf8.js
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,10 @@

function readContinuationByte() {
if (byteIndex >= byteCount) {
if (allowTruncatedEnd) {
return false;
}

throw Error('Invalid byte index');
}

Expand Down Expand Up @@ -158,6 +162,9 @@
// 2-byte sequence
if ((byte1 & 0xE0) == 0xC0) {
byte2 = readContinuationByte();
if (byte2 === false) {
return false;
}
codePoint = ((byte1 & 0x1F) << 6) | byte2;
if (codePoint >= 0x80) {
return codePoint;
Expand All @@ -170,6 +177,9 @@
if ((byte1 & 0xF0) == 0xE0) {
byte2 = readContinuationByte();
byte3 = readContinuationByte();
if (byte3 === false) {
return false;
}
codePoint = ((byte1 & 0x0F) << 12) | (byte2 << 6) | byte3;
if (codePoint >= 0x0800) {
checkScalarValue(codePoint);
Expand All @@ -184,6 +194,9 @@
byte2 = readContinuationByte();
byte3 = readContinuationByte();
byte4 = readContinuationByte();
if (byte4 === false) {
return false;
}
codePoint = ((byte1 & 0x07) << 0x12) | (byte2 << 0x0C) |
(byte3 << 0x06) | byte4;
if (codePoint >= 0x010000 && codePoint <= 0x10FFFF) {
Expand All @@ -197,7 +210,9 @@
var byteArray;
var byteCount;
var byteIndex;
function utf8decode(byteString) {
var allowTruncatedEnd;
function utf8decode(byteString, options) {
allowTruncatedEnd = typeof options != 'undefined' ? options.allowTruncatedEnd : false;
byteArray = ucs2decode(byteString);
byteCount = byteArray.length;
byteIndex = 0;
Expand Down