Skip to content

Commit 5f1e994

Browse files
committed
feat: add /text-encoding (not shipped yet)
1 parent 815f135 commit 5f1e994

File tree

6 files changed

+181
-5
lines changed

6 files changed

+181
-5
lines changed

.eslintrc.cjs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ module.exports = {
1616
'unicorn/no-new-array': 'off',
1717
'unicorn/prefer-code-point': 'off',
1818
'unicorn/prefer-math-trunc': 'off',
19+
'unicorn/prefer-spread': 'off',
1920
},
2021
},
2122
{

package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@
6868
"node": "./hex.node.js",
6969
"default": "./hex.js"
7070
},
71+
"./text-encoding.js": "./text-encoding.js",
7172
"./utf16.js": {
7273
"node": "./utf16.node.js",
7374
"default": "./utf16.js"

tests/wpt/fallback.test.js

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,9 +43,10 @@ globalThis.TextDecoder = class {
4343
}
4444

4545
let res
46+
// eslint-disable-next-line unicorn/prefer-switch
4647
if (this.encoding === 'utf-8') {
4748
res = utf8.decode(input, !this.fatal)
48-
} else if (this.encoding === 'utf-16le' || (this.encoding === 'utf-16' && isLE)) {
49+
} else if (this.encoding === 'utf-16le' || this.encoding === 'utf-16') {
4950
if (!this.fatal && input.byteLength % 2 !== 0) {
5051
const tmp = new Uint8Array(input.byteLength + 1)
5152
tmp.set(input)
@@ -60,7 +61,7 @@ globalThis.TextDecoder = class {
6061
if (input.byteLength % 2 !== 0) throw new TypeError('Expected even number of bytes')
6162
const u16 = new Uint16Array(input.buffer, input.byteOffset, input.byteLength / 2)
6263
res = utf16.decode(u16, !this.fatal)
63-
} else if (this.encoding === 'utf-16be' || (this.encoding === 'utf-16' && !isLE)) {
64+
} else if (this.encoding === 'utf-16be') {
6465
if (!this.fatal && input.byteLength % 2 !== 0) {
6566
const tmp = new Uint8Array(input.byteLength + 1)
6667
tmp.set(input)

tests/wpt/index.test.js

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@ globalThis.TextEncoder = class {
1818
}
1919

2020
// Not a proper impl, not getters, etc
21-
const isLE = new Uint8Array(Uint16Array.of(258).buffer)[0] === 2
2221
globalThis.TextDecoder = class {
2322
constructor(label = 'utf-8', { fatal = false, ignoreBOM = false } = {}) {
2423
this.encoding = label
@@ -34,9 +33,10 @@ globalThis.TextDecoder = class {
3433
}
3534

3635
let res
36+
// eslint-disable-next-line unicorn/prefer-switch
3737
if (this.encoding === 'utf-8') {
3838
res = this.fatal ? utf8toString(input) : utf8toStringLoose(input)
39-
} else if (this.encoding === 'utf-16le' || (this.encoding === 'utf-16' && isLE)) {
39+
} else if (this.encoding === 'utf-16le' || this.encoding === 'utf-16') {
4040
if (!this.fatal && input.byteLength % 2 !== 0) {
4141
const tmp = new Uint8Array(input.byteLength + 1)
4242
tmp.set(input)
@@ -46,7 +46,7 @@ globalThis.TextDecoder = class {
4646
}
4747

4848
res = this.fatal ? utf16toString(input, 'uint8-le') : utf16toStringLoose(input, 'uint8-le')
49-
} else if (this.encoding === 'utf-16be' || (this.encoding === 'utf-16' && !isLE)) {
49+
} else if (this.encoding === 'utf-16be') {
5050
if (!this.fatal && input.byteLength % 2 !== 0) {
5151
const tmp = new Uint8Array(input.byteLength + 1)
5252
tmp.set(input)

tests/wpt/text-encoding.test.js

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
import { describe } from 'node:test'
2+
import { loadDir } from './loader.cjs'
3+
import { toBase64, fromBase64 } from '@exodus/bytes/base64.js'
4+
import { TextEncoder, TextDecoder } from '@exodus/bytes/text-encoding.js'
5+
6+
// Unconditionally replace the global TextEncoder / TextDecoder with the implementations imported
// from @exodus/bytes/text-encoding.js (presumably so the fixtures loaded below run against them).
globalThis.TextEncoder = TextEncoder
7+
globalThis.TextDecoder = TextDecoder
8+
9+
// Install a fromBase64-backed atob unless both a native atob and HermesInternal are present
if (!(globalThis.atob && globalThis.HermesInternal)) {
  // Inputs that are swapped for the listed string before being handed to fromBase64
  const strictForms = new Map([
    ['NaN', 'NaM'],
    ['12', '1w'],
    ['YR', 'YQ'],
    ['A/', 'Aw'],
    ['AA/', 'AA8'],
  ])

  globalThis.atob = (input) => {
    // Drop TAB, LF, FF, CR and SPACE anywhere in the stringified input
    const compact = String(input).replaceAll(/[\t\n\f\r ]/g, '')

    // hack around non-strict input just for testing
    const patched = compact.replace(/^ab(={0,4})$/, 'aQ$1')
    const strict = strictForms.has(patched) ? strictForms.get(patched) : patched

    // One output character per decoded byte
    const bytes = fromBase64(strict)
    return String.fromCharCode(...bytes)
  }
}
25+
26+
// Install a toBase64-backed btoa unless both a native btoa and HermesInternal are present
if (!(globalThis.btoa && globalThis.HermesInternal)) {
  globalThis.btoa = (value) => {
    const text = String(value)

    // One byte per UTF-16 code unit; anything above 0xFF is rejected
    const bytes = Uint8Array.from({ length: text.length }, (_, index) => {
      const unit = text.charCodeAt(index)
      if (unit > 255) throw new Error('INVALID_CHARACTER_ERR')
      return unit
    })

    return toBase64(bytes)
  }
}
39+
40+
// Register one node:test suite that calls loadDir for the 'encoding' and 'html/webappapis/atob'
// fixture directories (loadDir comes from ./loader.cjs; what it does per file is not visible here).
describe('Web Platform Tests', () => {
41+
loadDir('encoding')
42+
loadDir('html/webappapis/atob')
43+
})
44+
45+
// List of files so that bundler can locate all these
46+
/* @preserve
47+
fs.readFileSync(path.join(__dirname, 'fixtures/encoding/api-basics.any.js'))
48+
fs.readFileSync(path.join(__dirname, 'fixtures/encoding/api-surrogates-utf8.any.js'))
49+
fs.readFileSync(path.join(__dirname, 'fixtures/encoding/textdecoder-byte-order-marks.any.js'))
50+
fs.readFileSync(path.join(__dirname, 'fixtures/encoding/textdecoder-fatal.any.js'))
51+
fs.readFileSync(path.join(__dirname, 'fixtures/encoding/textdecoder-ignorebom.any.js'))
52+
fs.readFileSync(path.join(__dirname, 'fixtures/encoding/textdecoder-utf16-surrogates.any.js'))
53+
fs.readFileSync(path.join(__dirname, 'fixtures/encoding/textencoder-utf16-surrogates.any.js'))
54+
fs.readFileSync(path.join(__dirname, 'fixtures/html/webappapis/atob/base64.any.js'))
55+
*/

text-encoding.js

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
// A limited subset of TextEncoder / TextDecoder API
2+
3+
// We can't return native TextDecoder if it's present, as Node.js one is broken on windows-1252 and we fix that
4+
// We are also faster than Node.js built-in on both TextEncoder and TextDecoder
5+
6+
/* eslint-disable unicorn/text-encoding-identifier-case, @exodus/import/no-unresolved */
7+
8+
import { utf16toString, utf16toStringLoose } from '@exodus/bytes/utf16.js'
9+
import { utf8fromStringLoose, utf8toString, utf8toStringLoose } from '@exodus/bytes/utf8.js'
10+
import { windows1252toString } from '@exodus/bytes/windows1252.js'
11+
12+
// Canonical encoding names; normalizeEncoding returns exactly one of these four strings.
const Utf8 = 'utf-8'
13+
const Utf16LE = 'utf-16le'
14+
const Utf16BE = 'utf-16be'
15+
const Win1252 = 'windows-1252'
16+
17+
// Lowercase alias labels, one Set per canonical name; the canonical names themselves are not
// repeated in the sets because normalizeEncoding compares against them first.
// https://encoding.spec.whatwg.org/#names-and-labels
18+
// prettier-ignore
19+
const Utf8alias = new Set(['utf8', 'unicode-1-1-utf-8', 'unicode11utf8', 'unicode20utf8', 'x-unicode20utf8'])
20+
// prettier-ignore
21+
const Utf16LEalias = new Set(['utf-16', 'ucs-2', 'unicode', 'unicodefeff', 'iso-10646-ucs-2', 'csunicode']) // there is no 'utf16'
22+
const Utf16BEalias = new Set(['unicodefffe'])
23+
// prettier-ignore
24+
const Win1252alias = new Set([
25+
'ascii', 'latin1', 'l1', 'us-ascii', 'ansi_x3.4-1968', 'cp1252', 'cp819', 'csisolatin1', 'ibm819',
26+
'iso-8859-1', 'iso-ir-100', 'iso8859-1', 'iso88591', 'iso_8859-1', 'iso_8859-1:1987', 'x-cp1252'
27+
])
28+
29+
// U+FFFD; decode() appends it after dropping the dangling last byte of an odd-length UTF-16 input
// when the decoder is not fatal.
const replacementChar = '\uFFFD'
30+
31+
// Strip leading/trailing ASCII whitespace only (TAB, LF, FF, CR, SPACE), as the Encoding Standard
// prescribes for labels. String#trim() is too permissive here: it also removes U+000B, U+00A0,
// U+2028, U+2029, U+FEFF, ..., which must stay part of the label and make it invalid.
const stripAsciiWhitespace = (label) => {
  const isAsciiWhitespace = (code) =>
    code === 0x09 || code === 0x0a || code === 0x0c || code === 0x0d || code === 0x20
  let start = 0
  let end = label.length
  while (start < end && isAsciiWhitespace(label.charCodeAt(start))) start++
  while (end > start && isAsciiWhitespace(label.charCodeAt(end - 1))) end--
  return label.slice(start, end)
}

// Resolve an encoding label to one of the canonical names: Utf8, Utf16LE, Utf16BE or Win1252.
// `encoding` is stringified with template-literal coercion, stripped of surrounding ASCII
// whitespace and lowercased before being matched against the canonical names and alias sets.
// Throws RangeError when the label is not one of the supported names or aliases.
const normalizeEncoding = (encoding) => {
  const lower = stripAsciiWhitespace(`${encoding}`).toLowerCase()
  if (Utf8 === lower || Utf16LE === lower || Utf16BE === lower || Win1252 === lower) return lower // fast path
  if (Utf8alias.has(lower)) return Utf8
  if (Utf16LEalias.has(lower)) return Utf16LE
  if (Utf16BEalias.has(lower)) return Utf16BE
  if (Win1252alias.has(lower)) return Win1252
  throw new RangeError('Only utf-8, utf-16le, utf-16be and windows-1252/latin1/ascii are supported')
}
40+
41+
// Add `key` to `obj` as a non-writable data property holding `value`; returns `obj`.
// enumerable and configurable are left at their defineProperty defaults.
const define = (obj, key, value) => {
  const descriptor = { value, writable: false }
  return Object.defineProperty(obj, key, descriptor)
}
42+
43+
// Return a Uint8Array over the bytes of `x` without copying.
// Accepts a Uint8Array (returned as-is), any other ArrayBufferView (same buffer, offset and
// byte length), an ArrayBuffer, or a SharedArrayBuffer (whole buffer).
// Throws TypeError for anything else.
const fromSource = (x) => {
  if (x instanceof Uint8Array) return x
  if (x instanceof ArrayBuffer) return new Uint8Array(x)
  if (ArrayBuffer.isView(x)) return new Uint8Array(x.buffer, x.byteOffset, x.byteLength)

  // A SharedArrayBuffer is itself the buffer: it has no .buffer / .byteOffset, so it must be
  // wrapped directly (reading those properties would produce an empty view).
  const { SharedArrayBuffer } = globalThis
  if (SharedArrayBuffer && x instanceof SharedArrayBuffer) return new Uint8Array(x)

  throw new TypeError('Argument must be a SharedArrayBuffer, ArrayBuffer or ArrayBufferView')
}
53+
54+
// Count how many leading UTF-16 code units of `str` encode to at most `capacity` UTF-8 bytes
// without ever splitting a surrogate pair.
const utf8PrefixUnits = (str, capacity) => {
  let units = 0
  let bytes = 0
  while (units < str.length) {
    const code = str.charCodeAt(units)
    // 3 bytes covers U+0800..U+FFFF; a lone surrogate is assumed to be emitted by
    // utf8fromStringLoose as U+FFFD (also 3 bytes) - TODO confirm against utf8.js
    let width = 3
    let step = 1
    if (code < 0x80) {
      width = 1
    } else if (code < 0x800) {
      width = 2
    } else if (code >= 0xd800 && code < 0xdc00 && units + 1 < str.length) {
      const next = str.charCodeAt(units + 1)
      if (next >= 0xdc00 && next < 0xe000) {
        width = 4 // valid surrogate pair: one 4-byte code point, two code units consumed
        step = 2
      }
    }

    if (bytes + width > capacity) break
    bytes += width
    units += step
  }

  return units
}

// UTF-8 encoder; exposes a read-only `encoding` property that is always 'utf-8'
export function TextEncoder() {
  define(this, 'encoding', 'utf-8')
}

// Encode `str` (default '') with utf8fromStringLoose and return the bytes as a Uint8Array whose
// byteOffset is 0 (a copy is made when the helper returned an offset view).
TextEncoder.prototype.encode = function (str = '') {
  const res = utf8fromStringLoose(str)
  return res.byteOffset === 0 ? res : res.slice(0) // Ensure 0-offset. TODO: do we need this?
}

// npmjs.com/text-encoding polyfill doesn't support this at all
// Encode `str` into `target` (must be a Uint8Array, else TypeError) starting at index 0.
// Returns { read, written }: UTF-16 code units consumed from `str` and bytes stored in `target`.
// When the full encoding does not fit, the longest prefix of whole code points that fits is
// written instead of throwing, as the Encoding Standard describes for encodeInto.
TextEncoder.prototype.encodeInto = function (str, target) {
  if (!(target instanceof Uint8Array)) throw new TypeError('Second argument must be an Uint8Array')
  const u8 = utf8fromStringLoose(str)
  if (u8.length <= target.length) {
    target.set(u8) // TODO: perf
    return { read: str.length, written: u8.length }
  }

  // Not enough room: re-encode only the part that fits so `written` is exactly what was stored
  const read = utf8PrefixUnits(str, target.length)
  const head = utf8fromStringLoose(str.slice(0, read))
  target.set(head)
  return { read, written: head.length }
}
71+
72+
// Decode UTF-16 bytes in the given byte order ('uint8-le' or 'uint8-be').
// A leading BOM equal to (bom0, bom1) is dropped unless ignoreBOM is set. In non-fatal mode a
// dangling last byte of an odd-length input is dropped and U+FFFD is appended in its place; in
// fatal mode the bytes are handed to utf16toString untouched.
const decodeUtf16 = (bytes, format, bom0, bom1, fatal, ignoreBOM) => {
  let u = bytes
  if (!ignoreBOM && u.byteLength >= 2 && u[0] === bom0 && u[1] === bom1) u = u.subarray(2)
  if (fatal) return utf16toString(u, format)
  if (u.byteLength % 2 === 0) return utf16toStringLoose(u, format)
  return utf16toStringLoose(u.subarray(0, -1), format) + replacementChar
}

// Decoder for utf-8, utf-16le, utf-16be and windows-1252 (plus their labels).
// `encoding` is resolved through normalizeEncoding (RangeError when unsupported).
// `options` may be omitted or null (treated as empty, as Web IDL does for dictionaries); any other
// non-object is a TypeError. `fatal` and `ignoreBOM` are stored as booleans; a `stream` option is
// rejected with TypeError.
export function TextDecoder(encoding = Utf8, options = {}) {
  const dict = options === null ? {} : options
  if (typeof dict !== 'object') throw new TypeError('"options" argument must be of type object')
  const { fatal = false, ignoreBOM = false, stream = false } = dict
  if (stream !== false) throw new TypeError('Option "stream" is not supported')

  define(this, 'encoding', normalizeEncoding(encoding))
  define(this, 'fatal', Boolean(fatal))
  define(this, 'ignoreBOM', Boolean(ignoreBOM))
}

// TODO: test behavior on BOM for LE/BE
// Decode `input` (anything fromSource accepts) to a string; undefined input gives ''.
// Streaming is not supported: a truthy `stream` option throws TypeError. A null options argument
// is treated like an omitted one.
TextDecoder.prototype.decode = function (input, options = {}) {
  const { stream = false } = options === null ? {} : options
  if (stream) throw new TypeError('Option "stream" is not supported')
  if (input === undefined) return ''

  const bytes = fromSource(input)
  const { encoding, fatal, ignoreBOM } = this

  if (encoding === Utf8) {
    const hasBOM = bytes.byteLength >= 3 && bytes[0] === 0xef && bytes[1] === 0xbb && bytes[2] === 0xbf
    const body = !ignoreBOM && hasBOM ? bytes.subarray(3) : bytes
    return fatal ? utf8toString(body) : utf8toStringLoose(body)
  }

  if (encoding === Utf16LE) return decodeUtf16(bytes, 'uint8-le', 0xff, 0xfe, fatal, ignoreBOM)
  if (encoding === Utf16BE) return decodeUtf16(bytes, 'uint8-be', 0xfe, 0xff, fatal, ignoreBOM)
  if (encoding === Win1252) return windows1252toString(bytes) // no BOM possible
  throw new RangeError('Unsupported encoding')
}

0 commit comments

Comments
 (0)