Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,8 @@ Returns a function `encode(string)` that encodes a string to bytes.
In `'fatal'` mode (default), the encoder throws on strings that are not well-formed and on any codepoint that
cannot be encoded in the target encoding.

In `'replacement'` mode, every codepoint that cannot be encoded in the target encoding and every unpaired surrogate is replaced with `?` (`U+003F`, byte `0x3F`).

##### `latin1toString(arr)`

Decode `iso-8859-1` bytes to a string.
Expand Down
54 changes: 36 additions & 18 deletions single-byte.js
Original file line number Diff line number Diff line change
Expand Up @@ -61,38 +61,59 @@

const NON_LATIN = /[^\x00-\xFF]/ // eslint-disable-line no-control-regex

/**
 * Encode a string to single-byte output through a per-code-unit lookup table.
 *
 * @param {string} s - String to encode; it is read one UTF-16 code unit at a time.
 * @param {ArrayLike<number>} m - Table indexed by UTF-16 code unit that yields the output byte.
 *   A falsy entry (0 or missing) marks the code unit as unencodable, except for code unit 0 itself.
 * @param {boolean} [loose] - When truthy, unencodable input is replaced with `?` (byte 63)
 *   instead of failing.
 * @returns {Uint8Array|null} The encoded bytes, or `null` when the table is unusable or when
 *   unencodable input is found and `loose` is falsy. In loose mode the result can be shorter than
 *   `s.length`, because a surrogate pair collapses into a single `?`.
 */
function encode(s, m, loose) {
  const len = s.length
  const x = new Uint8Array(len)
  // NOTE(review): encodeAsciiPrefix presumably fills the leading ASCII run of `x` and returns
  // the index of the first code unit it did not handle - confirm against its definition
  let i = nativeEncoder ? 0 : encodeAsciiPrefix(x, s)

  if (!m || m.length < 256) return null // perf
  const len3 = len - 3
  // Strict fast path, four code units per iteration; stops at the first unencodable one
  while (i < len3) {
    const x0 = s.charCodeAt(i), x1 = s.charCodeAt(i + 1), x2 = s.charCodeAt(i + 2), x3 = s.charCodeAt(i + 3) // prettier-ignore
    const c0 = m[x0], c1 = m[x1], c2 = m[x2], c3 = m[x3] // prettier-ignore
    if ((!c0 && x0) || (!c1 && x1) || (!c2 && x2) || (!c3 && x3)) break

    x[i] = c0
    x[i + 1] = c1
    x[i + 2] = c2
    x[i + 3] = c3
    i += 4
  }

  // Strict tail: also re-walks the group the loop above bailed out of, up to the offending unit
  for (; i < len; i++) {
    const x0 = s.charCodeAt(i)
    const c0 = m[x0]
    if (!c0 && x0) break
    x[i] = c0
  }

  if (i === len) return x
  if (!loose) return null

  // Replacement path: `i` reads code units and `j` writes bytes; `j` falls behind `i` whenever
  // a surrogate pair is consumed as one replacement
  let j = i
  while (i < len) {
    const x0 = s.charCodeAt(i++)
    if (x0 >= 0xd8_00 && x0 < 0xdc_00) {
      // High surrogate: swallow the following low surrogate, if any, so that a pair produces
      // a single '?'
      if (i < len) {
        const x1 = s.charCodeAt(i)
        if (x1 >= 0xdc_00 && x1 < 0xe0_00) i++
      }

      x[j++] = 63 // '?'
    } else {
      const c0 = m[x0]
      x[j++] = !c0 && x0 ? 63 : c0
    }
  }

  return j === len ? x : x.subarray(0, j)
}

// fromBase64+btoa path is faster on everything where fromBase64 is fast
const useLatin1btoa = Uint8Array.fromBase64 && btoa && !skipWeb

export function createSinglebyteEncoder(encoding, { mode = 'fatal' } = {}) {
// TODO: replacement, truncate (replacement will need varying length)
if (mode !== 'fatal') throw new Error('Unsupported mode')
const loose = mode === 'replacement'
if (mode !== 'fatal' && !loose) throw new Error('Unsupported mode')
const m = encodeMap(encoding) // asserts
const isLatin1 = encoding === 'iso-8859-1'

Expand All @@ -106,24 +127,21 @@
if (useLatin1btoa && s.length >= 1024 && s.length < 1e8) {
try {
return Uint8Array.fromBase64(btoa(s)) // fails on non-latin1
} catch {
throw new TypeError(E_STRICT)
}
} catch {}
} else if (!NON_LATIN.test(s)) {
return encodeLatin1(s)
}

if (NON_LATIN.test(s)) throw new TypeError(E_STRICT)
return encodeLatin1(s)
}

// Instead of an ASCII regex check, encode optimistically - this is faster
// Check for 8-bit string with a regex though, this is instant on 8-bit strings so doesn't hurt the ASCII fast path
if (nativeEncoder && !NON_LATIN.test(s)) {
if (!loose) throw new TypeError(E_STRICT)
} else if (nativeEncoder && !NON_LATIN.test(s)) {
// Instead of an ASCII regex check, encode optimistically - this is faster
// Check for 8-bit string with a regex though, this is instant on 8-bit strings so doesn't hurt the ASCII fast path
try {
return encodeAscii(s, E_STRICT)
} catch {}
}

const res = encode(s, m)
const res = encode(s, m, loose)
if (!res) throw new TypeError(E_STRICT)
return res
}
Expand Down
57 changes: 41 additions & 16 deletions single-byte.node.js
Original file line number Diff line number Diff line change
Expand Up @@ -61,54 +61,79 @@

const NON_LATIN = /[^\x00-\xFF]/ // eslint-disable-line no-control-regex

/**
 * Encode a string to single-byte output through a per-code-unit lookup table (Node.js variant).
 *
 * The string is first materialised as UTF-16 code units in a Buffer; mapped bytes are then
 * written back in place over those code units and narrowed to a Uint8Array at the end.
 *
 * @param {string} s - String to encode.
 * @param {ArrayLike<number>} m - Table indexed by UTF-16 code unit that yields the output byte.
 *   A falsy entry (0 or missing) marks the code unit as unencodable, except for code unit 0 itself.
 * @param {boolean} [loose] - When truthy, unencodable input is replaced with `?` (byte 63)
 *   instead of failing.
 * @returns {Uint8Array|null} The encoded bytes, or `null` when the table is unusable or when
 *   unencodable input is found and `loose` is falsy. In loose mode the result can be shorter than
 *   `s.length`, because a surrogate pair collapses into a single `?`.
 */
function encode(s, m, loose) {
  const len = s.length
  let i = 0
  const b = Buffer.from(s, 'utf-16le') // aligned
  if (!isLE) b.swap16() // make the code units readable in platform byte order
  const x = new Uint16Array(b.buffer, b.byteOffset, b.byteLength / 2)
  if (!m || m.length < 256) return null // perf
  const len3 = len - 3
  // Strict fast path, four code units per iteration; stops at the first unencodable one
  while (i < len3) {
    const x0 = x[i], x1 = x[i + 1], x2 = x[i + 2], x3 = x[i + 3] // prettier-ignore
    const c0 = m[x0], c1 = m[x1], c2 = m[x2], c3 = m[x3] // prettier-ignore
    // Cheap all-mapped test first; the per-unit check only runs when some entry is falsy
    if (!(c0 && c1 && c2 && c3) && ((!c0 && x0) || (!c1 && x1) || (!c2 && x2) || (!c3 && x3))) break // prettier-ignore
    x[i] = c0
    x[i + 1] = c1
    x[i + 2] = c2
    x[i + 3] = c3
    i += 4
  }

  const mlen = m.length
  // Strict tail: also re-walks the group the loop above bailed out of, up to the offending unit
  for (; i < len; i++) {
    const x0 = x[i]
    if (x0 >= mlen) break
    const c0 = m[x0]
    if (!c0 && x0) break
    x[i] = c0
  }

  if (i === len) return new Uint8Array(x)
  if (!loose) return null

  // Replacement path: `i` reads code units and `j` writes bytes into the same array; `j` never
  // overtakes `i`, so unread input is not clobbered
  let j = i
  while (i < len) {
    const x0 = x[i++]
    if (x0 >= 0xd8_00 && x0 < 0xdc_00) {
      // High surrogate: swallow the following low surrogate, if any, so that a pair produces
      // a single '?'
      if (i < len) {
        const x1 = x[i]
        if (x1 >= 0xdc_00 && x1 < 0xe0_00) i++
      }

      x[j++] = 63 // '?'
    } else if (x0 >= mlen) {
      x[j++] = 63 // '?'
    } else {
      const c0 = m[x0]
      x[j++] = !c0 && x0 ? 63 : c0
    }
  }

  return new Uint8Array(j === len ? x : x.subarray(0, j))
}

export function createSinglebyteEncoder(encoding, { mode = 'fatal' } = {}) {
// TODO: replacement, truncate (replacement will need varying length)
if (mode !== 'fatal') throw new Error('Unsupported mode')
const loose = mode === 'replacement'
if (mode !== 'fatal' && !loose) throw new Error('Unsupported mode')
const m = encodeMap(encoding) // asserts
const isLatin1 = encoding === 'iso-8859-1'

return (s) => {
if (typeof s !== 'string') throw new TypeError(E_STRING)
if (isLatin1) {
if (NON_LATIN.test(s)) throw new TypeError(E_STRICT)
const b = Buffer.from(s, 'latin1')
return new Uint8Array(b.buffer, b.byteOffset, b.byteLength)
}

// Instead of an ASCII regex check, encode optimistically - this is faster
// Check for 8-bit string with a regex though, this is instant on 8-bit strings so doesn't hurt the ASCII fast path
if (!NON_LATIN.test(s)) {
if (!NON_LATIN.test(s)) {
const b = Buffer.from(s, 'latin1')
return new Uint8Array(b.buffer, b.byteOffset, b.byteLength)
}

if (!loose) throw new TypeError(E_STRICT)
} else if (!NON_LATIN.test(s)) {
// Instead of an ASCII regex check, encode optimistically - this is faster
// Check for 8-bit string with a regex though, this is instant on 8-bit strings so doesn't hurt the ASCII fast path
const b = Buffer.from(s, 'utf8') // ascii/latin1 coerces, we need to check
if (b.length === s.length) return new Uint8Array(b.buffer, b.byteOffset, b.byteLength)
}

const res = encode(s, m)
const res = encode(s, m, loose)
if (!res) throw new TypeError(E_STRICT)
return res
}
Expand Down
23 changes: 23 additions & 0 deletions tests/single-byte.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@ describe('single-byte encodings are supersets of ascii', () => {
for (const encoding of encodings) {
test(encoding, (t) => {
const decoder = createSinglebyteDecoder(encoding)
const decoderLoose = createSinglebyteDecoder(encoding, true)
const encoder = createSinglebyteEncoder(encoding)
const encoderLoose = createSinglebyteEncoder(encoding, { mode: 'replacement' })
for (let i = 0; i < 128; i++) {
let str
try {
Expand All @@ -27,7 +29,9 @@ describe('single-byte encodings are supersets of ascii', () => {
t.assert.strictEqual(str.length, 1, i)
t.assert.strictEqual(str.codePointAt(0), i, i)

t.assert.strictEqual(decoderLoose(Uint8Array.of(i)), str, i)
t.assert.deepStrictEqual(encoder(str), Uint8Array.of(i))
t.assert.deepStrictEqual(encoderLoose(str), Uint8Array.of(i))
}
})
}
Expand Down Expand Up @@ -84,6 +88,7 @@ describe('single-byte encodings index: Unicode', () => {
const decoder = createSinglebyteDecoder(encoding)
const decoderLoose = createSinglebyteDecoder(encoding, true)
const encoder = createSinglebyteEncoder(encoding)
const encoderLoose = createSinglebyteEncoder(encoding, { mode: 'replacement' })
const text = readFileSync(
join(import.meta.dirname, 'encoding/fixtures/unicode/', fileName),
'utf8'
Expand Down Expand Up @@ -145,6 +150,7 @@ describe('single-byte encodings index: Unicode', () => {
t.assert.strictEqual(str, decoderLoose(Uint8Array.of(byte)))

t.assert.deepStrictEqual(encoder(str), Uint8Array.of(byte))
t.assert.deepStrictEqual(encoderLoose(str), Uint8Array.of(byte))
}
}
})
Expand All @@ -158,6 +164,7 @@ describe('single-byte encodings index: WHATWG', () => {
const decoder = createSinglebyteDecoder(encoding)
const decoderLoose = createSinglebyteDecoder(encoding, true)
const encoder = createSinglebyteEncoder(encoding)
const encoderLoose = createSinglebyteEncoder(encoding, { mode: 'replacement' })
const text = readFileSync(
join(import.meta.dirname, 'encoding/fixtures/single-byte', `index-${encoding}.txt`),
'utf8'
Expand Down Expand Up @@ -199,6 +206,7 @@ describe('single-byte encodings index: WHATWG', () => {
t.assert.strictEqual(str, decoderLoose(Uint8Array.of(byte)))

t.assert.deepStrictEqual(encoder(str), Uint8Array.of(byte))
t.assert.deepStrictEqual(encoderLoose(str), Uint8Array.of(byte))
} else {
t.assert.throws(() => decoder(Uint8Array.of(byte)))
try {
Expand Down Expand Up @@ -230,6 +238,7 @@ describe('single-byte encodings index: WHATWG non-normative indexes.json', () =>
const decoder = createSinglebyteDecoder(encoding)
const decoderLoose = createSinglebyteDecoder(encoding, true)
const encoder = createSinglebyteEncoder(encoding)
const encoderLoose = createSinglebyteEncoder(encoding, { mode: 'replacement' })

t.assert.strictEqual(data.length, 128)
for (let i = 0; i < data.length; i++) {
Expand All @@ -244,6 +253,7 @@ describe('single-byte encodings index: WHATWG non-normative indexes.json', () =>
t.assert.strictEqual(decoder(Uint8Array.of(byte)), str)
t.assert.strictEqual(decoderLoose(Uint8Array.of(byte)), str)
t.assert.deepStrictEqual(encoder(str), Uint8Array.of(byte))
t.assert.deepStrictEqual(encoderLoose(str), Uint8Array.of(byte))
} else {
t.assert.throws(() => decoder(Uint8Array.of(byte)))
t.assert.strictEqual(decoderLoose(Uint8Array.of(byte)), '\uFFFD')
Expand All @@ -268,13 +278,16 @@ describe('x-user-defined', () => {

test('encode', (t) => {
const encoder = createSinglebyteEncoder(encoding)
const encoderLoose = createSinglebyteEncoder(encoding, { mode: 'replacement' })
for (let byte = 0; byte < 256; byte++) {
const str = String.fromCodePoint(byte >= 0x80 ? 0xf7_80 + byte - 0x80 : byte)
t.assert.deepStrictEqual(encoder(str), Uint8Array.of(byte), byte)
t.assert.deepStrictEqual(encoderLoose(str), Uint8Array.of(byte), byte)
}

for (let i = 128; i < 512; i++) {
t.assert.throws(() => encoder(String.fromCodePoint(i)), /Input is not well-formed/)
t.assert.deepStrictEqual(encoderLoose(String.fromCodePoint(i)), Uint8Array.of(0x3f), i)
}
})
})
Expand All @@ -284,21 +297,31 @@ describe('codes above 0x7F are non-ASCII', () => {
for (const encoding of ['iso-8859-2', 'iso-8859-16']) {
test(encoding, (t) => {
const encoder = createSinglebyteEncoder(encoding)
const encoderLoose = createSinglebyteEncoder(encoding, { mode: 'replacement' })
t.assert.deepStrictEqual(encoder('\x80'), new Uint8Array(1).fill(0x80))
t.assert.deepStrictEqual(encoder('\x80'.repeat(4)), new Uint8Array(4).fill(0x80))
t.assert.deepStrictEqual(encoder('\x80'.repeat(8)), new Uint8Array(8).fill(0x80))
t.assert.deepStrictEqual(encoder('\x80'.repeat(16)), new Uint8Array(16).fill(0x80))
t.assert.deepStrictEqual(encoderLoose('\x80'), new Uint8Array(1).fill(0x80))
t.assert.deepStrictEqual(encoderLoose('\x80'.repeat(4)), new Uint8Array(4).fill(0x80))
t.assert.deepStrictEqual(encoderLoose('\x80'.repeat(8)), new Uint8Array(8).fill(0x80))
t.assert.deepStrictEqual(encoderLoose('\x80'.repeat(16)), new Uint8Array(16).fill(0x80))
})
}

// 0x80 maps to something else
for (const encoding of ['windows-1250', 'windows-1252', 'x-user-defined']) {
test(encoding, (t) => {
const encoder = createSinglebyteEncoder(encoding)
const encoderLoose = createSinglebyteEncoder(encoding, { mode: 'replacement' })
t.assert.throws(() => encoder('\x80'))
t.assert.throws(() => encoder('\x80'.repeat(4)))
t.assert.throws(() => encoder('\x80'.repeat(8)))
t.assert.throws(() => encoder('\x80'.repeat(16)))
t.assert.deepStrictEqual(encoderLoose('\x80'), new Uint8Array(1).fill(0x3f))
t.assert.deepStrictEqual(encoderLoose('\x80'.repeat(4)), new Uint8Array(4).fill(0x3f))
t.assert.deepStrictEqual(encoderLoose('\x80'.repeat(8)), new Uint8Array(8).fill(0x3f))
t.assert.deepStrictEqual(encoderLoose('\x80'.repeat(16)), new Uint8Array(16).fill(0x3f))
})
}
})
Loading