Skip to content

Commit 6ecf4aa

Browse files
committed
feat: add replacement support in single-byte encoders
1 parent c5ff201 commit 6ecf4aa

File tree

3 files changed

+68
-32
lines changed

3 files changed

+68
-32
lines changed

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -189,6 +189,8 @@ Returns a function `encode(string)` that encodes a string to bytes.
189189
In `'fatal'` mode (default), the encoder will throw on non-well-formed strings or any codepoints which could
190190
not be encoded in the target encoding.
191191

192+
In `'replacement'` mode, all unmapped codepoints and unpaired surrogates will be replaced with `U+003F` (the codepoint for `'?'`).
193+
192194
##### `latin1toString(arr)`
193195

194196
Decode `iso-8859-1` bytes to a string.

single-byte.js

Lines changed: 32 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -61,15 +61,16 @@ export function createSinglebyteDecoder(encoding, loose = false) {
6161

6262
const NON_LATIN = /[^\x00-\xFF]/ // eslint-disable-line no-control-regex
6363

64-
function encode(s, m) {
64+
function encode(s, m, loose) {
6565
const len = s.length
6666
const x = new Uint8Array(len)
6767
let i = nativeEncoder ? 0 : encodeAsciiPrefix(x, s)
6868

69+
if (!m || m.length < 256) return null // perf
6970
for (const len3 = len - 3; i < len3; i += 4) {
7071
const x0 = s.charCodeAt(i), x1 = s.charCodeAt(i + 1), x2 = s.charCodeAt(i + 2), x3 = s.charCodeAt(i + 3) // prettier-ignore
7172
const c0 = m[x0], c1 = m[x1], c2 = m[x2], c3 = m[x3] // prettier-ignore
72-
if ((!c0 && x0) || (!c1 && x1) || (!c2 && x2) || (!c3 && x3)) return null
73+
if ((!c0 && x0) || (!c1 && x1) || (!c2 && x2) || (!c3 && x3)) break
7374

7475
x[i] = c0
7576
x[i + 1] = c1
@@ -80,19 +81,36 @@ function encode(s, m) {
8081
for (; i < len; i++) {
8182
const x0 = s.charCodeAt(i)
8283
const c0 = m[x0]
83-
if (!c0 && x0) return null
84+
if (!c0 && x0) break
8485
x[i] = c0
8586
}
8687

87-
return x
88+
if (i === len) return x
89+
if (!loose) return null
90+
let j = i
91+
while (i < len) {
92+
const x0 = s.charCodeAt(i++)
93+
let c0 = m[x0]
94+
if (!c0 && x0) {
95+
c0 = 63 // '?'
96+
if (x0 >= 0xd8_00 && x0 < 0xdc_00 && i < len) {
97+
const x1 = s.charCodeAt(i)
98+
if (x1 >= 0xdc_00 && x1 < 0xe0_00) i++
99+
}
100+
}
101+
102+
x[j++] = c0
103+
}
104+
105+
return j === len ? x : x.subarray(0, j)
88106
}
89107

90108
// fromBase64+btoa path is faster on everything where fromBase64 is fast
91109
const useLatin1btoa = Uint8Array.fromBase64 && btoa && !skipWeb
92110

93111
export function createSinglebyteEncoder(encoding, { mode = 'fatal' } = {}) {
94-
// TODO: replacement, truncate (replacement will need varying length)
95-
if (mode !== 'fatal') throw new Error('Unsupported mode')
112+
const loose = mode === 'replacement'
113+
if (mode !== 'fatal' && !loose) throw new Error('Unsupported mode')
96114
const m = encodeMap(encoding) // asserts
97115
const isLatin1 = encoding === 'iso-8859-1'
98116

@@ -106,24 +124,21 @@ export function createSinglebyteEncoder(encoding, { mode = 'fatal' } = {}) {
106124
if (useLatin1btoa && s.length >= 1024 && s.length < 1e8) {
107125
try {
108126
return Uint8Array.fromBase64(btoa(s)) // fails on non-latin1
109-
} catch {
110-
throw new TypeError(E_STRICT)
111-
}
127+
} catch {}
128+
} else if (!NON_LATIN.test(s)) {
129+
return encodeLatin1(s)
112130
}
113131

114-
if (NON_LATIN.test(s)) throw new TypeError(E_STRICT)
115-
return encodeLatin1(s)
116-
}
117-
118-
// Instead of an ASCII regex check, encode optimistically - this is faster
119-
// Check for 8-bit string with a regex though, this is instant on 8-bit strings so doesn't hurt the ASCII fast path
120-
if (nativeEncoder && !NON_LATIN.test(s)) {
132+
if (!loose) throw new TypeError(E_STRICT)
133+
} else if (nativeEncoder && !NON_LATIN.test(s)) {
134+
// Instead of an ASCII regex check, encode optimistically - this is faster
135+
// Check for 8-bit string with a regex though, this is instant on 8-bit strings so doesn't hurt the ASCII fast path
121136
try {
122137
return encodeAscii(s, E_STRICT)
123138
} catch {}
124139
}
125140

126-
const res = encode(s, m)
141+
const res = encode(s, m, loose)
127142
if (!res) throw new TypeError(E_STRICT)
128143
return res
129144
}

single-byte.node.js

Lines changed: 34 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -61,16 +61,17 @@ export function createSinglebyteDecoder(encoding, loose = false) {
6161

6262
const NON_LATIN = /[^\x00-\xFF]/ // eslint-disable-line no-control-regex
6363

64-
function encode(s, m) {
64+
function encode(s, m, loose) {
6565
const len = s.length
6666
let i = 0
6767
const b = Buffer.from(s, 'utf-16le') // aligned
6868
if (!isLE) b.swap16()
6969
const x = new Uint16Array(b.buffer, b.byteOffset, b.byteLength / 2)
70+
if (!m || m.length < 256) return null // perf
7071
for (const len3 = len - 3; i < len3; i += 4) {
7172
const x0 = x[i], x1 = x[i + 1], x2 = x[i + 2], x3 = x[i + 3] // prettier-ignore
7273
const c0 = m[x0], c1 = m[x1], c2 = m[x2], c3 = m[x3] // prettier-ignore
73-
if (!(c0 && c1 && c2 && c3) && ((!c0 && x0) || (!c1 && x1) || (!c2 && x2) || (!c3 && x3))) return null // prettier-ignore
74+
if (!(c0 && c1 && c2 && c3) && ((!c0 && x0) || (!c1 && x1) || (!c2 && x2) || (!c3 && x3))) break
7475
x[i] = c0
7576
x[i + 1] = c1
7677
x[i + 2] = c2
@@ -80,35 +81,53 @@ function encode(s, m) {
8081
for (; i < len; i++) {
8182
const x0 = x[i]
8283
const c0 = m[x0]
83-
if (!c0 && x0) return null
84+
if (!c0 && x0) break
8485
x[i] = c0
8586
}
8687

87-
return new Uint8Array(x)
88+
if (i === len) return new Uint8Array(x)
89+
if (!loose) return null
90+
let j = i
91+
while (i < len) {
92+
const x0 = x[i++]
93+
let c0 = m[x0]
94+
if (!c0 && x0) {
95+
c0 = 63 // '?'
96+
if (x0 >= 0xd8_00 && x0 < 0xdc_00 && i < len) {
97+
const x1 = x[i]
98+
if (x1 >= 0xdc_00 && x1 < 0xe0_00) i++
99+
}
100+
}
101+
102+
x[j++] = c0
103+
}
104+
105+
return new Uint8Array(j === len ? x : x.subarray(0, j))
88106
}
89107

90108
export function createSinglebyteEncoder(encoding, { mode = 'fatal' } = {}) {
91-
// TODO: replacement, truncate (replacement will need varying length)
92-
if (mode !== 'fatal') throw new Error('Unsupported mode')
109+
const loose = mode === 'replacement'
110+
if (mode !== 'fatal' && !loose) throw new Error('Unsupported mode')
93111
const m = encodeMap(encoding) // asserts
94112
const isLatin1 = encoding === 'iso-8859-1'
95113

96114
return (s) => {
97115
if (typeof s !== 'string') throw new TypeError(E_STRING)
98116
if (isLatin1) {
99-
if (NON_LATIN.test(s)) throw new TypeError(E_STRICT)
100-
const b = Buffer.from(s, 'latin1')
101-
return new Uint8Array(b.buffer, b.byteOffset, b.byteLength)
102-
}
103-
104-
// Instead of an ASCII regex check, encode optimistically - this is faster
105-
// Check for 8-bit string with a regex though, this is instant on 8-bit strings so doesn't hurt the ASCII fast path
106-
if (!NON_LATIN.test(s)) {
117+
if (!NON_LATIN.test(s)) {
118+
const b = Buffer.from(s, 'latin1')
119+
return new Uint8Array(b.buffer, b.byteOffset, b.byteLength)
120+
}
121+
122+
if (!loose) throw new TypeError(E_STRICT)
123+
} else if (!NON_LATIN.test(s)) {
124+
// Instead of an ASCII regex check, encode optimistically - this is faster
125+
// Check for 8-bit string with a regex though, this is instant on 8-bit strings so doesn't hurt the ASCII fast path
107126
const b = Buffer.from(s, 'utf8') // ascii/latin1 coerces, we need to check
108127
if (b.length === s.length) return new Uint8Array(b.buffer, b.byteOffset, b.byteLength)
109128
}
110129

111-
const res = encode(s, m)
130+
const res = encode(s, m, loose)
112131
if (!res) throw new TypeError(E_STRICT)
113132
return res
114133
}

0 commit comments

Comments
 (0)