Skip to content

Commit aa5a8a7

Browse files
committed
feat: add replacement support in single-byte encoders
1 parent c5ff201 commit aa5a8a7

File tree

3 files changed

+70
-30
lines changed

3 files changed

+70
-30
lines changed

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -189,6 +189,8 @@ Returns a function `encode(string)` that encodes a string to bytes.
189189
In `'fatal'` mode (default), will throw on non-well-formed strings or any codepoints which could
190190
not be encoded in the target encoding.
191191

192+
In `'replacement'` mode, each unmapped codepoint and each unpaired surrogate is replaced with a single `?` (`U+003F`).
193+
192194
##### `latin1toString(arr)`
193195

194196
Decode `iso-8859-1` bytes to a string.

single-byte.js

Lines changed: 33 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -61,15 +61,15 @@ export function createSinglebyteDecoder(encoding, loose = false) {
6161

6262
const NON_LATIN = /[^\x00-\xFF]/ // eslint-disable-line no-control-regex
6363

64-
function encode(s, m) {
64+
function encode(s, m, loose) {
6565
const len = s.length
6666
const x = new Uint8Array(len)
6767
let i = nativeEncoder ? 0 : encodeAsciiPrefix(x, s)
6868

6969
for (const len3 = len - 3; i < len3; i += 4) {
7070
const x0 = s.charCodeAt(i), x1 = s.charCodeAt(i + 1), x2 = s.charCodeAt(i + 2), x3 = s.charCodeAt(i + 3) // prettier-ignore
7171
const c0 = m[x0], c1 = m[x1], c2 = m[x2], c3 = m[x3] // prettier-ignore
72-
if ((!c0 && x0) || (!c1 && x1) || (!c2 && x2) || (!c3 && x3)) return null
72+
if ((!c0 && x0) || (!c1 && x1) || (!c2 && x2) || (!c3 && x3)) break
7373

7474
x[i] = c0
7575
x[i + 1] = c1
@@ -80,19 +80,39 @@ function encode(s, m) {
8080
for (; i < len; i++) {
8181
const x0 = s.charCodeAt(i)
8282
const c0 = m[x0]
83-
if (!c0 && x0) return null
83+
if (!c0 && x0) break
8484
x[i] = c0
8585
}
8686

87+
if (i < len) {
88+
if (!loose) return null
89+
let j = i
90+
while (i < len) {
91+
const x0 = s.charCodeAt(i++)
92+
let c0 = m[x0]
93+
if (!c0 && x0) {
94+
c0 = 63 // '?'
95+
if (x0 >= 0xd8_00 && x0 < 0xdc_00 && i < len) {
96+
const x1 = s.charCodeAt(i)
97+
if (x1 >= 0xdc_00 && x1 < 0xe0_00) i++
98+
}
99+
}
100+
101+
x[j++] = c0
102+
}
103+
104+
return j === len ? x : x.subarray(0, j)
105+
}
106+
87107
return x
88108
}
89109

90110
// fromBase64+btoa path is faster on everything where fromBase64 is fast
91111
const useLatin1btoa = Uint8Array.fromBase64 && btoa && !skipWeb
92112

93113
export function createSinglebyteEncoder(encoding, { mode = 'fatal' } = {}) {
94-
// TODO: replacement, truncate (replacement will need varying length)
95-
if (mode !== 'fatal') throw new Error('Unsupported mode')
114+
const loose = mode === 'replacement'
115+
if (mode !== 'fatal' && !loose) throw new Error('Unsupported mode')
96116
const m = encodeMap(encoding) // asserts
97117
const isLatin1 = encoding === 'iso-8859-1'
98118

@@ -106,24 +126,21 @@ export function createSinglebyteEncoder(encoding, { mode = 'fatal' } = {}) {
106126
if (useLatin1btoa && s.length >= 1024 && s.length < 1e8) {
107127
try {
108128
return Uint8Array.fromBase64(btoa(s)) // fails on non-latin1
109-
} catch {
110-
throw new TypeError(E_STRICT)
111-
}
129+
} catch {}
130+
} else if (!NON_LATIN.test(s)) {
131+
return encodeLatin1(s)
112132
}
113133

114-
if (NON_LATIN.test(s)) throw new TypeError(E_STRICT)
115-
return encodeLatin1(s)
116-
}
117-
118-
// Instead of an ASCII regex check, encode optimistically - this is faster
119-
// Check for 8-bit string with a regex though, this is instant on 8-bit strings so doesn't hurt the ASCII fast path
120-
if (nativeEncoder && !NON_LATIN.test(s)) {
134+
if (!loose) throw new TypeError(E_STRICT)
135+
} else if (nativeEncoder && !NON_LATIN.test(s)) {
136+
// Instead of an ASCII regex check, encode optimistically - this is faster
137+
// Check for 8-bit string with a regex though, this is instant on 8-bit strings so doesn't hurt the ASCII fast path
121138
try {
122139
return encodeAscii(s, E_STRICT)
123140
} catch {}
124141
}
125142

126-
const res = encode(s, m)
143+
const res = encode(s, m, loose)
127144
if (!res) throw new TypeError(E_STRICT)
128145
return res
129146
}

single-byte.node.js

Lines changed: 35 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ export function createSinglebyteDecoder(encoding, loose = false) {
6161

6262
const NON_LATIN = /[^\x00-\xFF]/ // eslint-disable-line no-control-regex
6363

64-
function encode(s, m) {
64+
function encode(s, m, loose) {
6565
const len = s.length
6666
let i = 0
6767
const b = Buffer.from(s, 'utf-16le') // aligned
@@ -70,7 +70,7 @@ function encode(s, m) {
7070
for (const len3 = len - 3; i < len3; i += 4) {
7171
const x0 = x[i], x1 = x[i + 1], x2 = x[i + 2], x3 = x[i + 3] // prettier-ignore
7272
const c0 = m[x0], c1 = m[x1], c2 = m[x2], c3 = m[x3] // prettier-ignore
73-
if (!(c0 && c1 && c2 && c3) && ((!c0 && x0) || (!c1 && x1) || (!c2 && x2) || (!c3 && x3))) return null // prettier-ignore
73+
if (!(c0 && c1 && c2 && c3) && ((!c0 && x0) || (!c1 && x1) || (!c2 && x2) || (!c3 && x3))) break
7474
x[i] = c0
7575
x[i + 1] = c1
7676
x[i + 2] = c2
@@ -80,35 +80,56 @@ function encode(s, m) {
8080
for (; i < len; i++) {
8181
const x0 = x[i]
8282
const c0 = m[x0]
83-
if (!c0 && x0) return null
83+
if (!c0 && x0) break
8484
x[i] = c0
8585
}
8686

87+
if (i < len) {
88+
if (!loose) return null
89+
let j = i
90+
while (i < len) {
91+
const x0 = x[i++]
92+
let c0 = m[x0]
93+
if (!c0 && x0) {
94+
c0 = 63 // '?'
95+
if (x0 >= 0xd8_00 && x0 < 0xdc_00 && i < len) {
96+
const x1 = x[i]
97+
if (x1 >= 0xdc_00 && x1 < 0xe0_00) i++
98+
}
99+
}
100+
101+
x[j++] = c0
102+
}
103+
104+
return new Uint8Array(j === len ? x : x.subarray(0, j))
105+
}
106+
87107
return new Uint8Array(x)
88108
}
89109

90110
export function createSinglebyteEncoder(encoding, { mode = 'fatal' } = {}) {
91-
// TODO: replacement, truncate (replacement will need varying length)
92-
if (mode !== 'fatal') throw new Error('Unsupported mode')
111+
const loose = mode === 'replacement'
112+
if (mode !== 'fatal' && !loose) throw new Error('Unsupported mode')
93113
const m = encodeMap(encoding) // asserts
94114
const isLatin1 = encoding === 'iso-8859-1'
95115

96116
return (s) => {
97117
if (typeof s !== 'string') throw new TypeError(E_STRING)
98118
if (isLatin1) {
99-
if (NON_LATIN.test(s)) throw new TypeError(E_STRICT)
100-
const b = Buffer.from(s, 'latin1')
101-
return new Uint8Array(b.buffer, b.byteOffset, b.byteLength)
102-
}
103-
104-
// Instead of an ASCII regex check, encode optimistically - this is faster
105-
// Check for 8-bit string with a regex though, this is instant on 8-bit strings so doesn't hurt the ASCII fast path
106-
if (!NON_LATIN.test(s)) {
119+
if (!NON_LATIN.test(s)) {
120+
const b = Buffer.from(s, 'latin1')
121+
return new Uint8Array(b.buffer, b.byteOffset, b.byteLength)
122+
}
123+
124+
if (!loose) throw new TypeError(E_STRICT)
125+
} else if (!NON_LATIN.test(s)) {
126+
// Instead of an ASCII regex check, encode optimistically - this is faster
127+
// Check for 8-bit string with a regex though, this is instant on 8-bit strings so doesn't hurt the ASCII fast path
107128
const b = Buffer.from(s, 'utf8') // ascii/latin1 coerces, we need to check
108129
if (b.length === s.length) return new Uint8Array(b.buffer, b.byteOffset, b.byteLength)
109130
}
110131

111-
const res = encode(s, m)
132+
const res = encode(s, m, loose)
112133
if (!res) throw new TypeError(E_STRICT)
113134
return res
114135
}

0 commit comments

Comments
 (0)