Skip to content

Commit fe9a903

Browse files
committed
feat: add replacement support in single-byte encoders
1 parent c5ff201 commit fe9a903

File tree

3 files changed

+79
-34
lines changed

3 files changed

+79
-34
lines changed

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -189,6 +189,8 @@ Returns a function `encode(string)` that encodes a string to bytes.
189189
In `'fatal'` mode (default), the encoder throws on strings that are not well-formed or on any codepoints which could
190190
not be encoded in the target encoding.
191191

192+
In `'replacement'` mode, each unmapped codepoint and each unpaired surrogate is replaced with a single `?` byte (`0x3F`, i.e. codepoint `U+003F`).
193+
192194
##### `latin1toString(arr)`
193195

194196
Decode `iso-8859-1` bytes to a string.

single-byte.js

Lines changed: 36 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -61,38 +61,59 @@ export function createSinglebyteDecoder(encoding, loose = false) {
6161

6262
const NON_LATIN = /[^\x00-\xFF]/ // eslint-disable-line no-control-regex
6363

64-
function encode(s, m) {
64+
function encode(s, m, loose) {
6565
const len = s.length
6666
const x = new Uint8Array(len)
6767
let i = nativeEncoder ? 0 : encodeAsciiPrefix(x, s)
6868

69-
for (const len3 = len - 3; i < len3; i += 4) {
69+
if (!m || m.length < 256) return null // perf
70+
const len3 = len - 3
71+
while (i < len3) {
7072
const x0 = s.charCodeAt(i), x1 = s.charCodeAt(i + 1), x2 = s.charCodeAt(i + 2), x3 = s.charCodeAt(i + 3) // prettier-ignore
7173
const c0 = m[x0], c1 = m[x1], c2 = m[x2], c3 = m[x3] // prettier-ignore
72-
if ((!c0 && x0) || (!c1 && x1) || (!c2 && x2) || (!c3 && x3)) return null
74+
if ((!c0 && x0) || (!c1 && x1) || (!c2 && x2) || (!c3 && x3)) break
7375

7476
x[i] = c0
7577
x[i + 1] = c1
7678
x[i + 2] = c2
7779
x[i + 3] = c3
80+
i += 4
7881
}
7982

8083
for (; i < len; i++) {
8184
const x0 = s.charCodeAt(i)
8285
const c0 = m[x0]
83-
if (!c0 && x0) return null
86+
if (!c0 && x0) break
8487
x[i] = c0
8588
}
8689

87-
return x
90+
if (i === len) return x
91+
if (!loose) return null
92+
let j = i
93+
while (i < len) {
94+
const x0 = s.charCodeAt(i++)
95+
if (x0 >= 0xd8_00 && x0 < 0xdc_00) {
96+
if (i < len) {
97+
const x1 = s.charCodeAt(i)
98+
if (x1 >= 0xdc_00 && x1 < 0xe0_00) i++
99+
}
100+
x[j++] = 63 // '?'
101+
} else {
102+
const c0 = m[x0]
103+
x[j++] = !c0 && x0 ? 63 : c0
104+
}
105+
106+
}
107+
108+
return j === len ? x : x.subarray(0, j)
88109
}
89110

90111
// fromBase64+btoa path is faster on everything where fromBase64 is fast
91112
const useLatin1btoa = Uint8Array.fromBase64 && btoa && !skipWeb
92113

93114
export function createSinglebyteEncoder(encoding, { mode = 'fatal' } = {}) {
94-
// TODO: replacement, truncate (replacement will need varying length)
95-
if (mode !== 'fatal') throw new Error('Unsupported mode')
115+
const loose = mode === 'replacement'
116+
if (mode !== 'fatal' && !loose) throw new Error('Unsupported mode')
96117
const m = encodeMap(encoding) // asserts
97118
const isLatin1 = encoding === 'iso-8859-1'
98119

@@ -106,24 +127,21 @@ export function createSinglebyteEncoder(encoding, { mode = 'fatal' } = {}) {
106127
if (useLatin1btoa && s.length >= 1024 && s.length < 1e8) {
107128
try {
108129
return Uint8Array.fromBase64(btoa(s)) // fails on non-latin1
109-
} catch {
110-
throw new TypeError(E_STRICT)
111-
}
130+
} catch {}
131+
} else if (!NON_LATIN.test(s)) {
132+
return encodeLatin1(s)
112133
}
113134

114-
if (NON_LATIN.test(s)) throw new TypeError(E_STRICT)
115-
return encodeLatin1(s)
116-
}
117-
118-
// Instead of an ASCII regex check, encode optimistically - this is faster
119-
// Check for 8-bit string with a regex though, this is instant on 8-bit strings so doesn't hurt the ASCII fast path
120-
if (nativeEncoder && !NON_LATIN.test(s)) {
135+
if (!loose) throw new TypeError(E_STRICT)
136+
} else if (nativeEncoder && !NON_LATIN.test(s)) {
137+
// Instead of an ASCII regex check, encode optimistically - this is faster
138+
// Check for 8-bit string with a regex though, this is instant on 8-bit strings so doesn't hurt the ASCII fast path
121139
try {
122140
return encodeAscii(s, E_STRICT)
123141
} catch {}
124142
}
125143

126-
const res = encode(s, m)
144+
const res = encode(s, m, loose)
127145
if (!res) throw new TypeError(E_STRICT)
128146
return res
129147
}

single-byte.node.js

Lines changed: 41 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -61,54 +61,79 @@ export function createSinglebyteDecoder(encoding, loose = false) {
6161

6262
const NON_LATIN = /[^\x00-\xFF]/ // eslint-disable-line no-control-regex
6363

64-
function encode(s, m) {
64+
function encode(s, m, loose) {
6565
const len = s.length
6666
let i = 0
6767
const b = Buffer.from(s, 'utf-16le') // aligned
6868
if (!isLE) b.swap16()
6969
const x = new Uint16Array(b.buffer, b.byteOffset, b.byteLength / 2)
70-
for (const len3 = len - 3; i < len3; i += 4) {
70+
if (!m || m.length < 256) return null // perf
71+
const len3 = len - 3
72+
while (i < len3) {
7173
const x0 = x[i], x1 = x[i + 1], x2 = x[i + 2], x3 = x[i + 3] // prettier-ignore
7274
const c0 = m[x0], c1 = m[x1], c2 = m[x2], c3 = m[x3] // prettier-ignore
73-
if (!(c0 && c1 && c2 && c3) && ((!c0 && x0) || (!c1 && x1) || (!c2 && x2) || (!c3 && x3))) return null // prettier-ignore
75+
if (!(c0 && c1 && c2 && c3) && ((!c0 && x0) || (!c1 && x1) || (!c2 && x2) || (!c3 && x3))) break
7476
x[i] = c0
7577
x[i + 1] = c1
7678
x[i + 2] = c2
7779
x[i + 3] = c3
80+
i += 4
7881
}
7982

83+
const mlen = m.length
8084
for (; i < len; i++) {
8185
const x0 = x[i]
86+
if (x0 >= mlen) break
8287
const c0 = m[x0]
83-
if (!c0 && x0) return null
88+
if (!c0 && x0) break
8489
x[i] = c0
8590
}
8691

87-
return new Uint8Array(x)
92+
if (i === len) return new Uint8Array(x)
93+
if (!loose) return null
94+
let j = i
95+
while (i < len) {
96+
const x0 = x[i++]
97+
if (x0 >= 0xd8_00 && x0 < 0xdc_00) {
98+
if (i < len) {
99+
const x1 = x[i]
100+
if (x1 >= 0xdc_00 && x1 < 0xe0_00) i++
101+
}
102+
x[j++] = 63 // '?'
103+
} else if (x0 >= mlen) {
104+
x[j++] = 63 // '?'
105+
} else {
106+
const c0 = m[x0]
107+
x[j++] = !c0 && x0 ? 63 : c0
108+
}
109+
}
110+
111+
return new Uint8Array(j === len ? x : x.subarray(0, j))
88112
}
89113

90114
export function createSinglebyteEncoder(encoding, { mode = 'fatal' } = {}) {
91-
// TODO: replacement, truncate (replacement will need varying length)
92-
if (mode !== 'fatal') throw new Error('Unsupported mode')
115+
const loose = mode === 'replacement'
116+
if (mode !== 'fatal' && !loose) throw new Error('Unsupported mode')
93117
const m = encodeMap(encoding) // asserts
94118
const isLatin1 = encoding === 'iso-8859-1'
95119

96120
return (s) => {
97121
if (typeof s !== 'string') throw new TypeError(E_STRING)
98122
if (isLatin1) {
99-
if (NON_LATIN.test(s)) throw new TypeError(E_STRICT)
100-
const b = Buffer.from(s, 'latin1')
101-
return new Uint8Array(b.buffer, b.byteOffset, b.byteLength)
102-
}
103-
104-
// Instead of an ASCII regex check, encode optimistically - this is faster
105-
// Check for 8-bit string with a regex though, this is instant on 8-bit strings so doesn't hurt the ASCII fast path
106-
if (!NON_LATIN.test(s)) {
123+
if (!NON_LATIN.test(s)) {
124+
const b = Buffer.from(s, 'latin1')
125+
return new Uint8Array(b.buffer, b.byteOffset, b.byteLength)
126+
}
127+
128+
if (!loose) throw new TypeError(E_STRICT)
129+
} else if (!NON_LATIN.test(s)) {
130+
// Instead of an ASCII regex check, encode optimistically - this is faster
131+
// Check for 8-bit string with a regex though, this is instant on 8-bit strings so doesn't hurt the ASCII fast path
107132
const b = Buffer.from(s, 'utf8') // ascii/latin1 coerces, we need to check
108133
if (b.length === s.length) return new Uint8Array(b.buffer, b.byteOffset, b.byteLength)
109134
}
110135

111-
const res = encode(s, m)
136+
const res = encode(s, m, loose)
112137
if (!res) throw new TypeError(E_STRICT)
113138
return res
114139
}

0 commit comments

Comments
 (0)