feat: add replacement support in single-byte encoders

ChALkeR · ChALkeR · commit 6ecf4aa2eec0 · 2026-01-19T05:36:17.000+04:00
diff --git a/README.md b/README.md
@@ -189,6 +189,8 @@ Returns a function `encode(string)` that encodes a string to bytes.
 In `'fatal'` mode (default), will throw on non well-formed strings or any codepoints which could
 not be encoded in the target encoding.
 
+In `'replacement'` mode, each unmapped codepoint and each unpaired surrogate is replaced with a single `?` (`U+003F`, encoded as byte `0x3F`).
+
 ##### `latin1toString(arr)`
 
 Decode `iso-8859-1` bytes to a string.
diff --git a/single-byte.js b/single-byte.js
@@ -61,15 +61,16 @@ export function createSinglebyteDecoder(encoding, loose = false) {
 
 const NON_LATIN = /[^\x00-\xFF]/ // eslint-disable-line no-control-regex
 
-function encode(s, m) {
+function encode(s, m, loose) {
   const len = s.length
   const x = new Uint8Array(len)
   let i = nativeEncoder ? 0 : encodeAsciiPrefix(x, s)
 
+  if (!m || m.length < 256) return null // perf
   for (const len3 = len - 3; i < len3; i += 4) {
     const x0 = s.charCodeAt(i), x1 = s.charCodeAt(i + 1), x2 = s.charCodeAt(i + 2), x3 = s.charCodeAt(i + 3) // prettier-ignore
     const c0 = m[x0], c1 = m[x1], c2 = m[x2], c3 = m[x3] // prettier-ignore
-    if ((!c0 && x0) || (!c1 && x1) || (!c2 && x2) || (!c3 && x3)) return null
+    if ((!c0 && x0) || (!c1 && x1) || (!c2 && x2) || (!c3 && x3)) break
 
     x[i] = c0
     x[i + 1] = c1
@@ -80,19 +81,36 @@ function encode(s, m) {
   for (; i < len; i++) {
     const x0 = s.charCodeAt(i)
     const c0 = m[x0]
-    if (!c0 && x0) return null
+    if (!c0 && x0) break
     x[i] = c0
   }
 
-  return x
+  if (i === len) return x
+  if (!loose) return null
+  let j = i
+  while (i < len) {
+    const x0 = s.charCodeAt(i++)
+    let c0 = m[x0]
+    if (!c0 && x0) {
+      c0 = 63 // '?'
+      if (x0 >= 0xd8_00 && x0 < 0xdc_00 && i < len) {
+        const x1 = s.charCodeAt(i)
+        if (x1 >= 0xdc_00 && x1 < 0xe0_00) i++
+      }
+    }
+
+    x[j++] = c0
+  }
+
+  return j === len ? x : x.subarray(0, j)
 }
 
 // fromBase64+btoa path is faster on everything where fromBase64 is fast
 const useLatin1btoa = Uint8Array.fromBase64 && btoa && !skipWeb
 
 export function createSinglebyteEncoder(encoding, { mode = 'fatal' } = {}) {
-  // TODO: replacement, truncate (replacement will need varying length)
-  if (mode !== 'fatal') throw new Error('Unsupported mode')
+  const loose = mode === 'replacement'
+  if (mode !== 'fatal' && !loose) throw new Error('Unsupported mode')
   const m = encodeMap(encoding) // asserts
   const isLatin1 = encoding === 'iso-8859-1'
 
@@ -106,24 +124,21 @@ export function createSinglebyteEncoder(encoding, { mode = 'fatal' } = {}) {
       if (useLatin1btoa && s.length >= 1024 && s.length < 1e8) {
         try {
           return Uint8Array.fromBase64(btoa(s)) // fails on non-latin1
-        } catch {
-          throw new TypeError(E_STRICT)
-        }
+        } catch {}
+      } else if (!NON_LATIN.test(s)) {
+        return encodeLatin1(s)
       }
 
-      if (NON_LATIN.test(s)) throw new TypeError(E_STRICT)
-      return encodeLatin1(s)
-    }
-
-    // Instead of an ASCII regex check, encode optimistically - this is faster
-    // Check for 8-bit string with a regex though, this is instant on 8-bit strings so doesn't hurt the ASCII fast path
-    if (nativeEncoder && !NON_LATIN.test(s)) {
+      if (!loose) throw new TypeError(E_STRICT)
+    } else if (nativeEncoder && !NON_LATIN.test(s)) {
+      // Instead of an ASCII regex check, encode optimistically - this is faster
+      // Check for 8-bit string with a regex though, this is instant on 8-bit strings so doesn't hurt the ASCII fast path
       try {
         return encodeAscii(s, E_STRICT)
       } catch {}
     }
 
-    const res = encode(s, m)
+    const res = encode(s, m, loose)
     if (!res) throw new TypeError(E_STRICT)
     return res
   }
diff --git a/single-byte.node.js b/single-byte.node.js
@@ -61,16 +61,17 @@ export function createSinglebyteDecoder(encoding, loose = false) {
 
 const NON_LATIN = /[^\x00-\xFF]/ // eslint-disable-line no-control-regex
 
-function encode(s, m) {
+function encode(s, m, loose) {
   const len = s.length
   let i = 0
   const b = Buffer.from(s, 'utf-16le') // aligned
   if (!isLE) b.swap16()
   const x = new Uint16Array(b.buffer, b.byteOffset, b.byteLength / 2)
+  if (!m || m.length < 256) return null // perf
   for (const len3 = len - 3; i < len3; i += 4) {
     const x0 = x[i], x1 = x[i + 1], x2 = x[i + 2], x3 = x[i + 3] // prettier-ignore
     const c0 = m[x0], c1 = m[x1], c2 = m[x2], c3 = m[x3] // prettier-ignore
-    if (!(c0 && c1 && c2 && c3) && ((!c0 && x0) || (!c1 && x1) || (!c2 && x2) || (!c3 && x3))) return null // prettier-ignore
+    if (!(c0 && c1 && c2 && c3) && ((!c0 && x0) || (!c1 && x1) || (!c2 && x2) || (!c3 && x3))) break
     x[i] = c0
     x[i + 1] = c1
     x[i + 2] = c2
@@ -80,35 +81,53 @@ function encode(s, m) {
   for (; i < len; i++) {
     const x0 = x[i]
     const c0 = m[x0]
-    if (!c0 && x0) return null
+    if (!c0 && x0) break
     x[i] = c0
   }
 
-  return new Uint8Array(x)
+  if (i === len) return new Uint8Array(x)
+  if (!loose) return null
+  let j = i
+  while (i < len) {
+    const x0 = x[i++]
+    let c0 = m[x0]
+    if (!c0 && x0) {
+      c0 = 63 // '?'
+      if (x0 >= 0xd8_00 && x0 < 0xdc_00 && i < len) {
+        const x1 = x[i]
+        if (x1 >= 0xdc_00 && x1 < 0xe0_00) i++
+      }
+    }
+
+    x[j++] = c0
+  }
+
+  return new Uint8Array(j === len ? x : x.subarray(0, j))
 }
 
 export function createSinglebyteEncoder(encoding, { mode = 'fatal' } = {}) {
-  // TODO: replacement, truncate (replacement will need varying length)
-  if (mode !== 'fatal') throw new Error('Unsupported mode')
+  const loose = mode === 'replacement'
+  if (mode !== 'fatal' && !loose) throw new Error('Unsupported mode')
   const m = encodeMap(encoding) // asserts
   const isLatin1 = encoding === 'iso-8859-1'
 
   return (s) => {
     if (typeof s !== 'string') throw new TypeError(E_STRING)
     if (isLatin1) {
-      if (NON_LATIN.test(s)) throw new TypeError(E_STRICT)
-      const b = Buffer.from(s, 'latin1')
-      return new Uint8Array(b.buffer, b.byteOffset, b.byteLength)
-    }
-
-    // Instead of an ASCII regex check, encode optimistically - this is faster
-    // Check for 8-bit string with a regex though, this is instant on 8-bit strings so doesn't hurt the ASCII fast path
-    if (!NON_LATIN.test(s)) {
+      if (!NON_LATIN.test(s)) {
+        const b = Buffer.from(s, 'latin1')
+        return new Uint8Array(b.buffer, b.byteOffset, b.byteLength)
+      }
+
+      if (!loose) throw new TypeError(E_STRICT)
+    } else if (!NON_LATIN.test(s)) {
+      // Instead of an ASCII regex check, encode optimistically - this is faster
+      // Check for 8-bit string with a regex though, this is instant on 8-bit strings so doesn't hurt the ASCII fast path
       const b = Buffer.from(s, 'utf8') // ascii/latin1 coerces, we need to check
       if (b.length === s.length) return new Uint8Array(b.buffer, b.byteOffset, b.byteLength)
     }
 
-    const res = encode(s, m)
+    const res = encode(s, m, loose)
     if (!res) throw new TypeError(E_STRICT)
     return res
   }