// Portable JSON string escaping (`src/generic.rs`).
//
// Crate wiring carried by the same change set, in `src/lib.rs`:
//   - `mod generic;` is declared alongside the arch-gated `aarch64` / `x86` modules;
//   - `pub use generic::escape_generic;` re-exports the function from the crate root;
//   - the former inline copy of `escape_generic` in `lib.rs` is removed.
//
// NOTE(review): the original attribution comment ("Slightly modified version of ..." /
// "Borrowed from: ...") lost its links; restore them from the upstream source.

/// Escapes `s` as a JSON string literal, including the surrounding double quotes.
///
/// Cross-platform implementation that uses no platform-specific instructions.
///
/// * `"` and `\` are escaped with a backslash.
/// * `\x08`, `\x09`, `\x0A`, `\x0C` and `\x0D` use the short forms `\b`, `\t`, `\n`,
///   `\f` and `\r`.
/// * Every other byte below `0x20` is written as `\u00XX` with lowercase hex digits.
/// * All remaining bytes (including multi-byte UTF-8 sequences) are copied unchanged.
#[inline]
pub fn escape_generic<S: AsRef<str>>(s: S) -> String {
    let bytes = s.as_ref().as_bytes();

    // Most strings need little escaping: reserve the input length plus 50% headroom,
    // and two extra bytes for the enclosing quotes.
    let mut result = Vec::with_capacity(bytes.len() + bytes.len() / 2 + 2);
    result.push(b'"');

    // `start` marks the beginning of the pending run of bytes that need no escaping;
    // the run is flushed in one copy whenever an escapable byte is reached.
    let mut start = 0;
    for (i, &b) in bytes.iter().enumerate() {
        let escape_byte = ESCAPE[usize::from(b)];
        if escape_byte == __ {
            continue;
        }

        result.extend_from_slice(&bytes[start..i]);
        result.push(b'\\');
        if escape_byte == UU {
            // `UU` only occurs in the first two rows of `ESCAPE`, so `b < 0x20` here
            // and the index is within the 32-entry hex table.
            let HexPair(hi, lo) = HEX_BYTES[usize::from(b)];
            result.extend_from_slice(&[b'u', b'0', b'0', hi, lo]);
        } else {
            result.push(escape_byte);
        }
        start = i + 1;
    }

    // Flush whatever follows the last escaped byte (the whole input if none was escaped).
    result.extend_from_slice(&bytes[start..]);
    result.push(b'"');

    // SAFETY: the input is valid UTF-8 and only ASCII bytes are ever replaced, each by
    // an ASCII-only escape sequence; every multi-byte sequence is copied through intact,
    // so the buffer is still valid UTF-8.
    unsafe { String::from_utf8_unchecked(result) }
}

const BB: u8 = b'b'; // \x08
const TT: u8 = b't'; // \x09
const NN: u8 = b'n'; // \x0A
const FF: u8 = b'f'; // \x0C
const RR: u8 = b'r'; // \x0D
const QU: u8 = b'"'; // \x22
const BS: u8 = b'\\'; // \x5C
pub(crate) const UU: u8 = b'u'; // \x00...\x1F except the ones above
const __: u8 = 0;

// Lookup table of escape sequences. A value of b'x' at index i means that byte
// i is escaped as "\x" in JSON. A value of 0 means that byte i is not escaped.
pub(crate) static ESCAPE: [u8; 256] = [
    //   0   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
    UU, UU, UU, UU, UU, UU, UU, UU, BB, TT, NN, UU, FF, RR, UU, UU, // 0
    UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, // 1
    __, __, QU, __, __, __, __, __, __, __, __, __, __, __, __, __, // 2
    __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 3
    __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 4
    __, __, __, __, __, __, __, __, __, __, __, __, BS, __, __, __, // 5
    __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 6
    __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 7
    __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 8
    __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 9
    __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // A
    __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // B
    __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // C
    __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // D
    __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // E
    __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // F
];

// The two lowercase ASCII hex digits (high nibble, low nibble) of a byte value.
#[derive(Debug, Clone, Copy)]
pub(crate) struct HexPair(u8, u8);

// Pre-computed hex digit pairs for the control characters 0x00..=0x1F, indexed by
// byte value. Generated at compile time instead of being typed out row by row.
pub(crate) static HEX_BYTES: [HexPair; 32] = {
    const DIGITS: &[u8; 16] = b"0123456789abcdef";

    let mut table = [HexPair(b'0', b'0'); 32];
    let mut i = 0;
    while i < table.len() {
        table[i] = HexPair(DIGITS[i >> 4], DIGITS[i & 0x0F]);
        i += 1;
    }
    table
};