Add readme

Brooooooklyn · Brooooooklyn · commit 91a11ce2e9ce · 2025-09-23T14:54:16.000+08:00
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -3,14 +3,18 @@ members = ["cpu-features"]
 
 [package]
 name = "json-escape-simd"
-version = "0.1.0"
+version = "1.0.0"
 edition = "2024"
 rust-version = "1.89.0"
+include = ["src/**/*.rs"]
 
 [[example]]
 name = "escape"
 path = "examples/escape.rs"
 
+[features]
+force_aarch64_generic = [] # Force use of generic implementation on aarch64
+
 [[bench]]
 name = "escape"
 harness = false
diff --git a/README.md b/README.md
@@ -0,0 +1,89 @@
+# json-escape-simd
+
+Optimized SIMD routines for escaping JSON strings. This repository contains the `json-escape-simd` crate, comparison fixtures, and Criterion benches against commonly used alternatives.
+
+> [!IMPORTANT]
+>
+> On aarch64 NEON hosts the available register width is **128** bits, which is narrower than the lookup table this implementation prefers. As a result the SIMD path may not outperform the generic fallback, which is reflected in the benchmark numbers below.
+>
+> On some modern macOS devices with a larger number of registers, the SIMD path may outperform the generic fallback; see the [Apple M3 Max benchmark](#apple-m3-max) below.
+
+> [!NOTE]
+>
+> The `force_aarch64_generic` feature flag can be used to force use of the generic fallback on aarch64. This is useful for testing the generic fallback on aarch64 devices with a smaller number of registers.
+
+## Benchmarks
+
+Numbers below come from `cargo bench` runs on GitHub Actions hardware, plus one set of runs on an Apple M3 Max machine. Criterion reports are summarized to make it easier to spot relative performance. "vs fastest" shows how much slower each implementation is compared to the fastest entry in the table (1.00× means fastest).
+
+### GitHub Actions x86_64 (`ubuntu-latest`)
+
+`AVX2` enabled.
+
+**RxJS payload (~10k iterations)**
+
+| Implementation        | Median time   | vs fastest |
+| --------------------- | ------------- | ---------- |
+| **`escape simd`**     | **345.06 µs** | **1.00×**  |
+| `escape v_jsonescape` | 576.25 µs     | 1.67×      |
+| `escape generic`      | 657.94 µs     | 1.91×      |
+| `serde_json`          | 766.72 µs     | 2.22×      |
+| `json-escape`         | 782.65 µs     | 2.27×      |
+
+**Fixtures payload (~300 iterations)**
+
+| Implementation        | Median time  | vs fastest |
+| --------------------- | ------------ | ---------- |
+| **`escape simd`**     | **12.84 ms** | **1.00×**  |
+| `escape v_jsonescape` | 19.66 ms     | 1.53×      |
+| `escape generic`      | 22.53 ms     | 1.75×      |
+| `serde_json`          | 24.65 ms     | 1.92×      |
+| `json-escape`         | 26.64 ms     | 2.07×      |
+
+### GitHub Actions aarch64 (`ubuntu-24.04-arm`)
+
+`NEON` enabled.
+
+**RxJS payload (~10k iterations)**
+
+| Implementation        | Median time   | vs fastest |
+| --------------------- | ------------- | ---------- |
+| **`escape generic`**  | **546.89 µs** | **1.00×**  |
+| `escape simd`         | 589.29 µs     | 1.08×      |
+| `serde_json`          | 612.33 µs     | 1.12×      |
+| `json-escape`         | 624.66 µs     | 1.14×      |
+| `escape v_jsonescape` | 789.14 µs     | 1.44×      |
+
+**Fixtures payload (~300 iterations)**
+
+| Implementation        | Median time  | vs fastest |
+| --------------------- | ------------ | ---------- |
+| **`escape generic`**  | **17.81 ms** | **1.00×**  |
+| `serde_json`          | 19.77 ms     | 1.11×      |
+| `json-escape`         | 20.84 ms     | 1.17×      |
+| `escape simd`         | 21.04 ms     | 1.18×      |
+| `escape v_jsonescape` | 25.57 ms     | 1.44×      |
+
+### Apple M3 Max
+
+
+
+**RxJS payload (~10k iterations)**
+
+| Implementation        | Median time   | vs fastest |
+| --------------------- | ------------- | ---------- |
+| **`escape simd`**     | **307.20 µs** | **1.00×**  |
+| `escape generic`      | 490.00 µs     | 1.60×      |
+| `serde_json`          | 570.35 µs     | 1.86×      |
+| `escape v_jsonescape` | 599.72 µs     | 1.95×      |
+| `json-escape`         | 644.73 µs     | 2.10×      |
+
+**Fixtures payload (~300 iterations)**
+
+| Implementation        | Median time  | vs fastest |
+| --------------------- | ------------ | ---------- |
+| **`escape generic`**  | **17.89 ms** | **1.00×**  |
+| **`escape simd`**     | **17.92 ms** | **1.00×**  |
+| `serde_json`          | 19.78 ms     | 1.11×      |
+| `escape v_jsonescape` | 21.09 ms     | 1.18×      |
+| `json-escape`         | 22.43 ms     | 1.25×      |
diff --git a/src/aarch64.rs b/src/aarch64.rs
@@ -5,7 +5,8 @@ use std::arch::aarch64::{
 use crate::{ESCAPE, HEX_BYTES, UU};
 
 const CHUNK: usize = 64;
-const PREFETCH_DISTANCE: usize = CHUNK * 4;
+// Prefetch two chunks (128 bytes) ahead of the current read position.
+const PREFETCH_DISTANCE: usize = CHUNK * 2;
 const SLASH_SENTINEL: u8 = 0xFF;
 
 #[inline]
@@ -30,9 +31,8 @@ pub fn escape_neon<S: AsRef<str>>(input: S) -> String {
             let ptr = bytes.as_ptr().add(i);
 
             core::arch::asm!(
-                "prfm pldl1keep, [{0}, #{1}]",
-                in(reg) ptr,
-                const PREFETCH_DISTANCE,
+                "prfm pldl1keep, [{0}]",
+                in(reg) ptr.add(PREFETCH_DISTANCE),
             );
 
             let quad = vld1q_u8_x4(ptr);
diff --git a/src/lib.rs b/src/lib.rs
@@ -1,7 +1,7 @@
 #[cfg(target_arch = "x86_64")]
 mod x86;
 
-#[cfg(target_arch = "aarch64")]
+#[cfg(all(target_arch = "aarch64", not(feature = "force_aarch64_generic")))]
 mod aarch64;
 
 const BB: u8 = b'b'; // \x08
@@ -150,7 +150,14 @@ pub fn escape<S: AsRef<str>>(input: S) -> String {
 
     #[cfg(target_arch = "aarch64")]
     {
-        return aarch64::escape_neon(input);
+        #[cfg(feature = "force_aarch64_generic")]
+        {
+            return escape_generic(input);
+        }
+        #[cfg(not(feature = "force_aarch64_generic"))]
+        {
+            return aarch64::escape_neon(input);
+        }
     }
 
     #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]