diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index ec874e6..196aa9f 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -21,4 +21,4 @@ jobs:
       - name: Run benchmarks
         run: cargo bench
         env:
-          RUSTFLAGS: '-C target-cpu=native'
\ No newline at end of file
+          RUSTFLAGS: '-C target-cpu=neoverse-n2 -C target-feature=+sve2,+ls64'
\ No newline at end of file
diff --git a/src/aarch64.rs b/src/aarch64.rs
index ab9c6f5..7e750fe 100644
--- a/src/aarch64.rs
+++ b/src/aarch64.rs
@@ -1,93 +1,86 @@
-use std::arch::aarch64::{
-    vceqq_u8, vdupq_n_u8, vld1q_u8_x4, vmaxvq_u8, vorrq_u8, vqtbl4q_u8, vst1q_u8,
+use std::arch::{
+    aarch64::{vceqq_u8, vdupq_n_u8, vld1q_u8_x4, vmaxvq_u8, vorrq_u8, vqtbl4q_u8, vst1q_u8},
+    asm, is_aarch64_feature_detected,
 };
 use crate::{encode_str_inner, write_char_escape, CharEscape, ESCAPE, REVERSE_SOLIDUS};
 
-/// Four contiguous 16-byte NEON registers (64 B) per loop.
+/// Bytes handled per *outer* iteration in the new 8.7 path.
+/// (Still 64 B in the NEON fallback.)
 const CHUNK: usize = 64;
 
-/// Distance (in bytes) to prefetch ahead. Must be a multiple of 8 for PRFM.
-/// Keeping ~4 iterations (4 × CHUNK = 256 B) ahead strikes a good balance
-/// between hiding memory latency and not evicting useful cache lines.
+/// Prefetch distance (works for both paths).
 const PREFETCH_DISTANCE: usize = CHUNK * 4;
 
 pub fn encode_str<S: AsRef<str>>(input: S) -> String {
     let s = input.as_ref();
     let mut out = Vec::with_capacity(s.len() + 2);
-    let bytes = s.as_bytes();
-    let n = bytes.len();
+    let b = s.as_bytes();
+    let n = b.len();
     out.push(b'"');
 
     unsafe {
-        let tbl = vld1q_u8_x4(ESCAPE.as_ptr()); // first 64 B of the escape table
-        let slash = vdupq_n_u8(b'\\');
-        let mut i = 0;
-        // Re-usable scratch – *uninitialised*, so no memset in the loop.
-        // Using MaybeUninit instead of mem::zeroed() prevents the compiler from inserting an implicit memset (observable with -Cllvm-args=-print-after=expand-memcmp).
-        // This is a proven micro-optimisation in Rust's standard library I/O stack.
         #[allow(invalid_value)]
-        let mut placeholder: [u8; 16] = core::mem::MaybeUninit::uninit().assume_init();
+        let mut scratch: [u8; 16] = core::mem::MaybeUninit::uninit().assume_init();
+
+        let mut i = 0;
+
+        /* ------------------------------------------------------------------ */
+        /* === Arm v8.7 fast path: LS64 + SVE2 =============================== */
+
+        let tbl = vld1q_u8_x4(ESCAPE.as_ptr());
+        let slash = vdupq_n_u8(b'\\');
 
         while i + CHUNK <= n {
-            let ptr = bytes.as_ptr().add(i);
-
-            /* ---- L1 prefetch: PREFETCH_DISTANCE bytes ahead ---- */
-            core::arch::asm!(
-                "prfm pldl1keep, [{0}, #{1}]",
-                in(reg) ptr,
-                const PREFETCH_DISTANCE,
-            );
-            /* ------------------------------------------ */
-
-            let quad = vld1q_u8_x4(ptr);
-
-            // load 64 B (four q-regs)
-            let a = quad.0;
-            let b = quad.1;
-            let c = quad.2;
-            let d = quad.3;
-
-            let mask_1 = vorrq_u8(vqtbl4q_u8(tbl, a), vceqq_u8(slash, a));
-            let mask_2 = vorrq_u8(vqtbl4q_u8(tbl, b), vceqq_u8(slash, b));
-            let mask_3 = vorrq_u8(vqtbl4q_u8(tbl, c), vceqq_u8(slash, c));
-            let mask_4 = vorrq_u8(vqtbl4q_u8(tbl, d), vceqq_u8(slash, d));
-
-            let mask_r_1 = vmaxvq_u8(mask_1);
-            let mask_r_2 = vmaxvq_u8(mask_2);
-            let mask_r_3 = vmaxvq_u8(mask_3);
-            let mask_r_4 = vmaxvq_u8(mask_4);
-
-            // fast path: nothing needs escaping
-            if mask_r_1 | mask_r_2 | mask_r_3 | mask_r_4 == 0 {
-                out.extend_from_slice(std::slice::from_raw_parts(ptr, CHUNK));
-                i += CHUNK;
+            let ptr = b.as_ptr().add(i);
+            if is_aarch64_feature_detected!("sve2") {
+                i += escape_block_sve(ptr, &mut out);
                 continue;
-            }
+            } else {
+                asm!("prfm pldl1keep, [{0}, #{1}]",
+                     in(reg) ptr, const PREFETCH_DISTANCE);
 
-            macro_rules! handle {
-                ($mask:expr, $mask_r:expr, $off:expr) => {
-                    if $mask_r == 0 {
-                        out.extend_from_slice(std::slice::from_raw_parts(ptr.add($off), 16));
-                    } else {
-                        vst1q_u8(placeholder.as_mut_ptr(), $mask);
-                        handle_block(&bytes[i + $off..i + $off + 16], &placeholder, &mut out);
-                    }
-                };
-            }
+                let quad = vld1q_u8_x4(ptr);
+                let a = quad.0;
+                let b1 = quad.1;
+                let c = quad.2;
+                let d = quad.3;
+
+                let m1 = vorrq_u8(vqtbl4q_u8(tbl, a), vceqq_u8(slash, a));
+                let m2 = vorrq_u8(vqtbl4q_u8(tbl, b1), vceqq_u8(slash, b1));
+                let m3 = vorrq_u8(vqtbl4q_u8(tbl, c), vceqq_u8(slash, c));
+                let m4 = vorrq_u8(vqtbl4q_u8(tbl, d), vceqq_u8(slash, d));
 
-            handle!(mask_1, mask_r_1, 0);
-            handle!(mask_2, mask_r_2, 16);
-            handle!(mask_3, mask_r_3, 32);
-            handle!(mask_4, mask_r_4, 48);
+                if vmaxvq_u8(m1) | vmaxvq_u8(m2) | vmaxvq_u8(m3) | vmaxvq_u8(m4) == 0 {
+                    out.extend_from_slice(std::slice::from_raw_parts(ptr, CHUNK));
+                    i += CHUNK;
+                    continue;
+                }
 
-            i += CHUNK;
+                macro_rules! handle {
+                    ($m:expr,$r:expr,$off:expr) => {
+                        if $r == 0 {
+                            out.extend_from_slice(std::slice::from_raw_parts(ptr.add($off), 16));
+                        } else {
+                            vst1q_u8(scratch.as_mut_ptr(), $m);
+                            handle_block(&b[i + $off..i + $off + 16], &scratch, &mut out);
+                        }
+                    };
+                }
+                handle!(m1, vmaxvq_u8(m1), 0);
+                handle!(m2, vmaxvq_u8(m2), 16);
+                handle!(m3, vmaxvq_u8(m3), 32);
+                handle!(m4, vmaxvq_u8(m4), 48);
+
+                i += CHUNK;
+            }
         }
 
+        /* ------------------------------------------------------------------ */
+
         if i < n {
-            encode_str_inner(&bytes[i..], &mut out);
+            encode_str_inner(&b[i..], &mut out);
         }
     }
     out.push(b'"');
-    // SAFETY: we only emit valid UTF-8
     unsafe { String::from_utf8_unchecked(out) }
 }
@@ -100,8 +93,75 @@ unsafe fn handle_block(src: &[u8], mask: &[u8; 16], dst: &mut Vec<u8>) {
         } else if m == 0xFF {
             dst.extend_from_slice(REVERSE_SOLIDUS);
         } else {
-            let e = CharEscape::from_escape_table(m, c);
-            write_char_escape(dst, e);
+            write_char_escape(dst, CharEscape::from_escape_table(m, c));
         }
     }
 }
+
+#[inline(always)]
+unsafe fn escape_block_sve(ptr: *const u8, dst: &mut Vec<u8>) -> usize {
+    /* ------------------------------------------------------------------ */
+    /* One-shot: copy ESCAPE[0..64] into z4-z7                             */
+    /* Each LD1B uses an in-range offset and bumps x9 by 16 bytes.         */
+    core::arch::asm!(
+        "ptrue p0.b",
+        "mov x9, {tbl}",
+        "ld1b z4.b, p0/z, [x9]",
+        "add x9, x9, #16",
+        "ld1b z5.b, p0/z, [x9]",
+        "add x9, x9, #16",
+        "ld1b z6.b, p0/z, [x9]",
+        "add x9, x9, #16",
+        "ld1b z7.b, p0/z, [x9]",
+        tbl = in(reg) crate::ESCAPE.as_ptr(),
+        out("x9") _,
+        options(readonly, nostack, preserves_flags)
+    );
+    /* ------------------------------------------------------------------ */
+
+    /* 1️⃣ Single-copy 64-byte fetch into L1 */
+    core::arch::asm!(
+        "ld64b x0, [{src}]",
+        src = in(reg) ptr,
+        out("x0") _, out("x1") _, out("x2") _, out("x3") _,
+        out("x4") _, out("x5") _, out("x6") _, out("x7") _,
+        options(nostack)
+    );
+
+    /* 2️⃣ Build escape mask */
+    let mut mask: u32;
+    core::arch::asm!(
+        "ptrue p0.b",
+        "ld1b z0.b, p0/z, [{src}]",
+        "tbl z1.b, {{z4.b, z5.b, z6.b, z7.b}}, z0.b",
+        "dup z2.b, {slash}",
+        "cmeq z2.b, p0/m, z0.b, z2.b",
+        "orr z3.b, z1.b, z2.b",
+        "umaxv {mask:w}, p0, z3.b", // scalar result → wMask
+        src = in(reg) ptr,
+        slash = const b'\\',
+        mask = lateout(reg) mask,
+        options(preserves_flags, nostack, readonly)
+    );
+
+    if mask == 0 {
+        dst.extend_from_slice(std::slice::from_raw_parts(ptr, CHUNK));
+        return CHUNK;
+    }
+
+    /* 3️⃣ Spill z3 and escape the bad bytes */
+    let mut m = [0u8; CHUNK];
+    core::arch::asm!("ptrue p0.b", "st1b z3.b, p0, [{buf}]",
+                     buf = in(reg) m.as_mut_ptr(), options(nostack));
+    for (i, &bit) in m.iter().enumerate() {
+        let c = *ptr.add(i);
+        if bit == 0 {
+            dst.push(c);
+        } else if bit == 0xFF {
+            dst.extend_from_slice(crate::REVERSE_SOLIDUS);
+        } else {
+            crate::write_char_escape(dst, CharEscape::from_escape_table(bit, c));
+        }
+    }
+    CHUNK
+}
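
One structural note on the new loop: is_aarch64_feature_detected!("sve2") is evaluated for every 64-byte chunk. Below is a minimal sketch, not part of the patch, of hoisting the probe out of the loop. It assumes the escape_block_sve helper introduced by this diff plus a hypothetical escape_block_neon that wraps the existing NEON body; the check shown covers SVE2 only and does not cover the FEAT_LS64 requirement of ld64b.

    use std::arch::is_aarch64_feature_detected;

    // Sketch only: probe the CPU once per call rather than once per 64-byte chunk.
    // `escape_block_neon` is a hypothetical refactor of the NEON block above into
    // its own function; `escape_block_sve` is the helper from this diff.
    unsafe fn encode_chunks(bytes: &[u8], out: &mut Vec<u8>) -> usize {
        let use_sve2 = is_aarch64_feature_detected!("sve2");
        let mut i = 0;
        while i + CHUNK <= bytes.len() {
            let ptr = bytes.as_ptr().add(i);
            i += if use_sve2 {
                escape_block_sve(ptr, out)   // SVE2 + LS64 path from this patch
            } else {
                escape_block_neon(ptr, out)  // existing NEON path, factored out
            };
        }
        i // bytes consumed; the caller escapes the tail with encode_str_inner
    }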
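
The NEON classification kept in the fallback path leans on a vqtbl4q_u8 property worth spelling out: a four-register TBL returns 0 for any index of 64 or more, so only the first 64 bytes of ESCAPE (control characters and the quote) can produce a nonzero lookup, and the backslash (0x5C) is caught separately by vceqq_u8. A scalar model of the per-byte test follows; the helper name is hypothetical and escape64 stands for the 64 bytes loaded by vld1q_u8_x4.

    // Scalar sketch of what the SIMD mask computes for one input byte.
    // vqtbl4q_u8 yields 0 for indices >= 64, hence the range guard.
    fn needs_escape(byte: u8, escape64: &[u8; 64]) -> bool {
        let tbl_hit = if (byte as usize) < escape64.len() {
            escape64[byte as usize]
        } else {
            0
        };
        tbl_hit != 0 || byte == b'\\' // mirrors the separate vceqq_u8(slash, ...) compare
    }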
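
For context, the function's observable contract is unchanged by this patch: the input comes back wrapped in double quotes with JSON string escaping applied. A small usage sketch, assuming encode_str stays exported as it is today and that quotes and backslashes are escaped as \" and \\ respectively:

    let quoted = encode_str(r#"path\to "file""#);
    assert_eq!(quoted, r#""path\\to \"file\"""#);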