2 changes: 1 addition & 1 deletion .github/workflows/CI.yml
@@ -21,4 +21,4 @@ jobs:
- name: Run benchmarks
run: cargo bench
env:
RUSTFLAGS: '-C target-cpu=native'
RUSTFLAGS: '-C target-cpu=neoverse-n2 -C target-feature=+sve2,+ls64'
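The new flags tune the benchmark build for a Neoverse N2 host and statically enable SVE2 and LS64. The encoder still gates the new path at runtime (see `src/aarch64.rs` below), so non-SVE2 hosts fall back to NEON. A minimal dispatch sketch, aarch64-only, with a hypothetical `pick_path` helper:

```rust
use std::arch::is_aarch64_feature_detected;

// Illustrative only: the CI flags enable the instructions at compile time,
// but the fast path is still selected per-CPU at runtime.
fn pick_path() -> &'static str {
    if is_aarch64_feature_detected!("sve2") {
        "SVE2 + LS64 block path"
    } else {
        "NEON fallback"
    }
}

fn main() {
    println!("selected: {}", pick_path());
}
```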
194 changes: 127 additions & 67 deletions src/aarch64.rs
@@ -1,93 +1,86 @@
use std::arch::aarch64::{
vceqq_u8, vdupq_n_u8, vld1q_u8_x4, vmaxvq_u8, vorrq_u8, vqtbl4q_u8, vst1q_u8,
use std::arch::{
aarch64::{vceqq_u8, vdupq_n_u8, vld1q_u8_x4, vmaxvq_u8, vorrq_u8, vqtbl4q_u8, vst1q_u8},
asm, is_aarch64_feature_detected,
};

use crate::{encode_str_inner, write_char_escape, CharEscape, ESCAPE, REVERSE_SOLIDUS};

/// Four contiguous 16-byte NEON registers (64 B) per loop.
/// Bytes handled per *outer* iteration in the new 8.7 path.
/// (Still 64 B in the NEON fallback.)
const CHUNK: usize = 64;
/// Distance (in bytes) to prefetch ahead. Must be a multiple of 8 for PRFM.
/// Keeping ~4 iterations (4 × CHUNK = 256 B) ahead strikes a good balance
/// between hiding memory latency and not evicting useful cache lines.
/// Prefetch distance (works for both paths).
const PREFETCH_DISTANCE: usize = CHUNK * 4;

pub fn encode_str<S: AsRef<str>>(input: S) -> String {
let s = input.as_ref();
let mut out = Vec::with_capacity(s.len() + 2);
let bytes = s.as_bytes();
let n = bytes.len();
let b = s.as_bytes();
let n = b.len();
out.push(b'"');

unsafe {
let tbl = vld1q_u8_x4(ESCAPE.as_ptr()); // first 64 B of the escape table
let slash = vdupq_n_u8(b'\\');
let mut i = 0;
// Re-usable scratch – *uninitialised*, so no memset in the loop.
// Using MaybeUninit instead of mem::zeroed() prevents the compiler from inserting an implicit memset (observable with -Cllvm-args=-print-after=expand-memcmp).
// This is a proven micro-optimisation in Rust's standard library I/O stack.
#[allow(invalid_value)]
let mut placeholder: [u8; 16] = core::mem::MaybeUninit::uninit().assume_init();
let mut scratch: [u8; 16] = core::mem::MaybeUninit::uninit().assume_init();

let mut i = 0;

/* ------------------------------------------------------------------ */
/* === Arm v8.7 fast path: LS64 + SVE2 =============================== */

let tbl = vld1q_u8_x4(ESCAPE.as_ptr());
let slash = vdupq_n_u8(b'\\');

while i + CHUNK <= n {
let ptr = bytes.as_ptr().add(i);

/* ---- L1 prefetch: PREFETCH_DISTANCE bytes ahead ---- */
core::arch::asm!(
"prfm pldl1keep, [{0}, #{1}]",
in(reg) ptr,
const PREFETCH_DISTANCE,
);
/* ------------------------------------------ */

let quad = vld1q_u8_x4(ptr);

// load 64 B (four q-regs)
let a = quad.0;
let b = quad.1;
let c = quad.2;
let d = quad.3;

let mask_1 = vorrq_u8(vqtbl4q_u8(tbl, a), vceqq_u8(slash, a));
let mask_2 = vorrq_u8(vqtbl4q_u8(tbl, b), vceqq_u8(slash, b));
let mask_3 = vorrq_u8(vqtbl4q_u8(tbl, c), vceqq_u8(slash, c));
let mask_4 = vorrq_u8(vqtbl4q_u8(tbl, d), vceqq_u8(slash, d));

let mask_r_1 = vmaxvq_u8(mask_1);
let mask_r_2 = vmaxvq_u8(mask_2);
let mask_r_3 = vmaxvq_u8(mask_3);
let mask_r_4 = vmaxvq_u8(mask_4);

// fast path: nothing needs escaping
if mask_r_1 | mask_r_2 | mask_r_3 | mask_r_4 == 0 {
out.extend_from_slice(std::slice::from_raw_parts(ptr, CHUNK));
i += CHUNK;
let ptr = b.as_ptr().add(i);
if is_aarch64_feature_detected!("sve2") {
i += escape_block_sve(ptr, &mut out);
continue;
}
} else {
asm!("prfm pldl1keep, [{0}, #{1}]",
in(reg) ptr, const PREFETCH_DISTANCE);

macro_rules! handle {
($mask:expr, $mask_r:expr, $off:expr) => {
if $mask_r == 0 {
out.extend_from_slice(std::slice::from_raw_parts(ptr.add($off), 16));
} else {
vst1q_u8(placeholder.as_mut_ptr(), $mask);
handle_block(&bytes[i + $off..i + $off + 16], &placeholder, &mut out);
}
};
}
let quad = vld1q_u8_x4(ptr);
let a = quad.0;
let b1 = quad.1;
let c = quad.2;
let d = quad.3;

let m1 = vorrq_u8(vqtbl4q_u8(tbl, a), vceqq_u8(slash, a));
let m2 = vorrq_u8(vqtbl4q_u8(tbl, b1), vceqq_u8(slash, b1));
let m3 = vorrq_u8(vqtbl4q_u8(tbl, c), vceqq_u8(slash, c));
let m4 = vorrq_u8(vqtbl4q_u8(tbl, d), vceqq_u8(slash, d));

handle!(mask_1, mask_r_1, 0);
handle!(mask_2, mask_r_2, 16);
handle!(mask_3, mask_r_3, 32);
handle!(mask_4, mask_r_4, 48);
if vmaxvq_u8(m1) | vmaxvq_u8(m2) | vmaxvq_u8(m3) | vmaxvq_u8(m4) == 0 {
out.extend_from_slice(std::slice::from_raw_parts(ptr, CHUNK));
i += CHUNK;
continue;
}

i += CHUNK;
macro_rules! handle {
($m:expr,$r:expr,$off:expr) => {
if $r == 0 {
out.extend_from_slice(std::slice::from_raw_parts(ptr.add($off), 16));
} else {
vst1q_u8(scratch.as_mut_ptr(), $m);
handle_block(&b[i + $off..i + $off + 16], &scratch, &mut out);
}
};
}
handle!(m1, vmaxvq_u8(m1), 0);
handle!(m2, vmaxvq_u8(m2), 16);
handle!(m3, vmaxvq_u8(m3), 32);
handle!(m4, vmaxvq_u8(m4), 48);

i += CHUNK;
}
}
/* ------------------------------------------------------------------ */

if i < n {
encode_str_inner(&bytes[i..], &mut out);
encode_str_inner(&b[i..], &mut out);
}
}
out.push(b'"');
// SAFETY: we only emit valid UTF-8
unsafe { String::from_utf8_unchecked(out) }
}
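The rewrite keeps `encode_str`'s observable behaviour: wrap the input in double quotes and escape per the table. A hedged usage sketch (the crate name is a stand-in, and the expected output assumes standard JSON string escaping for `"` and `\`):

```rust
// `json_escape` is a placeholder for this crate's real name.
use json_escape::encode_str;

fn main() {
    let encoded = encode_str("he said \"hi\"\\bye");
    // Quote-wrapped, with `"` and `\` escaped (assumed JSON rules).
    assert_eq!(encoded, r#""he said \"hi\"\\bye""#);
    println!("{encoded}");
}
```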

@@ -100,8 +93,75 @@ unsafe fn handle_block(src: &[u8], mask: &[u8; 16], dst: &mut Vec<u8>) {
} else if m == 0xFF {
dst.extend_from_slice(REVERSE_SOLIDUS);
} else {
let e = CharEscape::from_escape_table(m, c);
write_char_escape(dst, e);
write_char_escape(dst, CharEscape::from_escape_table(m, c));
}
}
}

#[inline(always)]
unsafe fn escape_block_sve(ptr: *const u8, dst: &mut Vec<u8>) -> usize {
/* ------------------------------------------------------------------ */
/* One-shot: copy ESCAPE[0..64] into z4-z7 */
/* Each LD1B uses an in-range offset and bumps x9 by 16 bytes. */
core::arch::asm!(
"ptrue p0.b",
"mov x9, {tbl}",
"ld1b z4.b, p0/z, [x9]",
"add x9, x9, #16",
"ld1b z5.b, p0/z, [x9]",
"add x9, x9, #16",
"ld1b z6.b, p0/z, [x9]",
"add x9, x9, #16",
"ld1b z7.b, p0/z, [x9]",
tbl = in(reg) crate::ESCAPE.as_ptr(),
out("x9") _,
options(readonly, nostack, preserves_flags)
);
/* ------------------------------------------------------------------ */

/* 1️⃣ Single-copy 64-byte fetch into L1 */
core::arch::asm!(
"ld64b x0, [{src}]",
src = in(reg) ptr,
out("x0") _, out("x1") _, out("x2") _, out("x3") _,
out("x4") _, out("x5") _, out("x6") _, out("x7") _,
options(nostack)
);

/* 2️⃣ Build escape mask */
let mut mask: u32;
core::arch::asm!(
"ptrue p0.b",
"ld1b z0.b, p0/z, [{src}]",
"tbl z1.b, {{z4.b, z5.b, z6.b, z7.b}}, z0.b",
"dup z2.b, {slash}",
"cmeq z2.b, p0/m, z0.b, z2.b",
"orr z3.b, z1.b, z2.b",
"umaxv {mask:w}, p0, z3.b", // scalar result → wMask
src = in(reg) ptr,
slash = const b'\\',
mask = lateout(reg) mask,
options(preserves_flags, nostack, readonly)
);

if mask == 0 {
dst.extend_from_slice(std::slice::from_raw_parts(ptr, CHUNK));
return CHUNK;
}

/* 3️⃣ Spill z3 and escape the bad bytes */
let mut m = [0u8; CHUNK];
core::arch::asm!("ptrue p0.b", "st1b z3.b, p0, [{buf}]",
buf = in(reg) m.as_mut_ptr(), options(nostack));
for (i, &bit) in m.iter().enumerate() {
let c = *ptr.add(i);
if bit == 0 {
dst.push(c);
} else if bit == 0xFF {
dst.extend_from_slice(crate::REVERSE_SOLIDUS);
} else {
crate::write_char_escape(dst, CharEscape::from_escape_table(bit, c));
}
}
CHUNK
}
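Both spill paths (`handle_block` for NEON, the `st1b` loop above for SVE) make the same per-byte decision on the combined mask: table hit, backslash compare, or nothing. A scalar mirror of that decision, purely illustrative and assuming the table lookup only covers `ESCAPE[0..64]` exactly as the `vqtbl4q_u8` / `ld1b z4-z7` loads do:

```rust
use crate::{write_char_escape, CharEscape, ESCAPE, REVERSE_SOLIDUS};

/// Scalar mirror of the SIMD mask logic (sketch, not part of the PR).
#[allow(dead_code)]
fn escape_bytes_scalar(src: &[u8], dst: &mut Vec<u8>) {
    for &c in src {
        // Table lookups return 0 for indices >= 64, so the backslash (0x5C)
        // is only caught by the explicit compare, mirroring `vceqq_u8`.
        let table_hit = if (c as usize) < 64 { ESCAPE[c as usize] } else { 0 };
        let slash_hit: u8 = if c == b'\\' { 0xFF } else { 0 };
        let m = table_hit | slash_hit;
        if m == 0 {
            dst.push(c);
        } else if m == 0xFF {
            dst.extend_from_slice(REVERSE_SOLIDUS);
        } else {
            write_char_escape(dst, CharEscape::from_escape_table(m, c));
        }
    }
}
```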