Skip to content

Commit 4a5eaab

Browse files
committed
Use vld1q_u8_x4 to load four 16-byte registers at the same time
1 parent e21ab98 commit 4a5eaab

File tree

1 file changed

+22
-11
lines changed

1 file changed

+22
-11
lines changed

src/aarch64.rs

Lines changed: 22 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,15 @@
11
use std::arch::aarch64::{
2-
vceqq_u8, vdupq_n_u8, vld1q_u8, vld1q_u8_x4, vmaxvq_u8, vorrq_u8, vqtbl4q_u8, vst1q_u8,
2+
vceqq_u8, vdupq_n_u8, vld1q_u8_x4, vmaxvq_u8, vorrq_u8, vqtbl4q_u8, vst1q_u8,
33
};
44

55
use crate::{encode_str_inner, write_char_escape, CharEscape, ESCAPE, REVERSE_SOLIDUS};
66

77
/// Four contiguous 16-byte NEON registers (64 B) per loop.
88
const CHUNK: usize = 64;
9+
/// Distance (in bytes) to prefetch ahead. Must be a multiple of 8 for PRFM.
10+
/// Keeping ~4 iterations (4 × CHUNK = 256 B) ahead strikes a good balance
11+
/// between hiding memory latency and not evicting useful cache lines.
12+
const PREFETCH_DISTANCE: usize = CHUNK * 4;
913

1014
pub fn encode_str<S: AsRef<str>>(input: S) -> String {
1115
let s = input.as_ref();
@@ -18,23 +22,30 @@ pub fn encode_str<S: AsRef<str>>(input: S) -> String {
1822
let tbl = vld1q_u8_x4(ESCAPE.as_ptr()); // first 64 B of the escape table
1923
let slash = vdupq_n_u8(b'\\');
2024
let mut i = 0;
21-
let mut placeholder: [u8; 16] = core::mem::zeroed();
25+
// Re-usable scratch – *uninitialised*, so no memset in the loop.
26+
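// Using MaybeUninit instead of mem::zeroed() prevents the compiler from inserting an implicit memset (observable with -Cllvm-args=-print-after=expand-memcmp). NOTE(review): calling assume_init() on an uninitialised [u8; 16] is undefined behaviour per the MaybeUninit documentation (hence the invalid_value lint); consider keeping the buffer as MaybeUninit<[u8; 16]> and writing through as_mut_ptr() instead.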
27+
// This is a proven micro-optimisation in Rust's standard library I/O stack.
28+
#[allow(invalid_value)]
29+
let mut placeholder: [u8; 16] = core::mem::MaybeUninit::uninit().assume_init();
2230

2331
while i + CHUNK <= n {
2432
let ptr = bytes.as_ptr().add(i);
2533

26-
/* ---- L1 prefetch: CHUNK size ahead ---- */
27-
core::arch::asm!("prfm pldl1keep, [{0}, #64]", in(reg) ptr);
34+
/* ---- L1 prefetch: PREFETCH_DISTANCE bytes ahead ---- */
35+
core::arch::asm!(
36+
"prfm pldl1keep, [{0}, #{1}]",
37+
in(reg) ptr,
38+
const PREFETCH_DISTANCE,
39+
);
2840
/* ------------------------------------------ */
2941

30-
// load 64 B (four q-regs)
31-
let a = vld1q_u8(ptr);
32-
33-
let b = vld1q_u8(ptr.add(16));
42+
let quad = vld1q_u8_x4(ptr);
3443

35-
let c = vld1q_u8(ptr.add(32));
36-
37-
let d = vld1q_u8(ptr.add(48));
44+
// load 64 B (four q-regs)
45+
let a = quad.0;
46+
let b = quad.1;
47+
let c = quad.2;
48+
let d = quad.3;
3849

3950
let mask_1 = vorrq_u8(vqtbl4q_u8(tbl, a), vceqq_u8(slash, a));
4051
let mask_2 = vorrq_u8(vqtbl4q_u8(tbl, b), vceqq_u8(slash, b));

0 commit comments

Comments
 (0)