Commit 7db66bd

revert impl
1 parent 33b7b76 commit 7db66bd

File tree: 1 file changed (+82 -73 lines)

src/aarch64.rs: 82 additions, 73 deletions
@@ -1,100 +1,109 @@
-/*!
- * High-performance JSON string escaping using V8-style SIMD optimizations for aarch64.
- *
- * Core V8 insight: Optimize for the common case where most data needs NO escaping.
- * Use SIMD for fast detection, bulk copy for clean chunks, scalar fallback for dirty chunks.
- */
-
 use std::arch::aarch64::{
-    vceqq_u8, vdupq_n_u8, vld1q_u8_x4, vmaxvq_u8, vorrq_u8, vcltq_u8,
+    vceqq_u8, vdupq_n_u8, vld1q_u8_x4, vmaxvq_u8, vorrq_u8, vqtbl4q_u8, vst1q_u8,
 };

-use crate::encode_str_inner;
+use crate::{encode_str_inner, write_char_escape, CharEscape, ESCAPE, REVERSE_SOLIDUS};

-/// Process 64 bytes per check - optimal for cache and SIMD
+/// Four contiguous 16-byte NEON registers (64 B) per loop.
 const CHUNK: usize = 64;
-
-/// Ultra-fast SIMD check: does this 64-byte chunk need ANY escaping?
-/// Returns true if completely clean (bulk copy safe)
-#[inline(always)]
-unsafe fn chunk_is_clean(ptr: *const u8) -> bool {
-    let quad = vld1q_u8_x4(ptr);
-
-    // Check for escape characters in all four 16-byte vectors
-    // Characters needing escape: < 0x20, == 0x22 ("), == 0x5C (\)
-    let needs_escape_0 = vorrq_u8(
-        vcltq_u8(quad.0, vdupq_n_u8(0x20)),
-        vorrq_u8(vceqq_u8(quad.0, vdupq_n_u8(0x22)), vceqq_u8(quad.0, vdupq_n_u8(0x5C)))
-    );
-    let needs_escape_1 = vorrq_u8(
-        vcltq_u8(quad.1, vdupq_n_u8(0x20)),
-        vorrq_u8(vceqq_u8(quad.1, vdupq_n_u8(0x22)), vceqq_u8(quad.1, vdupq_n_u8(0x5C)))
-    );
-    let needs_escape_2 = vorrq_u8(
-        vcltq_u8(quad.2, vdupq_n_u8(0x20)),
-        vorrq_u8(vceqq_u8(quad.2, vdupq_n_u8(0x22)), vceqq_u8(quad.2, vdupq_n_u8(0x5C)))
-    );
-    let needs_escape_3 = vorrq_u8(
-        vcltq_u8(quad.3, vdupq_n_u8(0x20)),
-        vorrq_u8(vceqq_u8(quad.3, vdupq_n_u8(0x22)), vceqq_u8(quad.3, vdupq_n_u8(0x5C)))
-    );
-
-    // Combine all masks and check if ANY byte needs escaping
-    let all_masks = vorrq_u8(
-        vorrq_u8(needs_escape_0, needs_escape_1),
-        vorrq_u8(needs_escape_2, needs_escape_3)
-    );
-
-    // Return true if NO bytes need escaping (chunk is clean)
-    vmaxvq_u8(all_masks) == 0
-}
+/// Distance (in bytes) to prefetch ahead. Must be a multiple of 8 for PRFM.
+/// Keeping ~4 iterations (4 × CHUNK = 256 B) ahead strikes a good balance
+/// between hiding memory latency and not evicting useful cache lines.
+const PREFETCH_DISTANCE: usize = CHUNK * 4;

 pub fn encode_str<S: AsRef<str>>(input: S) -> String {
     let s = input.as_ref();
+    let mut out = Vec::with_capacity(s.len() + 2);
     let bytes = s.as_bytes();
     let n = bytes.len();
-
-    // Simple capacity estimation
-    let mut out = Vec::with_capacity(n + n / 16 + 2);
     out.push(b'"');

-    // V8-style optimization: Focus on the fast path for clean data
     unsafe {
+        let tbl = vld1q_u8_x4(ESCAPE.as_ptr()); // first 64 B of the escape table
+        let slash = vdupq_n_u8(b'\\');
         let mut i = 0;
-        let mut clean_start = 0;
-
-        // Process in 64-byte chunks optimized for clean data
+        // Re-usable scratch buffer, deliberately *uninitialised* so there is no memset in the loop.
+        // Using MaybeUninit instead of mem::zeroed() prevents the compiler from inserting an
+        // implicit memset (observable with -Cllvm-args=-print-after=expand-memcmp).
+        // This is a proven micro-optimisation in Rust's standard library I/O stack.
+        #[allow(invalid_value)]
+        let mut placeholder: [u8; 16] = core::mem::MaybeUninit::uninit().assume_init();
+
         while i + CHUNK <= n {
             let ptr = bytes.as_ptr().add(i);
-
-            if chunk_is_clean(ptr) {
-                // Clean chunk - continue scanning
-                i += CHUNK;
-            } else {
-                // Found dirty chunk - flush any accumulated clean data first
-                if clean_start < i {
-                    out.extend_from_slice(&bytes[clean_start..i]);
-                }
-
-                // Process this single dirty chunk with proven scalar code
-                encode_str_inner(&bytes[i..i + CHUNK], &mut out);
+
+            /* ---- L1 prefetch: PREFETCH_DISTANCE bytes ahead ---- */
+            core::arch::asm!(
+                "prfm pldl1keep, [{0}, #{1}]",
+                "prfm pldl1keep, [{0}, #{2}]",
+                in(reg) ptr,
+                const PREFETCH_DISTANCE,
+                const PREFETCH_DISTANCE + 256,
+            );
+            /* ---------------------------------------------------- */
+
+            // load 64 B (four q-regs)
+            let quad = vld1q_u8_x4(ptr);
+            let a = quad.0;
+            let b = quad.1;
+            let c = quad.2;
+            let d = quad.3;
+
+            let mask_1 = vorrq_u8(vqtbl4q_u8(tbl, a), vceqq_u8(slash, a));
+            let mask_2 = vorrq_u8(vqtbl4q_u8(tbl, b), vceqq_u8(slash, b));
+            let mask_3 = vorrq_u8(vqtbl4q_u8(tbl, c), vceqq_u8(slash, c));
+            let mask_4 = vorrq_u8(vqtbl4q_u8(tbl, d), vceqq_u8(slash, d));
+
+            let mask_r_1 = vmaxvq_u8(mask_1);
+            let mask_r_2 = vmaxvq_u8(mask_2);
+            let mask_r_3 = vmaxvq_u8(mask_3);
+            let mask_r_4 = vmaxvq_u8(mask_4);
+
+            // fast path: nothing needs escaping
+            if mask_r_1 | mask_r_2 | mask_r_3 | mask_r_4 == 0 {
+                out.extend_from_slice(std::slice::from_raw_parts(ptr, CHUNK));
                 i += CHUNK;
-                clean_start = i;
+                continue;
             }
+
+            macro_rules! handle {
+                ($mask:expr, $mask_r:expr, $off:expr) => {
+                    if $mask_r == 0 {
+                        out.extend_from_slice(std::slice::from_raw_parts(ptr.add($off), 16));
+                    } else {
+                        vst1q_u8(placeholder.as_mut_ptr(), $mask);
+                        handle_block(&bytes[i + $off..i + $off + 16], &placeholder, &mut out);
+                    }
+                };
+            }
+
+            handle!(mask_1, mask_r_1, 0);
+            handle!(mask_2, mask_r_2, 16);
+            handle!(mask_3, mask_r_3, 32);
+            handle!(mask_4, mask_r_4, 48);
+
+            i += CHUNK;
         }
-
-        // Flush any remaining clean data
-        if clean_start < i {
-            out.extend_from_slice(&bytes[clean_start..i]);
-        }
-
-        // Handle remaining bytes (less than CHUNK)
         if i < n {
             encode_str_inner(&bytes[i..], &mut out);
         }
     }
-
     out.push(b'"');
     // SAFETY: we only emit valid UTF-8
     unsafe { String::from_utf8_unchecked(out) }
 }
+
+#[inline(always)]
+unsafe fn handle_block(src: &[u8], mask: &[u8; 16], dst: &mut Vec<u8>) {
+    for (j, &m) in mask.iter().enumerate() {
+        let c = src[j];
+        if m == 0 {
+            dst.push(c);
+        } else if m == 0xFF {
+            dst.extend_from_slice(REVERSE_SOLIDUS);
+        } else {
+            let e = CharEscape::from_escape_table(m, c);
+            write_char_escape(dst, e);
+        }
+    }
+}
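
Note on the vqtbl4q_u8 step above: each input byte is used as an index into the 64 escape-table bytes loaded into `tbl`, and the TBL instruction returns 0 for out-of-range indices (>= 64), so only control characters and '"' (0x22) can hit a table entry; the backslash (0x5C) is caught by the separate vceqq_u8 comparison. A minimal scalar sketch of the per-lane classification, assuming the serde_json-style convention implied by the imports (a non-zero ESCAPE entry marks a byte that needs escaping):

    // Scalar model of one NEON lane: vqtbl4q_u8 + vceqq_u8 + vorrq_u8.
    // `escape_table` stands for the first 64 bytes of ESCAPE (an assumption,
    // not something shown in this commit).
    fn classify(byte: u8, escape_table: &[u8; 64]) -> u8 {
        // vqtbl4q_u8: out-of-range indices (>= 64) yield 0.
        let table_hit = if byte < 64 { escape_table[byte as usize] } else { 0 };
        // vceqq_u8(slash, byte): 0xFF where the byte is a backslash, else 0.
        let is_backslash = if byte == b'\\' { 0xFF } else { 0 };
        // vorrq_u8: a zero result means the byte can be copied through unescaped.
        table_hit | is_backslash
    }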

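Hypothetical usage, assuming the scalar fallback (encode_str_inner / write_char_escape) performs standard JSON escaping as the imports suggest: the result is the input wrapped in double quotes, with '"', '\' and control characters escaped.

    let encoded = encode_str("say \"hi\"\n");
    assert_eq!(encoded, "\"say \\\"hi\\\"\\n\"");
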
0 commit comments