-/*!
- * High-performance JSON string escaping using V8-style SIMD optimizations for aarch64.
- *
- * Core V8 insight: Optimize for the common case where most data needs NO escaping.
- * Use SIMD for fast detection, bulk copy for clean chunks, scalar fallback for dirty chunks.
- */
-
 use std::arch::aarch64::{
-    vceqq_u8, vdupq_n_u8, vld1q_u8_x4, vmaxvq_u8, vorrq_u8, vcltq_u8,
+    vceqq_u8, vdupq_n_u8, vld1q_u8_x4, vmaxvq_u8, vorrq_u8, vqtbl4q_u8, vst1q_u8,
 };
 
-use crate::encode_str_inner;
+use crate::{encode_str_inner, write_char_escape, CharEscape, ESCAPE, REVERSE_SOLIDUS};
 
-/// Process 64 bytes per check - optimal for cache and SIMD
+/// Four contiguous 16-byte NEON registers (64 B) per loop.
 const CHUNK: usize = 64;
-
-/// Ultra-fast SIMD check: does this 64-byte chunk need ANY escaping?
-/// Returns true if completely clean (bulk copy safe)
-#[inline(always)]
-unsafe fn chunk_is_clean(ptr: *const u8) -> bool {
-    let quad = vld1q_u8_x4(ptr);
-
-    // Check for escape characters in all four 16-byte vectors
-    // Characters needing escape: < 0x20, == 0x22 ("), == 0x5C (\)
-    let needs_escape_0 = vorrq_u8(
-        vcltq_u8(quad.0, vdupq_n_u8(0x20)),
-        vorrq_u8(vceqq_u8(quad.0, vdupq_n_u8(0x22)), vceqq_u8(quad.0, vdupq_n_u8(0x5C))),
-    );
-    let needs_escape_1 = vorrq_u8(
-        vcltq_u8(quad.1, vdupq_n_u8(0x20)),
-        vorrq_u8(vceqq_u8(quad.1, vdupq_n_u8(0x22)), vceqq_u8(quad.1, vdupq_n_u8(0x5C))),
-    );
-    let needs_escape_2 = vorrq_u8(
-        vcltq_u8(quad.2, vdupq_n_u8(0x20)),
-        vorrq_u8(vceqq_u8(quad.2, vdupq_n_u8(0x22)), vceqq_u8(quad.2, vdupq_n_u8(0x5C))),
-    );
-    let needs_escape_3 = vorrq_u8(
-        vcltq_u8(quad.3, vdupq_n_u8(0x20)),
-        vorrq_u8(vceqq_u8(quad.3, vdupq_n_u8(0x22)), vceqq_u8(quad.3, vdupq_n_u8(0x5C))),
-    );
-
-    // Combine all masks and check if ANY byte needs escaping
-    let all_masks = vorrq_u8(
-        vorrq_u8(needs_escape_0, needs_escape_1),
-        vorrq_u8(needs_escape_2, needs_escape_3),
-    );
-
-    // Return true if NO bytes need escaping (chunk is clean)
-    vmaxvq_u8(all_masks) == 0
-}
+/// Distance (in bytes) to prefetch ahead. Must be a multiple of 8 for PRFM.
+/// Keeping ~4 iterations (4 × CHUNK = 256 B) ahead strikes a good balance
+/// between hiding memory latency and not evicting useful cache lines.
+const PREFETCH_DISTANCE: usize = CHUNK * 4;
 
 pub fn encode_str<S: AsRef<str>>(input: S) -> String {
     let s = input.as_ref();
+    let mut out = Vec::with_capacity(s.len() + 2);
     let bytes = s.as_bytes();
     let n = bytes.len();
-
-    // Simple capacity estimation
-    let mut out = Vec::with_capacity(n + n / 16 + 2);
     out.push(b'"');
 
-    // V8-style optimization: Focus on the fast path for clean data
     unsafe {
+        let tbl = vld1q_u8_x4(ESCAPE.as_ptr()); // first 64 B of the escape table
+        let slash = vdupq_n_u8(b'\\');
         let mut i = 0;
-        let mut clean_start = 0;
-
-        // Process in 64-byte chunks optimized for clean data
+
+        // Re-usable scratch – *uninitialised*, so no memset in the loop.
+        // Using MaybeUninit instead of mem::zeroed() prevents the compiler from inserting
+        // an implicit memset (observable with -Cllvm-args=-print-after=expand-memcmp).
+        // This is a proven micro-optimisation in Rust's standard library I/O stack.
+        #[allow(invalid_value)]
+        let mut placeholder: [u8; 16] = core::mem::MaybeUninit::uninit().assume_init();
+
         while i + CHUNK <= n {
             let ptr = bytes.as_ptr().add(i);
-
-            if chunk_is_clean(ptr) {
-                // Clean chunk - continue scanning
-                i += CHUNK;
-            } else {
-                // Found dirty chunk - flush any accumulated clean data first
-                if clean_start < i {
-                    out.extend_from_slice(&bytes[clean_start..i]);
-                }
-
-                // Process this single dirty chunk with proven scalar code
-                encode_str_inner(&bytes[i..i + CHUNK], &mut out);
+
+            /* ---- L1 prefetch: PREFETCH_DISTANCE bytes ahead ---- */
+            core::arch::asm!(
+                "prfm pldl1keep, [{0}, #{1}]",
+                "prfm pldl1keep, [{0}, #{2}]",
+                in(reg) ptr,
+                const PREFETCH_DISTANCE,
+                const PREFETCH_DISTANCE + 256,
+            );
+            /* ----------------------------------------------------- */
+
+            // load 64 B (four q-regs)
+            let quad = vld1q_u8_x4(ptr);
+            let a = quad.0;
+            let b = quad.1;
+            let c = quad.2;
+            let d = quad.3;
+
+            let mask_1 = vorrq_u8(vqtbl4q_u8(tbl, a), vceqq_u8(slash, a));
+            let mask_2 = vorrq_u8(vqtbl4q_u8(tbl, b), vceqq_u8(slash, b));
+            let mask_3 = vorrq_u8(vqtbl4q_u8(tbl, c), vceqq_u8(slash, c));
+            let mask_4 = vorrq_u8(vqtbl4q_u8(tbl, d), vceqq_u8(slash, d));
+
+            let mask_r_1 = vmaxvq_u8(mask_1);
+            let mask_r_2 = vmaxvq_u8(mask_2);
+            let mask_r_3 = vmaxvq_u8(mask_3);
+            let mask_r_4 = vmaxvq_u8(mask_4);
+
+            // fast path: nothing needs escaping
+            if mask_r_1 | mask_r_2 | mask_r_3 | mask_r_4 == 0 {
+                out.extend_from_slice(std::slice::from_raw_parts(ptr, CHUNK));
                 i += CHUNK;
-                clean_start = i;
+                continue;
             }
+
+            macro_rules! handle {
+                ($mask:expr, $mask_r:expr, $off:expr) => {
+                    if $mask_r == 0 {
+                        out.extend_from_slice(std::slice::from_raw_parts(ptr.add($off), 16));
+                    } else {
+                        vst1q_u8(placeholder.as_mut_ptr(), $mask);
+                        handle_block(&bytes[i + $off..i + $off + 16], &placeholder, &mut out);
+                    }
+                };
+            }
+
+            handle!(mask_1, mask_r_1, 0);
+            handle!(mask_2, mask_r_2, 16);
+            handle!(mask_3, mask_r_3, 32);
+            handle!(mask_4, mask_r_4, 48);
+
+            i += CHUNK;
         }
-
-        // Flush any remaining clean data
-        if clean_start < i {
-            out.extend_from_slice(&bytes[clean_start..i]);
-        }
-
-        // Handle remaining bytes (less than CHUNK)
         if i < n {
             encode_str_inner(&bytes[i..], &mut out);
         }
     }
-
     out.push(b'"');
     // SAFETY: we only emit valid UTF-8
     unsafe { String::from_utf8_unchecked(out) }
 }
+
+#[inline(always)]
+unsafe fn handle_block(src: &[u8], mask: &[u8; 16], dst: &mut Vec<u8>) {
+    for (j, &m) in mask.iter().enumerate() {
+        let c = src[j];
+        if m == 0 {
+            dst.push(c);
+        } else if m == 0xFF {
+            dst.extend_from_slice(REVERSE_SOLIDUS);
+        } else {
+            let e = CharEscape::from_escape_table(m, c);
+            write_char_escape(dst, e);
+        }
+    }
+}
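For readers unfamiliar with the TBL trick, here is a scalar model of the per-byte classification the loop performs with vqtbl4q_u8 and vceqq_u8. It is a sketch only: it assumes ESCAPE is the crate's 256-entry escape table whose first 64 entries are non-zero exactly for bytes that need escaping (control characters and the quote), and the helper name scalar_mask_byte is purely illustrative.

    // Scalar sketch of the SIMD byte classification above (illustrative only).
    // Assumption: ESCAPE is a 256-entry table, non-zero for bytes needing escape.
    fn scalar_mask_byte(escape: &[u8; 256], byte: u8) -> u8 {
        // vqtbl4q_u8 indexes only the 64 table bytes that were loaded and
        // yields 0 for any index >= 64, so bytes >= 0x40 never match here.
        let table_hit = if byte < 64 { escape[byte as usize] } else { 0 };
        // vceqq_u8 against '\\' yields 0xFF on a match; the backslash (0x5C)
        // lies above 0x40, which is why it needs this separate compare.
        let slash_hit = if byte == b'\\' { 0xFF } else { 0 };
        // A zero result means the byte can be copied through untouched;
        // 0xFF means backslash; any other value is the escape-table entry.
        table_hit | slash_hit
    }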
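A minimal usage sketch of encode_str, assuming the crate's scalar path emits the usual short escapes (\", \\ and \n rather than \u00XX forms); the expected strings are illustrative, not taken from the crate's test suite.

    // Illustrative smoke test for the fast and slow paths.
    fn smoke_test() {
        // Short input: handled entirely by encode_str_inner, wrapped in quotes.
        assert_eq!(encode_str("hello"), "\"hello\"");
        // Quote, backslash and newline all get escaped (assuming short escapes).
        assert_eq!(encode_str("a\"b\\c\n"), "\"a\\\"b\\\\c\\n\"");
        // 64+ clean bytes exercise the 64-byte bulk-copy fast path.
        let long = "x".repeat(128);
        assert_eq!(encode_str(&long), format!("\"{}\"", long));
    }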