Skip to content

Commit cdabb70

Browse files
authored
perf: SIMD neon support (#133)
First pass at neon support, building off #132
1 parent 58a6293 commit cdabb70

File tree

3 files changed

+307
-0
lines changed

3 files changed

+307
-0
lines changed

.github/workflows/ci.yml

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,3 +147,41 @@ jobs:
147147

148148
- name: Test
149149
run: MIRIFLAGS="-Zmiri-tag-raw-pointers -Zmiri-check-number-validity" cargo miri test
150+
151+
aarch64:
152+
name: Test aarch64 (neon)
153+
runs-on: ubuntu-latest
154+
env:
155+
CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_LINKER: aarch64-linux-gnu-gcc
156+
steps:
157+
- name: Checkout repository
158+
uses: actions/checkout@v2
159+
160+
- name: Setup Rust
161+
uses: actions-rs/toolchain@v1
162+
with:
163+
profile: minimal
164+
toolchain: stable
165+
override: true
166+
target: aarch64-unknown-linux-gnu
167+
168+
- name: Install QEMU and dependencies
169+
run: |
170+
sudo apt-get update
171+
sudo apt-get install -y qemu qemu-user gcc-aarch64-linux-gnu
172+
173+
- name: Build tests
174+
run: cargo build --tests --target aarch64-unknown-linux-gnu
175+
176+
- name: Run tests with QEMU
177+
run: |
178+
test_binaries=$(find target/aarch64-unknown-linux-gnu/debug/deps/ -type f -executable -name 'httparse-*')
179+
if [ -n "$test_binaries" ]; then
180+
for test_binary in $test_binaries
181+
do
182+
echo "Running tests in ${test_binary}"
183+
/usr/bin/qemu-aarch64 -L /usr/aarch64-linux-gnu/ "${test_binary}"
184+
done
185+
else
186+
echo "No test binaries found."
187+
fi

src/simd/mod.rs

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ mod swar;
55
any(
66
target_arch = "x86",
77
target_arch = "x86_64",
8+
target_arch = "aarch64",
89
),
910
)))]
1011
pub use self::swar::*;
@@ -132,3 +133,15 @@ mod avx2_compile_time {
132133
),
133134
))]
134135
pub use self::avx2_compile_time::*;
136+
137+
#[cfg(all(
138+
httparse_simd,
139+
target_arch = "aarch64",
140+
))]
141+
mod neon;
142+
143+
#[cfg(all(
144+
httparse_simd,
145+
target_arch = "aarch64",
146+
))]
147+
pub use self::neon::*;

src/simd/neon.rs

Lines changed: 256 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,256 @@
1+
use crate::iter::Bytes;
2+
use core::arch::aarch64::*;
3+
4+
#[inline]
5+
pub fn match_header_name_vectored(bytes: &mut Bytes) {
6+
while bytes.as_ref().len() >= 16 {
7+
unsafe {
8+
let advance = match_header_name_char_16_neon(bytes.as_ref().as_ptr());
9+
bytes.advance(advance);
10+
11+
if advance != 16 {
12+
return;
13+
}
14+
}
15+
}
16+
super::swar::match_header_name_vectored(bytes);
17+
}
18+
19+
#[inline]
20+
pub fn match_header_value_vectored(bytes: &mut Bytes) {
21+
while bytes.as_ref().len() >= 16 {
22+
unsafe {
23+
let advance = match_header_value_char_16_neon(bytes.as_ref().as_ptr());
24+
bytes.advance(advance);
25+
26+
if advance != 16 {
27+
return;
28+
}
29+
}
30+
}
31+
super::swar::match_header_value_vectored(bytes);
32+
}
33+
34+
#[inline]
35+
pub fn match_uri_vectored(bytes: &mut Bytes) {
36+
while bytes.as_ref().len() >= 16 {
37+
unsafe {
38+
let advance = match_url_char_16_neon(bytes.as_ref().as_ptr());
39+
bytes.advance(advance);
40+
41+
if advance != 16 {
42+
return;
43+
}
44+
}
45+
}
46+
super::swar::match_uri_vectored(bytes);
47+
}
48+
49+
const fn bit_set(x: u8) -> bool {
50+
// Validates if a byte is a valid header name character
51+
// https://tools.ietf.org/html/rfc7230#section-3.2.6
52+
matches!(x, b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' | b'!' | b'#' | b'$' | b'%' | b'&' | b'\'' | b'*' | b'+' | b'-' | b'.' | b'^' | b'_' | b'`' | b'|' | b'~')
53+
}
54+
55+
// A 256-bit bitmap, split into two halves
56+
// lower half contains bits whose higher nibble is <= 7
57+
// higher half contains bits whose higher nibble is >= 8
58+
const fn build_bitmap() -> ([u8; 16], [u8; 16]) {
59+
let mut bitmap_0_7 = [0u8; 16]; // 0x00..0x7F
60+
let mut bitmap_8_15 = [0u8; 16]; // 0x80..0xFF
61+
let mut i = 0;
62+
while i < 256 {
63+
if bit_set(i as u8) {
64+
// Nibbles
65+
let (lo, hi) = (i & 0x0F, i >> 4);
66+
if i < 128 {
67+
bitmap_0_7[lo] |= 1 << hi;
68+
} else {
69+
bitmap_8_15[lo] |= 1 << hi;
70+
}
71+
}
72+
i += 1;
73+
}
74+
(bitmap_0_7, bitmap_8_15)
75+
}
76+
77+
const BITMAPS: ([u8; 16], [u8; 16]) = build_bitmap();
78+
79+
// NOTE: adapted from 256-bit version, with upper 128-bit ops commented out
80+
#[inline]
81+
unsafe fn match_header_name_char_16_neon(ptr: *const u8) -> usize {
82+
let bitmaps = BITMAPS;
83+
// NOTE: ideally compile-time constants
84+
let (bitmap_0_7, _bitmap_8_15) = bitmaps;
85+
let bitmap_0_7 = vld1q_u8(bitmap_0_7.as_ptr());
86+
// let bitmap_8_15 = vld1q_u8(bitmap_8_15.as_ptr());
87+
88+
// Initialize the bitmask_lookup.
89+
const BITMASK_LOOKUP_DATA: [u8; 16] =
90+
[1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128];
91+
let bitmask_lookup = vld1q_u8(BITMASK_LOOKUP_DATA.as_ptr());
92+
93+
// Load 16 input bytes.
94+
let input = vld1q_u8(ptr);
95+
96+
// Extract indices for row_0_7.
97+
let indices_0_7 = vandq_u8(input, vdupq_n_u8(0x8F)); // 0b1000_1111;
98+
99+
// Extract indices for row_8_15.
100+
// let msb = vandq_u8(input, vdupq_n_u8(0x80));
101+
// let indices_8_15 = veorq_u8(indices_0_7, msb);
102+
103+
// Fetch row_0_7 and row_8_15.
104+
let row_0_7 = vqtbl1q_u8(bitmap_0_7, indices_0_7);
105+
// let row_8_15 = vqtbl1q_u8(bitmap_8_15, indices_8_15);
106+
107+
// Calculate a bitmask, i.e. (1 << hi_nibble % 8).
108+
let bitmask = vqtbl1q_u8(bitmask_lookup, vshrq_n_u8(input, 4));
109+
110+
// Choose rows halves depending on higher nibbles.
111+
// let bitsets = vorrq_u8(row_0_7, row_8_15);
112+
let bitsets = row_0_7;
113+
114+
// Finally check which bytes belong to the set.
115+
let tmp = vandq_u8(bitsets, bitmask);
116+
let result = vceqq_u8(tmp, bitmask);
117+
118+
offsetz(result) as usize
119+
}
120+
121+
#[inline]
122+
unsafe fn match_url_char_16_neon(ptr: *const u8) -> usize {
123+
let input = vld1q_u8(ptr);
124+
125+
// Check that b'!' <= input <= b'~'
126+
let result = vandq_u8(
127+
vcleq_u8(vdupq_n_u8(b'!'), input),
128+
vcleq_u8(input, vdupq_n_u8(b'~')),
129+
);
130+
// Check that input != b'<' and input != b'>'
131+
let lt = vceqq_u8(input, vdupq_n_u8(b'<'));
132+
let gt = vceqq_u8(input, vdupq_n_u8(b'>'));
133+
let ltgt = vorrq_u8(lt, gt);
134+
// Nand with result
135+
let result = vbicq_u8(result, ltgt);
136+
137+
offsetz(result) as usize
138+
}
139+
140+
#[inline]
141+
unsafe fn match_header_value_char_16_neon(ptr: *const u8) -> usize {
142+
let input = vld1q_u8(ptr);
143+
144+
// Check that b' ' <= and b != 127 or b == 9
145+
let result = vcleq_u8(vdupq_n_u8(b' '), input);
146+
147+
// Allow tab
148+
let tab = vceqq_u8(input, vdupq_n_u8(0x09));
149+
let result = vorrq_u8(result, tab);
150+
151+
// Disallow del
152+
let del = vceqq_u8(input, vdupq_n_u8(0x7F));
153+
let result = vbicq_u8(result, del);
154+
155+
offsetz(result) as usize
156+
}
157+
158+
#[inline]
159+
unsafe fn offsetz(x: uint8x16_t) -> u32 {
160+
// NOT the vector since it's faster to operate with zeros instead
161+
offsetnz(vmvnq_u8(x))
162+
}
163+
164+
#[inline]
165+
unsafe fn offsetnz(x: uint8x16_t) -> u32 {
166+
// Extract two u64
167+
let x = vreinterpretq_u64_u8(x);
168+
let low: u64 = std::mem::transmute(vget_low_u64(x));
169+
let high: u64 = std::mem::transmute(vget_high_u64(x));
170+
171+
#[inline]
172+
fn clz(x: u64) -> u32 {
173+
// perf: rust will unroll this loop
174+
// and it's much faster than rbit + clz so voila
175+
for (i, b) in x.to_ne_bytes().iter().copied().enumerate() {
176+
if b != 0 {
177+
return i as u32;
178+
}
179+
}
180+
8 // Technically not reachable since zero-guarded
181+
}
182+
183+
if low != 0 {
184+
return clz(low);
185+
} else if high != 0 {
186+
return 8 + clz(high);
187+
} else {
188+
return 16;
189+
}
190+
}
191+
192+
#[test]
193+
fn neon_code_matches_uri_chars_table() {
194+
unsafe {
195+
assert!(byte_is_allowed(b'_', match_uri_vectored));
196+
197+
for (b, allowed) in crate::URI_MAP.iter().cloned().enumerate() {
198+
assert_eq!(
199+
byte_is_allowed(b as u8, match_uri_vectored),
200+
allowed,
201+
"byte_is_allowed({:?}) should be {:?}",
202+
b,
203+
allowed,
204+
);
205+
}
206+
}
207+
}
208+
209+
#[test]
210+
fn neon_code_matches_header_value_chars_table() {
211+
unsafe {
212+
assert!(byte_is_allowed(b'_', match_header_value_vectored));
213+
214+
for (b, allowed) in crate::HEADER_VALUE_MAP.iter().cloned().enumerate() {
215+
assert_eq!(
216+
byte_is_allowed(b as u8, match_header_value_vectored),
217+
allowed,
218+
"byte_is_allowed({:?}) should be {:?}",
219+
b,
220+
allowed,
221+
);
222+
}
223+
}
224+
}
225+
226+
#[test]
227+
fn neon_code_matches_header_name_chars_table() {
228+
unsafe {
229+
assert!(byte_is_allowed(b'_', match_header_name_vectored));
230+
231+
for (b, allowed) in crate::HEADER_NAME_MAP.iter().cloned().enumerate() {
232+
assert_eq!(
233+
byte_is_allowed(b as u8, match_header_name_vectored),
234+
allowed,
235+
"byte_is_allowed({:?}) should be {:?}",
236+
b,
237+
allowed,
238+
);
239+
}
240+
}
241+
}
242+
243+
#[cfg(test)]
244+
unsafe fn byte_is_allowed(byte: u8, f: unsafe fn(bytes: &mut Bytes<'_>)) -> bool {
245+
let mut slice = [b'_'; 16];
246+
slice[10] = byte;
247+
let mut bytes = Bytes::new(&slice);
248+
249+
f(&mut bytes);
250+
251+
match bytes.pos() {
252+
16 => true,
253+
10 => false,
254+
x => panic!("unexpected pos: {}", x),
255+
}
256+
}

0 commit comments

Comments
 (0)