Skip to content

Commit 331c070

Browse files
Complete impl for avx512 and neon arm
1 parent f0256bb commit 331c070

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

43 files changed

+2442
-33
lines changed

Cargo.lock

Lines changed: 12 additions & 12 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Justfile

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,9 @@ build-bin profile="dev": (build-lib profile)
4747
build-lib profile="dev":
4848
cargo build --package rsonpath-lib --profile {{profile}}
4949

50+
build-avx512 profile="dev":
51+
rustup run nightly cargo build --package rsonpath-lib --profile dev
52+
5053
# Build all rsonpath parts, the binary and library.
5154
build-all profile="dev": (build-lib profile) (build-bin profile) (gen-tests)
5255

crates/rsonpath-lib/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ default = ["simd"]
4949
arbitrary = ["dep:arbitrary"]
5050
simd = []
5151

52+
5253
[[example]]
5354
name = "approx_spans_usage"
5455
path = "examples/approx_spans_usage.rs"

crates/rsonpath-lib/src/classification/depth.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,10 +75,15 @@ pub(crate) mod shared;
7575
pub(crate) mod avx2_32;
7676
#[cfg(target_arch = "x86_64")]
7777
pub(crate) mod avx2_64;
78+
#[cfg(target_arch = "x86_64")]
79+
pub(crate) mod avx512_64;
7880
#[cfg(target_arch = "x86")]
7981
pub(crate) mod sse2_32;
8082
#[cfg(target_arch = "x86_64")]
8183
pub(crate) mod sse2_64;
84+
#[cfg(target_arch = "aarch64")]
85+
pub(crate) mod neon_64;
86+
8287

8388
pub(crate) trait DepthImpl {
8489
type Classifier<'i, I, Q>: DepthIterator<'i, I, Q, MaskType, BLOCK_SIZE>
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
use super::{
2+
shared::{mask_64::DepthVector64, vector_512::DelimiterClassifierImpl512},
3+
*,
4+
};
5+
use crate::{
6+
classification::{QuoteClassifiedBlock, ResumeClassifierBlockState},
7+
debug,
8+
input::InputBlock,
9+
};
10+
use std::marker::PhantomData;
11+
12+
const SIZE: usize = 64;
13+
14+
shared::depth_classifier!(Avx512VectorIterator64, DelimiterClassifierImpl512, DepthVector64, 64, u64);
15+
16+
#[inline(always)]
17+
fn new_vector<'a, B: InputBlock<'a, SIZE>>(
18+
bytes: QuoteClassifiedBlock<B, u64, SIZE>,
19+
classifier: &DelimiterClassifierImpl512,
20+
) -> DepthVector64<'a, B> {
21+
new_vector_from(bytes, classifier, 0)
22+
}
23+
24+
#[inline(always)]
25+
fn new_vector_from<'a, B: InputBlock<'a, SIZE>>(
26+
bytes: QuoteClassifiedBlock<B, u64, SIZE>,
27+
classifier: &DelimiterClassifierImpl512,
28+
idx: usize,
29+
) -> DepthVector64<'a, B> {
30+
// SAFETY: target_feature invariant
31+
unsafe { new_avx512(bytes, classifier, idx) }
32+
}
33+
34+
#[inline(always)]
35+
unsafe fn new_avx512<'a, B: InputBlock<'a, SIZE>>(
36+
bytes: QuoteClassifiedBlock<B, u64, SIZE>,
37+
classifier: &DelimiterClassifierImpl512,
38+
start_idx: usize,
39+
) -> DepthVector64<'a, B> {
40+
let idx_mask = 0xFFFF_FFFF_FFFF_FFFF_u64 << start_idx;
41+
let block = &bytes.block;
42+
let (opening_mask, closing_mask) = classifier.get_opening_and_closing_masks(block);
43+
44+
let opening_mask = opening_mask & (!bytes.within_quotes_mask) & idx_mask;
45+
let closing_mask = closing_mask & (!bytes.within_quotes_mask) & idx_mask;
46+
47+
DepthVector64 {
48+
quote_classified: bytes,
49+
opening_mask,
50+
closing_mask,
51+
opening_count: opening_mask.count_ones(),
52+
depth: 0,
53+
idx: 0,
54+
phantom: PhantomData,
55+
}
56+
}
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
use super::{
2+
shared::{mask_neon::DepthVectorNeon, vector_neon::DelimiterClassifierImplNeon},
3+
*,
4+
};
5+
use crate::{
6+
classification::{mask::m64, QuoteClassifiedBlock, ResumeClassifierBlockState},
7+
debug,
8+
input::InputBlock,
9+
};
10+
use std::marker::PhantomData;
11+
12+
const SIZE: usize = 64;
13+
14+
shared::depth_classifier!(NeonVectorIterator, DelimiterClassifierImplNeon, DepthVectorNeon, 64, u64);
15+
16+
#[inline(always)]
17+
fn new_vector<'a, B: InputBlock<'a, SIZE>>(
18+
bytes: QuoteClassifiedBlock<B, u64, SIZE>,
19+
classifier: &DelimiterClassifierImplNeon,
20+
) -> DepthVectorNeon<'a, B> {
21+
new_vector_from(bytes, classifier, 0)
22+
}
23+
24+
#[inline(always)]
25+
fn new_vector_from<'a, B: InputBlock<'a, SIZE>>(
26+
bytes: QuoteClassifiedBlock<B, u64, SIZE>,
27+
classifier: &DelimiterClassifierImplNeon,
28+
idx: usize,
29+
) -> DepthVectorNeon<'a, B> {
30+
// SAFETY: target_feature invariant
31+
unsafe { new_neon(bytes, classifier, idx) }
32+
}
33+
34+
#[inline(always)]
35+
unsafe fn new_neon<'a, B: InputBlock<'a, SIZE>>(
36+
bytes: QuoteClassifiedBlock<B, u64, SIZE>,
37+
classifier: &DelimiterClassifierImplNeon,
38+
start_idx: usize,
39+
) -> DepthVectorNeon<'a, B> {
40+
let idx_mask = 0xFFFF_FFFF_FFFF_FFFF_u64 << start_idx;
41+
let (block1, block2, block3, block4) = bytes.block.quarters();
42+
let (opening_mask1, closing_mask1) = classifier.get_opening_and_closing_masks(block1);
43+
let (opening_mask2, closing_mask2) = classifier.get_opening_and_closing_masks(block2);
44+
let (opening_mask3, closing_mask3) = classifier.get_opening_and_closing_masks(block3);
45+
let (opening_mask4, closing_mask4) = classifier.get_opening_and_closing_masks(block4);
46+
47+
let combined_opening_mask = m64::combine_16(opening_mask1, opening_mask2, opening_mask3, opening_mask4);
48+
let combined_closing_mask = m64::combine_16(closing_mask1, closing_mask2, closing_mask3, closing_mask4);
49+
50+
let opening_mask = combined_opening_mask & (!bytes.within_quotes_mask) & idx_mask;
51+
let closing_mask = combined_closing_mask & (!bytes.within_quotes_mask) & idx_mask;
52+
53+
DepthVectorNeon {
54+
quote_classified: bytes,
55+
opening_mask,
56+
closing_mask,
57+
opening_count: opening_mask.count_ones(),
58+
depth: 0,
59+
idx: 0,
60+
phantom: PhantomData,
61+
}
62+
}

crates/rsonpath-lib/src/classification/depth/shared.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,16 @@
22
pub(super) mod mask_32;
33
#[cfg(target_arch = "x86_64")]
44
pub(super) mod mask_64;
5+
#[cfg(target_arch = "aarch64")]
6+
pub(super) mod mask_neon;
57
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
68
pub(super) mod vector_128;
79
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
810
pub(super) mod vector_256;
11+
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
12+
pub(super) mod vector_512;
13+
#[cfg(target_arch = "aarch64")]
14+
pub(super) mod vector_neon;
915

1016
#[allow(unused_macros)]
1117
macro_rules! depth_classifier {
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
use crate::{
2+
bin_u64,
3+
classification::{depth::DepthBlock, quotes::QuoteClassifiedBlock},
4+
debug,
5+
input::InputBlock,
6+
};
7+
use std::marker::PhantomData;
8+
9+
const SIZE: usize = 64;
10+
11+
/// Works on a 64-byte slice, but uses a heuristic to quickly
12+
/// respond to queries and not count the depth exactly unless
13+
/// needed.
14+
///
15+
/// The heuristic checks if it is possible to achieve the queried
16+
/// depth within the block by counting the number of opening
17+
/// and closing structural characters. This can be done much
18+
/// more quickly than precise depth calculation.
19+
pub(crate) struct DepthVectorNeon<'a, B: InputBlock<'a, SIZE>> {
20+
pub(crate) quote_classified: QuoteClassifiedBlock<B, u64, SIZE>,
21+
pub(crate) opening_mask: u64,
22+
pub(crate) opening_count: u32,
23+
pub(crate) closing_mask: u64,
24+
pub(crate) idx: usize,
25+
pub(crate) depth: i32,
26+
pub(crate) phantom: PhantomData<&'a ()>,
27+
}
28+
29+
// TODO FIXME: consider rewriting training and count_zeros etc. functions.
30+
31+
impl<'a, B: InputBlock<'a, SIZE>> DepthBlock<'a> for DepthVectorNeon<'a, B> {
32+
#[inline(always)]
33+
fn advance_to_next_depth_decrease(&mut self) -> bool {
34+
let next_closing = self.closing_mask.trailing_zeros() as usize;
35+
36+
if next_closing == SIZE {
37+
return false;
38+
}
39+
40+
bin_u64!("opening_mask", self.opening_mask);
41+
bin_u64!("closing_mask", self.closing_mask);
42+
43+
self.opening_mask >>= next_closing;
44+
self.closing_mask >>= next_closing;
45+
self.opening_mask >>= 1;
46+
self.closing_mask >>= 1;
47+
48+
bin_u64!("new opening_mask", self.opening_mask);
49+
bin_u64!("new closing_mask", self.closing_mask);
50+
51+
let new_opening_count = self.opening_mask.count_ones() as i32;
52+
let delta = (self.opening_count as i32) - new_opening_count - 1;
53+
self.opening_count = new_opening_count as u32;
54+
55+
debug!("next_closing: {next_closing}");
56+
debug!("new_opening_count: {new_opening_count}");
57+
debug!("delta: {delta}");
58+
59+
self.depth += delta;
60+
self.idx += next_closing + 1;
61+
62+
true
63+
}
64+
65+
#[inline(always)]
66+
fn get_depth(&self) -> isize {
67+
self.depth as isize
68+
}
69+
70+
#[inline(always)]
71+
fn depth_at_end(&self) -> isize {
72+
(((self.opening_count as i32) - self.closing_mask.count_ones() as i32) + self.depth) as isize
73+
}
74+
75+
#[inline(always)]
76+
fn add_depth(&mut self, depth: isize) {
77+
self.depth += depth as i32;
78+
}
79+
80+
#[inline(always)]
81+
fn estimate_lowest_possible_depth(&self) -> isize {
82+
(self.depth - self.closing_mask.count_ones() as i32) as isize
83+
}
84+
}
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
use crate::classification::structural::BracketType;
2+
3+
#[cfg(target_arch = "x86")]
4+
use core::arch::x86::*;
5+
#[cfg(target_arch = "x86_64")]
6+
use core::arch::x86_64::*;
7+
8+
pub(crate) struct DelimiterClassifierImpl512 {
9+
opening: i8,
10+
}
11+
12+
impl DelimiterClassifierImpl512 {
13+
pub(crate) fn new(opening: BracketType) -> Self {
14+
let opening = match opening {
15+
BracketType::Square => b'[',
16+
BracketType::Curly => b'{',
17+
};
18+
19+
Self { opening: opening as i8 }
20+
}
21+
22+
#[inline(always)]
23+
unsafe fn opening_mask(&self) -> __m512i {
24+
_mm512_set1_epi8(self.opening)
25+
}
26+
27+
#[inline(always)]
28+
unsafe fn closing_mask(&self) -> __m512i {
29+
_mm512_set1_epi8(self.opening + 2)
30+
}
31+
32+
#[target_feature(enable = "avx512f")]
33+
#[target_feature(enable = "avx512bw")]
34+
#[inline]
35+
pub(crate) unsafe fn get_opening_and_closing_masks(&self, bytes: &[u8]) -> (u64, u64) {
36+
assert_eq!(64, bytes.len());
37+
// SAFETY: target_feature invariant
38+
unsafe {
39+
let byte_vector = _mm512_loadu_si512(bytes.as_ptr().cast::<i32>());
40+
let opening_mask = _mm512_cmpeq_epi8_mask(byte_vector, self.opening_mask());
41+
let closing_mask = _mm512_cmpeq_epi8_mask(byte_vector, self.closing_mask());
42+
43+
(opening_mask, closing_mask)
44+
}
45+
}
46+
}

0 commit comments

Comments
 (0)