Skip to content

Commit 2aa5491

Browse files
committed
Add 4-lanes searcher
Benchmark results summary: * `short_haystack`: -10.7% instructions * `long_haystack`: +0% instructions * `random_haystack`: +0% instructions
1 parent 782fc0b commit 2aa5491

File tree

1 file changed

+61
-5
lines changed

1 file changed

+61
-5
lines changed

src/x86.rs

Lines changed: 61 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,45 @@ trait Vector: Copy {
6969
unsafe fn movemask_epi8(a: Self) -> i32;
7070
}
7171

72+
/// Newtype over `__m128i` that the `Vector` impl in this file treats as a
/// 4-lane (4-byte) vector. `repr(transparent)` keeps its layout identical to
/// the wrapped `__m128i`.
#[derive(Clone, Copy)]
73+
#[repr(transparent)]
74+
// Presumably named to mirror the `__m128i` intrinsic type naming, hence the
// lint exemption — confirm against the sibling vector types.
#[allow(non_camel_case_types)]
75+
struct __m32i(__m128i);
76+
77+
/// `Vector` implementation that processes 4 bytes at a time using a 128-bit
/// register as backing storage.
///
/// NOTE(review): every method is gated on `avx2` although the bodies only call
/// `_mm_*` 128-bit intrinsics; this presumably matches the other `Vector`
/// impls in the file — confirm before relaxing the feature requirement.
impl Vector for __m32i {
78+
// Number of haystack bytes covered by one vector operation.
const LANES: usize = 4;
79+
80+
/// Broadcasts the byte `a` to every 8-bit lane of the backing register.
#[inline]
81+
#[target_feature(enable = "avx2")]
82+
unsafe fn set1_epi8(a: i8) -> Self {
83+
__m32i(_mm_set1_epi8(a))
84+
}
85+
86+
/// Loads 4 bytes from `a` (no alignment required) and broadcasts them to
/// every 32-bit lane of the backing register.
///
/// # Safety
///
/// Only 4 bytes are read, via an unaligned `i32` read; `a` must be valid
/// for a read of that size.
#[inline]
87+
#[target_feature(enable = "avx2")]
88+
unsafe fn loadu_si(a: *const Self) -> Self {
89+
__m32i(_mm_set1_epi32(std::ptr::read_unaligned(a as *const i32)))
90+
}
91+
92+
/// Byte-wise equality comparison of the two backing registers.
#[inline]
93+
#[target_feature(enable = "avx2")]
94+
unsafe fn cmpeq_epi8(a: Self, b: Self) -> Self {
95+
__m32i(_mm_cmpeq_epi8(a.0, b.0))
96+
}
97+
98+
/// Bitwise AND of the two backing registers.
#[inline]
99+
#[target_feature(enable = "avx2")]
100+
unsafe fn and_si(a: Self, b: Self) -> Self {
101+
__m32i(_mm_and_si128(a.0, b.0))
102+
}
103+
104+
/// Returns the byte mask of `a`, restricted to its lowest 4 bits.
#[inline]
105+
#[target_feature(enable = "avx2")]
106+
unsafe fn movemask_epi8(a: Self) -> i32 {
107+
// Keep only the bits for the first four bytes; the remaining lanes of
// the 128-bit register are outside this type's 4 lanes.
_mm_movemask_epi8(a.0) & 0xF
108+
}
109+
}
110+
72111
#[derive(Clone, Copy)]
73112
#[repr(transparent)]
74113
#[allow(non_camel_case_types)]
@@ -205,6 +244,16 @@ impl From<&VectorHash<__m128i>> for VectorHash<__m64i> {
205244
}
206245
}
207246

247+
/// Builds the 4-lane hash from an existing 128-bit hash by wrapping the same
/// `first` and `last` registers in `__m32i`; the register contents are copied
/// as-is, nothing is recomputed.
impl From<&VectorHash<__m128i>> for VectorHash<__m32i> {
248+
#[inline]
249+
fn from(hash: &VectorHash<__m128i>) -> Self {
250+
Self {
251+
first: __m32i(hash.first),
252+
last: __m32i(hash.last),
253+
}
254+
}
255+
}
256+
208257
/// Single-substring searcher using an AVX2 algorithm based on the "Generic
209258
/// SIMD" algorithm [presented by Wojciech
210259
/// Muła](http://0x80.pl/articles/simd-strfind.html).
@@ -403,21 +452,28 @@ impl<N: Needle> Avx2Searcher<N> {
403452

404453
#[inline]
405454
#[target_feature(enable = "avx2")]
406-
unsafe fn u64_search_in(&self, haystack: &[u8]) -> bool {
407-
let hash = VectorHash::<__m64i>::from(&self.sse2_hash);
455+
unsafe fn sse2_4_search_in(&self, haystack: &[u8]) -> bool {
456+
let hash = VectorHash::<__m32i>::from(&self.sse2_hash);
408457
self.vector_search_in(haystack, &hash, Self::scalar_search_in)
409458
}
410459

411460
#[inline]
412461
#[target_feature(enable = "avx2")]
413-
unsafe fn sse2_search_in(&self, haystack: &[u8]) -> bool {
414-
self.vector_search_in(haystack, &self.sse2_hash, Self::u64_search_in)
462+
unsafe fn sse2_8_search_in(&self, haystack: &[u8]) -> bool {
463+
let hash = VectorHash::<__m64i>::from(&self.sse2_hash);
464+
self.vector_search_in(haystack, &hash, Self::sse2_4_search_in)
465+
}
466+
467+
#[inline]
468+
#[target_feature(enable = "avx2")]
469+
unsafe fn sse2_16_search_in(&self, haystack: &[u8]) -> bool {
470+
self.vector_search_in(haystack, &self.sse2_hash, Self::sse2_8_search_in)
415471
}
416472

417473
#[inline]
418474
#[target_feature(enable = "avx2")]
419475
unsafe fn avx2_search_in(&self, haystack: &[u8]) -> bool {
420-
self.vector_search_in(haystack, &self.avx2_hash, Self::sse2_search_in)
476+
self.vector_search_in(haystack, &self.avx2_hash, Self::sse2_16_search_in)
421477
}
422478

423479
/// Inlined version of `search_in` for hot call sites.

0 commit comments

Comments
 (0)