Skip to content

Commit 7f2899a

Browse files
Improve: Reduce startup overhead for sz_find_byteset_haswell (#293)
Co-authored-by: Ash Vardanian <[email protected]>
Co-authored-by: Liang Pengyu <[email protected]>
Co-authored-by: Liang Pengyu <[email protected]>
1 parent 5f91e7e commit 7f2899a

File tree

1 file changed

+21
-4
lines changed

1 file changed

+21
-4
lines changed

include/stringzilla/find.h

Lines changed: 21 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1132,17 +1132,34 @@ SZ_PUBLIC sz_cptr_t sz_find_byteset_haswell(sz_cptr_t text, sz_size_t length, sz
11321132

11331133
// Let's unzip even and odd elements and replicate them into both lanes of the YMM register.
11341134
// That way when we invoke `_mm256_shuffle_epi8` we can use the same mask for both lanes.
1135+
// Load the 32-byte filter as two 16-byte halves, separate even/odd bytes, pack, and broadcast to YMM.
1136+
sz_u128_vec_t byte_mask_vec;
1137+
sz_u128_vec_t filter_lo_vec, filter_hi_vec;
1138+
sz_u128_vec_t lo_evens_vec, hi_evens_vec;
1139+
sz_u128_vec_t lo_odds_vec, hi_odds_vec;
1140+
sz_u128_vec_t evens_xmm_vec, odds_xmm_vec;
11351141
sz_u256_vec_t filter_even_vec, filter_odd_vec;
1136-
for (sz_size_t i = 0; i != 16; ++i)
1137-
filter_even_vec.u8s[i] = filter->_u8s[i * 2], filter_odd_vec.u8s[i] = filter->_u8s[i * 2 + 1];
1138-
filter_even_vec.xmms[1] = filter_even_vec.xmms[0];
1139-
filter_odd_vec.xmms[1] = filter_odd_vec.xmms[0];
11401142

11411143
sz_u256_vec_t text_vec;
11421144
sz_u256_vec_t matches_vec;
11431145
sz_u256_vec_t lower_nibbles_vec, higher_nibbles_vec;
11441146
sz_u256_vec_t bitset_even_vec, bitset_odd_vec;
11451147
sz_u256_vec_t bitmask_vec, bitmask_lookup_vec;
1148+
1149+
byte_mask_vec.xmm = _mm_set1_epi16(0x00ff);
1150+
1151+
filter_lo_vec.xmm = _mm_lddqu_si128((__m128i const *)(filter));
1152+
filter_hi_vec.xmm = _mm_lddqu_si128((__m128i const *)(filter) + 1);
1153+
lo_evens_vec.xmm = _mm_and_si128(filter_lo_vec.xmm, byte_mask_vec.xmm);
1154+
hi_evens_vec.xmm = _mm_and_si128(filter_hi_vec.xmm, byte_mask_vec.xmm);
1155+
lo_odds_vec.xmm = _mm_srli_epi16(filter_lo_vec.xmm, 8);
1156+
hi_odds_vec.xmm = _mm_srli_epi16(filter_hi_vec.xmm, 8);
1157+
1158+
evens_xmm_vec.xmm = _mm_packus_epi16(lo_evens_vec.xmm, hi_evens_vec.xmm);
1159+
odds_xmm_vec.xmm = _mm_packus_epi16(lo_odds_vec.xmm, hi_odds_vec.xmm);
1160+
filter_even_vec.ymm = _mm256_set_m128i(evens_xmm_vec.xmm, evens_xmm_vec.xmm);
1161+
filter_odd_vec.ymm = _mm256_set_m128i(odds_xmm_vec.xmm, odds_xmm_vec.xmm);
1162+
11461163
bitmask_lookup_vec.ymm = _mm256_set_epi8( //
11471164
-128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1, //
11481165
-128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1);

0 commit comments

Comments
 (0)