@@ -1132,17 +1132,34 @@ SZ_PUBLIC sz_cptr_t sz_find_byteset_haswell(sz_cptr_t text, sz_size_t length, sz
11321132
11331133 // Let's unzip even and odd elements and replicate them into both lanes of the YMM register.
11341134 // That way when we invoke `_mm256_shuffle_epi8` we can use the same mask for both lanes.
1135+ // Load the 32-byte filter as two 16-byte halves, separate even/odd bytes, pack, and broadcast to YMM.
1136+ sz_u128_vec_t byte_mask_vec ;
1137+ sz_u128_vec_t filter_lo_vec , filter_hi_vec ;
1138+ sz_u128_vec_t lo_evens_vec , hi_evens_vec ;
1139+ sz_u128_vec_t lo_odds_vec , hi_odds_vec ;
1140+ sz_u128_vec_t evens_xmm_vec , odds_xmm_vec ;
11351141 sz_u256_vec_t filter_even_vec , filter_odd_vec ;
1136- for (sz_size_t i = 0 ; i != 16 ; ++ i )
1137- filter_even_vec .u8s [i ] = filter -> _u8s [i * 2 ], filter_odd_vec .u8s [i ] = filter -> _u8s [i * 2 + 1 ];
1138- filter_even_vec .xmms [1 ] = filter_even_vec .xmms [0 ];
1139- filter_odd_vec .xmms [1 ] = filter_odd_vec .xmms [0 ];
11401142
11411143 sz_u256_vec_t text_vec ;
11421144 sz_u256_vec_t matches_vec ;
11431145 sz_u256_vec_t lower_nibbles_vec , higher_nibbles_vec ;
11441146 sz_u256_vec_t bitset_even_vec , bitset_odd_vec ;
11451147 sz_u256_vec_t bitmask_vec , bitmask_lookup_vec ;
1148+
1149+ byte_mask_vec .xmm = _mm_set1_epi16 (0x00ff );
1150+
1151+ filter_lo_vec .xmm = _mm_lddqu_si128 ((__m128i const * )(filter ));
1152+ filter_hi_vec .xmm = _mm_lddqu_si128 ((__m128i const * )(filter ) + 1 );
1153+ lo_evens_vec .xmm = _mm_and_si128 (filter_lo_vec .xmm , byte_mask_vec .xmm );
1154+ hi_evens_vec .xmm = _mm_and_si128 (filter_hi_vec .xmm , byte_mask_vec .xmm );
1155+ lo_odds_vec .xmm = _mm_srli_epi16 (filter_lo_vec .xmm , 8 );
1156+ hi_odds_vec .xmm = _mm_srli_epi16 (filter_hi_vec .xmm , 8 );
1157+
1158+ evens_xmm_vec .xmm = _mm_packus_epi16 (lo_evens_vec .xmm , hi_evens_vec .xmm );
1159+ odds_xmm_vec .xmm = _mm_packus_epi16 (lo_odds_vec .xmm , hi_odds_vec .xmm );
1160+ filter_even_vec .ymm = _mm256_set_m128i (evens_xmm_vec .xmm , evens_xmm_vec .xmm );
1161+ filter_odd_vec .ymm = _mm256_set_m128i (odds_xmm_vec .xmm , odds_xmm_vec .xmm );
1162+
11461163 bitmask_lookup_vec .ymm = _mm256_set_epi8 ( //
11471164 -128 , 64 , 32 , 16 , 8 , 4 , 2 , 1 , -128 , 64 , 32 , 16 , 8 , 4 , 2 , 1 , //
11481165 -128 , 64 , 32 , 16 , 8 , 4 , 2 , 1 , -128 , 64 , 32 , 16 , 8 , 4 , 2 , 1 );
0 commit comments