@@ -1115,7 +1115,12 @@ fn slices_equal_avx2(a: &[u8], b: &[u8]) -> bool {
11151115 }
11161116
11171117 unsafe {
1118- let mut aggregate_mask = -1i32 ;
1118+ let mut aggregate_mask_a = -1i32 ;
1119+ let mut aggregate_mask_b = -1i32 ;
1120+ let mut aggregate_mask_c = -1i32 ;
1121+ let mut aggregate_mask_d = -1i32 ;
1122+ let mut aggregate_mask_a_b = -1i32 ;
1123+ let mut aggregate_mask_c_d = -1i32 ;
11191124
11201125 while i + CHUNK <= len {
11211126 let simd_a1 = _mm256_loadu_si256 ( a. as_ptr ( ) . add ( i) as * const _ ) ;
@@ -1151,9 +1156,15 @@ fn slices_equal_avx2(a: &[u8], b: &[u8]) -> bool {
11511156 let cmp7 = _mm256_movemask_epi8 ( _mm256_cmpeq_epi8 ( simd_a7, simd_b7) ) ;
11521157 let cmp8 = _mm256_movemask_epi8 ( _mm256_cmpeq_epi8 ( simd_a8, simd_b8) ) ;
11531158
1154- aggregate_mask &= cmp1 & cmp2 & cmp3 & cmp4 & cmp5 & cmp6 & cmp7 & cmp8;
1159+ aggregate_mask_a &= cmp1 & cmp2;
1160+ aggregate_mask_b &= cmp3 & cmp4;
1161+ aggregate_mask_c &= cmp5 & cmp6;
1162+ aggregate_mask_d &= cmp7 & cmp8;
11551163
1156- if aggregate_mask != -1i32 {
1164+ aggregate_mask_a_b &= aggregate_mask_a & aggregate_mask_b;
1165+ aggregate_mask_c_d &= aggregate_mask_c & aggregate_mask_d;
1166+
1167+ if aggregate_mask_a_b & aggregate_mask_c_d != -1i32 {
11571168 return false ;
11581169 }
11591170
@@ -1209,7 +1220,6 @@ fn slices_equal_avx512(a: &[u8], b: &[u8]) -> bool {
12091220 let simd_a8 = _mm512_loadu_si512 ( a. as_ptr ( ) . add ( i + 448 ) as * const _ ) ;
12101221 let simd_b8 = _mm512_loadu_si512 ( b. as_ptr ( ) . add ( i + 448 ) as * const _ ) ;
12111222
1212- // Compare each pair of registers
12131223 let cmp1 = _mm512_cmpeq_epi8_mask ( simd_a1, simd_b1) ;
12141224 let cmp2 = _mm512_cmpeq_epi8_mask ( simd_a2, simd_b2) ;
12151225 let cmp3 = _mm512_cmpeq_epi8_mask ( simd_a3, simd_b3) ;
@@ -1219,11 +1229,17 @@ fn slices_equal_avx512(a: &[u8], b: &[u8]) -> bool {
12191229 let cmp7 = _mm512_cmpeq_epi8_mask ( simd_a7, simd_b7) ;
12201230 let cmp8 = _mm512_cmpeq_epi8_mask ( simd_a8, simd_b8) ;
12211231
1222- // Combine all comparison masks
1223- let combined_cmp = cmp1 & cmp2 & cmp3 & cmp4 & cmp5 & cmp6 & cmp7 & cmp8;
1232+ let cmp1_2 = cmp1 & cmp2;
1233+ let cmp3_4 = cmp3 & cmp4;
1234+ let cmp5_6 = cmp5 & cmp6;
1235+ let cmp7_8 = cmp7 & cmp8;
1236+
1237+ let cmp1_4 = cmp1_2 & cmp3_4;
1238+ let cmp5_8 = cmp5_6 & cmp7_8;
1239+
1240+ let full_cmp = cmp1_4 & cmp5_8;
12241241
1225- // Check if all bytes are equal (mask should have all bits set)
1226- if combined_cmp != u64:: MAX {
1242+ if full_cmp != u64:: MAX {
12271243 return false ;
12281244 }
12291245
0 commit comments