@@ -5236,50 +5236,64 @@ SZ_INTERNAL __mmask64 sz_utf8_case_insensitive_find_ice_western_europe_alarm_nai
52365236/**
52375237 * @brief Optimized alarm function for Western Europe danger zone detection.
52385238 *
5239- * Reduces port 5 pressure from 12 to 10 operations by:
5239+ * Reduces port 5 pressure from 12 to 8 operations by:
52405240 * - E1/E2 consecutive: 2 CMPEQ -> 1 CMPLT + 1 VPTESTNMB (p0)
52415241 * - AA/AB consecutive: 2 CMPEQ -> 1 CMPLT
5242+ * - C3/C5 range: 2 CMPEQ -> 1 CMPLT + 2 VPTESTNMB (p0)
5243+ * - 9F/BF bit masking: 2 CMPEQ -> 1 CMPEQ + 1 VPTESTMB (p0)
52425244 *
5243- * Port summary: 10 p5 ops + 1 p0 op (vs 12 p5 originally)
5245+ * Port summary: 8 p5 ops + 4 p0 ops (vs 12 p5 originally)
52445246 *
5245- * @param[in] h The haystack ZMM register.
5247+ * @param[in] text_zmm The haystack ZMM register.
52465248 * @return Bitmask of positions where danger characters are detected.
52475249 */
5248- SZ_INTERNAL __mmask64 sz_utf8_case_insensitive_find_ice_western_europe_alarm_efficiently_zmm_ (__m512i h_zmm ) {
5250+ SZ_INTERNAL __mmask64 sz_utf8_case_insensitive_find_ice_western_europe_alarm_efficiently_zmm_ (__m512i text_zmm ) {
52495251 // Range constants
52505252 __m512i const x_e1_zmm = _mm512_set1_epi8 ((char )0xE1 );
52515253 __m512i const x_ef_zmm = _mm512_set1_epi8 ((char )0xEF );
5252- __m512i const x_c5_zmm = _mm512_set1_epi8 ((char )0xC5 );
52535254 __m512i const x_c3_zmm = _mm512_set1_epi8 ((char )0xC3 );
52545255 __m512i const x_ba_zmm = _mm512_set1_epi8 ((char )0xBA );
52555256 __m512i const x_84_zmm = _mm512_set1_epi8 ((char )0x84 );
52565257 __m512i const x_ac_zmm = _mm512_set1_epi8 ((char )0xAC );
52575258 __m512i const x_bf_zmm = _mm512_set1_epi8 ((char )0xBF );
5258- __m512i const x_9f_zmm = _mm512_set1_epi8 ((char )0x9F );
52595259 __m512i const x_aa_zmm = _mm512_set1_epi8 ((char )0xAA );
5260+ __m512i const x_20_zmm = _mm512_set1_epi8 (0x20 );
52605261 __m512i const x_02_zmm = _mm512_set1_epi8 (0x02 );
5262+ __m512i const x_03_zmm = _mm512_set1_epi8 (0x03 );
52615263
52625264 // Check for E1/E2 range: (byte - 0xE1) < 2 [1 CMPLT on p5]
5263- __m512i off_e1_zmm = _mm512_sub_epi8 (h_zmm , x_e1_zmm );
5265+ __m512i off_e1_zmm = _mm512_sub_epi8 (text_zmm , x_e1_zmm );
52645266 __mmask64 is_e1_or_e2_mask = _mm512_cmplt_epu8_mask (off_e1_zmm , x_02_zmm );
52655267 __mmask64 is_e1_mask = is_e1_or_e2_mask & _mm512_testn_epi8_mask (off_e1_zmm , off_e1_zmm ); // offset==0 [p0]
52665268 __mmask64 is_e2_mask = is_e1_or_e2_mask & ~is_e1_mask ;
52675269
52685270 // Check for AA/AB range: (byte - 0xAA) < 2 [1 CMPLT on p5]
5269- __m512i off_aa_zmm = _mm512_sub_epi8 (h_zmm , x_aa_zmm );
5271+ __m512i off_aa_zmm = _mm512_sub_epi8 (text_zmm , x_aa_zmm );
52705272 __mmask64 is_aa_or_ab_mask = _mm512_cmplt_epu8_mask (off_aa_zmm , x_02_zmm );
52715273
5272- // Other lead bytes (3 CMPEQ on p5)
5273- __mmask64 is_ef_mask = _mm512_cmpeq_epi8_mask (h_zmm , x_ef_zmm );
5274- __mmask64 is_c5_mask = _mm512_cmpeq_epi8_mask (h_zmm , x_c5_zmm );
5275- __mmask64 is_c3_mask = _mm512_cmpeq_epi8_mask (h_zmm , x_c3_zmm );
5274+ // Check for C3/C4/C5 range: (byte - 0xC3) < 3 [1 CMPLT on p5]
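5275+ // We only need C3 and C5; C4 falls in the range but is dropped by the zero-tests below, which keep only offset 0 (C3) and offset 2 (C5). NOTE(review): those zero-tests already imply offset < 3, so the CMPLT range mask looks redundant - confirm before removing.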
5276+ __m512i off_c3_zmm = _mm512_sub_epi8 (text_zmm , x_c3_zmm );
5277+ __mmask64 is_c3_c4_c5_mask = _mm512_cmplt_epu8_mask (off_c3_zmm , x_03_zmm );
5278+ __mmask64 is_c3_mask = is_c3_c4_c5_mask & _mm512_testn_epi8_mask (off_c3_zmm , off_c3_zmm ); // offset==0 [p0]
5279+ __m512i off_xor_2_zmm = _mm512_xor_si512 (off_c3_zmm , x_02_zmm );
5280+ __mmask64 is_c5_mask = is_c3_c4_c5_mask & _mm512_testn_epi8_mask (off_xor_2_zmm , off_xor_2_zmm ); // offset==2 [p0]
5281+
5282+ // Other lead byte (1 CMPEQ on p5)
5283+ __mmask64 is_ef_mask = _mm512_cmpeq_epi8_mask (text_zmm , x_ef_zmm );
5284+
5285+ // Second bytes: BA, 84, AC (3 CMPEQ on p5)
5286+ __mmask64 is_ba_mask = _mm512_cmpeq_epi8_mask (text_zmm , x_ba_zmm );
5287+ __mmask64 is_84_mask = _mm512_cmpeq_epi8_mask (text_zmm , x_84_zmm );
5288+ __mmask64 is_ac_mask = _mm512_cmpeq_epi8_mask (text_zmm , x_ac_zmm );
52765289
5277- // Other second bytes (5 CMPEQ on p5)
5278- __mmask64 is_ba_mask = _mm512_cmpeq_epi8_mask (h_zmm , x_ba_zmm );
5279- __mmask64 is_84_mask = _mm512_cmpeq_epi8_mask (h_zmm , x_84_zmm );
5280- __mmask64 is_ac_mask = _mm512_cmpeq_epi8_mask (h_zmm , x_ac_zmm );
5281- __mmask64 is_bf_mask = _mm512_cmpeq_epi8_mask (h_zmm , x_bf_zmm );
5282- __mmask64 is_9f_mask = _mm512_cmpeq_epi8_mask (h_zmm , x_9f_zmm );
5290+ // 9F/BF bit masking: (byte | 0x20) == 0xBF catches exactly {0x9F, 0xBF} [1 CMPEQ on p5]
5291+ // 0x9F = 1001_1111, 0xBF = 1011_1111, differ only in bit 5
5292+ __m512i masked_zmm = _mm512_or_si512 (text_zmm , x_20_zmm );
5293+ __mmask64 is_9f_or_bf_mask = _mm512_cmpeq_epi8_mask (masked_zmm , x_bf_zmm );
5294+ __mmask64 has_bit5_mask = _mm512_test_epi8_mask (text_zmm , x_20_zmm ); // VPTESTMB [p0]
5295+ __mmask64 is_bf_mask = is_9f_or_bf_mask & has_bit5_mask ;
5296+ __mmask64 is_9f_mask = is_9f_or_bf_mask & ~has_bit5_mask ;
52835297
52845298 // Danger mask construction
52855299 __mmask64 danger_mask =
@@ -5289,7 +5303,7 @@ SZ_INTERNAL __mmask64 sz_utf8_case_insensitive_find_ice_western_europe_alarm_eff
52895303 ((is_c5_mask << 1 ) & is_bf_mask ) | // Long S (C5 BF)
52905304 ((is_c3_mask << 1 ) & is_9f_mask ); // Sharp S (C3 9F)
52915305
5292- sz_assert_ (danger_mask == sz_utf8_case_insensitive_find_ice_western_europe_alarm_naively_zmm_ (h_zmm ) &&
5306+ sz_assert_ (danger_mask == sz_utf8_case_insensitive_find_ice_western_europe_alarm_naively_zmm_ (text_zmm ) &&
52935307 "Efficient Western Europe alarm must match naive implementation" );
52945308 return danger_mask ;
52955309}
0 commit comments