Skip to content

Commit fe94e1c

Browse files
committed
Improve: Western European register pressure
Naive: 12 p5 ops. Before: 10 p5 ops + 1 p0 op. After: 8 p5 ops + 4 p0 ops.
1 parent cc2b5a6 commit fe94e1c

File tree

1 file changed

+33
-19
lines changed

1 file changed

+33
-19
lines changed

include/stringzilla/utf8_case.h

Lines changed: 33 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -5236,50 +5236,64 @@ SZ_INTERNAL __mmask64 sz_utf8_case_insensitive_find_ice_western_europe_alarm_nai
52365236
/**
52375237
* @brief Optimized alarm function for Western Europe danger zone detection.
52385238
*
5239-
* Reduces port 5 pressure from 12 to 10 operations by:
5239+
* Reduces port 5 pressure from 12 to 8 operations by:
52405240
* - E1/E2 consecutive: 2 CMPEQ -> 1 CMPLT + 1 VPTESTNMB (p0)
52415241
* - AA/AB consecutive: 2 CMPEQ -> 1 CMPLT
5242+
* - C3/C5 range: 2 CMPEQ -> 1 CMPLT + 2 VPTESTNMB (p0)
5243+
* - 9F/BF bit masking: 2 CMPEQ -> 1 CMPEQ + 1 VPTESTMB (p0)
52425244
*
5243-
* Port summary: 10 p5 ops + 1 p0 op (vs 12 p5 originally)
5245+
* Port summary: 8 p5 ops + 4 p0 ops (vs 12 p5 originally)
52445246
*
5245-
* @param[in] h The haystack ZMM register.
5247+
* @param[in] text_zmm The haystack ZMM register.
52465248
* @return Bitmask of positions where danger characters are detected.
52475249
*/
5248-
SZ_INTERNAL __mmask64 sz_utf8_case_insensitive_find_ice_western_europe_alarm_efficiently_zmm_(__m512i h_zmm) {
5250+
SZ_INTERNAL __mmask64 sz_utf8_case_insensitive_find_ice_western_europe_alarm_efficiently_zmm_(__m512i text_zmm) {
52495251
// Range constants
52505252
__m512i const x_e1_zmm = _mm512_set1_epi8((char)0xE1);
52515253
__m512i const x_ef_zmm = _mm512_set1_epi8((char)0xEF);
5252-
__m512i const x_c5_zmm = _mm512_set1_epi8((char)0xC5);
52535254
__m512i const x_c3_zmm = _mm512_set1_epi8((char)0xC3);
52545255
__m512i const x_ba_zmm = _mm512_set1_epi8((char)0xBA);
52555256
__m512i const x_84_zmm = _mm512_set1_epi8((char)0x84);
52565257
__m512i const x_ac_zmm = _mm512_set1_epi8((char)0xAC);
52575258
__m512i const x_bf_zmm = _mm512_set1_epi8((char)0xBF);
5258-
__m512i const x_9f_zmm = _mm512_set1_epi8((char)0x9F);
52595259
__m512i const x_aa_zmm = _mm512_set1_epi8((char)0xAA);
5260+
__m512i const x_20_zmm = _mm512_set1_epi8(0x20);
52605261
__m512i const x_02_zmm = _mm512_set1_epi8(0x02);
5262+
__m512i const x_03_zmm = _mm512_set1_epi8(0x03);
52615263

52625264
// Check for E1/E2 range: (byte - 0xE1) < 2 [1 CMPLT on p5]
5263-
__m512i off_e1_zmm = _mm512_sub_epi8(h_zmm, x_e1_zmm);
5265+
__m512i off_e1_zmm = _mm512_sub_epi8(text_zmm, x_e1_zmm);
52645266
__mmask64 is_e1_or_e2_mask = _mm512_cmplt_epu8_mask(off_e1_zmm, x_02_zmm);
52655267
__mmask64 is_e1_mask = is_e1_or_e2_mask & _mm512_testn_epi8_mask(off_e1_zmm, off_e1_zmm); // offset==0 [p0]
52665268
__mmask64 is_e2_mask = is_e1_or_e2_mask & ~is_e1_mask;
52675269

52685270
// Check for AA/AB range: (byte - 0xAA) < 2 [1 CMPLT on p5]
5269-
__m512i off_aa_zmm = _mm512_sub_epi8(h_zmm, x_aa_zmm);
5271+
__m512i off_aa_zmm = _mm512_sub_epi8(text_zmm, x_aa_zmm);
52705272
__mmask64 is_aa_or_ab_mask = _mm512_cmplt_epu8_mask(off_aa_zmm, x_02_zmm);
52715273

5272-
// Other lead bytes (3 CMPEQ on p5)
5273-
__mmask64 is_ef_mask = _mm512_cmpeq_epi8_mask(h_zmm, x_ef_zmm);
5274-
__mmask64 is_c5_mask = _mm512_cmpeq_epi8_mask(h_zmm, x_c5_zmm);
5275-
__mmask64 is_c3_mask = _mm512_cmpeq_epi8_mask(h_zmm, x_c3_zmm);
5274+
// Check for C3/C4/C5 range: (byte - 0xC3) < 3 [1 CMPLT on p5]
5275+
// We only need C3 and C5; C4 also falls inside this range but is rejected by the offset==0 and offset==2 tests below
5276+
__m512i off_c3_zmm = _mm512_sub_epi8(text_zmm, x_c3_zmm);
5277+
__mmask64 is_c3_c4_c5_mask = _mm512_cmplt_epu8_mask(off_c3_zmm, x_03_zmm);
5278+
__mmask64 is_c3_mask = is_c3_c4_c5_mask & _mm512_testn_epi8_mask(off_c3_zmm, off_c3_zmm); // offset==0 [p0]
5279+
__m512i off_xor_2_zmm = _mm512_xor_si512(off_c3_zmm, x_02_zmm);
5280+
__mmask64 is_c5_mask = is_c3_c4_c5_mask & _mm512_testn_epi8_mask(off_xor_2_zmm, off_xor_2_zmm); // offset==2 [p0]
5281+
5282+
// Other lead byte (1 CMPEQ on p5)
5283+
__mmask64 is_ef_mask = _mm512_cmpeq_epi8_mask(text_zmm, x_ef_zmm);
5284+
5285+
// Second bytes: BA, 84, AC (3 CMPEQ on p5)
5286+
__mmask64 is_ba_mask = _mm512_cmpeq_epi8_mask(text_zmm, x_ba_zmm);
5287+
__mmask64 is_84_mask = _mm512_cmpeq_epi8_mask(text_zmm, x_84_zmm);
5288+
__mmask64 is_ac_mask = _mm512_cmpeq_epi8_mask(text_zmm, x_ac_zmm);
52765289

5277-
// Other second bytes (5 CMPEQ on p5)
5278-
__mmask64 is_ba_mask = _mm512_cmpeq_epi8_mask(h_zmm, x_ba_zmm);
5279-
__mmask64 is_84_mask = _mm512_cmpeq_epi8_mask(h_zmm, x_84_zmm);
5280-
__mmask64 is_ac_mask = _mm512_cmpeq_epi8_mask(h_zmm, x_ac_zmm);
5281-
__mmask64 is_bf_mask = _mm512_cmpeq_epi8_mask(h_zmm, x_bf_zmm);
5282-
__mmask64 is_9f_mask = _mm512_cmpeq_epi8_mask(h_zmm, x_9f_zmm);
5290+
// 9F/BF bit masking: (byte | 0x20) == 0xBF catches exactly {0x9F, 0xBF} [1 CMPEQ on p5]
5291+
// 0x9F = 1001_1111, 0xBF = 1011_1111, differ only in bit 5
5292+
__m512i masked_zmm = _mm512_or_si512(text_zmm, x_20_zmm);
5293+
__mmask64 is_9f_or_bf_mask = _mm512_cmpeq_epi8_mask(masked_zmm, x_bf_zmm);
5294+
__mmask64 has_bit5_mask = _mm512_test_epi8_mask(text_zmm, x_20_zmm); // VPTESTMB [p0]
5295+
__mmask64 is_bf_mask = is_9f_or_bf_mask & has_bit5_mask;
5296+
__mmask64 is_9f_mask = is_9f_or_bf_mask & ~has_bit5_mask;
52835297

52845298
// Danger mask construction
52855299
__mmask64 danger_mask =
@@ -5289,7 +5303,7 @@ SZ_INTERNAL __mmask64 sz_utf8_case_insensitive_find_ice_western_europe_alarm_eff
52895303
((is_c5_mask << 1) & is_bf_mask) | // Long S (C5 BF)
52905304
((is_c3_mask << 1) & is_9f_mask); // Sharp S (C3 9F)
52915305

5292-
sz_assert_(danger_mask == sz_utf8_case_insensitive_find_ice_western_europe_alarm_naively_zmm_(h_zmm) &&
5306+
sz_assert_(danger_mask == sz_utf8_case_insensitive_find_ice_western_europe_alarm_naively_zmm_(text_zmm) &&
52935307
"Efficient Western Europe alarm must match naive implementation");
52945308
return danger_mask;
52955309
}

0 commit comments

Comments
 (0)