Skip to content

Commit 22dc88c

Browse files
committed
Improve: Greek alarm with lower execution-port pressure
Port pressure went down from 8 ops on p5 and 6 ops on p0, to 6 on p5 and 5 on p0.
1 parent 936fc22 commit 22dc88c

File tree

1 file changed

+47
-46
lines changed

1 file changed

+47
-46
lines changed

include/stringzilla/utf8_case.h

Lines changed: 47 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -6707,89 +6707,90 @@ SZ_INTERNAL __mmask64 sz_utf8_case_insensitive_find_ice_greek_alarm_naively_zmm_
67076707
* @brief Optimized danger zone detection for Greek text using Range+LUT technique.
67086708
* @sa sz_utf8_case_insensitive_find_ice_greek_alarm_naively_zmm_
67096709
*
6710-
* Reduces port 5 pressure from 13 to 8 operations by:
6711-
* - Lead bytes CE/CF and E1/E2: 4 CMPEQ -> 2 CMPLT + 2 VPTESTNMB (p0)
6712-
* - CF 9x set {90,91,95,96}: 4 CMPEQ -> 1 CMPLT + 1 VPSHUFB + 1 VPTESTMB (p0)
6713-
* - CF Bx set {B0,B1,B5}: 3 CMPEQ -> 1 CMPLT + 1 VPSHUFB + 1 VPTESTMB (p0)
6714-
* - CE 90/B0 derived from range checks using VPTESTNMB (p0)
6710+
* Reduces port 5 pressure from 13 to 6 operations by:
6711+
* - Lead bytes CD/CE/CF: 3 CMPEQ -> 1 CMPLT + 2 VPTESTNMB (p0)
6712+
* - Lead bytes E1/E2: 2 CMPEQ -> 1 CMPLT + 1 VPTESTNMB (p0)
6713+
* - Second bytes 9x/Bx unified: 7 CMPEQ -> 1 CMPLT + 1 VPSHUFB + 1 CMPEQ (B6 filter)
6714+
* Key insight: (byte & 0xDF) collapses 9x and Bx to same offset space
6715+
* - Second byte 84: 1 CMPEQ (unchanged)
67156716
*
6716-
* Port summary: 8 p5 ops + 6 p0 ops (vs 13 p5 originally)
6717+
* Port summary: 6 p5 ops + 5 p0 ops (vs 13 p5 originally)
67176718
*
67186719
* @param[in] h The haystack ZMM register.
67196720
* @return Bitmask of positions where danger characters are detected.
67206721
*/
67216722
SZ_INTERNAL __mmask64 sz_utf8_case_insensitive_find_ice_greek_alarm_efficiently_zmm_(__m512i h_zmm) {
67226723
// Range constants
6723-
__m512i const x_ce_zmm = _mm512_set1_epi8((char)0xCE);
6724-
__m512i const x_e1_zmm = _mm512_set1_epi8((char)0xE1);
67256724
__m512i const x_cd_zmm = _mm512_set1_epi8((char)0xCD);
6725+
__m512i const x_e1_zmm = _mm512_set1_epi8((char)0xE1);
67266726
__m512i const x_90_zmm = _mm512_set1_epi8((char)0x90);
6727-
__m512i const x_b0_zmm = _mm512_set1_epi8((char)0xB0);
67286727
__m512i const x_84_zmm = _mm512_set1_epi8((char)0x84);
6728+
__m512i const x_01_zmm = _mm512_set1_epi8(0x01);
67296729
__m512i const x_02_zmm = _mm512_set1_epi8(0x02);
6730-
__m512i const x_07_zmm = _mm512_set1_epi8(0x07);
6730+
__m512i const x_03_zmm = _mm512_set1_epi8(0x03);
67316731
__m512i const x_06_zmm = _mm512_set1_epi8(0x06);
6732+
__m512i const x_07_zmm = _mm512_set1_epi8(0x07);
67326733

6733-
// LUT for 9x set {90,91,95,96}: positions 0,1,5,6 are 0xFF (valid)
6734+
// Unified LUT for 9x/Bx set: positions 0,1,5,6 are 0xFF (valid)
6735+
// After masking with 0xDF, both 9x (90,91,95,96) and Bx (B0,B1,B5) collapse to same offsets
67346736
// VPSHUFB uses low 4 bits as index; high bit set -> output 0
67356737
__m512i const lut_9x_zmm = _mm512_set_epi8( //
67366738
0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, -1, -1, // lane 3: [15..0]
67376739
0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, -1, -1, // lane 2
67386740
0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, -1, -1, // lane 1
67396741
0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, -1, -1); // lane 0
67406742

6741-
// LUT for Bx set {B0,B1,B5}: positions 0,1,5 are 0xFF (valid)
6742-
__m512i const lut_bx_zmm = _mm512_set_epi8( //
6743-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, -1, -1, // lane 3: [15..0]
6744-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, -1, -1, // lane 2
6745-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, -1, -1, // lane 1
6746-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, -1, -1); // lane 0
6747-
67486743
// Lead byte detection using range compression
6749-
// Check for CE/CF range: (byte - 0xCE) < 2 [1 CMPLT on p5]
6750-
__m512i off_ce_zmm = _mm512_sub_epi8(h_zmm, x_ce_zmm);
6751-
__mmask64 is_ce_or_cf_mask = _mm512_cmplt_epu8_mask(off_ce_zmm, x_02_zmm);
6752-
__mmask64 is_ce_mask = is_ce_or_cf_mask & _mm512_testn_epi8_mask(off_ce_zmm, off_ce_zmm); // offset==0 [p0]
6753-
__mmask64 is_cf_mask = is_ce_or_cf_mask & ~is_ce_mask;
6744+
// Check for CD/CE/CF range: (byte - 0xCD) < 3 [1 CMPLT on p5]
6745+
// Saves 1 p5 op by consolidating CD check with CE/CF range
6746+
__m512i off_cd_zmm = _mm512_sub_epi8(h_zmm, x_cd_zmm);
6747+
__mmask64 is_cd_ce_cf_mask = _mm512_cmplt_epu8_mask(off_cd_zmm, x_03_zmm);
6748+
// Derive CD (offset==0), CE (offset==1), CF (offset==2) using TESTNM on p0
6749+
__mmask64 is_cd_mask = is_cd_ce_cf_mask & _mm512_testn_epi8_mask(off_cd_zmm, off_cd_zmm);
6750+
__m512i off_xor_1_zmm = _mm512_xor_si512(off_cd_zmm, x_01_zmm);
6751+
__mmask64 is_ce_mask = is_cd_ce_cf_mask & _mm512_testn_epi8_mask(off_xor_1_zmm, off_xor_1_zmm);
6752+
__mmask64 is_cf_mask = is_cd_ce_cf_mask & ~is_cd_mask & ~is_ce_mask;
67546753

67556754
// Check for E1/E2 range: (byte - 0xE1) < 2 [1 CMPLT on p5]
67566755
__m512i off_e1_zmm = _mm512_sub_epi8(h_zmm, x_e1_zmm);
67576756
__mmask64 is_e1_or_e2_mask = _mm512_cmplt_epu8_mask(off_e1_zmm, x_02_zmm);
67586757
__mmask64 is_e1_mask = is_e1_or_e2_mask & _mm512_testn_epi8_mask(off_e1_zmm, off_e1_zmm); // offset==0 [p0]
67596758
__mmask64 is_e2_mask = is_e1_or_e2_mask & ~is_e1_mask;
67606759

6761-
// Check for CD (no adjacent partner) [1 CMPEQ on p5]
6762-
__mmask64 is_cd_mask = _mm512_cmpeq_epi8_mask(h_zmm, x_cd_zmm);
6763-
6764-
// Second byte detection using Range+LUT
6765-
// Check for 9x range [90-96]: compute offsets [1 CMPLT on p5]
6766-
__m512i off_9x_zmm = _mm512_sub_epi8(h_zmm, x_90_zmm);
6767-
__mmask64 in_9x_range_mask = _mm512_cmplt_epu8_mask(off_9x_zmm, x_07_zmm);
6768-
6769-
// Validate 9x set {90,91,95,96} using LUT [1 VPSHUFB on p5, 1 VPTESTMB on p0]
6770-
__m512i shuffled_9x_zmm = _mm512_shuffle_epi8(lut_9x_zmm, off_9x_zmm);
6771-
__mmask64 valid_9x_mask = in_9x_range_mask & _mm512_test_epi8_mask(shuffled_9x_zmm, shuffled_9x_zmm);
6760+
// Second byte detection using unified Range+LUT for 9x and Bx
6761+
// Key insight: 0x90-0x96 and 0xB0-0xB5 share the same low nibble pattern.
6762+
// Masking with 0xDF collapses both ranges to offsets 0-6 from 0x90:
6763+
// 0x90 & 0xDF = 0x90, 0xB0 & 0xDF = 0x90 -> offset 0
6764+
// 0x91 & 0xDF = 0x91, 0xB1 & 0xDF = 0x91 -> offset 1
6765+
// 0x95 & 0xDF = 0x95, 0xB5 & 0xDF = 0x95 -> offset 5
6766+
// 0x96 & 0xDF = 0x96, 0xB6 & 0xDF = 0x96 -> offset 6 (B6 must be excluded)
6767+
__m512i const x_df_zmm = _mm512_set1_epi8((char)0xDF);
6768+
__m512i const x_20_zmm = _mm512_set1_epi8(0x20);
6769+
__m512i masked_zmm = _mm512_and_si512(h_zmm, x_df_zmm);
6770+
__m512i offset_9x_bx_zmm = _mm512_sub_epi8(masked_zmm, x_90_zmm);
67726771

6773-
// Derive is_90 (offset == 0 within range) [1 VPTESTNMB on p0]
6774-
__mmask64 is_90_mask = in_9x_range_mask & _mm512_testn_epi8_mask(off_9x_zmm, off_9x_zmm);
6772+
// Check if masked byte is in range [0x90, 0x96] [1 CMPLT on p5]
6773+
__mmask64 in_9x_bx_range_mask = _mm512_cmplt_epu8_mask(offset_9x_bx_zmm, x_07_zmm);
67756774

6776-
// Check for Bx range [B0-B5]: compute offsets [1 CMPLT on p5]
6777-
__m512i off_bx_zmm = _mm512_sub_epi8(h_zmm, x_b0_zmm);
6778-
__mmask64 in_bx_range_mask = _mm512_cmplt_epu8_mask(off_bx_zmm, x_06_zmm);
6775+
// For CE: need exactly 90 or B0 (both map to offset 0) [1 VPTESTNMB on p0]
6776+
__mmask64 is_90_or_b0_mask = in_9x_bx_range_mask & _mm512_testn_epi8_mask(offset_9x_bx_zmm, offset_9x_bx_zmm);
67796777

6780-
// Validate Bx set {B0,B1,B5} using LUT [1 VPSHUFB on p5, 1 VPTESTMB on p0]
6781-
__m512i shuffled_bx_zmm = _mm512_shuffle_epi8(lut_bx_zmm, off_bx_zmm);
6782-
__mmask64 valid_bx_mask = in_bx_range_mask & _mm512_test_epi8_mask(shuffled_bx_zmm, shuffled_bx_zmm);
6778+
// For CF: validate using LUT, then exclude B6 false positives [1 VPSHUFB on p5, 1 VPTESTMB on p0]
6779+
__m512i shuffled_9x_bx_zmm = _mm512_shuffle_epi8(lut_9x_zmm, offset_9x_bx_zmm);
6780+
__mmask64 valid_prelim_mask = in_9x_bx_range_mask & _mm512_test_epi8_mask(shuffled_9x_bx_zmm, shuffled_9x_bx_zmm);
67836781

6784-
// Derive is_b0 (offset == 0 within range) [1 VPTESTNMB on p0]
6785-
__mmask64 is_b0_mask = in_bx_range_mask & _mm512_testn_epi8_mask(off_bx_zmm, off_bx_zmm);
6782+
// Exclude B6: offset==6 && bit5 set means original was 0xB6 not 0x96 [1 CMPEQ on p5, 1 VPTESTMB on p0]
6783+
__mmask64 is_offset_6_mask = valid_prelim_mask & _mm512_cmpeq_epi8_mask(offset_9x_bx_zmm, x_06_zmm);
6784+
__m512i bit5_zmm = _mm512_and_si512(h_zmm, x_20_zmm);
6785+
__mmask64 is_b6_mask = is_offset_6_mask & _mm512_test_epi8_mask(bit5_zmm, bit5_zmm);
6786+
__mmask64 valid_9x_bx_mask = valid_prelim_mask & ~is_b6_mask;
67866787

67876788
// Check for E2 84 [1 CMPEQ on p5]
67886789
__mmask64 is_84_mask = _mm512_cmpeq_epi8_mask(h_zmm, x_84_zmm);
67896790

67906791
// Danger mask construction
6791-
__mmask64 ce_danger_mask = (is_ce_mask << 1) & (is_90_mask | is_b0_mask);
6792-
__mmask64 cf_danger_mask = (is_cf_mask << 1) & (valid_9x_mask | valid_bx_mask);
6792+
__mmask64 ce_danger_mask = (is_ce_mask << 1) & is_90_or_b0_mask;
6793+
__mmask64 cf_danger_mask = (is_cf_mask << 1) & valid_9x_bx_mask;
67936794
__mmask64 e2_danger_mask = (is_e2_mask << 1) & is_84_mask;
67946795
__mmask64 danger_mask = ce_danger_mask | cf_danger_mask | e2_danger_mask | is_e1_mask | is_cd_mask;
67956796

0 commit comments

Comments
 (0)