@@ -6707,89 +6707,90 @@ SZ_INTERNAL __mmask64 sz_utf8_case_insensitive_find_ice_greek_alarm_naively_zmm_
  * @brief Optimized danger zone detection for Greek text using Range+LUT technique.
  * @sa sz_utf8_case_insensitive_find_ice_greek_alarm_naively_zmm_
  *
- * Reduces port 5 pressure from 13 to 8 operations by:
- * - Lead bytes CE/CF and E1/E2: 4 CMPEQ -> 2 CMPLT + 2 VPTESTNMB (p0)
- * - CF 9x set {90,91,95,96}: 4 CMPEQ -> 1 CMPLT + 1 VPSHUFB + 1 VPTESTMB (p0)
- * - CF Bx set {B0,B1,B5}: 3 CMPEQ -> 1 CMPLT + 1 VPSHUFB + 1 VPTESTMB (p0)
- * - CE 90/B0 derived from range checks using VPTESTNMB (p0)
+ * Reduces port 5 pressure from 13 to 6 operations by:
+ * - Lead bytes CD/CE/CF: 3 CMPEQ -> 1 CMPLT + 2 VPTESTNMB (p0)
+ * - Lead bytes E1/E2: 2 CMPEQ -> 1 CMPLT + 1 VPTESTNMB (p0)
+ * - Second bytes 9x/Bx unified: 7 CMPEQ -> 1 CMPLT + 1 VPSHUFB + 1 CMPEQ (B6 filter) + 1 VPTESTNMB + 2 VPTESTMB (p0)
+ *   Key insight: (byte & 0xDF) collapses the 9x and Bx sets into the same offset space
+ * - Second byte 84: 1 CMPEQ (unchanged)
  *
- * Port summary: 8 p5 ops + 6 p0 ops (vs 13 p5 originally)
+ * Port summary: 6 p5 ops + 6 p0 ops (vs 13 p5 originally)
  *
  * @param[in] h_zmm The haystack ZMM register.
  * @return Bitmask of positions where danger characters are detected.
  */
 SZ_INTERNAL __mmask64 sz_utf8_case_insensitive_find_ice_greek_alarm_efficiently_zmm_(__m512i h_zmm) {
     // Range constants
-    __m512i const x_ce_zmm = _mm512_set1_epi8((char)0xCE);
-    __m512i const x_e1_zmm = _mm512_set1_epi8((char)0xE1);
     __m512i const x_cd_zmm = _mm512_set1_epi8((char)0xCD);
+    __m512i const x_e1_zmm = _mm512_set1_epi8((char)0xE1);
     __m512i const x_90_zmm = _mm512_set1_epi8((char)0x90);
-    __m512i const x_b0_zmm = _mm512_set1_epi8((char)0xB0);
     __m512i const x_84_zmm = _mm512_set1_epi8((char)0x84);
+    __m512i const x_01_zmm = _mm512_set1_epi8(0x01);
     __m512i const x_02_zmm = _mm512_set1_epi8(0x02);
-    __m512i const x_07_zmm = _mm512_set1_epi8(0x07);
+    __m512i const x_03_zmm = _mm512_set1_epi8(0x03);
     __m512i const x_06_zmm = _mm512_set1_epi8(0x06);
+    __m512i const x_07_zmm = _mm512_set1_epi8(0x07);
 
-    // LUT for 9x set {90,91,95,96}: positions 0,1,5,6 are 0xFF (valid)
+    // Unified LUT for 9x/Bx set: positions 0,1,5,6 are 0xFF (valid)
+    // After masking with 0xDF, both 9x (90,91,95,96) and Bx (B0,B1,B5) collapse to the same offsets
     // VPSHUFB uses low 4 bits as index; high bit set -> output 0
     __m512i const lut_9x_zmm = _mm512_set_epi8( //
         0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, -1, -1, // lane 3: [15..0]
         0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, -1, -1, // lane 2
         0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, -1, -1, // lane 1
         0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, -1, -1); // lane 0
 
-    // LUT for Bx set {B0,B1,B5}: positions 0,1,5 are 0xFF (valid)
-    __m512i const lut_bx_zmm = _mm512_set_epi8( //
-        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, -1, -1, // lane 3: [15..0]
-        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, -1, -1, // lane 2
-        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, -1, -1, // lane 1
-        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, -1, -1); // lane 0
-
     // Lead byte detection using range compression
-    // Check for CE/CF range: (byte - 0xCE) < 2 [1 CMPLT on p5]
-    __m512i off_ce_zmm = _mm512_sub_epi8(h_zmm, x_ce_zmm);
-    __mmask64 is_ce_or_cf_mask = _mm512_cmplt_epu8_mask(off_ce_zmm, x_02_zmm);
-    __mmask64 is_ce_mask = is_ce_or_cf_mask & _mm512_testn_epi8_mask(off_ce_zmm, off_ce_zmm); // offset==0 [p0]
-    __mmask64 is_cf_mask = is_ce_or_cf_mask & ~is_ce_mask;
+    // Check for CD/CE/CF range: (byte - 0xCD) < 3 [1 CMPLT on p5]
+    // Saves 1 p5 op by consolidating CD check with CE/CF range
+    __m512i off_cd_zmm = _mm512_sub_epi8(h_zmm, x_cd_zmm);
+    __mmask64 is_cd_ce_cf_mask = _mm512_cmplt_epu8_mask(off_cd_zmm, x_03_zmm);
+    // Derive CD (offset==0), CE (offset==1), CF (offset==2) using TESTNM on p0
+    __mmask64 is_cd_mask = is_cd_ce_cf_mask & _mm512_testn_epi8_mask(off_cd_zmm, off_cd_zmm);
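+    // XOR with 0x01 zeroes the offset byte only when offset==1, so a VPTESTNMB on p0 can stand in for a CMPEQ on p5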
+    __m512i off_xor_1_zmm = _mm512_xor_si512(off_cd_zmm, x_01_zmm);
+    __mmask64 is_ce_mask = is_cd_ce_cf_mask & _mm512_testn_epi8_mask(off_xor_1_zmm, off_xor_1_zmm);
+    __mmask64 is_cf_mask = is_cd_ce_cf_mask & ~is_cd_mask & ~is_ce_mask;
 
     // Check for E1/E2 range: (byte - 0xE1) < 2 [1 CMPLT on p5]
     __m512i off_e1_zmm = _mm512_sub_epi8(h_zmm, x_e1_zmm);
     __mmask64 is_e1_or_e2_mask = _mm512_cmplt_epu8_mask(off_e1_zmm, x_02_zmm);
     __mmask64 is_e1_mask = is_e1_or_e2_mask & _mm512_testn_epi8_mask(off_e1_zmm, off_e1_zmm); // offset==0 [p0]
     __mmask64 is_e2_mask = is_e1_or_e2_mask & ~is_e1_mask;
 
-    // Check for CD (no adjacent partner) [1 CMPEQ on p5]
-    __mmask64 is_cd_mask = _mm512_cmpeq_epi8_mask(h_zmm, x_cd_zmm);
-
-    // Second byte detection using Range+LUT
-    // Check for 9x range [90-96]: compute offsets [1 CMPLT on p5]
-    __m512i off_9x_zmm = _mm512_sub_epi8(h_zmm, x_90_zmm);
-    __mmask64 in_9x_range_mask = _mm512_cmplt_epu8_mask(off_9x_zmm, x_07_zmm);
-
-    // Validate 9x set {90,91,95,96} using LUT [1 VPSHUFB on p5, 1 VPTESTMB on p0]
-    __m512i shuffled_9x_zmm = _mm512_shuffle_epi8(lut_9x_zmm, off_9x_zmm);
-    __mmask64 valid_9x_mask = in_9x_range_mask & _mm512_test_epi8_mask(shuffled_9x_zmm, shuffled_9x_zmm);
+    // Second byte detection using unified Range+LUT for 9x and Bx
+    // Key insight: 0x90-0x96 and 0xB0-0xB5 share the same low nibble pattern.
+    // Masking with 0xDF collapses both ranges to offsets 0-6 from 0x90:
+    //   0x90 & 0xDF = 0x90, 0xB0 & 0xDF = 0x90 -> offset 0
+    //   0x91 & 0xDF = 0x91, 0xB1 & 0xDF = 0x91 -> offset 1
+    //   0x95 & 0xDF = 0x95, 0xB5 & 0xDF = 0x95 -> offset 5
+    //   0x96 & 0xDF = 0x96, 0xB6 & 0xDF = 0x96 -> offset 6 (B6 must be excluded)
+    __m512i const x_df_zmm = _mm512_set1_epi8((char)0xDF);
+    __m512i const x_20_zmm = _mm512_set1_epi8(0x20);
+    __m512i masked_zmm = _mm512_and_si512(h_zmm, x_df_zmm);
+    __m512i offset_9x_bx_zmm = _mm512_sub_epi8(masked_zmm, x_90_zmm);
 
-    // Derive is_90 (offset == 0 within range) [1 VPTESTNMB on p0]
-    __mmask64 is_90_mask = in_9x_range_mask & _mm512_testn_epi8_mask(off_9x_zmm, off_9x_zmm);
+    // Check if masked byte is in range [0x90, 0x96] [1 CMPLT on p5]
+    __mmask64 in_9x_bx_range_mask = _mm512_cmplt_epu8_mask(offset_9x_bx_zmm, x_07_zmm);
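+    // The range check is load-bearing: e.g. 0x20 & 0xDF = 0x00 gives offset 0x70, whose low nibble would hit valid LUT slot 0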
 
-    // Check for Bx range [B0-B5]: compute offsets [1 CMPLT on p5]
-    __m512i off_bx_zmm = _mm512_sub_epi8(h_zmm, x_b0_zmm);
-    __mmask64 in_bx_range_mask = _mm512_cmplt_epu8_mask(off_bx_zmm, x_06_zmm);
+    // For CE: need exactly 90 or B0 (both map to offset 0) [1 VPTESTNMB on p0]
+    __mmask64 is_90_or_b0_mask = in_9x_bx_range_mask & _mm512_testn_epi8_mask(offset_9x_bx_zmm, offset_9x_bx_zmm);
 
-    // Validate Bx set {B0,B1,B5} using LUT [1 VPSHUFB on p5, 1 VPTESTMB on p0]
-    __m512i shuffled_bx_zmm = _mm512_shuffle_epi8(lut_bx_zmm, off_bx_zmm);
-    __mmask64 valid_bx_mask = in_bx_range_mask & _mm512_test_epi8_mask(shuffled_bx_zmm, shuffled_bx_zmm);
+    // For CF: validate using LUT, then exclude B6 false positives [1 VPSHUFB on p5, 1 VPTESTMB on p0]
+    __m512i shuffled_9x_bx_zmm = _mm512_shuffle_epi8(lut_9x_zmm, offset_9x_bx_zmm);
+    __mmask64 valid_prelim_mask = in_9x_bx_range_mask & _mm512_test_epi8_mask(shuffled_9x_bx_zmm, shuffled_9x_bx_zmm);
 
-    // Derive is_b0 (offset == 0 within range) [1 VPTESTNMB on p0]
-    __mmask64 is_b0_mask = in_bx_range_mask & _mm512_testn_epi8_mask(off_bx_zmm, off_bx_zmm);
+    // Exclude B6: offset==6 && bit5 set means original was 0xB6 not 0x96 [1 CMPEQ on p5, 1 VPTESTMB on p0]
+    __mmask64 is_offset_6_mask = valid_prelim_mask & _mm512_cmpeq_epi8_mask(offset_9x_bx_zmm, x_06_zmm);
+    __m512i bit5_zmm = _mm512_and_si512(h_zmm, x_20_zmm);
+    __mmask64 is_b6_mask = is_offset_6_mask & _mm512_test_epi8_mask(bit5_zmm, bit5_zmm);
+    __mmask64 valid_9x_bx_mask = valid_prelim_mask & ~is_b6_mask;
 
     // Check for E2 84 [1 CMPEQ on p5]
     __mmask64 is_84_mask = _mm512_cmpeq_epi8_mask(h_zmm, x_84_zmm);
 
     // Danger mask construction
-    __mmask64 ce_danger_mask = (is_ce_mask << 1) & (is_90_mask | is_b0_mask);
-    __mmask64 cf_danger_mask = (is_cf_mask << 1) & (valid_9x_mask | valid_bx_mask);
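+    // Shifting each lead-byte mask left by one aligns it with the continuation byte that follows it in the haystack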
+    __mmask64 ce_danger_mask = (is_ce_mask << 1) & is_90_or_b0_mask;
+    __mmask64 cf_danger_mask = (is_cf_mask << 1) & valid_9x_bx_mask;
     __mmask64 e2_danger_mask = (is_e2_mask << 1) & is_84_mask;
     __mmask64 danger_mask = ce_danger_mask | cf_danger_mask | e2_danger_mask | is_e1_mask | is_cd_mask;
 
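As a cross-check, here is a minimal scalar sketch of the predicate the new mask construction encodes, useful for differential testing against the vector path; the helper name is hypothetical and not part of the library:

    #include <stdbool.h>
    #include <stdint.h>

    // Scalar model of the danger predicate: `lead` is a candidate lead byte and
    // `next` is the byte that follows it in the haystack.
    static bool greek_alarm_scalar_(uint8_t lead, uint8_t next) {
        if (lead == 0xCD || lead == 0xE1) return true; // dangerous regardless of the next byte
        if (lead == 0xCE) return next == 0x90 || next == 0xB0; // CE 90 / CE B0
        if (lead == 0xCF) {
            uint8_t folded = (uint8_t)(next & 0xDF); // collapse Bx onto 9x, as in the ZMM path
            bool in_lut = folded == 0x90 || folded == 0x91 || folded == 0x95 || folded == 0x96;
            return in_lut && next != 0xB6; // 0xB6 folds to 0x96 but is not in the Bx set
        }
        if (lead == 0xE2) return next == 0x84; // E2 84
        return false;
    }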