@@ -1146,13 +1146,24 @@ SZ_PUBLIC sz_size_t sz_utf8_case_fold_ice(sz_cptr_t source, sz_size_t source_len
11461146 __mmask64 is_valid_pure_3byte = is_three_byte_lead | is_cont ;
11471147 // Quick check: if all bits match, we have pure 3-byte content
11481148 if ((is_valid_pure_3byte & load_mask ) == (is_non_ascii & load_mask ) && !is_four_byte_lead ) {
1149- // Check for problematic leads: E1, E2, EF have case folding
1149+ // Check for problematic leads that have case folding:
1150+ // - E1: Georgian, Greek Extended, Latin Extended Additional
1151+ // - E2: Glagolitic (B0-B1), Coptic (B2-B3), Letterlike (84 = Kelvin/Angstrom)
1152+ // - EF: Fullwidth A-Z
1153+ // Of the E2 second bytes, only 80-83 (General Punctuation) are treated as safe here; all others take the folding path
11501154 // EA is mostly safe (Hangul B0-BF) but some second bytes have folding:
11511155 // - 0x99-0x9F: Cyrillic Ext-B, Latin Ext-D (A640-A7FF)
11521156 // - 0xAD-0xAE: Cherokee Supplement (AB70-ABBF)
11531157 __mmask64 is_e1 = _mm512_cmpeq_epi8_mask (source_vec .zmm , _mm512_set1_epi8 ((char )0xE1 ));
1154- __mmask64 is_e2 = _mm512_cmpeq_epi8_mask (source_vec .zmm , _mm512_set1_epi8 ((char )0xE2 ));
11551158 __mmask64 is_ef = _mm512_cmpeq_epi8_mask (source_vec .zmm , _mm512_set1_epi8 ((char )0xEF ));
1159+ // For E2, only allow 80-83 (General Punctuation quotes) through - many other E2 ranges have folding
1160+ // (84 Letterlike, 93 Enclosed Alphanumerics, B0-B3 Glagolitic/Coptic, etc.)
1161+ __mmask64 is_e2 = _mm512_cmpeq_epi8_mask (source_vec .zmm , _mm512_set1_epi8 ((char )0xE2 ));
1162+ __mmask64 e2_seconds = is_e2 << 1 ;
1163+ // E2 folding needed if second byte is NOT in 80-83 range
1164+ __mmask64 is_e2_folding =
1165+ e2_seconds & ~_mm512_cmp_epu8_mask (_mm512_sub_epi8 (source_vec .zmm , _mm512_set1_epi8 ((char )0x80 )),
1166+ _mm512_set1_epi8 (0x04 ), _MM_CMPINT_LT ); // NOT 80-83
11561167 // For EA, check if second byte is in problematic ranges
11571168 __mmask64 is_ea = _mm512_cmpeq_epi8_mask (source_vec .zmm , _mm512_set1_epi8 ((char )0xEA ));
11581169 __mmask64 ea_seconds = is_ea << 1 ;
@@ -1162,7 +1173,7 @@ SZ_PUBLIC sz_size_t sz_utf8_case_fold_ice(sz_cptr_t source, sz_size_t source_len
11621173 _mm512_set1_epi8 (0x07 ), _MM_CMPINT_LT ) | // 0x99-0x9F
11631174 _mm512_cmp_epu8_mask (_mm512_sub_epi8 (source_vec .zmm , _mm512_set1_epi8 ((char )0xAD )),
11641175 _mm512_set1_epi8 (0x02 ), _MM_CMPINT_LT )); // 0xAD-0xAE
1165- if (!(is_e1 | is_e2 | is_ea_folding | is_ef )) {
1176+ if (!(is_e1 | is_e2_folding | is_ea_folding | is_ef )) {
11661177 // Pure safe 3-byte content (E0, E2 80-83, E3-E9, EA without folding second bytes, EB-EE) - no case folding needed
11671178 // Just need to avoid splitting a 3-byte sequence at the end
11681179 sz_size_t copy_len = chunk_size ;
@@ -1286,7 +1297,33 @@ SZ_PUBLIC sz_size_t sz_utf8_case_fold_ice(sz_cptr_t source, sz_size_t source_len
12861297 }
12871298 }
12881299
1289- // 2.3. Other 2-byte scripts (Latin Extended, Greek, Cyrillic, Armenian)
1300+ // 2.3. Fast path for 2-byte scripts without case folding (Hebrew, Arabic, Syriac, etc.)
1301+ //
1302+ // Lead bytes D7-DF cover Hebrew (D7), Arabic (D8-DB), Syriac (DC-DD), Thaana/NKo (DE-DF).
1303+ // None of these scripts have case distinctions, so we can just copy them unchanged.
1304+ // NOTE: D5/D6 cover Armenian which HAS case folding (including U+0587 which expands).
1305+ __mmask64 is_caseless_2byte = _mm512_cmp_epu8_mask (source_vec .zmm , _mm512_set1_epi8 ((char )0xD7 ), _MM_CMPINT_GE ) &
1306+ _mm512_cmp_epu8_mask (source_vec .zmm , _mm512_set1_epi8 ((char )0xDF ), _MM_CMPINT_LE );
1307+ if (is_caseless_2byte ) {
1308+ __mmask64 is_caseless_second = is_caseless_2byte << 1 ;
1309+ __mmask64 is_valid_caseless = ~is_non_ascii | is_caseless_2byte | is_caseless_second ;
1310+ sz_size_t caseless_length = sz_u64_ctz (~is_valid_caseless | ~load_mask );
1311+ caseless_length -= caseless_length && ((is_caseless_2byte >> (caseless_length - 1 )) & 1 );
1312+
1313+ if (caseless_length >= 2 ) {
1314+ __mmask64 prefix_mask = sz_u64_mask_until_ (caseless_length );
1315+ // Fold only ASCII A-Z, copy 2-byte unchanged
1316+ __mmask64 is_upper_ascii =
1317+ _mm512_cmp_epu8_mask (_mm512_sub_epi8 (source_vec .zmm , a_upper_vec ), subtract26_vec , _MM_CMPINT_LT );
1318+ __m512i folded =
1319+ _mm512_mask_add_epi8 (source_vec .zmm , is_upper_ascii & prefix_mask , source_vec .zmm , x20_vec );
1320+ _mm512_mask_storeu_epi8 (target , prefix_mask , folded );
1321+ target += caseless_length , source += caseless_length , source_length -= caseless_length ;
1322+ continue ;
1323+ }
1324+ }
1325+
1326+ // 2.4. Other 2-byte scripts (Latin Extended, Greek, Cyrillic, Armenian)
12901327 //
12911328 // Unlike Latin-1 where folding is a simple +0x20 to the second byte in-place, these scripts
12921329 // require unpacking to 32-bit codepoints because:
@@ -1571,6 +1608,127 @@ SZ_PUBLIC sz_size_t sz_utf8_case_fold_ice(sz_cptr_t source, sz_size_t source_len
15711608 }
15721609 }
15731610
1611+ // 3.1. Georgian fast path: handles E1 82/83 content
1612+ //
1613+ // Georgian script uses E1 82 and E1 83 lead sequences:
1614+ // - E1 82 A0-BF: Uppercase (Ⴀ-Ⴟ) - folds to E2 B4 80-9F
1615+ // - E1 83 80-85, 87, 8D: Uppercase (Ⴠ-Ⴥ, Ⴧ, Ⴭ) - folds to E2 B4 A0-A5, A7, AD
1616+ // - E1 83 86, 88-8C, 8E-BF: Lowercase/other (incl. ა-ჿ) - no folding needed
1617+ //
1618+ // We include ALL E1 82/83 content in the fast path, but only transform uppercase.
1619+ if (is_e1_lead && source_length >= 3 ) {
1620+ // Check if E1 leads have Georgian second bytes (82 or 83)
1621+ __m512i indices_vec =
1622+ _mm512_set_epi8 (63 , 62 , 61 , 60 , 59 , 58 , 57 , 56 , 55 , 54 , 53 , 52 , 51 , 50 , 49 , 48 , 47 , 46 , 45 , 44 , 43 ,
1623+ 42 , 41 , 40 , 39 , 38 , 37 , 36 , 35 , 34 , 33 , 32 , 31 , 30 , 29 , 28 , 27 , 26 , 25 , 24 , 23 , 22 ,
1624+ 21 , 20 , 19 , 18 , 17 , 16 , 15 , 14 , 13 , 12 , 11 , 10 , 9 , 8 , 7 , 6 , 5 , 4 , 3 , 2 , 1 , 0 );
1625+ __m512i second_bytes =
1626+ _mm512_permutexvar_epi8 (_mm512_add_epi8 (indices_vec , _mm512_set1_epi8 (1 )), source_vec .zmm );
1627+
1628+ // Check for Georgian second bytes: 82 or 83
1629+ // Only check positions where second byte is within chunk (positions 0-62 for 64-byte chunk)
1630+ // Position 63's second byte would wrap around in the permutation
1631+ __mmask64 safe_e1_mask = is_e1_lead & (load_mask >> 1 );
1632+ __mmask64 is_82_at_e1 =
1633+ _mm512_mask_cmpeq_epi8_mask (safe_e1_mask , second_bytes , _mm512_set1_epi8 ((char )0x82 ));
1634+ __mmask64 is_83_at_e1 =
1635+ _mm512_mask_cmpeq_epi8_mask (safe_e1_mask , second_bytes , _mm512_set1_epi8 ((char )0x83 ));
1636+ __mmask64 is_georgian_e1 = is_82_at_e1 | is_83_at_e1 ;
1637+
1638+ // If all checkable E1 leads are Georgian (82/83) and no other complex content
1639+ // E1 leads at the edge (position 63) are handled by not including them in this check
1640+ __mmask64 non_georgian_e1 = safe_e1_mask & ~is_georgian_e1 ;
1641+ if (!non_georgian_e1 && is_georgian_e1 ) {
1642+ // All Georgian 3-byte sequences are valid (E1 82 80-BF, E1 83 80-BF)
1643+ // We only transform the uppercase subset, rest passes through
1644+
1645+ // Find uppercase positions that need transformation:
1646+ // - E1 82 A0-BF: third byte in A0-BF range (U+10A0-10BF)
1647+ // - E1 83 80-85: third byte in 80-85 range (U+10C0-10C5)
1648+ // - E1 83 87: third byte = 87 (U+10C7)
1649+ // - E1 83 8D: third byte = 8D (U+10CD)
1650+ __mmask64 third_pos_82 = is_82_at_e1 << 2 ;
1651+ __mmask64 third_pos_83 = is_83_at_e1 << 2 ;
1652+
1653+ __mmask64 is_82_uppercase = _mm512_mask_cmp_epu8_mask (
1654+ third_pos_82 & load_mask , _mm512_sub_epi8 (source_vec .zmm , _mm512_set1_epi8 ((char )0xA0 )),
1655+ _mm512_set1_epi8 (0x20 ), _MM_CMPINT_LT );
1656+ // For E1 83: check 80-85, 87, or 8D
1657+ __mmask64 is_83_range = _mm512_mask_cmp_epu8_mask (
1658+ third_pos_83 & load_mask , _mm512_sub_epi8 (source_vec .zmm , _mm512_set1_epi8 ((char )0x80 )),
1659+ _mm512_set1_epi8 (0x06 ), _MM_CMPINT_LT );
1660+ __mmask64 is_83_c7 = _mm512_mask_cmpeq_epi8_mask (
1661+ third_pos_83 & load_mask , source_vec .zmm , _mm512_set1_epi8 ((char )0x87 ));
1662+ __mmask64 is_83_cd = _mm512_mask_cmpeq_epi8_mask (
1663+ third_pos_83 & load_mask , source_vec .zmm , _mm512_set1_epi8 ((char )0x8D ));
1664+ __mmask64 is_83_uppercase = is_83_range | is_83_c7 | is_83_cd ;
1665+
1666+ // Include ASCII, ALL Georgian E1 (not just uppercase), E2 (punctuation), continuations, safe EA
1667+ // E2 is mostly caseless (punctuation, symbols), but some E2 codepoints do fold (e.g. Kelvin, Angstrom).
1668+ // NOTE(review): those are copied through UNFOLDED by this path (assumed rare in Georgian text) - confirm acceptable
1669+ // Also include C2 leads (Latin-1 Supplement: U+0080-00BF) - no case folding needed
1670+ __mmask64 is_safe_ea = is_ea_lead & ~(is_ea_complex >> 1 );
1671+ __mmask64 is_c2_lead = _mm512_cmpeq_epi8_mask (source_vec .zmm , _mm512_set1_epi8 ((char )0xC2 ));
1672+ __mmask64 is_valid_georgian_mix =
1673+ ~is_non_ascii | is_georgian_e1 | is_e2_lead | is_cont | is_safe_ea | is_c2_lead ;
1674+ // Exclude other 2-byte leads (C3-DF may need folding), 4-byte, EF
1675+ __mmask64 is_foldable_2byte = is_two_byte_lead & ~is_c2_lead ;
1676+ is_valid_georgian_mix &= ~(is_foldable_2byte | is_four_byte_lead | is_ef_lead );
1677+ sz_size_t georgian_length = sz_u64_ctz (~is_valid_georgian_mix | ~load_mask );
1678+
1679+ // Don't split multi-byte sequences (2-byte C2, 3-byte E1/E2/EA)
1680+ if (georgian_length >= 1 && georgian_length < 64 ) {
1681+ __mmask64 prefix = sz_u64_mask_until_ (georgian_length );
1682+ // Check for incomplete 3-byte sequences (leads in last 2 positions)
1683+ __mmask64 leads3_in_prefix = is_three_byte_lead & prefix ;
1684+ __mmask64 safe3_mask = georgian_length >= 3 ? sz_u64_mask_until_ (georgian_length - 2 ) : 0 ;
1685+ __mmask64 unsafe3 = leads3_in_prefix & ~safe3_mask ;
1686+ // Check for incomplete 2-byte sequences (leads in last position)
1687+ __mmask64 leads2_in_prefix = is_c2_lead & prefix ;
1688+ __mmask64 safe2_mask = georgian_length >= 2 ? sz_u64_mask_until_ (georgian_length - 1 ) : 0 ;
1689+ __mmask64 unsafe2 = leads2_in_prefix & ~safe2_mask ;
1690+ __mmask64 unsafe = unsafe3 | unsafe2 ;
1691+ if (unsafe ) georgian_length = sz_u64_ctz (unsafe );
1692+ }
1693+
1694+ if (georgian_length >= 2 ) {
1695+ __mmask64 prefix_mask = sz_u64_mask_until_ (georgian_length );
1696+
1697+ // Find uppercase leads that need transformation within prefix
1698+ __mmask64 uppercase_leads = ((is_82_uppercase | is_83_uppercase ) >> 2 ) & is_georgian_e1 ;
1699+ uppercase_leads &= prefix_mask ;
1700+
1701+ // Transform only uppercase Georgian: E1 82/83 XX → E2 B4 YY
1702+ __m512i folded = source_vec .zmm ;
1703+
1704+ // Set lead bytes to E2 where uppercase Georgian
1705+ folded = _mm512_mask_blend_epi8 (uppercase_leads , folded , _mm512_set1_epi8 ((char )0xE2 ));
1706+
1707+ // Set second bytes to B4 where uppercase Georgian
1708+ __mmask64 uppercase_second_pos = uppercase_leads << 1 ;
1709+ folded = _mm512_mask_blend_epi8 (uppercase_second_pos , folded , _mm512_set1_epi8 ((char )0xB4 ));
1710+
1711+ // Adjust third bytes for uppercase only: -0x20 for 82, +0x20 for 83
1712+ __mmask64 prefix_82_upper = is_82_uppercase & prefix_mask ;
1713+ __mmask64 prefix_83_upper = is_83_uppercase & prefix_mask ;
1714+ folded = _mm512_mask_sub_epi8 (folded , prefix_82_upper , folded , _mm512_set1_epi8 (0x20 ));
1715+ folded = _mm512_mask_add_epi8 (folded , prefix_83_upper , folded , _mm512_set1_epi8 (0x20 ));
1716+
1717+ // Also fold ASCII A-Z
1718+ __mmask64 is_upper_ascii = _mm512_cmp_epu8_mask (_mm512_sub_epi8 (source_vec .zmm , a_upper_vec ),
1719+ subtract26_vec , _MM_CMPINT_LT );
1720+ folded = _mm512_mask_add_epi8 (folded , is_upper_ascii & prefix_mask , folded , x20_vec );
1721+
1722+ _mm512_mask_storeu_epi8 (target , prefix_mask , folded );
1723+ target += georgian_length , source += georgian_length , source_length -= georgian_length ;
1724+ // DEBUG: printf("Georgian path: processed %zu bytes\n", georgian_length);
1725+ continue ;
1726+ }
1727+ }
1728+ }
1729+ // DEBUG: Add counter for when Georgian path is skipped
1730+ // static int skip_count = 0; if (is_e1_lead && ++skip_count < 10) printf("Skipped Georgian path at len=%zu\n", source_length);
1731+
15741732 // Slow path: Has 2-byte, 4-byte, or E1/E2/EF leads that need special handling
15751733 // EA with problematic second bytes (is_ea_complex) also needs special handling
15761734 // But plain EA (Hangul) is safe
@@ -1691,6 +1849,67 @@ SZ_PUBLIC sz_size_t sz_utf8_case_fold_ice(sz_cptr_t source, sz_size_t source_len
16911849 target += three_byte_length , source += three_byte_length , source_length -= three_byte_length ;
16921850 continue ;
16931851 }
1852+
1853+ // Handle Georgian uppercase (E1 82/83) → lowercase (E2 B4)
1854+ // Georgian Asomtavruli capitals: U+10A0-10C5 (E1 82 A0 - E1 83 85) → Nuskhuri U+2D00-2D25 (E2 B4 80 - E2 B4 A5)
1855+ // This transformation changes the UTF-8 lead byte from E1 to E2, requiring special handling.
1856+ __mmask64 is_82_second =
1857+ _mm512_mask_cmpeq_epi8_mask (is_e1_in_prefix , second_bytes , _mm512_set1_epi8 ((char )0x82 ));
1858+ __mmask64 is_83_second =
1859+ _mm512_mask_cmpeq_epi8_mask (is_e1_in_prefix , second_bytes , _mm512_set1_epi8 ((char )0x83 ));
1860+ __mmask64 is_georgian_second = is_82_second | is_83_second ;
1861+
1862+ if (is_georgian_second ) {
1863+ // Validate third byte range for Georgian uppercase:
1864+ // - E1 82 A0-BF: U+10A0-10BF (32 chars)
1865+ // - E1 83 80-85: U+10C0-10C5 (6 chars)
1866+ __mmask64 third_pos_82 = is_82_second << 2 ;
1867+ __mmask64 third_pos_83 = is_83_second << 2 ;
1868+
1869+ // For E1 82: third byte must be A0-BF
1870+ __mmask64 is_82_valid = _mm512_mask_cmp_epu8_mask (
1871+ third_pos_82 , _mm512_sub_epi8 (source_vec .zmm , _mm512_set1_epi8 ((char )0xA0 )),
1872+ _mm512_set1_epi8 (0x20 ), _MM_CMPINT_LT );
1873+
1874+ // For E1 83: third byte must be 80-85
1875+ __mmask64 is_83_valid = _mm512_mask_cmp_epu8_mask (
1876+ third_pos_83 , _mm512_sub_epi8 (source_vec .zmm , _mm512_set1_epi8 ((char )0x80 )),
1877+ _mm512_set1_epi8 (0x06 ), _MM_CMPINT_LT );
1878+
1879+ __mmask64 georgian_leads = ((is_82_valid | is_83_valid ) >> 2 ) & is_e1_in_prefix ;
1880+
1881+ if (georgian_leads ) {
1882+ // Transform: E1 82/83 XX → E2 B4 YY
1883+ // For 82: YY = XX - 0x20 (A0-BF → 80-9F)
1884+ // For 83: YY = XX + 0x20 (80-85 → A0-A5)
1885+
1886+ // Start with source, then apply transformations
1887+ __m512i folded = source_vec .zmm ;
1888+
1889+ // Set lead bytes to E2 where Georgian
1890+ folded = _mm512_mask_blend_epi8 (georgian_leads , folded , _mm512_set1_epi8 ((char )0xE2 ));
1891+
1892+ // Set second bytes to B4 where Georgian
1893+ __mmask64 georgian_second_pos = georgian_leads << 1 ;
1894+ folded =
1895+ _mm512_mask_blend_epi8 (georgian_second_pos , folded , _mm512_set1_epi8 ((char )0xB4 ));
1896+
1897+ // Adjust third bytes based on original second byte
1898+ // -0x20 for sequences that had 82, +0x20 for sequences that had 83
1899+ folded = _mm512_mask_sub_epi8 (folded , is_82_valid , folded , _mm512_set1_epi8 (0x20 ));
1900+ folded = _mm512_mask_add_epi8 (folded , is_83_valid , folded , _mm512_set1_epi8 (0x20 ));
1901+
1902+ // Also fold any ASCII A-Z that might be mixed in
1903+ __mmask64 is_upper_ascii = _mm512_cmp_epu8_mask (
1904+ _mm512_sub_epi8 (source_vec .zmm , a_upper_vec ), subtract26_vec , _MM_CMPINT_LT );
1905+ folded = _mm512_mask_add_epi8 (folded , is_upper_ascii & prefix_mask_3 , folded , x20_vec );
1906+
1907+ _mm512_mask_storeu_epi8 (target , prefix_mask_3 , folded );
1908+ target += three_byte_length , source += three_byte_length ,
1909+ source_length -= three_byte_length ;
1910+ continue ;
1911+ }
1912+ }
16941913 }
16951914
16961915 // Handle EF leads - check for Fullwidth A-Z (FF21-FF3A = EF BC A1 - EF BC BA)
0 commit comments