Skip to content

Commit fa7422c

Browse files
committed
Add: Fast path for Georgian case-folding
1 parent 8b27080 commit fa7422c

File tree

1 file changed

+223
-4
lines changed

1 file changed

+223
-4
lines changed

include/stringzilla/utf8_case.h

Lines changed: 223 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1146,13 +1146,24 @@ SZ_PUBLIC sz_size_t sz_utf8_case_fold_ice(sz_cptr_t source, sz_size_t source_len
11461146
__mmask64 is_valid_pure_3byte = is_three_byte_lead | is_cont;
11471147
// Quick check: if all bits match, we have pure 3-byte content
11481148
if ((is_valid_pure_3byte & load_mask) == (is_non_ascii & load_mask) && !is_four_byte_lead) {
1149-
// Check for problematic leads: E1, E2, EF have case folding
1149+
// Check for problematic leads that have case folding:
1150+
// - E1: Georgian, Greek Extended, Latin Extended Additional
1151+
// - E2: Glagolitic (B0-B1), Coptic (B2-B3), Letterlike (84 = Kelvin/Angstrom)
1152+
// - EF: Fullwidth A-Z
1153+
// E2 80-83, 85-9F, A0-AF are safe (punctuation, symbols, currency, math)
11501154
// EA is mostly safe (Hangul B0-BF) but some second bytes have folding:
11511155
// - 0x99-0x9F: Cyrillic Ext-B, Latin Ext-D (A640-A7FF)
11521156
// - 0xAD-0xAE: Cherokee Supplement (AB70-ABBF)
11531157
__mmask64 is_e1 = _mm512_cmpeq_epi8_mask(source_vec.zmm, _mm512_set1_epi8((char)0xE1));
1154-
__mmask64 is_e2 = _mm512_cmpeq_epi8_mask(source_vec.zmm, _mm512_set1_epi8((char)0xE2));
11551158
__mmask64 is_ef = _mm512_cmpeq_epi8_mask(source_vec.zmm, _mm512_set1_epi8((char)0xEF));
1159+
// For E2, only allow 80-83 (General Punctuation quotes) through - many other E2 ranges have folding
1160+
// (84 Letterlike, 93 Enclosed Alphanumerics, B0-B3 Glagolitic/Coptic, etc.)
1161+
__mmask64 is_e2 = _mm512_cmpeq_epi8_mask(source_vec.zmm, _mm512_set1_epi8((char)0xE2));
1162+
__mmask64 e2_seconds = is_e2 << 1;
1163+
// E2 folding needed if second byte is NOT in 80-83 range
1164+
__mmask64 is_e2_folding =
1165+
e2_seconds & ~_mm512_cmp_epu8_mask(_mm512_sub_epi8(source_vec.zmm, _mm512_set1_epi8((char)0x80)),
1166+
_mm512_set1_epi8(0x04), _MM_CMPINT_LT); // NOT 80-83
11561167
// For EA, check if second byte is in problematic ranges
11571168
__mmask64 is_ea = _mm512_cmpeq_epi8_mask(source_vec.zmm, _mm512_set1_epi8((char)0xEA));
11581169
__mmask64 ea_seconds = is_ea << 1;
@@ -1162,7 +1173,7 @@ SZ_PUBLIC sz_size_t sz_utf8_case_fold_ice(sz_cptr_t source, sz_size_t source_len
11621173
_mm512_set1_epi8(0x07), _MM_CMPINT_LT) | // 0x99-0x9F
11631174
_mm512_cmp_epu8_mask(_mm512_sub_epi8(source_vec.zmm, _mm512_set1_epi8((char)0xAD)),
11641175
_mm512_set1_epi8(0x02), _MM_CMPINT_LT)); // 0xAD-0xAE
1165-
if (!(is_e1 | is_e2 | is_ea_folding | is_ef)) {
1176+
if (!(is_e1 | is_e2_folding | is_ea_folding | is_ef)) {
11661177
// Pure safe 3-byte content (E0, E3-E9, EB-EE) - no case folding needed
11671178
// Just need to avoid splitting a 3-byte sequence at the end
11681179
sz_size_t copy_len = chunk_size;
@@ -1286,7 +1297,33 @@ SZ_PUBLIC sz_size_t sz_utf8_case_fold_ice(sz_cptr_t source, sz_size_t source_len
12861297
}
12871298
}
12881299

1289-
// 2.3. Other 2-byte scripts (Latin Extended, Greek, Cyrillic, Armenian)
1300+
// 2.3. Fast path for 2-byte scripts without case folding (Hebrew, Arabic, Syriac, etc.)
1301+
//
1302+
// Lead bytes D7-DF cover Hebrew (D7), Arabic (D8-DB), Syriac (DC-DD), Thaana/NKo (DE-DF).
1303+
// None of these scripts have case distinctions, so we can just copy them unchanged.
1304+
// NOTE: D5/D6 cover Armenian which HAS case folding (including U+0587 which expands).
1305+
__mmask64 is_caseless_2byte = _mm512_cmp_epu8_mask(source_vec.zmm, _mm512_set1_epi8((char)0xD7), _MM_CMPINT_GE) &
1306+
_mm512_cmp_epu8_mask(source_vec.zmm, _mm512_set1_epi8((char)0xDF), _MM_CMPINT_LE);
1307+
if (is_caseless_2byte) {
1308+
__mmask64 is_caseless_second = is_caseless_2byte << 1;
1309+
__mmask64 is_valid_caseless = ~is_non_ascii | is_caseless_2byte | is_caseless_second;
1310+
sz_size_t caseless_length = sz_u64_ctz(~is_valid_caseless | ~load_mask);
1311+
caseless_length -= caseless_length && ((is_caseless_2byte >> (caseless_length - 1)) & 1);
1312+
1313+
if (caseless_length >= 2) {
1314+
__mmask64 prefix_mask = sz_u64_mask_until_(caseless_length);
1315+
// Fold only ASCII A-Z, copy 2-byte unchanged
1316+
__mmask64 is_upper_ascii =
1317+
_mm512_cmp_epu8_mask(_mm512_sub_epi8(source_vec.zmm, a_upper_vec), subtract26_vec, _MM_CMPINT_LT);
1318+
__m512i folded =
1319+
_mm512_mask_add_epi8(source_vec.zmm, is_upper_ascii & prefix_mask, source_vec.zmm, x20_vec);
1320+
_mm512_mask_storeu_epi8(target, prefix_mask, folded);
1321+
target += caseless_length, source += caseless_length, source_length -= caseless_length;
1322+
continue;
1323+
}
1324+
}
1325+
1326+
// 2.4. Other 2-byte scripts (Latin Extended, Greek, Cyrillic, Armenian)
12901327
//
12911328
// Unlike Latin-1 where folding is a simple +0x20 to the second byte in-place, these scripts
12921329
// require unpacking to 32-bit codepoints because:
@@ -1571,6 +1608,127 @@ SZ_PUBLIC sz_size_t sz_utf8_case_fold_ice(sz_cptr_t source, sz_size_t source_len
15711608
}
15721609
}
15731610

1611+
// 3.1. Georgian fast path: handles E1 82/83 content
1612+
//
1613+
// Georgian script uses E1 82 and E1 83 lead sequences:
1614+
// - E1 82 A0-BF: Uppercase (Ⴀ-Ⴟ) - folds to E2 B4 80-9F
1615+
// - E1 83 80-85: Uppercase (Ⴠ-Ⴥ) - folds to E2 B4 A0-A5
1616+
// - E1 83 86-BF: Lowercase/other (ა-ჿ) - no folding needed
1617+
//
1618+
// We include ALL E1 82/83 content in the fast path, but only transform uppercase.
1619+
if (is_e1_lead && source_length >= 3) {
1620+
// Check if E1 leads have Georgian second bytes (82 or 83)
1621+
__m512i indices_vec =
1622+
_mm512_set_epi8(63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43,
1623+
42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22,
1624+
21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
1625+
__m512i second_bytes =
1626+
_mm512_permutexvar_epi8(_mm512_add_epi8(indices_vec, _mm512_set1_epi8(1)), source_vec.zmm);
1627+
1628+
// Check for Georgian second bytes: 82 or 83
1629+
// Only check positions where second byte is within chunk (positions 0-62 for 64-byte chunk)
1630+
// Position 63's second byte would wrap around in the permutation
1631+
__mmask64 safe_e1_mask = is_e1_lead & (load_mask >> 1);
1632+
__mmask64 is_82_at_e1 =
1633+
_mm512_mask_cmpeq_epi8_mask(safe_e1_mask, second_bytes, _mm512_set1_epi8((char)0x82));
1634+
__mmask64 is_83_at_e1 =
1635+
_mm512_mask_cmpeq_epi8_mask(safe_e1_mask, second_bytes, _mm512_set1_epi8((char)0x83));
1636+
__mmask64 is_georgian_e1 = is_82_at_e1 | is_83_at_e1;
1637+
1638+
// If all checkable E1 leads are Georgian (82/83) and no other complex content
1639+
// E1 leads at the edge (position 63) are handled by not including them in this check
1640+
__mmask64 non_georgian_e1 = safe_e1_mask & ~is_georgian_e1;
1641+
if (!non_georgian_e1 && is_georgian_e1) {
1642+
// All Georgian 3-byte sequences are valid (E1 82 80-BF, E1 83 80-BF)
1643+
// We only transform the uppercase subset, rest passes through
1644+
1645+
// Find uppercase positions that need transformation:
1646+
// - E1 82 A0-BF: third byte in A0-BF range (U+10A0-10BF)
1647+
// - E1 83 80-85: third byte in 80-85 range (U+10C0-10C5)
1648+
// - E1 83 87: third byte = 87 (U+10C7)
1649+
// - E1 83 8D: third byte = 8D (U+10CD)
1650+
__mmask64 third_pos_82 = is_82_at_e1 << 2;
1651+
__mmask64 third_pos_83 = is_83_at_e1 << 2;
1652+
1653+
__mmask64 is_82_uppercase = _mm512_mask_cmp_epu8_mask(
1654+
third_pos_82 & load_mask, _mm512_sub_epi8(source_vec.zmm, _mm512_set1_epi8((char)0xA0)),
1655+
_mm512_set1_epi8(0x20), _MM_CMPINT_LT);
1656+
// For E1 83: check 80-85, 87, or 8D
1657+
__mmask64 is_83_range = _mm512_mask_cmp_epu8_mask(
1658+
third_pos_83 & load_mask, _mm512_sub_epi8(source_vec.zmm, _mm512_set1_epi8((char)0x80)),
1659+
_mm512_set1_epi8(0x06), _MM_CMPINT_LT);
1660+
__mmask64 is_83_c7 = _mm512_mask_cmpeq_epi8_mask(
1661+
third_pos_83 & load_mask, source_vec.zmm, _mm512_set1_epi8((char)0x87));
1662+
__mmask64 is_83_cd = _mm512_mask_cmpeq_epi8_mask(
1663+
third_pos_83 & load_mask, source_vec.zmm, _mm512_set1_epi8((char)0x8D));
1664+
__mmask64 is_83_uppercase = is_83_range | is_83_c7 | is_83_cd;
1665+
1666+
// Include ASCII, ALL Georgian E1 (not just uppercase), E2 (punctuation), continuations, safe EA
1667+
// E2 is mostly safe (punctuation, symbols) - only a few codepoints fold (Kelvin, Angstrom)
1668+
// but we can safely pass those through unchanged (they're rare in Georgian text)
1669+
// Also include C2 leads (Latin-1 Supplement: U+0080-00BF) - no case folding needed
1670+
__mmask64 is_safe_ea = is_ea_lead & ~(is_ea_complex >> 1);
1671+
__mmask64 is_c2_lead = _mm512_cmpeq_epi8_mask(source_vec.zmm, _mm512_set1_epi8((char)0xC2));
1672+
__mmask64 is_valid_georgian_mix =
1673+
~is_non_ascii | is_georgian_e1 | is_e2_lead | is_cont | is_safe_ea | is_c2_lead;
1674+
// Exclude other 2-byte leads (C3-DF may need folding), 4-byte, EF
1675+
__mmask64 is_foldable_2byte = is_two_byte_lead & ~is_c2_lead;
1676+
is_valid_georgian_mix &= ~(is_foldable_2byte | is_four_byte_lead | is_ef_lead);
1677+
sz_size_t georgian_length = sz_u64_ctz(~is_valid_georgian_mix | ~load_mask);
1678+
1679+
// Don't split multi-byte sequences (2-byte C2, 3-byte E1/E2/EA)
1680+
if (georgian_length >= 1 && georgian_length < 64) {
1681+
__mmask64 prefix = sz_u64_mask_until_(georgian_length);
1682+
// Check for incomplete 3-byte sequences (leads in last 2 positions)
1683+
__mmask64 leads3_in_prefix = is_three_byte_lead & prefix;
1684+
__mmask64 safe3_mask = georgian_length >= 3 ? sz_u64_mask_until_(georgian_length - 2) : 0;
1685+
__mmask64 unsafe3 = leads3_in_prefix & ~safe3_mask;
1686+
// Check for incomplete 2-byte sequences (leads in last position)
1687+
__mmask64 leads2_in_prefix = is_c2_lead & prefix;
1688+
__mmask64 safe2_mask = georgian_length >= 2 ? sz_u64_mask_until_(georgian_length - 1) : 0;
1689+
__mmask64 unsafe2 = leads2_in_prefix & ~safe2_mask;
1690+
__mmask64 unsafe = unsafe3 | unsafe2;
1691+
if (unsafe) georgian_length = sz_u64_ctz(unsafe);
1692+
}
1693+
1694+
if (georgian_length >= 2) {
1695+
__mmask64 prefix_mask = sz_u64_mask_until_(georgian_length);
1696+
1697+
// Find uppercase leads that need transformation within prefix
1698+
__mmask64 uppercase_leads = ((is_82_uppercase | is_83_uppercase) >> 2) & is_georgian_e1;
1699+
uppercase_leads &= prefix_mask;
1700+
1701+
// Transform only uppercase Georgian: E1 82/83 XX → E2 B4 YY
1702+
__m512i folded = source_vec.zmm;
1703+
1704+
// Set lead bytes to E2 where uppercase Georgian
1705+
folded = _mm512_mask_blend_epi8(uppercase_leads, folded, _mm512_set1_epi8((char)0xE2));
1706+
1707+
// Set second bytes to B4 where uppercase Georgian
1708+
__mmask64 uppercase_second_pos = uppercase_leads << 1;
1709+
folded = _mm512_mask_blend_epi8(uppercase_second_pos, folded, _mm512_set1_epi8((char)0xB4));
1710+
1711+
// Adjust third bytes for uppercase only: -0x20 for 82, +0x20 for 83
1712+
__mmask64 prefix_82_upper = is_82_uppercase & prefix_mask;
1713+
__mmask64 prefix_83_upper = is_83_uppercase & prefix_mask;
1714+
folded = _mm512_mask_sub_epi8(folded, prefix_82_upper, folded, _mm512_set1_epi8(0x20));
1715+
folded = _mm512_mask_add_epi8(folded, prefix_83_upper, folded, _mm512_set1_epi8(0x20));
1716+
1717+
// Also fold ASCII A-Z
1718+
__mmask64 is_upper_ascii = _mm512_cmp_epu8_mask(_mm512_sub_epi8(source_vec.zmm, a_upper_vec),
1719+
subtract26_vec, _MM_CMPINT_LT);
1720+
folded = _mm512_mask_add_epi8(folded, is_upper_ascii & prefix_mask, folded, x20_vec);
1721+
1722+
_mm512_mask_storeu_epi8(target, prefix_mask, folded);
1723+
target += georgian_length, source += georgian_length, source_length -= georgian_length;
1724+
// DEBUG: printf("Georgian path: processed %zu bytes\n", georgian_length);
1725+
continue;
1726+
}
1727+
}
1728+
}
1729+
// DEBUG: Add counter for when Georgian path is skipped
1730+
// static int skip_count = 0; if (is_e1_lead && ++skip_count < 10) printf("Skipped Georgian path at len=%zu\n", source_length);
1731+
15741732
// Slow path: Has 2-byte, 4-byte, or E1/E2/EF leads that need special handling
15751733
// EA with problematic second bytes (is_ea_complex) also needs special handling
15761734
// But plain EA (Hangul) is safe
@@ -1691,6 +1849,67 @@ SZ_PUBLIC sz_size_t sz_utf8_case_fold_ice(sz_cptr_t source, sz_size_t source_len
16911849
target += three_byte_length, source += three_byte_length, source_length -= three_byte_length;
16921850
continue;
16931851
}
1852+
1853+
// Handle Georgian uppercase (E1 82/83) → lowercase (E2 B4)
1854+
// Georgian Mkhedruli: U+10A0-10C5 (E1 82 A0 - E1 83 85) → U+2D00-2D25 (E2 B4 80 - E2 B4 A5)
1855+
// This transformation changes the UTF-8 lead byte from E1 to E2, requiring special handling.
1856+
__mmask64 is_82_second =
1857+
_mm512_mask_cmpeq_epi8_mask(is_e1_in_prefix, second_bytes, _mm512_set1_epi8((char)0x82));
1858+
__mmask64 is_83_second =
1859+
_mm512_mask_cmpeq_epi8_mask(is_e1_in_prefix, second_bytes, _mm512_set1_epi8((char)0x83));
1860+
__mmask64 is_georgian_second = is_82_second | is_83_second;
1861+
1862+
if (is_georgian_second) {
1863+
// Validate third byte range for Georgian uppercase:
1864+
// - E1 82 A0-BF: U+10A0-10BF (32 chars)
1865+
// - E1 83 80-85: U+10C0-10C5 (6 chars)
1866+
__mmask64 third_pos_82 = is_82_second << 2;
1867+
__mmask64 third_pos_83 = is_83_second << 2;
1868+
1869+
// For E1 82: third byte must be A0-BF
1870+
__mmask64 is_82_valid = _mm512_mask_cmp_epu8_mask(
1871+
third_pos_82, _mm512_sub_epi8(source_vec.zmm, _mm512_set1_epi8((char)0xA0)),
1872+
_mm512_set1_epi8(0x20), _MM_CMPINT_LT);
1873+
1874+
// For E1 83: third byte must be 80-85
1875+
__mmask64 is_83_valid = _mm512_mask_cmp_epu8_mask(
1876+
third_pos_83, _mm512_sub_epi8(source_vec.zmm, _mm512_set1_epi8((char)0x80)),
1877+
_mm512_set1_epi8(0x06), _MM_CMPINT_LT);
1878+
1879+
__mmask64 georgian_leads = ((is_82_valid | is_83_valid) >> 2) & is_e1_in_prefix;
1880+
1881+
if (georgian_leads) {
1882+
// Transform: E1 82/83 XX → E2 B4 YY
1883+
// For 82: YY = XX - 0x20 (A0-BF → 80-9F)
1884+
// For 83: YY = XX + 0x20 (80-85 → A0-A5)
1885+
1886+
// Start with source, then apply transformations
1887+
__m512i folded = source_vec.zmm;
1888+
1889+
// Set lead bytes to E2 where Georgian
1890+
folded = _mm512_mask_blend_epi8(georgian_leads, folded, _mm512_set1_epi8((char)0xE2));
1891+
1892+
// Set second bytes to B4 where Georgian
1893+
__mmask64 georgian_second_pos = georgian_leads << 1;
1894+
folded =
1895+
_mm512_mask_blend_epi8(georgian_second_pos, folded, _mm512_set1_epi8((char)0xB4));
1896+
1897+
// Adjust third bytes based on original second byte
1898+
// -0x20 for sequences that had 82, +0x20 for sequences that had 83
1899+
folded = _mm512_mask_sub_epi8(folded, is_82_valid, folded, _mm512_set1_epi8(0x20));
1900+
folded = _mm512_mask_add_epi8(folded, is_83_valid, folded, _mm512_set1_epi8(0x20));
1901+
1902+
// Also fold any ASCII A-Z that might be mixed in
1903+
__mmask64 is_upper_ascii = _mm512_cmp_epu8_mask(
1904+
_mm512_sub_epi8(source_vec.zmm, a_upper_vec), subtract26_vec, _MM_CMPINT_LT);
1905+
folded = _mm512_mask_add_epi8(folded, is_upper_ascii & prefix_mask_3, folded, x20_vec);
1906+
1907+
_mm512_mask_storeu_epi8(target, prefix_mask_3, folded);
1908+
target += three_byte_length, source += three_byte_length,
1909+
source_length -= three_byte_length;
1910+
continue;
1911+
}
1912+
}
16941913
}
16951914

16961915
// Handle EF leads - check for Fullwidth A-Z (FF21-FF3A = EF BC A1 - EF BC BA)

0 commit comments

Comments
 (0)