@@ -1471,18 +1471,45 @@ SZ_PUBLIC sz_cptr_t sz_utf8_find_whitespace_neon(sz_cptr_t text, sz_size_t lengt
14711471 uint8x16_t range_cmp = vandq_u8 (vcgeq_u8 (text_vec .u8x16 , t_vec ), vcleq_u8 (text_vec .u8x16 , r_vec ));
14721472 uint8x16_t one_vec = vorrq_u8 (x20_cmp , range_cmp );
14731473
1474+ // 2-byte and 3-byte prefix indicators
1475+ uint8x16_t xc2_cmp = vandq_u8 (vceqq_u8 (text_vec .u8x16 , xc2_vec ), drop1_vec );
1476+ uint8x16_t xe1_cmp = vandq_u8 (vceqq_u8 (text_vec .u8x16 , xe1_vec ), drop2_vec );
1477+ uint8x16_t xe2_cmp = vandq_u8 (vceqq_u8 (text_vec .u8x16 , xe2_vec ), drop2_vec );
1478+ uint8x16_t xe3_cmp = vandq_u8 (vceqq_u8 (text_vec .u8x16 , xe3_vec ), drop2_vec );
1479+ uint8x16_t prefix_vec = vorrq_u8 (vorrq_u8 (xc2_cmp , xe1_cmp ), vorrq_u8 (xe2_cmp , xe3_cmp ));
1480+
1481+ sz_u64_t one_mask = sz_utf8_vreinterpretq_u8_u4_ (one_vec );
1482+ sz_u64_t prefix_mask = sz_utf8_vreinterpretq_u8_u4_ (prefix_vec );
1483+
1484+ // Check for fast path: one-byte match before any prefix
1485+ if (one_mask ) {
1486+ if (prefix_mask ) {
1487+ int first_one_byte_offset = sz_u64_ctz (one_mask );
1488+ int first_prefix_offset = sz_u64_ctz (prefix_mask );
1489+ if (first_one_byte_offset < first_prefix_offset ) {
1490+ * matched_length = 1 ;
1491+ return text + (first_one_byte_offset / 4 );
1492+ }
1493+ }
1494+ else {
1495+ int first_one_byte_offset = sz_u64_ctz (one_mask );
1496+ * matched_length = 1 ;
1497+ return text + (first_one_byte_offset / 4 );
1498+ }
1499+ }
1500+ else if (!prefix_mask ) {
1501+ text += 14 ;
1502+ length -= 14 ;
1503+ continue ;
1504+ }
1505+
14741506 // 2-byte matches
14751507 uint8x16_t text1 = vextq_u8 (text_vec .u8x16 , text_vec .u8x16 , 1 );
1476- uint8x16_t xc2_cmp = vceqq_u8 (text_vec .u8x16 , xc2_vec );
14771508 uint8x16_t two_vec =
14781509 vorrq_u8 (vandq_u8 (xc2_cmp , vceqq_u8 (text1 , x85_vec )), vandq_u8 (xc2_cmp , vceqq_u8 (text1 , xa0_vec )));
1479- two_vec = vandq_u8 (two_vec , drop1_vec ); // Ignore last split match
14801510
14811511 // 3-byte matches
14821512 uint8x16_t text2 = vextq_u8 (text_vec .u8x16 , text_vec .u8x16 , 2 );
1483- uint8x16_t xe1_cmp = vceqq_u8 (text_vec .u8x16 , xe1_vec );
1484- uint8x16_t xe2_cmp = vceqq_u8 (text_vec .u8x16 , xe2_vec );
1485- uint8x16_t xe3_cmp = vceqq_u8 (text_vec .u8x16 , xe3_vec );
14861513 uint8x16_t x80_ge_cmp = vcgeq_u8 (text2 , x80_vec );
14871514 uint8x16_t x8d_le_cmp = vcleq_u8 (text2 , x8d_vec );
14881515
@@ -1496,16 +1523,12 @@ SZ_PUBLIC sz_cptr_t sz_utf8_find_whitespace_neon(sz_cptr_t text, sz_size_t lengt
14961523 uint8x16_t ideographic_vec = vandq_u8 (xe3_cmp , vandq_u8 (vceqq_u8 (text1 , x80_vec ), vceqq_u8 (text2 , x80_vec )));
14971524 uint8x16_t three_vec = vorrq_u8 (vorrq_u8 (vorrq_u8 (ogham_vec , range_e280_vec ), vorrq_u8 (u2028_vec , u2029_vec )),
14981525 vorrq_u8 (vorrq_u8 (u202f_vec , u205f_vec ), ideographic_vec ));
1499- three_vec = vandq_u8 (three_vec , drop2_vec ); // Ignore last two split matches
15001526
1501- uint8x16_t combined_vec = vorrq_u8 (one_vec , vorrq_u8 (two_vec , three_vec ));
1502- if (vmaxvq_u8 (combined_vec )) {
1503- // Late mask extraction only when a match exists
1504- sz_u64_t one_mask = sz_utf8_vreinterpretq_u8_u4_ (one_vec );
1505- sz_u64_t two_mask = sz_utf8_vreinterpretq_u8_u4_ (two_vec );
1506- sz_u64_t three_mask = sz_utf8_vreinterpretq_u8_u4_ (three_vec );
1507- sz_u64_t combined_mask = one_mask | two_mask | three_mask ;
1527+ sz_u64_t two_mask = sz_utf8_vreinterpretq_u8_u4_ (two_vec );
1528+ sz_u64_t three_mask = sz_utf8_vreinterpretq_u8_u4_ (three_vec );
1529+ sz_u64_t combined_mask = one_mask | two_mask | three_mask ;
15081530
1531+ if (combined_mask ) {
15091532 int bit_index = sz_u64_ctz (combined_mask );
15101533 sz_u64_t first_match_mask = (sz_u64_t )1 << bit_index ;
15111534 sz_size_t length_value = 1 ;
0 commit comments