Skip to content

Commit 73da441

Browse files
committed
Improve: Fast path for 1-byte whitespace in NEON
Benchmarking `sz_utf8_find_whitespace_serial`:
> Throughput: 264.52 MiB/s @ 15.48 s/call
Benchmarking `sz_utf8_find_whitespace_neon`:
> Throughput: 4.98 GiB/s @ 803.59 ms/call
> + 19.3 x against `sz_utf8_find_whitespace_serial`
Benchmarking `sz_utf8_find_newline_serial`:
> Throughput: 6.36 GiB/s @ 628.70 ms/call
Benchmarking `sz_utf8_find_newline_neon`:
> Throughput: 10.15 GiB/s @ 394.23 ms/call
> + 59.5 % against `sz_utf8_find_newline_serial`
1 parent 0259f58 commit 73da441

File tree

1 file changed

+36
-13
lines changed

1 file changed

+36
-13
lines changed

include/stringzilla/utf8.h

Lines changed: 36 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1471,18 +1471,45 @@ SZ_PUBLIC sz_cptr_t sz_utf8_find_whitespace_neon(sz_cptr_t text, sz_size_t lengt
14711471
uint8x16_t range_cmp = vandq_u8(vcgeq_u8(text_vec.u8x16, t_vec), vcleq_u8(text_vec.u8x16, r_vec));
14721472
uint8x16_t one_vec = vorrq_u8(x20_cmp, range_cmp);
14731473

1474+
// 2-byte and 3-byte prefix indicators
1475+
uint8x16_t xc2_cmp = vandq_u8(vceqq_u8(text_vec.u8x16, xc2_vec), drop1_vec);
1476+
uint8x16_t xe1_cmp = vandq_u8(vceqq_u8(text_vec.u8x16, xe1_vec), drop2_vec);
1477+
uint8x16_t xe2_cmp = vandq_u8(vceqq_u8(text_vec.u8x16, xe2_vec), drop2_vec);
1478+
uint8x16_t xe3_cmp = vandq_u8(vceqq_u8(text_vec.u8x16, xe3_vec), drop2_vec);
1479+
uint8x16_t prefix_vec = vorrq_u8(vorrq_u8(xc2_cmp, xe1_cmp), vorrq_u8(xe2_cmp, xe3_cmp));
1480+
1481+
sz_u64_t one_mask = sz_utf8_vreinterpretq_u8_u4_(one_vec);
1482+
sz_u64_t prefix_mask = sz_utf8_vreinterpretq_u8_u4_(prefix_vec);
1483+
1484+
// Check for fast path: one-byte match before any prefix
1485+
if (one_mask) {
1486+
if (prefix_mask) {
1487+
int first_one_byte_offset = sz_u64_ctz(one_mask);
1488+
int first_prefix_offset = sz_u64_ctz(prefix_mask);
1489+
if (first_one_byte_offset < first_prefix_offset) {
1490+
*matched_length = 1;
1491+
return text + (first_one_byte_offset / 4);
1492+
}
1493+
}
1494+
else {
1495+
int first_one_byte_offset = sz_u64_ctz(one_mask);
1496+
*matched_length = 1;
1497+
return text + (first_one_byte_offset / 4);
1498+
}
1499+
}
1500+
else if (!prefix_mask) {
1501+
text += 14;
1502+
length -= 14;
1503+
continue;
1504+
}
1505+
14741506
// 2-byte matches
14751507
uint8x16_t text1 = vextq_u8(text_vec.u8x16, text_vec.u8x16, 1);
1476-
uint8x16_t xc2_cmp = vceqq_u8(text_vec.u8x16, xc2_vec);
14771508
uint8x16_t two_vec =
14781509
vorrq_u8(vandq_u8(xc2_cmp, vceqq_u8(text1, x85_vec)), vandq_u8(xc2_cmp, vceqq_u8(text1, xa0_vec)));
1479-
two_vec = vandq_u8(two_vec, drop1_vec); // Ignore last split match
14801510

14811511
// 3-byte matches
14821512
uint8x16_t text2 = vextq_u8(text_vec.u8x16, text_vec.u8x16, 2);
1483-
uint8x16_t xe1_cmp = vceqq_u8(text_vec.u8x16, xe1_vec);
1484-
uint8x16_t xe2_cmp = vceqq_u8(text_vec.u8x16, xe2_vec);
1485-
uint8x16_t xe3_cmp = vceqq_u8(text_vec.u8x16, xe3_vec);
14861513
uint8x16_t x80_ge_cmp = vcgeq_u8(text2, x80_vec);
14871514
uint8x16_t x8d_le_cmp = vcleq_u8(text2, x8d_vec);
14881515

@@ -1496,16 +1523,12 @@ SZ_PUBLIC sz_cptr_t sz_utf8_find_whitespace_neon(sz_cptr_t text, sz_size_t lengt
14961523
uint8x16_t ideographic_vec = vandq_u8(xe3_cmp, vandq_u8(vceqq_u8(text1, x80_vec), vceqq_u8(text2, x80_vec)));
14971524
uint8x16_t three_vec = vorrq_u8(vorrq_u8(vorrq_u8(ogham_vec, range_e280_vec), vorrq_u8(u2028_vec, u2029_vec)),
14981525
vorrq_u8(vorrq_u8(u202f_vec, u205f_vec), ideographic_vec));
1499-
three_vec = vandq_u8(three_vec, drop2_vec); // Ignore last two split matches
15001526

1501-
uint8x16_t combined_vec = vorrq_u8(one_vec, vorrq_u8(two_vec, three_vec));
1502-
if (vmaxvq_u8(combined_vec)) {
1503-
// Late mask extraction only when a match exists
1504-
sz_u64_t one_mask = sz_utf8_vreinterpretq_u8_u4_(one_vec);
1505-
sz_u64_t two_mask = sz_utf8_vreinterpretq_u8_u4_(two_vec);
1506-
sz_u64_t three_mask = sz_utf8_vreinterpretq_u8_u4_(three_vec);
1507-
sz_u64_t combined_mask = one_mask | two_mask | three_mask;
1527+
sz_u64_t two_mask = sz_utf8_vreinterpretq_u8_u4_(two_vec);
1528+
sz_u64_t three_mask = sz_utf8_vreinterpretq_u8_u4_(three_vec);
1529+
sz_u64_t combined_mask = one_mask | two_mask | three_mask;
15081530

1531+
if (combined_mask) {
15091532
int bit_index = sz_u64_ctz(combined_mask);
15101533
sz_u64_t first_match_mask = (sz_u64_t)1 << bit_index;
15111534
sz_size_t length_value = 1;

0 commit comments

Comments
 (0)