Skip to content

Commit 607dd14

Browse files
Add: Big-endian SWAR backends
Enhances performance for 2-, 3-, and 4-byte substring searches on big-endian architectures by implementing endianness-aware SWAR techniques. This bridges the performance gap between little- and big-endian systems by making the optimized SWAR functions work on both architectures, eliminating the need for separate fallback paths to the slower Horspool algorithms.

Co-authored-by: Semyon Danilov <samvimes@yandex.ru>
Co-authored-by: Semyon Danilov <4058545+SammyVimes@users.noreply.github.com>
1 parent c2e384a commit 607dd14

File tree

1 file changed

+46
-11
lines changed

1 file changed

+46
-11
lines changed

include/stringzilla/find.h

Lines changed: 46 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -439,16 +439,29 @@ SZ_INTERNAL sz_cptr_t sz_find_2byte_serial_(sz_cptr_t h, sz_size_t h_length, sz_
439439

440440
// This code simulates hyper-scalar execution, analyzing 8 offsets at a time.
441441
for (; h + 9 <= h_end; h += 8) {
442+
#if !SZ_IS_BIG_ENDIAN_
442443
h_even_vec.u64 = *(sz_u64_t *)h;
443444
h_odd_vec.u64 = (h_even_vec.u64 >> 8) | ((sz_u64_t)h[8] << 56);
445+
#else
446+
h_even_vec.u64 = *(sz_u64_t *)h;
447+
h_odd_vec.u64 = (h_even_vec.u64 << 8) | ((sz_u64_t)h[8] >> 56);
448+
#endif
444449
matches_even_vec = sz_u64_each_2byte_equal_(h_even_vec, n_vec);
445450
matches_odd_vec = sz_u64_each_2byte_equal_(h_odd_vec, n_vec);
446451

452+
#if !SZ_IS_BIG_ENDIAN_
447453
matches_even_vec.u64 >>= 8;
448454
if (matches_even_vec.u64 + matches_odd_vec.u64) {
449455
sz_u64_t match_indicators = matches_even_vec.u64 | matches_odd_vec.u64;
450456
return h + sz_u64_ctz(match_indicators) / 8;
451457
}
458+
#else
459+
matches_even_vec.u64 <<= 8;
460+
if (matches_even_vec.u64 + matches_odd_vec.u64) {
461+
sz_u64_t match_indicators = matches_even_vec.u64 | matches_odd_vec.u64;
462+
return h + sz_u64_clz(match_indicators) / 8;
463+
}
464+
#endif
452465
}
453466

454467
for (; h + 2 <= h_end; ++h)
@@ -498,21 +511,36 @@ SZ_INTERNAL sz_cptr_t sz_find_4byte_serial_(sz_cptr_t h, sz_size_t h_length, sz_
498511
for (; h + sizeof(sz_u64_t) + sizeof(sz_u32_t) <= h_end; h += sizeof(sz_u64_t)) {
499512
h_page_current = *(sz_u64_t *)h;
500513
h_page_next = *(sz_u32_t *)(h + 8);
514+
#if !SZ_IS_BIG_ENDIAN_
501515
h0_vec.u64 = (h_page_current);
502516
h1_vec.u64 = (h_page_current >> 8) | (h_page_next << 56);
503517
h2_vec.u64 = (h_page_current >> 16) | (h_page_next << 48);
504518
h3_vec.u64 = (h_page_current >> 24) | (h_page_next << 40);
519+
#else
520+
h0_vec.u64 = (h_page_current);
521+
h1_vec.u64 = (h_page_current << 8) | (h_page_next >> 24);
522+
h2_vec.u64 = (h_page_current << 16) | (h_page_next >> 16);
523+
h3_vec.u64 = (h_page_current << 24) | (h_page_next >> 8);
524+
#endif
505525
matches0_vec = sz_u64_each_4byte_equal_(h0_vec, n_vec);
506526
matches1_vec = sz_u64_each_4byte_equal_(h1_vec, n_vec);
507527
matches2_vec = sz_u64_each_4byte_equal_(h2_vec, n_vec);
508528
matches3_vec = sz_u64_each_4byte_equal_(h3_vec, n_vec);
509529

510530
if (matches0_vec.u64 | matches1_vec.u64 | matches2_vec.u64 | matches3_vec.u64) {
531+
#if !SZ_IS_BIG_ENDIAN_
511532
matches0_vec.u64 >>= 24;
512533
matches1_vec.u64 >>= 16;
513534
matches2_vec.u64 >>= 8;
514535
sz_u64_t match_indicators = matches0_vec.u64 | matches1_vec.u64 | matches2_vec.u64 | matches3_vec.u64;
515536
return h + sz_u64_ctz(match_indicators) / 8;
537+
#else
538+
matches0_vec.u64 <<= 24;
539+
matches1_vec.u64 <<= 16;
540+
matches2_vec.u64 <<= 8;
541+
sz_u64_t match_indicators = matches0_vec.u64 | matches1_vec.u64 | matches2_vec.u64 | matches3_vec.u64;
542+
return h + sz_u64_clz(match_indicators) / 8;
543+
#endif
516544
}
517545
}
518546

@@ -567,24 +595,41 @@ SZ_INTERNAL sz_cptr_t sz_find_3byte_serial_(sz_cptr_t h, sz_size_t h_length, sz_
567595
h_page_current = *(sz_u64_t *)h;
568596
h_page_next = *(sz_u16_t *)(h + 8);
569597
h0_vec.u64 = (h_page_current);
598+
#if !SZ_IS_BIG_ENDIAN_
570599
h1_vec.u64 = (h_page_current >> 8) | (h_page_next << 56);
571600
h2_vec.u64 = (h_page_current >> 16) | (h_page_next << 48);
572601
h3_vec.u64 = (h_page_current >> 24) | (h_page_next << 40);
573602
h4_vec.u64 = (h_page_current >> 32) | (h_page_next << 32);
603+
#else
604+
h1_vec.u64 = (h_page_current << 8) | (h_page_next >> 8);
605+
h2_vec.u64 = (h_page_current << 16) | (h_page_next >> 16);
606+
h3_vec.u64 = (h_page_current << 24) | (h_page_next >> 24);
607+
h4_vec.u64 = (h_page_current << 32) | (h_page_next >> 32);
608+
#endif
574609
matches0_vec = sz_u64_each_3byte_equal_(h0_vec, n_vec);
575610
matches1_vec = sz_u64_each_3byte_equal_(h1_vec, n_vec);
576611
matches2_vec = sz_u64_each_3byte_equal_(h2_vec, n_vec);
577612
matches3_vec = sz_u64_each_3byte_equal_(h3_vec, n_vec);
578613
matches4_vec = sz_u64_each_3byte_equal_(h4_vec, n_vec);
579614

580615
if (matches0_vec.u64 | matches1_vec.u64 | matches2_vec.u64 | matches3_vec.u64 | matches4_vec.u64) {
616+
#if !SZ_IS_BIG_ENDIAN_
581617
matches0_vec.u64 >>= 16;
582618
matches1_vec.u64 >>= 8;
583619
matches3_vec.u64 <<= 8;
584620
matches4_vec.u64 <<= 16;
585621
sz_u64_t match_indicators =
586622
matches0_vec.u64 | matches1_vec.u64 | matches2_vec.u64 | matches3_vec.u64 | matches4_vec.u64;
587623
return h + sz_u64_ctz(match_indicators) / 8;
624+
#else
625+
matches0_vec.u64 <<= 16;
626+
matches1_vec.u64 <<= 8;
627+
matches3_vec.u64 >>= 8;
628+
matches4_vec.u64 >>= 16;
629+
sz_u64_t match_indicators =
630+
matches0_vec.u64 | matches1_vec.u64 | matches2_vec.u64 | matches3_vec.u64 | matches4_vec.u64;
631+
return h + sz_u64_clz(match_indicators) / 8;
632+
#endif
588633
}
589634
}
590635

@@ -768,17 +813,8 @@ SZ_PUBLIC sz_cptr_t sz_find_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n,
768813
// This almost never fires, but it's better to be safe than sorry.
769814
if (h_length < n_length || !n_length) return SZ_NULL_CHAR;
770815

771-
#if SZ_IS_BIG_ENDIAN_
772-
sz_find_t backends[] = {
773-
sz_find_1byte_serial_,
774-
sz_find_horspool_upto_256bytes_serial_,
775-
sz_find_horspool_over_256bytes_serial_,
776-
};
777-
778-
return backends[(n_length > 1) + (n_length > 256)](h, h_length, n, n_length);
779-
#else
780816
sz_find_t backends[] = {
781-
// For very short strings brute-force SWAR makes sense.
817+
// For very short strings brute-force SWAR makes sense - now optimized for both endianness!
782818
sz_find_1byte_serial_,
783819
sz_find_2byte_serial_,
784820
sz_find_3byte_serial_,
@@ -797,7 +833,6 @@ SZ_PUBLIC sz_cptr_t sz_find_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n,
797833
(n_length > 4) +
798834
// For longer needles - use skip tables.
799835
(n_length > 8) + (n_length > 256)](h, h_length, n, n_length);
800-
#endif
801836
}
802837

803838
SZ_PUBLIC sz_cptr_t sz_rfind_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) {

0 commit comments

Comments (0)