@@ -439,16 +439,29 @@ SZ_INTERNAL sz_cptr_t sz_find_2byte_serial_(sz_cptr_t h, sz_size_t h_length, sz_
439439
440440 // This code simulates hyper-scalar execution, analyzing 8 offsets at a time.
441441 for (; h + 9 <= h_end ; h += 8 ) {
442+ #if !SZ_IS_BIG_ENDIAN_
442443 h_even_vec .u64 = * (sz_u64_t * )h ;
443444 h_odd_vec .u64 = (h_even_vec .u64 >> 8 ) | ((sz_u64_t )h [8 ] << 56 );
445+ #else
446+ h_even_vec .u64 = * (sz_u64_t * )h ;
447+ h_odd_vec .u64 = (h_even_vec .u64 << 8 ) | ((sz_u64_t )h [8 ] >> 56 );
448+ #endif
444449 matches_even_vec = sz_u64_each_2byte_equal_ (h_even_vec , n_vec );
445450 matches_odd_vec = sz_u64_each_2byte_equal_ (h_odd_vec , n_vec );
446451
452+ #if !SZ_IS_BIG_ENDIAN_
447453 matches_even_vec .u64 >>= 8 ;
448454 if (matches_even_vec .u64 + matches_odd_vec .u64 ) {
449455 sz_u64_t match_indicators = matches_even_vec .u64 | matches_odd_vec .u64 ;
450456 return h + sz_u64_ctz (match_indicators ) / 8 ;
451457 }
458+ #else
459+ matches_even_vec .u64 <<= 8 ;
460+ if (matches_even_vec .u64 + matches_odd_vec .u64 ) {
461+ sz_u64_t match_indicators = matches_even_vec .u64 | matches_odd_vec .u64 ;
462+ return h + sz_u64_clz (match_indicators ) / 8 ;
463+ }
464+ #endif
452465 }
453466
454467 for (; h + 2 <= h_end ; ++ h )
@@ -498,21 +511,36 @@ SZ_INTERNAL sz_cptr_t sz_find_4byte_serial_(sz_cptr_t h, sz_size_t h_length, sz_
498511 for (; h + sizeof (sz_u64_t ) + sizeof (sz_u32_t ) <= h_end ; h += sizeof (sz_u64_t )) {
499512 h_page_current = * (sz_u64_t * )h ;
500513 h_page_next = * (sz_u32_t * )(h + 8 );
514+ #if !SZ_IS_BIG_ENDIAN_
501515 h0_vec .u64 = (h_page_current );
502516 h1_vec .u64 = (h_page_current >> 8 ) | (h_page_next << 56 );
503517 h2_vec .u64 = (h_page_current >> 16 ) | (h_page_next << 48 );
504518 h3_vec .u64 = (h_page_current >> 24 ) | (h_page_next << 40 );
519+ #else
520+ h0_vec .u64 = (h_page_current );
521+ h1_vec .u64 = (h_page_current << 8 ) | (h_page_next >> 24 );
522+ h2_vec .u64 = (h_page_current << 16 ) | (h_page_next >> 16 );
523+ h3_vec .u64 = (h_page_current << 24 ) | (h_page_next >> 8 );
524+ #endif
505525 matches0_vec = sz_u64_each_4byte_equal_ (h0_vec , n_vec );
506526 matches1_vec = sz_u64_each_4byte_equal_ (h1_vec , n_vec );
507527 matches2_vec = sz_u64_each_4byte_equal_ (h2_vec , n_vec );
508528 matches3_vec = sz_u64_each_4byte_equal_ (h3_vec , n_vec );
509529
510530 if (matches0_vec .u64 | matches1_vec .u64 | matches2_vec .u64 | matches3_vec .u64 ) {
531+ #if !SZ_IS_BIG_ENDIAN_
511532 matches0_vec .u64 >>= 24 ;
512533 matches1_vec .u64 >>= 16 ;
513534 matches2_vec .u64 >>= 8 ;
514535 sz_u64_t match_indicators = matches0_vec .u64 | matches1_vec .u64 | matches2_vec .u64 | matches3_vec .u64 ;
515536 return h + sz_u64_ctz (match_indicators ) / 8 ;
537+ #else
538+ matches0_vec .u64 <<= 24 ;
539+ matches1_vec .u64 <<= 16 ;
540+ matches2_vec .u64 <<= 8 ;
541+ sz_u64_t match_indicators = matches0_vec .u64 | matches1_vec .u64 | matches2_vec .u64 | matches3_vec .u64 ;
542+ return h + sz_u64_clz (match_indicators ) / 8 ;
543+ #endif
516544 }
517545 }
518546
@@ -567,24 +595,41 @@ SZ_INTERNAL sz_cptr_t sz_find_3byte_serial_(sz_cptr_t h, sz_size_t h_length, sz_
567595 h_page_current = * (sz_u64_t * )h ;
568596 h_page_next = * (sz_u16_t * )(h + 8 );
569597 h0_vec .u64 = (h_page_current );
598+ #if !SZ_IS_BIG_ENDIAN_
570599 h1_vec .u64 = (h_page_current >> 8 ) | (h_page_next << 56 );
571600 h2_vec .u64 = (h_page_current >> 16 ) | (h_page_next << 48 );
572601 h3_vec .u64 = (h_page_current >> 24 ) | (h_page_next << 40 );
573602 h4_vec .u64 = (h_page_current >> 32 ) | (h_page_next << 32 );
603+ #else
604+ h1_vec .u64 = (h_page_current << 8 ) | (h_page_next >> 8 );
605+ h2_vec .u64 = (h_page_current << 16 ) | (h_page_next >> 16 );
606+ h3_vec .u64 = (h_page_current << 24 ) | (h_page_next >> 24 );
607+ h4_vec .u64 = (h_page_current << 32 ) | (h_page_next >> 32 );
608+ #endif
574609 matches0_vec = sz_u64_each_3byte_equal_ (h0_vec , n_vec );
575610 matches1_vec = sz_u64_each_3byte_equal_ (h1_vec , n_vec );
576611 matches2_vec = sz_u64_each_3byte_equal_ (h2_vec , n_vec );
577612 matches3_vec = sz_u64_each_3byte_equal_ (h3_vec , n_vec );
578613 matches4_vec = sz_u64_each_3byte_equal_ (h4_vec , n_vec );
579614
580615 if (matches0_vec .u64 | matches1_vec .u64 | matches2_vec .u64 | matches3_vec .u64 | matches4_vec .u64 ) {
616+ #if !SZ_IS_BIG_ENDIAN_
581617 matches0_vec .u64 >>= 16 ;
582618 matches1_vec .u64 >>= 8 ;
583619 matches3_vec .u64 <<= 8 ;
584620 matches4_vec .u64 <<= 16 ;
585621 sz_u64_t match_indicators =
586622 matches0_vec .u64 | matches1_vec .u64 | matches2_vec .u64 | matches3_vec .u64 | matches4_vec .u64 ;
587623 return h + sz_u64_ctz (match_indicators ) / 8 ;
624+ #else
625+ matches0_vec .u64 <<= 16 ;
626+ matches1_vec .u64 <<= 8 ;
627+ matches3_vec .u64 >>= 8 ;
628+ matches4_vec .u64 >>= 16 ;
629+ sz_u64_t match_indicators =
630+ matches0_vec .u64 | matches1_vec .u64 | matches2_vec .u64 | matches3_vec .u64 | matches4_vec .u64 ;
631+ return h + sz_u64_clz (match_indicators ) / 8 ;
632+ #endif
588633 }
589634 }
590635
@@ -768,17 +813,8 @@ SZ_PUBLIC sz_cptr_t sz_find_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n,
768813 // This almost never fires, but it's better to be safe than sorry.
769814 if (h_length < n_length || !n_length ) return SZ_NULL_CHAR ;
770815
771- #if SZ_IS_BIG_ENDIAN_
772- sz_find_t backends [] = {
773- sz_find_1byte_serial_ ,
774- sz_find_horspool_upto_256bytes_serial_ ,
775- sz_find_horspool_over_256bytes_serial_ ,
776- };
777-
778- return backends [(n_length > 1 ) + (n_length > 256 )](h , h_length , n , n_length );
779- #else
780816 sz_find_t backends [] = {
781- // For very short strings brute-force SWAR makes sense.
817+ // For very short strings brute-force SWAR makes sense - now optimized for both little- and big-endian targets!
782818 sz_find_1byte_serial_ ,
783819 sz_find_2byte_serial_ ,
784820 sz_find_3byte_serial_ ,
@@ -797,7 +833,6 @@ SZ_PUBLIC sz_cptr_t sz_find_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n,
797833 (n_length > 4 ) +
798834 // For longer needles - use skip tables.
799835 (n_length > 8 ) + (n_length > 256 )](h , h_length , n , n_length );
800- #endif
801836}
802837
803838SZ_PUBLIC sz_cptr_t sz_rfind_serial (sz_cptr_t h , sz_size_t h_length , sz_cptr_t n , sz_size_t n_length ) {
0 commit comments