 # define ZSTD_HASHLOG3_MAX 17
 #endif
 
+
+/*-*************************************
+*  Forward declarations
+***************************************/
+size_t convertSequences_noRepcodes(SeqDef* dstSeqs, const ZSTD_Sequence* inSeqs,
+                                   size_t nbSequences);
+
+
 /*-*************************************
 *  Helper functions
 ***************************************/
@@ -7118,7 +7126,7 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* cctx,
 }
 
 
-#if defined(__AVX2__)
+#if defined(ZSTD_ARCH_X86_AVX2)
 
 #include <immintrin.h> /* AVX2 intrinsics */
 
@@ -7138,7 +7146,7 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* cctx,
  * @returns > 0 if there is one long length (> 65535),
  * indicating the position, and type.
  */
-static size_t convertSequences_noRepcodes(
+size_t convertSequences_noRepcodes(
     SeqDef* dstSeqs,
     const ZSTD_Sequence* inSeqs,
     size_t nbSequences)
@@ -7298,7 +7306,7 @@ static size_t convertSequences_noRepcodes(
  * @returns > 0 if there is one long length (> 65535),
  * indicating the position, and type.
  */
-static size_t convertSequences_noRepcodes(SeqDef* dstSeqs, const ZSTD_Sequence* inSeqs, size_t nbSequences) {
+size_t convertSequences_noRepcodes(SeqDef* dstSeqs, const ZSTD_Sequence* inSeqs, size_t nbSequences) {
     size_t longLen = 0;
 
     /* RVV depends on the specific definition of target structures */
@@ -7375,9 +7383,131 @@ static size_t convertSequences_noRepcodes(SeqDef* dstSeqs, const ZSTD_Sequence*
  * but since this implementation is targeting modern systems (>= Sapphire Rapid),
  * it's not useful to develop and maintain code for older pre-AVX2 platforms */
 
-#else /* no AVX2 */
+#elif defined(ZSTD_ARCH_ARM_NEON) && (defined(__aarch64__) || defined(_M_ARM64))
+
+size_t convertSequences_noRepcodes(
+    SeqDef* dstSeqs,
+    const ZSTD_Sequence* inSeqs,
+    size_t nbSequences)
+{
+    size_t longLen = 0;
+    size_t n = 0;
+
+    /* Neon permutation depends on the specific definition of target structures. */
+    ZSTD_STATIC_ASSERT(sizeof(ZSTD_Sequence) == 16);
+    ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, offset) == 0);
+    ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, litLength) == 4);
+    ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, matchLength) == 8);
+    ZSTD_STATIC_ASSERT(sizeof(SeqDef) == 8);
+    ZSTD_STATIC_ASSERT(offsetof(SeqDef, offBase) == 0);
+    ZSTD_STATIC_ASSERT(offsetof(SeqDef, litLength) == 4);
+    ZSTD_STATIC_ASSERT(offsetof(SeqDef, mlBase) == 6);
+
+    if (nbSequences > 3) {
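+        /* constAddition turns `offset` into `offBase` (OFFSET_TO_OFFBASE adds
+         * ZSTD_REP_NUM for non-repcode offsets) and `matchLength` into `mlBase`
+         * (subtracts MINMATCH), using a single vector add per sequence. */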
+        static const ZSTD_ALIGNED(16) U32 constAddition[4] = {
+            ZSTD_REP_NUM, 0, -MINMATCH, 0
+        };
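+        /* constMask gathers, per input sequence, the 4 offBase bytes and the low
+         * 16 bits of litLength and matchLength; indices 16..31 address the second
+         * vector of the table, so one shuffle emits two packed SeqDef entries. */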
+        static const ZSTD_ALIGNED(16) U8 constMask[16] = {
+            0, 1, 2, 3, 4, 5, 8, 9, 16, 17, 18, 19, 20, 21, 24, 25
+        };
+        static const ZSTD_ALIGNED(16) U16 constCounter[8] = {
+            1, 1, 1, 1, 2, 2, 2, 2
+        };
+
+        const uint32x4_t vaddition = vld1q_u32(constAddition);
+        const uint8x16_t vmask = vld1q_u8(constMask);
+        uint16x8_t vcounter = vld1q_u16(constCounter);
+        uint16x8_t vindex01 = vdupq_n_u16(0);
+        uint16x8_t vindex23 = vdupq_n_u16(0);
+
+        do {
+            /* Load 4 ZSTD_Sequence (64 bytes). */
+            const uint32x4_t vin0 = vld1q_u32(&inSeqs[n + 0].offset);
+            const uint32x4_t vin1 = vld1q_u32(&inSeqs[n + 1].offset);
+            const uint32x4_t vin2 = vld1q_u32(&inSeqs[n + 2].offset);
+            const uint32x4_t vin3 = vld1q_u32(&inSeqs[n + 3].offset);
+
+            /* Add {ZSTD_REP_NUM, 0, -MINMATCH, 0} to each vector. */
+            const uint8x16x2_t vadd01 = { {
+                vreinterpretq_u8_u32(vaddq_u32(vin0, vaddition)),
+                vreinterpretq_u8_u32(vaddq_u32(vin1, vaddition)),
+            } };
+            const uint8x16x2_t vadd23 = { {
+                vreinterpretq_u8_u32(vaddq_u32(vin2, vaddition)),
+                vreinterpretq_u8_u32(vaddq_u32(vin3, vaddition)),
+            } };
+
+            /* Shuffle and pack bytes so each vector contains 2 SeqDef structures. */
+            const uint8x16_t vout01 = vqtbl2q_u8(vadd01, vmask);
+            const uint8x16_t vout23 = vqtbl2q_u8(vadd23, vmask);
+
+            /* Pack the upper 16-bits of 32-bit lanes for overflow check. */
+            uint16x8_t voverflow01 = vuzp2q_u16(vreinterpretq_u16_u8(vadd01.val[0]),
+                                                vreinterpretq_u16_u8(vadd01.val[1]));
+            uint16x8_t voverflow23 = vuzp2q_u16(vreinterpretq_u16_u8(vadd23.val[0]),
+                                                vreinterpretq_u16_u8(vadd23.val[1]));
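+            /* A non-zero upper half flags a 32-bit field that does not fit into
+             * its 16-bit SeqDef slot; only the litLength and matchLength lanes of
+             * the resulting index vectors are consulted after the loop. */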
+
+            /* Store 4 SeqDef structures. */
+            vst1q_u32(&dstSeqs[n + 0].offBase, vreinterpretq_u32_u8(vout01));
+            vst1q_u32(&dstSeqs[n + 2].offBase, vreinterpretq_u32_u8(vout23));
+
+            /* Create masks in case of overflow. */
+            voverflow01 = vcgtzq_s16(vreinterpretq_s16_u16(voverflow01));
+            voverflow23 = vcgtzq_s16(vreinterpretq_s16_u16(voverflow23));
+
+            /* Update overflow indices. */
+            vindex01 = vbslq_u16(voverflow01, vcounter, vindex01);
+            vindex23 = vbslq_u16(voverflow23, vcounter, vindex23);
+
+            /* Update counter for overflow check. */
+            vcounter = vaddq_u16(vcounter, vdupq_n_u16(4));
+
+            n += 4;
+        } while (n < nbSequences - 3);
+
+        /* Fix up the indices in the second vector: the loop reuses the same
+           counter for both index vectors (saving an extra counter), so the
+           indices recorded for sequences n+2/n+3 are 2 too small. Add 2 to every
+           non-zero lane (subtracting the all-ones mask twice adds 2). */
+        {   uint16x8_t nonzero = vtstq_u16(vindex23, vindex23);
+            vindex23 = vsubq_u16(vindex23, nonzero);
+            vindex23 = vsubq_u16(vindex23, nonzero);
+        }
+
+        /* Merge indices in the vectors, maximums are needed. */
+        vindex01 = vmaxq_u16(vindex01, vindex23);
+        vindex01 = vmaxq_u16(vindex01, vextq_u16(vindex01, vindex01, 4));
+
+        /* Compute `longLen` from the last (1-based) overflow positions of
+           litLength and matchLength; the encoding matches the scalar path below:
+           a long matchLength at position i is reported as i, a long litLength
+           as i + nbSequences, with a preference on litLength. */
+        {   U64 maxLitMatchIndices = vgetq_lane_u64(vreinterpretq_u64_u16(vindex01), 0);
+            size_t maxLitIndex = (maxLitMatchIndices >> 16) & 0xFFFF;
+            size_t maxMatchIndex = (maxLitMatchIndices >> 32) & 0xFFFF;
+            longLen = maxLitIndex > maxMatchIndex ? maxLitIndex + nbSequences
+                                                  : maxMatchIndex;
+        }
+    }
+
+    /* Handle remaining elements. */
+    for (; n < nbSequences; n++) {
+        dstSeqs[n].offBase = OFFSET_TO_OFFBASE(inSeqs[n].offset);
+        dstSeqs[n].litLength = (U16)inSeqs[n].litLength;
+        dstSeqs[n].mlBase = (U16)(inSeqs[n].matchLength - MINMATCH);
+        /* Check for long length > 65535. */
+        if (UNLIKELY(inSeqs[n].matchLength > 65535 + MINMATCH)) {
+            assert(longLen == 0);
+            longLen = n + 1;
+        }
+        if (UNLIKELY(inSeqs[n].litLength > 65535)) {
+            assert(longLen == 0);
+            longLen = n + nbSequences + 1;
+        }
+    }
+    return longLen;
+}
+
+#else /* No vectorization. */
 
-static size_t convertSequences_noRepcodes(
+size_t convertSequences_noRepcodes(
     SeqDef* dstSeqs,
     const ZSTD_Sequence* inSeqs,
     size_t nbSequences)
@@ -7388,7 +7518,7 @@ static size_t convertSequences_noRepcodes(
         dstSeqs[n].offBase = OFFSET_TO_OFFBASE(inSeqs[n].offset);
         dstSeqs[n].litLength = (U16)inSeqs[n].litLength;
         dstSeqs[n].mlBase = (U16)(inSeqs[n].matchLength - MINMATCH);
-        /* check for long length > 65535 */
+        /* Check for long length > 65535. */
         if (UNLIKELY(inSeqs[n].matchLength > 65535 + MINMATCH)) {
             assert(longLen == 0);
             longLen = n + 1;
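
For reference, a minimal sketch of how a caller could decode the long-length code returned by convertSequences_noRepcodes(). The helper below is hypothetical and not part of this change; it only illustrates the encoding shared by the NEON and scalar paths, where a value in [1, nbSequences] marks a long matchLength and a larger value marks a long litLength:

    /* Hypothetical helper, for illustration only. */
    typedef enum { LL_NONE, LL_MATCH, LL_LITERAL } LongLenKind;

    static LongLenKind decodeLongLen(size_t longLen, size_t nbSequences, size_t* pos)
    {
        if (longLen == 0) return LL_NONE;          /* every length fits in 16 bits */
        if (longLen > nbSequences) {
            *pos = longLen - nbSequences - 1;      /* sequence with litLength > 65535 */
            return LL_LITERAL;
        }
        *pos = longLen - 1;                        /* sequence with matchLength > 65535 */
        return LL_MATCH;
    }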