@@ -7383,6 +7383,155 @@ size_t convertSequences_noRepcodes(SeqDef* dstSeqs, const ZSTD_Sequence* inSeqs,
  * but since this implementation is targeting modern systems (>= Sapphire Rapids),
  * it's not useful to develop and maintain code for older pre-AVX2 platforms */

+#elif defined(ZSTD_ARCH_ARM_SVE2)
+
+/*
+ * Checks if any active element in a signed 8-bit integer vector is greater
+ * than zero.
+ *
+ * @param g Governing predicate selecting active lanes.
+ * @param a Input vector of signed 8-bit integers.
+ *
+ * @return True if any active element in `a` is > 0, false otherwise.
+ */
+FORCE_INLINE_TEMPLATE int cmpgtz_any_s8(svbool_t g, svint8_t a)
+{
+    svbool_t ptest = svcmpgt_n_s8(g, a, 0);
+    return svptest_any(ptest, ptest);
+}
+
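+/*
+ * Converts `nbSequences` ZSTD_Sequence entries into SeqDef entries:
+ * offBase = OFFSET_TO_OFFBASE(offset), litLength copied (truncated to 16 bits),
+ * mlBase = matchLength - MINMATCH (see the scalar tail loop below).
+ *
+ * @return 0 if every length fits in 16 bits, `n + 1` if sequence `n` has a
+ *         matchLength > 65535 + MINMATCH, or `n + nbSequences + 1` if it has a
+ *         litLength > 65535.
+ */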
+size_t convertSequences_noRepcodes(
+    SeqDef* dstSeqs,
+    const ZSTD_Sequence* inSeqs,
+    size_t nbSequences)
+{
+    /* Process the input with `8 * VL / element` lanes. */
+    const size_t lanes = 8 * svcntb() / sizeof(ZSTD_Sequence);
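+    /* For example, with 128-bit SVE vectors svcntb() == 16 and
+     * sizeof(ZSTD_Sequence) == 16, so 8 sequences are converted per iteration
+     * (16 with 256-bit vectors, and so on). */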
+    size_t longLen = 0;
+    size_t n = 0;
+
+    /* SVE permutation depends on the specific definition of target structures. */
+    ZSTD_STATIC_ASSERT(sizeof(ZSTD_Sequence) == 16);
+    ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, offset) == 0);
+    ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, litLength) == 4);
+    ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, matchLength) == 8);
+    ZSTD_STATIC_ASSERT(sizeof(SeqDef) == 8);
+    ZSTD_STATIC_ASSERT(offsetof(SeqDef, offBase) == 0);
+    ZSTD_STATIC_ASSERT(offsetof(SeqDef, litLength) == 4);
+    ZSTD_STATIC_ASSERT(offsetof(SeqDef, mlBase) == 6);
+
+    if (nbSequences >= lanes) {
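+        /* nbSequences >= lanes here, so `nbSequences - lanes` in the loop
+         * condition below cannot underflow. */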
+        const svbool_t ptrue = svptrue_b8();
+        /* 16-bit values {ZSTD_REP_NUM, 0, -MINMATCH, 0}, sign-extended to 32-bit lanes. */
+        const svuint32_t vaddition = svreinterpret_u32(
+            svunpklo_s32(svreinterpret_s16(svdup_n_u64(ZSTD_REP_NUM | (((U64)(U16)-MINMATCH) << 32)))));
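+        /* Adding this constant applies, per sequence, the same transform as the
+         * scalar tail loop below: offset + ZSTD_REP_NUM -> offBase and
+         * matchLength - MINMATCH -> mlBase, while litLength and the rep field
+         * are left unchanged. */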
+        /* For permutation of 16-bit units: 0, 1, 2, 4, 8, 9, 10, 12, ... */
+        const svuint16_t vmask = svreinterpret_u16(
+            svindex_u64(0x0004000200010000, 0x0008000800080008));
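+        /* Within each group of 8 halfwords (one 16-byte ZSTD_Sequence), indices
+         * 0, 1, 2 and 4 select the two halfwords of offBase, the low halfword of
+         * litLength and the low halfword of the adjusted matchLength, i.e. one
+         * 8-byte SeqDef. */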
+        /* The upper bytes of `litLength` and `matchLength` will be packed into
+         * the middle of the overflow-check vector. */
+        const svbool_t pmid = svcmpne_n_u8(
+            ptrue, svreinterpret_u8(svdup_n_u64(0x0000FFFFFFFF0000)), 0);
+
+        do {
+            /* Load `lanes` `ZSTD_Sequence` structures into 8 vectors. */
+            const svuint32_t vin0 = svld1_vnum_u32(ptrue, &inSeqs[n].offset, 0);
+            const svuint32_t vin1 = svld1_vnum_u32(ptrue, &inSeqs[n].offset, 1);
+            const svuint32_t vin2 = svld1_vnum_u32(ptrue, &inSeqs[n].offset, 2);
+            const svuint32_t vin3 = svld1_vnum_u32(ptrue, &inSeqs[n].offset, 3);
+            const svuint32_t vin4 = svld1_vnum_u32(ptrue, &inSeqs[n].offset, 4);
+            const svuint32_t vin5 = svld1_vnum_u32(ptrue, &inSeqs[n].offset, 5);
+            const svuint32_t vin6 = svld1_vnum_u32(ptrue, &inSeqs[n].offset, 6);
+            const svuint32_t vin7 = svld1_vnum_u32(ptrue, &inSeqs[n].offset, 7);
+
+            /* Add {ZSTD_REP_NUM, 0, -MINMATCH, 0} to each structure. */
+            const svuint16x2_t vadd01 = svcreate2_u16(
+                svreinterpret_u16(svadd_u32_x(ptrue, vin0, vaddition)),
+                svreinterpret_u16(svadd_u32_x(ptrue, vin1, vaddition)));
+            const svuint16x2_t vadd23 = svcreate2_u16(
+                svreinterpret_u16(svadd_u32_x(ptrue, vin2, vaddition)),
+                svreinterpret_u16(svadd_u32_x(ptrue, vin3, vaddition)));
+            const svuint16x2_t vadd45 = svcreate2_u16(
+                svreinterpret_u16(svadd_u32_x(ptrue, vin4, vaddition)),
+                svreinterpret_u16(svadd_u32_x(ptrue, vin5, vaddition)));
+            const svuint16x2_t vadd67 = svcreate2_u16(
+                svreinterpret_u16(svadd_u32_x(ptrue, vin6, vaddition)),
+                svreinterpret_u16(svadd_u32_x(ptrue, vin7, vaddition)));
+
+            /* Shuffle and pack bytes so each vector contains SeqDef structures. */
+            const svuint16_t vout01 = svtbl2_u16(vadd01, vmask);
+            const svuint16_t vout23 = svtbl2_u16(vadd23, vmask);
+            const svuint16_t vout45 = svtbl2_u16(vadd45, vmask);
+            const svuint16_t vout67 = svtbl2_u16(vadd67, vmask);
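+            /* Each svtbl2 packs two input vectors of 16-byte sequences into one
+             * vector of 8-byte SeqDef entries, so the 8 loaded vectors become the
+             * 4 output vectors stored below. */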
+
+            /* Pack the upper 16 bits of the 32-bit lanes for the overflow check. */
+            const svuint16_t voverflow01 = svuzp2_u16(svget2_u16(vadd01, 0),
+                                                      svget2_u16(vadd01, 1));
+            const svuint16_t voverflow23 = svuzp2_u16(svget2_u16(vadd23, 0),
+                                                      svget2_u16(vadd23, 1));
+            const svuint16_t voverflow45 = svuzp2_u16(svget2_u16(vadd45, 0),
+                                                      svget2_u16(vadd45, 1));
+            const svuint16_t voverflow67 = svuzp2_u16(svget2_u16(vadd67, 0),
+                                                      svget2_u16(vadd67, 1));
+
+            /* We don't need the whole 16 bits of the overflow part. Only 1 bit
+             * is needed, so we pack tightly and merge multiple vectors to be
+             * able to use a single comparison to handle the overflow case.
+             * However, we also need to handle the possible negative values of
+             * the matchLength parts, so we use a signed comparison later. */
+            const svint8_t voverflow =
+                svmax_s8_x(pmid,
+                           svtrn1_s8(svreinterpret_s8(voverflow01),
+                                     svreinterpret_s8(voverflow23)),
+                           svtrn1_s8(svreinterpret_s8(voverflow45),
+                                     svreinterpret_s8(voverflow67)));
+
+            /* Store `lanes` `SeqDef` structures from 4 vectors. */
+            svst1_vnum_u32(ptrue, &dstSeqs[n].offBase, 0, svreinterpret_u32(vout01));
+            svst1_vnum_u32(ptrue, &dstSeqs[n].offBase, 1, svreinterpret_u32(vout23));
+            svst1_vnum_u32(ptrue, &dstSeqs[n].offBase, 2, svreinterpret_u32(vout45));
+            svst1_vnum_u32(ptrue, &dstSeqs[n].offBase, 3, svreinterpret_u32(vout67));
+
+            /* Check whether any enabled lane of the overflow vector is greater
+             * than zero; at most one such sequence may occur. */
+            if (UNLIKELY(cmpgtz_any_s8(pmid, voverflow))) {
+                /* A scalar search for the long sequence is needed because we
+                 * merged multiple overflow bytes with `max`. */
+                size_t i;
+                for (i = n; i < n + lanes; i++) {
+                    if (inSeqs[i].matchLength > 65535 + MINMATCH) {
+                        assert(longLen == 0);
+                        longLen = i + 1;
+                    }
+                    if (inSeqs[i].litLength > 65535) {
+                        assert(longLen == 0);
+                        longLen = i + nbSequences + 1;
+                    }
+                }
+            }
+
+            n += lanes;
+        } while (n <= nbSequences - lanes);
+    }
+
+    /* Handle remaining elements. */
+    for (; n < nbSequences; n++) {
+        dstSeqs[n].offBase = OFFSET_TO_OFFBASE(inSeqs[n].offset);
+        dstSeqs[n].litLength = (U16)inSeqs[n].litLength;
+        dstSeqs[n].mlBase = (U16)(inSeqs[n].matchLength - MINMATCH);
+        /* Check for long length > 65535. */
+        if (UNLIKELY(inSeqs[n].matchLength > 65535 + MINMATCH)) {
+            assert(longLen == 0);
+            longLen = n + 1;
+        }
+        if (UNLIKELY(inSeqs[n].litLength > 65535)) {
+            assert(longLen == 0);
+            longLen = n + nbSequences + 1;
+        }
+    }
+    return longLen;
+}
+
 #elif defined(ZSTD_ARCH_ARM_NEON) && (defined(__aarch64__) || defined(_M_ARM64))

 size_t convertSequences_noRepcodes(