|
29 | 29 | #include "zstd_ldm.h" |
30 | 30 | #include "zstd_compress_superblock.h" |
31 | 31 | #include "../common/bits.h" /* ZSTD_highbit32, ZSTD_rotateRight_U64 */ |
| 32 | +#include <stdbool.h> |
32 | 33 |
|
33 | 34 | /* *************************************************************** |
34 | 35 | * Tuning parameters |
@@ -7383,6 +7384,155 @@ size_t convertSequences_noRepcodes(SeqDef* dstSeqs, const ZSTD_Sequence* inSeqs, |
7383 | 7384 | * but since this implementation is targeting modern systems (>= Sapphire Rapid), |
7384 | 7385 | * it's not useful to develop and maintain code for older pre-AVX2 platforms */ |
7385 | 7386 |
|
| 7387 | +#elif defined(ZSTD_ARCH_ARM_SVE2) |
| 7388 | + |
| 7389 | +/* |
| 7390 | + * Checks if any active element in a signed 8-bit integer vector is greater |
| 7391 | + * than zero. |
| 7392 | + * |
| 7393 | + * @param g Governing predicate selecting active lanes. |
| 7394 | + * @param a Input vector of signed 8-bit integers. |
| 7395 | + * |
| 7396 | + * @return True if any active element in `a` is > 0, false otherwise. |
| 7397 | + */ |
| 7398 | +FORCE_INLINE_TEMPLATE bool cmpgtz_any_s8(svbool_t g, svint8_t a) |
| 7399 | +{ |
| 7400 | + svbool_t ptest = svcmpgt_n_s8(g, a, 0); |
| 7401 | + return svptest_any(ptest, ptest); |
| 7402 | +} |
| 7403 | + |
/*
 * SVE2 bulk conversion of ZSTD_Sequence (16 bytes) into SeqDef (8 bytes).
 * For each sequence it produces:
 *   offBase   = offset + ZSTD_REP_NUM    (same as the scalar OFFSET_TO_OFFBASE path)
 *   litLength = (U16)litLength
 *   mlBase    = (U16)(matchLength - MINMATCH)
 *
 * @param dstSeqs     Destination array of SeqDef (must hold nbSequences entries).
 * @param inSeqs      Source array of ZSTD_Sequence.
 * @param nbSequences Number of sequences to convert.
 *
 * @return 0 when every litLength fits in 16 bits and every mlBase fits in
 *         16 bits; otherwise an encoded position of the single oversized
 *         sequence: `idx + 1` for a long matchLength, or
 *         `idx + nbSequences + 1` for a long litLength. At most one such
 *         long sequence is expected per call (asserted via longLen == 0).
 */
size_t convertSequences_noRepcodes(
    SeqDef* dstSeqs,
    const ZSTD_Sequence* inSeqs,
    size_t nbSequences)
{
    /* Process the input with `8 * VL / element` lanes.
     * (8 full vectors of input structures are consumed per iteration.) */
    const size_t lanes = 8 * svcntb() / sizeof(ZSTD_Sequence);
    size_t longLen = 0;
    size_t n = 0;

    /* SVE permutation depends on the specific definition of target structures:
     * the halfword indices baked into `vmask` below assume these exact layouts. */
    ZSTD_STATIC_ASSERT(sizeof(ZSTD_Sequence) == 16);
    ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, offset) == 0);
    ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, litLength) == 4);
    ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, matchLength) == 8);
    ZSTD_STATIC_ASSERT(sizeof(SeqDef) == 8);
    ZSTD_STATIC_ASSERT(offsetof(SeqDef, offBase) == 0);
    ZSTD_STATIC_ASSERT(offsetof(SeqDef, litLength) == 4);
    ZSTD_STATIC_ASSERT(offsetof(SeqDef, mlBase) == 6);

    if (nbSequences >= lanes) {
        const svbool_t ptrue = svptrue_b8();
        /* 16-bit values {ZSTD_REP_NUM, 0, -MINMATCH, 0} sign-extended to
         * 32-bit lanes, i.e. per sequence the 32-bit addends
         * {ZSTD_REP_NUM, 0, -MINMATCH, 0} for {offset, litLength,
         * matchLength, rep}. */
        const svuint32_t vaddition = svreinterpret_u32(
            svunpklo_s32(svreinterpret_s16(svdup_n_u64(ZSTD_REP_NUM | (((U64)(U16)-MINMATCH) << 32)))));
        /* For permutation of 16-bit units: 0, 1, 2, 4, 8, 9, 10, 12, ...
         * Per 16-byte sequence, halfwords {0,1} = offBase, {2} = low half of
         * litLength, {4} = low half of mlBase; halfwords 3, 5, 6, 7 are
         * dropped. */
        const svuint16_t vmask = svreinterpret_u16(
            svindex_u64(0x0004000200010000, 0x0008000800080008));
        /* Upper bytes of `litLength` and `matchLength` will be packed into the
         * middle of the overflow-check vector; this predicate enables exactly
         * those middle bytes. */
        const svbool_t pmid = svcmpne_n_u8(
            ptrue, svreinterpret_u8(svdup_n_u64(0x0000FFFFFFFF0000)), 0);

        do {
            /* Load `lanes` number of `ZSTD_Sequence` into 8 vectors. */
            const svuint32_t vin0 = svld1_vnum_u32(ptrue, &inSeqs[n].offset, 0);
            const svuint32_t vin1 = svld1_vnum_u32(ptrue, &inSeqs[n].offset, 1);
            const svuint32_t vin2 = svld1_vnum_u32(ptrue, &inSeqs[n].offset, 2);
            const svuint32_t vin3 = svld1_vnum_u32(ptrue, &inSeqs[n].offset, 3);
            const svuint32_t vin4 = svld1_vnum_u32(ptrue, &inSeqs[n].offset, 4);
            const svuint32_t vin5 = svld1_vnum_u32(ptrue, &inSeqs[n].offset, 5);
            const svuint32_t vin6 = svld1_vnum_u32(ptrue, &inSeqs[n].offset, 6);
            const svuint32_t vin7 = svld1_vnum_u32(ptrue, &inSeqs[n].offset, 7);

            /* Add {ZSTD_REP_NUM, 0, -MINMATCH, 0} to each structure,
             * producing {offBase, litLength, mlBase, rep} in 32-bit lanes. */
            const svuint16x2_t vadd01 = svcreate2_u16(
                svreinterpret_u16(svadd_u32_x(ptrue, vin0, vaddition)),
                svreinterpret_u16(svadd_u32_x(ptrue, vin1, vaddition)));
            const svuint16x2_t vadd23 = svcreate2_u16(
                svreinterpret_u16(svadd_u32_x(ptrue, vin2, vaddition)),
                svreinterpret_u16(svadd_u32_x(ptrue, vin3, vaddition)));
            const svuint16x2_t vadd45 = svcreate2_u16(
                svreinterpret_u16(svadd_u32_x(ptrue, vin4, vaddition)),
                svreinterpret_u16(svadd_u32_x(ptrue, vin5, vaddition)));
            const svuint16x2_t vadd67 = svcreate2_u16(
                svreinterpret_u16(svadd_u32_x(ptrue, vin6, vaddition)),
                svreinterpret_u16(svadd_u32_x(ptrue, vin7, vaddition)));

            /* Shuffle and pack bytes so each vector contains SeqDef structures. */
            const svuint16_t vout01 = svtbl2_u16(vadd01, vmask);
            const svuint16_t vout23 = svtbl2_u16(vadd23, vmask);
            const svuint16_t vout45 = svtbl2_u16(vadd45, vmask);
            const svuint16_t vout67 = svtbl2_u16(vadd67, vmask);

            /* Pack the upper 16 bits of the 32-bit lanes for the overflow
             * check: nonzero upper halves mean a length did not fit in 16 bits. */
            const svuint16_t voverflow01 = svuzp2_u16(svget2_u16(vadd01, 0),
                                                      svget2_u16(vadd01, 1));
            const svuint16_t voverflow23 = svuzp2_u16(svget2_u16(vadd23, 0),
                                                      svget2_u16(vadd23, 1));
            const svuint16_t voverflow45 = svuzp2_u16(svget2_u16(vadd45, 0),
                                                      svget2_u16(vadd45, 1));
            const svuint16_t voverflow67 = svuzp2_u16(svget2_u16(vadd67, 0),
                                                      svget2_u16(vadd67, 1));

            /* We don't need the whole 16 bits of the overflow part. Only 1 bit
             * is needed, so we pack tightly and merge multiple vectors to be
             * able to use a single comparison to handle the overflow case.
             * However, we also need to handle the possible negative values of
             * matchLength parts (matchLength - MINMATCH can go negative in the
             * upper half), so we use a signed comparison later. */
            const svint8_t voverflow =
                svmax_s8_x(pmid,
                           svtrn1_s8(svreinterpret_s8(voverflow01),
                                     svreinterpret_s8(voverflow23)),
                           svtrn1_s8(svreinterpret_s8(voverflow45),
                                     svreinterpret_s8(voverflow67)));

            /* Store `lanes` number of `SeqDef` structures from 4 vectors. */
            svst1_vnum_u32(ptrue, &dstSeqs[n].offBase, 0, svreinterpret_u32(vout01));
            svst1_vnum_u32(ptrue, &dstSeqs[n].offBase, 1, svreinterpret_u32(vout23));
            svst1_vnum_u32(ptrue, &dstSeqs[n].offBase, 2, svreinterpret_u32(vout45));
            svst1_vnum_u32(ptrue, &dstSeqs[n].offBase, 3, svreinterpret_u32(vout67));

            /* Check whether any enabled lane of the overflow vector is larger
             * than zero; at most one such sequence may occur per call. */
            if (UNLIKELY(cmpgtz_any_s8(pmid, voverflow))) {
                /* Scalar search for the long sequence is needed because we
                 * merged multiple overflow bytes with `max` and can no longer
                 * tell which lane triggered. */
                size_t i;
                for (i = n; i < n + lanes; i++) {
                    if (inSeqs[i].matchLength > 65535 + MINMATCH) {
                        assert(longLen == 0);
                        longLen = i + 1;
                    }
                    if (inSeqs[i].litLength > 65535) {
                        assert(longLen == 0);
                        longLen = i + nbSequences + 1;
                    }
                }
            }

            n += lanes;
        } while(n <= nbSequences - lanes);
    }

    /* Handle remaining elements with the scalar conversion. */
    for (; n < nbSequences; n++) {
        dstSeqs[n].offBase = OFFSET_TO_OFFBASE(inSeqs[n].offset);
        dstSeqs[n].litLength = (U16)inSeqs[n].litLength;
        dstSeqs[n].mlBase = (U16)(inSeqs[n].matchLength - MINMATCH);
        /* Check for long length > 65535. */
        if (UNLIKELY(inSeqs[n].matchLength > 65535 + MINMATCH)) {
            assert(longLen == 0);
            longLen = n + 1;
        }
        if (UNLIKELY(inSeqs[n].litLength > 65535)) {
            assert(longLen == 0);
            longLen = n + nbSequences + 1;
        }
    }
    return longLen;
}
| 7535 | + |
7386 | 7536 | #elif defined(ZSTD_ARCH_ARM_NEON) && (defined(__aarch64__) || defined(_M_ARM64)) |
7387 | 7537 |
|
7388 | 7538 | size_t convertSequences_noRepcodes( |
|
0 commit comments