Skip to content

Commit 36ccef5

Browse files
AArch64: Add SVE2 path for convertSequences_noRepcodes
Add an 8-way vector length agnostic (VLA) SVE2 code path for
convertSequences_noRepcodes. It works with any SVE vector length.

Relative performance to the GCC-13 Neon baseline, measured with:
`./fullbench -b18 -l5 enwik5`

Neoverse-V2      Neon (before)   SVE2 (after)   uplift
  GCC-13:        100.000%        103.209%       1.032x
  GCC-14:        100.309%        134.872%       1.344x
  GCC-15:        100.355%        134.827%       1.343x
  Clang-18:      123.614%        128.565%       1.040x
  Clang-19:      123.587%        132.984%       1.076x
  Clang-20:      123.629%        133.023%       1.075x

Cortex-A720      Neon (before)   SVE2 (after)   uplift
  GCC-13:        100.000%        116.032%       1.160x
  GCC-14:         99.700%        116.648%       1.169x
  GCC-15:        100.354%        117.047%       1.166x
  Clang-18:      100.447%        116.762%       1.162x
  Clang-19:      100.454%        116.627%       1.160x
  Clang-20:      100.452%        116.649%       1.161x
1 parent afa96bb commit 36ccef5

File tree

1 file changed

+150
-0
lines changed

1 file changed

+150
-0
lines changed

lib/compress/zstd_compress.c

Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
#include "zstd_ldm.h"
3030
#include "zstd_compress_superblock.h"
3131
#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_rotateRight_U64 */
32+
#include <stdbool.h>
3233

3334
/* ***************************************************************
3435
* Tuning parameters
@@ -7383,6 +7384,155 @@ size_t convertSequences_noRepcodes(SeqDef* dstSeqs, const ZSTD_Sequence* inSeqs,
73837384
* but since this implementation is targeting modern systems (>= Sapphire Rapid),
73847385
* it's not useful to develop and maintain code for older pre-AVX2 platforms */
73857386

7387+
#elif defined(ZSTD_ARCH_ARM_SVE2)
7388+
7389+
/*
7390+
* Checks if any active element in a signed 8-bit integer vector is greater
7391+
* than zero.
7392+
*
7393+
* @param g Governing predicate selecting active lanes.
7394+
* @param a Input vector of signed 8-bit integers.
7395+
*
7396+
* @return True if any active element in `a` is > 0, false otherwise.
7397+
*/
7398+
FORCE_INLINE_TEMPLATE bool cmpgtz_any_s8(svbool_t g, svint8_t a)
7399+
{
7400+
svbool_t ptest = svcmpgt_n_s8(g, a, 0);
7401+
return svptest_any(ptest, ptest);
7402+
}
7403+
7404+
/*
 * Vector-length-agnostic SVE2 conversion of ZSTD_Sequence (16 bytes) into
 * SeqDef (8 bytes), assuming the input contains no repcodes:
 *   offBase = offset + ZSTD_REP_NUM, litLength truncated to 16 bits,
 *   mlBase  = matchLength - MINMATCH truncated to 16 bits.
 *
 * @param dstSeqs     Output array of SeqDef (must hold nbSequences entries).
 * @param inSeqs      Input array of ZSTD_Sequence.
 * @param nbSequences Number of sequences to convert.
 *
 * @return 0 if every litLength and mlBase fits in 16 bits; otherwise an
 *         encoded position of the single overlong sequence (the callers'
 *         contract, mirrored in the scalar tail below):
 *           matchLength overflow -> index + 1
 *           litLength   overflow -> index + nbSequences + 1
 *         At most one such sequence may exist (see the asserts).
 */
size_t convertSequences_noRepcodes(
    SeqDef* dstSeqs,
    const ZSTD_Sequence* inSeqs,
    size_t nbSequences)
{
    /* Process the input with `8 * VL / element` lanes.
     * Eight whole SVE vectors are consumed per iteration; since
     * sizeof(ZSTD_Sequence) == 16, `lanes` is the number of sequences
     * covered by those eight vectors for any hardware vector length. */
    const size_t lanes = 8 * svcntb() / sizeof(ZSTD_Sequence);
    size_t longLen = 0;
    size_t n = 0;

    /* SVE permutation depends on the specific definition of target structures. */
    ZSTD_STATIC_ASSERT(sizeof(ZSTD_Sequence) == 16);
    ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, offset) == 0);
    ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, litLength) == 4);
    ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, matchLength) == 8);
    ZSTD_STATIC_ASSERT(sizeof(SeqDef) == 8);
    ZSTD_STATIC_ASSERT(offsetof(SeqDef, offBase) == 0);
    ZSTD_STATIC_ASSERT(offsetof(SeqDef, litLength) == 4);
    ZSTD_STATIC_ASSERT(offsetof(SeqDef, mlBase) == 6);

    if (nbSequences >= lanes) {
        const svbool_t ptrue = svptrue_b8();
        /* 16-bit of {ZSTD_REP_NUM, 0, -MINMATCH, 0} extended to 32-bit lanes.
         * Adding this to a loaded sequence turns `offset` into offBase and
         * `matchLength` into mlBase in a single vector add; the fourth
         * 32-bit slot (`rep`) is left untouched and later discarded. */
        const svuint32_t vaddition = svreinterpret_u32(
            svunpklo_s32(svreinterpret_s16(svdup_n_u64(ZSTD_REP_NUM | (((U64)(U16)-MINMATCH) << 32)))));
        /* For permutation of 16-bit units: 0, 1, 2, 4, 8, 9, 10, 12, ...
         * Picks offBase (two halfwords), litLength low, matchLength low out
         * of each 8-halfword sequence — i.e. a packed SeqDef per input. */
        const svuint16_t vmask = svreinterpret_u16(
            svindex_u64(0x0004000200010000, 0x0008000800080008));
        /* Upper bytes of `litLength` and `matchLength` will be packed into the
         * middle of overflow check vector. */
        const svbool_t pmid = svcmpne_n_u8(
            ptrue, svreinterpret_u8(svdup_n_u64(0x0000FFFFFFFF0000)), 0);

        do {
            /* Load `lanes` number of `ZSTD_Sequence` into 8 vectors. */
            const svuint32_t vin0 = svld1_vnum_u32(ptrue, &inSeqs[n].offset, 0);
            const svuint32_t vin1 = svld1_vnum_u32(ptrue, &inSeqs[n].offset, 1);
            const svuint32_t vin2 = svld1_vnum_u32(ptrue, &inSeqs[n].offset, 2);
            const svuint32_t vin3 = svld1_vnum_u32(ptrue, &inSeqs[n].offset, 3);
            const svuint32_t vin4 = svld1_vnum_u32(ptrue, &inSeqs[n].offset, 4);
            const svuint32_t vin5 = svld1_vnum_u32(ptrue, &inSeqs[n].offset, 5);
            const svuint32_t vin6 = svld1_vnum_u32(ptrue, &inSeqs[n].offset, 6);
            const svuint32_t vin7 = svld1_vnum_u32(ptrue, &inSeqs[n].offset, 7);

            /* Add {ZSTD_REP_NUM, 0, -MINMATCH, 0} to each structures.
             * Pairs are grouped into svuint16x2_t tuples so svtbl2 below can
             * permute across two source vectors at once. */
            const svuint16x2_t vadd01 = svcreate2_u16(
                svreinterpret_u16(svadd_u32_x(ptrue, vin0, vaddition)),
                svreinterpret_u16(svadd_u32_x(ptrue, vin1, vaddition)));
            const svuint16x2_t vadd23 = svcreate2_u16(
                svreinterpret_u16(svadd_u32_x(ptrue, vin2, vaddition)),
                svreinterpret_u16(svadd_u32_x(ptrue, vin3, vaddition)));
            const svuint16x2_t vadd45 = svcreate2_u16(
                svreinterpret_u16(svadd_u32_x(ptrue, vin4, vaddition)),
                svreinterpret_u16(svadd_u32_x(ptrue, vin5, vaddition)));
            const svuint16x2_t vadd67 = svcreate2_u16(
                svreinterpret_u16(svadd_u32_x(ptrue, vin6, vaddition)),
                svreinterpret_u16(svadd_u32_x(ptrue, vin7, vaddition)));

            /* Shuffle and pack bytes so each vector contains SeqDef structures. */
            const svuint16_t vout01 = svtbl2_u16(vadd01, vmask);
            const svuint16_t vout23 = svtbl2_u16(vadd23, vmask);
            const svuint16_t vout45 = svtbl2_u16(vadd45, vmask);
            const svuint16_t vout67 = svtbl2_u16(vadd67, vmask);

            /* Pack the upper 16-bits of 32-bit lanes for overflow check. */
            const svuint16_t voverflow01 = svuzp2_u16(svget2_u16(vadd01, 0),
                                                      svget2_u16(vadd01, 1));
            const svuint16_t voverflow23 = svuzp2_u16(svget2_u16(vadd23, 0),
                                                      svget2_u16(vadd23, 1));
            const svuint16_t voverflow45 = svuzp2_u16(svget2_u16(vadd45, 0),
                                                      svget2_u16(vadd45, 1));
            const svuint16_t voverflow67 = svuzp2_u16(svget2_u16(vadd67, 0),
                                                      svget2_u16(vadd67, 1));

            /* We don't need the whole 16 bits of the overflow part. Only 1 bit
             * is needed, so we pack tightly and merge multiple vectors to be
             * able to use a single comparison to handle the overflow case.
             * However, we also need to handle the possible negative values of
             * matchLength parts, so we use signed comparison later. */
            const svint8_t voverflow =
                svmax_s8_x(pmid,
                           svtrn1_s8(svreinterpret_s8(voverflow01),
                                     svreinterpret_s8(voverflow23)),
                           svtrn1_s8(svreinterpret_s8(voverflow45),
                                     svreinterpret_s8(voverflow67)));

            /* Store `lanes` number of `SeqDef` structures from 4 vectors. */
            svst1_vnum_u32(ptrue, &dstSeqs[n].offBase, 0, svreinterpret_u32(vout01));
            svst1_vnum_u32(ptrue, &dstSeqs[n].offBase, 1, svreinterpret_u32(vout23));
            svst1_vnum_u32(ptrue, &dstSeqs[n].offBase, 2, svreinterpret_u32(vout45));
            svst1_vnum_u32(ptrue, &dstSeqs[n].offBase, 3, svreinterpret_u32(vout67));

            /* Check if any enabled lanes of the overflow vector is larger than
             * zero, only one such may happen. */
            if (UNLIKELY(cmpgtz_any_s8(pmid, voverflow))) {
                /* Scalar search for long match is needed because we merged
                 * multiple overflow bytes with `max`. */
                size_t i;
                for (i = n; i < n + lanes; i++) {
                    if (inSeqs[i].matchLength > 65535 + MINMATCH) {
                        assert(longLen == 0);
                        longLen = i + 1;
                    }
                    if (inSeqs[i].litLength > 65535) {
                        assert(longLen == 0);
                        longLen = i + nbSequences + 1;
                    }
                }
            }

            n += lanes;
            /* Safe: the enclosing `if` guarantees nbSequences >= lanes, so
             * the unsigned subtraction cannot wrap. */
        } while(n <= nbSequences - lanes);
    }

    /* Handle remaining elements. */
    for (; n < nbSequences; n++) {
        dstSeqs[n].offBase = OFFSET_TO_OFFBASE(inSeqs[n].offset);
        dstSeqs[n].litLength = (U16)inSeqs[n].litLength;
        dstSeqs[n].mlBase = (U16)(inSeqs[n].matchLength - MINMATCH);
        /* Check for long length > 65535. */
        if (UNLIKELY(inSeqs[n].matchLength > 65535 + MINMATCH)) {
            assert(longLen == 0);
            longLen = n + 1;
        }
        if (UNLIKELY(inSeqs[n].litLength > 65535)) {
            assert(longLen == 0);
            longLen = n + nbSequences + 1;
        }
    }
    return longLen;
}
7535+
73867536
#elif defined(ZSTD_ARCH_ARM_NEON) && (defined(__aarch64__) || defined(_M_ARM64))
73877537

73887538
size_t convertSequences_noRepcodes(

0 commit comments

Comments
 (0)