Skip to content

Commit 07cd78d

Browse files
AArch64: Add Neon path for convertSequences_noRepcodes
Add a 4-way Neon implementation of the convertSequences_noRepcodes function. Remove the 'static' keyword from all of its implementations so that unit tests can link against them. Relative performance vs. the Clang-18 baseline, measured with: `./fullbench -b18 -l5 enwik5` Neoverse-V2 before after Clang-18: 100.000% 311.703% Clang-19: 100.191% 311.714% Clang-20: 100.181% 311.723% GCC-13: 107.520% 252.309% GCC-14: 107.652% 253.158% GCC-15: 107.674% 253.168% Cortex-A720 before after Clang-18: 100.000% 204.512% Clang-19: 102.825% 204.600% Clang-20: 102.807% 204.558% GCC-13: 110.668% 203.594% GCC-14: 110.684% 203.978% GCC-15: 102.864% 204.299% Co-authored-by: Thomas Daubney <Thomas.Daubney@arm.com>
1 parent 8e44004 commit 07cd78d

File tree

2 files changed

+262
-6
lines changed

2 files changed

+262
-6
lines changed

lib/compress/zstd_compress.c

Lines changed: 136 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,14 @@
5656
# define ZSTD_HASHLOG3_MAX 17
5757
#endif
5858

59+
60+
/*-*************************************
61+
* Forward declarations
62+
***************************************/
63+
size_t convertSequences_noRepcodes(SeqDef* dstSeqs, const ZSTD_Sequence* inSeqs,
64+
size_t nbSequences);
65+
66+
5967
/*-*************************************
6068
* Helper functions
6169
***************************************/
@@ -7118,7 +7126,7 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* cctx,
71187126
}
71197127

71207128

7121-
#if defined(__AVX2__)
7129+
#if defined(ZSTD_ARCH_X86_AVX2)
71227130

71237131
#include <immintrin.h> /* AVX2 intrinsics */
71247132

@@ -7138,7 +7146,7 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* cctx,
71387146
* @returns > 0 if there is one long length (> 65535),
71397147
* indicating the position, and type.
71407148
*/
7141-
static size_t convertSequences_noRepcodes(
7149+
size_t convertSequences_noRepcodes(
71427150
SeqDef* dstSeqs,
71437151
const ZSTD_Sequence* inSeqs,
71447152
size_t nbSequences)
@@ -7298,7 +7306,7 @@ static size_t convertSequences_noRepcodes(
72987306
* @returns > 0 if there is one long length (> 65535),
72997307
* indicating the position, and type.
73007308
*/
7301-
static size_t convertSequences_noRepcodes(SeqDef* dstSeqs, const ZSTD_Sequence* inSeqs, size_t nbSequences) {
7309+
size_t convertSequences_noRepcodes(SeqDef* dstSeqs, const ZSTD_Sequence* inSeqs, size_t nbSequences) {
73027310
size_t longLen = 0;
73037311

73047312
/* RVV depends on the specific definition of target structures */
@@ -7375,9 +7383,131 @@ static size_t convertSequences_noRepcodes(SeqDef* dstSeqs, const ZSTD_Sequence*
73757383
* but since this implementation is targeting modern systems (>= Sapphire Rapid),
73767384
* it's not useful to develop and maintain code for older pre-AVX2 platforms */
73777385

7378-
#else /* no AVX2 */
7386+
#elif defined(ZSTD_ARCH_ARM_NEON) && (defined(__aarch64__) || defined(_M_ARM64))
7387+
7388+
/* convertSequences_noRepcodes() — AArch64 Neon 4-way implementation.
 * Converts ZSTD_Sequence structures (16 bytes each) into the internal
 * SeqDef representation (8 bytes each), 4 sequences per iteration:
 *   offBase   = offset + ZSTD_REP_NUM   (no repcode resolution)
 *   litLength = litLength truncated to 16 bits
 *   mlBase    = matchLength - MINMATCH, truncated to 16 bits
 * @returns 0 if all lengths fit into 16 bits,
 * > 0 if there is one long length (> 65535), indicating the position and type:
 *   n + 1 when inSeqs[n].matchLength > 65535 + MINMATCH,
 *   n + nbSequences + 1 when inSeqs[n].litLength > 65535.
 * At most one long length may be present (asserted in the scalar tail). */
size_t convertSequences_noRepcodes(
    SeqDef* dstSeqs,
    const ZSTD_Sequence* inSeqs,
    size_t nbSequences)
{
    size_t longLen = 0;
    size_t n = 0;

    /* Neon permutation depends on the specific definition of target structures. */
    ZSTD_STATIC_ASSERT(sizeof(ZSTD_Sequence) == 16);
    ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, offset) == 0);
    ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, litLength) == 4);
    ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, matchLength) == 8);
    ZSTD_STATIC_ASSERT(sizeof(SeqDef) == 8);
    ZSTD_STATIC_ASSERT(offsetof(SeqDef, offBase) == 0);
    ZSTD_STATIC_ASSERT(offsetof(SeqDef, litLength) == 4);
    ZSTD_STATIC_ASSERT(offsetof(SeqDef, mlBase) == 6);

    if (nbSequences > 3) {
        /* Added to {offset, litLength, matchLength, rep} in one vector add:
         * turns offset into offBase and matchLength into mlBase. */
        static const ZSTD_ALIGNED(16) U32 constAddition[4] = {
            ZSTD_REP_NUM, 0, -MINMATCH, 0
        };
        /* Byte-select table for vqtbl2q_u8: per 32-byte pair of sequences,
         * keep offBase bytes 0-3, litLength bytes 4-5 and mlBase bytes 8-9,
         * packing 2 complete 8-byte SeqDef into one 16-byte output vector. */
        static const ZSTD_ALIGNED(16) U8 constMask[16] = {
            0, 1, 2, 3, 4, 5, 8, 9, 16, 17, 18, 19, 20, 21, 24, 25
        };
        /* 1-based position counters: lanes 0-3 track sequence n+0,
         * lanes 4-7 track sequence n+1. vindex23 reuses the same counter
         * for sequences n+2/n+3, and is fixed up by +2 after the loop. */
        static const ZSTD_ALIGNED(16) U16 constCounter[8] = {
            1, 1, 1, 1, 2, 2, 2, 2
        };

        const uint32x4_t vaddition = vld1q_u32(constAddition);
        const uint8x16_t vmask = vld1q_u8(constMask);
        uint16x8_t vcounter = vld1q_u16(constCounter);
        uint16x8_t vindex01 = vdupq_n_u16(0); /* last long-length position, seqs n+0/n+1 */
        uint16x8_t vindex23 = vdupq_n_u16(0); /* last long-length position, seqs n+2/n+3 */

        do {
            /* Load 4 ZSTD_Sequence (64 bytes). */
            const uint32x4_t vin0 = vld1q_u32(&inSeqs[n + 0].offset);
            const uint32x4_t vin1 = vld1q_u32(&inSeqs[n + 1].offset);
            const uint32x4_t vin2 = vld1q_u32(&inSeqs[n + 2].offset);
            const uint32x4_t vin3 = vld1q_u32(&inSeqs[n + 3].offset);

            /* Add {ZSTD_REP_NUM, 0, -MINMATCH, 0} to each vector. */
            const uint8x16x2_t vadd01 = { {
                vreinterpretq_u8_u32(vaddq_u32(vin0, vaddition)),
                vreinterpretq_u8_u32(vaddq_u32(vin1, vaddition)),
            } };
            const uint8x16x2_t vadd23 = { {
                vreinterpretq_u8_u32(vaddq_u32(vin2, vaddition)),
                vreinterpretq_u8_u32(vaddq_u32(vin3, vaddition)),
            } };

            /* Shuffle and pack bytes so each vector contains 2 SeqDef structures. */
            const uint8x16_t vout01 = vqtbl2q_u8(vadd01, vmask);
            const uint8x16_t vout23 = vqtbl2q_u8(vadd23, vmask);

            /* Pack the upper 16-bits of 32-bit lanes for overflow check;
             * lanes become {off_hi, lit_hi, ml_hi, rep_hi} per sequence. */
            uint16x8_t voverflow01 = vuzp2q_u16(vreinterpretq_u16_u8(vadd01.val[0]),
                                               vreinterpretq_u16_u8(vadd01.val[1]));
            uint16x8_t voverflow23 = vuzp2q_u16(vreinterpretq_u16_u8(vadd23.val[0]),
                                               vreinterpretq_u16_u8(vadd23.val[1]));

            /* Store 4 SeqDef structures. */
            vst1q_u32(&dstSeqs[n + 0].offBase, vreinterpretq_u32_u8(vout01));
            vst1q_u32(&dstSeqs[n + 2].offBase, vreinterpretq_u32_u8(vout23));

            /* Create masks in case of overflow: a non-zero high half means
             * the 32-bit field exceeded 65535.
             * NOTE(review): the signed >0 test assumes high halves stay below
             * 0x8000 (field values < 2^31) — confirm against the documented
             * sequence limits. Offset/rep lanes can also set their mask, but
             * their index lanes are discarded in the reduction below. */
            voverflow01 = vcgtzq_s16(vreinterpretq_s16_u16(voverflow01));
            voverflow23 = vcgtzq_s16(vreinterpretq_s16_u16(voverflow23));

            /* Update overflow indices. */
            vindex01 = vbslq_u16(voverflow01, vcounter, vindex01);
            vindex23 = vbslq_u16(voverflow23, vcounter, vindex23);

            /* Update counter for overflow check. */
            vcounter = vaddq_u16(vcounter, vdupq_n_u16(4));

            n += 4;
        } while(n < nbSequences - 3);

        /* Fixup indices in the second vector, we saved an additional counter
           in the loop to update the second overflow index, we need to add 2
           here when the indices are not 0.
           (vtstq_u16 yields all-ones, i.e. -1, for non-zero lanes;
           subtracting it twice adds 2.) */
        { uint16x8_t nonzero = vtstq_u16(vindex23, vindex23);
          vindex23 = vsubq_u16(vindex23, nonzero);
          vindex23 = vsubq_u16(vindex23, nonzero);
        }

        /* Merge indices in the vectors, maximums are needed. */
        vindex01 = vmaxq_u16(vindex01, vindex23);
        vindex01 = vmaxq_u16(vindex01, vextq_u16(vindex01, vindex01, 4));

        /* Compute `longLen`, maximums of matchLength and litLength
           with a preference on litLength.
           Lane 0 of the reduced vector, read as a U64, holds:
           bits 0-15 offset index, 16-31 litLength index,
           32-47 matchLength index, 48-63 rep index;
           only the litLength/matchLength indices are meaningful. */
        { U64 maxLitMatchIndices = vgetq_lane_u64(vreinterpretq_u64_u16(vindex01), 0);
          size_t maxLitIndex = (maxLitMatchIndices >> 16) & 0xFFFF;
          size_t maxMatchIndex = (maxLitMatchIndices >> 32) & 0xFFFF;
          longLen = maxLitIndex > maxMatchIndex ? maxLitIndex + nbSequences
                                                : maxMatchIndex;
        }
    }

    /* Handle remaining elements (scalar tail: fewer than 4 sequences left). */
    for (; n < nbSequences; n++) {
        dstSeqs[n].offBase = OFFSET_TO_OFFBASE(inSeqs[n].offset);
        dstSeqs[n].litLength = (U16)inSeqs[n].litLength;
        dstSeqs[n].mlBase = (U16)(inSeqs[n].matchLength - MINMATCH);
        /* Check for long length > 65535. */
        if (UNLIKELY(inSeqs[n].matchLength > 65535 + MINMATCH)) {
            assert(longLen == 0);
            longLen = n + 1;
        }
        if (UNLIKELY(inSeqs[n].litLength > 65535)) {
            assert(longLen == 0);
            longLen = n + nbSequences + 1;
        }
    }
    return longLen;
}
7507+
7508+
#else /* No vectorization. */
73797509

7380-
static size_t convertSequences_noRepcodes(
7510+
size_t convertSequences_noRepcodes(
73817511
SeqDef* dstSeqs,
73827512
const ZSTD_Sequence* inSeqs,
73837513
size_t nbSequences)
@@ -7388,7 +7518,7 @@ static size_t convertSequences_noRepcodes(
73887518
dstSeqs[n].offBase = OFFSET_TO_OFFBASE(inSeqs[n].offset);
73897519
dstSeqs[n].litLength = (U16)inSeqs[n].litLength;
73907520
dstSeqs[n].mlBase = (U16)(inSeqs[n].matchLength - MINMATCH);
7391-
/* check for long length > 65535 */
7521+
/* Check for long length > 65535. */
73927522
if (UNLIKELY(inSeqs[n].matchLength > 65535+MINMATCH)) {
73937523
assert(longLen == 0);
73947524
longLen = n + 1;

tests/fuzzer.c

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -770,6 +770,130 @@ static void test_blockSplitter_incompressibleExpansionProtection(unsigned testNb
770770
DISPLAYLEVEL(3, "OK \n");
771771
}
772772

773+
size_t convertSequences_noRepcodes(SeqDef* dstSeqs, const ZSTD_Sequence* inSeqs,
774+
size_t nbSequences);
775+
776+
static size_t convertSequences_noRepcodes_ref(
777+
SeqDef* dstSeqs,
778+
const ZSTD_Sequence* inSeqs,
779+
size_t nbSequences)
780+
{
781+
size_t longLen = 0;
782+
size_t n;
783+
for (n=0; n<nbSequences; n++) {
784+
dstSeqs[n].offBase = OFFSET_TO_OFFBASE(inSeqs[n].offset);
785+
dstSeqs[n].litLength = (U16)inSeqs[n].litLength;
786+
dstSeqs[n].mlBase = (U16)(inSeqs[n].matchLength - MINMATCH);
787+
/* Check for long length > 65535. */
788+
if (UNLIKELY(inSeqs[n].matchLength > 65535+MINMATCH)) {
789+
assert(longLen == 0);
790+
longLen = n + 1;
791+
}
792+
if (UNLIKELY(inSeqs[n].litLength > 65535)) {
793+
assert(longLen == 0);
794+
longLen = n + nbSequences + 1;
795+
}
796+
}
797+
return longLen;
798+
}
799+
800+
/* test_convertSequences_noRepcodes() :
 * Cross-checks convertSequences_noRepcodes() against the scalar reference
 * above, over every input length 0..12, over shifted input windows (to move
 * elements across the 4-wide vector groups), and with deliberately planted
 * matchLength/litLength overflows.
 * @returns the updated test number. */
static unsigned test_convertSequences_noRepcodes(unsigned seed, unsigned testNb)
{
    ZSTD_Sequence nsrc[12];
    SeqDef ndst[12], rdst[12];
    size_t ref, ret, i, j;

    /* Fill inputs from a simple multiplicative generator so that runs
     * are reproducible for a given seed. */
    seed += 0xDEADBEEF;
    for (i = 0; i < COUNTOF(nsrc); ++i) {
        seed = 48271 * ((unsigned)i + seed);
        nsrc[i].offset = (seed & 0xFFFF) | 1; /* Offset shall not be zero. */
        seed = 48271 * ((unsigned)i + seed);
        nsrc[i].litLength = seed & 0xFFFF;
        seed = 48271 * ((unsigned)i + seed);
        /* Keep matchLength within [0, 65535+MINMATCH] so no element counts
         * as "long" in the first pass below. */
        nsrc[i].matchLength = (seed & 0xFFFFFF) % (65536 + MINMATCH);
        seed = 48271 * ((unsigned)i + seed);
        nsrc[i].rep = seed & 0xFF;
    }

    /* For near overflow and proper negative value handling. */
    nsrc[5].matchLength = 65535 + MINMATCH; /* largest non-long matchLength */
    nsrc[6].litLength = 65535;              /* largest non-long litLength */
    nsrc[6].matchLength = 0;                /* matchLength - MINMATCH wraps; both impls must agree */
    nsrc[7].litLength = 0;
    nsrc[7].matchLength = MINMATCH;         /* smallest matchLength with mlBase == 0 */

    /* No long lengths planted yet: compare outputs for every prefix length. */
    for (i = 0; i <= COUNTOF(nsrc); ++i) {
        DISPLAYLEVEL(3, "test%3u : convertSequences_noRepcodes with %u inputs : ",
                     testNb++, (unsigned)i);
        memset(ndst, 0, sizeof(ndst));
        memset(rdst, 0, sizeof(rdst));
        ref = convertSequences_noRepcodes_ref(rdst, nsrc, i);
        ret = convertSequences_noRepcodes(ndst, nsrc, i);
        CHECK_EQ(ret, ref);
        CHECK_EQ(memcmp(rdst, ndst, sizeof(ndst)), 0);
        DISPLAYLEVEL(3, "OK \n");
    }

    /* Plant a long matchLength at index 7, then re-test full and shifted
     * windows (nsrc + j) so the long element lands at different positions
     * within the 4-wide vector groups. */
    nsrc[7].matchLength = 65536 + MINMATCH;
    for (i = 8; i <= COUNTOF(nsrc); ++i) {
        DISPLAYLEVEL(3, "test%3u : convertSequences_noRepcodes with %u inputs and "
                        "matchLength overflow : ",
                     testNb++, (unsigned)i);
        memset(ndst, 0, sizeof(ndst));
        memset(rdst, 0, sizeof(rdst));
        ref = convertSequences_noRepcodes_ref(rdst, nsrc, i);
        ret = convertSequences_noRepcodes(ndst, nsrc, i);
        CHECK_EQ(ret, ref);
        CHECK_EQ(memcmp(rdst, ndst, sizeof(ndst)), 0);
        DISPLAYLEVEL(3, "OK \n");

        assert(COUNTOF(nsrc) > 8);
        for (j = 4; j < 8; ++j) {
            DISPLAYLEVEL(3, "test%3u : convertSequences_noRepcodes with %u inputs and "
                            "matchLength overflow #%u : ",
                         testNb++, (unsigned)i, (unsigned)(i - j));
            memset(ndst, 0, sizeof(ndst));
            memset(rdst, 0, sizeof(rdst));
            ref = convertSequences_noRepcodes_ref(rdst, nsrc + j, i - j);
            ret = convertSequences_noRepcodes(ndst, nsrc + j, i - j);
            CHECK_EQ(ret, ref);
            CHECK_EQ(memcmp(rdst, ndst, sizeof(ndst)), 0);
            DISPLAYLEVEL(3, "OK \n");
        }
    }
    nsrc[7].matchLength = 1; /* restore a non-long matchLength */

    /* Same scheme, with a long litLength at index 7. */
    nsrc[7].litLength = 65536;
    for (i = 8; i <= COUNTOF(nsrc); ++i) {
        DISPLAYLEVEL(3, "test%3u : convertSequences_noRepcodes with %u inputs and "
                        "litLength overflow: ",
                     testNb++, (unsigned)i);
        memset(ndst, 0, sizeof(ndst));
        memset(rdst, 0, sizeof(rdst));
        ref = convertSequences_noRepcodes_ref(rdst, nsrc, i);
        ret = convertSequences_noRepcodes(ndst, nsrc, i);
        CHECK_EQ(ret, ref);
        CHECK_EQ(memcmp(rdst, ndst, sizeof(ndst)), 0);
        DISPLAYLEVEL(3, "OK \n");

        assert(COUNTOF(nsrc) > 8);
        for (j = 4; j < 8; ++j) {
            DISPLAYLEVEL(3, "test%3u : convertSequences_noRepcodes with %u inputs and "
                            "litLength overflow #%u: ",
                         testNb++, (unsigned)i, (unsigned)(i - j));
            memset(ndst, 0, sizeof(ndst));
            memset(rdst, 0, sizeof(rdst));
            ref = convertSequences_noRepcodes_ref(rdst, nsrc + j, i - j);
            ret = convertSequences_noRepcodes(ndst, nsrc + j, i - j);
            CHECK_EQ(ret, ref);
            CHECK_EQ(memcmp(rdst, ndst, sizeof(ndst)), 0);
            DISPLAYLEVEL(3, "OK \n");
        }
    }

    return testNb;
}
896+
773897
static unsigned test_get1BlockSummary(unsigned testNb)
774898
{
775899
static const ZSTD_Sequence nseqs[] = {
@@ -4085,6 +4209,8 @@ static int basicUnitTests(U32 const seed, double compressibility)
40854209
}
40864210
DISPLAYLEVEL(3, "OK \n");
40874211

4212+
testNb = test_convertSequences_noRepcodes(seed, testNb);
4213+
40884214
testNb = test_get1BlockSummary(testNb);
40894215

40904216
DISPLAYLEVEL(3, "test%3i : ZSTD_compressSequencesAndLiterals : ", testNb++);

0 commit comments

Comments
 (0)