Skip to content

Commit 310d5ce

Browse files
committed
Added SSE2 SIMD paths to carquet_rle_decode_levels
1 parent 434e9e7 commit 310d5ce

File tree

1 file changed

+23
-3
lines changed

1 file changed

+23
-3
lines changed

src/encoding/rle.c

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,9 @@
88
#include "core/bitpack.h"
99
#include <string.h>
1010

11-
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
11+
#if defined(__SSE2__)
12+
#include <emmintrin.h>
13+
#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
1214
#include <arm_neon.h>
1315
#endif
1416

@@ -486,7 +488,15 @@ int64_t carquet_rle_decode_levels(
486488
int16_t* dst = output + count;
487489
int64_t i = 0;
488490

489-
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
491+
#if defined(__SSE2__)
492+
/* SSE2: fill 8 int16_t at a time */
493+
if (to_fill >= 8) {
494+
__m128i vval = _mm_set1_epi16(val16);
495+
for (; i + 8 <= to_fill; i += 8) {
496+
_mm_storeu_si128((__m128i*)(dst + i), vval);
497+
}
498+
}
499+
#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
490500
/* NEON: fill 8 int16_t at a time */
491501
if (to_fill >= 8) {
492502
int16x8_t vval = vdupq_n_s16(val16);
@@ -523,7 +533,17 @@ int64_t carquet_rle_decode_levels(
523533
to_store = max_values - count;
524534
}
525535

526-
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
536+
#if defined(__SSE2__)
537+
if (to_store == 8) {
538+
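/* SSE2: load 8 32-bit values as 2x4 lanes; _mm_packs_epi32 treats each lane as signed int32 and saturates to int16_t (out-of-range values clamp to INT16_MIN/INT16_MAX instead of wrapping) */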
539+
__m128i v0 = _mm_loadu_si128((const __m128i*)temp);
540+
__m128i v1 = _mm_loadu_si128((const __m128i*)(temp + 4));
541+
__m128i packed = _mm_packs_epi32(v0, v1);
542+
_mm_storeu_si128((__m128i*)(output + count), packed);
543+
count += 8;
544+
continue;
545+
}
546+
#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
527547
if (to_store == 8) {
528548
/* NEON: load 8 uint32_t, narrow to int16_t */
529549
uint32x4_t v0 = vld1q_u32(temp);

0 commit comments

Comments
 (0)