|
8 | 8 | #include "core/bitpack.h" |
9 | 9 | #include <string.h> |
10 | 10 |
|
11 | | -#if defined(__ARM_NEON) || defined(__ARM_NEON__) |
| 11 | +#if defined(__SSE2__) |
| 12 | +#include <emmintrin.h> |
| 13 | +#elif defined(__ARM_NEON) || defined(__ARM_NEON__) |
12 | 14 | #include <arm_neon.h> |
13 | 15 | #endif |
14 | 16 |
|
@@ -486,7 +488,15 @@ int64_t carquet_rle_decode_levels( |
486 | 488 | int16_t* dst = output + count; |
487 | 489 | int64_t i = 0; |
488 | 490 |
|
489 | | -#if defined(__ARM_NEON) || defined(__ARM_NEON__) |
| 491 | +#if defined(__SSE2__) |
| 492 | + /* SSE2: fill 8 int16_t at a time */ |
| 493 | + if (to_fill >= 8) { |
| 494 | + __m128i vval = _mm_set1_epi16(val16); |
| 495 | + for (; i + 8 <= to_fill; i += 8) { |
| 496 | + _mm_storeu_si128((__m128i*)(dst + i), vval); |
| 497 | + } |
| 498 | + } |
| 499 | +#elif defined(__ARM_NEON) || defined(__ARM_NEON__) |
490 | 500 | /* NEON: fill 8 int16_t at a time */ |
491 | 501 | if (to_fill >= 8) { |
492 | 502 | int16x8_t vval = vdupq_n_s16(val16); |
@@ -523,7 +533,17 @@ int64_t carquet_rle_decode_levels( |
523 | 533 | to_store = max_values - count; |
524 | 534 | } |
525 | 535 |
|
526 | | -#if defined(__ARM_NEON) || defined(__ARM_NEON__) |
| 536 | +#if defined(__SSE2__) |
| 537 | + if (to_store == 8) { |
| 538 | + /* SSE2: load 2x4 uint32_t (lanes reinterpreted as signed 32-bit), pack with signed saturation to 8 int16_t */ |
| 539 | + __m128i v0 = _mm_loadu_si128((const __m128i*)temp); |
| 540 | + __m128i v1 = _mm_loadu_si128((const __m128i*)(temp + 4)); |
| 541 | + __m128i packed = _mm_packs_epi32(v0, v1); |
| 542 | + _mm_storeu_si128((__m128i*)(output + count), packed); |
| 543 | + count += 8; |
| 544 | + continue; |
| 545 | + } |
| 546 | +#elif defined(__ARM_NEON) || defined(__ARM_NEON__) |
527 | 547 | if (to_store == 8) { |
528 | 548 | /* NEON: load 8 uint32_t, narrow to int16_t */ |
529 | 549 | uint32x4_t v0 = vld1q_u32(temp); |
|
0 commit comments