Skip to content

Commit 33468dd

Browse files
committed
Added NEON code optimizations
1 parent deb4e63 commit 33468dd

File tree

1 file changed

+13
-32
lines changed

1 file changed

+13
-32
lines changed

src/simd/x86/sse_ops.c

Lines changed: 13 additions & 32 deletions
Original file line number | Diff line number | Diff line change
@@ -717,26 +717,15 @@ void carquet_sse_unpack_bools(const uint8_t* input, uint8_t* output, int64_t cou
717717
void carquet_sse_pack_bools(const uint8_t* input, uint8_t* output, int64_t count) {
718718
int64_t i = 0;
719719

720-
/* Process 8 bools (1 output byte) at a time */
720+
/* Process 8 bools (1 output byte) at a time using movemask trick:
721+
* 1. Load 8 bytes (each 0 or 1)
722+
* 2. Shift left by 7 within 32-bit lanes: moves bit 0 of each byte to bit 7
723+
* 3. movemask extracts bit 7 from each byte position
724+
*/
721725
for (; i + 8 <= count; i += 8) {
722726
__m128i bools = _mm_loadl_epi64((const __m128i*)(input + i));
723-
724-
/* Multiply by bit positions: 1, 2, 4, 8, 16, 32, 64, 128 */
725-
__m128i mult = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
726-
(char)128, 64, 32, 16, 8, 4, 2, 1);
727-
728-
/* Zero extend to 16-bit, multiply, and sum */
729-
__m128i zero = _mm_setzero_si128();
730-
__m128i words = _mm_unpacklo_epi8(bools, zero);
731-
__m128i mwords = _mm_unpacklo_epi8(mult, zero);
732-
733-
/* Horizontal operations to sum */
734-
__m128i prod = _mm_mullo_epi16(words, mwords);
735-
prod = _mm_add_epi16(prod, _mm_srli_si128(prod, 2));
736-
prod = _mm_add_epi16(prod, _mm_srli_si128(prod, 4));
737-
prod = _mm_add_epi16(prod, _mm_srli_si128(prod, 8));
738-
739-
output[i / 8] = (uint8_t)_mm_extract_epi16(prod, 0);
727+
__m128i shifted = _mm_slli_epi32(bools, 7);
728+
output[i / 8] = (uint8_t)_mm_movemask_epi8(shifted);
740729
}
741730

742731
/* Handle remaining */
@@ -911,26 +900,18 @@ void carquet_sse_build_null_bitmap(const int16_t* def_levels, int64_t count,
911900
int64_t i = 0;
912901

913902
__m128i max_vec = _mm_set1_epi16(max_def_level);
903+
__m128i zero = _mm_setzero_si128();
914904

915905
/* Process 8 int16_t values -> 1 byte of bitmap */
916906
int64_t full_bytes = count / 8;
917907
for (int64_t b = 0; b < full_bytes; b++) {
918908
__m128i levels = _mm_loadu_si128((const __m128i*)(def_levels + b * 8));
919-
/* levels < max_def means null */
909+
/* levels < max_def means null: result is 0x0000 or 0xFFFF per lane */
920910
__m128i cmp = _mm_cmplt_epi16(levels, max_vec);
921-
/* Pack 8 comparison results to 8 bits */
922-
int mask = _mm_movemask_epi8(cmp);
923-
/* Take every other bit (since int16 comparisons set 2 bytes each) */
924-
uint8_t null_bits = 0;
925-
if (mask & 0x0001) null_bits |= 0x01;
926-
if (mask & 0x0004) null_bits |= 0x02;
927-
if (mask & 0x0010) null_bits |= 0x04;
928-
if (mask & 0x0040) null_bits |= 0x08;
929-
if (mask & 0x0100) null_bits |= 0x10;
930-
if (mask & 0x0400) null_bits |= 0x20;
931-
if (mask & 0x1000) null_bits |= 0x40;
932-
if (mask & 0x4000) null_bits |= 0x80;
933-
null_bitmap[b] = null_bits;
911+
/* Pack 8 int16 results (0x0000 or 0xFFFF) to 8 int8 (0x00 or 0xFF) */
912+
__m128i packed = _mm_packs_epi16(cmp, zero);
913+
/* movemask extracts bit 7 from each byte -> 8-bit result in low byte */
914+
null_bitmap[b] = (uint8_t)_mm_movemask_epi8(packed);
934915
i += 8;
935916
}
936917

0 commit comments

Comments
 (0)