@@ -717,26 +717,15 @@ void carquet_sse_unpack_bools(const uint8_t* input, uint8_t* output, int64_t cou
717717void carquet_sse_pack_bools (const uint8_t * input , uint8_t * output , int64_t count ) {
718718 int64_t i = 0 ;
719719
720- /* Process 8 bools (1 output byte) at a time */
720+ /* Process 8 bools (1 output byte) at a time using movemask trick:
721+ * 1. Load 8 bytes (each 0 or 1)
722+ * 2. Shift left by 7 within 32-bit lanes: moves bit 0 of each byte to bit 7
723+ * 3. movemask extracts bit 7 from each byte position
724+ */
721725 for (; i + 8 <= count ; i += 8 ) {
722726 __m128i bools = _mm_loadl_epi64 ((const __m128i * )(input + i ));
723-
724- /* Multiply by bit positions: 1, 2, 4, 8, 16, 32, 64, 128 */
725- __m128i mult = _mm_set_epi8 (0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
726- (char )128 , 64 , 32 , 16 , 8 , 4 , 2 , 1 );
727-
728- /* Zero extend to 16-bit, multiply, and sum */
729- __m128i zero = _mm_setzero_si128 ();
730- __m128i words = _mm_unpacklo_epi8 (bools , zero );
731- __m128i mwords = _mm_unpacklo_epi8 (mult , zero );
732-
733- /* Horizontal operations to sum */
734- __m128i prod = _mm_mullo_epi16 (words , mwords );
735- prod = _mm_add_epi16 (prod , _mm_srli_si128 (prod , 2 ));
736- prod = _mm_add_epi16 (prod , _mm_srli_si128 (prod , 4 ));
737- prod = _mm_add_epi16 (prod , _mm_srli_si128 (prod , 8 ));
738-
739- output [i / 8 ] = (uint8_t )_mm_extract_epi16 (prod , 0 );
727+ __m128i shifted = _mm_slli_epi32 (bools , 7 );
728+ output [i / 8 ] = (uint8_t )_mm_movemask_epi8 (shifted );
740729 }
741730
742731 /* Handle remaining */
@@ -911,26 +900,18 @@ void carquet_sse_build_null_bitmap(const int16_t* def_levels, int64_t count,
911900 int64_t i = 0 ;
912901
913902 __m128i max_vec = _mm_set1_epi16 (max_def_level );
903+ __m128i zero = _mm_setzero_si128 ();
914904
915905 /* Process 8 int16_t values -> 1 byte of bitmap */
916906 int64_t full_bytes = count / 8 ;
917907 for (int64_t b = 0 ; b < full_bytes ; b ++ ) {
918908 __m128i levels = _mm_loadu_si128 ((const __m128i * )(def_levels + b * 8 ));
919- /* levels < max_def means null */
909+ /* levels < max_def means null: result is 0x0000 or 0xFFFF per lane */
920910 __m128i cmp = _mm_cmplt_epi16 (levels , max_vec );
921- /* Pack 8 comparison results to 8 bits */
922- int mask = _mm_movemask_epi8 (cmp );
923- /* Take every other bit (since int16 comparisons set 2 bytes each) */
924- uint8_t null_bits = 0 ;
925- if (mask & 0x0001 ) null_bits |= 0x01 ;
926- if (mask & 0x0004 ) null_bits |= 0x02 ;
927- if (mask & 0x0010 ) null_bits |= 0x04 ;
928- if (mask & 0x0040 ) null_bits |= 0x08 ;
929- if (mask & 0x0100 ) null_bits |= 0x10 ;
930- if (mask & 0x0400 ) null_bits |= 0x20 ;
931- if (mask & 0x1000 ) null_bits |= 0x40 ;
932- if (mask & 0x4000 ) null_bits |= 0x80 ;
933- null_bitmap [b ] = null_bits ;
911+ /* Pack 8 int16 results (0x0000 or 0xFFFF) to 8 int8 (0x00 or 0xFF) */
912+ __m128i packed = _mm_packs_epi16 (cmp , zero );
913+ /* movemask extracts bit 7 from each byte -> 8-bit result in low byte */
914+ null_bitmap [b ] = (uint8_t )_mm_movemask_epi8 (packed );
934915 i += 8 ;
935916 }
936917
0 commit comments