@@ -371,6 +371,7 @@ void carquet_neon_byte_stream_split_decode_float(
371371
372372/**
373373 * Encode doubles using byte stream split with NEON.
374+ * Uses table lookup for efficient transpose of 2 doubles (16 bytes) at a time.
374375 */
375376void carquet_neon_byte_stream_split_encode_double (
376377 const double * values ,
@@ -380,13 +381,52 @@ void carquet_neon_byte_stream_split_encode_double(
380381 const uint8_t * src = (const uint8_t * )values ;
381382 int64_t i = 0 ;
382383
383- /* Process 2 doubles (16 bytes) at a time */
384+ /* Process 2 doubles (16 bytes) at a time with NEON table lookup */
384385 for (; i + 2 <= count ; i += 2 ) {
385- /* Transpose: extract each byte position from both doubles */
386- for (int b = 0 ; b < 8 ; b ++ ) {
387- output [b * count + i + 0 ] = src [i * 8 + b ];
388- output [b * count + i + 1 ] = src [i * 8 + 8 + b ];
389- }
386+ /* Load 2 doubles = 16 bytes */
387+ uint8x16_t v = vld1q_u8 (src + i * 8 );
388+
389+ /* v = [a0 a1 a2 a3 a4 a5 a6 a7 | b0 b1 b2 b3 b4 b5 b6 b7]
390+ * Want streams: [a0 b0], [a1 b1], [a2 b2], ... [a7 b7]
391+ */
392+
393+ /* Table indices to extract byte pairs */
394+ static const uint8_t tbl_byte0 [16 ] = {0 , 8 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 };
395+ static const uint8_t tbl_byte1 [16 ] = {1 , 9 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 };
396+ static const uint8_t tbl_byte2 [16 ] = {2 , 10 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 };
397+ static const uint8_t tbl_byte3 [16 ] = {3 , 11 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 };
398+ static const uint8_t tbl_byte4 [16 ] = {4 , 12 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 };
399+ static const uint8_t tbl_byte5 [16 ] = {5 , 13 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 };
400+ static const uint8_t tbl_byte6 [16 ] = {6 , 14 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 };
401+ static const uint8_t tbl_byte7 [16 ] = {7 , 15 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 };
402+
403+ uint8x16_t out0 = vqtbl1q_u8 (v , vld1q_u8 (tbl_byte0 ));
404+ uint8x16_t out1 = vqtbl1q_u8 (v , vld1q_u8 (tbl_byte1 ));
405+ uint8x16_t out2 = vqtbl1q_u8 (v , vld1q_u8 (tbl_byte2 ));
406+ uint8x16_t out3 = vqtbl1q_u8 (v , vld1q_u8 (tbl_byte3 ));
407+ uint8x16_t out4 = vqtbl1q_u8 (v , vld1q_u8 (tbl_byte4 ));
408+ uint8x16_t out5 = vqtbl1q_u8 (v , vld1q_u8 (tbl_byte5 ));
409+ uint8x16_t out6 = vqtbl1q_u8 (v , vld1q_u8 (tbl_byte6 ));
410+ uint8x16_t out7 = vqtbl1q_u8 (v , vld1q_u8 (tbl_byte7 ));
411+
412+ /* Extract first 2 bytes and store to each stream */
413+ uint16_t t0 = vgetq_lane_u16 (vreinterpretq_u16_u8 (out0 ), 0 );
414+ uint16_t t1 = vgetq_lane_u16 (vreinterpretq_u16_u8 (out1 ), 0 );
415+ uint16_t t2 = vgetq_lane_u16 (vreinterpretq_u16_u8 (out2 ), 0 );
416+ uint16_t t3 = vgetq_lane_u16 (vreinterpretq_u16_u8 (out3 ), 0 );
417+ uint16_t t4 = vgetq_lane_u16 (vreinterpretq_u16_u8 (out4 ), 0 );
418+ uint16_t t5 = vgetq_lane_u16 (vreinterpretq_u16_u8 (out5 ), 0 );
419+ uint16_t t6 = vgetq_lane_u16 (vreinterpretq_u16_u8 (out6 ), 0 );
420+ uint16_t t7 = vgetq_lane_u16 (vreinterpretq_u16_u8 (out7 ), 0 );
421+
422+ memcpy (output + 0 * count + i , & t0 , sizeof (uint16_t ));
423+ memcpy (output + 1 * count + i , & t1 , sizeof (uint16_t ));
424+ memcpy (output + 2 * count + i , & t2 , sizeof (uint16_t ));
425+ memcpy (output + 3 * count + i , & t3 , sizeof (uint16_t ));
426+ memcpy (output + 4 * count + i , & t4 , sizeof (uint16_t ));
427+ memcpy (output + 5 * count + i , & t5 , sizeof (uint16_t ));
428+ memcpy (output + 6 * count + i , & t6 , sizeof (uint16_t ));
429+ memcpy (output + 7 * count + i , & t7 , sizeof (uint16_t ));
390430 }
391431
392432 /* Handle remaining values */
@@ -399,6 +439,7 @@ void carquet_neon_byte_stream_split_encode_double(
399439
400440/**
401441 * Decode byte stream split doubles using NEON.
442+ * Gathers bytes from 8 streams and interleaves them back into doubles.
402443 */
403444void carquet_neon_byte_stream_split_decode_double (
404445 const uint8_t * data ,
@@ -410,11 +451,56 @@ void carquet_neon_byte_stream_split_decode_double(
410451
411452 /* Process 2 doubles at a time */
412453 for (; i + 2 <= count ; i += 2 ) {
413- /* Gather bytes from 8 streams */
414- for (int b = 0 ; b < 8 ; b ++ ) {
415- dst [i * 8 + b ] = data [b * count + i ];
416- dst [i * 8 + 8 + b ] = data [b * count + i + 1 ];
417- }
454+ /* Load 2 bytes from each of the 8 streams */
455+ uint16_t b0 , b1 , b2 , b3 , b4 , b5 , b6 , b7 ;
456+ memcpy (& b0 , data + 0 * count + i , sizeof (uint16_t ));
457+ memcpy (& b1 , data + 1 * count + i , sizeof (uint16_t ));
458+ memcpy (& b2 , data + 2 * count + i , sizeof (uint16_t ));
459+ memcpy (& b3 , data + 3 * count + i , sizeof (uint16_t ));
460+ memcpy (& b4 , data + 4 * count + i , sizeof (uint16_t ));
461+ memcpy (& b5 , data + 5 * count + i , sizeof (uint16_t ));
462+ memcpy (& b6 , data + 6 * count + i , sizeof (uint16_t ));
463+ memcpy (& b7 , data + 7 * count + i , sizeof (uint16_t ));
464+
465+ /* Create vectors with each byte pair */
466+ uint8x8_t bytes0 = vreinterpret_u8_u16 (vdup_n_u16 (b0 ));
467+ uint8x8_t bytes1 = vreinterpret_u8_u16 (vdup_n_u16 (b1 ));
468+ uint8x8_t bytes2 = vreinterpret_u8_u16 (vdup_n_u16 (b2 ));
469+ uint8x8_t bytes3 = vreinterpret_u8_u16 (vdup_n_u16 (b3 ));
470+ uint8x8_t bytes4 = vreinterpret_u8_u16 (vdup_n_u16 (b4 ));
471+ uint8x8_t bytes5 = vreinterpret_u8_u16 (vdup_n_u16 (b5 ));
472+ uint8x8_t bytes6 = vreinterpret_u8_u16 (vdup_n_u16 (b6 ));
473+ uint8x8_t bytes7 = vreinterpret_u8_u16 (vdup_n_u16 (b7 ));
474+
475+ /* Interleave bytes to reconstruct doubles:
476+ * Input: b0=[a0,b0], b1=[a1,b1], ..., b7=[a7,b7]
477+ * Output: [a0,a1,a2,a3,a4,a5,a6,a7, b0,b1,b2,b3,b4,b5,b6,b7]
478+ */
479+
480+ /* Interleave pairs of byte streams */
481+ uint8x8x2_t zip01 = vzip_u8 (bytes0 , bytes1 ); /* [a0,a1,b0,b1,...] */
482+ uint8x8x2_t zip23 = vzip_u8 (bytes2 , bytes3 );
483+ uint8x8x2_t zip45 = vzip_u8 (bytes4 , bytes5 );
484+ uint8x8x2_t zip67 = vzip_u8 (bytes6 , bytes7 );
485+
486+ /* Now interleave 16-bit pairs */
487+ uint16x4_t lo01 = vreinterpret_u16_u8 (zip01 .val [0 ]);
488+ uint16x4_t lo23 = vreinterpret_u16_u8 (zip23 .val [0 ]);
489+ uint16x4_t lo45 = vreinterpret_u16_u8 (zip45 .val [0 ]);
490+ uint16x4_t lo67 = vreinterpret_u16_u8 (zip67 .val [0 ]);
491+
492+ uint16x4x2_t zip0123 = vzip_u16 (lo01 , lo23 );
493+ uint16x4x2_t zip4567 = vzip_u16 (lo45 , lo67 );
494+
495+ /* Interleave 32-bit pairs */
496+ uint32x2_t lo0123 = vreinterpret_u32_u16 (zip0123 .val [0 ]);
497+ uint32x2_t lo4567 = vreinterpret_u32_u16 (zip4567 .val [0 ]);
498+
499+ uint32x2x2_t zip_final = vzip_u32 (lo0123 , lo4567 );
500+
501+ /* Store the two doubles */
502+ vst1_u8 (dst + i * 8 , vreinterpret_u8_u32 (zip_final .val [0 ]));
503+ vst1_u8 (dst + i * 8 + 8 , vreinterpret_u8_u32 (zip_final .val [1 ]));
418504 }
419505
420506 /* Handle remaining values */
@@ -1144,5 +1230,130 @@ size_t carquet_neon_match_length(const uint8_t* p, const uint8_t* match, const u
11441230 return (size_t )(p - start );
11451231}
11461232
1233+ /* ============================================================================
1234+ * Definition Level Processing - NEON Optimized
1235+ * ============================================================================
1236+ */
1237+
1238+ /**
1239+ * Count non-null values using NEON.
1240+ * Counts how many def_levels[i] == max_def_level.
1241+ */
1242+ int64_t carquet_neon_count_non_nulls (const int16_t * def_levels , int64_t count , int16_t max_def_level ) {
1243+ int64_t non_null_count = 0 ;
1244+ int64_t i = 0 ;
1245+
1246+ int16x8_t max_vec = vdupq_n_s16 (max_def_level );
1247+
1248+ /* Process 8 int16_t values at a time */
1249+ for (; i + 8 <= count ; i += 8 ) {
1250+ int16x8_t levels = vld1q_s16 (def_levels + i );
1251+ uint16x8_t cmp = vceqq_s16 (levels , max_vec );
1252+
1253+ /* Narrow to 8-bit: 0xFFFF -> 0xFF, 0x0000 -> 0x00 */
1254+ uint8x8_t narrow = vmovn_u16 (cmp );
1255+
1256+ /* AND with 1 to get 0 or 1 per lane */
1257+ uint8x8_t ones = vand_u8 (narrow , vdup_n_u8 (1 ));
1258+
1259+ /* Horizontal add all 8 values */
1260+ uint16x4_t sum16 = vpaddl_u8 (ones );
1261+ uint32x2_t sum32 = vpaddl_u16 (sum16 );
1262+ uint64x1_t sum64 = vpaddl_u32 (sum32 );
1263+
1264+ non_null_count += vget_lane_u64 (sum64 , 0 );
1265+ }
1266+
1267+ /* Handle remaining */
1268+ for (; i < count ; i ++ ) {
1269+ if (def_levels [i ] == max_def_level ) {
1270+ non_null_count ++ ;
1271+ }
1272+ }
1273+
1274+ return non_null_count ;
1275+ }
1276+
1277+ /**
1278+ * Build null bitmap from definition levels using NEON.
1279+ * Sets bit to 1 if def_levels[i] < max_def_level (null).
1280+ */
1281+ void carquet_neon_build_null_bitmap (const int16_t * def_levels , int64_t count ,
1282+ int16_t max_def_level , uint8_t * null_bitmap ) {
1283+ int64_t i = 0 ;
1284+
1285+ int16x8_t max_vec = vdupq_n_s16 (max_def_level );
1286+
1287+ /* Process 8 int16_t values -> 1 byte of bitmap */
1288+ int64_t full_bytes = count / 8 ;
1289+ for (int64_t b = 0 ; b < full_bytes ; b ++ ) {
1290+ int16x8_t levels = vld1q_s16 (def_levels + b * 8 );
1291+
1292+ /* levels < max_def means null */
1293+ uint16x8_t cmp = vcltq_s16 (levels , max_vec );
1294+
1295+ /* Extract one bit per lane to form a byte
1296+ * cmp has 0xFFFF for null, 0x0000 for non-null
1297+ * We need bit 0 from lane 0, bit 1 from lane 1, etc.
1298+ */
1299+
1300+ /* Narrow to 8-bit: 0xFFFF -> 0xFF, 0x0000 -> 0x00 */
1301+ uint8x8_t narrow = vmovn_u16 (cmp );
1302+
1303+ /* Use bit extraction pattern:
1304+ * Multiply each lane by its bit position weight and sum */
1305+ static const uint8_t bit_weights [8 ] = {1 , 2 , 4 , 8 , 16 , 32 , 64 , 128 };
1306+ uint8x8_t weights = vld1_u8 (bit_weights );
1307+
1308+ /* AND with weights (0xFF & weight = weight, 0x00 & weight = 0) */
1309+ uint8x8_t weighted = vand_u8 (narrow , weights );
1310+
1311+ /* Horizontal add to get final byte */
1312+ uint16x4_t sum16 = vpaddl_u8 (weighted );
1313+ uint32x2_t sum32 = vpaddl_u16 (sum16 );
1314+ uint64x1_t sum64 = vpaddl_u32 (sum32 );
1315+
1316+ null_bitmap [b ] = (uint8_t )vget_lane_u64 (sum64 , 0 );
1317+ i += 8 ;
1318+ }
1319+
1320+ /* Handle remaining bits */
1321+ if (i < count ) {
1322+ uint8_t null_bits = 0 ;
1323+ for (int64_t j = 0 ; i + j < count && j < 8 ; j ++ ) {
1324+ if (def_levels [i + j ] < max_def_level ) {
1325+ null_bits |= (1 << j );
1326+ }
1327+ }
1328+ null_bitmap [full_bytes ] = null_bits ;
1329+ }
1330+ }
1331+
1332+ /**
1333+ * Fill definition levels with a constant value using NEON.
1334+ */
1335+ void carquet_neon_fill_def_levels (int16_t * def_levels , int64_t count , int16_t value ) {
1336+ int64_t i = 0 ;
1337+ int16x8_t val_vec = vdupq_n_s16 (value );
1338+
1339+ /* Process 32 int16_t values at a time (unrolled) */
1340+ for (; i + 32 <= count ; i += 32 ) {
1341+ vst1q_s16 (def_levels + i , val_vec );
1342+ vst1q_s16 (def_levels + i + 8 , val_vec );
1343+ vst1q_s16 (def_levels + i + 16 , val_vec );
1344+ vst1q_s16 (def_levels + i + 24 , val_vec );
1345+ }
1346+
1347+ /* Process 8 int16_t values at a time */
1348+ for (; i + 8 <= count ; i += 8 ) {
1349+ vst1q_s16 (def_levels + i , val_vec );
1350+ }
1351+
1352+ /* Handle remaining */
1353+ for (; i < count ; i ++ ) {
1354+ def_levels [i ] = value ;
1355+ }
1356+ }
1357+
11471358#endif /* __ARM_NEON */
11481359#endif /* ARM */
0 commit comments