Skip to content

Commit ed8d652

Browse files
committed
added ARM NEON SIMD BYTE_STREAM_SPLIT optimizations
1 parent 2ac4e72 commit ed8d652

File tree

3 files changed

+238
-29
lines changed

3 files changed

+238
-29
lines changed

benchmark/results.csv

Lines changed: 9 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,10 @@
11
library,dataset,compression,rows,write_ms,read_ms,file_bytes
2-
carquet,small,none,100000,1.94,0.63,2000366
3-
carquet,small,snappy,100000,2.75,0.78,1933903
4-
carquet,small,zstd,100000,4.82,1.13,1121063
5-
carquet,medium,none,1000000,12.20,4.03,20000380
6-
carquet,medium,snappy,1000000,22.17,4.60,19933920
7-
carquet,medium,zstd,1000000,55.22,10.62,11259205
8-
carquet,large,none,10000000,134.15,25.50,200000392
9-
carquet,large,snappy,10000000,226.14,30.36,199933935
10-
carquet,large,zstd,10000000,540.97,95.65,112635043
11-
pyarrow,small,none,100000,5.89,1.02,2109803
12-
pyarrow,small,snappy,100000,6.76,1.28,1829413
13-
pyarrow,small,zstd,100000,8.37,1.52,1574730
14-
pyarrow,medium,none,1000000,59.24,5.13,21097309
15-
pyarrow,medium,snappy,1000000,73.57,4.78,18292050
16-
pyarrow,medium,zstd,1000000,82.90,7.19,15742253
17-
pyarrow,large,none,10000000,584.07,29.01,210969394
18-
pyarrow,large,snappy,10000000,677.42,32.73,182922756
19-
pyarrow,large,zstd,10000000,799.22,51.67,157429767
2+
carquet,small,none,100000,0.89,0.60,2000366
3+
carquet,small,snappy,100000,1.92,0.72,1933903
4+
carquet,small,zstd,100000,4.54,1.15,1121063
5+
carquet,medium,none,1000000,10.33,3.61,20000380
6+
carquet,medium,snappy,1000000,20.62,4.42,19933920
7+
carquet,medium,zstd,1000000,53.29,10.48,11259205
8+
carquet,large,none,10000000,111.52,21.33,200000392
9+
carquet,large,snappy,10000000,225.04,28.74,199933935
10+
carquet,large,zstd,10000000,523.43,93.49,112635043

src/simd/arm/neon_ops.c

Lines changed: 222 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -371,6 +371,7 @@ void carquet_neon_byte_stream_split_decode_float(
371371

372372
/**
373373
* Encode doubles using byte stream split with NEON.
374+
* Uses table lookup for efficient transpose of 2 doubles (16 bytes) at a time.
374375
*/
375376
void carquet_neon_byte_stream_split_encode_double(
376377
const double* values,
@@ -380,13 +381,52 @@ void carquet_neon_byte_stream_split_encode_double(
380381
const uint8_t* src = (const uint8_t*)values;
381382
int64_t i = 0;
382383

383-
/* Process 2 doubles (16 bytes) at a time */
384+
/* Process 2 doubles (16 bytes) at a time with NEON table lookup */
384385
for (; i + 2 <= count; i += 2) {
385-
/* Transpose: extract each byte position from both doubles */
386-
for (int b = 0; b < 8; b++) {
387-
output[b * count + i + 0] = src[i * 8 + b];
388-
output[b * count + i + 1] = src[i * 8 + 8 + b];
389-
}
386+
/* Load 2 doubles = 16 bytes */
387+
uint8x16_t v = vld1q_u8(src + i * 8);
388+
389+
/* v = [a0 a1 a2 a3 a4 a5 a6 a7 | b0 b1 b2 b3 b4 b5 b6 b7]
390+
* Want streams: [a0 b0], [a1 b1], [a2 b2], ... [a7 b7]
391+
*/
392+
393+
/* Table indices to extract byte pairs */
394+
static const uint8_t tbl_byte0[16] = {0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
395+
static const uint8_t tbl_byte1[16] = {1, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
396+
static const uint8_t tbl_byte2[16] = {2, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
397+
static const uint8_t tbl_byte3[16] = {3, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
398+
static const uint8_t tbl_byte4[16] = {4, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
399+
static const uint8_t tbl_byte5[16] = {5, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
400+
static const uint8_t tbl_byte6[16] = {6, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
401+
static const uint8_t tbl_byte7[16] = {7, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
402+
403+
uint8x16_t out0 = vqtbl1q_u8(v, vld1q_u8(tbl_byte0));
404+
uint8x16_t out1 = vqtbl1q_u8(v, vld1q_u8(tbl_byte1));
405+
uint8x16_t out2 = vqtbl1q_u8(v, vld1q_u8(tbl_byte2));
406+
uint8x16_t out3 = vqtbl1q_u8(v, vld1q_u8(tbl_byte3));
407+
uint8x16_t out4 = vqtbl1q_u8(v, vld1q_u8(tbl_byte4));
408+
uint8x16_t out5 = vqtbl1q_u8(v, vld1q_u8(tbl_byte5));
409+
uint8x16_t out6 = vqtbl1q_u8(v, vld1q_u8(tbl_byte6));
410+
uint8x16_t out7 = vqtbl1q_u8(v, vld1q_u8(tbl_byte7));
411+
412+
/* Extract first 2 bytes and store to each stream */
413+
uint16_t t0 = vgetq_lane_u16(vreinterpretq_u16_u8(out0), 0);
414+
uint16_t t1 = vgetq_lane_u16(vreinterpretq_u16_u8(out1), 0);
415+
uint16_t t2 = vgetq_lane_u16(vreinterpretq_u16_u8(out2), 0);
416+
uint16_t t3 = vgetq_lane_u16(vreinterpretq_u16_u8(out3), 0);
417+
uint16_t t4 = vgetq_lane_u16(vreinterpretq_u16_u8(out4), 0);
418+
uint16_t t5 = vgetq_lane_u16(vreinterpretq_u16_u8(out5), 0);
419+
uint16_t t6 = vgetq_lane_u16(vreinterpretq_u16_u8(out6), 0);
420+
uint16_t t7 = vgetq_lane_u16(vreinterpretq_u16_u8(out7), 0);
421+
422+
memcpy(output + 0 * count + i, &t0, sizeof(uint16_t));
423+
memcpy(output + 1 * count + i, &t1, sizeof(uint16_t));
424+
memcpy(output + 2 * count + i, &t2, sizeof(uint16_t));
425+
memcpy(output + 3 * count + i, &t3, sizeof(uint16_t));
426+
memcpy(output + 4 * count + i, &t4, sizeof(uint16_t));
427+
memcpy(output + 5 * count + i, &t5, sizeof(uint16_t));
428+
memcpy(output + 6 * count + i, &t6, sizeof(uint16_t));
429+
memcpy(output + 7 * count + i, &t7, sizeof(uint16_t));
390430
}
391431

392432
/* Handle remaining values */
@@ -399,6 +439,7 @@ void carquet_neon_byte_stream_split_encode_double(
399439

400440
/**
401441
* Decode byte stream split doubles using NEON.
442+
* Gathers bytes from 8 streams and interleaves them back into doubles.
402443
*/
403444
void carquet_neon_byte_stream_split_decode_double(
404445
const uint8_t* data,
@@ -410,11 +451,56 @@ void carquet_neon_byte_stream_split_decode_double(
410451

411452
/* Process 2 doubles at a time */
412453
for (; i + 2 <= count; i += 2) {
413-
/* Gather bytes from 8 streams */
414-
for (int b = 0; b < 8; b++) {
415-
dst[i * 8 + b] = data[b * count + i];
416-
dst[i * 8 + 8 + b] = data[b * count + i + 1];
417-
}
454+
/* Load 2 bytes from each of the 8 streams */
455+
uint16_t b0, b1, b2, b3, b4, b5, b6, b7;
456+
memcpy(&b0, data + 0 * count + i, sizeof(uint16_t));
457+
memcpy(&b1, data + 1 * count + i, sizeof(uint16_t));
458+
memcpy(&b2, data + 2 * count + i, sizeof(uint16_t));
459+
memcpy(&b3, data + 3 * count + i, sizeof(uint16_t));
460+
memcpy(&b4, data + 4 * count + i, sizeof(uint16_t));
461+
memcpy(&b5, data + 5 * count + i, sizeof(uint16_t));
462+
memcpy(&b6, data + 6 * count + i, sizeof(uint16_t));
463+
memcpy(&b7, data + 7 * count + i, sizeof(uint16_t));
464+
465+
/* Create vectors with each byte pair */
466+
uint8x8_t bytes0 = vreinterpret_u8_u16(vdup_n_u16(b0));
467+
uint8x8_t bytes1 = vreinterpret_u8_u16(vdup_n_u16(b1));
468+
uint8x8_t bytes2 = vreinterpret_u8_u16(vdup_n_u16(b2));
469+
uint8x8_t bytes3 = vreinterpret_u8_u16(vdup_n_u16(b3));
470+
uint8x8_t bytes4 = vreinterpret_u8_u16(vdup_n_u16(b4));
471+
uint8x8_t bytes5 = vreinterpret_u8_u16(vdup_n_u16(b5));
472+
uint8x8_t bytes6 = vreinterpret_u8_u16(vdup_n_u16(b6));
473+
uint8x8_t bytes7 = vreinterpret_u8_u16(vdup_n_u16(b7));
474+
475+
/* Interleave bytes to reconstruct doubles:
476+
* Input: b0=[a0,b0], b1=[a1,b1], ..., b7=[a7,b7]
477+
* Output: [a0,a1,a2,a3,a4,a5,a6,a7, b0,b1,b2,b3,b4,b5,b6,b7]
478+
*/
479+
480+
/* Interleave pairs of byte streams */
481+
uint8x8x2_t zip01 = vzip_u8(bytes0, bytes1); /* [a0,a1,b0,b1,...] */
482+
uint8x8x2_t zip23 = vzip_u8(bytes2, bytes3);
483+
uint8x8x2_t zip45 = vzip_u8(bytes4, bytes5);
484+
uint8x8x2_t zip67 = vzip_u8(bytes6, bytes7);
485+
486+
/* Now interleave 16-bit pairs */
487+
uint16x4_t lo01 = vreinterpret_u16_u8(zip01.val[0]);
488+
uint16x4_t lo23 = vreinterpret_u16_u8(zip23.val[0]);
489+
uint16x4_t lo45 = vreinterpret_u16_u8(zip45.val[0]);
490+
uint16x4_t lo67 = vreinterpret_u16_u8(zip67.val[0]);
491+
492+
uint16x4x2_t zip0123 = vzip_u16(lo01, lo23);
493+
uint16x4x2_t zip4567 = vzip_u16(lo45, lo67);
494+
495+
/* Interleave 32-bit pairs */
496+
uint32x2_t lo0123 = vreinterpret_u32_u16(zip0123.val[0]);
497+
uint32x2_t lo4567 = vreinterpret_u32_u16(zip4567.val[0]);
498+
499+
uint32x2x2_t zip_final = vzip_u32(lo0123, lo4567);
500+
501+
/* Store the two doubles */
502+
vst1_u8(dst + i * 8, vreinterpret_u8_u32(zip_final.val[0]));
503+
vst1_u8(dst + i * 8 + 8, vreinterpret_u8_u32(zip_final.val[1]));
418504
}
419505

420506
/* Handle remaining values */
@@ -1144,5 +1230,130 @@ size_t carquet_neon_match_length(const uint8_t* p, const uint8_t* match, const u
11441230
return (size_t)(p - start);
11451231
}
11461232

1233+
/* ============================================================================
1234+
* Definition Level Processing - NEON Optimized
1235+
* ============================================================================
1236+
*/
1237+
1238+
/**
1239+
* Count non-null values using NEON.
1240+
* Counts how many def_levels[i] == max_def_level.
1241+
*/
1242+
int64_t carquet_neon_count_non_nulls(const int16_t* def_levels, int64_t count, int16_t max_def_level) {
1243+
int64_t non_null_count = 0;
1244+
int64_t i = 0;
1245+
1246+
int16x8_t max_vec = vdupq_n_s16(max_def_level);
1247+
1248+
/* Process 8 int16_t values at a time */
1249+
for (; i + 8 <= count; i += 8) {
1250+
int16x8_t levels = vld1q_s16(def_levels + i);
1251+
uint16x8_t cmp = vceqq_s16(levels, max_vec);
1252+
1253+
/* Narrow to 8-bit: 0xFFFF -> 0xFF, 0x0000 -> 0x00 */
1254+
uint8x8_t narrow = vmovn_u16(cmp);
1255+
1256+
/* AND with 1 to get 0 or 1 per lane */
1257+
uint8x8_t ones = vand_u8(narrow, vdup_n_u8(1));
1258+
1259+
/* Horizontal add all 8 values */
1260+
uint16x4_t sum16 = vpaddl_u8(ones);
1261+
uint32x2_t sum32 = vpaddl_u16(sum16);
1262+
uint64x1_t sum64 = vpaddl_u32(sum32);
1263+
1264+
non_null_count += vget_lane_u64(sum64, 0);
1265+
}
1266+
1267+
/* Handle remaining */
1268+
for (; i < count; i++) {
1269+
if (def_levels[i] == max_def_level) {
1270+
non_null_count++;
1271+
}
1272+
}
1273+
1274+
return non_null_count;
1275+
}
1276+
1277+
/**
1278+
* Build null bitmap from definition levels using NEON.
1279+
* Sets bit to 1 if def_levels[i] < max_def_level (null).
1280+
*/
1281+
void carquet_neon_build_null_bitmap(const int16_t* def_levels, int64_t count,
1282+
int16_t max_def_level, uint8_t* null_bitmap) {
1283+
int64_t i = 0;
1284+
1285+
int16x8_t max_vec = vdupq_n_s16(max_def_level);
1286+
1287+
/* Process 8 int16_t values -> 1 byte of bitmap */
1288+
int64_t full_bytes = count / 8;
1289+
for (int64_t b = 0; b < full_bytes; b++) {
1290+
int16x8_t levels = vld1q_s16(def_levels + b * 8);
1291+
1292+
/* levels < max_def means null */
1293+
uint16x8_t cmp = vcltq_s16(levels, max_vec);
1294+
1295+
/* Extract one bit per lane to form a byte
1296+
* cmp has 0xFFFF for null, 0x0000 for non-null
1297+
* We need bit 0 from lane 0, bit 1 from lane 1, etc.
1298+
*/
1299+
1300+
/* Narrow to 8-bit: 0xFFFF -> 0xFF, 0x0000 -> 0x00 */
1301+
uint8x8_t narrow = vmovn_u16(cmp);
1302+
1303+
/* Use bit extraction pattern:
1304+
* Multiply each lane by its bit position weight and sum */
1305+
static const uint8_t bit_weights[8] = {1, 2, 4, 8, 16, 32, 64, 128};
1306+
uint8x8_t weights = vld1_u8(bit_weights);
1307+
1308+
/* AND with weights (0xFF & weight = weight, 0x00 & weight = 0) */
1309+
uint8x8_t weighted = vand_u8(narrow, weights);
1310+
1311+
/* Horizontal add to get final byte */
1312+
uint16x4_t sum16 = vpaddl_u8(weighted);
1313+
uint32x2_t sum32 = vpaddl_u16(sum16);
1314+
uint64x1_t sum64 = vpaddl_u32(sum32);
1315+
1316+
null_bitmap[b] = (uint8_t)vget_lane_u64(sum64, 0);
1317+
i += 8;
1318+
}
1319+
1320+
/* Handle remaining bits */
1321+
if (i < count) {
1322+
uint8_t null_bits = 0;
1323+
for (int64_t j = 0; i + j < count && j < 8; j++) {
1324+
if (def_levels[i + j] < max_def_level) {
1325+
null_bits |= (1 << j);
1326+
}
1327+
}
1328+
null_bitmap[full_bytes] = null_bits;
1329+
}
1330+
}
1331+
1332+
/**
1333+
* Fill definition levels with a constant value using NEON.
1334+
*/
1335+
void carquet_neon_fill_def_levels(int16_t* def_levels, int64_t count, int16_t value) {
1336+
int64_t i = 0;
1337+
int16x8_t val_vec = vdupq_n_s16(value);
1338+
1339+
/* Process 32 int16_t values at a time (unrolled) */
1340+
for (; i + 32 <= count; i += 32) {
1341+
vst1q_s16(def_levels + i, val_vec);
1342+
vst1q_s16(def_levels + i + 8, val_vec);
1343+
vst1q_s16(def_levels + i + 16, val_vec);
1344+
vst1q_s16(def_levels + i + 24, val_vec);
1345+
}
1346+
1347+
/* Process 8 int16_t values at a time */
1348+
for (; i + 8 <= count; i += 8) {
1349+
vst1q_s16(def_levels + i, val_vec);
1350+
}
1351+
1352+
/* Handle remaining */
1353+
for (; i < count; i++) {
1354+
def_levels[i] = value;
1355+
}
1356+
}
1357+
11471358
#endif /* __ARM_NEON */
11481359
#endif /* ARM */

src/simd/dispatch.c

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -397,6 +397,10 @@ extern int64_t carquet_neon_find_run_length_i32(const int32_t* values, int64_t c
397397
extern uint32_t carquet_neon_crc32c(uint32_t crc, const uint8_t* data, size_t len);
398398
extern void carquet_neon_match_copy(uint8_t* dst, const uint8_t* src, size_t len, size_t offset);
399399
extern size_t carquet_neon_match_length(const uint8_t* p, const uint8_t* match, const uint8_t* limit);
400+
extern int64_t carquet_neon_count_non_nulls(const int16_t* def_levels, int64_t count, int16_t max_def_level);
401+
extern void carquet_neon_build_null_bitmap(const int16_t* def_levels, int64_t count,
402+
int16_t max_def_level, uint8_t* null_bitmap);
403+
extern void carquet_neon_fill_def_levels(int16_t* def_levels, int64_t count, int16_t value);
400404
#endif
401405

402406
#ifdef __ARM_FEATURE_SVE
@@ -558,6 +562,9 @@ void carquet_simd_dispatch_init(void) {
558562
g_dispatch.crc32c = carquet_neon_crc32c;
559563
g_dispatch.match_copy = carquet_neon_match_copy;
560564
g_dispatch.match_length = carquet_neon_match_length;
565+
g_dispatch.count_non_nulls = carquet_neon_count_non_nulls;
566+
g_dispatch.build_null_bitmap = carquet_neon_build_null_bitmap;
567+
g_dispatch.fill_def_levels = carquet_neon_fill_def_levels;
561568
#endif
562569

563570
/* SVE overrides NEON if available (better performance on supporting hardware) */

0 commit comments

Comments
 (0)