Skip to content

Commit 2280bb9

Browse files
committed
Add benchmarks for fused DecodeAndCount optimization
Add ReadLevels_RleCountSeparate and ReadLevels_RleCountFused benchmarks to compare the old approach (Decode + std::count) vs the new fused DecodeAndCount approach. Results show ~12% speedup for RLE-heavy data (high repeat counts) where counting is O(1) for entire runs.
1 parent 0292132 commit 2280bb9

File tree

1 file changed

+93
-0
lines changed

1 file changed

+93
-0
lines changed

cpp/src/parquet/column_reader_benchmark.cc

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -336,6 +336,77 @@ static void DecodeLevels(Encoding::type level_encoding, int16_t max_level, int n
336336
state.SetItemsProcessed(state.iterations() * num_levels);
337337
}
338338

339+
// Benchmark that simulates the old approach: Decode + separate count
340+
static void DecodeLevelsAndCountSeparate(Encoding::type level_encoding, int16_t max_level,
341+
int num_levels, int batch_size,
342+
int level_repeat_count,
343+
::benchmark::State& state) {
344+
std::vector<uint8_t> bytes;
345+
{
346+
std::vector<int16_t> input_levels;
347+
GenerateLevels(/*level_repeats=*/level_repeat_count, /*max_repeat_factor=*/max_level,
348+
num_levels, &input_levels);
349+
EncodeLevels(level_encoding, max_level, num_levels, input_levels.data(), &bytes);
350+
}
351+
352+
LevelDecoder decoder;
353+
std::vector<int16_t> output_levels(batch_size);
354+
for (auto _ : state) {
355+
state.PauseTiming();
356+
decoder.SetData(level_encoding, max_level, num_levels, bytes.data(),
357+
static_cast<int>(bytes.size()));
358+
int64_t total_count = 0;
359+
state.ResumeTiming();
360+
// Decode + count separately (old approach)
361+
while (true) {
362+
int levels_decoded = decoder.Decode(batch_size, output_levels.data());
363+
if (levels_decoded == 0) {
364+
break;
365+
}
366+
// Separate count pass (simulating the old approach)
367+
total_count +=
368+
std::count(output_levels.data(), output_levels.data() + levels_decoded, max_level);
369+
}
370+
DoNotOptimize(total_count);
371+
}
372+
state.SetBytesProcessed(state.iterations() * num_levels * sizeof(int16_t));
373+
state.SetItemsProcessed(state.iterations() * num_levels);
374+
}
375+
376+
// Benchmark that uses the new fused DecodeAndCount approach
377+
static void DecodeLevelsAndCountFused(Encoding::type level_encoding, int16_t max_level,
378+
int num_levels, int batch_size,
379+
int level_repeat_count, ::benchmark::State& state) {
380+
std::vector<uint8_t> bytes;
381+
{
382+
std::vector<int16_t> input_levels;
383+
GenerateLevels(/*level_repeats=*/level_repeat_count, /*max_repeat_factor=*/max_level,
384+
num_levels, &input_levels);
385+
EncodeLevels(level_encoding, max_level, num_levels, input_levels.data(), &bytes);
386+
}
387+
388+
LevelDecoder decoder;
389+
std::vector<int16_t> output_levels(batch_size);
390+
for (auto _ : state) {
391+
state.PauseTiming();
392+
decoder.SetData(level_encoding, max_level, num_levels, bytes.data(),
393+
static_cast<int>(bytes.size()));
394+
int64_t total_count = 0;
395+
state.ResumeTiming();
396+
// Fused decode + count (new approach)
397+
while (true) {
398+
int levels_decoded =
399+
decoder.DecodeAndCount(batch_size, output_levels.data(), &total_count);
400+
if (levels_decoded == 0) {
401+
break;
402+
}
403+
}
404+
DoNotOptimize(total_count);
405+
}
406+
state.SetBytesProcessed(state.iterations() * num_levels * sizeof(int16_t));
407+
state.SetItemsProcessed(state.iterations() * num_levels);
408+
}
409+
339410
static void ReadLevels_Rle(::benchmark::State& state) {
340411
int16_t max_level = static_cast<int16_t>(state.range(0));
341412
int num_levels = static_cast<int>(state.range(1));
@@ -354,6 +425,26 @@ static void ReadLevels_BitPack(::benchmark::State& state) {
354425
level_repeat_count, state);
355426
}
356427

428+
// Benchmark: Decode + Count separately (old approach)
429+
static void ReadLevels_RleCountSeparate(::benchmark::State& state) {
430+
int16_t max_level = static_cast<int16_t>(state.range(0));
431+
int num_levels = static_cast<int>(state.range(1));
432+
int batch_size = static_cast<int>(state.range(2));
433+
int level_repeat_count = static_cast<int>(state.range(3));
434+
DecodeLevelsAndCountSeparate(Encoding::RLE, max_level, num_levels, batch_size,
435+
level_repeat_count, state);
436+
}
437+
438+
// Benchmark: Fused DecodeAndCount (new approach)
439+
static void ReadLevels_RleCountFused(::benchmark::State& state) {
440+
int16_t max_level = static_cast<int16_t>(state.range(0));
441+
int num_levels = static_cast<int>(state.range(1));
442+
int batch_size = static_cast<int>(state.range(2));
443+
int level_repeat_count = static_cast<int>(state.range(3));
444+
DecodeLevelsAndCountFused(Encoding::RLE, max_level, num_levels, batch_size,
445+
level_repeat_count, state);
446+
}
447+
357448
static void ReadLevelsArguments(::benchmark::internal::Benchmark* b) {
358449
b->ArgNames({"MaxLevel", "NumLevels", "BatchSize", "LevelRepeatCount"})
359450
->Args({1, 8096, 1024, 1})
@@ -367,6 +458,8 @@ static void ReadLevelsArguments(::benchmark::internal::Benchmark* b) {
367458

368459
BENCHMARK(ReadLevels_Rle)->Apply(ReadLevelsArguments);
369460
BENCHMARK(ReadLevels_BitPack)->Apply(ReadLevelsArguments);
461+
// Register the two-pass and fused counting benchmarks; both take their
// argument sets from ReadLevelsArguments.
BENCHMARK(ReadLevels_RleCountSeparate)->Apply(ReadLevelsArguments);
462+
BENCHMARK(ReadLevels_RleCountFused)->Apply(ReadLevelsArguments);
370463

371464
} // namespace benchmarks
372465
} // namespace parquet

0 commit comments

Comments
 (0)