@@ -336,6 +336,77 @@ static void DecodeLevels(Encoding::type level_encoding, int16_t max_level, int n
336336 state.SetItemsProcessed (state.iterations () * num_levels);
337337}
338338
339+ // Benchmark that simulates the old approach: Decode + separate count
340+ static void DecodeLevelsAndCountSeparate (Encoding::type level_encoding, int16_t max_level,
341+ int num_levels, int batch_size,
342+ int level_repeat_count,
343+ ::benchmark::State& state) {
344+ std::vector<uint8_t > bytes;
345+ {
346+ std::vector<int16_t > input_levels;
347+ GenerateLevels (/* level_repeats=*/ level_repeat_count, /* max_repeat_factor=*/ max_level,
348+ num_levels, &input_levels);
349+ EncodeLevels (level_encoding, max_level, num_levels, input_levels.data (), &bytes);
350+ }
351+
352+ LevelDecoder decoder;
353+ std::vector<int16_t > output_levels (batch_size);
354+ for (auto _ : state) {
355+ state.PauseTiming ();
356+ decoder.SetData (level_encoding, max_level, num_levels, bytes.data (),
357+ static_cast <int >(bytes.size ()));
358+ int64_t total_count = 0 ;
359+ state.ResumeTiming ();
360+ // Decode + count separately (old approach)
361+ while (true ) {
362+ int levels_decoded = decoder.Decode (batch_size, output_levels.data ());
363+ if (levels_decoded == 0 ) {
364+ break ;
365+ }
366+ // Separate count pass (simulating the old approach)
367+ total_count +=
368+ std::count (output_levels.data (), output_levels.data () + levels_decoded, max_level);
369+ }
370+ DoNotOptimize (total_count);
371+ }
372+ state.SetBytesProcessed (state.iterations () * num_levels * sizeof (int16_t ));
373+ state.SetItemsProcessed (state.iterations () * num_levels);
374+ }
375+
376+ // Benchmark that uses the new fused DecodeAndCount approach
377+ static void DecodeLevelsAndCountFused (Encoding::type level_encoding, int16_t max_level,
378+ int num_levels, int batch_size,
379+ int level_repeat_count, ::benchmark::State& state) {
380+ std::vector<uint8_t > bytes;
381+ {
382+ std::vector<int16_t > input_levels;
383+ GenerateLevels (/* level_repeats=*/ level_repeat_count, /* max_repeat_factor=*/ max_level,
384+ num_levels, &input_levels);
385+ EncodeLevels (level_encoding, max_level, num_levels, input_levels.data (), &bytes);
386+ }
387+
388+ LevelDecoder decoder;
389+ std::vector<int16_t > output_levels (batch_size);
390+ for (auto _ : state) {
391+ state.PauseTiming ();
392+ decoder.SetData (level_encoding, max_level, num_levels, bytes.data (),
393+ static_cast <int >(bytes.size ()));
394+ int64_t total_count = 0 ;
395+ state.ResumeTiming ();
396+ // Fused decode + count (new approach)
397+ while (true ) {
398+ int levels_decoded =
399+ decoder.DecodeAndCount (batch_size, output_levels.data (), &total_count);
400+ if (levels_decoded == 0 ) {
401+ break ;
402+ }
403+ }
404+ DoNotOptimize (total_count);
405+ }
406+ state.SetBytesProcessed (state.iterations () * num_levels * sizeof (int16_t ));
407+ state.SetItemsProcessed (state.iterations () * num_levels);
408+ }
409+
339410static void ReadLevels_Rle (::benchmark::State& state) {
340411 int16_t max_level = static_cast <int16_t >(state.range (0 ));
341412 int num_levels = static_cast <int >(state.range (1 ));
@@ -354,6 +425,26 @@ static void ReadLevels_BitPack(::benchmark::State& state) {
354425 level_repeat_count, state);
355426}
356427
428+ // Benchmark: Decode + Count separately (old approach)
429+ static void ReadLevels_RleCountSeparate (::benchmark::State& state) {
430+ int16_t max_level = static_cast <int16_t >(state.range (0 ));
431+ int num_levels = static_cast <int >(state.range (1 ));
432+ int batch_size = static_cast <int >(state.range (2 ));
433+ int level_repeat_count = static_cast <int >(state.range (3 ));
434+ DecodeLevelsAndCountSeparate (Encoding::RLE, max_level, num_levels, batch_size,
435+ level_repeat_count, state);
436+ }
437+
438+ // Benchmark: Fused DecodeAndCount (new approach)
439+ static void ReadLevels_RleCountFused (::benchmark::State& state) {
440+ int16_t max_level = static_cast <int16_t >(state.range (0 ));
441+ int num_levels = static_cast <int >(state.range (1 ));
442+ int batch_size = static_cast <int >(state.range (2 ));
443+ int level_repeat_count = static_cast <int >(state.range (3 ));
444+ DecodeLevelsAndCountFused (Encoding::RLE, max_level, num_levels, batch_size,
445+ level_repeat_count, state);
446+ }
447+
357448static void ReadLevelsArguments (::benchmark::internal::Benchmark* b) {
358449 b->ArgNames ({" MaxLevel" , " NumLevels" , " BatchSize" , " LevelRepeatCount" })
359450 ->Args ({1 , 8096 , 1024 , 1 })
@@ -367,6 +458,8 @@ static void ReadLevelsArguments(::benchmark::internal::Benchmark* b) {
367458
368459BENCHMARK (ReadLevels_Rle)->Apply (ReadLevelsArguments);
369460BENCHMARK (ReadLevels_BitPack)->Apply (ReadLevelsArguments);
// Register the separate-count and fused-count RLE benchmarks. Both apply
// ReadLevelsArguments, so they run over the same argument sets and their
// results can be compared side by side.
461+ BENCHMARK (ReadLevels_RleCountSeparate)->Apply (ReadLevelsArguments);
462+ BENCHMARK (ReadLevels_RleCountFused)->Apply (ReadLevelsArguments);
370463
371464} // namespace benchmarks
372465} // namespace parquet
0 commit comments