@@ -429,6 +429,7 @@ void MeanAggregatorData::aggregate(
429
429
const ColumnWithStrings& input_column, const std::vector<size_t >& groups, size_t unique_values
430
430
) {
431
431
fractions_.resize (unique_values);
432
+ sparse_map_.resize (unique_values);
432
433
details::visit_type (input_column.column_ ->type ().data_type (), [&input_column, &groups, this ](auto col_tag) {
433
434
using col_type_info = ScalarTypeInfo<decltype (col_tag)>;
434
435
if constexpr (is_sequence_type (col_type_info::data_type)) {
@@ -444,10 +445,12 @@ void MeanAggregatorData::aggregate(
444
445
if (ARCTICDB_LIKELY (!std::isnan (enumerating_it.value ()))) {
445
446
fraction.numerator_ += static_cast <double >(enumerating_it.value ());
446
447
++fraction.denominator_ ;
448
+ sparse_map_.set (groups[enumerating_it.idx ()]);
447
449
}
448
450
} else {
449
451
fraction.numerator_ += static_cast <double >(enumerating_it.value ());
450
452
++fraction.denominator_ ;
453
+ sparse_map_.set (groups[enumerating_it.idx ()]);
451
454
}
452
455
}
453
456
);
@@ -458,34 +461,25 @@ SegmentInMemory MeanAggregatorData::finalize(const ColumnName& output_column_nam
458
461
SegmentInMemory res;
459
462
if (!fractions_.empty ()) {
460
463
fractions_.resize (unique_values);
461
- auto col = std::make_shared<Column>(
462
- make_scalar_type (get_output_data_type ()),
463
- fractions_.size (),
464
- AllocationType::PRESIZED,
465
- Sparsity::NOT_PERMITTED
466
- );
467
- auto column_data = col->data ();
468
- // TODO: Empty type needs more though. Maybe we should emit a column of empty value and leave it to the
469
- // NullValueReducer to handle it. As of this PR (04.07.2025) the empty type is feature flagged and not used so
470
- // we don't worry too much about optimizing it.
464
+ sparse_map_.resize (unique_values);
465
+ auto col =
466
+ create_output_column (make_scalar_type (get_output_data_type ()), std::move (sparse_map_), unique_values);
467
+ // TODO: Empty type needs more thought. Currently we emit a fully sparse column which will be populated by
468
+ // `copy_frame_data_to_buffer` but this might not be the right approach. As of this PR (11.09.2025) the empty
469
+ // type is feature flagged and not used so we don't worry too much about optimizing it.
471
470
if (data_type_ && *data_type_ == DataType::EMPTYVAL) [[unlikely]] {
472
- std::fill_n (column_data.begin <ScalarTagType<DataTypeTag<DataType::FLOAT64>>>(), fractions_.size (), 0 .f );
471
+ auto empty_bitset = util::BitSet (unique_values);
472
+ col->set_sparse_map (std::move (empty_bitset));
473
473
} else {
474
474
details::visit_type (col->type ().data_type (), [&, this ]<typename TypeTag>(TypeTag) {
475
475
using OutputDataTypeTag =
476
476
std::conditional_t <is_time_type (TypeTag::data_type), TypeTag, DataTypeTag<DataType::FLOAT64>>;
477
477
using OutputTypeDescriptor = typename ScalarTypeInfo<OutputDataTypeTag>::TDT;
478
- std::transform (
479
- fractions_.cbegin (),
480
- fractions_.cend (),
481
- column_data.begin <OutputTypeDescriptor>(),
482
- [](const auto & fraction) {
483
- return static_cast <typename OutputDataTypeTag::raw_type>(fraction.to_double ());
484
- }
485
- );
478
+ Column::for_each_enumerated<OutputTypeDescriptor>(*col, [&](auto row) {
479
+ row.value () = static_cast <typename OutputDataTypeTag::raw_type>(fractions_[row.idx ()].to_double ());
480
+ });
486
481
});
487
482
}
488
- col->set_row_data (fractions_.size () - 1 );
489
483
res.add_column (scalar_field (get_output_data_type (), output_column_name.value ), std::move (col));
490
484
}
491
485
return res;
@@ -505,6 +499,7 @@ void CountAggregatorData::aggregate(
505
499
const ColumnWithStrings& input_column, const std::vector<size_t >& groups, size_t unique_values
506
500
) {
507
501
aggregated_.resize (unique_values);
502
+ sparse_map_.resize (unique_values);
508
503
details::visit_type (input_column.column_ ->type ().data_type (), [&input_column, &groups, this ](auto col_tag) {
509
504
using col_type_info = ScalarTypeInfo<decltype (col_tag)>;
510
505
Column::for_each_enumerated<typename col_type_info::TDT>(
@@ -514,10 +509,12 @@ void CountAggregatorData::aggregate(
514
509
if (ARCTICDB_LIKELY (!std::isnan (enumerating_it.value ()))) {
515
510
auto & val = aggregated_[groups[enumerating_it.idx ()]];
516
511
++val;
512
+ sparse_map_.set (groups[enumerating_it.idx ()]);
517
513
}
518
514
} else {
519
515
auto & val = aggregated_[groups[enumerating_it.idx ()]];
520
516
++val;
517
+ sparse_map_.set (groups[enumerating_it.idx ()]);
521
518
}
522
519
}
523
520
);
@@ -528,13 +525,20 @@ SegmentInMemory CountAggregatorData::finalize(const ColumnName& output_column_na
528
525
SegmentInMemory res;
529
526
if (!aggregated_.empty ()) {
530
527
aggregated_.resize (unique_values);
531
- auto pos = res.add_column (
532
- scalar_field (DataType::UINT64, output_column_name.value ), unique_values, AllocationType::PRESIZED
533
- );
534
- auto & column = res.column (pos);
535
- auto ptr = reinterpret_cast <uint64_t *>(column.ptr ());
536
- column.set_row_data (unique_values - 1 );
537
- memcpy (ptr, aggregated_.data (), sizeof (uint64_t ) * unique_values);
528
+ sparse_map_.resize (unique_values);
529
+ auto col =
530
+ create_output_column (make_scalar_type (get_output_data_type ()), std::move (sparse_map_), unique_values);
531
+ if (!col->opt_sparse_map ().has_value ()) {
532
+ // If all values are set we use memcpy for efficiency
533
+ auto ptr = reinterpret_cast <uint64_t *>(col->ptr ());
534
+ memcpy (ptr, aggregated_.data (), sizeof (uint64_t ) * unique_values);
535
+ } else {
536
+ using OutputTypeDescriptor = typename ScalarTypeInfo<DataTypeTag<DataType::UINT64>>::TDT;
537
+ Column::for_each_enumerated<OutputTypeDescriptor>(*col, [&](auto row) {
538
+ row.value () = aggregated_[row.idx ()];
539
+ });
540
+ }
541
+ res.add_column (scalar_field (get_output_data_type (), output_column_name.value ), std::move (col));
538
542
}
539
543
return res;
540
544
}
@@ -556,6 +560,7 @@ void FirstAggregatorData::aggregate(
556
560
using GlobalTypeDescriptorTag = typename OutputType<GlobalInputType>::type;
557
561
using GlobalRawType = typename GlobalTypeDescriptorTag::DataTypeTag::raw_type;
558
562
aggregated_.resize (sizeof (GlobalRawType) * unique_values);
563
+ sparse_map_.resize (unique_values);
559
564
auto col_data = input_column.column_ ->data ();
560
565
auto out_ptr = reinterpret_cast <GlobalRawType*>(aggregated_.data ());
561
566
details::visit_type (
@@ -575,11 +580,13 @@ void FirstAggregatorData::aggregate(
575
580
if (is_first_group_el || std::isnan (static_cast <ColumnType>(val))) {
576
581
groups_cache_.insert (groups[groups_pos]);
577
582
val = GlobalRawType (*ptr);
583
+ sparse_map_.set (groups[groups_pos]);
578
584
}
579
585
} else {
580
586
if (is_first_group_el) {
581
587
groups_cache_.insert (groups[groups_pos]);
582
588
val = GlobalRawType (*ptr);
589
+ sparse_map_.set (groups[groups_pos]);
583
590
}
584
591
}
585
592
}
@@ -594,17 +601,23 @@ SegmentInMemory FirstAggregatorData::finalize(const ColumnName& output_column_na
594
601
SegmentInMemory res;
595
602
if (!aggregated_.empty ()) {
596
603
details::visit_type (*data_type_, [this , &res, &output_column_name, unique_values](auto col_tag) {
597
- using RawType = typename decltype (col_tag)::DataTypeTag::raw_type;
604
+ using col_type_info = ScalarTypeInfo<decltype (col_tag)>;
605
+ using RawType = typename col_type_info::RawType;
598
606
aggregated_.resize (sizeof (RawType) * unique_values);
599
- auto col = std::make_shared<Column>(
600
- make_scalar_type (data_type_.value ()),
601
- unique_values,
602
- AllocationType::PRESIZED,
603
- Sparsity::NOT_PERMITTED
604
- );
605
- memcpy (col->ptr (), aggregated_.data (), aggregated_.size ());
607
+ sparse_map_.resize (unique_values);
608
+ auto col =
609
+ create_output_column (make_scalar_type (data_type_.value ()), std::move (sparse_map_), unique_values);
610
+ if (!col->opt_sparse_map ().has_value ()) {
611
+ memcpy (col->ptr (), aggregated_.data (), aggregated_.size ());
612
+ } else {
613
+ const std::span<const RawType> group_values{
614
+ reinterpret_cast <const RawType*>(aggregated_.data ()), aggregated_.size () / sizeof (RawType)
615
+ };
616
+ Column::for_each_enumerated<typename col_type_info::TDT>(*col, [&](auto row) {
617
+ row.value () = group_values[row.idx ()];
618
+ });
619
+ }
606
620
res.add_column (scalar_field (data_type_.value (), output_column_name.value ), col);
607
- col->set_row_data (unique_values - 1 );
608
621
});
609
622
}
610
623
return res;
@@ -627,6 +640,7 @@ void LastAggregatorData::aggregate(
627
640
using GlobalTypeDescriptorTag = typename OutputType<GlobalInputType>::type;
628
641
using GlobalRawType = typename GlobalTypeDescriptorTag::DataTypeTag::raw_type;
629
642
aggregated_.resize (sizeof (GlobalRawType) * unique_values);
643
+ sparse_map_.resize (unique_values);
630
644
auto col_data = input_column.column_ ->data ();
631
645
auto out_ptr = reinterpret_cast <GlobalRawType*>(aggregated_.data ());
632
646
details::visit_type (
@@ -648,9 +662,11 @@ void LastAggregatorData::aggregate(
648
662
if (is_first_group_el || !std::isnan (static_cast <ColumnType>(curr))) {
649
663
groups_cache_.insert (groups[groups_pos]);
650
664
val = curr;
665
+ sparse_map_.set (groups[groups_pos]);
651
666
}
652
667
} else {
653
668
val = GlobalRawType (*ptr);
669
+ sparse_map_.set (groups[groups_pos]);
654
670
}
655
671
}
656
672
}
@@ -663,18 +679,24 @@ void LastAggregatorData::aggregate(
663
679
SegmentInMemory LastAggregatorData::finalize (const ColumnName& output_column_name, bool , size_t unique_values) {
664
680
SegmentInMemory res;
665
681
if (!aggregated_.empty ()) {
666
- details::visit_type (*data_type_, [that = this , &res, &output_column_name, unique_values](auto col_tag) {
667
- using RawType = typename decltype (col_tag)::DataTypeTag::raw_type;
668
- that->aggregated_ .resize (sizeof (RawType) * unique_values);
669
- auto col = std::make_shared<Column>(
670
- make_scalar_type (that->data_type_ .value ()),
671
- unique_values,
672
- AllocationType::PRESIZED,
673
- Sparsity::NOT_PERMITTED
674
- );
675
- memcpy (col->ptr (), that->aggregated_ .data (), that->aggregated_ .size ());
676
- res.add_column (scalar_field (that->data_type_ .value (), output_column_name.value ), col);
677
- col->set_row_data (unique_values - 1 );
682
+ details::visit_type (*data_type_, [&res, &output_column_name, unique_values, this ](auto col_tag) {
683
+ using col_type_info = ScalarTypeInfo<decltype (col_tag)>;
684
+ using RawType = typename col_type_info::RawType;
685
+ aggregated_.resize (sizeof (RawType) * unique_values);
686
+ sparse_map_.resize (unique_values);
687
+ auto col =
688
+ create_output_column (make_scalar_type (data_type_.value ()), std::move (sparse_map_), unique_values);
689
+ if (!col->opt_sparse_map ().has_value ()) {
690
+ memcpy (col->ptr (), aggregated_.data (), aggregated_.size ());
691
+ } else {
692
+ const std::span<const RawType> group_values{
693
+ reinterpret_cast <const RawType*>(aggregated_.data ()), aggregated_.size () / sizeof (RawType)
694
+ };
695
+ Column::for_each_enumerated<typename col_type_info::TDT>(*col, [&](auto row) {
696
+ row.value () = group_values[row.idx ()];
697
+ });
698
+ }
699
+ res.add_column (scalar_field (data_type_.value (), output_column_name.value ), col);
678
700
});
679
701
}
680
702
return res;
0 commit comments