@@ -426,6 +426,7 @@ DataType MeanAggregatorData::get_output_data_type() {
 
 void MeanAggregatorData::aggregate(const ColumnWithStrings& input_column, const std::vector<size_t>& groups, size_t unique_values) {
     fractions_.resize(unique_values);
+    sparse_map_.resize(unique_values);
     details::visit_type(input_column.column_->type().data_type(), [&input_column, &groups, this](auto col_tag) {
         using col_type_info = ScalarTypeInfo<decltype(col_tag)>;
         if constexpr (is_sequence_type(col_type_info::data_type)) {
@@ -439,10 +440,12 @@ void MeanAggregatorData::aggregate(const ColumnWithStrings& input_column, const
                 if (ARCTICDB_LIKELY(!std::isnan(enumerating_it.value()))) {
                     fraction.numerator_ += static_cast<double>(enumerating_it.value());
                     ++fraction.denominator_;
+                    sparse_map_.set(groups[enumerating_it.idx()]);
                 }
             } else {
                 fraction.numerator_ += static_cast<double>(enumerating_it.value());
                 ++fraction.denominator_;
+                sparse_map_.set(groups[enumerating_it.idx()]);
             }
         });
     });
@@ -452,25 +455,23 @@ SegmentInMemory MeanAggregatorData::finalize(const ColumnName& output_column_nam
     SegmentInMemory res;
     if (!fractions_.empty()) {
         fractions_.resize(unique_values);
-        auto col = std::make_shared<Column>(make_scalar_type(get_output_data_type()), fractions_.size(), AllocationType::PRESIZED, Sparsity::NOT_PERMITTED);
-        auto column_data = col->data();
-        // TODO: Empty type needs more though. Maybe we should emit a column of empty value and leave it to the
-        // NullValueReducer to handle it. As of this PR (04.07.2025) the empty type is feature flagged and not used so
-        // we don't worry too much about optimizing it.
+        sparse_map_.resize(unique_values);
+        auto col = create_output_column(make_scalar_type(get_output_data_type()), std::move(sparse_map_), unique_values);
+        // TODO: Empty type needs more thought. Currently we emit a fully sparse column which will be populated by
+        // `copy_frame_data_to_buffer` but this might not be the right approach. As of this PR (11.09.2025) the empty
+        // type is feature flagged and not used so we don't worry too much about optimizing it.
         if (data_type_ && *data_type_ == DataType::EMPTYVAL) [[unlikely]] {
-            std::fill_n(column_data.begin<ScalarTagType<DataTypeTag<DataType::FLOAT64>>>(), fractions_.size(), 0.f);
+            auto empty_bitset = util::BitSet(unique_values);
+            col->set_sparse_map(std::move(empty_bitset));
        } else {
             details::visit_type(col->type().data_type(), [&, this]<typename TypeTag>(TypeTag) {
-                using OutputDataTypeTag = std::conditional_t<is_time_type(TypeTag::data_type), TypeTag, DataTypeTag<DataType::FLOAT64>>;
-                using OutputTypeDescriptor = typename ScalarTypeInfo<OutputDataTypeTag>::TDT;
-                std::transform(fractions_.cbegin(), fractions_.cend(),
-                               column_data.begin<OutputTypeDescriptor>(),
-                               [](const auto& fraction) {
-                                   return static_cast<typename OutputDataTypeTag::raw_type>(fraction.to_double());
-                               });
-            });
+                using OutputDataTypeTag = std::conditional_t<is_time_type(TypeTag::data_type), TypeTag, DataTypeTag<DataType::FLOAT64>>;
+                using OutputTypeDescriptor = typename ScalarTypeInfo<OutputDataTypeTag>::TDT;
+                Column::for_each_enumerated<OutputTypeDescriptor>(*col, [&](auto row) {
+                    row.value() = static_cast<typename OutputDataTypeTag::raw_type>(fractions_[row.idx()].to_double());
+                });
+            });
         }
-        col->set_row_data(fractions_.size() - 1);
         res.add_column(scalar_field(get_output_data_type(), output_column_name.value), std::move(col));
     }
     return res;
@@ -490,17 +491,20 @@ std::optional<Value> MeanAggregatorData::get_default_value() {
 
 void CountAggregatorData::aggregate(const ColumnWithStrings& input_column, const std::vector<size_t>& groups, size_t unique_values) {
     aggregated_.resize(unique_values);
+    sparse_map_.resize(unique_values);
     details::visit_type(input_column.column_->type().data_type(), [&input_column, &groups, this](auto col_tag) {
         using col_type_info = ScalarTypeInfo<decltype(col_tag)>;
         Column::for_each_enumerated<typename col_type_info::TDT>(*input_column.column_, [&groups, this](auto enumerating_it) {
             if constexpr (is_floating_point_type(col_type_info::data_type)) {
                 if (ARCTICDB_LIKELY(!std::isnan(enumerating_it.value()))) {
                     auto& val = aggregated_[groups[enumerating_it.idx()]];
                     ++val;
+                    sparse_map_.set(groups[enumerating_it.idx()]);
                 }
             } else {
                 auto& val = aggregated_[groups[enumerating_it.idx()]];
                 ++val;
+                sparse_map_.set(groups[enumerating_it.idx()]);
             }
         });
     });
@@ -510,11 +514,19 @@ SegmentInMemory CountAggregatorData::finalize(const ColumnName& output_column_na
     SegmentInMemory res;
     if (!aggregated_.empty()) {
         aggregated_.resize(unique_values);
-        auto pos = res.add_column(scalar_field(DataType::UINT64, output_column_name.value), unique_values, AllocationType::PRESIZED);
-        auto& column = res.column(pos);
-        auto ptr = reinterpret_cast<uint64_t*>(column.ptr());
-        column.set_row_data(unique_values - 1);
-        memcpy(ptr, aggregated_.data(), sizeof(uint64_t)*unique_values);
+        sparse_map_.resize(unique_values);
+        auto col = create_output_column(make_scalar_type(get_output_data_type()), std::move(sparse_map_), unique_values);
+        if (!col->opt_sparse_map().has_value()) {
+            // If all values are set we use memcpy for efficiency
+            auto ptr = reinterpret_cast<uint64_t*>(col->ptr());
+            memcpy(ptr, aggregated_.data(), sizeof(uint64_t)*unique_values);
+        } else {
+            using OutputTypeDescriptor = typename ScalarTypeInfo<DataTypeTag<DataType::UINT64>>::TDT;
+            Column::for_each_enumerated<OutputTypeDescriptor>(*col, [&](auto row) {
+                row.value() = aggregated_[row.idx()];
+            });
+        }
+        res.add_column(scalar_field(get_output_data_type(), output_column_name.value), std::move(col));
     }
     return res;
 }
@@ -538,6 +550,7 @@ void FirstAggregatorData::aggregate(const ColumnWithStrings& input_column, const
         using GlobalTypeDescriptorTag = typename OutputType<GlobalInputType>::type;
         using GlobalRawType = typename GlobalTypeDescriptorTag::DataTypeTag::raw_type;
         aggregated_.resize(sizeof(GlobalRawType) * unique_values);
+        sparse_map_.resize(unique_values);
         auto col_data = input_column.column_->data();
         auto out_ptr = reinterpret_cast<GlobalRawType*>(aggregated_.data());
         details::visit_type(input_column.column_->type().data_type(), [this, &groups, &out_ptr, &col_data](auto col_tag) {
@@ -553,11 +566,13 @@ void FirstAggregatorData::aggregate(const ColumnWithStrings& input_column, const
                     if (is_first_group_el || std::isnan(static_cast<ColumnType>(val))) {
                         groups_cache_.insert(groups[groups_pos]);
                         val = GlobalRawType(*ptr);
+                        sparse_map_.set(groups[groups_pos]);
                     }
                 } else {
                     if (is_first_group_el) {
                         groups_cache_.insert(groups[groups_pos]);
                         val = GlobalRawType(*ptr);
+                        sparse_map_.set(groups[groups_pos]);
                     }
                 }
             }
@@ -571,12 +586,20 @@ SegmentInMemory FirstAggregatorData::finalize(const ColumnName& output_column_na
     SegmentInMemory res;
     if (!aggregated_.empty()) {
         details::visit_type(*data_type_, [this, &res, &output_column_name, unique_values](auto col_tag) {
-            using RawType = typename decltype(col_tag)::DataTypeTag::raw_type;
+            using col_type_info = ScalarTypeInfo<decltype(col_tag)>;
+            using RawType = typename col_type_info::RawType;
             aggregated_.resize(sizeof(RawType) * unique_values);
-            auto col = std::make_shared<Column>(make_scalar_type(data_type_.value()), unique_values, AllocationType::PRESIZED, Sparsity::NOT_PERMITTED);
-            memcpy(col->ptr(), aggregated_.data(), aggregated_.size());
+            sparse_map_.resize(unique_values);
+            auto col = create_output_column(make_scalar_type(data_type_.value()), std::move(sparse_map_), unique_values);
+            if (!col->opt_sparse_map().has_value()) {
+                memcpy(col->ptr(), aggregated_.data(), aggregated_.size());
+            } else {
+                const std::span<const RawType> group_values{reinterpret_cast<const RawType*>(aggregated_.data()), aggregated_.size() / sizeof(RawType)};
+                Column::for_each_enumerated<typename col_type_info::TDT>(*col, [&](auto row) {
+                    row.value() = group_values[row.idx()];
+                });
+            }
             res.add_column(scalar_field(data_type_.value(), output_column_name.value), col);
-            col->set_row_data(unique_values - 1);
         });
     }
     return res;
@@ -601,6 +624,7 @@ void LastAggregatorData::aggregate(const ColumnWithStrings& input_column, const
         using GlobalTypeDescriptorTag = typename OutputType<GlobalInputType>::type;
         using GlobalRawType = typename GlobalTypeDescriptorTag::DataTypeTag::raw_type;
         aggregated_.resize(sizeof(GlobalRawType) * unique_values);
+        sparse_map_.resize(unique_values);
         auto col_data = input_column.column_->data();
         auto out_ptr = reinterpret_cast<GlobalRawType*>(aggregated_.data());
         details::visit_type(input_column.column_->type().data_type(), [&groups, &out_ptr, &col_data, this](auto col_tag) {
@@ -617,9 +641,11 @@ void LastAggregatorData::aggregate(const ColumnWithStrings& input_column, const
                     if (is_first_group_el || !std::isnan(static_cast<ColumnType>(curr))) {
                         groups_cache_.insert(groups[groups_pos]);
                         val = curr;
+                        sparse_map_.set(groups[groups_pos]);
                     }
                 } else {
                     val = GlobalRawType(*ptr);
+                    sparse_map_.set(groups[groups_pos]);
                 }
             }
         }
@@ -631,13 +657,21 @@ void LastAggregatorData::aggregate(const ColumnWithStrings& input_column, const
 SegmentInMemory LastAggregatorData::finalize(const ColumnName& output_column_name, bool, size_t unique_values) {
     SegmentInMemory res;
     if (!aggregated_.empty()) {
-        details::visit_type(*data_type_, [that=this, &res, &output_column_name, unique_values](auto col_tag) {
-            using RawType = typename decltype(col_tag)::DataTypeTag::raw_type;
-            that->aggregated_.resize(sizeof(RawType) * unique_values);
-            auto col = std::make_shared<Column>(make_scalar_type(that->data_type_.value()), unique_values, AllocationType::PRESIZED, Sparsity::NOT_PERMITTED);
-            memcpy(col->ptr(), that->aggregated_.data(), that->aggregated_.size());
-            res.add_column(scalar_field(that->data_type_.value(), output_column_name.value), col);
-            col->set_row_data(unique_values - 1);
+        details::visit_type(*data_type_, [&res, &output_column_name, unique_values, this](auto col_tag) {
+            using col_type_info = ScalarTypeInfo<decltype(col_tag)>;
+            using RawType = typename col_type_info::RawType;
+            aggregated_.resize(sizeof(RawType) * unique_values);
+            sparse_map_.resize(unique_values);
+            auto col = create_output_column(make_scalar_type(data_type_.value()), std::move(sparse_map_), unique_values);
+            if (!col->opt_sparse_map().has_value()) {
+                memcpy(col->ptr(), aggregated_.data(), aggregated_.size());
+            } else {
+                const std::span<const RawType> group_values{reinterpret_cast<const RawType*>(aggregated_.data()), aggregated_.size() / sizeof(RawType)};
+                Column::for_each_enumerated<typename col_type_info::TDT>(*col, [&](auto row) {
+                    row.value() = group_values[row.idx()];
+                });
+            }
+            res.add_column(scalar_field(data_type_.value(), output_column_name.value), col);
         });
     }
     return res;
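
Note: the pattern every aggregator follows in this patch is the same: aggregate() records which groups actually received a value in a sparse bitmap, and finalize() then either fills the whole output column (dense fast path, memcpy) or only the set positions (sparse path). The following is a minimal stand-alone sketch of that idea, using std::vector<bool> as a stand-in for util::BitSet and std::optional to model missing cells; none of the names below are ArcticDB's API.

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <optional>
#include <string>
#include <vector>

int main() {
    // Each input row belongs to one group; groups 1, 3 and 4 never receive a value.
    const std::vector<std::size_t> groups{0, 2, 2, 5, 0};
    const std::size_t unique_values = 6;

    std::vector<std::uint64_t> aggregated(unique_values, 0);  // per-group count
    std::vector<bool> sparse_map(unique_values, false);       // stand-in for util::BitSet

    for (const std::size_t group : groups) {
        ++aggregated[group];
        sparse_map[group] = true;  // record that this group has at least one value
    }

    // Output column: std::nullopt models a missing cell in a sparse column.
    std::vector<std::optional<std::uint64_t>> column(unique_values);

    if (std::all_of(sparse_map.cbegin(), sparse_map.cend(), [](bool set) { return set; })) {
        // Dense fast path: every group got a value, so the buffer can be copied wholesale.
        for (std::size_t i = 0; i < unique_values; ++i)
            column[i] = aggregated[i];
    } else {
        // Sparse path: only groups whose bit is set receive a value; the rest stay missing.
        for (std::size_t i = 0; i < unique_values; ++i)
            if (sparse_map[i])
                column[i] = aggregated[i];
    }

    for (std::size_t i = 0; i < unique_values; ++i)
        std::cout << "group " << i << ": "
                  << (column[i] ? std::to_string(*column[i]) : "<missing>") << '\n';
}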