3232#include " vec/common/hash_table/string_hash_map.h"
3333#include " vec/common/string_ref.h"
3434#include " vec/core/types.h"
35+ #include " vec/utils/template_helpers.hpp"
3536
3637namespace doris ::vectorized {
3738#include " common/compile_check_begin.h"
@@ -477,6 +478,69 @@ struct MethodOneNumberDirect : public MethodOneNumber<FieldType, TData> {
477478 }
478479};
479480
481+ template <int N>
482+ void pack_nullmaps_interleaved (const uint8_t * const * datas, const uint8_t * bit_offsets,
483+ size_t row_numbers, size_t stride, uint8_t * __restrict out) {
484+ static_assert (N >= 1 && N <= BITSIZE);
485+
486+ const uint8_t * __restrict p0 = (N > 0 ) ? datas[0 ] : nullptr ;
487+ const uint8_t * __restrict p1 = (N > 1 ) ? datas[1 ] : nullptr ;
488+ const uint8_t * __restrict p2 = (N > 2 ) ? datas[2 ] : nullptr ;
489+ const uint8_t * __restrict p3 = (N > 3 ) ? datas[3 ] : nullptr ;
490+ const uint8_t * __restrict p4 = (N > 4 ) ? datas[4 ] : nullptr ;
491+ const uint8_t * __restrict p5 = (N > 5 ) ? datas[5 ] : nullptr ;
492+ const uint8_t * __restrict p6 = (N > 6 ) ? datas[6 ] : nullptr ;
493+ const uint8_t * __restrict p7 = (N > 7 ) ? datas[7 ] : nullptr ;
494+
495+ const uint8_t m0 = (N > 0 ) ? bit_offsets[0 ] : 0 ;
496+ const uint8_t m1 = (N > 1 ) ? bit_offsets[1 ] : 0 ;
497+ const uint8_t m2 = (N > 2 ) ? bit_offsets[2 ] : 0 ;
498+ const uint8_t m3 = (N > 3 ) ? bit_offsets[3 ] : 0 ;
499+ const uint8_t m4 = (N > 4 ) ? bit_offsets[4 ] : 0 ;
500+ const uint8_t m5 = (N > 5 ) ? bit_offsets[5 ] : 0 ;
501+ const uint8_t m6 = (N > 6 ) ? bit_offsets[6 ] : 0 ;
502+ const uint8_t m7 = (N > 7 ) ? bit_offsets[7 ] : 0 ;
503+
504+ for (size_t i = 0 ; i < row_numbers; ++i) {
505+ uint8_t byte = 0 ;
506+
507+ if constexpr (N > 0 ) {
508+ byte |= p0[i] << m0;
509+ }
510+ if constexpr (N > 1 ) {
511+ byte |= p1[i] << m1;
512+ }
513+ if constexpr (N > 2 ) {
514+ byte |= p2[i] << m2;
515+ }
516+ if constexpr (N > 3 ) {
517+ byte |= p3[i] << m3;
518+ }
519+ if constexpr (N > 4 ) {
520+ byte |= p4[i] << m4;
521+ }
522+ if constexpr (N > 5 ) {
523+ byte |= p5[i] << m5;
524+ }
525+ if constexpr (N > 6 ) {
526+ byte |= p6[i] << m6;
527+ }
528+ if constexpr (N > 7 ) {
529+ byte |= p7[i] << m7;
530+ }
531+
532+ out[i * stride] |= byte;
533+ }
534+ }
535+
536+ template <int N>
537+ struct PackNullmapsReducer {
538+ static void run (const uint8_t * const * datas, const uint8_t * coefficients, size_t row_numbers,
539+ size_t stride, uint8_t * __restrict out) {
540+ pack_nullmaps_interleaved<N>(datas, coefficients, row_numbers, stride, out);
541+ }
542+ };
543+
480544template <typename TData>
481545struct MethodKeysFixed : public MethodBase <TData> {
482546 using Base = MethodBase<TData>;
@@ -499,36 +563,40 @@ struct MethodKeysFixed : public MethodBase<TData> {
499563 void pack_fixeds (size_t row_numbers, const ColumnRawPtrs& key_columns,
500564 const ColumnRawPtrs& nullmap_columns, DorisVector<T>& result) {
501565 size_t bitmap_size = get_bitmap_size (nullmap_columns.size ());
502- // set size to 0 at first, then use resize to call default constructor on index included from [0, row_numbers) to reset all memory
503- result.clear ();
566+ if (bitmap_size) {
567+ // set size to 0 at first, then use resize to call default constructor on index included from [0, row_numbers) to reset all memory
568+ // only need to reset the memory used to bitmap
569+ result.clear ();
570+ }
504571 result.resize (row_numbers);
505572
506573 auto * __restrict result_data = reinterpret_cast <char *>(result.data ());
507574
508575 size_t offset = 0 ;
509576 std::vector<bool > has_null_column (nullmap_columns.size (), false );
510577 if (bitmap_size > 0 ) {
578+ std::vector<const uint8_t *> nullmap_datas;
579+ std::vector<uint8_t > bit_offsets;
511580 for (size_t j = 0 ; j < nullmap_columns.size (); j++) {
512581 if (!nullmap_columns[j]) {
513582 continue ;
514583 }
515- const auto * __restrict data =
584+ const uint8_t * __restrict data =
516585 assert_cast<const ColumnUInt8&>(*nullmap_columns[j]).get_data ().data ();
517586
518- if (!simd::contain_one (data, row_numbers)) {
519- continue ;
520- }
521- has_null_column[j] = true ;
522-
523- size_t bucket = j / BITSIZE;
524- size_t local_offset = j - bucket * BITSIZE;
525- const auto mask = uint8_t (1 << local_offset);
526- auto * __restrict current = result_data + bucket;
527- for (size_t i = 0 ; i < row_numbers; ++i) {
528- *(current) |= data[i] * mask;
529- current += sizeof (T);
587+ has_null_column[j] = simd::contain_one (data, row_numbers);
588+ if (has_null_column[j]) {
589+ nullmap_datas.emplace_back (data);
590+ bit_offsets.emplace_back (j % BITSIZE);
530591 }
531592 }
593+ for (size_t j = 0 , bucket = 0 ; j < nullmap_datas.size (); j += BITSIZE, bucket++) {
594+ int column_batch = std::min (BITSIZE, (int )(nullmap_datas.size () - j));
595+ constexpr_int_match<1 , BITSIZE, PackNullmapsReducer>::run (
596+ column_batch, nullmap_datas.data () + j, bit_offsets.data () + j, row_numbers,
597+ sizeof (T), reinterpret_cast <uint8_t *>(result_data + bucket));
598+ }
599+
532600 offset += bitmap_size;
533601 }
534602
@@ -545,8 +613,9 @@ struct MethodKeysFixed : public MethodBase<TData> {
545613 }
546614 auto * __restrict current = result_data + offset;
547615 for (size_t i = 0 ; i < row_numbers; ++i) {
548- memcpy_fixed<Fixed, true >(current, data + i * sizeof (Fixed) );
616+ memcpy_fixed<Fixed, true >(current, data);
549617 current += sizeof (T);
618+ data += sizeof (Fixed);
550619 }
551620 };
552621
0 commit comments