Skip to content

Commit 39f9a4b

Browse files
committed
pack fixed opt2
1 parent dda0d7c commit 39f9a4b

File tree

2 files changed

+87
-17
lines changed

2 files changed

+87
-17
lines changed

be/src/vec/columns/column_vector.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -515,8 +515,9 @@ MutableColumnPtr ColumnVector<T>::permute(const IColumn::Permutation& perm, size
515515
template <PrimitiveType T>
516516
void ColumnVector<T>::replace_column_null_data(const uint8_t* __restrict null_map) {
517517
auto s = size();
518+
auto value = default_value();
518519
for (size_t i = 0; i < s; ++i) {
519-
data[i] = null_map[i] ? default_value() : data[i];
520+
data[i] = null_map[i] ? value : data[i];
520521
}
521522
}
522523

be/src/vec/common/hash_table/hash_map_context.h

Lines changed: 85 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
#include "vec/common/hash_table/string_hash_map.h"
3333
#include "vec/common/string_ref.h"
3434
#include "vec/core/types.h"
35+
#include "vec/utils/template_helpers.hpp"
3536

3637
namespace doris::vectorized {
3738
#include "common/compile_check_begin.h"
@@ -477,6 +478,69 @@ struct MethodOneNumberDirect : public MethodOneNumber<FieldType, TData> {
477478
}
478479
};
479480

481+
/// ORs the flag bytes of N source arrays into one strided output byte per row.
///
/// For every row `i` in [0, row_numbers) this computes
///   out[i * stride] |= (datas[0][i] << bit_offsets[0]) | ... | (datas[N-1][i] << bit_offsets[N-1])
/// truncated to 8 bits.
///
/// @tparam N          number of source arrays; must be in [1, 8] because all
///                    flags of one call land in a single output byte.
/// @param datas       N pointers, each to at least `row_numbers` bytes.
/// @param bit_offsets N left-shift amounts, one per source array.
/// @param row_numbers number of rows to process; 0 leaves `out` untouched.
/// @param stride      distance in bytes between consecutive output rows.
/// @param out         destination; bits are OR-ed in, so bits already set are kept.
///
/// NOTE(review): source bytes are presumably 0/1 null flags and offsets are
/// presumably < 8 -- confirm against the caller; larger values would spill
/// into neighbouring bits or be truncated away.
template <int N>
void pack_nullmaps_interleaved(const uint8_t* const* datas, const uint8_t* bit_offsets,
                               size_t row_numbers, size_t stride, uint8_t* __restrict out) {
    // The bound is the width of the output byte itself, so instantiating with
    // more sources than one byte can hold fails at compile time.
    constexpr int kBitsPerByte = 8;
    static_assert(N >= 1 && N <= kBitsPerByte,
                  "one output byte can hold at most 8 null flags");

    // Snapshot the per-source descriptors into locals so the row loop does not
    // go back through `datas` / `bit_offsets` on every iteration.
    const uint8_t* lanes[N];
    uint8_t shifts[N];
    for (int k = 0; k < N; ++k) {
        lanes[k] = datas[k];
        shifts[k] = bit_offsets[k];
    }

    for (size_t i = 0; i < row_numbers; ++i) {
        unsigned packed = 0;
        // N is a compile-time constant, so this loop has a fixed trip count.
        for (int k = 0; k < N; ++k) {
            packed |= static_cast<unsigned>(lanes[k][i]) << shifts[k];
        }
        // Explicit narrowing: only the low 8 bits belong to this output byte.
        out[i * stride] |= static_cast<uint8_t>(packed);
    }
}
535+
536+
template <int N>
537+
struct PackNullmapsReducer {
538+
static void run(const uint8_t* const* datas, const uint8_t* coefficients, size_t row_numbers,
539+
size_t stride, uint8_t* __restrict out) {
540+
pack_nullmaps_interleaved<N>(datas, coefficients, row_numbers, stride, out);
541+
}
542+
};
543+
480544
template <typename TData>
481545
struct MethodKeysFixed : public MethodBase<TData> {
482546
using Base = MethodBase<TData>;
@@ -499,36 +563,40 @@ struct MethodKeysFixed : public MethodBase<TData> {
499563
void pack_fixeds(size_t row_numbers, const ColumnRawPtrs& key_columns,
500564
const ColumnRawPtrs& nullmap_columns, DorisVector<T>& result) {
501565
size_t bitmap_size = get_bitmap_size(nullmap_columns.size());
502-
// set size to 0 at first, then use resize to call default constructor on index included from [0, row_numbers) to reset all memory
503-
result.clear();
566+
if (bitmap_size) {
567+
// Set the size to 0 first, so that the following resize default-constructs every element in [0, row_numbers) and thereby resets all of the memory.
568+
// Only the memory used for the null bitmap needs to be reset.
569+
result.clear();
570+
}
504571
result.resize(row_numbers);
505572

506573
auto* __restrict result_data = reinterpret_cast<char*>(result.data());
507574

508575
size_t offset = 0;
509576
std::vector<bool> has_null_column(nullmap_columns.size(), false);
510577
if (bitmap_size > 0) {
578+
std::vector<const uint8_t*> nullmap_datas;
579+
std::vector<uint8_t> bit_offsets;
511580
for (size_t j = 0; j < nullmap_columns.size(); j++) {
512581
if (!nullmap_columns[j]) {
513582
continue;
514583
}
515-
const auto* __restrict data =
584+
const uint8_t* __restrict data =
516585
assert_cast<const ColumnUInt8&>(*nullmap_columns[j]).get_data().data();
517586

518-
if (!simd::contain_one(data, row_numbers)) {
519-
continue;
520-
}
521-
has_null_column[j] = true;
522-
523-
size_t bucket = j / BITSIZE;
524-
size_t local_offset = j - bucket * BITSIZE;
525-
const auto mask = uint8_t(1 << local_offset);
526-
auto* __restrict current = result_data + bucket;
527-
for (size_t i = 0; i < row_numbers; ++i) {
528-
*(current) |= data[i] * mask;
529-
current += sizeof(T);
587+
has_null_column[j] = simd::contain_one(data, row_numbers);
588+
if (has_null_column[j]) {
589+
nullmap_datas.emplace_back(data);
590+
bit_offsets.emplace_back(j % BITSIZE);
530591
}
531592
}
593+
for (size_t j = 0, bucket = 0; j < nullmap_datas.size(); j += BITSIZE, bucket++) {
594+
int column_batch = std::min(BITSIZE, (int)(nullmap_datas.size() - j));
595+
constexpr_int_match<1, BITSIZE, PackNullmapsReducer>::run(
596+
column_batch, nullmap_datas.data() + j, bit_offsets.data() + j, row_numbers,
597+
sizeof(T), reinterpret_cast<uint8_t*>(result_data + bucket));
598+
}
599+
532600
offset += bitmap_size;
533601
}
534602

@@ -545,8 +613,9 @@ struct MethodKeysFixed : public MethodBase<TData> {
545613
}
546614
auto* __restrict current = result_data + offset;
547615
for (size_t i = 0; i < row_numbers; ++i) {
548-
memcpy_fixed<Fixed, true>(current, data + i * sizeof(Fixed));
616+
memcpy_fixed<Fixed, true>(current, data);
549617
current += sizeof(T);
618+
data += sizeof(Fixed);
550619
}
551620
};
552621

0 commit comments

Comments
 (0)