Skip to content

Commit 8bf2dae

Browse files
authored
[refactor](parquet) Use column predicates to do filtering (#56904) (#59619)
pick #56739 #56904
1 parent 24a3d05 commit 8bf2dae

23 files changed

+1694
-455
lines changed

be/src/olap/block_column_predicate.h

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
#include "olap/column_predicate.h"
3434
#include "olap/olap_common.h"
3535
#include "vec/columns/column.h"
36+
#include "vec/exec/format/parquet/parquet_pred_cmp.h"
3637

3738
namespace roaring {
3839
class Roaring;
@@ -79,6 +80,10 @@ class BlockColumnPredicate {
7980
throw Exception(Status::FatalError("should not reach here"));
8081
}
8182

83+
virtual bool evaluate_and(vectorized::ParquetPredicate::ColumnStat* statistic) const {
84+
throw Exception(Status::FatalError("should not reach here"));
85+
}
86+
8287
virtual bool evaluate_and(const segment_v2::BloomFilter* bf) const {
8388
throw Exception(Status::FatalError("should not reach here"));
8489
}
@@ -117,6 +122,9 @@ class SingleColumnBlockPredicate : public BlockColumnPredicate {
117122
bool* flags) const override;
118123
bool support_zonemap() const override { return _predicate->support_zonemap(); }
119124
bool evaluate_and(const std::pair<WrapperField*, WrapperField*>& statistic) const override;
125+
bool evaluate_and(vectorized::ParquetPredicate::ColumnStat* statistic) const override {
126+
return _predicate->evaluate_and(statistic);
127+
}
120128
bool evaluate_and(const segment_v2::BloomFilter* bf) const override;
121129
bool evaluate_and(const StringRef* dict_words, const size_t dict_num) const override;
122130
void evaluate_or(vectorized::MutableColumns& block, uint16_t* sel, uint16_t selected_size,
@@ -180,6 +188,18 @@ class OrBlockColumnPredicate : public MutilColumnBlockPredicate {
180188
bool* flags) const override;
181189
void evaluate_or(vectorized::MutableColumns& block, uint16_t* sel, uint16_t selected_size,
182190
bool* flags) const override;
191+
bool evaluate_and(vectorized::ParquetPredicate::ColumnStat* statistic) const override {
192+
if (num_of_column_predicate() == 1) {
193+
return _block_column_predicate_vec[0]->evaluate_and(statistic);
194+
} else {
195+
for (int i = 0; i < num_of_column_predicate(); ++i) {
196+
if (_block_column_predicate_vec[i]->evaluate_and(statistic)) {
197+
return true;
198+
}
199+
}
200+
return false;
201+
}
202+
}
183203

184204
// note(wb) we didnt't implement evaluate_vec method here, because storage layer only support AND predicate now;
185205
};
@@ -203,6 +223,15 @@ class AndBlockColumnPredicate : public MutilColumnBlockPredicate {
203223

204224
bool evaluate_and(const StringRef* dict_words, const size_t dict_num) const override;
205225

226+
bool evaluate_and(vectorized::ParquetPredicate::ColumnStat* statistic) const override {
227+
for (auto& block_column_predicate : _block_column_predicate_vec) {
228+
if (!block_column_predicate->evaluate_and(statistic)) {
229+
return false;
230+
}
231+
}
232+
return true;
233+
}
234+
206235
bool can_do_bloom_filter(bool ngram) const override {
207236
for (auto& pred : _block_column_predicate_vec) {
208237
if (!pred->can_do_bloom_filter(ngram)) {

be/src/olap/column_predicate.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
#include "util/defer_op.h"
2828
#include "util/runtime_profile.h"
2929
#include "vec/columns/column.h"
30+
#include "vec/exec/format/parquet/parquet_pred_cmp.h"
3031
#include "vec/exprs/vruntimefilter_wrapper.h"
3132

3233
using namespace doris::segment_v2;
@@ -222,6 +223,15 @@ class ColumnPredicate {
222223

223224
virtual bool can_do_bloom_filter(bool ngram) const { return false; }
224225

226+
/**
227+
* Figure out whether this page is matched partially or completely.
228+
*/
229+
virtual bool evaluate_and(vectorized::ParquetPredicate::ColumnStat* statistic) const {
230+
throw Exception(ErrorCode::INTERNAL_ERROR,
231+
"ParquetPredicate is not supported by this predicate!");
232+
return true;
233+
}
234+
225235
// used to evaluate pre read column in lazy materialization
226236
// now only support integer/float
227237
// a vectorized eval way

be/src/olap/comparison_predicate.h

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,53 @@ class ComparisonPredicateBase : public ColumnPredicate {
142142
}
143143
}
144144

145+
/**
146+
* To figure out whether this page is matched partially or completely.
147+
*
148+
* 1. EQ: if `_value` belongs to the interval [min, max], return true to further compute each value in this page.
149+
* 2. NE: return true to further compute each value in this page if some values not equal to `_value`.
150+
* 3. LT|LE: if `_value` is greater than min, return true to further compute each value in this page.
151+
* 4. GT|GE: if `_value` is less than max, return true to further compute each value in this page.
152+
*/
153+
bool evaluate_and(vectorized::ParquetPredicate::ColumnStat* statistic) const override {
154+
if (!(*statistic->get_stat_func)(statistic, column_id())) {
155+
return true;
156+
}
157+
vectorized::Field min_field;
158+
vectorized::Field max_field;
159+
if (!vectorized::ParquetPredicate::get_min_max_value(
160+
statistic->col_schema, statistic->encoded_min_value,
161+
statistic->encoded_max_value, *statistic->ctz, &min_field, &max_field)
162+
.ok()) {
163+
return true;
164+
};
165+
T min_value;
166+
T max_value;
167+
if constexpr (is_int_or_bool(Type) || is_float_or_double(Type)) {
168+
min_value =
169+
(typename PrimitiveTypeTraits<Type>::CppType)min_field
170+
.template get<typename PrimitiveTypeTraits<Type>::NearestFieldType>();
171+
max_value =
172+
(typename PrimitiveTypeTraits<Type>::CppType)max_field
173+
.template get<typename PrimitiveTypeTraits<Type>::NearestFieldType>();
174+
} else {
175+
min_value = min_field.template get<typename PrimitiveTypeTraits<Type>::CppType>();
176+
max_value = max_field.template get<typename PrimitiveTypeTraits<Type>::CppType>();
177+
}
178+
179+
if constexpr (PT == PredicateType::EQ) {
180+
return Compare::less_equal(min_value, _value) &&
181+
Compare::greater_equal(max_value, _value);
182+
} else if constexpr (PT == PredicateType::NE) {
183+
return !Compare::equal(min_value, _value) || !Compare::equal(max_value, _value);
184+
} else if constexpr (PT == PredicateType::LT || PT == PredicateType::LE) {
185+
return Compare::less_equal(min_value, _value);
186+
} else {
187+
static_assert(PT == PredicateType::GT || PT == PredicateType::GE);
188+
return Compare::greater_equal(max_value, _value);
189+
}
190+
}
191+
145192
bool is_always_true(const std::pair<WrapperField*, WrapperField*>& statistic) const override {
146193
if (statistic.first->is_null() || statistic.second->is_null()) {
147194
return false;

be/src/olap/delete_handler.cpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -370,7 +370,8 @@ Status DeleteHandler::_parse_column_pred(TabletSchemaSPtr complete_schema,
370370
condition.__set_column_unique_id(col_unique_id);
371371
const auto& column = complete_schema->column_by_uid(col_unique_id);
372372
uint32_t index = complete_schema->field_index(col_unique_id);
373-
auto* predicate = parse_to_predicate(column, index, condition, _predicate_arena, true);
373+
auto* predicate =
374+
parse_to_predicate(column.get_vec_type(), index, condition, _predicate_arena, true);
374375
if (predicate != nullptr) {
375376
delete_conditions->column_predicate_vec.push_back(predicate);
376377
}
@@ -440,8 +441,8 @@ Status DeleteHandler::init(TabletSchemaSPtr tablet_schema,
440441
}
441442
const auto& column = tablet_schema->column_by_uid(col_unique_id);
442443
uint32_t index = tablet_schema->field_index(col_unique_id);
443-
temp.column_predicate_vec.push_back(
444-
parse_to_predicate(column, index, condition, _predicate_arena, true));
444+
temp.column_predicate_vec.push_back(parse_to_predicate(
445+
column.get_vec_type(), index, condition, _predicate_arena, true));
445446
}
446447

447448
_del_conds.emplace_back(std::move(temp));

be/src/olap/in_list_predicate.h

Lines changed: 56 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
#include "vec/columns/column_dictionary.h"
3737
#include "vec/common/string_ref.h"
3838
#include "vec/core/types.h"
39+
#include "vec/data_types/data_type.h"
3940

4041
// for uint24_t
4142
template <>
@@ -67,19 +68,19 @@ class InListPredicateBase : public ColumnPredicate {
6768
using T = typename PrimitiveTypeTraits<Type>::CppType;
6869
template <typename ConditionType, typename ConvertFunc>
6970
InListPredicateBase(uint32_t column_id, const ConditionType& conditions,
70-
const ConvertFunc& convert, bool is_opposite, const TabletColumn* col,
71-
vectorized::Arena& arena)
71+
const ConvertFunc& convert, bool is_opposite,
72+
const vectorized::DataTypePtr& data_type, vectorized::Arena& arena)
7273
: ColumnPredicate(column_id, is_opposite),
7374
_min_value(type_limit<T>::max()),
7475
_max_value(type_limit<T>::min()) {
7576
_values = std::make_shared<HybridSetType>(false);
7677
for (const auto& condition : conditions) {
7778
T tmp;
7879
if constexpr (Type == TYPE_STRING || Type == TYPE_CHAR) {
79-
tmp = convert(*col, condition, arena);
80+
tmp = convert(data_type, condition, arena);
8081
} else if constexpr (Type == TYPE_DECIMAL32 || Type == TYPE_DECIMAL64 ||
8182
Type == TYPE_DECIMAL128I || Type == TYPE_DECIMAL256) {
82-
tmp = convert(*col, condition);
83+
tmp = convert(data_type, condition);
8384
} else {
8485
tmp = convert(condition);
8586
}
@@ -245,6 +246,42 @@ class InListPredicateBase : public ColumnPredicate {
245246
}
246247
}
247248

249+
bool evaluate_and(vectorized::ParquetPredicate::ColumnStat* statistic) const override {
250+
if (!(*statistic->get_stat_func)(statistic, column_id())) {
251+
return true;
252+
}
253+
vectorized::Field min_field;
254+
vectorized::Field max_field;
255+
if (!vectorized::ParquetPredicate::get_min_max_value(
256+
statistic->col_schema, statistic->encoded_min_value,
257+
statistic->encoded_max_value, *statistic->ctz, &min_field, &max_field)
258+
.ok()) {
259+
return true;
260+
};
261+
T min_value;
262+
T max_value;
263+
if constexpr (is_int_or_bool(Type) || is_float_or_double(Type)) {
264+
min_value =
265+
(typename PrimitiveTypeTraits<Type>::CppType)min_field
266+
.template get<typename PrimitiveTypeTraits<Type>::NearestFieldType>();
267+
max_value =
268+
(typename PrimitiveTypeTraits<Type>::CppType)max_field
269+
.template get<typename PrimitiveTypeTraits<Type>::NearestFieldType>();
270+
} else {
271+
min_value = min_field.template get<typename PrimitiveTypeTraits<Type>::CppType>();
272+
max_value = max_field.template get<typename PrimitiveTypeTraits<Type>::CppType>();
273+
}
274+
275+
if constexpr (PT == PredicateType::IN_LIST) {
276+
return (Compare::less_equal(min_value, _max_value) &&
277+
Compare::greater_equal(max_value, _min_value)) ||
278+
(Compare::greater_equal(max_value, _min_value) &&
279+
Compare::less_equal(min_value, _max_value));
280+
} else {
281+
return true;
282+
}
283+
}
284+
248285
bool evaluate_and(const StringRef* dict_words, const size_t count) const override {
249286
for (size_t i = 0; i != count; ++i) {
250287
const auto found = _values->find(dict_words[i].data, dict_words[i].size) ^ _opposite;
@@ -548,57 +585,59 @@ template <PrimitiveType Type, PredicateType PT, typename ConditionType, typename
548585
size_t N = 0>
549586
ColumnPredicate* _create_in_list_predicate(uint32_t column_id, const ConditionType& conditions,
550587
const ConvertFunc& convert, bool is_opposite,
551-
const TabletColumn* col, vectorized::Arena& arena) {
588+
const vectorized::DataTypePtr& data_type,
589+
vectorized::Arena& arena) {
552590
using T = typename PrimitiveTypeTraits<Type>::CppType;
553591
if constexpr (N >= 1 && N <= FIXED_CONTAINER_MAX_SIZE) {
554592
using Set = std::conditional_t<
555593
std::is_same_v<T, StringRef>, StringSet<FixedContainer<std::string, N>>,
556594
HybridSet<Type, FixedContainer<T, N>,
557595
vectorized::PredicateColumnType<PredicateEvaluateType<Type>>>>;
558596
return new InListPredicateBase<Type, PT, Set>(column_id, conditions, convert, is_opposite,
559-
col, arena);
597+
data_type, arena);
560598
} else {
561599
using Set = std::conditional_t<
562600
std::is_same_v<T, StringRef>, StringSet<DynamicContainer<std::string>>,
563601
HybridSet<Type, DynamicContainer<T>,
564602
vectorized::PredicateColumnType<PredicateEvaluateType<Type>>>>;
565603
return new InListPredicateBase<Type, PT, Set>(column_id, conditions, convert, is_opposite,
566-
col, arena);
604+
data_type, arena);
567605
}
568606
}
569607

570608
template <PrimitiveType Type, PredicateType PT, typename ConditionType, typename ConvertFunc>
571609
ColumnPredicate* create_in_list_predicate(uint32_t column_id, const ConditionType& conditions,
572610
const ConvertFunc& convert, bool is_opposite,
573-
const TabletColumn* col, vectorized::Arena& arena) {
611+
const vectorized::DataTypePtr& data_type,
612+
vectorized::Arena& arena) {
574613
if (conditions.size() == 1) {
575614
return _create_in_list_predicate<Type, PT, ConditionType, ConvertFunc, 1>(
576-
column_id, conditions, convert, is_opposite, col, arena);
615+
column_id, conditions, convert, is_opposite, data_type, arena);
577616
} else if (conditions.size() == 2) {
578617
return _create_in_list_predicate<Type, PT, ConditionType, ConvertFunc, 2>(
579-
column_id, conditions, convert, is_opposite, col, arena);
618+
column_id, conditions, convert, is_opposite, data_type, arena);
580619
} else if (conditions.size() == 3) {
581620
return _create_in_list_predicate<Type, PT, ConditionType, ConvertFunc, 3>(
582-
column_id, conditions, convert, is_opposite, col, arena);
621+
column_id, conditions, convert, is_opposite, data_type, arena);
583622
} else if (conditions.size() == 4) {
584623
return _create_in_list_predicate<Type, PT, ConditionType, ConvertFunc, 4>(
585-
column_id, conditions, convert, is_opposite, col, arena);
624+
column_id, conditions, convert, is_opposite, data_type, arena);
586625
} else if (conditions.size() == 5) {
587626
return _create_in_list_predicate<Type, PT, ConditionType, ConvertFunc, 5>(
588-
column_id, conditions, convert, is_opposite, col, arena);
627+
column_id, conditions, convert, is_opposite, data_type, arena);
589628
} else if (conditions.size() == 6) {
590629
return _create_in_list_predicate<Type, PT, ConditionType, ConvertFunc, 6>(
591-
column_id, conditions, convert, is_opposite, col, arena);
630+
column_id, conditions, convert, is_opposite, data_type, arena);
592631
} else if (conditions.size() == 7) {
593632
return _create_in_list_predicate<Type, PT, ConditionType, ConvertFunc, 7>(
594-
column_id, conditions, convert, is_opposite, col, arena);
633+
column_id, conditions, convert, is_opposite, data_type, arena);
595634
} else if (conditions.size() == FIXED_CONTAINER_MAX_SIZE) {
596635
return _create_in_list_predicate<Type, PT, ConditionType, ConvertFunc,
597636
FIXED_CONTAINER_MAX_SIZE>(column_id, conditions, convert,
598-
is_opposite, col, arena);
637+
is_opposite, data_type, arena);
599638
} else {
600639
return _create_in_list_predicate<Type, PT, ConditionType, ConvertFunc>(
601-
column_id, conditions, convert, is_opposite, col, arena);
640+
column_id, conditions, convert, is_opposite, data_type, arena);
602641
}
603642
}
604643

be/src/olap/null_predicate.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,17 @@ class NullPredicate : public ColumnPredicate {
6666
}
6767
}
6868

69+
bool evaluate_and(vectorized::ParquetPredicate::ColumnStat* statistic) const override {
70+
if (!(*statistic->get_stat_func)(statistic, column_id())) {
71+
return true;
72+
}
73+
if (_is_null) {
74+
return true;
75+
} else {
76+
return !statistic->is_all_null;
77+
}
78+
}
79+
6980
bool evaluate_del(const std::pair<WrapperField*, WrapperField*>& statistic) const override {
7081
// evaluate_del only use for delete condition to filter page, need use delete condition origin value,
7182
// when opposite==true, origin value 'is null'->'is not null' and 'is not null'->'is null',

0 commit comments

Comments
 (0)