diff --git a/be/src/exec/olap_common.h b/be/src/exec/olap_common.h index 936517b83f4b53..357f408f262f6e 100644 --- a/be/src/exec/olap_common.h +++ b/be/src/exec/olap_common.h @@ -106,6 +106,8 @@ class ColumnValueRange { ColumnValueRange(std::string col_name); ColumnValueRange(std::string col_name, int precision, int scale); + ColumnValueRange(std::string col_name, const CppType& min, const CppType& max, + bool contain_null); ColumnValueRange(std::string col_name, bool is_nullable_col, int precision, int scale); @@ -127,8 +129,6 @@ class ColumnValueRange { bool is_range_value_convertible() const; - void convert_to_fixed_value(); - void convert_to_range_value(); bool convert_to_avg_range_value(std::vector& begin_scan_keys, @@ -141,8 +141,6 @@ class ColumnValueRange { constexpr bool is_reject_split_type() const { return _is_reject_split_type; } - bool has_intersection(ColumnValueRange& range); - void intersection(ColumnValueRange& range); void set_empty_value_range() { @@ -168,12 +166,8 @@ class ColumnValueRange { bool is_low_value_minimum() const { return Compare::equal(_low_value, TYPE_MIN); } - bool is_low_value_maximum() const { return Compare::equal(_low_value, TYPE_MAX); } - bool is_high_value_maximum() const { return Compare::equal(_high_value, TYPE_MAX); } - bool is_high_value_minimum() const { return Compare::equal(_high_value, TYPE_MIN); } - bool is_begin_include() const { return _low_op == FILTER_LARGER_OR_EQUAL; } bool is_end_include() const { return _high_op == FILTER_LESS_OR_EQUAL; } @@ -182,98 +176,10 @@ class ColumnValueRange { const std::string& column_name() const { return _column_name; } - bool is_nullable_col() const { return _is_nullable_col; } - bool contain_null() const { return _contain_null; } size_t get_fixed_value_size() const { return _fixed_values.size(); } - void to_olap_filter(std::vector>& filters) { - if (is_fixed_value_range()) { - // 1. convert to in filter condition - to_in_condition(filters, true); - } else if (Compare::less(_low_value, _high_value)) { - // 2. convert to min max filter condition - TCondition null_pred; - if (Compare::equal(TYPE_MAX, _high_value) && _high_op == FILTER_LESS_OR_EQUAL && - Compare::equal(TYPE_MIN, _low_value) && _low_op == FILTER_LARGER_OR_EQUAL && - _is_nullable_col && !contain_null()) { - null_pred.__set_column_name(_column_name); - null_pred.__set_condition_op("is"); - null_pred.condition_values.emplace_back("not null"); - } - - if (null_pred.condition_values.size() != 0) { - filters.emplace_back(_column_name, null_pred, _runtime_filter_id, - _predicate_filtered_rows_counter, - _predicate_input_rows_counter, - _predicate_always_true_rows_counter); - return; - } - - TCondition low; - if (Compare::not_equal(TYPE_MIN, _low_value) || FILTER_LARGER_OR_EQUAL != _low_op) { - low.__set_column_name(_column_name); - low.__set_condition_op((_low_op == FILTER_LARGER_OR_EQUAL ? ">=" : ">>")); - low.condition_values.push_back( - cast_to_string(_low_value, _scale)); - } - - if (low.condition_values.size() != 0) { - filters.emplace_back( - _column_name, low, _runtime_filter_id, _predicate_filtered_rows_counter, - _predicate_input_rows_counter, _predicate_always_true_rows_counter); - } - - TCondition high; - if (Compare::not_equal(TYPE_MAX, _high_value) || FILTER_LESS_OR_EQUAL != _high_op) { - high.__set_column_name(_column_name); - high.__set_condition_op((_high_op == FILTER_LESS_OR_EQUAL ? "<=" : "<<")); - high.condition_values.push_back( - cast_to_string(_high_value, _scale)); - } - - if (high.condition_values.size() != 0) { - filters.emplace_back( - _column_name, high, _runtime_filter_id, _predicate_filtered_rows_counter, - _predicate_input_rows_counter, _predicate_always_true_rows_counter); - } - } else { - // 3. convert to is null and is not null filter condition - TCondition null_pred; - if (Compare::equal(TYPE_MAX, _low_value) && Compare::equal(TYPE_MIN, _high_value) && - _is_nullable_col && contain_null()) { - null_pred.__set_column_name(_column_name); - null_pred.__set_condition_op("is"); - null_pred.condition_values.emplace_back("null"); - } - - if (null_pred.condition_values.size() != 0) { - filters.emplace_back(_column_name, null_pred, _runtime_filter_id, - _predicate_filtered_rows_counter, - _predicate_input_rows_counter, - _predicate_always_true_rows_counter); - } - } - } - - void to_in_condition(std::vector>& filters, bool is_in = true) { - TCondition condition; - condition.__set_column_name(_column_name); - condition.__set_condition_op(is_in ? "*=" : "!*="); - - for (const auto& value : _fixed_values) { - condition.condition_values.push_back( - cast_to_string(value, _scale)); - } - - if (condition.condition_values.size() != 0) { - filters.emplace_back(_column_name, condition, _runtime_filter_id, - _predicate_filtered_rows_counter, _predicate_input_rows_counter, - _predicate_always_true_rows_counter); - } - } - void set_whole_value_range() { _fixed_values.clear(); _low_value = TYPE_MIN; @@ -307,41 +213,23 @@ class ColumnValueRange { _contain_null = _is_nullable_col && contain_null; } - void attach_profile_counter( - int runtime_filter_id, - std::shared_ptr predicate_filtered_rows_counter, - std::shared_ptr predicate_input_rows_counter, - std::shared_ptr predicate_always_true_rows_counter) { - DCHECK(predicate_filtered_rows_counter != nullptr); - DCHECK(predicate_input_rows_counter != nullptr); - - _runtime_filter_id = runtime_filter_id; - - if (predicate_filtered_rows_counter != nullptr) { - _predicate_filtered_rows_counter = predicate_filtered_rows_counter; - } - if (predicate_input_rows_counter != nullptr) { - _predicate_input_rows_counter = predicate_input_rows_counter; - } - if (predicate_always_true_rows_counter != nullptr) { - _predicate_always_true_rows_counter = predicate_always_true_rows_counter; - } - } - int precision() const { return _precision; } int scale() const { return _scale; } - static void add_fixed_value_range(ColumnValueRange& range, + static void add_fixed_value_range(ColumnValueRange& range, SQLFilterOp op, const CppType* value) { static_cast(range.add_fixed_value(*value)); } - static void remove_fixed_value_range(ColumnValueRange& range, + static void remove_fixed_value_range(ColumnValueRange& range, SQLFilterOp op, const CppType* value) { range.remove_fixed_value(*value); } + static void empty_function(ColumnValueRange& range, SQLFilterOp op, + const CppType* value) {} + static void add_value_range(ColumnValueRange& range, SQLFilterOp op, const CppType* value) { static_cast(range.add_range(op, *value)); @@ -350,23 +238,14 @@ class ColumnValueRange { static ColumnValueRange create_empty_column_value_range(bool is_nullable_col, int precision, int scale) { - return ColumnValueRange::create_empty_column_value_range( - "", is_nullable_col, precision, scale); - } - - static ColumnValueRange create_empty_column_value_range( - const std::string& col_name, bool is_nullable_col, int precision, int scale) { - return ColumnValueRange(col_name, TYPE_MAX, TYPE_MIN, is_nullable_col, - false, precision, scale); + return ColumnValueRange("", TYPE_MAX, TYPE_MIN, is_nullable_col, false, + precision, scale); } protected: bool is_in_range(const CppType& value); private: - ColumnValueRange(std::string col_name, const CppType& min, const CppType& max, - bool contain_null); - ColumnValueRange(std::string col_name, const CppType& min, const CppType& max, bool is_nullable_col, bool contain_null, int precision, int scale); @@ -400,15 +279,6 @@ class ColumnValueRange { primitive_type == PrimitiveType::TYPE_DATETIMEV2 || primitive_type == PrimitiveType::TYPE_TIMESTAMPTZ || primitive_type == PrimitiveType::TYPE_DECIMAL256; - - int _runtime_filter_id = -1; - - std::shared_ptr _predicate_filtered_rows_counter = - std::make_shared(TUnit::UNIT, 0); - std::shared_ptr _predicate_input_rows_counter = - std::make_shared(TUnit::UNIT, 0); - std::shared_ptr _predicate_always_true_rows_counter = - std::make_shared(TUnit::UNIT, 0); }; template <> const typename ColumnValueRange::CppType ColumnValueRange::TYPE_MIN; @@ -421,12 +291,7 @@ const typename ColumnValueRange::CppType ColumnValueRange Status extend_scan_key(ColumnValueRange& range, int32_t max_scan_key_num, bool* exact_value, bool* eos, bool* should_break); @@ -458,30 +323,15 @@ class OlapScanKeys { return _begin_scan_keys.size(); } - void set_begin_include(bool begin_include) { _begin_include = begin_include; } - - bool begin_include() const { return _begin_include; } - - void set_end_include(bool end_include) { _end_include = end_include; } - - bool end_include() const { return _end_include; } - void set_is_convertible(bool is_convertible) { _is_convertible = is_convertible; } - // now, only use in UT - static std::string to_print_key(const OlapTuple& scan_keys) { - std::stringstream sstream; - sstream << scan_keys; - return sstream.str(); - } - private: std::vector _begin_scan_keys; std::vector _end_scan_keys; - bool _has_range_value; - bool _begin_include; - bool _end_include; - bool _is_convertible; + bool _has_range_value = false; + bool _begin_include = false; + bool _end_include = false; + bool _is_convertible = false; }; using ColumnValueRangeType = std::variant< @@ -509,10 +359,6 @@ template ColumnValueRange::ColumnValueRange() : _column_type(INVALID_TYPE), _precision(-1), _scale(-1) {} -template -ColumnValueRange::ColumnValueRange(std::string col_name) - : ColumnValueRange(std::move(col_name), TYPE_MIN, TYPE_MAX, true) {} - template ColumnValueRange::ColumnValueRange(std::string col_name, const CppType& min, const CppType& max, bool contain_null) @@ -542,10 +388,6 @@ ColumnValueRange::ColumnValueRange(std::string col_name, const C _precision(precision), _scale(scale) {} -template -ColumnValueRange::ColumnValueRange(std::string col_name, int precision, int scale) - : ColumnValueRange(std::move(col_name), TYPE_MIN, TYPE_MAX, true, true, precision, scale) {} - template ColumnValueRange::ColumnValueRange(std::string col_name, bool is_nullable_col, int precision, int scale) @@ -974,77 +816,6 @@ void ColumnValueRange::intersection(ColumnValueRange -bool ColumnValueRange::has_intersection(ColumnValueRange& range) { - // 1. return false if column type not match - if (_column_type != range._column_type) { - return false; - } - - // 2. return false if any range is empty - if (is_empty_value_range() || range.is_empty_value_range()) { - return false; - } - - // 3.1 return false if two int fixedRange has no intersection - if (is_fixed_value_range() && range.is_fixed_value_range()) { - SetType result_values; - set_intersection(_fixed_values.begin(), _fixed_values.end(), range._fixed_values.begin(), - range._fixed_values.end(), - std::inserter(result_values, result_values.begin())); - - if (result_values.size() != 0) { - return true; - } else { - return false; - } - } // 3.2 - else if (is_fixed_value_range() && !range.is_fixed_value_range()) { - IteratorType iter = _fixed_values.begin(); - - while (iter != _fixed_values.end()) { - if (range.is_in_range(*iter)) { - return true; - } - - ++iter; - } - - return false; - } else if (!is_fixed_value_range() && range.is_fixed_value_range()) { - IteratorType iter = range._fixed_values.begin(); - - while (iter != range._fixed_values.end()) { - if (this->is_in_range(*iter)) { - return true; - } - - ++iter; - } - - return false; - } else { - if (Compare::greater(_low_value, range._high_value) || - Compare::greater(range._low_value, _high_value)) { - return false; - } else if (Compare::equal(_low_value, range._high_value)) { - if (FILTER_LARGER_OR_EQUAL == _low_op && FILTER_LESS_OR_EQUAL == range._high_op) { - return true; - } else { - return false; - } - } else if (Compare::equal(range._low_value, _high_value)) { - if (FILTER_LARGER_OR_EQUAL == range._low_op && FILTER_LESS_OR_EQUAL == _high_op) { - return true; - } else { - return false; - } - } else { - return true; - } - } -} - template Status OlapScanKeys::extend_scan_key(ColumnValueRange& range, int32_t max_scan_key_num, bool* exact_value, bool* eos, @@ -1199,10 +970,5 @@ Status OlapScanKeys::extend_scan_key(ColumnValueRange& range, return Status::OK(); } -struct ScanPredicate { - TCondition condition; - PrimitiveType primitiveType; -}; - #include "common/compile_check_end.h" } // namespace doris diff --git a/be/src/exec/olap_utils.h b/be/src/exec/olap_utils.h index ddf8562fea1daa..d192ed1d49693c 100644 --- a/be/src/exec/olap_utils.h +++ b/be/src/exec/olap_utils.h @@ -67,60 +67,31 @@ enum SQLFilterOp { FILTER_LESS = 2, FILTER_LESS_OR_EQUAL = 3, FILTER_IN = 4, - FILTER_NOT_IN = 5 + FILTER_NOT_IN = 5, + FILTER_EQ = 6, + FILTER_NE = 7 }; template constexpr bool always_false_v = false; -inline SQLFilterOp to_olap_filter_type(TExprOpcode::type type, bool opposite) { - switch (type) { - case TExprOpcode::LT: - return opposite ? FILTER_LARGER : FILTER_LESS; - - case TExprOpcode::LE: - return opposite ? FILTER_LARGER_OR_EQUAL : FILTER_LESS_OR_EQUAL; - - case TExprOpcode::GT: - return opposite ? FILTER_LESS : FILTER_LARGER; - - case TExprOpcode::GE: - return opposite ? FILTER_LESS_OR_EQUAL : FILTER_LARGER_OR_EQUAL; - - case TExprOpcode::EQ: - return opposite ? FILTER_NOT_IN : FILTER_IN; - - case TExprOpcode::NE: - return opposite ? FILTER_IN : FILTER_NOT_IN; - - case TExprOpcode::EQ_FOR_NULL: - return FILTER_IN; - - default: - VLOG_CRITICAL << "TExprOpcode: " << type; - DCHECK(false); - } - - return FILTER_IN; -} - -inline SQLFilterOp to_olap_filter_type(const std::string& function_name, bool opposite) { +inline SQLFilterOp to_olap_filter_type(const std::string& function_name) { if (function_name == "lt") { - return opposite ? FILTER_LARGER : FILTER_LESS; + return FILTER_LESS; } else if (function_name == "gt") { - return opposite ? FILTER_LESS : FILTER_LARGER; + return FILTER_LARGER; } else if (function_name == "le") { - return opposite ? FILTER_LARGER_OR_EQUAL : FILTER_LESS_OR_EQUAL; + return FILTER_LESS_OR_EQUAL; } else if (function_name == "ge") { - return opposite ? FILTER_LESS_OR_EQUAL : FILTER_LARGER_OR_EQUAL; + return FILTER_LARGER_OR_EQUAL; } else if (function_name == "eq") { - return opposite ? FILTER_NOT_IN : FILTER_IN; + return FILTER_EQ; } else if (function_name == "ne") { - return opposite ? FILTER_IN : FILTER_NOT_IN; + return FILTER_NE; } else if (function_name == "in") { - return opposite ? FILTER_NOT_IN : FILTER_IN; + return FILTER_IN; } else if (function_name == "not_in") { - return opposite ? FILTER_IN : FILTER_NOT_IN; + return FILTER_NOT_IN; } else { DCHECK(false) << "Function Name: " << function_name; return FILTER_IN; diff --git a/be/src/exprs/bitmapfilter_predicate.h b/be/src/exprs/bitmapfilter_predicate.h index b695883205fbae..e8f149ce87f694 100644 --- a/be/src/exprs/bitmapfilter_predicate.h +++ b/be/src/exprs/bitmapfilter_predicate.h @@ -19,6 +19,7 @@ #include +#include "common/cast_set.h" #include "runtime/define_primitive_type.h" #include "runtime/primitive_type.h" #include "runtime_filter/runtime_filter_definitions.h" @@ -67,7 +68,8 @@ class BitmapFilterFunc : public BitmapFilterFuncBase { if (right < 0) { return false; } - return _bitmap_value->contains_any(std::max(left, (CppType)0), right); + return _bitmap_value->contains_any(cast_set(std::max(left, (CppType)0)), + cast_set(right)); } private: diff --git a/be/src/exprs/create_predicate_function.h b/be/src/exprs/create_predicate_function.h index 6463e501a0fab8..2c79566a013f4c 100644 --- a/be/src/exprs/create_predicate_function.h +++ b/be/src/exprs/create_predicate_function.h @@ -232,55 +232,58 @@ inline auto create_bitmap_filter(PrimitiveType type) { } template -ColumnPredicate* create_olap_column_predicate(uint32_t column_id, - const std::shared_ptr& filter, - const TabletColumn*, bool null_aware) { +std::shared_ptr create_olap_column_predicate( + uint32_t column_id, const std::shared_ptr& filter, const TabletColumn*, + bool null_aware) { std::shared_ptr filter_olap; filter_olap.reset(create_bloom_filter(PT, null_aware)); filter_olap->light_copy(filter.get()); // create a new filter to match the input filter and PT. For example, filter may be varchar, but PT is char - return new BloomFilterColumnPredicate(column_id, filter_olap); + return BloomFilterColumnPredicate::create_shared(column_id, filter_olap); } template -ColumnPredicate* create_olap_column_predicate(uint32_t column_id, - const std::shared_ptr& filter, - const TabletColumn*, bool) { +std::shared_ptr create_olap_column_predicate( + uint32_t column_id, const std::shared_ptr& filter, + const TabletColumn*, bool) { if constexpr (PT == TYPE_TINYINT || PT == TYPE_SMALLINT || PT == TYPE_INT || PT == TYPE_BIGINT) { - return new BitmapFilterColumnPredicate(column_id, filter); + return BitmapFilterColumnPredicate::create_shared(column_id, filter); } else { throw Exception(ErrorCode::INTERNAL_ERROR, "bitmap filter do not support type {}", PT); } } template -ColumnPredicate* create_olap_column_predicate(uint32_t column_id, - const std::shared_ptr& filter, - const TabletColumn* column, bool) { +std::shared_ptr create_olap_column_predicate( + uint32_t column_id, const std::shared_ptr& filter, + const TabletColumn* column, bool) { return create_in_list_predicate(column_id, filter, column->length()); } template -ColumnPredicate* create_olap_column_predicate(uint32_t column_id, - const std::shared_ptr& filter, - const TabletColumn* column, bool) { +std::shared_ptr create_olap_column_predicate( + uint32_t column_id, const std::shared_ptr& filter, + const TabletColumn* column, bool) { // currently only support like predicate if constexpr (PT == TYPE_CHAR) { - return new LikeColumnPredicate(filter->_opposite, column_id, filter->_fn_ctx, - filter->_string_param); + return LikeColumnPredicate::create_shared(filter->_opposite, column_id, + column->name(), filter->_fn_ctx, + filter->_string_param); } else if constexpr (PT == TYPE_VARCHAR || PT == TYPE_STRING) { - return new LikeColumnPredicate(filter->_opposite, column_id, filter->_fn_ctx, - filter->_string_param); + return LikeColumnPredicate::create_shared(filter->_opposite, column_id, + column->name(), filter->_fn_ctx, + filter->_string_param); } throw Exception(ErrorCode::INTERNAL_ERROR, "function filter do not support type {}", PT); } template -ColumnPredicate* create_column_predicate(uint32_t column_id, const std::shared_ptr& filter, - FieldType type, const TabletColumn* column, - bool null_aware = false) { +std::shared_ptr create_column_predicate(uint32_t column_id, + const std::shared_ptr& filter, + FieldType type, const TabletColumn* column, + bool null_aware = false) { switch (type) { #define M(NAME) \ case FieldType::OLAP_FIELD_##NAME: { \ diff --git a/be/src/olap/accept_null_predicate.h b/be/src/olap/accept_null_predicate.h index 85135f9440aca2..b223cd3a401aef 100644 --- a/be/src/olap/accept_null_predicate.h +++ b/be/src/olap/accept_null_predicate.h @@ -40,8 +40,28 @@ class AcceptNullPredicate : public ColumnPredicate { ENABLE_FACTORY_CREATOR(AcceptNullPredicate); public: - AcceptNullPredicate(ColumnPredicate* nested) - : ColumnPredicate(nested->column_id(), nested->opposite()), _nested {nested} {} + AcceptNullPredicate(const std::shared_ptr& nested) + : ColumnPredicate(nested->column_id(), nested->col_name(), nested->primitive_type(), + nested->opposite()), + _nested {nested} {} + AcceptNullPredicate(const AcceptNullPredicate& other, uint32_t col_id) + : ColumnPredicate(other, col_id), + _nested(assert_cast(other)._nested + ? assert_cast(other)._nested->clone( + col_id) + : nullptr) {} + AcceptNullPredicate(const AcceptNullPredicate& other) = delete; + ~AcceptNullPredicate() override = default; + std::shared_ptr clone(uint32_t col_id) const override { + return AcceptNullPredicate::create_shared(*this, col_id); + } + std::string debug_string() const override { + auto n = _nested; + fmt::memory_buffer debug_string_buffer; + fmt::format_to(debug_string_buffer, "AcceptNullPredicate({}, nested={})", + ColumnPredicate::debug_string(), n ? n->debug_string() : "null"); + return fmt::to_string(debug_string_buffer); + } PredicateType type() const override { return _nested->type(); } @@ -173,11 +193,7 @@ class AcceptNullPredicate : public ColumnPredicate { return _nested->evaluate(column, sel, size); } - std::string _debug_string() const override { - return "passnull predicate for " + _nested->debug_string(); - } - - std::unique_ptr _nested; + std::shared_ptr _nested; }; } //namespace doris diff --git a/be/src/olap/bitmap_filter_predicate.h b/be/src/olap/bitmap_filter_predicate.h index 506e8b8c6f3563..9afaac4608220e 100644 --- a/be/src/olap/bitmap_filter_predicate.h +++ b/be/src/olap/bitmap_filter_predicate.h @@ -27,17 +27,32 @@ namespace doris { template -class BitmapFilterColumnPredicate : public ColumnPredicate { +class BitmapFilterColumnPredicate final : public ColumnPredicate { public: + ENABLE_FACTORY_CREATOR(BitmapFilterColumnPredicate); using CppType = typename PrimitiveTypeTraits::CppType; using SpecificFilter = BitmapFilterFunc; - BitmapFilterColumnPredicate(uint32_t column_id, + BitmapFilterColumnPredicate(uint32_t column_id, std::string col_name, const std::shared_ptr& filter) - : ColumnPredicate(column_id), + : ColumnPredicate(column_id, col_name, T), _filter(filter), _specific_filter(assert_cast(_filter.get())) {} ~BitmapFilterColumnPredicate() override = default; + BitmapFilterColumnPredicate(const BitmapFilterColumnPredicate& other, uint32_t col_id) + : ColumnPredicate(other, col_id), + _filter(other._filter), + _specific_filter(assert_cast(_filter.get())) {} + BitmapFilterColumnPredicate(const BitmapFilterColumnPredicate& other) = delete; + std::shared_ptr clone(uint32_t col_id) const override { + return BitmapFilterColumnPredicate::create_shared(*this, col_id); + } + std::string debug_string() const override { + fmt::memory_buffer debug_string_buffer; + fmt::format_to(debug_string_buffer, "BitmapFilterColumnPredicate({})", + ColumnPredicate::debug_string()); + return fmt::to_string(debug_string_buffer); + } PredicateType type() const override { return PredicateType::BITMAP_FILTER; } @@ -85,10 +100,6 @@ class BitmapFilterColumnPredicate : public ColumnPredicate { return new_size; } - std::string _debug_string() const override { - return "BitmapFilterColumnPredicate(" + type_to_string(T) + ")"; - } - std::shared_ptr _filter; SpecificFilter* _specific_filter; // owned by _filter diff --git a/be/src/olap/block_column_predicate.h b/be/src/olap/block_column_predicate.h index 3074ec2fb6f76b..ee73daeb4504e0 100644 --- a/be/src/olap/block_column_predicate.h +++ b/be/src/olap/block_column_predicate.h @@ -33,7 +33,7 @@ #include "olap/column_predicate.h" #include "olap/olap_common.h" #include "vec/columns/column.h" -#include "vec/exec/format/parquet/parquet_pred_cmp.h" +#include "vec/exec/format/parquet/parquet_predicate.h" namespace roaring { class Roaring; @@ -60,7 +60,7 @@ class BlockColumnPredicate { virtual void get_all_column_ids(std::set& column_id_set) const = 0; virtual void get_all_column_predicate( - std::set& predicate_set) const = 0; + std::set>& predicate_set) const = 0; virtual uint16_t evaluate(vectorized::MutableColumns& block, uint16_t* sel, uint16_t selected_size) const { @@ -118,13 +118,15 @@ class SingleColumnBlockPredicate : public BlockColumnPredicate { ENABLE_FACTORY_CREATOR(SingleColumnBlockPredicate); public: - explicit SingleColumnBlockPredicate(const ColumnPredicate* pre) : _predicate(pre) {} + explicit SingleColumnBlockPredicate(const std::shared_ptr& pre) + : _predicate(pre) {} void get_all_column_ids(std::set& column_id_set) const override { column_id_set.insert(_predicate->column_id()); } - void get_all_column_predicate(std::set& predicate_set) const override { + void get_all_column_predicate( + std::set>& predicate_set) const override { predicate_set.insert(_predicate); } @@ -154,7 +156,7 @@ class SingleColumnBlockPredicate : public BlockColumnPredicate { } private: - const ColumnPredicate* _predicate = nullptr; + const std::shared_ptr _predicate = nullptr; }; class MutilColumnBlockPredicate : public BlockColumnPredicate { @@ -185,7 +187,8 @@ class MutilColumnBlockPredicate : public BlockColumnPredicate { } } - void get_all_column_predicate(std::set& predicate_set) const override { + void get_all_column_predicate( + std::set>& predicate_set) const override { for (auto& child_block_predicate : _block_column_predicate_vec) { child_block_predicate->get_all_column_predicate(predicate_set); } diff --git a/be/src/olap/bloom_filter_predicate.h b/be/src/olap/bloom_filter_predicate.h index 972ff3845dd82f..e25afc878aa066 100644 --- a/be/src/olap/bloom_filter_predicate.h +++ b/be/src/olap/bloom_filter_predicate.h @@ -30,16 +30,31 @@ namespace doris { template -class BloomFilterColumnPredicate : public ColumnPredicate { +class BloomFilterColumnPredicate final : public ColumnPredicate { public: + ENABLE_FACTORY_CREATOR(BloomFilterColumnPredicate); using SpecificFilter = BloomFilterFunc; - BloomFilterColumnPredicate(uint32_t column_id, + BloomFilterColumnPredicate(uint32_t column_id, std::string col_name, const std::shared_ptr& filter) - : ColumnPredicate(column_id), + : ColumnPredicate(column_id, col_name, T), _filter(filter), _specific_filter(assert_cast(_filter.get())) {} ~BloomFilterColumnPredicate() override = default; + BloomFilterColumnPredicate(const BloomFilterColumnPredicate& other, uint32_t col_id) + : ColumnPredicate(other, col_id), + _filter(other._filter), + _specific_filter(assert_cast(_filter.get())) {} + BloomFilterColumnPredicate(const BloomFilterColumnPredicate& other) = delete; + std::shared_ptr clone(uint32_t col_id) const override { + return BloomFilterColumnPredicate::create_shared(*this, col_id); + } + std::string debug_string() const override { + fmt::memory_buffer debug_string_buffer; + fmt::format_to(debug_string_buffer, "BloomFilterColumnPredicate({})", + ColumnPredicate::debug_string()); + return fmt::to_string(debug_string_buffer); + } PredicateType type() const override { return PredicateType::BF; } @@ -76,8 +91,6 @@ class BloomFilterColumnPredicate : public ColumnPredicate { return new_size; } - std::string _debug_string() const override { return "BloomFilter(" + type_to_string(T) + ")"; } - std::shared_ptr _filter; SpecificFilter* _specific_filter; // owned by _filter }; diff --git a/be/src/olap/column_predicate.h b/be/src/olap/column_predicate.h index 863c064ff86fef..692729a8987d23 100644 --- a/be/src/olap/column_predicate.h +++ b/be/src/olap/column_predicate.h @@ -27,7 +27,7 @@ #include "util/defer_op.h" #include "util/runtime_profile.h" #include "vec/columns/column.h" -#include "vec/exec/format/parquet/parquet_pred_cmp.h" +#include "vec/exec/format/parquet/parquet_predicate.h" #include "vec/exprs/vruntimefilter_wrapper.h" using namespace doris::segment_v2; @@ -120,6 +120,43 @@ inline std::string type_to_string(PredicateType type) { return ""; } +inline std::string type_to_op_str(PredicateType type) { + switch (type) { + case PredicateType::EQ: + return "="; + + case PredicateType::NE: + return "!="; + + case PredicateType::LT: + return "<<"; + + case PredicateType::LE: + return "<="; + + case PredicateType::GT: + return ">>"; + + case PredicateType::GE: + return ">="; + + case PredicateType::IN_LIST: + return "*="; + + case PredicateType::NOT_IN_LIST: + return "!*="; + + case PredicateType::IS_NULL: + case PredicateType::IS_NOT_NULL: + return "is"; + + default: + break; + }; + + return ""; +} + struct PredicateTypeTraits { static constexpr bool is_range(PredicateType type) { return (type == PredicateType::LT || type == PredicateType::LE || @@ -158,16 +195,25 @@ struct PredicateTypeTraits { } \ } -class ColumnPredicate { +class ColumnPredicate : public std::enable_shared_from_this { public: - explicit ColumnPredicate(uint32_t column_id, bool opposite = false) - : _column_id(column_id), _opposite(opposite) { + explicit ColumnPredicate(uint32_t column_id, std::string col_name, PrimitiveType primitive_type, + bool opposite = false) + : _column_id(column_id), + _col_name(col_name), + _primitive_type(primitive_type), + _opposite(opposite) { reset_judge_selectivity(); } + ColumnPredicate(const ColumnPredicate& other, uint32_t col_id) : ColumnPredicate(other) { + _column_id = col_id; + } virtual ~ColumnPredicate() = default; virtual PredicateType type() const = 0; + virtual PrimitiveType primitive_type() const { return _primitive_type; } + virtual std::shared_ptr clone(uint32_t col_id) const = 0; //evaluate predicate on inverted virtual Status evaluate(const vectorized::IndexFieldNameAndTypePair& name_with_type, @@ -178,6 +224,16 @@ class ColumnPredicate { } virtual double get_ignore_threshold() const { return 0; } + // Return the size of value set for IN/NOT IN predicates and 0 for others. + virtual std::string debug_string() const { + fmt::memory_buffer debug_string_buffer; + fmt::format_to(debug_string_buffer, + "Column ID: {}, Data Type: {}, PredicateType: {}, opposite: {}, Runtime " + "Filter ID: {}", + _column_id, type_to_string(primitive_type()), pred_type_string(type()), + _opposite, _runtime_filter_id); + return fmt::to_string(debug_string_buffer); + } // evaluate predicate on IColumn // a short circuit eval way @@ -263,17 +319,10 @@ class ColumnPredicate { DCHECK(false) << "should not reach here"; } uint32_t column_id() const { return _column_id; } + std::string col_name() const { return _col_name; } bool opposite() const { return _opposite; } - std::string debug_string() const { - return _debug_string() + - fmt::format(", column_id={}, opposite={}, can_ignore={}, runtime_filter_id={}", - _column_id, _opposite, _can_ignore(), _runtime_filter_id); - } - - int get_runtime_filter_id() const { return _runtime_filter_id; } - void attach_profile_counter( int filter_id, std::shared_ptr predicate_filtered_rows_counter, std::shared_ptr predicate_input_rows_counter, @@ -347,7 +396,6 @@ class ColumnPredicate { virtual bool is_runtime_filter() const { return _can_ignore(); } protected: - virtual std::string _debug_string() const = 0; virtual bool _can_ignore() const { return _runtime_filter_id != -1; } virtual uint16_t _evaluate_inner(const vectorized::IColumn& column, uint16_t* sel, uint16_t size) const { @@ -377,6 +425,8 @@ class ColumnPredicate { } uint32_t _column_id; + const std::string _col_name; + PrimitiveType _primitive_type; // TODO: the value is only in delete condition, better be template value bool _opposite; int _runtime_filter_id = -1; @@ -399,6 +449,9 @@ class ColumnPredicate { std::make_shared(TUnit::UNIT, 0); std::shared_ptr _predicate_always_true_rows_counter = std::make_shared(TUnit::UNIT, 0); + +private: + ColumnPredicate(const ColumnPredicate& other) = default; }; } //namespace doris diff --git a/be/src/olap/comparison_predicate.h b/be/src/olap/comparison_predicate.h index 12db94a5f716dc..2bd7e81ecaaeb3 100644 --- a/be/src/olap/comparison_predicate.h +++ b/be/src/olap/comparison_predicate.h @@ -31,11 +31,26 @@ namespace doris { #include "common/compile_check_begin.h" template -class ComparisonPredicateBase : public ColumnPredicate { +class ComparisonPredicateBase final : public ColumnPredicate { public: + ENABLE_FACTORY_CREATOR(ComparisonPredicateBase); using T = typename PrimitiveTypeTraits::CppType; - ComparisonPredicateBase(uint32_t column_id, const T& value, bool opposite = false) - : ColumnPredicate(column_id, opposite), _value(value) {} + ComparisonPredicateBase(uint32_t column_id, std::string col_name, const T& value, + bool opposite = false) + : ColumnPredicate(column_id, col_name, Type, opposite), _value(value) {} + ComparisonPredicateBase(const ComparisonPredicateBase& other, uint32_t col_id) + : ColumnPredicate(other, col_id), _value(other._value) {} + ComparisonPredicateBase(const ComparisonPredicateBase& other) = delete; + std::shared_ptr clone(uint32_t col_id) const override { + DCHECK(_segment_id_to_cached_code.empty()); + return ComparisonPredicateBase::create_shared(*this, col_id); + } + std::string debug_string() const override { + fmt::memory_buffer debug_string_buffer; + fmt::format_to(debug_string_buffer, "ComparisonPredicateBase({})", + ColumnPredicate::debug_string()); + return fmt::to_string(debug_string_buffer); + } PredicateType type() const override { return PT; } @@ -695,12 +710,6 @@ class ComparisonPredicateBase : public ColumnPredicate { return code; } - std::string _debug_string() const override { - std::string info = - "ComparisonPredicateBase(" + type_to_string(Type) + ", " + type_to_string(PT) + ")"; - return info; - } - mutable phmap::parallel_flat_hash_map< std::pair, int32_t, phmap::priv::hash_default_hash>, diff --git a/be/src/olap/delete_handler.cpp b/be/src/olap/delete_handler.cpp index 11f89eff0c0475..2b40351296ddca 100644 --- a/be/src/olap/delete_handler.cpp +++ b/be/src/olap/delete_handler.cpp @@ -28,12 +28,20 @@ #include "common/logging.h" #include "common/status.h" #include "olap/block_column_predicate.h" -#include "olap/column_predicate.h" #include "olap/olap_common.h" #include "olap/predicate_creator.h" #include "olap/tablet_schema.h" #include "olap/utils.h" #include "util/debug_points.h" +#include "vec/functions/cast/cast_parameters.h" +#include "vec/functions/cast/cast_to_boolean.h" +#include "vec/functions/cast/cast_to_date_or_datetime_impl.hpp" +#include "vec/functions/cast/cast_to_datetimev2_impl.hpp" +#include "vec/functions/cast/cast_to_datev2_impl.hpp" +#include "vec/functions/cast/cast_to_decimal.h" +#include "vec/functions/cast/cast_to_float.h" +#include "vec/functions/cast/cast_to_int.h" +#include "vec/functions/cast/cast_to_ip.h" using apache::thrift::ThriftDebugString; using std::vector; @@ -43,6 +51,340 @@ using ::google::protobuf::RepeatedPtrField; namespace doris { +template +Status convert(const vectorized::DataTypePtr& data_type, const std::string& str, + vectorized::Arena& arena, typename PrimitiveTypeTraits::CppType& res) { + if constexpr (PType == TYPE_TINYINT || PType == TYPE_SMALLINT || PType == TYPE_INT || + PType == TYPE_BIGINT || PType == TYPE_LARGEINT) { + vectorized::CastParameters parameters; + if (!vectorized::CastToInt::from_string({str.data(), str.size()}, res, parameters)) { + return Status::Error( + "invalid {} string. str={}", type_to_string(data_type->get_primitive_type()), + str); + } + return Status::OK(); + } + if constexpr (PType == TYPE_FLOAT || PType == TYPE_DOUBLE) { + vectorized::CastParameters parameters; + if (!vectorized::CastToFloat::from_string({str.data(), str.size()}, res, parameters)) { + return Status::Error( + "invalid {} string. str={}", type_to_string(data_type->get_primitive_type()), + str); + } + return Status::OK(); + } + if constexpr (PType == TYPE_DATE) { + vectorized::CastParameters parameters; + if (!vectorized::CastToDateOrDatetime::from_string({str.data(), str.size()}, res, + nullptr, parameters)) { + return Status::Error( + "invalid {} string. str={}", type_to_string(data_type->get_primitive_type()), + str); + } + return Status::OK(); + } + if constexpr (PType == TYPE_DATETIME) { + vectorized::CastParameters parameters; + if (!vectorized::CastToDateOrDatetime::from_string({str.data(), str.size()}, res, + nullptr, parameters)) { + return Status::Error( + "invalid {} string. str={}", type_to_string(data_type->get_primitive_type()), + str); + } + return Status::OK(); + } + if constexpr (PType == TYPE_DATEV2) { + vectorized::CastParameters parameters; + if (!vectorized::CastToDateV2::from_string({str.data(), str.size()}, res, nullptr, + parameters)) { + return Status::Error( + "invalid {} string. str={}", type_to_string(data_type->get_primitive_type()), + str); + } + return Status::OK(); + } + if constexpr (PType == TYPE_DATETIMEV2) { + vectorized::CastParameters parameters; + if (!vectorized::CastToDatetimeV2::from_string({str.data(), str.size()}, res, nullptr, + data_type->get_scale(), parameters)) { + return Status::Error( + "invalid {} string. str={}", type_to_string(data_type->get_primitive_type()), + str); + } + return Status::OK(); + } + if constexpr (PType == TYPE_TIMESTAMPTZ) { + vectorized::CastParameters parameters; + if (!vectorized::CastToTimstampTz::from_string({str.data(), str.size()}, res, parameters, + nullptr, data_type->get_scale())) { + return Status::Error( + "invalid {} string. str={}", type_to_string(data_type->get_primitive_type()), + str); + } + return Status::OK(); + } + if constexpr (PType == TYPE_CHAR) { + size_t target = assert_cast( + vectorized::remove_nullable(data_type).get()) + ->len(); + res = {str.data(), str.size()}; + if (target > str.size()) { + char* buffer = arena.alloc(target); + memset(buffer, 0, target); + memcpy(buffer, str.data(), str.size()); + res = {buffer, target}; + } + return Status::OK(); + } + if constexpr (PType == TYPE_STRING || PType == TYPE_VARCHAR) { + char* buffer = arena.alloc(str.size()); + memcpy(buffer, str.data(), str.size()); + res = {buffer, str.size()}; + return Status::OK(); + } + if constexpr (PType == TYPE_BOOLEAN) { + vectorized::CastParameters parameters; + vectorized::UInt8 tmp; + if (!vectorized::CastToBool::from_string({str.data(), str.size()}, tmp, parameters)) { + return Status::Error( + "invalid {} string. str={}", type_to_string(data_type->get_primitive_type()), + str); + } + res = tmp != 0; + return Status::OK(); + } + if constexpr (PType == TYPE_IPV4) { + vectorized::CastParameters parameters; + if (!vectorized::CastToIPv4::from_string({str.data(), str.size()}, res, parameters)) { + return Status::Error( + "invalid {} string. str={}", type_to_string(data_type->get_primitive_type()), + str); + } + return Status::OK(); + } + if constexpr (PType == TYPE_IPV6) { + vectorized::CastParameters parameters; + if (!vectorized::CastToIPv6::from_string({str.data(), str.size()}, res, parameters)) { + return Status::Error( + "invalid {} string. str={}", type_to_string(data_type->get_primitive_type()), + str); + } + return Status::OK(); + } + if constexpr (PType == TYPE_DECIMALV2) { + vectorized::CastParameters parameters; + vectorized::Decimal128V2 tmp; + if (!vectorized::CastToDecimal::from_string({str.data(), str.size()}, tmp, + data_type->get_precision(), + data_type->get_scale(), parameters)) { + return Status::Error( + "invalid {} string. str={}", type_to_string(data_type->get_primitive_type()), + str); + } + res = DecimalV2Value(tmp.value); + return Status::OK(); + } else if constexpr (is_decimal(PType)) { + vectorized::CastParameters parameters; + if (!vectorized::CastToDecimal::from_string({str.data(), str.size()}, res, + data_type->get_precision(), + data_type->get_scale(), parameters)) { + return Status::Error( + "invalid {} string. str={}", type_to_string(data_type->get_primitive_type()), + str); + } + return Status::OK(); + } + return Status::Error( + "unsupported data type in delete handler. type={}", + type_to_string(data_type->get_primitive_type())); +} + +#define CONVERT_CASE(PType) \ + case PType: { \ + set = build_set(); \ + for (const auto& s : str) { \ + typename PrimitiveTypeTraits::CppType tmp; \ + RETURN_IF_ERROR(convert(data_type, s, arena, tmp)); \ + set->insert(reinterpret_cast(&tmp)); \ + } \ + return Status::OK(); \ + } +Status convert(const vectorized::DataTypePtr& data_type, const std::list& str, + vectorized::Arena& arena, std::shared_ptr& set) { + switch (data_type->get_primitive_type()) { + CONVERT_CASE(TYPE_TINYINT); + CONVERT_CASE(TYPE_SMALLINT); + CONVERT_CASE(TYPE_INT); + CONVERT_CASE(TYPE_BIGINT); + CONVERT_CASE(TYPE_LARGEINT); + CONVERT_CASE(TYPE_FLOAT); + CONVERT_CASE(TYPE_DOUBLE); + CONVERT_CASE(TYPE_DATE); + CONVERT_CASE(TYPE_DATETIME); + CONVERT_CASE(TYPE_DATEV2); + CONVERT_CASE(TYPE_DATETIMEV2); + CONVERT_CASE(TYPE_TIMESTAMPTZ); + CONVERT_CASE(TYPE_BOOLEAN); + CONVERT_CASE(TYPE_IPV4); + CONVERT_CASE(TYPE_IPV6); + CONVERT_CASE(TYPE_DECIMALV2); + CONVERT_CASE(TYPE_DECIMAL32); + CONVERT_CASE(TYPE_DECIMAL64); + CONVERT_CASE(TYPE_DECIMAL128I); + CONVERT_CASE(TYPE_DECIMAL256); + CONVERT_CASE(TYPE_CHAR); + CONVERT_CASE(TYPE_VARCHAR); + CONVERT_CASE(TYPE_STRING); + default: + return Status::Error( + "unsupported data type in delete handler. type={}", + type_to_string(data_type->get_primitive_type())); + } + return Status::OK(); +} +#undef CONVERT_CASE + +#define CONVERT_CASE(PType) \ + case PType: { \ + typename PrimitiveTypeTraits::CppType tmp; \ + RETURN_IF_ERROR(convert(type, res.value_str.front(), arena, tmp)); \ + v.data = reinterpret_cast(&tmp); \ + v.size = sizeof(tmp); \ + switch (res.condition_op) { \ + case PredicateType::EQ: \ + predicate = create_comparison_predicate0(index, col_name, type, v, \ + true, arena); \ + return Status::OK(); \ + case PredicateType::NE: \ + predicate = create_comparison_predicate0(index, col_name, type, v, \ + true, arena); \ + return Status::OK(); \ + case PredicateType::GT: \ + predicate = create_comparison_predicate0(index, col_name, type, v, \ + true, arena); \ + return Status::OK(); \ + case PredicateType::GE: \ + predicate = create_comparison_predicate0(index, col_name, type, v, \ + true, arena); \ + return Status::OK(); \ + case PredicateType::LT: \ + predicate = create_comparison_predicate0(index, col_name, type, v, \ + true, arena); \ + return Status::OK(); \ + case PredicateType::LE: \ + predicate = create_comparison_predicate0(index, col_name, type, v, \ + true, arena); \ + return Status::OK(); \ + default: \ + return Status::Error( \ + "invalid condition operator. operator={}", type_to_op_str(res.condition_op)); \ + } \ + } +Status parse_to_predicate(const uint32_t index, const std::string col_name, + const vectorized::DataTypePtr& type, + DeleteHandler::ConditionParseResult& res, vectorized::Arena& arena, + std::shared_ptr& predicate) { + DCHECK_EQ(res.value_str.size(), 1); + if (res.condition_op == PredicateType::IS_NULL || + res.condition_op == PredicateType::IS_NOT_NULL) { + predicate = NullPredicate::create_shared(index, col_name, + res.condition_op == PredicateType::IS_NOT_NULL, + type->get_primitive_type()); + return Status::OK(); + } + StringRef v; + switch (type->get_primitive_type()) { + CONVERT_CASE(TYPE_TINYINT); + CONVERT_CASE(TYPE_SMALLINT); + CONVERT_CASE(TYPE_INT); + CONVERT_CASE(TYPE_BIGINT); + CONVERT_CASE(TYPE_LARGEINT); + CONVERT_CASE(TYPE_FLOAT); + CONVERT_CASE(TYPE_DOUBLE); + CONVERT_CASE(TYPE_DATE); + CONVERT_CASE(TYPE_DATETIME); + CONVERT_CASE(TYPE_DATEV2); + CONVERT_CASE(TYPE_DATETIMEV2); + CONVERT_CASE(TYPE_TIMESTAMPTZ); + CONVERT_CASE(TYPE_BOOLEAN); + CONVERT_CASE(TYPE_IPV4); + CONVERT_CASE(TYPE_IPV6); + CONVERT_CASE(TYPE_DECIMALV2); + CONVERT_CASE(TYPE_DECIMAL32); + CONVERT_CASE(TYPE_DECIMAL64); + CONVERT_CASE(TYPE_DECIMAL128I); + CONVERT_CASE(TYPE_DECIMAL256); + case TYPE_CHAR: + case TYPE_VARCHAR: + case TYPE_STRING: { + RETURN_IF_ERROR(convert(type, res.value_str.front(), arena, v)); + switch (res.condition_op) { + case PredicateType::EQ: + predicate = create_comparison_predicate0(index, col_name, type, v, + true, arena); + return Status::OK(); + case PredicateType::NE: + predicate = create_comparison_predicate0(index, col_name, type, v, + true, arena); + return Status::OK(); + case PredicateType::GT: + predicate = create_comparison_predicate0(index, col_name, type, v, + true, arena); + return Status::OK(); + case PredicateType::GE: + predicate = create_comparison_predicate0(index, col_name, type, v, + true, arena); + return Status::OK(); + case PredicateType::LT: + predicate = create_comparison_predicate0(index, col_name, type, v, + true, arena); + return Status::OK(); + case PredicateType::LE: + predicate = create_comparison_predicate0(index, col_name, type, v, + true, arena); + return Status::OK(); + default: + return Status::Error( + "invalid condition operator. operator={}", type_to_op_str(res.condition_op)); + } + break; + } + default: + return Status::Error( + "unsupported data type in delete handler. type={}", + type_to_string(type->get_primitive_type())); + } + return Status::OK(); +#undef CONVERT_CASE +} + +Status parse_to_in_predicate(const uint32_t index, const std::string& col_name, + const vectorized::DataTypePtr& type, + DeleteHandler::ConditionParseResult& res, vectorized::Arena& arena, + std::shared_ptr& predicate) { + DCHECK_GT(res.value_str.size(), 1); + switch (res.condition_op) { + case PredicateType::IN_LIST: { + std::shared_ptr set; + RETURN_IF_ERROR(convert(type, res.value_str, arena, set)); + predicate = + create_in_list_predicate(index, col_name, type, set, true); + break; + } + case PredicateType::NOT_IN_LIST: { + std::shared_ptr set; + RETURN_IF_ERROR(convert(type, res.value_str, arena, set)); + predicate = create_in_list_predicate(index, col_name, type, set, + true); + break; + } + default: + return Status::Error("invalid condition operator. operator={}", + type_to_op_str(res.condition_op)); + } + return Status::OK(); +} + // construct sub condition from TCondition std::string construct_sub_predicate(const TCondition& condition) { string op = condition.condition_op; @@ -126,12 +468,14 @@ Status DeleteHandler::generate_delete_predicate(const TabletSchema& schema, if (condition.__isset.column_unique_id) { // only light schema change capable table set this field sub_predicate->set_column_unique_id(condition.column_unique_id); - } else if (TCondition tmp; !DeleteHandler::parse_condition(condition_str, &tmp)) { - // for non light shema change tables, check regex match for condition str - LOG(WARNING) << "failed to parse condition_str, condtion=" - << ThriftDebugString(condition); - return Status::Error( - "failed to parse condition_str, condtion={}", ThriftDebugString(condition)); + } else { + try { + [[maybe_unused]] auto parsed_cond = parse_condition(condition_str); + } catch (const Exception& e) { + return Status::Error( + "failed to parse condition_str, condition={}, error={}", + ThriftDebugString(condition), e.to_string()); + } } sub_predicate->set_column_name(condition.column_name); @@ -152,13 +496,12 @@ Status DeleteHandler::convert_to_sub_pred_v2(DeletePredicatePB* delete_pred, if (!delete_pred->sub_predicates().empty() && delete_pred->sub_predicates_v2().empty()) { for (const auto& condition_str : delete_pred->sub_predicates()) { auto* sub_pred = delete_pred->add_sub_predicates_v2(); - TCondition condition; - static_cast(parse_condition(condition_str, &condition)); + auto condition = parse_condition(condition_str); const auto& column = *DORIS_TRY(schema->column(condition.column_name)); sub_pred->set_column_unique_id(column.unique_id()); sub_pred->set_column_name(condition.column_name); - sub_pred->set_op(condition.condition_op); - sub_pred->set_cond_value(condition.condition_values[0]); + sub_pred->set_op(type_to_op_str(condition.condition_op)); + sub_pred->set_cond_value(condition.value_str.front()); } } @@ -287,19 +630,49 @@ Status DeleteHandler::check_condition_valid(const TabletSchema& schema, const TC return Status::OK(); } -Status DeleteHandler::parse_condition(const DeleteSubPredicatePB& sub_cond, TCondition* condition) { +PredicateType DeleteHandler::parse_condition_op(const std::string& op_str, + const std::list& cond_values) { + if (trim(to_lower(op_str)) == "=") { + return PredicateType::EQ; + } else if (trim(to_lower(op_str)) == "!=") { + return PredicateType::NE; + } else if (trim(to_lower(op_str)) == ">>") { + return PredicateType::GT; + } else if (trim(to_lower(op_str)) == "<<") { + return PredicateType::LT; + } else if (trim(to_lower(op_str)) == ">=") { + return PredicateType::GE; + } else if (trim(to_lower(op_str)) == "<=") { + return PredicateType::LE; + } else if (trim(to_lower(op_str)) == "*=") { + return cond_values.size() > 1 ? PredicateType::IN_LIST : PredicateType::EQ; + } else if (trim(to_lower(op_str)) == "!*=") { + return cond_values.size() > 1 ? PredicateType::NOT_IN_LIST : PredicateType::NE; + } else if (trim(to_lower(op_str)) == "is") { + return to_lower(cond_values.front()) == "null" ? PredicateType::IS_NULL + : PredicateType::IS_NOT_NULL; + } else { + throw Exception(Status::Error( + "invalid condition operator. operator={}", op_str)); + } + return PredicateType::UNKNOWN; +} + +DeleteHandler::ConditionParseResult DeleteHandler::parse_condition( + const DeleteSubPredicatePB& sub_cond) { + ConditionParseResult res; if (!sub_cond.has_column_name() || !sub_cond.has_op() || !sub_cond.has_cond_value()) { - return Status::Error( + throw Exception(Status::Error( "fail to parse condition. condition={} {} {}", sub_cond.column_name(), - sub_cond.op(), sub_cond.cond_value()); + sub_cond.op(), sub_cond.cond_value())); } if (sub_cond.has_column_unique_id()) { - condition->column_unique_id = sub_cond.column_unique_id(); + res.col_unique_id = sub_cond.column_unique_id(); } - condition->column_name = sub_cond.column_name(); - condition->condition_op = sub_cond.op(); - condition->condition_values.push_back(sub_cond.cond_value()); - return Status::OK(); + res.column_name = sub_cond.column_name(); + res.value_str.push_back(sub_cond.cond_value()); + res.condition_op = parse_condition_op(sub_cond.op(), res.value_str); + return res; } // clang-format off @@ -322,28 +695,31 @@ const char* const CONDITION_STR_PATTERN = // clang-format on RE2 DELETE_HANDLER_REGEX(CONDITION_STR_PATTERN); -Status DeleteHandler::parse_condition(const std::string& condition_str, TCondition* condition) { +DeleteHandler::ConditionParseResult DeleteHandler::parse_condition( + const std::string& condition_str) { + ConditionParseResult res; std::string col_name, op, value, g4; bool matched = RE2::FullMatch(condition_str, DELETE_HANDLER_REGEX, &col_name, &op, &value, &g4); // exact match if (!matched) { - return Status::InvalidArgument("fail to sub condition. condition={}", condition_str); + throw Exception( + Status::InvalidArgument("fail to sub condition. condition={}", condition_str)); } - condition->column_name = col_name; - condition->condition_op = op == " IS " ? "IS" : op; + res.column_name = col_name; + // match string with single quotes, a = b or a = 'b' if (!g4.empty()) { - condition->condition_values.push_back(g4); + res.value_str.push_back(g4); } else { - condition->condition_values.push_back(value); + res.value_str.push_back(value); } - VLOG_NOTICE << "parsed condition_str: col_name={" << condition->column_name << "} op={" - << condition->condition_op << "} val={" << condition->condition_values.back() - << "}"; - return Status::OK(); + res.condition_op = DeleteHandler::parse_condition_op(op, res.value_str); + VLOG_NOTICE << "parsed condition_str: col_name={" << col_name << "} op={" << op << "} val={" + << res.value_str.back() << "}"; + return res; } template @@ -354,8 +730,7 @@ Status DeleteHandler::_parse_column_pred(TabletSchemaSPtr complete_schema, const RepeatedPtrField& sub_pred_list, DeleteConditions* delete_conditions) { for (const auto& sub_predicate : sub_pred_list) { - TCondition condition; - RETURN_IF_ERROR(parse_condition(sub_predicate, &condition)); + auto condition = parse_condition(sub_predicate); int32_t col_unique_id = -1; if constexpr (std::is_same_v) { if (sub_predicate.has_column_unique_id()) [[likely]] { @@ -367,11 +742,12 @@ Status DeleteHandler::_parse_column_pred(TabletSchemaSPtr complete_schema, *DORIS_TRY(delete_pred_related_schema->column(condition.column_name)); col_unique_id = column.unique_id(); } - condition.__set_column_unique_id(col_unique_id); + condition.col_unique_id = col_unique_id; const auto& column = complete_schema->column_by_uid(col_unique_id); uint32_t index = complete_schema->field_index(col_unique_id); - auto* predicate = - parse_to_predicate(column.get_vec_type(), index, condition, _predicate_arena, true); + std::shared_ptr predicate; + RETURN_IF_ERROR(parse_to_predicate(index, column.name(), column.get_vec_type(), condition, + _predicate_arena, predicate)); if (predicate != nullptr) { delete_conditions->column_predicate_vec.push_back(predicate); } @@ -379,16 +755,6 @@ Status DeleteHandler::_parse_column_pred(TabletSchemaSPtr complete_schema, return Status::OK(); } -template Status DeleteHandler::_parse_column_pred( - TabletSchemaSPtr complete_schema, TabletSchemaSPtr delete_pred_related_schema, - const ::google::protobuf::RepeatedPtrField& sub_pred_list, - DeleteConditions* delete_conditions); - -template Status DeleteHandler::_parse_column_pred( - TabletSchemaSPtr complete_schema, TabletSchemaSPtr delete_pred_related_schema, - const ::google::protobuf::RepeatedPtrField& sub_pred_list, - DeleteConditions* delete_conditions); - Status DeleteHandler::init(TabletSchemaSPtr tablet_schema, const std::vector& delete_preds, int64_t version) { DCHECK(!_is_inited) << "reinitialize delete handler."; @@ -413,8 +779,8 @@ Status DeleteHandler::init(TabletSchemaSPtr tablet_schema, delete_condition.sub_predicates(), &temp)); } for (const auto& in_predicate : delete_condition.in_predicates()) { - TCondition condition; - condition.__set_column_name(in_predicate.column_name()); + ConditionParseResult condition; + condition.column_name = in_predicate.column_name(); int32_t col_unique_id = -1; if (in_predicate.has_column_unique_id()) { @@ -429,20 +795,19 @@ Status DeleteHandler::init(TabletSchemaSPtr tablet_schema, return Status::Error( "cannot get column_unique_id for column {}", condition.column_name); } - condition.__set_column_unique_id(col_unique_id); + condition.col_unique_id = col_unique_id; - if (in_predicate.is_not_in()) { - condition.__set_condition_op("!*="); - } else { - condition.__set_condition_op("*="); - } + condition.condition_op = + in_predicate.is_not_in() ? PredicateType::NOT_IN_LIST : PredicateType::IN_LIST; for (const auto& value : in_predicate.values()) { - condition.condition_values.push_back(value); + condition.value_str.push_back(value); } const auto& column = tablet_schema->column_by_uid(col_unique_id); uint32_t index = tablet_schema->field_index(col_unique_id); - temp.column_predicate_vec.push_back(parse_to_predicate( - column.get_vec_type(), index, condition, _predicate_arena, true)); + std::shared_ptr predicate; + RETURN_IF_ERROR(parse_to_in_predicate(index, column.name(), column.get_vec_type(), + condition, _predicate_arena, predicate)); + temp.column_predicate_vec.push_back(predicate); } _del_conds.emplace_back(std::move(temp)); @@ -458,19 +823,13 @@ DeleteHandler::~DeleteHandler() { return; } - for (auto& cond : _del_conds) { - for (const auto* pred : cond.column_predicate_vec) { - delete pred; - } - } - _del_conds.clear(); _is_inited = false; } void DeleteHandler::get_delete_conditions_after_version( int64_t version, AndBlockColumnPredicate* and_block_column_predicate_ptr, - std::unordered_map>* + std::unordered_map>>* del_predicates_for_zone_map) const { for (const auto& del_cond : _del_conds) { if (del_cond.filter_version > version) { @@ -485,7 +844,7 @@ void DeleteHandler::get_delete_conditions_after_version( del_cond.column_predicate_vec[0]->column_id()) < 1) { del_predicates_for_zone_map->insert( {del_cond.column_predicate_vec[0]->column_id(), - std::vector {}}); + std::vector> {}}); } (*del_predicates_for_zone_map)[del_cond.column_predicate_vec[0]->column_id()] .push_back(del_cond.column_predicate_vec[0]); @@ -499,7 +858,8 @@ void DeleteHandler::get_delete_conditions_after_version( // // TODO: need refactor design and code to use more version delete and more column delete to filter zone page. std::for_each(del_cond.column_predicate_vec.cbegin(), del_cond.column_predicate_vec.cend(), - [&or_column_predicate](const ColumnPredicate* predicate) { + [&or_column_predicate]( + const std::shared_ptr predicate) { or_column_predicate->add_column_predicate( SingleColumnBlockPredicate::create_unique(predicate)); }); diff --git a/be/src/olap/delete_handler.h b/be/src/olap/delete_handler.h index d1c6a866cf2216..a5e97d3fac7a68 100644 --- a/be/src/olap/delete_handler.h +++ b/be/src/olap/delete_handler.h @@ -25,6 +25,7 @@ #include "common/factory_creator.h" #include "common/status.h" +#include "olap/column_predicate.h" #include "olap/rowset/rowset_meta.h" #include "olap/tablet_schema.h" #include "vec/common/arena.h" @@ -39,7 +40,7 @@ class TCondition; // Represent a delete condition. struct DeleteConditions { int64_t filter_version = 0; // The version of this condition - std::vector column_predicate_vec; + std::vector> column_predicate_vec; }; // This class is used for checking whether a row should be deleted. @@ -55,8 +56,14 @@ struct DeleteConditions { // * In the first step, before calling delete_handler.init(), you should lock the tablet's header file. class DeleteHandler { ENABLE_FACTORY_CREATOR(DeleteHandler); - // These static method is used to generate delete predicate pb during write or push handler + public: + struct ConditionParseResult { + int32_t col_unique_id; + std::string column_name; + PredicateType condition_op; + std::list value_str; + }; // generated DeletePredicatePB by TCondition static Status generate_delete_predicate(const TabletSchema& schema, const std::vector& conditions, @@ -71,7 +78,10 @@ class DeleteHandler { * @param condition output param * @return OK if matched and extracted correctly otherwise DELETE_INVALID_PARAMETERS */ - static Status parse_condition(const std::string& condition_str, TCondition* condition); + static ConditionParseResult parse_condition(const std::string& condition_str); + static ConditionParseResult parse_condition(const DeleteSubPredicatePB& sub_cond); + static PredicateType parse_condition_op(const std::string& op_str, + const std::list& cond_values); private: // Validate the condition on the schema. @@ -86,9 +96,6 @@ class DeleteHandler { const std::string& condition_op, const std::string& value_str); - // extract 'column_name', 'op' and 'operands' to condition - static Status parse_condition(const DeleteSubPredicatePB& sub_cond, TCondition* condition); - public: DeleteHandler() = default; ~DeleteHandler(); @@ -111,7 +118,7 @@ class DeleteHandler { void get_delete_conditions_after_version( int64_t version, AndBlockColumnPredicate* and_block_column_predicate_ptr, - std::unordered_map>* + std::unordered_map>>* del_predicates_for_zone_map) const; private: diff --git a/be/src/olap/in_list_predicate.h b/be/src/olap/in_list_predicate.h index 2246d0e2fccc15..ea8ee54facc564 100644 --- a/be/src/olap/in_list_predicate.h +++ b/be/src/olap/in_list_predicate.h @@ -62,36 +62,25 @@ namespace doris { * @tparam PT * @tparam HybridSetType */ -template -class InListPredicateBase : public ColumnPredicate { +template +class InListPredicateBase final : public ColumnPredicate { public: + ENABLE_FACTORY_CREATOR(InListPredicateBase); using T = typename PrimitiveTypeTraits::CppType; - template - InListPredicateBase(uint32_t column_id, const ConditionType& conditions, - const ConvertFunc& convert, bool is_opposite, - const vectorized::DataTypePtr& data_type, vectorized::Arena& arena) - : ColumnPredicate(column_id, is_opposite), - _min_value(type_limit::max()), - _max_value(type_limit::min()) { - _values = std::make_shared(false); - for (const auto& condition : conditions) { - T tmp; - if constexpr (Type == TYPE_STRING || Type == TYPE_CHAR) { - tmp = convert(data_type, condition, arena); - } else if constexpr (Type == TYPE_DECIMAL32 || Type == TYPE_DECIMAL64 || - Type == TYPE_DECIMAL128I || Type == TYPE_DECIMAL256) { - tmp = convert(data_type, condition); - } else { - tmp = convert(condition); - } - _values->insert(&tmp); - _update_min_max(tmp); - } - } - - InListPredicateBase(uint32_t column_id, const std::shared_ptr& hybrid_set, + using HybridSetType = std::conditional_t< + N >= 1 && N <= FIXED_CONTAINER_MAX_SIZE, + std::conditional_t< + std::is_same_v, StringSet>, + HybridSet, + vectorized::PredicateColumnType>>>, + std::conditional_t< + std::is_same_v, StringSet>, + HybridSet, + vectorized::PredicateColumnType>>>>; + InListPredicateBase(uint32_t column_id, std::string col_name, + const std::shared_ptr& hybrid_set, bool is_opposite, size_t char_length = 0) - : ColumnPredicate(column_id, false), + : ColumnPredicate(column_id, col_name, Type, is_opposite), _min_value(type_limit::max()), _max_value(type_limit::min()) { CHECK(hybrid_set != nullptr); @@ -132,8 +121,26 @@ class InListPredicateBase : public ColumnPredicate { iter->next(); } } + InListPredicateBase(const InListPredicateBase& other, uint32_t col_id) + : ColumnPredicate(other, col_id) { + _values = other._values; + _min_value = other._min_value; + _max_value = other._max_value; + _temp_datas = other._temp_datas; + DCHECK(_segment_id_to_value_in_dict_flags.empty()); + } + InListPredicateBase(const InListPredicateBase& other) = delete; + std::shared_ptr clone(uint32_t col_id) const override { + return InListPredicateBase::create_shared(*this, col_id); + } ~InListPredicateBase() override = default; + std::string debug_string() const override { + fmt::memory_buffer debug_string_buffer; + fmt::format_to(debug_string_buffer, "InListPredicateBase({})", + ColumnPredicate::debug_string()); + return fmt::to_string(debug_string_buffer); + } PredicateType type() const override { return PT; } @@ -656,10 +663,6 @@ class InListPredicateBase : public ColumnPredicate { } } - std::string _debug_string() const override { - return "InListPredicate(" + type_to_string(Type) + ", " + type_to_string(PT) + ")"; - } - void _update_min_max(const T& value) { if (Compare::greater(value, _max_value)) { _max_value = value; @@ -678,111 +681,5 @@ class InListPredicateBase : public ColumnPredicate { // temp string for char type column std::list _temp_datas; }; - -template -ColumnPredicate* _create_in_list_predicate(uint32_t column_id, const ConditionType& conditions, - const ConvertFunc& convert, bool is_opposite, - const vectorized::DataTypePtr& data_type, - vectorized::Arena& arena) { - using T = typename PrimitiveTypeTraits::CppType; - if constexpr (N >= 1 && N <= FIXED_CONTAINER_MAX_SIZE) { - using Set = std::conditional_t< - std::is_same_v, StringSet>, - HybridSet, - vectorized::PredicateColumnType>>>; - return new InListPredicateBase(column_id, conditions, convert, is_opposite, - data_type, arena); - } else { - using Set = std::conditional_t< - std::is_same_v, StringSet>, - HybridSet, - vectorized::PredicateColumnType>>>; - return new InListPredicateBase(column_id, conditions, convert, is_opposite, - data_type, arena); - } -} - -template -ColumnPredicate* create_in_list_predicate(uint32_t column_id, const ConditionType& conditions, - const ConvertFunc& convert, bool is_opposite, - const vectorized::DataTypePtr& data_type, - vectorized::Arena& arena) { - if (conditions.size() == 1) { - return _create_in_list_predicate( - column_id, conditions, convert, is_opposite, data_type, arena); - } else if (conditions.size() == 2) { - return _create_in_list_predicate( - column_id, conditions, convert, is_opposite, data_type, arena); - } else if (conditions.size() == 3) { - return _create_in_list_predicate( - column_id, conditions, convert, is_opposite, data_type, arena); - } else if (conditions.size() == 4) { - return _create_in_list_predicate( - column_id, conditions, convert, is_opposite, data_type, arena); - } else if (conditions.size() == 5) { - return _create_in_list_predicate( - column_id, conditions, convert, is_opposite, data_type, arena); - } else if (conditions.size() == 6) { - return _create_in_list_predicate( - column_id, conditions, convert, is_opposite, data_type, arena); - } else if (conditions.size() == 7) { - return _create_in_list_predicate( - column_id, conditions, convert, is_opposite, data_type, arena); - } else if (conditions.size() == FIXED_CONTAINER_MAX_SIZE) { - return _create_in_list_predicate(column_id, conditions, convert, - is_opposite, data_type, arena); - } else { - return _create_in_list_predicate( - column_id, conditions, convert, is_opposite, data_type, arena); - } -} - -template -ColumnPredicate* _create_in_list_predicate(uint32_t column_id, - const std::shared_ptr& hybrid_set, - size_t char_length = 0) { - using T = typename PrimitiveTypeTraits::CppType; - if constexpr (N >= 1 && N <= FIXED_CONTAINER_MAX_SIZE) { - using Set = std::conditional_t< - std::is_same_v, StringSet>, - HybridSet, - vectorized::PredicateColumnType>>>; - return new InListPredicateBase(column_id, hybrid_set, char_length); - } else { - using Set = std::conditional_t< - std::is_same_v, StringSet>, - HybridSet, - vectorized::PredicateColumnType>>>; - return new InListPredicateBase(column_id, hybrid_set, char_length); - } -} - -template -ColumnPredicate* create_in_list_predicate(uint32_t column_id, - const std::shared_ptr& hybrid_set, - size_t char_length = 0) { - if (hybrid_set->size() == 1) { - return _create_in_list_predicate(column_id, hybrid_set, char_length); - } else if (hybrid_set->size() == 2) { - return _create_in_list_predicate(column_id, hybrid_set, char_length); - } else if (hybrid_set->size() == 3) { - return _create_in_list_predicate(column_id, hybrid_set, char_length); - } else if (hybrid_set->size() == 4) { - return _create_in_list_predicate(column_id, hybrid_set, char_length); - } else if (hybrid_set->size() == 5) { - return _create_in_list_predicate(column_id, hybrid_set, char_length); - } else if (hybrid_set->size() == 6) { - return _create_in_list_predicate(column_id, hybrid_set, char_length); - } else if (hybrid_set->size() == 7) { - return _create_in_list_predicate(column_id, hybrid_set, char_length); - } else if (hybrid_set->size() == FIXED_CONTAINER_MAX_SIZE) { - return _create_in_list_predicate(column_id, hybrid_set, - char_length); - } else { - return _create_in_list_predicate(column_id, hybrid_set, char_length); - } -} #include "common/compile_check_end.h" } //namespace doris diff --git a/be/src/olap/iterators.h b/be/src/olap/iterators.h index 3379d50368ed90..16b3309bc240f5 100644 --- a/be/src/olap/iterators.h +++ b/be/src/olap/iterators.h @@ -87,9 +87,10 @@ class StorageReadOptions { AndBlockColumnPredicate::create_shared(); // reader's column predicate, nullptr if not existed // used to fiter rows in row block - std::vector column_predicates; + std::vector> column_predicates; std::unordered_map> col_id_to_predicates; - std::unordered_map> del_predicates_for_zone_map; + std::unordered_map>> + del_predicates_for_zone_map; TPushAggOp::type push_down_agg_type_opt = TPushAggOp::NONE; // REQUIRED (null is not allowed) diff --git a/be/src/olap/like_column_predicate.cpp b/be/src/olap/like_column_predicate.cpp index a2bc50735efb08..813acaabca64d1 100644 --- a/be/src/olap/like_column_predicate.cpp +++ b/be/src/olap/like_column_predicate.cpp @@ -26,9 +26,9 @@ namespace doris { template -LikeColumnPredicate::LikeColumnPredicate(bool opposite, uint32_t column_id, +LikeColumnPredicate::LikeColumnPredicate(bool opposite, uint32_t column_id, std::string col_name, doris::FunctionContext* fn_ctx, doris::StringRef val) - : ColumnPredicate(column_id, opposite), pattern(val) { + : ColumnPredicate(column_id, col_name, T, opposite), pattern(val) { static_assert(T == TYPE_VARCHAR || T == TYPE_CHAR || T == TYPE_STRING, "LikeColumnPredicate only supports the following types: TYPE_VARCHAR, TYPE_CHAR, " "TYPE_STRING"); diff --git a/be/src/olap/like_column_predicate.h b/be/src/olap/like_column_predicate.h index 267b7ac1ea126d..cdcc52bfa7dba9 100644 --- a/be/src/olap/like_column_predicate.h +++ b/be/src/olap/like_column_predicate.h @@ -44,11 +44,29 @@ namespace doris { class FunctionContext; template -class LikeColumnPredicate : public ColumnPredicate { +class LikeColumnPredicate final : public ColumnPredicate { public: - LikeColumnPredicate(bool opposite, uint32_t column_id, doris::FunctionContext* fn_ctx, - doris::StringRef val); + ENABLE_FACTORY_CREATOR(LikeColumnPredicate); + LikeColumnPredicate(bool opposite, uint32_t column_id, std::string col_name, + doris::FunctionContext* fn_ctx, doris::StringRef val); ~LikeColumnPredicate() override = default; + LikeColumnPredicate(const LikeColumnPredicate& other, uint32_t col_id) + : ColumnPredicate(other, col_id) { + _origin = other._origin; + pattern = other.pattern; + _state = other._state; + _opposite = other._opposite; + } + LikeColumnPredicate(const LikeColumnPredicate& other) = delete; + std::shared_ptr clone(uint32_t col_id) const override { + return LikeColumnPredicate::create_shared(*this, col_id); + } + std::string debug_string() const override { + fmt::memory_buffer debug_string_buffer; + fmt::format_to(debug_string_buffer, "LikeColumnPredicate({}, pattern={}, origin={})", + ColumnPredicate::debug_string(), pattern, _origin); + return fmt::to_string(debug_string_buffer); + } PredicateType type() const override { return PredicateType::EQ; } void evaluate_vec(const vectorized::IColumn& column, uint16_t size, bool* flags) const override; @@ -171,11 +189,6 @@ class LikeColumnPredicate : public ColumnPredicate { std::shared_mutex> _segment_id_to_cached_res_flags; - std::string _debug_string() const override { - std::string info = "LikeColumnPredicate"; - return info; - } - std::string _origin; // lifetime controlled by scan node using StateType = vectorized::LikeState; @@ -187,7 +200,7 @@ class LikeColumnPredicate : public ColumnPredicate { // Hyperscan API. So here _like_state is separate for each instance of // LikeColumnPredicate. vectorized::LikeSearchState _like_state; - std::unique_ptr _page_ng_bf; // for ngram-bf index + std::shared_ptr _page_ng_bf; // for ngram-bf index }; } // namespace doris diff --git a/be/src/olap/null_predicate.cpp b/be/src/olap/null_predicate.cpp index 602964241213a6..ff17496229c44a 100644 --- a/be/src/olap/null_predicate.cpp +++ b/be/src/olap/null_predicate.cpp @@ -31,8 +31,9 @@ using namespace doris::vectorized; namespace doris { -NullPredicate::NullPredicate(uint32_t column_id, bool is_null, bool opposite) - : ColumnPredicate(column_id), _is_null(opposite != is_null) {} +NullPredicate::NullPredicate(uint32_t column_id, std::string col_name, bool is_null, + PrimitiveType type, bool opposite) + : ColumnPredicate(column_id, col_name, type), _is_null(opposite != is_null) {} PredicateType NullPredicate::type() const { return _is_null ? PredicateType::IS_NULL : PredicateType::IS_NOT_NULL; diff --git a/be/src/olap/null_predicate.h b/be/src/olap/null_predicate.h index 113356c1ab32db..f07e2b7e0a6485 100644 --- a/be/src/olap/null_predicate.h +++ b/be/src/olap/null_predicate.h @@ -29,7 +29,6 @@ #include "olap/rowset/segment_v2/bloom_filter.h" #include "olap/schema.h" #include "olap/wrapper_field.h" -#include "vec/exec/format/parquet/parquet_pred_cmp.h" namespace roaring { class Roaring; @@ -43,9 +42,24 @@ namespace vectorized { class IColumn; } // namespace vectorized -class NullPredicate : public ColumnPredicate { +class NullPredicate final : public ColumnPredicate { public: - NullPredicate(uint32_t column_id, bool is_null, bool opposite = false); + ENABLE_FACTORY_CREATOR(NullPredicate); + NullPredicate(uint32_t column_id, std::string col_name, bool is_null, PrimitiveType type, + bool opposite = false); + NullPredicate(const NullPredicate& other) = delete; + NullPredicate(const NullPredicate& other, uint32_t column_id) + : ColumnPredicate(other, column_id), _is_null(other._is_null) {} + ~NullPredicate() override = default; + std::shared_ptr clone(uint32_t column_id) const override { + return NullPredicate::create_shared(*this, column_id); + } + std::string debug_string() const override { + fmt::memory_buffer debug_string_buffer; + fmt::format_to(debug_string_buffer, "NullPredicate({}, is_null={})", + ColumnPredicate::debug_string(), _is_null); + return fmt::to_string(debug_string_buffer); + } PredicateType type() const override; @@ -122,11 +136,6 @@ class NullPredicate : public ColumnPredicate { uint16_t _evaluate_inner(const vectorized::IColumn& column, uint16_t* sel, uint16_t size) const override; - std::string _debug_string() const override { - std::string info = "NullPredicate(" + std::string(_is_null ? "is_null" : "not_null") + ")"; - return info; - } - bool _is_null; //true for null, false for not null }; diff --git a/be/src/olap/predicate_creator.cpp b/be/src/olap/predicate_creator.cpp new file mode 100644 index 00000000000000..b72458a3b8560d --- /dev/null +++ b/be/src/olap/predicate_creator.cpp @@ -0,0 +1,135 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "olap/predicate_creator.h" + +namespace doris { + +std::shared_ptr create_bloom_filter_predicate( + const uint32_t cid, const std::string col_name, const vectorized::DataTypePtr& data_type, + const std::shared_ptr& filter) { + // Do the necessary type conversion, for CAST(STRING AS CHAR), we do nothing here but change the data type to the target type CHAR + std::shared_ptr filter_olap; + filter_olap.reset(create_bloom_filter(data_type->get_primitive_type(), false)); + filter_olap->light_copy(filter.get()); + switch (data_type->get_primitive_type()) { + case TYPE_TINYINT: { + return BloomFilterColumnPredicate::create_shared(cid, col_name, filter_olap); + } + case TYPE_SMALLINT: { + return BloomFilterColumnPredicate::create_shared(cid, col_name, filter_olap); + } + case TYPE_INT: { + return BloomFilterColumnPredicate::create_shared(cid, col_name, filter_olap); + } + case TYPE_BIGINT: { + return BloomFilterColumnPredicate::create_shared(cid, col_name, filter_olap); + } + case TYPE_LARGEINT: { + return BloomFilterColumnPredicate::create_shared(cid, col_name, filter_olap); + } + case TYPE_FLOAT: { + return BloomFilterColumnPredicate::create_shared(cid, col_name, filter_olap); + } + case TYPE_DOUBLE: { + return BloomFilterColumnPredicate::create_shared(cid, col_name, filter_olap); + } + case TYPE_DECIMALV2: { + return BloomFilterColumnPredicate::create_shared(cid, col_name, + filter_olap); + } + case TYPE_DECIMAL32: { + return BloomFilterColumnPredicate::create_shared(cid, col_name, + filter_olap); + } + case TYPE_DECIMAL64: { + return BloomFilterColumnPredicate::create_shared(cid, col_name, + filter_olap); + } + case TYPE_DECIMAL128I: { + return BloomFilterColumnPredicate::create_shared(cid, col_name, + filter_olap); + } + case TYPE_DECIMAL256: { + return BloomFilterColumnPredicate::create_shared(cid, col_name, + filter_olap); + } + case TYPE_CHAR: { + return BloomFilterColumnPredicate::create_shared(cid, col_name, filter_olap); + } + case TYPE_VARCHAR: { + return BloomFilterColumnPredicate::create_shared(cid, col_name, filter_olap); + } + case TYPE_STRING: { + return BloomFilterColumnPredicate::create_shared(cid, col_name, filter_olap); + } + case TYPE_DATE: { + return BloomFilterColumnPredicate::create_shared(cid, col_name, filter_olap); + } + case TYPE_DATEV2: { + return BloomFilterColumnPredicate::create_shared(cid, col_name, filter_olap); + } + case TYPE_DATETIME: { + return BloomFilterColumnPredicate::create_shared(cid, col_name, filter_olap); + } + case TYPE_DATETIMEV2: { + return BloomFilterColumnPredicate::create_shared(cid, col_name, + filter_olap); + } + case TYPE_TIMESTAMPTZ: { + return BloomFilterColumnPredicate::create_shared(cid, col_name, + filter_olap); + } + case TYPE_BOOLEAN: { + return BloomFilterColumnPredicate::create_shared(cid, col_name, filter_olap); + } + case TYPE_IPV4: { + return BloomFilterColumnPredicate::create_shared(cid, col_name, filter_olap); + } + case TYPE_IPV6: { + return BloomFilterColumnPredicate::create_shared(cid, col_name, filter_olap); + } + default: + return nullptr; + } +} + +std::shared_ptr create_bitmap_filter_predicate( + const uint32_t cid, const std::string col_name, const vectorized::DataTypePtr& data_type, + const std::shared_ptr& filter) { + switch (data_type->get_primitive_type()) { + case TYPE_TINYINT: { + return BitmapFilterColumnPredicate::create_shared(cid, col_name, filter); + } + case TYPE_SMALLINT: { + return BitmapFilterColumnPredicate::create_shared(cid, col_name, filter); + } + case TYPE_INT: { + return BitmapFilterColumnPredicate::create_shared(cid, col_name, filter); + } + case TYPE_BIGINT: { + return BitmapFilterColumnPredicate::create_shared(cid, col_name, filter); + } + default: + throw Exception(ErrorCode::INVALID_ARGUMENT, + fmt::format("Cannot use bitmap filter for type: {}", + type_to_string(data_type->get_primitive_type()))); + return nullptr; + } +} + +} // namespace doris diff --git a/be/src/olap/predicate_creator.h b/be/src/olap/predicate_creator.h index 7bf5b65181a5cd..c225dcfc3d9b20 100644 --- a/be/src/olap/predicate_creator.h +++ b/be/src/olap/predicate_creator.h @@ -46,321 +46,282 @@ namespace doris { #include "common/compile_check_begin.h" -template -class PredicateCreator { -public: - virtual ColumnPredicate* create(const vectorized::DataTypePtr& data_type, int index, - const ConditionType& conditions, bool opposite, - vectorized::Arena& arena) = 0; - virtual ~PredicateCreator() = default; -}; -template -class IntegerPredicateCreator : public PredicateCreator { -public: - using CppType = typename PrimitiveTypeTraits::CppType; - ColumnPredicate* create(const vectorized::DataTypePtr& data_type, int index, - const ConditionType& conditions, bool opposite, - vectorized::Arena& arena) override { - if constexpr (PredicateTypeTraits::is_list(PT)) { - return create_in_list_predicate( - index, conditions, convert, opposite, data_type, arena); - } else { - static_assert(PredicateTypeTraits::is_comparison(PT)); - return new ComparisonPredicateBase(index, convert(conditions), opposite); - } +template +std::shared_ptr create_in_list_predicate(const uint32_t cid, + const std::string col_name, + const std::shared_ptr& set, + bool is_opposite, + size_t char_length = 0) { + auto set_size = set->size(); + if (set_size == 1) { + return InListPredicateBase::create_shared(cid, col_name, set, is_opposite, + char_length); + } else if (set_size == 2) { + return InListPredicateBase::create_shared(cid, col_name, set, is_opposite, + char_length); + } else if (set_size == 3) { + return InListPredicateBase::create_shared(cid, col_name, set, is_opposite, + char_length); + } else if (set_size == 4) { + return InListPredicateBase::create_shared(cid, col_name, set, is_opposite, + char_length); + } else if (set_size == 5) { + return InListPredicateBase::create_shared(cid, col_name, set, is_opposite, + char_length); + } else if (set_size == 6) { + return InListPredicateBase::create_shared(cid, col_name, set, is_opposite, + char_length); + } else if (set_size == 7) { + return InListPredicateBase::create_shared(cid, col_name, set, is_opposite, + char_length); + } else if (set_size == FIXED_CONTAINER_MAX_SIZE) { + return InListPredicateBase::create_shared(cid, col_name, set, is_opposite, + char_length); + } else { + return InListPredicateBase::create_shared( + cid, col_name, set, is_opposite, char_length); } +} -private: - static CppType convert(const std::string& condition) { - CppType value = 0; - if constexpr (std::is_floating_point_v) { - vectorized::CastParameters params; - if (vectorized::CastToFloat::from_string(StringRef {condition.data(), condition.size()}, - value, params)) { - return value; - } else { - throw Exception( - ErrorCode::INVALID_ARGUMENT, - fmt::format("convert string to number failed, str: {} to float/double", - condition)); - } - } else { - auto ret = - std::from_chars(condition.data(), condition.data() + condition.size(), value); - if (ret.ptr == condition.data() + condition.size()) { - return value; - } else { - throw Exception( - ErrorCode::INVALID_ARGUMENT, - fmt::format("convert string to number failed, str: {}, error: [{}] {}", - condition, ret.ec, std::make_error_code(ret.ec).message())); - } - } - } -}; - -template -class DecimalPredicateCreator : public PredicateCreator { -public: - using CppType = typename PrimitiveTypeTraits::CppType; - ColumnPredicate* create(const vectorized::DataTypePtr& data_type, int index, - const ConditionType& conditions, bool opposite, - vectorized::Arena& arena) override { - if constexpr (PredicateTypeTraits::is_list(PT)) { - return create_in_list_predicate( - index, conditions, convert, opposite, data_type, arena); - } else { - static_assert(PredicateTypeTraits::is_comparison(PT)); - return new ComparisonPredicateBase(index, convert(data_type, conditions), - opposite); - } +template +std::shared_ptr create_in_list_predicate(const uint32_t cid, + const std::string col_name, + const vectorized::DataTypePtr& data_type, + const std::shared_ptr set, + bool is_opposite) { + switch (data_type->get_primitive_type()) { + case TYPE_TINYINT: { + return create_in_list_predicate(cid, col_name, set, is_opposite); } - -private: - static CppType convert(const vectorized::DataTypePtr& data_type, const std::string& condition) { - StringParser::ParseResult result = StringParser::ParseResult::PARSE_SUCCESS; - // return CppType value cast from int128_t - return CppType(StringParser::string_to_decimal( - condition.data(), (int)condition.size(), data_type->get_precision(), - data_type->get_scale(), &result)); - } -}; - -template -class StringPredicateCreator : public PredicateCreator { -public: - ColumnPredicate* create(const vectorized::DataTypePtr& data_type, int index, - const ConditionType& conditions, bool opposite, - vectorized::Arena& arena) override { - if constexpr (PredicateTypeTraits::is_list(PT)) { - return create_in_list_predicate( - index, conditions, convert, opposite, data_type, arena); - } else { - static_assert(PredicateTypeTraits::is_comparison(PT)); - return new ComparisonPredicateBase( - index, convert(data_type, conditions, arena), opposite); - } + case TYPE_SMALLINT: { + return create_in_list_predicate(cid, col_name, set, is_opposite); } - -private: - static StringRef convert(const vectorized::DataTypePtr& data_type, const std::string& condition, - vectorized::Arena& arena) { - size_t length = condition.length(); - if constexpr (Type == TYPE_CHAR) { - length = std::max( - static_cast(assert_cast( - vectorized::remove_nullable(data_type).get()) - ->len()), - length); - } - - char* buffer = arena.alloc(length); - memset(buffer, 0, length); - memcpy(buffer, condition.data(), condition.length()); - - return {buffer, length}; + case TYPE_INT: { + return create_in_list_predicate(cid, col_name, set, is_opposite); } -}; - -template -struct CustomPredicateCreator : public PredicateCreator { -public: - using CppType = typename PrimitiveTypeTraits::CppType; - CustomPredicateCreator(const std::function& convert) - : _convert(convert) {} - - ColumnPredicate* create(const vectorized::DataTypePtr& data_type, int index, - const ConditionType& conditions, bool opposite, - vectorized::Arena& arena) override { - if constexpr (PredicateTypeTraits::is_list(PT)) { - return create_in_list_predicate( - index, conditions, _convert, opposite, data_type, arena); - } else { - static_assert(PredicateTypeTraits::is_comparison(PT)); - return new ComparisonPredicateBase(index, _convert(conditions), opposite); - } + case TYPE_BIGINT: { + return create_in_list_predicate(cid, col_name, set, is_opposite); } + case TYPE_LARGEINT: { + return create_in_list_predicate(cid, col_name, set, is_opposite); + } + case TYPE_FLOAT: { + return create_in_list_predicate(cid, col_name, set, is_opposite); + } + case TYPE_DOUBLE: { + return create_in_list_predicate(cid, col_name, set, is_opposite); + } + case TYPE_DECIMALV2: { + return create_in_list_predicate(cid, col_name, set, is_opposite); + } + case TYPE_DECIMAL32: { + return create_in_list_predicate(cid, col_name, set, is_opposite); + } + case TYPE_DECIMAL64: { + return create_in_list_predicate(cid, col_name, set, is_opposite); + } + case TYPE_DECIMAL128I: { + return create_in_list_predicate(cid, col_name, set, is_opposite); + } + case TYPE_DECIMAL256: { + return create_in_list_predicate(cid, col_name, set, is_opposite); + } + case TYPE_CHAR: { + return create_in_list_predicate( + cid, col_name, set, is_opposite, + assert_cast( + vectorized::remove_nullable(data_type).get()) + ->len()); + } + case TYPE_VARCHAR: { + return create_in_list_predicate(cid, col_name, set, is_opposite); + } + case TYPE_STRING: { + return create_in_list_predicate(cid, col_name, set, is_opposite); + } + case TYPE_DATE: { + return create_in_list_predicate(cid, col_name, set, is_opposite); + } + case TYPE_DATEV2: { + return create_in_list_predicate(cid, col_name, set, is_opposite); + } + case TYPE_DATETIME: { + return create_in_list_predicate(cid, col_name, set, is_opposite); + } + case TYPE_DATETIMEV2: { + return create_in_list_predicate(cid, col_name, set, is_opposite); + } + case TYPE_TIMESTAMPTZ: { + return create_in_list_predicate(cid, col_name, set, is_opposite); + } + case TYPE_BOOLEAN: { + return create_in_list_predicate(cid, col_name, set, is_opposite); + } + case TYPE_IPV4: { + return create_in_list_predicate(cid, col_name, set, is_opposite); + } + case TYPE_IPV6: { + return create_in_list_predicate(cid, col_name, set, is_opposite); + } + default: + throw Exception(Status::InternalError("Unsupported type {} for in_predicate", + type_to_string(data_type->get_primitive_type()))); + return nullptr; + } +} -private: - std::function _convert; -}; - -template -std::unique_ptr> get_creator( - const vectorized::DataTypePtr& data_type) { +template +std::shared_ptr create_comparison_predicate0( + const uint32_t cid, const std::string col_name, const vectorized::DataTypePtr& data_type, + StringRef& value, bool opposite, vectorized::Arena& arena) { switch (data_type->get_primitive_type()) { case TYPE_TINYINT: { - return std::make_unique>(); + return ComparisonPredicateBase::create_shared( + cid, col_name, *(typename PrimitiveTypeTraits::CppType*)value.data, + opposite); } case TYPE_SMALLINT: { - return std::make_unique>(); + return ComparisonPredicateBase::create_shared( + cid, col_name, *(typename PrimitiveTypeTraits::CppType*)value.data, + opposite); } case TYPE_INT: { - return std::make_unique>(); + return ComparisonPredicateBase::create_shared( + cid, col_name, *(typename PrimitiveTypeTraits::CppType*)value.data, + opposite); } case TYPE_BIGINT: { - return std::make_unique>(); + return ComparisonPredicateBase::create_shared( + cid, col_name, *(typename PrimitiveTypeTraits::CppType*)value.data, + opposite); } case TYPE_LARGEINT: { - return std::make_unique>(); + return ComparisonPredicateBase::create_shared( + cid, col_name, *(typename PrimitiveTypeTraits::CppType*)value.data, + opposite); } case TYPE_FLOAT: { - return std::make_unique>(); + return ComparisonPredicateBase::create_shared( + cid, col_name, *(typename PrimitiveTypeTraits::CppType*)value.data, + opposite); } case TYPE_DOUBLE: { - return std::make_unique>(); + return ComparisonPredicateBase::create_shared( + cid, col_name, *(typename PrimitiveTypeTraits::CppType*)value.data, + opposite); } case TYPE_DECIMALV2: { - return std::make_unique>( - [](const std::string& condition) { - decimal12_t value = {0, 0}; - static_cast(value.from_string(condition)); - // Decimal12t is storage type, we need convert to compute type here to - // do comparisons - return DecimalV2Value(value.integer, value.fraction); - }); + return ComparisonPredicateBase::create_shared( + cid, col_name, *(typename PrimitiveTypeTraits::CppType*)value.data, + opposite); } case TYPE_DECIMAL32: { - return std::make_unique>(); + return ComparisonPredicateBase::create_shared( + cid, col_name, *(typename PrimitiveTypeTraits::CppType*)value.data, + opposite); } case TYPE_DECIMAL64: { - return std::make_unique>(); + return ComparisonPredicateBase::create_shared( + cid, col_name, *(typename PrimitiveTypeTraits::CppType*)value.data, + opposite); } case TYPE_DECIMAL128I: { - return std::make_unique>(); + return ComparisonPredicateBase::create_shared( + cid, col_name, + *(typename PrimitiveTypeTraits::CppType*)value.data, opposite); } case TYPE_DECIMAL256: { - return std::make_unique>(); + return ComparisonPredicateBase::create_shared( + cid, col_name, *(typename PrimitiveTypeTraits::CppType*)value.data, + opposite); } case TYPE_CHAR: { - return std::make_unique>(); + // TODO(gabriel): Use std::string instead of StringRef + auto target = + std::max(cast_set(assert_cast( + vectorized::remove_nullable(data_type).get()) + ->len()), + value.size); + char* buffer = arena.alloc(target); + memset(buffer, 0, target); + memcpy(buffer, value.data, value.size); + StringRef v = {buffer, target}; + return ComparisonPredicateBase::create_shared(cid, col_name, v, opposite); + } + case TYPE_VARCHAR: { + char* buffer = arena.alloc(value.size); + memcpy(buffer, value.data, value.size); + StringRef v = {buffer, value.size}; + return ComparisonPredicateBase::create_shared(cid, col_name, v, opposite); } - case TYPE_VARCHAR: case TYPE_STRING: { - return std::make_unique>(); + char* buffer = arena.alloc(value.size); + memcpy(buffer, value.data, value.size); + StringRef v = {buffer, value.size}; + return ComparisonPredicateBase::create_shared(cid, col_name, v, opposite); } case TYPE_DATE: { - return std::make_unique>( - timestamp_from_date); + return ComparisonPredicateBase::create_shared( + cid, col_name, *(typename PrimitiveTypeTraits::CppType*)value.data, + opposite); } case TYPE_DATEV2: { - return std::make_unique>( - timestamp_from_date_v2); + return ComparisonPredicateBase::create_shared( + cid, col_name, *(typename PrimitiveTypeTraits::CppType*)value.data, + opposite); } case TYPE_DATETIME: { - return std::make_unique>( - timestamp_from_datetime); + return ComparisonPredicateBase::create_shared( + cid, col_name, *(typename PrimitiveTypeTraits::CppType*)value.data, + opposite); } case TYPE_DATETIMEV2: { - return std::make_unique>( - timestamp_from_datetime_v2); + return ComparisonPredicateBase::create_shared( + cid, col_name, *(typename PrimitiveTypeTraits::CppType*)value.data, + opposite); } case TYPE_TIMESTAMPTZ: { - return std::make_unique>( - timestamptz_from_string); + return ComparisonPredicateBase::create_shared( + cid, col_name, + *(typename PrimitiveTypeTraits::CppType*)value.data, opposite); } case TYPE_BOOLEAN: { - return std::make_unique>( - [](const std::string& condition) { - int32_t ivalue = 0; - auto result = std::from_chars(condition.data(), - condition.data() + condition.size(), ivalue); - if (result.ec == std::errc()) { - return bool(ivalue); - } - - StringParser::ParseResult parse_result; - bool value = StringParser::string_to_bool(condition.data(), condition.size(), - &parse_result); - return value; - }); + return ComparisonPredicateBase::create_shared( + cid, col_name, *(typename PrimitiveTypeTraits::CppType*)value.data, + opposite); } case TYPE_IPV4: { - return std::make_unique>( - [](const std::string& condition) { - IPv4 value; - bool res = IPv4Value::from_string(value, condition); - DCHECK(res); - return value; - }); + return ComparisonPredicateBase::create_shared( + cid, col_name, *(typename PrimitiveTypeTraits::CppType*)value.data, + opposite); } case TYPE_IPV6: { - return std::make_unique>( - [](const std::string& condition) { - IPv6 value; - bool res = IPv6Value::from_string(value, condition); - DCHECK(res); - return value; - }); + return ComparisonPredicateBase::create_shared( + cid, col_name, *(typename PrimitiveTypeTraits::CppType*)value.data, + opposite); } default: + throw Exception(Status::InternalError("Unsupported type {} for comparison_predicate", + type_to_string(data_type->get_primitive_type()))); return nullptr; } } -template -ColumnPredicate* create_predicate(const vectorized::DataTypePtr& data_type, int index, - const ConditionType& conditions, bool opposite, - vectorized::Arena& arena) { - return get_creator(data_type)->create(data_type, index, conditions, opposite, - arena); +template +std::shared_ptr build_set() { + return std::make_shared>, + HybridSet::CppType>, + vectorized::PredicateColumnType>>>>(false); } -template -ColumnPredicate* create_comparison_predicate(const vectorized::DataTypePtr& data_type, int index, - const std::string& condition, bool opposite, - vectorized::Arena& arena) { - static_assert(PredicateTypeTraits::is_comparison(PT)); - return create_predicate(data_type, index, condition, opposite, arena); -} +std::shared_ptr create_bloom_filter_predicate( + const uint32_t cid, const std::string col_name, const vectorized::DataTypePtr& data_type, + const std::shared_ptr& filter); -template -ColumnPredicate* create_list_predicate(const vectorized::DataTypePtr& data_type, int index, - const std::vector& conditions, bool opposite, - vectorized::Arena& arena) { - static_assert(PredicateTypeTraits::is_list(PT)); - return create_predicate>(data_type, index, conditions, opposite, - arena); -} - -// This method is called in reader and in deletehandler. -// The "column" parameter might represent a column resulting from the decomposition of a variant column. -inline ColumnPredicate* parse_to_predicate(const vectorized::DataTypePtr& data_type, uint32_t index, - const TCondition& condition, vectorized::Arena& arena, - bool opposite = false) { - if (to_lower(condition.condition_op) == "is") { - return new NullPredicate(index, to_lower(condition.condition_values[0]) == "null", - opposite); - } - - if ((condition.condition_op == "*=" || condition.condition_op == "!*=") && - condition.condition_values.size() > 1) { - decltype(create_list_predicate)* create = nullptr; - - if (condition.condition_op == "*=") { - create = create_list_predicate; - } else { - create = create_list_predicate; - } - return create(data_type, index, condition.condition_values, opposite, arena); - } - - decltype(create_comparison_predicate)* create = nullptr; - if (condition.condition_op == "*=" || condition.condition_op == "=") { - create = create_comparison_predicate; - } else if (condition.condition_op == "!*=" || condition.condition_op == "!=") { - create = create_comparison_predicate; - } else if (condition.condition_op == "<<") { - create = create_comparison_predicate; - } else if (condition.condition_op == "<=") { - create = create_comparison_predicate; - } else if (condition.condition_op == ">>") { - create = create_comparison_predicate; - } else if (condition.condition_op == ">=") { - create = create_comparison_predicate; - } - return create(data_type, index, condition.condition_values[0], opposite, arena); -} +std::shared_ptr create_bitmap_filter_predicate( + const uint32_t cid, const std::string col_name, const vectorized::DataTypePtr& data_type, + const std::shared_ptr& filter); #include "common/compile_check_end.h" } //namespace doris diff --git a/be/src/olap/push_handler.cpp b/be/src/olap/push_handler.cpp index 09548820ccf4fa..b7a562b1577d6a 100644 --- a/be/src/olap/push_handler.cpp +++ b/be/src/olap/push_handler.cpp @@ -658,8 +658,8 @@ Status PushBrokerReader::_get_next_reader() { _io_ctx.get(), _runtime_state.get()); init_status = parquet_reader->init_reader( - _all_col_names, &_col_name_to_block_idx, _push_down_exprs, _real_tuple_desc, - _default_val_row_desc.get(), _col_name_to_slot_id, + _all_col_names, &_col_name_to_block_idx, _push_down_exprs, _slot_id_to_predicates, + _real_tuple_desc, _default_val_row_desc.get(), _col_name_to_slot_id, &_not_single_slot_filter_conjuncts, &_slot_id_to_filter_conjuncts, vectorized::TableSchemaChangeHelper::ConstNode::get_instance(), false); _cur_reader = std::move(parquet_reader); diff --git a/be/src/olap/push_handler.h b/be/src/olap/push_handler.h index 4c108a6a90822a..f468dd2decf246 100644 --- a/be/src/olap/push_handler.h +++ b/be/src/olap/push_handler.h @@ -143,6 +143,7 @@ class PushBrokerReader { std::vector _all_col_names; std::unordered_map _col_name_to_block_idx; vectorized::VExprContextSPtrs _push_down_exprs; + phmap::flat_hash_map>> _slot_id_to_predicates; const std::unordered_map* _col_name_to_slot_id; // single slot filter conjuncts std::unordered_map _slot_id_to_filter_conjuncts; diff --git a/be/src/olap/rowset/rowset_reader_context.h b/be/src/olap/rowset/rowset_reader_context.h index 1378ebb7cb7a49..acf18cf86a4744 100644 --- a/be/src/olap/rowset/rowset_reader_context.h +++ b/be/src/olap/rowset/rowset_reader_context.h @@ -58,9 +58,9 @@ struct RowsetReaderContext { TPushAggOp::type push_down_agg_type_opt = TPushAggOp::NONE; // column name -> column predicate // adding column_name for predicate to make use of column selectivity - const std::vector* predicates = nullptr; + const std::vector>* predicates = nullptr; // value column predicate in UNIQUE table - const std::vector* value_predicates = nullptr; + const std::vector>* value_predicates = nullptr; const std::vector* lower_bound_keys = nullptr; const std::vector* is_lower_keys_included = nullptr; const std::vector* upper_bound_keys = nullptr; diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp b/be/src/olap/rowset/segment_v2/column_reader.cpp index 27efea4fe08efa..974b852e037380 100644 --- a/be/src/olap/rowset/segment_v2/column_reader.cpp +++ b/be/src/olap/rowset/segment_v2/column_reader.cpp @@ -438,8 +438,8 @@ Status ColumnReader::read_page(const ColumnIteratorOptions& iter_opts, const Pag Status ColumnReader::get_row_ranges_by_zone_map( const AndBlockColumnPredicate* col_predicates, - const std::vector* delete_predicates, RowRanges* row_ranges, - const ColumnIteratorOptions& iter_opts) { + const std::vector>* delete_predicates, + RowRanges* row_ranges, const ColumnIteratorOptions& iter_opts) { std::vector page_indexes; RETURN_IF_ERROR( _get_filtered_pages(col_predicates, delete_predicates, &page_indexes, iter_opts)); @@ -505,8 +505,9 @@ Status ColumnReader::match_condition(const AndBlockColumnPredicate* col_predicat return Status::OK(); } -Status ColumnReader::prune_predicates_by_zone_map(std::vector& predicates, - const int column_id, bool* pruned) const { +Status ColumnReader::prune_predicates_by_zone_map( + std::vector>& predicates, const int column_id, + bool* pruned) const { *pruned = false; if (_zone_map_index == nullptr) { return Status::OK(); @@ -615,7 +616,7 @@ bool ColumnReader::_zone_map_match_condition(const ZoneMapPB& zone_map, Status ColumnReader::_get_filtered_pages( const AndBlockColumnPredicate* col_predicates, - const std::vector* delete_predicates, + const std::vector>* delete_predicates, std::vector* page_indexes, const ColumnIteratorOptions& iter_opts) { RETURN_IF_ERROR(_load_zone_map_index(_use_index_page_cache, _opts.kept_in_memory, iter_opts)); @@ -2080,7 +2081,8 @@ Status FileColumnIterator::_read_dict_data() { Status FileColumnIterator::get_row_ranges_by_zone_map( const AndBlockColumnPredicate* col_predicates, - const std::vector* delete_predicates, RowRanges* row_ranges) { + const std::vector>* delete_predicates, + RowRanges* row_ranges) { if (_reader->has_zone_map()) { RETURN_IF_ERROR(_reader->get_row_ranges_by_zone_map(col_predicates, delete_predicates, row_ranges, _opts)); diff --git a/be/src/olap/rowset/segment_v2/column_reader.h b/be/src/olap/rowset/segment_v2/column_reader.h index 48a10164a1373f..b65cda21ac95d2 100644 --- a/be/src/olap/rowset/segment_v2/column_reader.h +++ b/be/src/olap/rowset/segment_v2/column_reader.h @@ -193,10 +193,10 @@ class ColumnReader : public MetadataAdder, // get row ranges with zone map // - cond_column is user's query predicate // - delete_condition is a delete predicate of one version - Status get_row_ranges_by_zone_map(const AndBlockColumnPredicate* col_predicates, - const std::vector* delete_predicates, - RowRanges* row_ranges, - const ColumnIteratorOptions& iter_opts); + Status get_row_ranges_by_zone_map( + const AndBlockColumnPredicate* col_predicates, + const std::vector>* delete_predicates, + RowRanges* row_ranges, const ColumnIteratorOptions& iter_opts); // get row ranges with bloom filter index Status get_row_ranges_by_bloom_filter(const AndBlockColumnPredicate* col_predicates, @@ -207,7 +207,7 @@ class ColumnReader : public MetadataAdder, bool is_empty() const { return _num_rows == 0; } - Status prune_predicates_by_zone_map(std::vector& predicates, + Status prune_predicates_by_zone_map(std::vector>& predicates, const int column_id, bool* pruned) const; CompressionTypePB get_compression() const { return _meta_compression; } @@ -262,10 +262,10 @@ class ColumnReader : public MetadataAdder, Status _parse_zone_map_skip_null(const ZoneMapPB& zone_map, WrapperField* min_value_container, WrapperField* max_value_container) const; - Status _get_filtered_pages(const AndBlockColumnPredicate* col_predicates, - const std::vector* delete_predicates, - std::vector* page_indexes, - const ColumnIteratorOptions& iter_opts); + Status _get_filtered_pages( + const AndBlockColumnPredicate* col_predicates, + const std::vector>* delete_predicates, + std::vector* page_indexes, const ColumnIteratorOptions& iter_opts); Status _calculate_row_ranges(const std::vector& page_indexes, RowRanges* row_ranges, const ColumnIteratorOptions& iter_opts); @@ -349,7 +349,8 @@ class ColumnIterator { virtual Status get_row_ranges_by_zone_map( const AndBlockColumnPredicate* col_predicates, - const std::vector* delete_predicates, RowRanges* row_ranges) { + const std::vector>* delete_predicates, + RowRanges* row_ranges) { return Status::OK(); } @@ -440,9 +441,10 @@ class FileColumnIterator final : public ColumnIterator { // get row ranges by zone map // - cond_column is user's query predicate // - delete_condition is delete predicate of one version - Status get_row_ranges_by_zone_map(const AndBlockColumnPredicate* col_predicates, - const std::vector* delete_predicates, - RowRanges* row_ranges) override; + Status get_row_ranges_by_zone_map( + const AndBlockColumnPredicate* col_predicates, + const std::vector>* delete_predicates, + RowRanges* row_ranges) override; Status get_row_ranges_by_bloom_filter(const AndBlockColumnPredicate* col_predicates, RowRanges* row_ranges) override; diff --git a/be/src/olap/rowset/segment_v2/segment.cpp b/be/src/olap/rowset/segment_v2/segment.cpp index 53c4b2d4f4eb3c..561735b49b1678 100644 --- a/be/src/olap/rowset/segment_v2/segment.cpp +++ b/be/src/olap/rowset/segment_v2/segment.cpp @@ -275,39 +275,6 @@ Status Segment::new_iterator(SchemaSPtr schema, const StorageReadOptions& read_o } } - if (!read_options.topn_filter_source_node_ids.empty()) { - auto* query_ctx = read_options.runtime_state->get_query_ctx(); - for (int id : read_options.topn_filter_source_node_ids) { - auto runtime_predicate = query_ctx->get_runtime_predicate(id).get_predicate( - read_options.topn_filter_target_node_id); - - AndBlockColumnPredicate and_predicate; - and_predicate.add_column_predicate( - SingleColumnBlockPredicate::create_unique(runtime_predicate.get())); - std::shared_ptr reader; - Status st = get_column_reader( - read_options.tablet_schema->column(runtime_predicate->column_id()), &reader, - read_options.stats); - if (st.is()) { - continue; - } - RETURN_IF_ERROR(st); - DCHECK(reader != nullptr); - if (can_apply_predicate_safely(runtime_predicate->column_id(), *schema, - read_options.target_cast_type_for_variants, - read_options)) { - bool matched = true; - RETURN_IF_ERROR(reader->match_condition(&and_predicate, &matched)); - if (!matched) { - // any condition not satisfied, return. - *iter = std::make_unique(*schema); - read_options.stats->filtered_segment_number++; - return Status::OK(); - } - } - } - } - { SCOPED_RAW_TIMER(&read_options.stats->segment_load_index_timer_ns); RETURN_IF_ERROR(load_index(read_options.stats)); @@ -340,7 +307,7 @@ Status Segment::new_iterator(SchemaSPtr schema, const StorageReadOptions& read_o options_with_pruned_predicates.column_predicates = pruned_predicates; //because column_predicates is changed, we need to rebuild col_id_to_predicates so that inverted index will not go through it. options_with_pruned_predicates.col_id_to_predicates.clear(); - for (auto* pred : options_with_pruned_predicates.column_predicates) { + for (auto pred : options_with_pruned_predicates.column_predicates) { if (!options_with_pruned_predicates.col_id_to_predicates.contains( pred->column_id())) { options_with_pruned_predicates.col_id_to_predicates.insert( diff --git a/be/src/olap/rowset/segment_v2/segment.h b/be/src/olap/rowset/segment_v2/segment.h index 84e86110e137af..1e24b16fa80fe0 100644 --- a/be/src/olap/rowset/segment_v2/segment.h +++ b/be/src/olap/rowset/segment_v2/segment.h @@ -181,6 +181,7 @@ class Segment : public std::enable_shared_from_this, public MetadataAdd const std::map& target_cast_type_for_variants, const StorageReadOptions& read_options) { const doris::Field* col = schema.column(cid); + DCHECK(col != nullptr) << "Column not found in schema for cid=" << cid; vectorized::DataTypePtr storage_column_type = get_data_type_of(col->get_desc(), read_options); if (storage_column_type == nullptr || col->type() != FieldType::OLAP_FIELD_TYPE_VARIANT || diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp index 43659b368d15dc..4d97fe2611e65f 100644 --- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp +++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp @@ -378,7 +378,7 @@ Status SegmentIterator::_init_impl(const StorageReadOptions& opts) { void SegmentIterator::_initialize_predicate_results() { // Initialize from _col_predicates - for (auto* pred : _col_predicates) { + for (auto pred : _col_predicates) { int cid = pred->column_id(); _column_predicate_index_exec_status[cid][pred] = false; } @@ -890,31 +890,6 @@ Status SegmentIterator::_get_row_ranges_from_conditions(RowRanges* condition_row RowRanges::ranges_intersection(*condition_row_ranges, zone_map_row_ranges, condition_row_ranges); - if (!_opts.topn_filter_source_node_ids.empty()) { - auto* query_ctx = _opts.runtime_state->get_query_ctx(); - for (int id : _opts.topn_filter_source_node_ids) { - std::shared_ptr runtime_predicate = - query_ctx->get_runtime_predicate(id).get_predicate( - _opts.topn_filter_target_node_id); - if (_segment->can_apply_predicate_safely(runtime_predicate->column_id(), *_schema, - _opts.target_cast_type_for_variants, - _opts)) { - AndBlockColumnPredicate and_predicate; - and_predicate.add_column_predicate( - SingleColumnBlockPredicate::create_unique(runtime_predicate.get())); - - RowRanges column_rp_row_ranges = RowRanges::create_single(num_rows()); - RETURN_IF_ERROR(_column_iterators[runtime_predicate->column_id()] - ->get_row_ranges_by_zone_map(&and_predicate, nullptr, - &column_rp_row_ranges)); - - // intersect different columns's row ranges to get final row ranges by zone map - RowRanges::ranges_intersection(zone_map_row_ranges, column_rp_row_ranges, - &zone_map_row_ranges); - } - } - } - size_t pre_size2 = condition_row_ranges->count(); RowRanges::ranges_intersection(*condition_row_ranges, zone_map_row_ranges, condition_row_ranges); @@ -961,7 +936,7 @@ Status SegmentIterator::_extract_common_expr_columns(const vectorized::VExprSPtr return Status::OK(); } -bool SegmentIterator::_check_apply_by_inverted_index(ColumnPredicate* pred) { +bool SegmentIterator::_check_apply_by_inverted_index(std::shared_ptr pred) { if (_opts.runtime_state && !_opts.runtime_state->query_options().enable_inverted_index_query) { return false; } @@ -989,8 +964,8 @@ bool SegmentIterator::_check_apply_by_inverted_index(ColumnPredicate* pred) { } // Function filter no apply inverted index - if (dynamic_cast*>(pred) != nullptr || - dynamic_cast*>(pred) != nullptr) { + if (dynamic_cast*>(pred.get()) != nullptr || + dynamic_cast*>(pred.get()) != nullptr) { return false; } @@ -1102,8 +1077,8 @@ inline bool SegmentIterator::_inverted_index_not_support_pred_type(const Predica } Status SegmentIterator::_apply_inverted_index_on_column_predicate( - ColumnPredicate* pred, std::vector& remaining_predicates, - bool* continue_apply) { + std::shared_ptr pred, + std::vector>& remaining_predicates, bool* continue_apply) { if (!_check_apply_by_inverted_index(pred)) { remaining_predicates.emplace_back(pred); } else { @@ -1191,8 +1166,8 @@ bool SegmentIterator::_need_read_data(ColumnId cid) { } Status SegmentIterator::_apply_inverted_index() { - std::vector remaining_predicates; - std::set no_need_to_pass_column_predicate_set; + std::vector> remaining_predicates; + std::set> no_need_to_pass_column_predicate_set; for (auto pred : _col_predicates) { if (no_need_to_pass_column_predicate_set.count(pred) > 0) { @@ -1622,9 +1597,9 @@ Status SegmentIterator::_vec_init_lazy_materialization() { std::set del_cond_id_set; _opts.delete_condition_predicates->get_all_column_ids(del_cond_id_set); - std::set delete_predicate_set {}; + std::set> delete_predicate_set {}; _opts.delete_condition_predicates->get_all_column_predicate(delete_predicate_set); - for (const auto* const predicate : delete_predicate_set) { + for (auto predicate : delete_predicate_set) { if (PredicateTypeTraits::is_range(predicate->type())) { _delete_range_column_ids.push_back(predicate->column_id()); } else if (PredicateTypeTraits::is_bloom_filter(predicate->type())) { @@ -1632,32 +1607,12 @@ Status SegmentIterator::_vec_init_lazy_materialization() { } } - // add runtime predicate to _col_predicates - // should NOT add for order by key, - // since key is already sorted and topn_next only need first N rows from each segment, - // but runtime predicate will filter some rows and read more than N rows. - // should add add for order by none-key column, since none-key column is not sorted and - // all rows should be read, so runtime predicate will reduce rows for topn node - if (!_opts.topn_filter_source_node_ids.empty() && - (_opts.read_orderby_key_columns == nullptr || _opts.read_orderby_key_columns->empty())) { - for (int id : _opts.topn_filter_source_node_ids) { - auto& runtime_predicate = - _opts.runtime_state->get_query_ctx()->get_runtime_predicate(id); - _col_predicates.push_back( - runtime_predicate.get_predicate(_opts.topn_filter_target_node_id).get()); - VLOG_DEBUG << fmt::format( - "After appending topn filter to col_predicates, " - "col_predicates size: {}, col_predicate: {}", - _col_predicates.size(), _col_predicates.back()->debug_string()); - } - } - // Step1: extract columns that can be lazy materialization if (!_col_predicates.empty() || !del_cond_id_set.empty()) { std::set short_cir_pred_col_id_set; // using set for distinct cid std::set vec_pred_col_id_set; - for (auto* predicate : _col_predicates) { + for (auto predicate : _col_predicates) { auto cid = predicate->column_id(); _is_pred_column[cid] = true; pred_column_ids.insert(cid); @@ -1809,7 +1764,7 @@ Status SegmentIterator::_vec_init_lazy_materialization() { return Status::OK(); } -bool SegmentIterator::_can_evaluated_by_vectorized(ColumnPredicate* predicate) { +bool SegmentIterator::_can_evaluated_by_vectorized(std::shared_ptr predicate) { auto cid = predicate->column_id(); FieldType field_type = _schema->column(cid)->type(); if (field_type == FieldType::OLAP_FIELD_TYPE_VARIANT) { @@ -2230,7 +2185,7 @@ uint16_t SegmentIterator::_evaluate_short_circuit_predicate(uint16_t* vec_sel_ro } uint16_t original_size = selected_size; - for (auto* predicate : _short_cir_eval_predicate) { + for (auto predicate : _short_cir_eval_predicate) { auto column_id = predicate->column_id(); auto& short_cir_column = _current_return_columns[column_id]; selected_size = predicate->evaluate(*short_cir_column, vec_sel_rowid_idx, selected_size); @@ -2740,7 +2695,7 @@ void SegmentIterator::_convert_dict_code_for_predicate_if_necessary() { } void SegmentIterator::_convert_dict_code_for_predicate_if_necessary_impl( - ColumnPredicate* predicate) { + std::shared_ptr predicate) { auto& column = _current_return_columns[predicate->column_id()]; auto* col_ptr = column.get(); diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.h b/be/src/olap/rowset/segment_v2/segment_iterator.h index 94aa87adc79cf0..589854961af109 100644 --- a/be/src/olap/rowset/segment_v2/segment_iterator.h +++ b/be/src/olap/rowset/segment_v2/segment_iterator.h @@ -138,7 +138,7 @@ class SegmentIterator : public RowwiseIterator { _update_profile(profile, _pre_eval_block_predicate, "PreEvaluatePredicates"); if (_opts.delete_condition_predicates != nullptr) { - std::set delete_predicate_set; + std::set> delete_predicate_set; _opts.delete_condition_predicates->get_all_column_predicate(delete_predicate_set); _update_profile(profile, delete_predicate_set, "DeleteConditionPredicates"); } @@ -191,7 +191,8 @@ class SegmentIterator : public RowwiseIterator { [[nodiscard]] Status _get_row_ranges_from_conditions(RowRanges* condition_row_ranges); [[nodiscard]] Status _apply_inverted_index(); [[nodiscard]] Status _apply_inverted_index_on_column_predicate( - ColumnPredicate* pred, std::vector& remaining_predicates, + std::shared_ptr pred, + std::vector>& remaining_predicates, bool* continue_apply); [[nodiscard]] Status _apply_ann_topn_predicate(); [[nodiscard]] Status _apply_index_expr(); @@ -275,7 +276,7 @@ class SegmentIterator : public RowwiseIterator { return Status::OK(); } - bool _can_evaluated_by_vectorized(ColumnPredicate* predicate); + bool _can_evaluated_by_vectorized(std::shared_ptr predicate); [[nodiscard]] Status _extract_common_expr_columns(const vectorized::VExprSPtr& expr); // same with _extract_common_expr_columns, but only extract columns that can be used for index @@ -290,9 +291,10 @@ class SegmentIterator : public RowwiseIterator { // Dictionary column should do something to initial. void _convert_dict_code_for_predicate_if_necessary(); - void _convert_dict_code_for_predicate_if_necessary_impl(ColumnPredicate* predicate); + void _convert_dict_code_for_predicate_if_necessary_impl( + std::shared_ptr predicate); - bool _check_apply_by_inverted_index(ColumnPredicate* pred); + bool _check_apply_by_inverted_index(std::shared_ptr pred); void _output_index_result_column_for_expr(uint16_t* sel_rowid_idx, uint16_t select_size, vectorized::Block* block); @@ -420,8 +422,8 @@ class SegmentIterator : public RowwiseIterator { std::map _need_read_data_indices; std::vector _is_common_expr_column; vectorized::MutableColumns _current_return_columns; - std::vector _pre_eval_block_predicate; - std::vector _short_cir_eval_predicate; + std::vector> _pre_eval_block_predicate; + std::vector> _short_cir_eval_predicate; std::vector _delete_range_column_ids; std::vector _delete_bloom_filter_column_ids; // when lazy materialization is enabled, segmentIter need to read data at least twice @@ -442,7 +444,7 @@ class SegmentIterator : public RowwiseIterator { StorageReadOptions _opts; // make a copy of `_opts.column_predicates` in order to make local changes - std::vector _col_predicates; + std::vector> _col_predicates; vectorized::VExprContextSPtrs _common_expr_ctxs_push_down; bool _enable_common_expr_pushdown = false; std::vector _remaining_conjunct_roots; @@ -471,7 +473,7 @@ class SegmentIterator : public RowwiseIterator { std::unique_ptr _pool; // used to collect filter information. - std::vector _filter_info_id; + std::vector> _filter_info_id; bool _record_rowids = false; int64_t _tablet_id = 0; std::set _output_columns; @@ -482,7 +484,7 @@ class SegmentIterator : public RowwiseIterator { * column and column_predicates on it. * a boolean value to indicate whether the column has been read by the index. */ - std::unordered_map> + std::unordered_map, bool>> _column_predicate_index_exec_status; /* diff --git a/be/src/olap/shared_predicate.h b/be/src/olap/shared_predicate.h index 45eae1b7f80ff9..46cda6653b9e5f 100644 --- a/be/src/olap/shared_predicate.h +++ b/be/src/olap/shared_predicate.h @@ -32,30 +32,58 @@ namespace doris { // SharedPredicate only used on topn runtime predicate. // Runtime predicate globally share one predicate, to ensure that updates can be real-time. // At the beginning nested predicate may be nullptr, in which case predicate always returns true. -class SharedPredicate : public ColumnPredicate { +class SharedPredicate final : public ColumnPredicate { ENABLE_FACTORY_CREATOR(SharedPredicate); public: - SharedPredicate(uint32_t column_id) : ColumnPredicate(column_id) {} + SharedPredicate(uint32_t column_id, std::string col_name) + : ColumnPredicate(column_id, col_name, PrimitiveType::INVALID_TYPE), + _mtx(std::make_shared()) {} + SharedPredicate(const ColumnPredicate& other) = delete; + SharedPredicate(const SharedPredicate& other, uint32_t column_id) + : ColumnPredicate(other, column_id), + _mtx(std::make_shared()), + _nested(assert_cast(other)._nested + ? other._nested->clone(column_id) + : nullptr) {} + ~SharedPredicate() override = default; + std::string debug_string() const override { + std::shared_lock lock(*_mtx); + fmt::memory_buffer debug_string_buffer; + fmt::format_to(debug_string_buffer, "SharedPredicate({}, nested={})", + ColumnPredicate::debug_string(), _nested ? _nested->debug_string() : "null"); + return fmt::to_string(debug_string_buffer); + } + std::shared_ptr clone(uint32_t column_id) const override { + // All scanner thread should share the same SharedPredicate object. + return std::const_pointer_cast(shared_from_this()); + } PredicateType type() const override { - std::shared_lock lock(_mtx); + std::shared_lock lock(*_mtx); if (!_nested) { // topn filter is le or ge return PredicateType::LE; } return _nested->type(); } + PrimitiveType primitive_type() const override { + std::shared_lock lock(*_mtx); + if (!_nested) { + return PrimitiveType::INVALID_TYPE; + } + return _nested->primitive_type(); + } - void set_nested(ColumnPredicate* nested) { - std::unique_lock lock(_mtx); - _nested.reset(nested); + void set_nested(const std::shared_ptr& nested) { + std::unique_lock lock(*_mtx); + _nested = nested; } Status evaluate(const vectorized::IndexFieldNameAndTypePair& name_with_type, IndexIterator* iterator, uint32_t num_rows, roaring::Roaring* bitmap) const override { - std::shared_lock lock(_mtx); + std::shared_lock lock(*_mtx); if (!_nested) { return Status::OK(); } @@ -64,7 +92,7 @@ class SharedPredicate : public ColumnPredicate { void evaluate_and(const vectorized::IColumn& column, const uint16_t* sel, uint16_t size, bool* flags) const override { - std::shared_lock lock(_mtx); + std::shared_lock lock(*_mtx); if (!_nested) { return; } @@ -77,7 +105,7 @@ class SharedPredicate : public ColumnPredicate { } bool evaluate_and(const std::pair& statistic) const override { - std::shared_lock lock(_mtx); + std::shared_lock lock(*_mtx); if (!_nested) { return ColumnPredicate::evaluate_and(statistic); } @@ -85,7 +113,7 @@ class SharedPredicate : public ColumnPredicate { } bool evaluate_del(const std::pair& statistic) const override { - std::shared_lock lock(_mtx); + std::shared_lock lock(*_mtx); if (!_nested) { return ColumnPredicate::evaluate_del(statistic); } @@ -93,7 +121,7 @@ class SharedPredicate : public ColumnPredicate { } bool evaluate_and(const BloomFilter* bf) const override { - std::shared_lock lock(_mtx); + std::shared_lock lock(*_mtx); if (!_nested) { return ColumnPredicate::evaluate_and(bf); } @@ -101,7 +129,7 @@ class SharedPredicate : public ColumnPredicate { } bool can_do_bloom_filter(bool ngram) const override { - std::shared_lock lock(_mtx); + std::shared_lock lock(*_mtx); if (!_nested) { return ColumnPredicate::can_do_bloom_filter(ngram); } @@ -110,7 +138,7 @@ class SharedPredicate : public ColumnPredicate { void evaluate_vec(const vectorized::IColumn& column, uint16_t size, bool* flags) const override { - std::shared_lock lock(_mtx); + std::shared_lock lock(*_mtx); if (!_nested) { for (uint16_t i = 0; i < size; ++i) { flags[i] = true; @@ -122,7 +150,7 @@ class SharedPredicate : public ColumnPredicate { void evaluate_and_vec(const vectorized::IColumn& column, uint16_t size, bool* flags) const override { - std::shared_lock lock(_mtx); + std::shared_lock lock(*_mtx); if (!_nested) { return; } @@ -130,7 +158,7 @@ class SharedPredicate : public ColumnPredicate { } std::string get_search_str() const override { - std::shared_lock lock(_mtx); + std::shared_lock lock(*_mtx); if (!_nested) { DCHECK(false) << "should not reach here"; } @@ -140,22 +168,14 @@ class SharedPredicate : public ColumnPredicate { private: uint16_t _evaluate_inner(const vectorized::IColumn& column, uint16_t* sel, uint16_t size) const override { - std::shared_lock lock(_mtx); + std::shared_lock lock(*_mtx); if (!_nested) { return size; } return _nested->evaluate(column, sel, size); } - std::string _debug_string() const override { - std::shared_lock lock(_mtx); - if (!_nested) { - return "shared_predicate(unknow)"; - } - return "shared_predicate(" + _nested->debug_string() + ")"; - } - - mutable std::shared_mutex _mtx; + mutable std::shared_ptr _mtx; std::shared_ptr _nested; }; diff --git a/be/src/olap/tablet_reader.cpp b/be/src/olap/tablet_reader.cpp index 800485f6bb08dc..8028eca7cf71ed 100644 --- a/be/src/olap/tablet_reader.cpp +++ b/be/src/olap/tablet_reader.cpp @@ -81,10 +81,6 @@ std::string TabletReader::ReaderParams::to_string() const { ss << " end_keys=" << key; } - for (auto& condition : conditions) { - ss << " conditions=" << apache::thrift::ThriftDebugString(condition.filter); - } - return ss.str(); } @@ -102,15 +98,6 @@ std::string TabletReader::KeysParam::to_string() const { return ss.str(); } -TabletReader::~TabletReader() { - for (auto* pred : _col_predicates) { - delete pred; - } - for (auto* pred : _value_col_predicates) { - delete pred; - } -} - Status TabletReader::init(const ReaderParams& read_params) { SCOPED_RAW_TIMER(&_stats.tablet_reader_init_timer_ns); @@ -125,31 +112,6 @@ Status TabletReader::init(const ReaderParams& read_params) { return res; } -// When only one rowset has data, and this rowset is nonoverlapping, we can read directly without aggregation -bool TabletReader::_optimize_for_single_rowset( - const std::vector& rs_readers) { - bool has_delete_rowset = false; - bool has_overlapping = false; - int nonoverlapping_count = 0; - for (const auto& rs_reader : rs_readers) { - if (rs_reader->rowset()->rowset_meta()->delete_flag()) { - has_delete_rowset = true; - break; - } - if (rs_reader->rowset()->rowset_meta()->num_rows() > 0) { - if (rs_reader->rowset()->rowset_meta()->is_segments_overlapping()) { - // when there are overlapping segments, can not do directly read - has_overlapping = true; - break; - } else if (++nonoverlapping_count > 1) { - break; - } - } - } - - return !has_overlapping && nonoverlapping_count == 1 && !has_delete_rowset; -} - Status TabletReader::_capture_rs_readers(const ReaderParams& read_params) { SCOPED_RAW_TIMER(&_stats.tablet_reader_capture_rs_readers_timer_ns); if (read_params.rs_splits.empty()) { @@ -520,47 +482,18 @@ Status TabletReader::_init_orderby_keys_param(const ReaderParams& read_params) { Status TabletReader::_init_conditions_param(const ReaderParams& read_params) { SCOPED_RAW_TIMER(&_stats.tablet_reader_init_conditions_param_timer_ns); - std::vector predicates; - - auto parse_and_emplace_predicates = [this, &predicates](auto& params) { - for (const auto& param : params) { - ColumnPredicate* predicate = _parse_to_predicate({param.column_name, param.filter}); - predicate->attach_profile_counter(param.runtime_filter_id, param.filtered_rows_counter, - param.input_rows_counter, - param.always_true_rows_counter); - predicates.emplace_back(predicate); - } - }; - - for (const auto& param : read_params.conditions) { - TCondition tmp_cond = param.filter; - RETURN_IF_ERROR(_tablet_schema->have_column(tmp_cond.column_name)); - // The "column" parameter might represent a column resulting from the decomposition of a variant column. - // Instead of using a "unique_id" for identification, we are utilizing a "path" to denote this column. - const auto& column = *DORIS_TRY(_tablet_schema->column(tmp_cond.column_name)); - const auto& mcolumn = materialize_column(column); - uint32_t index = _tablet_schema->field_index(tmp_cond.column_name); - ColumnPredicate* predicate = - parse_to_predicate(mcolumn.get_vec_type(), index, tmp_cond, _predicate_arena); - // record condition value into predicate_params in order to pushdown segment_iterator, - // _gen_predicate_result_sign will build predicate result unique sign with condition value - predicate->attach_profile_counter(param.runtime_filter_id, param.filtered_rows_counter, - param.input_rows_counter, param.always_true_rows_counter); - predicates.emplace_back(predicate); - } - parse_and_emplace_predicates(read_params.bloom_filters); - parse_and_emplace_predicates(read_params.bitmap_filters); - parse_and_emplace_predicates(read_params.in_filters); - + std::vector> predicates; + std::copy(read_params.predicates.cbegin(), read_params.predicates.cend(), + std::inserter(predicates, predicates.begin())); // Function filter push down to storage engine - auto is_like_predicate = [](ColumnPredicate* _pred) { - return dynamic_cast*>(_pred) != nullptr || - dynamic_cast*>(_pred) != nullptr; + auto is_like_predicate = [](std::shared_ptr _pred) { + return dynamic_cast*>(_pred.get()) != nullptr || + dynamic_cast*>(_pred.get()) != nullptr; }; for (const auto& filter : read_params.function_filters) { predicates.emplace_back(_parse_to_predicate(filter)); - auto* pred = predicates.back(); + auto pred = predicates.back(); const auto& col = _tablet_schema->column(pred->column_id()); const auto* tablet_index = _tablet_schema->get_ngram_bf_index(col.unique_id()); @@ -581,7 +514,7 @@ Status TabletReader::_init_conditions_param(const ReaderParams& read_params) { } } - for (auto* predicate : predicates) { + for (auto predicate : predicates) { auto column = _tablet_schema->column(predicate->column_id()); if (column.aggregation() != FieldAggregationMethod::OLAP_FIELD_AGGREGATION_NONE) { _value_col_predicates.push_back(predicate); @@ -599,39 +532,12 @@ Status TabletReader::_init_conditions_param(const ReaderParams& read_params) { return Status::OK(); } -ColumnPredicate* TabletReader::_parse_to_predicate( - const std::pair>& bloom_filter) { - int32_t index = _tablet_schema->field_index(bloom_filter.first); - if (index < 0) { - return nullptr; - } - const TabletColumn& column = materialize_column(_tablet_schema->column(index)); - return create_column_predicate(index, bloom_filter.second, column.type(), &column); -} - -ColumnPredicate* TabletReader::_parse_to_predicate( - const std::pair>& in_filter) { - int32_t index = _tablet_schema->field_index(in_filter.first); - if (index < 0) { - return nullptr; - } - const TabletColumn& column = materialize_column(_tablet_schema->column(index)); - return create_column_predicate(index, in_filter.second, column.type(), &column); -} - -ColumnPredicate* TabletReader::_parse_to_predicate( - const std::pair>& bitmap_filter) { - int32_t index = _tablet_schema->field_index(bitmap_filter.first); - if (index < 0) { - return nullptr; - } - const TabletColumn& column = materialize_column(_tablet_schema->column(index)); - return create_column_predicate(index, bitmap_filter.second, column.type(), &column); -} - -ColumnPredicate* TabletReader::_parse_to_predicate(const FunctionFilter& function_filter) { +std::shared_ptr TabletReader::_parse_to_predicate( + const FunctionFilter& function_filter) { int32_t index = _tablet_schema->field_index(function_filter._col_name); if (index < 0) { + throw Exception(Status::InternalError("Column {} not found in tablet schema", + function_filter._col_name)); return nullptr; } const TabletColumn& column = materialize_column(_tablet_schema->column(index)); diff --git a/be/src/olap/tablet_reader.h b/be/src/olap/tablet_reader.h index aa72f2ce0dbf15..79539c6a6e60f9 100644 --- a/be/src/olap/tablet_reader.h +++ b/be/src/olap/tablet_reader.h @@ -138,10 +138,7 @@ class TabletReader { bool start_key_include = false; bool end_key_include = false; - std::vector> conditions; - std::vector>> bloom_filters; - std::vector>> bitmap_filters; - std::vector>> in_filters; + std::vector> predicates; std::vector function_filters; std::vector delete_predicates; // slots that cast may be eliminated in storage layer @@ -165,7 +162,6 @@ class TabletReader { std::vector* origin_return_columns = nullptr; std::unordered_set* tablet_columns_convert_to_null_set = nullptr; TPushAggOp::type push_down_agg_type_opt = TPushAggOp::NONE; - vectorized::VExpr* remaining_vconjunct_root = nullptr; std::vector remaining_conjunct_roots; vectorized::VExprContextSPtrs common_expr_ctxs_push_down; @@ -210,7 +206,7 @@ class TabletReader { TabletReader() = default; - virtual ~TabletReader(); + virtual ~TabletReader() = default; TabletReader(const TabletReader&) = delete; void operator=(const TabletReader&) = delete; @@ -256,24 +252,14 @@ class TabletReader { Status _capture_rs_readers(const ReaderParams& read_params); - bool _optimize_for_single_rowset(const std::vector& rs_readers); - Status _init_keys_param(const ReaderParams& read_params); Status _init_orderby_keys_param(const ReaderParams& read_params); Status _init_conditions_param(const ReaderParams& read_params); - ColumnPredicate* _parse_to_predicate( - const std::pair>& bloom_filter); - - ColumnPredicate* _parse_to_predicate( - const std::pair>& bitmap_filter); - - ColumnPredicate* _parse_to_predicate( - const std::pair>& in_filter); - - virtual ColumnPredicate* _parse_to_predicate(const FunctionFilter& function_filter); + virtual std::shared_ptr _parse_to_predicate( + const FunctionFilter& function_filter); Status _init_delete_condition(const ReaderParams& read_params); @@ -306,8 +292,8 @@ class TabletReader { KeysParam _keys_param; std::vector _is_lower_keys_included; std::vector _is_upper_keys_included; - std::vector _col_predicates; - std::vector _value_col_predicates; + std::vector> _col_predicates; + std::vector> _value_col_predicates; DeleteHandler _delete_handler; // Indicates whether the tablets has do a aggregation in storage engine. diff --git a/be/src/pipeline/exec/file_scan_operator.cpp b/be/src/pipeline/exec/file_scan_operator.cpp index b05638b74711a6..2ffa0e64465f13 100644 --- a/be/src/pipeline/exec/file_scan_operator.cpp +++ b/be/src/pipeline/exec/file_scan_operator.cpp @@ -32,6 +32,29 @@ namespace doris::pipeline { #include "common/compile_check_begin.h" +PushDownType FileScanLocalState::_should_push_down_binary_predicate( + vectorized::VectorizedFnCall* fn_call, vectorized::VExprContext* expr_ctx, + StringRef* constant_val, const std::set fn_name) const { + if (!fn_name.contains(fn_call->fn().name.function_name)) { + return PushDownType::UNACCEPTABLE; + } + DCHECK(constant_val->data == nullptr) << "constant_val should not have a value"; + const auto& children = fn_call->children(); + DCHECK(children.size() == 2); + DCHECK_EQ(children[0]->node_type(), TExprNodeType::SLOT_REF); + if (children[1]->is_constant()) { + std::shared_ptr const_col_wrapper; + THROW_IF_ERROR(children[1]->get_const_col(expr_ctx, &const_col_wrapper)); + const auto* const_column = + assert_cast(const_col_wrapper->column_ptr.get()); + *constant_val = const_column->get_data_at(0); + return PushDownType::PARTIAL_ACCEPTABLE; + } else { + // only handle constant value + return PushDownType::UNACCEPTABLE; + } +} + int FileScanLocalState::max_scanners_concurrency(RuntimeState* state) const { // For select * from table limit 10; should just use one thread. if (should_run_serial()) { @@ -90,7 +113,7 @@ Status FileScanLocalState::_init_scanners(std::list* sc for (int i = 0; i < _max_scanners; ++i) { std::unique_ptr scanner = vectorized::FileScanner::create_unique( state(), this, p._limit, _split_source, _scanner_profile.get(), _kv_cache.get(), - &_colname_to_value_range, &p._colname_to_slot_id); + &p._colname_to_slot_id); RETURN_IF_ERROR(scanner->init(state(), _conjuncts)); scanners->push_back(std::move(scanner)); } diff --git a/be/src/pipeline/exec/file_scan_operator.h b/be/src/pipeline/exec/file_scan_operator.h index 12b303a02c9375..c2e1da398fee8f 100644 --- a/be/src/pipeline/exec/file_scan_operator.h +++ b/be/src/pipeline/exec/file_scan_operator.h @@ -60,6 +60,28 @@ class FileScanLocalState final : public ScanLocalState { private: friend class vectorized::FileScanner; + PushDownType _should_push_down_bloom_filter() const override { + return PushDownType::PARTIAL_ACCEPTABLE; + } + PushDownType _should_push_down_topn_filter() const override { + return PushDownType::PARTIAL_ACCEPTABLE; + } + PushDownType _should_push_down_bitmap_filter() const override { + return PushDownType::PARTIAL_ACCEPTABLE; + } + PushDownType _should_push_down_is_null_predicate( + vectorized::VectorizedFnCall* fn_call) const override { + return fn_call->fn().name.function_name == "is_null_pred" || + fn_call->fn().name.function_name == "is_not_null_pred" + ? PushDownType::PARTIAL_ACCEPTABLE + : PushDownType::UNACCEPTABLE; + } + PushDownType _should_push_down_in_predicate() const override { + return PushDownType::PARTIAL_ACCEPTABLE; + } + PushDownType _should_push_down_binary_predicate( + vectorized::VectorizedFnCall* fn_call, vectorized::VExprContext* expr_ctx, + StringRef* constant_val, const std::set fn_name) const override; std::shared_ptr _split_source = nullptr; int _max_scanners; // A in memory cache to save some common components @@ -83,8 +105,6 @@ class FileScanOperatorX final : public ScanOperatorX { Status prepare(RuntimeState* state) override; - bool is_file_scan_operator() const override { return true; } - // There's only one scan range for each backend in batch split mode. Each backend only starts up one ScanNode instance. int parallelism(RuntimeState* state) const override { return _batch_split_mode ? 1 : ScanOperatorX::parallelism(state); diff --git a/be/src/pipeline/exec/mock_scan_operator.h b/be/src/pipeline/exec/mock_scan_operator.h index 9a7c51952ee219..65e6cd32782f4a 100644 --- a/be/src/pipeline/exec/mock_scan_operator.h +++ b/be/src/pipeline/exec/mock_scan_operator.h @@ -33,13 +33,49 @@ class MockScanLocalState final : public ScanLocalState { bool _is_key_column(const std::string& col_name) override { return true; } private: - PushDownType _should_push_down_bloom_filter() override { return PushDownType::ACCEPTABLE; } + PushDownType _should_push_down_bloom_filter() const override { + return PushDownType::ACCEPTABLE; + } - PushDownType _should_push_down_bitmap_filter() override { return PushDownType::ACCEPTABLE; } - - PushDownType _should_push_down_is_null_predicate() override { return PushDownType::ACCEPTABLE; } + PushDownType _should_push_down_bitmap_filter() const override { + return PushDownType::ACCEPTABLE; + } bool _should_push_down_common_expr() override { return true; } + PushDownType _should_push_down_topn_filter() const override { return PushDownType::ACCEPTABLE; } + + PushDownType _should_push_down_is_null_predicate( + vectorized::VectorizedFnCall* fn_call) const override { + return fn_call->fn().name.function_name == "is_null_pred" || + fn_call->fn().name.function_name == "is_not_null_pred" + ? PushDownType::ACCEPTABLE + : PushDownType::UNACCEPTABLE; + } + PushDownType _should_push_down_in_predicate() const override { + return PushDownType::ACCEPTABLE; + } + PushDownType _should_push_down_binary_predicate( + vectorized::VectorizedFnCall* fn_call, vectorized::VExprContext* expr_ctx, + StringRef* constant_val, const std::set fn_name) const override { + if (!fn_name.contains(fn_call->fn().name.function_name)) { + return PushDownType::UNACCEPTABLE; + } + DCHECK(constant_val->data == nullptr) << "constant_val should not have a value"; + const auto& children = fn_call->children(); + DCHECK(children.size() == 2); + DCHECK_EQ(children[0]->node_type(), TExprNodeType::SLOT_REF); + if (children[1]->is_constant()) { + std::shared_ptr const_col_wrapper; + THROW_IF_ERROR(children[1]->get_const_col(expr_ctx, &const_col_wrapper)); + const auto* const_column = assert_cast( + const_col_wrapper->column_ptr.get()); + *constant_val = const_column->get_data_at(0); + return PushDownType::ACCEPTABLE; + } else { + // only handle constant value + return PushDownType::UNACCEPTABLE; + } + } }; class MockScanOperatorX final : public ScanOperatorX { diff --git a/be/src/pipeline/exec/olap_scan_operator.cpp b/be/src/pipeline/exec/olap_scan_operator.cpp index 53b3d7e76d9dfb..a499ed8bd3fcfe 100644 --- a/be/src/pipeline/exec/olap_scan_operator.cpp +++ b/be/src/pipeline/exec/olap_scan_operator.cpp @@ -83,6 +83,29 @@ Status OlapScanLocalState::init(RuntimeState* state, LocalStateInfo& info) { return Status::OK(); } +PushDownType OlapScanLocalState::_should_push_down_binary_predicate( + vectorized::VectorizedFnCall* fn_call, vectorized::VExprContext* expr_ctx, + StringRef* constant_val, const std::set fn_name) const { + if (!fn_name.contains(fn_call->fn().name.function_name)) { + return PushDownType::UNACCEPTABLE; + } + DCHECK(constant_val->data == nullptr) << "constant_val should not have a value"; + const auto& children = fn_call->children(); + DCHECK(children.size() == 2); + DCHECK_EQ(children[0]->node_type(), TExprNodeType::SLOT_REF); + if (children[1]->is_constant()) { + std::shared_ptr const_col_wrapper; + THROW_IF_ERROR(children[1]->get_const_col(expr_ctx, &const_col_wrapper)); + const auto* const_column = + assert_cast(const_col_wrapper->column_ptr.get()); + *constant_val = const_column->get_data_at(0); + return PushDownType::ACCEPTABLE; + } else { + // only handle constant value + return PushDownType::UNACCEPTABLE; + } +} + Status OlapScanLocalState::_init_profile() { RETURN_IF_ERROR(ScanLocalState::_init_profile()); // Rows read from storage. @@ -423,19 +446,6 @@ Status OlapScanLocalState::_init_scanners(std::list* sc return Status::OK(); } SCOPED_TIMER(_scanner_init_timer); - - if (!_conjuncts.empty() && _state->enable_profile()) { - std::string message; - for (auto& conjunct : _conjuncts) { - if (conjunct->root()) { - if (!message.empty()) { - message += ", "; - } - message += conjunct->root()->debug_string(); - } - } - custom_profile()->add_info_string("RemainedDownPredicates", message); - } auto& p = _parent->cast(); for (auto uid : p._olap_scan_node.output_column_unique_ids) { @@ -807,32 +817,6 @@ void OlapScanLocalState::set_scan_ranges(RuntimeState* state, } } -static std::string olap_filter_to_string(const doris::TCondition& condition) { - auto op_name = condition.condition_op; - if (condition.condition_op == "*=") { - op_name = "IN"; - } else if (condition.condition_op == "!*=") { - op_name = "NOT IN"; - } - return fmt::format("{{{} {} {}}}", condition.column_name, op_name, - condition.condition_values.size() > 128 - ? "[more than 128 elements]" - : to_string(condition.condition_values)); -} - -static std::string olap_filters_to_string(const std::vector>& filters) { - std::string filters_string; - filters_string += "["; - for (auto it = filters.cbegin(); it != filters.cend(); it++) { - if (it != filters.cbegin()) { - filters_string += ", "; - } - filters_string += olap_filter_to_string(it->filter); - } - filters_string += "]"; - return filters_string; -} - static std::string tablets_id_to_string( const std::vector>& scan_ranges) { if (scan_ranges.empty()) { @@ -881,10 +865,17 @@ Status OlapScanLocalState::_build_key_ranges_and_filters() { for (int column_index = 0; column_index < column_names.size() && !_scan_keys.has_range_value() && !eos && !should_break; ++column_index) { - auto iter = _colname_to_value_range.find(column_names[column_index]); - if (_colname_to_value_range.end() == iter) { + if (p._colname_to_slot_id.find(column_names[column_index]) == + p._colname_to_slot_id.end()) { + break; + } + auto iter = + _slot_id_to_value_range.find(p._colname_to_slot_id[column_names[column_index]]); + if (_slot_id_to_value_range.end() == iter) { break; } + DCHECK(_slot_id_to_predicates.count(iter->first) > 0); + const auto& value_range = iter->second; RETURN_IF_ERROR(std::visit( [&](auto&& range) { @@ -897,11 +888,25 @@ Status OlapScanLocalState::_build_key_ranges_and_filters() { _scan_keys.extend_scan_key(temp_range, p._max_scan_key_num, &exact_range, &eos, &should_break)); if (exact_range) { - _colname_to_value_range.erase(iter->first); + auto key = iter->first; + _slot_id_to_value_range.erase(key); + + std::vector> new_predicates; + for (const auto& it : _slot_id_to_predicates[key]) { + if (it->type() == PredicateType::NOT_IN_LIST || + it->type() == PredicateType::NE) { + new_predicates.push_back(it); + } + } + if (new_predicates.empty()) { + _slot_id_to_predicates.erase(key); + } else { + _slot_id_to_predicates[key] = new_predicates; + } } } else { // if exceed max_pushdown_conditions_per_column, use whole_value_rang instead - // and will not erase from _colname_to_value_range, it must be not exact_range + // and will not erase from _slot_id_to_value_range, it must be not exact_range temp_range.set_whole_value_range(); RETURN_IF_ERROR( _scan_keys.extend_scan_key(temp_range, p._max_scan_key_num, @@ -909,35 +914,18 @@ Status OlapScanLocalState::_build_key_ranges_and_filters() { } return Status::OK(); }, - iter->second)); + value_range)); } if (eos) { _eos = true; _scan_dependency->set_ready(); } - - for (auto& iter : _colname_to_value_range) { - std::vector> filters; - std::visit([&](auto&& range) { range.to_olap_filter(filters); }, iter.second); - - for (const auto& filter : filters) { - _olap_filters.emplace_back(filter); - } - } - - // Append value ranges in "_not_in_value_ranges" - for (auto& range : _not_in_value_ranges) { - std::visit([&](auto&& the_range) { the_range.to_in_condition(_olap_filters, false); }, - range); - } } else { custom_profile()->add_info_string("PushDownAggregate", push_down_agg_to_string(p._push_down_agg_type)); } if (state()->enable_profile()) { - custom_profile()->add_info_string("PushDownPredicates", - olap_filters_to_string(_olap_filters)); custom_profile()->add_info_string("KeyRanges", _scan_keys.debug_string()); custom_profile()->add_info_string("TabletIds", tablets_id_to_string(_scan_ranges)); } @@ -961,6 +949,21 @@ OlapScanOperatorX::OlapScanOperatorX(ObjectPool* pool, const TPlanNode& tnode, i << ", sort_limit: " << _olap_scan_node.sort_limit << ", isset.sort_limit: " << _olap_scan_node.__isset.sort_limit; }) + + if (_olap_scan_node.__isset.columns_desc && !_olap_scan_node.columns_desc.empty() && + _olap_scan_node.columns_desc[0].col_unique_id >= 0) { + _tablet_schema = std::make_shared(); + _tablet_schema->clear_columns(); + for (const auto& column_desc : _olap_scan_node.columns_desc) { + _tablet_schema->append_column(TabletColumn(column_desc)); + } + if (_olap_scan_node.__isset.schema_version) { + _tablet_schema->set_schema_version(_olap_scan_node.schema_version); + } + if (_olap_scan_node.__isset.indexes_desc) { + _tablet_schema->update_indexes_from_thrift(_olap_scan_node.indexes_desc); + } + } } #include "common/compile_check_end.h" diff --git a/be/src/pipeline/exec/olap_scan_operator.h b/be/src/pipeline/exec/olap_scan_operator.h index c97f71a0113e4b..8d1fb44a0415f6 100644 --- a/be/src/pipeline/exec/olap_scan_operator.h +++ b/be/src/pipeline/exec/olap_scan_operator.h @@ -78,11 +78,28 @@ class OlapScanLocalState final : public ScanLocalState { doris::FunctionContext** fn_ctx, PushDownType& pdt) override; - PushDownType _should_push_down_bloom_filter() override { return PushDownType::ACCEPTABLE; } + PushDownType _should_push_down_bloom_filter() const override { + return PushDownType::ACCEPTABLE; + } + PushDownType _should_push_down_topn_filter() const override { return PushDownType::ACCEPTABLE; } - PushDownType _should_push_down_bitmap_filter() override { return PushDownType::ACCEPTABLE; } + PushDownType _should_push_down_bitmap_filter() const override { + return PushDownType::ACCEPTABLE; + } - PushDownType _should_push_down_is_null_predicate() override { return PushDownType::ACCEPTABLE; } + PushDownType _should_push_down_is_null_predicate( + vectorized::VectorizedFnCall* fn_call) const override { + return fn_call->fn().name.function_name == "is_null_pred" || + fn_call->fn().name.function_name == "is_not_null_pred" + ? PushDownType::ACCEPTABLE + : PushDownType::UNACCEPTABLE; + } + PushDownType _should_push_down_in_predicate() const override { + return PushDownType::ACCEPTABLE; + } + PushDownType _should_push_down_binary_predicate( + vectorized::VectorizedFnCall* fn_call, vectorized::VExprContext* expr_ctx, + StringRef* constant_val, const std::set fn_name) const override; bool _should_push_down_common_expr() override; @@ -92,6 +109,11 @@ class OlapScanLocalState final : public ScanLocalState { if (!predicate.target_is_slot(_parent->node_id())) { return false; } + if (!olap_scan_node().__isset.columns_desc || olap_scan_node().columns_desc.empty() || + olap_scan_node().columns_desc[0].col_unique_id < 0) { + // Disable topN filter if there is no schema info + return false; + } return _is_key_column(predicate.get_col_name(_parent->node_id())); } @@ -109,7 +131,6 @@ class OlapScanLocalState final : public ScanLocalState { std::atomic_bool _sync_tablet = false; std::vector> _cond_ranges; OlapScanKeys _scan_keys; - std::vector> _olap_filters; // If column id in this set, indicate that we need to read data after index filtering std::set _output_column_ids; @@ -295,10 +316,19 @@ class OlapScanOperatorX final : public ScanOperatorX { const DescriptorTbl& descs, int parallel_tasks, const TQueryCacheParam& cache_param); + int get_column_id(const std::string& col_name) const override { + if (!_tablet_schema) { + return -1; + } + const auto& column = *DORIS_TRY(_tablet_schema->column(col_name)); + return _tablet_schema->field_index(column.unique_id()); + } + private: friend class OlapScanLocalState; TOlapScanNode _olap_scan_node; TQueryCacheParam _cache_param; + TabletSchemaSPtr _tablet_schema; }; #include "common/compile_check_end.h" diff --git a/be/src/pipeline/exec/scan_operator.cpp b/be/src/pipeline/exec/scan_operator.cpp index c6ca051adec23b..216a8c9d95e963 100644 --- a/be/src/pipeline/exec/scan_operator.cpp +++ b/be/src/pipeline/exec/scan_operator.cpp @@ -25,6 +25,8 @@ #include #include "common/global_types.h" +#include "olap/null_predicate.h" +#include "olap/predicate_creator.h" #include "pipeline/exec/es_scan_operator.h" #include "pipeline/exec/file_scan_operator.h" #include "pipeline/exec/group_commit_scan_operator.h" @@ -165,6 +167,23 @@ Status ScanLocalState::open(RuntimeState* state) { return status; } +static std::string predicates_to_string( + const phmap::flat_hash_map>>& + slot_id_to_predicates) { + fmt::memory_buffer debug_string_buffer; + for (const auto& [slot_id, predicates] : slot_id_to_predicates) { + if (predicates.empty()) { + continue; + } + fmt::format_to(debug_string_buffer, "Slot ID: {}: [", slot_id); + for (const auto& predicate : predicates) { + fmt::format_to(debug_string_buffer, "{{{}}}, ", predicate->debug_string()); + } + fmt::format_to(debug_string_buffer, "] "); + } + return fmt::to_string(debug_string_buffer); +} + template Status ScanLocalState::_normalize_conjuncts(RuntimeState* state) { auto& p = _parent->cast(); @@ -178,7 +197,7 @@ Status ScanLocalState::_normalize_conjuncts(RuntimeState* state) { ColumnValueRange range(slot->col_name(), slot->is_nullable(), \ cast_set(type_desc->get_precision()), \ cast_set(type_desc->get_scale())); \ - _slot_id_to_value_range[slot->id()] = std::pair {slot, range}; \ + _slot_id_to_value_range[slot->id()] = std::move(range); \ break; \ } #define APPLY_FOR_PRIMITIVE_TYPE(M) \ @@ -227,11 +246,16 @@ Status ScanLocalState::_normalize_conjuncts(RuntimeState* state) { } } init_value_range(slot, slot->type()); + _slot_id_to_predicates.insert( + {slot->id(), std::vector>()}); } get_cast_types_for_variants(); for (const auto& [colname, type] : _cast_types_for_variants) { init_value_range(p._slot_id_to_slot_desc[p._colname_to_slot_id[colname]], type); + _slot_id_to_predicates.insert( + {p._slot_id_to_slot_desc[p._colname_to_slot_id[colname]]->id(), + std::vector>()}); } RETURN_IF_ERROR(_get_topn_filters(state)); @@ -240,7 +264,7 @@ Status ScanLocalState::_normalize_conjuncts(RuntimeState* state) { auto& conjunct = *it; if (conjunct->root()) { vectorized::VExprSPtr new_root; - RETURN_IF_ERROR(_normalize_predicate(conjunct->root(), conjunct.get(), new_root)); + RETURN_IF_ERROR(_normalize_predicate(conjunct.get(), conjunct->root(), new_root)); if (new_root) { conjunct->set_root(new_root); if (_should_push_down_common_expr() && @@ -257,6 +281,22 @@ Status ScanLocalState::_normalize_conjuncts(RuntimeState* state) { } ++it; } + + if (state->enable_profile()) { + custom_profile()->add_info_string("PushDownPredicates", + predicates_to_string(_slot_id_to_predicates)); + std::string message; + for (auto& conjunct : _conjuncts) { + if (conjunct->root()) { + if (!message.empty()) { + message += ", "; + } + message += conjunct->root()->debug_string(); + } + } + custom_profile()->add_info_string("RemainedDownPredicates", message); + } + for (auto& it : _slot_id_to_value_range) { std::visit( [&](auto&& range) { @@ -265,201 +305,215 @@ Status ScanLocalState::_normalize_conjuncts(RuntimeState* state) { _scan_dependency->set_ready(); } }, - it.second.second); - _colname_to_value_range[it.second.first->col_name()] = it.second.second; + it.second); } return Status::OK(); } template -Status ScanLocalState::_normalize_predicate( - const vectorized::VExprSPtr& conjunct_expr_root, vectorized::VExprContext* context, - vectorized::VExprSPtr& output_expr) { - static constexpr auto is_leaf = [](auto&& expr) { return !expr->is_and_expr(); }; - auto in_predicate_checker = [](const vectorized::VExprSPtrs& children, - std::shared_ptr& slot, - vectorized::VExprSPtr& child_contains_slot) { - if (children.empty() || vectorized::VExpr::expr_without_cast(children[0])->node_type() != - TExprNodeType::SLOT_REF) { +Status ScanLocalState::_normalize_predicate(vectorized::VExprContext* context, + const vectorized::VExprSPtr& root, + vectorized::VExprSPtr& output_expr) { + auto expr_root = root->is_rf_wrapper() ? root->get_impl() : root; + PushDownType pdt = PushDownType::UNACCEPTABLE; + if (dynamic_cast(expr_root.get())) { + // If the expr has virtual slot ref, we need to keep it in the tree. + output_expr = expr_root; + return Status::OK(); + } + + SlotDescriptor* slot = nullptr; + ColumnValueRangeType* range = nullptr; + RETURN_IF_ERROR(_eval_const_conjuncts(context, &pdt)); + if (pdt == PushDownType::ACCEPTABLE) { + output_expr = nullptr; + return Status::OK(); + } + std::shared_ptr slotref; + for (const auto& child : expr_root->children()) { + if (vectorized::VExpr::expr_without_cast(child)->node_type() != TExprNodeType::SLOT_REF) { // not a slot ref(column) - return false; - } - slot = std::dynamic_pointer_cast( - vectorized::VExpr::expr_without_cast(children[0])); - child_contains_slot = children[0]; - return true; - }; - auto eq_predicate_checker = [](const vectorized::VExprSPtrs& children, - std::shared_ptr& slot, - vectorized::VExprSPtr& child_contains_slot) { - for (const auto& child : children) { - if (vectorized::VExpr::expr_without_cast(child)->node_type() != - TExprNodeType::SLOT_REF) { - // not a slot ref(column) - continue; - } - slot = std::dynamic_pointer_cast( - vectorized::VExpr::expr_without_cast(child)); - CHECK(slot != nullptr); - child_contains_slot = child; - return true; + continue; } - return false; - }; - - if (conjunct_expr_root != nullptr) { - if (is_leaf(conjunct_expr_root)) { - auto impl = conjunct_expr_root->get_impl(); - // If impl is not null, which means this is a conjunct from runtime filter. - vectorized::VExpr* cur_expr = impl ? impl.get() : conjunct_expr_root.get(); - if (dynamic_cast(cur_expr)) { - // If the expr has virtual slot ref, we need to keep it in the tree. - output_expr = conjunct_expr_root; - return Status::OK(); - } - - SlotDescriptor* slot = nullptr; - ColumnValueRangeType* range = nullptr; - PushDownType pdt = PushDownType::UNACCEPTABLE; - RETURN_IF_ERROR(_eval_const_conjuncts(cur_expr, context, &pdt)); - if (pdt == PushDownType::ACCEPTABLE) { - output_expr = nullptr; - return Status::OK(); - } - std::shared_ptr slotref; - for (const auto& child : cur_expr->children()) { - if (vectorized::VExpr::expr_without_cast(child)->node_type() != - TExprNodeType::SLOT_REF) { - // not a slot ref(column) - continue; - } - slotref = std::dynamic_pointer_cast( - vectorized::VExpr::expr_without_cast(child)); - } - if (_is_predicate_acting_on_slot(cur_expr, in_predicate_checker, &slot, &range) || - _is_predicate_acting_on_slot(cur_expr, eq_predicate_checker, &slot, &range)) { - Status status = Status::OK(); - std::visit( - [&](auto& value_range) { - bool need_set_runtime_filter_id = value_range.is_whole_value_range() && - conjunct_expr_root->is_rf_wrapper(); - Defer set_runtime_filter_id {[&]() { - // rf predicates is always appended to the end of conjuncts. We need to ensure that there is no non-rf predicate after rf-predicate - // If it is not a whole range, it means that the column has other non-rf predicates, so it cannot be marked as rf predicate. - // If the range where non-rf predicates are located is incorrectly marked as rf, can_ignore will return true, resulting in the predicate not taking effect and getting an incorrect result. - if (need_set_runtime_filter_id) { - auto* rf_expr = assert_cast( - conjunct_expr_root.get()); - DCHECK(rf_expr->predicate_filtered_rows_counter() != nullptr); - DCHECK(rf_expr->predicate_input_rows_counter() != nullptr); - value_range.attach_profile_counter( - rf_expr->filter_id(), - rf_expr->predicate_filtered_rows_counter(), - rf_expr->predicate_input_rows_counter(), - rf_expr->predicate_always_true_rows_counter()); - } - }}; - RETURN_IF_PUSH_DOWN(_normalize_in_and_eq_predicate( - cur_expr, context, slot, value_range, &pdt), - status); - RETURN_IF_PUSH_DOWN(_normalize_not_in_and_not_eq_predicate( - cur_expr, context, slot, value_range, &pdt), - status); - RETURN_IF_PUSH_DOWN(_normalize_is_null_predicate( - cur_expr, context, slot, value_range, &pdt), - status); - RETURN_IF_PUSH_DOWN(_normalize_noneq_binary_predicate( - cur_expr, context, slot, value_range, &pdt), - status); + slotref = std::dynamic_pointer_cast( + vectorized::VExpr::expr_without_cast(child)); + } + if (_is_predicate_acting_on_slot(expr_root->children(), &slot, &range)) { + Status status = Status::OK(); + std::visit( + [&](auto& value_range) { + auto expr = root->is_rf_wrapper() ? root->get_impl() : root; + { + Defer attach_defer = [&]() { + if (pdt != PushDownType::UNACCEPTABLE && root->is_rf_wrapper()) { + auto* rf_expr = + assert_cast(root.get()); + _slot_id_to_predicates[slot->id()].back()->attach_profile_counter( + rf_expr->filter_id(), + rf_expr->predicate_filtered_rows_counter(), + rf_expr->predicate_input_rows_counter(), + rf_expr->predicate_always_true_rows_counter()); + } + }; + switch (expr->node_type()) { + case TExprNodeType::IN_PRED: RETURN_IF_PUSH_DOWN( - _normalize_bitmap_filter(cur_expr, context, slot, &pdt), + _normalize_in_predicate(context, expr, slot, + _slot_id_to_predicates[slot->id()], + value_range, &pdt), status); + break; + case TExprNodeType::BINARY_PRED: RETURN_IF_PUSH_DOWN( - _normalize_bloom_filter(cur_expr, context, slot, &pdt), status); - if (state()->enable_function_pushdown()) { + _normalize_binary_predicate(context, expr, slot, + _slot_id_to_predicates[slot->id()], + value_range, &pdt), + status); + break; + case TExprNodeType::FUNCTION_CALL: + if (expr->is_topn_filter()) { RETURN_IF_PUSH_DOWN( - _normalize_function_filters(cur_expr, context, slot, &pdt), + _normalize_topn_filter(context, expr, slot, + _slot_id_to_predicates[slot->id()], + &pdt), status); + } else { + RETURN_IF_PUSH_DOWN(_normalize_is_null_predicate( + context, expr, slot, + _slot_id_to_predicates[slot->id()], + value_range, &pdt), + status); } - }, - *range); - RETURN_IF_ERROR(status); - } - if (pdt == PushDownType::ACCEPTABLE && slotref != nullptr && - slotref->data_type()->get_primitive_type() == PrimitiveType::TYPE_VARIANT) { - // remaining it in the expr tree, in order to filter by function if the pushdown - // predicate is not applied - output_expr = conjunct_expr_root; // remaining in conjunct tree - return Status::OK(); - } + break; + case TExprNodeType::BITMAP_PRED: + RETURN_IF_PUSH_DOWN(_normalize_bitmap_filter( + context, root, slot, + _slot_id_to_predicates[slot->id()], &pdt), + status); + break; + case TExprNodeType::BLOOM_PRED: + RETURN_IF_PUSH_DOWN(_normalize_bloom_filter( + context, root, slot, + _slot_id_to_predicates[slot->id()], &pdt), + status); + break; + default: + break; + } + } + // `node_type` of function filter is FUNCTION_CALL or COMPOUND_PRED + if (state()->enable_function_pushdown()) { + RETURN_IF_PUSH_DOWN(_normalize_function_filters(context, slot, &pdt), + status); + } + }, + *range); + RETURN_IF_ERROR(status); + } + if (pdt == PushDownType::ACCEPTABLE && slotref != nullptr && + slotref->data_type()->get_primitive_type() == PrimitiveType::TYPE_VARIANT) { + // remaining it in the expr tree, in order to filter by function if the pushdown + // predicate is not applied + output_expr = expr_root; // remaining in conjunct tree + return Status::OK(); + } - if (pdt == PushDownType::ACCEPTABLE && (_is_key_column(slot->col_name()))) { - output_expr = nullptr; - return Status::OK(); - } else { - // for PARTIAL_ACCEPTABLE and UNACCEPTABLE, do not remove expr from the tree - output_expr = conjunct_expr_root; - return Status::OK(); - } - } else { - return Status::InternalError("conjunct root should not and expr, but now {}", - conjunct_expr_root->debug_string()); + if (pdt == PushDownType::ACCEPTABLE && (_is_key_column(slot->col_name()))) { + output_expr = nullptr; + return Status::OK(); + } else { + // for PARTIAL_ACCEPTABLE and UNACCEPTABLE, do not remove expr from the tree + output_expr = root; + return Status::OK(); + } + output_expr = root; + return Status::OK(); +} + +template +Status ScanLocalState::_normalize_bloom_filter( + vectorized::VExprContext* expr_ctx, const vectorized::VExprSPtr& root, SlotDescriptor* slot, + std::vector>& predicates, PushDownType* pdt) { + std::shared_ptr pred = nullptr; + Defer defer = [&]() { + if (pred) { + DCHECK(*pdt != PushDownType::UNACCEPTABLE) << root->debug_string(); + predicates.emplace_back(pred); } + }; + DCHECK(TExprNodeType::BLOOM_PRED == root->node_type()); + auto expr = root->is_rf_wrapper() ? root->get_impl() : root; + DCHECK(expr->get_num_children() == 1); + DCHECK(root->is_rf_wrapper()); + *pdt = _should_push_down_bloom_filter(); + if (*pdt != PushDownType::UNACCEPTABLE) { + pred = create_bloom_filter_predicate( + _parent->intermediate_row_desc().get_column_id(slot->id()), slot->col_name(), + slot->type()->get_primitive_type() == TYPE_VARIANT ? expr->get_child(0)->data_type() + : slot->type(), + expr->get_bloom_filter_func()); } - output_expr = conjunct_expr_root; return Status::OK(); } template -Status ScanLocalState::_normalize_bloom_filter(vectorized::VExpr* expr, - vectorized::VExprContext* expr_ctx, - SlotDescriptor* slot, PushDownType* pdt) { - if (TExprNodeType::BLOOM_PRED == expr->node_type()) { - DCHECK(expr->get_num_children() == 1); - DCHECK(expr_ctx->root()->is_rf_wrapper()); - PushDownType temp_pdt = _should_push_down_bloom_filter(); - if (temp_pdt != PushDownType::UNACCEPTABLE) { - auto* rf_expr = assert_cast(expr_ctx->root().get()); - _filter_predicates.bloom_filters.emplace_back( - slot->col_name(), expr->get_bloom_filter_func(), rf_expr->filter_id(), - rf_expr->predicate_filtered_rows_counter(), - rf_expr->predicate_input_rows_counter(), - rf_expr->predicate_always_true_rows_counter()); - *pdt = temp_pdt; +Status ScanLocalState::_normalize_topn_filter( + vectorized::VExprContext* expr_ctx, const vectorized::VExprSPtr& root, SlotDescriptor* slot, + std::vector>& predicates, PushDownType* pdt) { + std::shared_ptr pred = nullptr; + Defer defer = [&]() { + if (pred) { + DCHECK(*pdt != PushDownType::UNACCEPTABLE) << root->debug_string(); + predicates.emplace_back(pred); + } + }; + DCHECK(root->is_topn_filter()); + *pdt = _should_push_down_topn_filter(); + if (*pdt != PushDownType::UNACCEPTABLE) { + auto& p = _parent->cast(); + auto& tmp = _state->get_query_ctx()->get_runtime_predicate( + assert_cast(root.get())->source_node_id()); + if (_push_down_topn(tmp)) { + pred = tmp.get_predicate(p.node_id()); } } return Status::OK(); } template -Status ScanLocalState::_normalize_bitmap_filter(vectorized::VExpr* expr, - vectorized::VExprContext* expr_ctx, - SlotDescriptor* slot, PushDownType* pdt) { - if (TExprNodeType::BITMAP_PRED == expr->node_type()) { - DCHECK(expr->get_num_children() == 1); - DCHECK(expr_ctx->root()->is_rf_wrapper()); - PushDownType temp_pdt = _should_push_down_bitmap_filter(); - if (temp_pdt != PushDownType::UNACCEPTABLE) { - auto* rf_expr = assert_cast(expr_ctx->root().get()); - _filter_predicates.bitmap_filters.emplace_back( - slot->col_name(), expr->get_bitmap_filter_func(), rf_expr->filter_id(), - rf_expr->predicate_filtered_rows_counter(), - rf_expr->predicate_input_rows_counter(), - rf_expr->predicate_always_true_rows_counter()); - *pdt = temp_pdt; +Status ScanLocalState::_normalize_bitmap_filter( + vectorized::VExprContext* expr_ctx, const vectorized::VExprSPtr& root, SlotDescriptor* slot, + std::vector>& predicates, PushDownType* pdt) { + std::shared_ptr pred = nullptr; + Defer defer = [&]() { + if (pred) { + DCHECK(*pdt != PushDownType::UNACCEPTABLE) << root->debug_string(); + predicates.emplace_back(pred); } + }; + DCHECK(TExprNodeType::BITMAP_PRED == root->node_type()); + auto expr = root->is_rf_wrapper() ? root->get_impl() : root; + *pdt = _should_push_down_bitmap_filter(); + if (*pdt != PushDownType::UNACCEPTABLE) { + DCHECK(expr->get_num_children() == 1); + DCHECK(root->is_rf_wrapper()); + pred = create_bitmap_filter_predicate( + _parent->intermediate_row_desc().get_column_id(slot->id()), slot->col_name(), + slot->type()->get_primitive_type() == TYPE_VARIANT ? expr->get_child(0)->data_type() + : slot->type(), + expr->get_bitmap_filter_func()); } return Status::OK(); } template -Status ScanLocalState::_normalize_function_filters(vectorized::VExpr* expr, - vectorized::VExprContext* expr_ctx, +Status ScanLocalState::_normalize_function_filters(vectorized::VExprContext* expr_ctx, SlotDescriptor* slot, PushDownType* pdt) { + auto expr = expr_ctx->root()->is_rf_wrapper() ? expr_ctx->root()->get_impl() : expr_ctx->root(); bool opposite = false; - vectorized::VExpr* fn_expr = expr; + vectorized::VExpr* fn_expr = expr.get(); if (TExprNodeType::COMPOUND_PRED == expr->node_type() && expr->fn().name.function_name == "not") { fn_expr = fn_expr->children()[0].get(); @@ -471,7 +525,7 @@ Status ScanLocalState::_normalize_function_filters(vectorized::VExpr* e StringRef val; PushDownType temp_pdt; RETURN_IF_ERROR(_should_push_down_function_filter( - reinterpret_cast(fn_expr), expr_ctx, &val, &fn_ctx, + assert_cast(fn_expr), expr_ctx, &val, &fn_ctx, temp_pdt)); if (temp_pdt != PushDownType::UNACCEPTABLE) { std::string col = slot->col_name(); @@ -483,55 +537,26 @@ Status ScanLocalState::_normalize_function_filters(vectorized::VExpr* e } template -bool ScanLocalState::_is_predicate_acting_on_slot( - vectorized::VExpr* expr, - const std::function&, vectorized::VExprSPtr&)>& - checker, - SlotDescriptor** slot_desc, ColumnValueRangeType** range) { - std::shared_ptr slot_ref; - vectorized::VExprSPtr child_contains_slot; - if (!checker(expr->children(), slot_ref, child_contains_slot)) { +bool ScanLocalState::_is_predicate_acting_on_slot(const vectorized::VExprSPtrs& children, + SlotDescriptor** slot_desc, + ColumnValueRangeType** range) { + if (children.empty() || children[0]->node_type() != TExprNodeType::SLOT_REF) { // not a slot ref(column) return false; } - - // slot_ref is a specific expr - // child_contains_slot may include a cast expr - - auto entry = _slot_id_to_value_range.find(slot_ref->slot_id()); - if (_slot_id_to_value_range.end() == entry) { + std::shared_ptr slot_ref = + std::dynamic_pointer_cast(children[0]); + *slot_desc = + _parent->cast()._slot_id_to_slot_desc[slot_ref->slot_id()]; + auto entry = _slot_id_to_predicates.find(slot_ref->slot_id()); + if (_slot_id_to_predicates.end() == entry) { return false; } - // if the slot is a complex type(array/map/struct), we do not push down the predicate, because - // we delete pack these type into predict column, and origin pack action is wrong. we should - // make sense to push down this complex type after we delete predict column. - if (is_complex_type(slot_ref->data_type()->get_primitive_type())) { + auto sid_to_range = _slot_id_to_value_range.find(slot_ref->slot_id()); + if (_slot_id_to_value_range.end() == sid_to_range) { return false; } - *slot_desc = entry->second.first; - DCHECK(child_contains_slot != nullptr); - if (child_contains_slot->data_type()->get_primitive_type() != - (*slot_desc)->type()->get_primitive_type() || - child_contains_slot->data_type()->get_precision() != - (*slot_desc)->type()->get_precision() || - child_contains_slot->data_type()->get_scale() != (*slot_desc)->type()->get_scale()) { - if (!_ignore_cast(*slot_desc, child_contains_slot.get())) { - // the type of predicate not match the slot's type - return false; - } - } else if ((child_contains_slot->data_type()->get_primitive_type() == - PrimitiveType::TYPE_DATETIME || - child_contains_slot->data_type()->get_primitive_type() == - PrimitiveType::TYPE_DATETIMEV2 || - child_contains_slot->data_type()->get_primitive_type() == - PrimitiveType::TYPE_TIMESTAMPTZ) && - child_contains_slot->node_type() == doris::TExprNodeType::CAST_EXPR) { - // Expr `CAST(CAST(datetime_col AS DATE) AS DATETIME) = datetime_literal` should not be - // push down. - return false; - } - *range = &(entry->second.second); + *range = &(sid_to_range->second); return true; } @@ -553,39 +578,10 @@ std::string ScanLocalState::debug_string(int indentation_level) const { } template -bool ScanLocalState::_ignore_cast(SlotDescriptor* slot, vectorized::VExpr* expr) { - if (is_string_type(slot->type()->get_primitive_type()) && - is_string_type(expr->data_type()->get_primitive_type())) { - return true; - } - // only one level cast expr could push down for variant type - // check if expr is cast and it's children is slot - if (slot->type()->get_primitive_type() == PrimitiveType::TYPE_VARIANT) { - return expr->node_type() == TExprNodeType::CAST_EXPR && - expr->children().at(0)->is_slot_ref(); - } - if (slot->type()->get_primitive_type() == PrimitiveType::TYPE_ARRAY) { - if (assert_cast( - vectorized::remove_nullable(slot->type()).get()) - ->get_nested_type() - ->get_primitive_type() == expr->data_type()->get_primitive_type()) { - return true; - } - if (is_string_type(assert_cast( - vectorized::remove_nullable(slot->type()).get()) - ->get_nested_type() - ->get_primitive_type()) && - is_string_type(expr->data_type()->get_primitive_type())) { - return true; - } - } - return false; -} - -template -Status ScanLocalState::_eval_const_conjuncts(vectorized::VExpr* vexpr, - vectorized::VExprContext* expr_ctx, +Status ScanLocalState::_eval_const_conjuncts(vectorized::VExprContext* expr_ctx, PushDownType* pdt) { + auto vexpr = + expr_ctx->root()->is_rf_wrapper() ? expr_ctx->root()->get_impl() : expr_ctx->root(); // Used to handle constant expressions, such as '1 = 1' _eval_const_conjuncts does not handle cases like 'colA = 1' const char* constant_val = nullptr; if (vexpr->is_constant()) { @@ -633,301 +629,242 @@ Status ScanLocalState::_eval_const_conjuncts(vectorized::VExpr* vexpr, template template -Status ScanLocalState::_normalize_in_and_eq_predicate(vectorized::VExpr* expr, - vectorized::VExprContext* expr_ctx, - SlotDescriptor* slot, - ColumnValueRange& range, - PushDownType* pdt) { - auto temp_range = ColumnValueRange::create_empty_column_value_range( - slot->is_nullable(), range.precision(), range.scale()); +Status ScanLocalState::_normalize_in_predicate( + vectorized::VExprContext* expr_ctx, const vectorized::VExprSPtr& root, SlotDescriptor* slot, + std::vector>& predicates, ColumnValueRange& range, + PushDownType* pdt) { + std::shared_ptr pred = nullptr; + Defer defer = [&]() { + if (pred) { + DCHECK(*pdt != PushDownType::UNACCEPTABLE) << root->debug_string(); + predicates.emplace_back(pred); + } + }; if (slot->get_virtual_column_expr() != nullptr) { // virtual column, do not push down return Status::OK(); } - // 1. Normalize in conjuncts like 'where col in (v1, v2, v3)' - if (TExprNodeType::IN_PRED == expr->node_type()) { - HybridSetBase::IteratorBase* iter = nullptr; - auto hybrid_set = expr->get_set_func(); - - if (hybrid_set != nullptr) { - // runtime filter produce VDirectInPredicate - if (hybrid_set->size() <= - _parent->cast()._max_pushdown_conditions_per_column) { - iter = hybrid_set->begin(); - } else { - int runtime_filter_id = -1; - std::shared_ptr predicate_filtered_rows_counter = nullptr; - std::shared_ptr predicate_input_rows_counter = nullptr; - std::shared_ptr predicate_always_true_rows_counter = - nullptr; - if (expr_ctx->root()->is_rf_wrapper()) { - auto* rf_expr = - assert_cast(expr_ctx->root().get()); - runtime_filter_id = rf_expr->filter_id(); - predicate_filtered_rows_counter = rf_expr->predicate_filtered_rows_counter(); - predicate_input_rows_counter = rf_expr->predicate_input_rows_counter(); - predicate_always_true_rows_counter = - rf_expr->predicate_always_true_rows_counter(); - } - _filter_predicates.in_filters.emplace_back( - slot->col_name(), expr->get_set_func(), runtime_filter_id, - predicate_filtered_rows_counter, predicate_input_rows_counter, - predicate_always_true_rows_counter); - *pdt = PushDownType::ACCEPTABLE; - return Status::OK(); - } - } else { - // normal in predicate - auto* pred = static_cast(expr); - PushDownType temp_pdt = _should_push_down_in_predicate(pred, false); - if (temp_pdt == PushDownType::UNACCEPTABLE) { - return Status::OK(); - } + DCHECK(!root->is_rf_wrapper()) << root->debug_string(); + DCHECK(TExprNodeType::IN_PRED == root->node_type()) << root->debug_string(); + *pdt = _should_push_down_in_predicate(); + if (*pdt == PushDownType::UNACCEPTABLE) { + return Status::OK(); + } + HybridSetBase::IteratorBase* iter = nullptr; + auto hybrid_set = root->get_set_func(); + + auto is_in = false; + if (hybrid_set != nullptr) { + // runtime filter produce VDirectInPredicate + if (hybrid_set->size() <= + _parent->cast()._max_pushdown_conditions_per_column) { + iter = hybrid_set->begin(); + } + is_in = true; + } else { + // normal in predicate + auto* tmp = assert_cast(root.get()); - // begin to push InPredicate value into ColumnValueRange - auto* state = reinterpret_cast( - expr_ctx->fn_context(pred->fn_context_index()) - ->get_function_state(FunctionContext::FRAGMENT_LOCAL)); + // begin to push InPredicate value into ColumnValueRange + auto* state = reinterpret_cast( + expr_ctx->fn_context(tmp->fn_context_index()) + ->get_function_state(FunctionContext::FRAGMENT_LOCAL)); - // xx in (col, xx, xx) should not be push down - if (!state->use_set) { - return Status::OK(); - } + // xx in (col, xx, xx) should not be push down + if (!state->use_set) { + return Status::OK(); + } + is_in = !tmp->is_not_in(); - iter = state->hybrid_set->begin(); + if (state->hybrid_set->contain_null() && tmp->is_not_in()) { + _eos = true; + _scan_dependency->set_ready(); + return Status::OK(); } + hybrid_set = state->hybrid_set; + iter = state->hybrid_set->begin(); + } + if (iter) { + auto empty_range = ColumnValueRange::create_empty_column_value_range( + slot->is_nullable(), range.precision(), range.scale()); + auto& temp_range = is_in ? empty_range : range; + auto fn = is_in ? ColumnValueRange::add_fixed_value_range + : (range.is_fixed_value_range() + ? ColumnValueRange::remove_fixed_value_range + : ColumnValueRange::empty_function); while (iter->has_next()) { // column in (nullptr) is always false so continue to // dispose next item DCHECK(iter->get_value() != nullptr); const auto* value = iter->get_value(); - RETURN_IF_ERROR(_change_value_range( - temp_range, value, ColumnValueRange::add_fixed_value_range, "")); + RETURN_IF_ERROR( + _change_value_range(is_in, temp_range, value, fn, is_in ? "in" : "not_in")); iter->next(); } - range.intersection(temp_range); - *pdt = PushDownType::ACCEPTABLE; - } else if (TExprNodeType::BINARY_PRED == expr->node_type()) { - DCHECK(expr->get_num_children() == 2); - auto eq_checker = [](const std::string& fn_name) { return fn_name == "eq"; }; - - StringRef value; - int slot_ref_child = -1; - - PushDownType temp_pdt; - RETURN_IF_ERROR(_should_push_down_binary_predicate( - reinterpret_cast(expr), expr_ctx, &value, - &slot_ref_child, eq_checker, temp_pdt)); - if (temp_pdt == PushDownType::UNACCEPTABLE) { - return Status::OK(); - } - DCHECK(slot_ref_child >= 0); - // where A = nullptr should return empty result set - auto fn_name = std::string(""); - if (value.data != nullptr) { - if constexpr (T == TYPE_CHAR || T == TYPE_VARCHAR || T == TYPE_STRING || - T == TYPE_HLL) { - auto val = StringRef(value.data, value.size); - RETURN_IF_ERROR(_change_value_range( - temp_range, reinterpret_cast(&val), - ColumnValueRange::add_fixed_value_range, fn_name)); - } else { - if (sizeof(typename PrimitiveTypeTraits::CppType) != value.size) { - return Status::InternalError( - "PrimitiveType {} meet invalid input value size {}, expect size {}", T, - value.size, sizeof(typename PrimitiveTypeTraits::CppType)); - } - RETURN_IF_ERROR(_change_value_range( - temp_range, reinterpret_cast(value.data), - ColumnValueRange::add_fixed_value_range, fn_name)); - } + if (is_in) { range.intersection(temp_range); - } else { - _eos = true; - _scan_dependency->set_ready(); } - *pdt = temp_pdt; } - + pred = is_in ? create_in_list_predicate( + _parent->intermediate_row_desc().get_column_id(slot->id()), + slot->col_name(), + slot->type()->get_primitive_type() == TYPE_VARIANT + ? root->get_child(0)->data_type() + : slot->type(), + hybrid_set, false) + : create_in_list_predicate( + _parent->intermediate_row_desc().get_column_id(slot->id()), + slot->col_name(), + slot->type()->get_primitive_type() == TYPE_VARIANT + ? root->get_child(0)->data_type() + : slot->type(), + hybrid_set, false); return Status::OK(); } template -Status ScanLocalState::_should_push_down_binary_predicate( - vectorized::VectorizedFnCall* fn_call, vectorized::VExprContext* expr_ctx, - StringRef* constant_val, int* slot_ref_child, - const std::function& fn_checker, PushDownType& pdt) { - if (!fn_checker(fn_call->fn().name.function_name)) { - pdt = PushDownType::UNACCEPTABLE; - return Status::OK(); - } - DCHECK(constant_val->data == nullptr) << "constant_val should not have a value"; - const auto& children = fn_call->children(); - DCHECK(children.size() == 2); - for (int i = 0; i < 2; i++) { - if (vectorized::VExpr::expr_without_cast(children[i])->node_type() != - TExprNodeType::SLOT_REF) { - // not a slot ref(column) - continue; - } - if (!children[1 - i]->is_constant()) { - // only handle constant value - pdt = PushDownType::UNACCEPTABLE; - return Status::OK(); - } else { - std::shared_ptr const_col_wrapper; - RETURN_IF_ERROR(children[1 - i]->get_const_col(expr_ctx, &const_col_wrapper)); - if (const auto* const_column = check_and_get_column( - const_col_wrapper->column_ptr.get())) { - *slot_ref_child = i; - *constant_val = const_column->get_data_at(0); - } else { - pdt = PushDownType::UNACCEPTABLE; - return Status::OK(); - } +template +Status ScanLocalState::_normalize_binary_predicate( + vectorized::VExprContext* expr_ctx, const vectorized::VExprSPtr& root, SlotDescriptor* slot, + std::vector>& predicates, ColumnValueRange& range, + PushDownType* pdt) { + std::shared_ptr pred = nullptr; + Defer defer = [&]() { + if (pred) { + DCHECK(*pdt != PushDownType::UNACCEPTABLE) << root->debug_string(); + predicates.emplace_back(pred); } - } - pdt = PushDownType::ACCEPTABLE; - return Status::OK(); -} + }; -template -PushDownType ScanLocalState::_should_push_down_in_predicate(vectorized::VInPredicate* pred, - bool is_not_in) { - if (pred->is_not_in() != is_not_in) { - return PushDownType::UNACCEPTABLE; + if (slot->get_virtual_column_expr() != nullptr) { + // virtual column, do not push down + return Status::OK(); } - return PushDownType::ACCEPTABLE; -} - -template -template -Status ScanLocalState::_normalize_not_in_and_not_eq_predicate( - vectorized::VExpr* expr, vectorized::VExprContext* expr_ctx, SlotDescriptor* slot, - ColumnValueRange& range, PushDownType* pdt) { - bool is_fixed_range = range.is_fixed_value_range(); - auto not_in_range = ColumnValueRange::create_empty_column_value_range( - range.column_name(), slot->is_nullable(), range.precision(), range.scale()); - PushDownType temp_pdt = PushDownType::UNACCEPTABLE; - // 1. Normalize in conjuncts like 'where col in (v1, v2, v3)' - if (TExprNodeType::IN_PRED == expr->node_type()) { - /// `VDirectInPredicate` here should not be pushed down. - /// here means the `VDirectInPredicate` is too big to be converted into `ColumnValueRange`. - /// For non-key columns and `_storage_no_merge()` is false, this predicate should not be pushed down. - if (expr->get_set_func() != nullptr) { - *pdt = PushDownType::UNACCEPTABLE; - return Status::OK(); - } - vectorized::VInPredicate* pred = static_cast(expr); - if ((temp_pdt = _should_push_down_in_predicate(pred, true)) == PushDownType::UNACCEPTABLE) { - return Status::OK(); + DCHECK(!root->is_rf_wrapper()) << root->debug_string(); + DCHECK(TExprNodeType::BINARY_PRED == root->node_type()) << root->debug_string(); + DCHECK(root->get_num_children() == 2); + StringRef value; + *pdt = _should_push_down_binary_predicate( + assert_cast(root.get()), expr_ctx, &value, + {"eq", "ne", "lt", "gt", "le", "ge"}); + if (*pdt == PushDownType::UNACCEPTABLE) { + return Status::OK(); + } + const std::string& function_name = + assert_cast(root.get())->fn().name.function_name; + auto op = to_olap_filter_type(function_name); + auto is_equal_op = op == SQLFilterOp::FILTER_EQ || op == SQLFilterOp::FILTER_NE; + auto empty_range = ColumnValueRange::create_empty_column_value_range( + slot->is_nullable(), range.precision(), range.scale()); + auto& temp_range = op == SQLFilterOp::FILTER_EQ ? empty_range : range; + if (value.data != nullptr) { + if (!is_string_type(T) && sizeof(typename PrimitiveTypeTraits::CppType) != value.size) { + return Status::InternalError( + "PrimitiveType {} meet invalid input value size {}, expect size {}", T, + value.size, sizeof(typename PrimitiveTypeTraits::CppType)); } - - // begin to push InPredicate value into ColumnValueRange - vectorized::InState* state = reinterpret_cast( - expr_ctx->fn_context(pred->fn_context_index()) - ->get_function_state(FunctionContext::FRAGMENT_LOCAL)); - - // xx in (col, xx, xx) should not be push down - if (!state->use_set) { - return Status::OK(); + switch (op) { + case SQLFilterOp::FILTER_EQ: + pred = create_comparison_predicate0( + _parent->intermediate_row_desc().get_column_id(slot->id()), slot->col_name(), + slot->type()->get_primitive_type() == TYPE_VARIANT + ? root->get_child(0)->data_type() + : slot->type(), + value, false, _arena); + break; + case SQLFilterOp::FILTER_NE: + pred = create_comparison_predicate0( + _parent->intermediate_row_desc().get_column_id(slot->id()), slot->col_name(), + slot->type()->get_primitive_type() == TYPE_VARIANT + ? root->get_child(0)->data_type() + : slot->type(), + value, false, _arena); + break; + case SQLFilterOp::FILTER_LESS: + pred = create_comparison_predicate0( + _parent->intermediate_row_desc().get_column_id(slot->id()), slot->col_name(), + slot->type()->get_primitive_type() == TYPE_VARIANT + ? root->get_child(0)->data_type() + : slot->type(), + value, false, _arena); + break; + case SQLFilterOp::FILTER_LARGER: + pred = create_comparison_predicate0( + _parent->intermediate_row_desc().get_column_id(slot->id()), slot->col_name(), + slot->type()->get_primitive_type() == TYPE_VARIANT + ? root->get_child(0)->data_type() + : slot->type(), + value, false, _arena); + break; + case SQLFilterOp::FILTER_LESS_OR_EQUAL: + pred = create_comparison_predicate0( + _parent->intermediate_row_desc().get_column_id(slot->id()), slot->col_name(), + slot->type()->get_primitive_type() == TYPE_VARIANT + ? root->get_child(0)->data_type() + : slot->type(), + value, false, _arena); + break; + case SQLFilterOp::FILTER_LARGER_OR_EQUAL: + pred = create_comparison_predicate0( + _parent->intermediate_row_desc().get_column_id(slot->id()), slot->col_name(), + slot->type()->get_primitive_type() == TYPE_VARIANT + ? root->get_child(0)->data_type() + : slot->type(), + value, false, _arena); + break; + default: + throw Exception(Status::InternalError("Unsupported function name: {}", function_name)); } - HybridSetBase::IteratorBase* iter = state->hybrid_set->begin(); - auto fn_name = std::string(""); - if (state->hybrid_set->contain_null()) { - _eos = true; - _scan_dependency->set_ready(); - } - while (iter->has_next()) { - // column not in (nullptr) is always true - DCHECK(iter->get_value() != nullptr); - const auto value = iter->get_value(); - if (is_fixed_range) { - RETURN_IF_ERROR(_change_value_range( - range, value, ColumnValueRange::remove_fixed_value_range, fn_name)); - } else { - RETURN_IF_ERROR(_change_value_range( - not_in_range, value, ColumnValueRange::add_fixed_value_range, fn_name)); + auto fn = op == SQLFilterOp::FILTER_EQ ? ColumnValueRange::add_fixed_value_range + : op == SQLFilterOp::FILTER_NE + ? (range.is_fixed_value_range() + ? ColumnValueRange::remove_fixed_value_range + : ColumnValueRange::empty_function) + : ColumnValueRange::add_value_range; + if constexpr (T == TYPE_CHAR || T == TYPE_VARCHAR || T == TYPE_STRING || T == TYPE_HLL) { + auto val = StringRef(value.data, value.size); + RETURN_IF_ERROR(_change_value_range(is_equal_op, temp_range, + reinterpret_cast(&val), fn, function_name)); + } else { + if (sizeof(typename PrimitiveTypeTraits::CppType) != value.size) { + return Status::InternalError( + "PrimitiveType {} meet invalid input value size {}, expect size {}", T, + value.size, sizeof(typename PrimitiveTypeTraits::CppType)); } - iter->next(); + RETURN_IF_ERROR(_change_value_range(is_equal_op, temp_range, + reinterpret_cast(value.data), fn, + function_name)); } - } else if (TExprNodeType::BINARY_PRED == expr->node_type()) { - DCHECK(expr->get_num_children() == 2); - - auto ne_checker = [](const std::string& fn_name) { return fn_name == "ne"; }; - StringRef value; - int slot_ref_child = -1; - RETURN_IF_ERROR(_should_push_down_binary_predicate( - reinterpret_cast(expr), expr_ctx, &value, - &slot_ref_child, ne_checker, temp_pdt)); - if (temp_pdt == PushDownType::UNACCEPTABLE) { - return Status::OK(); - } - - DCHECK(slot_ref_child >= 0); - // where A = nullptr should return empty result set - if (value.data != nullptr) { - auto fn_name = std::string(""); - if constexpr (T == TYPE_CHAR || T == TYPE_VARCHAR || T == TYPE_STRING || - T == TYPE_HLL) { - auto val = StringRef(value.data, value.size); - if (is_fixed_range) { - RETURN_IF_ERROR(_change_value_range( - range, reinterpret_cast(&val), - ColumnValueRange::remove_fixed_value_range, fn_name)); - } else { - RETURN_IF_ERROR(_change_value_range( - not_in_range, reinterpret_cast(&val), - ColumnValueRange::add_fixed_value_range, fn_name)); - } - } else { - if (is_fixed_range) { - RETURN_IF_ERROR(_change_value_range( - range, reinterpret_cast(value.data), - ColumnValueRange::remove_fixed_value_range, fn_name)); - } else { - RETURN_IF_ERROR(_change_value_range( - not_in_range, reinterpret_cast(value.data), - ColumnValueRange::add_fixed_value_range, fn_name)); - } - } - } else { - _eos = true; - _scan_dependency->set_ready(); + if (op == SQLFilterOp::FILTER_EQ) { + range.intersection(temp_range); } } else { - return Status::OK(); + *pdt = PushDownType::UNACCEPTABLE; + _eos = true; + _scan_dependency->set_ready(); } - if (is_fixed_range || - not_in_range.get_fixed_value_size() <= - _parent->cast()._max_pushdown_conditions_per_column) { - if (!is_fixed_range) { - _not_in_value_ranges.push_back(not_in_range); - } - *pdt = temp_pdt; - } return Status::OK(); } template -template -Status ScanLocalState::_change_value_range(ColumnValueRange& temp_range, +template +Status ScanLocalState::_change_value_range(bool is_equal_op, + ColumnValueRange& temp_range, const void* value, const ChangeFixedValueRangeFunc& func, - const std::string& fn_name, - int slot_ref_child) { + const std::string& fn_name) { if constexpr (PrimitiveType == TYPE_DATE) { VecDateTimeValue tmp_value; memcpy(&tmp_value, value, sizeof(VecDateTimeValue)); - if constexpr (IsFixed) { + if (is_equal_op) { if (!tmp_value.check_loss_accuracy_cast_to_date()) { - func(temp_range, + func(temp_range, to_olap_filter_type(fn_name), reinterpret_cast::CppType*>( &tmp_value)); } @@ -937,27 +874,15 @@ Status ScanLocalState::_change_value_range(ColumnValueRange::CppType*>( &tmp_value)); } } else if constexpr (PrimitiveType == TYPE_DATETIME) { - if constexpr (IsFixed) { - func(temp_range, - reinterpret_cast::CppType*>( - value)); - } else { - func(temp_range, to_olap_filter_type(fn_name, slot_ref_child), - reinterpret_cast::CppType*>( - reinterpret_cast(value))); - } + func(temp_range, to_olap_filter_type(fn_name), + reinterpret_cast::CppType*>(value)); } else if constexpr (PrimitiveType == TYPE_HLL) { - if constexpr (IsFixed) { - func(temp_range, reinterpret_cast(value)); - } else { - func(temp_range, to_olap_filter_type(fn_name, slot_ref_child), - reinterpret_cast(value)); - } + func(temp_range, to_olap_filter_type(fn_name), reinterpret_cast(value)); } else if constexpr ((PrimitiveType == TYPE_DECIMALV2) || (PrimitiveType == TYPE_CHAR) || (PrimitiveType == TYPE_VARCHAR) || (PrimitiveType == TYPE_DATETIMEV2) || (PrimitiveType == TYPE_TINYINT) || (PrimitiveType == TYPE_SMALLINT) || @@ -969,95 +894,56 @@ Status ScanLocalState::_change_value_range(ColumnValueRange::CppType*>( - value)); - } else { - func(temp_range, to_olap_filter_type(fn_name, slot_ref_child), - reinterpret_cast::CppType*>( - value)); - } + func(temp_range, to_olap_filter_type(fn_name), + reinterpret_cast::CppType*>(value)); } else { static_assert(always_false_v); } - return Status::OK(); } template template -Status ScanLocalState::_normalize_is_null_predicate(vectorized::VExpr* expr, - vectorized::VExprContext* expr_ctx, - SlotDescriptor* slot, - ColumnValueRange& range, - PushDownType* pdt) { - PushDownType temp_pdt = _should_push_down_is_null_predicate(); - if (temp_pdt == PushDownType::UNACCEPTABLE) { - return Status::OK(); +Status ScanLocalState::_normalize_is_null_predicate( + vectorized::VExprContext* expr_ctx, const vectorized::VExprSPtr& root, SlotDescriptor* slot, + std::vector>& predicates, ColumnValueRange& range, + PushDownType* pdt) { + std::shared_ptr pred = nullptr; + Defer defer = [&]() { + if (pred) { + DCHECK(*pdt != PushDownType::UNACCEPTABLE) << root->debug_string(); + predicates.emplace_back(pred); + } + }; + DCHECK(!root->is_rf_wrapper()) << root->debug_string(); + DCHECK(TExprNodeType::FUNCTION_CALL == root->node_type()) << root->debug_string(); + if (auto fn_call = dynamic_cast(root.get())) { + *pdt = _should_push_down_is_null_predicate(fn_call); + } else { + *pdt = PushDownType::UNACCEPTABLE; } - if (TExprNodeType::FUNCTION_CALL == expr->node_type()) { - if (reinterpret_cast(expr)->fn().name.function_name == - "is_null_pred") { - auto temp_range = ColumnValueRange::create_empty_column_value_range( - slot->is_nullable(), range.precision(), range.scale()); - temp_range.set_contain_null(true); - range.intersection(temp_range); - *pdt = temp_pdt; - } else if (reinterpret_cast(expr)->fn().name.function_name == - "is_not_null_pred") { - auto temp_range = ColumnValueRange::create_empty_column_value_range( - slot->is_nullable(), range.precision(), range.scale()); - temp_range.set_contain_null(false); - range.intersection(temp_range); - *pdt = temp_pdt; - } + if (*pdt == PushDownType::UNACCEPTABLE) { + return Status::OK(); } - return Status::OK(); -} -template -template -Status ScanLocalState::_normalize_noneq_binary_predicate( - vectorized::VExpr* expr, vectorized::VExprContext* expr_ctx, SlotDescriptor* slot, - ColumnValueRange& range, PushDownType* pdt) { - if (TExprNodeType::BINARY_PRED == expr->node_type()) { - DCHECK(expr->get_num_children() == 2); - - auto noneq_checker = [](const std::string& fn_name) { - return fn_name != "ne" && fn_name != "eq" && fn_name != "eq_for_null"; - }; - StringRef value; - int slot_ref_child = -1; - PushDownType temp_pdt; - RETURN_IF_ERROR(_should_push_down_binary_predicate( - reinterpret_cast(expr), expr_ctx, &value, - &slot_ref_child, noneq_checker, temp_pdt)); - if (temp_pdt != PushDownType::UNACCEPTABLE) { - DCHECK(slot_ref_child >= 0); - const std::string& fn_name = - reinterpret_cast(expr)->fn().name.function_name; - - // where A = nullptr should return empty result set - if (value.data != nullptr) { - if constexpr (T == TYPE_CHAR || T == TYPE_VARCHAR || T == TYPE_STRING || - T == TYPE_HLL) { - auto val = StringRef(value.data, value.size); - RETURN_IF_ERROR(_change_value_range(range, reinterpret_cast(&val), - ColumnValueRange::add_value_range, - fn_name, slot_ref_child)); - } else { - RETURN_IF_ERROR(_change_value_range( - range, reinterpret_cast(value.data), - ColumnValueRange::add_value_range, fn_name, slot_ref_child)); - } - *pdt = temp_pdt; - } else { - _eos = true; - _scan_dependency->set_ready(); - } - } + auto fn_call = assert_cast(root.get()); + if (fn_call->fn().name.function_name == "is_null_pred") { + pred = NullPredicate::create_shared( + _parent->intermediate_row_desc().get_column_id(slot->id()), slot->col_name(), true, + T); + auto temp_range = ColumnValueRange::create_empty_column_value_range( + slot->is_nullable(), range.precision(), range.scale()); + temp_range.set_contain_null(true); + range.intersection(temp_range); + } else if (fn_call->fn().name.function_name == "is_not_null_pred") { + pred = NullPredicate::create_shared( + _parent->intermediate_row_desc().get_column_id(slot->id()), slot->col_name(), false, + T); + auto temp_range = ColumnValueRange::create_empty_column_value_range( + slot->is_nullable(), range.precision(), range.scale()); + temp_range.set_contain_null(false); + range.intersection(temp_range); } return Status::OK(); } @@ -1109,11 +995,6 @@ TPushAggOp::type ScanLocalState::get_push_down_agg_type() { return _parent->cast()._push_down_agg_type; } -template -int64_t ScanLocalState::get_push_down_count() { - return _parent->cast()._push_down_count; -} - template int64_t ScanLocalState::limit_per_scanner() { return _parent->cast()._limit_per_scanner; @@ -1186,6 +1067,18 @@ Status ScanLocalState::_get_topn_filters(RuntimeState* state) { RETURN_IF_ERROR(conjunct->open(state)); _conjuncts.emplace_back(conjunct); } + for (auto id : get_topn_filter_source_node_ids(state, true)) { + const auto& pred = state->get_query_ctx()->get_runtime_predicate(id); + vectorized::VExprSPtr topn_pred; + RETURN_IF_ERROR(vectorized::VTopNPred::create_vtopn_pred(pred.get_texpr(p.node_id()), id, + topn_pred)); + + vectorized::VExprContextSPtr conjunct = vectorized::VExprContext::create_shared(topn_pred); + RETURN_IF_ERROR(conjunct->prepare( + state, _parent->cast().row_descriptor())); + RETURN_IF_ERROR(conjunct->open(state)); + _conjuncts.emplace_back(conjunct); + } return Status::OK(); } @@ -1310,8 +1203,19 @@ Status ScanOperatorX::prepare(RuntimeState* state) { continue; } - state->get_query_ctx()->get_runtime_predicate(id).init_target(node_id(), - _slot_id_to_slot_desc); + int cid = -1; + if (state->get_query_ctx()->get_runtime_predicate(id).target_is_slot(node_id())) { + auto s = _slot_id_to_slot_desc[state->get_query_ctx() + ->get_runtime_predicate(id) + .get_texpr(node_id()) + .nodes[0] + .slot_ref.slot_id]; + DCHECK(s != nullptr); + auto col_name = s->col_name(); + cid = get_column_id(col_name); + } + RETURN_IF_ERROR(state->get_query_ctx()->get_runtime_predicate(id).init_target( + node_id(), _slot_id_to_slot_desc, cid)); } RETURN_IF_CANCELLED(state); diff --git a/be/src/pipeline/exec/scan_operator.h b/be/src/pipeline/exec/scan_operator.h index 00c39269c25c0a..dc3723ef4fe352 100644 --- a/be/src/pipeline/exec/scan_operator.h +++ b/be/src/pipeline/exec/scan_operator.h @@ -37,6 +37,7 @@ namespace doris::vectorized { #include "common/compile_check_begin.h" class ScannerDelegate; +class OlapScanner; } // namespace doris::vectorized namespace doris::pipeline { @@ -52,16 +53,6 @@ enum class PushDownType { PARTIAL_ACCEPTABLE }; -struct FilterPredicates { - // Save all runtime filter predicates which may be pushed down to data source. - // column name -> bloom filter function - std::vector>> bloom_filters; - - std::vector>> bitmap_filters; - - std::vector>> in_filters; -}; - class ScanLocalStateBase : public PipelineXLocalState<> { public: ScanLocalStateBase(RuntimeState* state, OperatorXBase* parent) @@ -82,7 +73,6 @@ class ScanLocalStateBase : public PipelineXLocalState<> { const std::vector& scan_ranges) = 0; virtual TPushAggOp::type get_push_down_agg_type() = 0; - virtual int64_t get_push_down_count() = 0; // If scan operator is serial operator(like topn), its real parallelism is 1. // Otherwise, its real parallelism is query_parallel_instance_num. // query_parallel_instance_num of olap table is usually equal to session var parallel_pipeline_task_num. @@ -119,7 +109,6 @@ class ScanLocalStateBase : public PipelineXLocalState<> { RuntimeProfile::Counter* _scan_cpu_timer = nullptr; // time of filter output block from scanner RuntimeProfile::Counter* _filter_timer = nullptr; - RuntimeProfile::Counter* _memory_usage_counter = nullptr; // rows read from the scanner (including those discarded by (pre)filters) RuntimeProfile::Counter* _rows_read_counter = nullptr; @@ -165,8 +154,6 @@ class ScanLocalState : public ScanLocalStateBase { TPushAggOp::type get_push_down_agg_type() override; - int64_t get_push_down_count() override; - std::vector execution_dependencies() override { if (_filter_dependencies.empty()) { return {}; @@ -214,17 +201,27 @@ class ScanLocalState : public ScanLocalStateBase { virtual bool _storage_no_merge() { return false; } virtual bool _push_down_topn(const vectorized::RuntimePredicate& predicate) { return false; } virtual bool _is_key_column(const std::string& col_name) { return false; } - virtual PushDownType _should_push_down_bloom_filter() { return PushDownType::UNACCEPTABLE; } - virtual PushDownType _should_push_down_bitmap_filter() { return PushDownType::UNACCEPTABLE; } - virtual PushDownType _should_push_down_is_null_predicate() { + virtual PushDownType _should_push_down_bloom_filter() const { + return PushDownType::UNACCEPTABLE; + } + virtual PushDownType _should_push_down_topn_filter() const { + return PushDownType::UNACCEPTABLE; + } + virtual PushDownType _should_push_down_bitmap_filter() const { + return PushDownType::UNACCEPTABLE; + } + virtual PushDownType _should_push_down_is_null_predicate( + vectorized::VectorizedFnCall* fn_call) const { + return PushDownType::UNACCEPTABLE; + } + virtual PushDownType _should_push_down_in_predicate() const { return PushDownType::UNACCEPTABLE; } - Status _should_push_down_binary_predicate( + virtual PushDownType _should_push_down_binary_predicate( vectorized::VectorizedFnCall* fn_call, vectorized::VExprContext* expr_ctx, - StringRef* constant_val, int* slot_ref_child, - const std::function& fn_checker, PushDownType& pdt); - - PushDownType _should_push_down_in_predicate(vectorized::VInPredicate* in_pred, bool is_not_in); + StringRef* constant_val, const std::set fn_name) const { + return PushDownType::UNACCEPTABLE; + } virtual Status _should_push_down_function_filter(vectorized::VectorizedFnCall* fn_call, vectorized::VExprContext* expr_ctx, @@ -245,54 +242,51 @@ class ScanLocalState : public ScanLocalStateBase { } Status _normalize_conjuncts(RuntimeState* state); - Status _normalize_predicate(const vectorized::VExprSPtr& conjunct_expr_root, - vectorized::VExprContext* context, + // Normalize a conjunct and try to convert it to column predicate recursively. + Status _normalize_predicate(vectorized::VExprContext* context, + const vectorized::VExprSPtr& root, vectorized::VExprSPtr& output_expr); - Status _eval_const_conjuncts(vectorized::VExpr* vexpr, vectorized::VExprContext* expr_ctx, - PushDownType* pdt); - - Status _normalize_bloom_filter(vectorized::VExpr* expr, vectorized::VExprContext* expr_ctx, - SlotDescriptor* slot, PushDownType* pdt); - - Status _normalize_bitmap_filter(vectorized::VExpr* expr, vectorized::VExprContext* expr_ctx, - SlotDescriptor* slot, PushDownType* pdt); - - Status _normalize_function_filters(vectorized::VExpr* expr, vectorized::VExprContext* expr_ctx, - SlotDescriptor* slot, PushDownType* pdt); - - bool _is_predicate_acting_on_slot( - vectorized::VExpr* expr, - const std::function&, - vectorized::VExprSPtr&)>& checker, - SlotDescriptor** slot_desc, ColumnValueRangeType** range); + bool _is_predicate_acting_on_slot(const vectorized::VExprSPtrs& children, + SlotDescriptor** slot_desc, ColumnValueRangeType** range); + Status _eval_const_conjuncts(vectorized::VExprContext* expr_ctx, PushDownType* pdt); template - Status _normalize_in_and_eq_predicate(vectorized::VExpr* expr, - vectorized::VExprContext* expr_ctx, SlotDescriptor* slot, - ColumnValueRange& range, PushDownType* pdt); + Status _normalize_in_predicate(vectorized::VExprContext* expr_ctx, + const vectorized::VExprSPtr& root, SlotDescriptor* slot, + std::vector>& predicates, + ColumnValueRange& range, PushDownType* pdt); template - Status _normalize_not_in_and_not_eq_predicate(vectorized::VExpr* expr, - vectorized::VExprContext* expr_ctx, - SlotDescriptor* slot, ColumnValueRange& range, - PushDownType* pdt); + Status _normalize_binary_predicate(vectorized::VExprContext* expr_ctx, + const vectorized::VExprSPtr& root, SlotDescriptor* slot, + std::vector>& predicates, + ColumnValueRange& range, PushDownType* pdt); + Status _normalize_bloom_filter(vectorized::VExprContext* expr_ctx, + const vectorized::VExprSPtr& root, SlotDescriptor* slot, + std::vector>& predicates, + PushDownType* pdt); + Status _normalize_topn_filter(vectorized::VExprContext* expr_ctx, + const vectorized::VExprSPtr& root, SlotDescriptor* slot, + std::vector>& predicates, + PushDownType* pdt); + + Status _normalize_bitmap_filter(vectorized::VExprContext* expr_ctx, + const vectorized::VExprSPtr& root, SlotDescriptor* slot, + std::vector>& predicates, + PushDownType* pdt); + + Status _normalize_function_filters(vectorized::VExprContext* expr_ctx, SlotDescriptor* slot, + PushDownType* pdt); template - Status _normalize_noneq_binary_predicate(vectorized::VExpr* expr, - vectorized::VExprContext* expr_ctx, - SlotDescriptor* slot, ColumnValueRange& range, - PushDownType* pdt); - template - Status _normalize_is_null_predicate(vectorized::VExpr* expr, vectorized::VExprContext* expr_ctx, - SlotDescriptor* slot, ColumnValueRange& range, - PushDownType* pdt); - - bool _ignore_cast(SlotDescriptor* slot, vectorized::VExpr* expr); + Status _normalize_is_null_predicate(vectorized::VExprContext* expr_ctx, + const vectorized::VExprSPtr& root, SlotDescriptor* slot, + std::vector>& predicates, + ColumnValueRange& range, PushDownType* pdt); - template - Status _change_value_range(ColumnValueRange& range, const void* value, - const ChangeFixedValueRangeFunc& func, const std::string& fn_name, - int slot_ref_child = -1); + template + Status _change_value_range(bool is_equal_op, ColumnValueRange& range, + const void* value, const ChangeFixedValueRangeFunc& func, + const std::string& fn_name); Status _prepare_scanners(); @@ -317,8 +311,6 @@ class ScanLocalState : public ScanLocalStateBase { std::shared_ptr _scanner_ctx = nullptr; - FilterPredicates _filter_predicates {}; - // Save all function predicates which may be pushed down to data source. std::vector _push_down_functions; @@ -327,27 +319,17 @@ class ScanLocalState : public ScanLocalStateBase { // slot id -> ColumnValueRange // Parsed from conjuncts - phmap::flat_hash_map> - _slot_id_to_value_range; - // column -> ColumnValueRange - // We use _colname_to_value_range to store a column and its conresponding value ranges. - std::unordered_map _colname_to_value_range; - - // But if a col is with value range, eg: 1 < col < 10, which is "!is_fixed_range", - // in this case we can not merge "1 < col < 10" with "col not in (2)". - // So we have to save "col not in (2)" to another structure: "_not_in_value_ranges". - // When the data source try to use the value ranges, it should use both ranges in - // "_colname_to_value_range" and in "_not_in_value_ranges" - std::vector _not_in_value_ranges; + phmap::flat_hash_map _slot_id_to_value_range; + phmap::flat_hash_map>> _slot_id_to_predicates; + std::vector> _or_predicates; std::atomic _eos = false; - std::mutex _block_lock; - std::vector> _filter_dependencies; // ScanLocalState owns the ownership of scanner, scanner context only has its weakptr std::list> _scanners; + vectorized::Arena _arena; }; template @@ -370,14 +352,14 @@ class ScanOperatorX : public OperatorX { } [[nodiscard]] bool is_source() const override { return true; } - [[nodiscard]] virtual bool is_file_scan_operator() const { return false; } - [[nodiscard]] size_t get_reserve_mem_size(RuntimeState* state) override; const std::vector& runtime_filter_descs() override { return _runtime_filter_descs; } + [[nodiscard]] virtual int get_column_id(const std::string& col_name) const { return -1; } + TPushAggOp::type get_push_down_agg_type() { return _push_down_agg_type; } DataDistribution required_data_distribution() const override { @@ -396,7 +378,6 @@ class ScanOperatorX : public OperatorX { } } - int64_t get_push_down_count() const { return _push_down_count; } using OperatorX::node_id; using OperatorX::operator_id; using OperatorX::get_local_state; @@ -407,6 +388,7 @@ class ScanOperatorX : public OperatorX { protected: using LocalState = LocalStateType; + friend class vectorized::OlapScanner; ScanOperatorX(ObjectPool* pool, const TPlanNode& tnode, int operator_id, const DescriptorTbl& descs, int parallel_tasks = 0); virtual ~ScanOperatorX() = default; diff --git a/be/src/runtime/runtime_predicate.cpp b/be/src/runtime/runtime_predicate.cpp index 2c763dfcb9835f..1449a0ef2d7f6a 100644 --- a/be/src/runtime/runtime_predicate.cpp +++ b/be/src/runtime/runtime_predicate.cpp @@ -36,178 +36,149 @@ RuntimePredicate::RuntimePredicate(const TTopnFilterDesc& desc) _contexts[p.first].expr = p.second; } - PrimitiveType type = thrift_to_type(desc.target_node_id_to_target_expr.begin() - ->second.nodes[0] - .type.types[0] - .scalar_type.type); - if (!_init(type)) { + _type = thrift_to_type(desc.target_node_id_to_target_expr.begin() + ->second.nodes[0] + .type.types[0] + .scalar_type.type); + if (!_init(_type)) { std::stringstream ss; desc.target_node_id_to_target_expr.begin()->second.nodes[0].printTo(ss); - throw Exception(ErrorCode::INTERNAL_ERROR, "meet invalid type, type={}, expr={}", int(type), - ss.str()); + throw Exception(ErrorCode::INTERNAL_ERROR, "meet invalid type, type={}, expr={}", + type_to_string(_type), ss.str()); } // For ASC sort, create runtime predicate col_name <= max_top_value // since values that > min_top_value are large than any value in current topn values // For DESC sort, create runtime predicate col_name >= min_top_value // since values that < min_top_value are less than any value in current topn values - _pred_constructor = _is_asc ? create_comparison_predicate - : create_comparison_predicate; + _pred_constructor = _is_asc ? create_comparison_predicate0 + : create_comparison_predicate0; } -void RuntimePredicate::init_target( - int32_t target_node_id, phmap::flat_hash_map slot_id_to_slot_desc) { +Status RuntimePredicate::init_target( + int32_t target_node_id, phmap::flat_hash_map slot_id_to_slot_desc, + const int column_id) { + if (column_id < 0) { + return Status::OK(); + } std::unique_lock wlock(_rwlock); check_target_node_id(target_node_id); if (target_is_slot(target_node_id)) { _contexts[target_node_id].col_name = slot_id_to_slot_desc[get_texpr(target_node_id).nodes[0].slot_ref.slot_id] ->col_name(); + _contexts[target_node_id].predicate = + SharedPredicate::create_shared(cast_set(column_id), ""); } _detected_target = true; + return Status::OK(); } -template -std::string get_normal_value(const Field& field) { - using ValueType = typename PrimitiveTypeTraits::CppType; - return cast_to_string(field.get(), 0); -} - -std::string get_date_value(const Field& field) { - using ValueType = typename PrimitiveTypeTraits::CppType; - ValueType value; - Int64 v = field.get(); - auto* p = (VecDateTimeValue*)&v; - value.from_olap_date(p->to_olap_date()); - value.cast_to_date(); - return cast_to_string(value, 0); -} - -std::string get_datetime_value(const Field& field) { - using ValueType = typename PrimitiveTypeTraits::CppType; - ValueType value; - Int64 v = field.get(); - auto* p = (VecDateTimeValue*)&v; - value.from_olap_datetime(p->to_olap_datetime()); - value.to_datetime(); - return cast_to_string(value, 0); -} - -std::string get_time_value(const Field& field) { - using ValueType = typename PrimitiveTypeTraits::CppType; - ValueType value = field.get(); - return cast_to_string(value, 0); -} - -std::string get_decimalv2_value(const Field& field) { - // can NOT use PrimitiveTypeTraits::CppType since - // it is DecimalV2Value and Decimal128V2 can not convert to it implicitly - using ValueType = Decimal128V2::NativeType; - auto v = field.get>(); - // use TYPE_DECIMAL128I instead of TYPE_DECIMALV2 since v.get_scale() - // is always 9 for DECIMALV2 - return cast_to_string(v.get_value(), v.get_scale()); -} - -template -std::string get_decimal_value(const Field& field) { - using ValueType = typename PrimitiveTypeTraits::CppType; - auto v = field.get>(); - return cast_to_string(v.get_value(), v.get_scale()); -} - -bool RuntimePredicate::_init(PrimitiveType type) { - // set get value function +StringRef RuntimePredicate::_get_string_ref(const Field& field, const PrimitiveType type) { switch (type) { case PrimitiveType::TYPE_BOOLEAN: { - _get_value_fn = get_normal_value; - break; + const auto& v = field.get::CppType>(); + return StringRef((char*)&v, sizeof(v)); } case PrimitiveType::TYPE_TINYINT: { - _get_value_fn = get_normal_value; - break; + const auto& v = field.get::CppType>(); + return StringRef((char*)&v, sizeof(v)); } case PrimitiveType::TYPE_SMALLINT: { - _get_value_fn = get_normal_value; - break; + const auto& v = field.get::CppType>(); + return StringRef((char*)&v, sizeof(v)); } case PrimitiveType::TYPE_INT: { - _get_value_fn = get_normal_value; - break; + const auto& v = field.get::CppType>(); + return StringRef((char*)&v, sizeof(v)); } case PrimitiveType::TYPE_BIGINT: { - _get_value_fn = get_normal_value; - break; + const auto& v = field.get::CppType>(); + return StringRef((char*)&v, sizeof(v)); } case PrimitiveType::TYPE_LARGEINT: { - _get_value_fn = get_normal_value; - break; + const auto& v = field.get::CppType>(); + return StringRef((char*)&v, sizeof(v)); } case PrimitiveType::TYPE_CHAR: case PrimitiveType::TYPE_VARCHAR: case PrimitiveType::TYPE_STRING: { - _get_value_fn = [](const Field& field) { return field.get(); }; - break; + const auto& v = field.get(); + auto length = v.size(); + char* buffer = _predicate_arena.alloc(length); + memset(buffer, 0, length); + memcpy(buffer, v.data(), v.length()); + + return {buffer, length}; } case PrimitiveType::TYPE_DATEV2: { - _get_value_fn = get_normal_value; - break; + const auto& v = field.get::CppType>(); + return StringRef((char*)&v, sizeof(v)); } case PrimitiveType::TYPE_DATETIMEV2: { - _get_value_fn = get_normal_value; - break; + const auto& v = field.get::CppType>(); + return StringRef((char*)&v, sizeof(v)); } case PrimitiveType::TYPE_DATE: { - _get_value_fn = get_date_value; - break; + const auto& v = field.get::CppType>(); + return StringRef((char*)&v, sizeof(v)); } case PrimitiveType::TYPE_DATETIME: { - _get_value_fn = get_datetime_value; - break; + const auto& v = field.get::CppType>(); + return StringRef((char*)&v, sizeof(v)); } case PrimitiveType::TYPE_TIMEV2: { - _get_value_fn = get_time_value; - break; + const auto& v = field.get::CppType>(); + return StringRef((char*)&v, sizeof(v)); } case PrimitiveType::TYPE_DECIMAL32: { - _get_value_fn = get_decimal_value; - break; + const auto& v = field.get::CppType>(); + return StringRef((char*)&v, sizeof(v)); } case PrimitiveType::TYPE_DECIMAL64: { - _get_value_fn = get_decimal_value; - break; + const auto& v = field.get::CppType>(); + return StringRef((char*)&v, sizeof(v)); } case PrimitiveType::TYPE_DECIMALV2: { - _get_value_fn = get_decimalv2_value; - break; + const auto& v = field.get::CppType>(); + return StringRef((char*)&v, sizeof(v)); } case PrimitiveType::TYPE_DECIMAL128I: { - _get_value_fn = get_decimal_value; - break; + const auto& v = field.get::CppType>(); + return StringRef((char*)&v, sizeof(v)); } case PrimitiveType::TYPE_DECIMAL256: { - _get_value_fn = get_decimal_value; - break; + const auto& v = field.get::CppType>(); + return StringRef((char*)&v, sizeof(v)); } case PrimitiveType::TYPE_IPV4: { - _get_value_fn = get_normal_value; - break; + const auto& v = field.get::CppType>(); + return StringRef((char*)&v, sizeof(v)); } case PrimitiveType::TYPE_IPV6: { - _get_value_fn = get_normal_value; - break; + const auto& v = field.get::CppType>(); + return StringRef((char*)&v, sizeof(v)); } case PrimitiveType::TYPE_VARBINARY: { - _get_value_fn = [](const Field& field) { - return field.get().get_string(); - }; - break; + // For VARBINARY type, use StringViewField to store binary data + const auto& v = field.get::CppType>(); + auto length = v.size(); + char* buffer = _predicate_arena.alloc(length); + memset(buffer, 0, length); + memcpy(buffer, v.data(), length); + return {buffer, length}; } default: - return false; + break; } - return true; + throw Exception(ErrorCode::INTERNAL_ERROR, "meet invalid type, type={}", type_to_string(type)); + return {}; +} + +bool RuntimePredicate::_init(PrimitiveType type) { + return is_int_or_bool(type) || is_decimal(type) || is_string_type(type) || is_date_type(type) || + is_time_type(type) || is_ip(type) || is_varbinary(type); } Status RuntimePredicate::update(const Field& value) { @@ -240,18 +211,19 @@ Status RuntimePredicate::update(const Field& value) { continue; } const auto& column = *DORIS_TRY(ctx.tablet_schema->column(ctx.col_name)); - std::unique_ptr pred { - _pred_constructor(column.get_vec_type(), ctx.predicate->column_id(), - _get_value_fn(_orderby_extrem), false, _predicate_arena)}; + auto str_ref = _get_string_ref(_orderby_extrem, _type); + std::shared_ptr pred = + _pred_constructor(ctx.predicate->column_id(), column.name(), column.get_vec_type(), + str_ref, false, _predicate_arena); // For NULLS FIRST, wrap a AcceptNullPredicate to return true for NULL // since ORDER BY ASC/DESC should get NULL first but pred returns NULL // and NULL in where predicate will be treated as FALSE if (_nulls_first) { - pred = AcceptNullPredicate::create_unique(pred.release()); + pred = AcceptNullPredicate::create_shared(pred); } - ((SharedPredicate*)ctx.predicate.get())->set_nested(pred.release()); + ((SharedPredicate*)ctx.predicate.get())->set_nested(pred); } return Status::OK(); } diff --git a/be/src/runtime/runtime_predicate.h b/be/src/runtime/runtime_predicate.h index 51c79e1b426199..1e20bf800e13e8 100644 --- a/be/src/runtime/runtime_predicate.h +++ b/be/src/runtime/runtime_predicate.h @@ -44,8 +44,9 @@ class RuntimePredicate { public: RuntimePredicate(const TTopnFilterDesc& desc); - void init_target(int32_t target_node_id, - phmap::flat_hash_map slot_id_to_slot_desc); + Status init_target(int32_t target_node_id, + phmap::flat_hash_map slot_id_to_slot_desc, + const int column_id); bool enable() const { // when sort node and scan node are not in the same fragment, predicate will be disabled @@ -66,9 +67,7 @@ class RuntimePredicate { } RETURN_IF_ERROR(tablet_schema->have_column(_contexts[target_node_id].col_name)); _contexts[target_node_id].tablet_schema = tablet_schema; - int64_t index = DORIS_TRY(_contexts[target_node_id].get_field_index()) - _contexts[target_node_id] - .predicate = SharedPredicate::create_shared(index); + DCHECK(_contexts[target_node_id].predicate != nullptr); return Status::OK(); } @@ -110,6 +109,7 @@ class RuntimePredicate { } private: + StringRef _get_string_ref(const Field& field, const PrimitiveType type); void check_target_node_id(int32_t target_node_id) const { if (!_contexts.contains(target_node_id)) { std::string msg = "context target node ids: ["; @@ -129,6 +129,7 @@ class RuntimePredicate { struct TargetContext { TExpr expr; std::string col_name; + // TODO(gabriel): remove this TabletSchemaSPtr tablet_schema; std::shared_ptr predicate; @@ -153,13 +154,14 @@ class RuntimePredicate { Field _orderby_extrem {PrimitiveType::TYPE_NULL}; Arena _predicate_arena; - std::function _get_value_fn; - std::function + std::function( + const int cid, const std::string& col_name, const vectorized::DataTypePtr& data_type, + StringRef& value, bool opposite, vectorized::Arena& arena)> _pred_constructor; bool _detected_source = false; bool _detected_target = false; bool _has_value = false; + PrimitiveType _type; }; } // namespace vectorized diff --git a/be/src/vec/exec/format/avro/avro_jni_reader.cpp b/be/src/vec/exec/format/avro/avro_jni_reader.cpp index 195b6cfc56d491..6c8d28bd6bc896 100644 --- a/be/src/vec/exec/format/avro/avro_jni_reader.cpp +++ b/be/src/vec/exec/format/avro/avro_jni_reader.cpp @@ -59,9 +59,7 @@ Status AvroJNIReader::get_columns(std::unordered_map* return Status::OK(); } -Status AvroJNIReader::init_reader( - const std::unordered_map* colname_to_value_range) { - _colname_to_value_range = colname_to_value_range; +Status AvroJNIReader::init_reader() { std::ostringstream required_fields; std::ostringstream columns_types; std::vector column_names; @@ -97,7 +95,7 @@ Status AvroJNIReader::init_reader( required_param.insert(std::make_pair("uri", _range.path)); _jni_connector = std::make_unique("org/apache/doris/avro/AvroJNIScanner", required_param, column_names); - RETURN_IF_ERROR(_jni_connector->init(_colname_to_value_range)); + RETURN_IF_ERROR(_jni_connector->init()); return _jni_connector->open(_state, _profile); } diff --git a/be/src/vec/exec/format/avro/avro_jni_reader.h b/be/src/vec/exec/format/avro/avro_jni_reader.h index 96bcd9cc7b8cdc..f94e41f6d8e546 100644 --- a/be/src/vec/exec/format/avro/avro_jni_reader.h +++ b/be/src/vec/exec/format/avro/avro_jni_reader.h @@ -66,8 +66,7 @@ class AvroJNIReader : public JniReader { Status get_columns(std::unordered_map* name_to_type, std::unordered_set* missing_cols) override; - Status init_reader( - const std::unordered_map* colname_to_value_range); + Status init_reader(); TFileType::type get_file_type() const; @@ -81,7 +80,6 @@ class AvroJNIReader : public JniReader { private: const TFileScanRangeParams _params; const TFileRangeDesc _range; - const std::unordered_map* _colname_to_value_range = nullptr; }; #include "common/compile_check_end.h" diff --git a/be/src/vec/exec/format/generic_reader.cpp b/be/src/vec/exec/format/generic_reader.cpp deleted file mode 100644 index 8b3339faede6e0..00000000000000 --- a/be/src/vec/exec/format/generic_reader.cpp +++ /dev/null @@ -1,252 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "vec/exec/format/generic_reader.h" - -#include "olap/predicate_creator.h" -#include "vec/data_types/data_type.h" -#include "vec/data_types/data_type_nullable.h" -#include "vec/data_types/data_type_string.h" -#include "vec/exprs/vruntimefilter_wrapper.h" -#include "vec/exprs/vslot_ref.h" -#include "vec/exprs/vtopn_pred.h" - -namespace doris::vectorized { -#include "common/compile_check_begin.h" - -Status ExprPushDownHelper::_extract_predicates(const VExprSPtr& expr, int& cid, - DataTypePtr& data_type, std::vector& values, - bool null_pred, bool& parsed) const { - parsed = false; - values.clear(); - if (!expr->children()[0]->is_slot_ref()) [[unlikely]] { - return Status::OK(); - } - const auto* slot_ref = assert_cast(expr->children()[0].get()); - cid = slot_ref->column_id(); - values.reserve(expr->children().size() - 1); - data_type = remove_nullable(slot_ref->data_type()); - if (null_pred) { - DCHECK_EQ(expr->children().size(), 1); - parsed = true; - } - for (size_t child_id = 1; child_id < expr->children().size(); child_id++) { - auto child_expr = expr->children()[child_id]; - if (!child_expr->is_literal()) { - return Status::OK(); - } - const auto* literal = static_cast(child_expr.get()); - if (literal->get_column_ptr()->is_null_at(0)) { - continue; - } - values.emplace_back(literal->get_column_ptr()->operator[](0)); - parsed = true; - } - return Status::OK(); -} - -Status ExprPushDownHelper::convert_predicates( - const VExprSPtrs& exprs, std::vector>& predicates, - std::unique_ptr& root, Arena& arena) { - if (exprs.empty()) { - return Status::OK(); - } - - int cid; - DataTypePtr data_type; - std::vector values; - bool parsed = false; - for (const auto& expr : exprs) { - cid = -1; - values.clear(); - parsed = false; - switch (expr->node_type()) { - case TExprNodeType::BINARY_PRED: { - decltype(create_comparison_predicate)* create = nullptr; - if (expr->op() == TExprOpcode::EQ) { - create = create_comparison_predicate; - } else if (expr->op() == TExprOpcode::NE) { - create = create_comparison_predicate; - } else if (expr->op() == TExprOpcode::LT) { - create = create_comparison_predicate; - } else if (expr->op() == TExprOpcode::LE) { - create = create_comparison_predicate; - } else if (expr->op() == TExprOpcode::GT) { - create = create_comparison_predicate; - } else if (expr->op() == TExprOpcode::GE) { - create = create_comparison_predicate; - } else { - break; - } - RETURN_IF_ERROR(_extract_predicates(expr, cid, data_type, values, false, parsed)); - if (parsed) { - // TODO(gabriel): Use string view - predicates.push_back(std::unique_ptr( - create(data_type, cid, values[0].to_string(), false, arena))); - root->add_column_predicate( - SingleColumnBlockPredicate::create_unique(predicates.back().get())); - } - break; - } - case TExprNodeType::IN_PRED: { - switch (expr->op()) { - case TExprOpcode::FILTER_IN: { - RETURN_IF_ERROR(_extract_predicates(expr, cid, data_type, values, false, parsed)); - if (parsed) { - // TODO(gabriel): Use string view - std::vector conditions(values.size()); - for (size_t i = 0; i < conditions.size(); i++) { - conditions[i] = values[i].to_string(); - } - predicates.push_back(std::unique_ptr( - create_list_predicate( - data_type, cid, conditions, false, arena))); - root->add_column_predicate( - SingleColumnBlockPredicate::create_unique(predicates.back().get())); - } - break; - } - default: { - break; - } - } - break; - } - case TExprNodeType::COMPOUND_PRED: { - switch (expr->op()) { - case TExprOpcode::COMPOUND_AND: { - for (const auto& child : expr->children()) { - RETURN_IF_ERROR(convert_predicates({child}, predicates, root, arena)); - } - break; - } - case TExprOpcode::COMPOUND_OR: { - std::unique_ptr new_root = - OrBlockColumnPredicate::create_unique(); - for (const auto& child : expr->children()) { - RETURN_IF_ERROR(convert_predicates({child}, predicates, new_root, arena)); - } - root->add_column_predicate(std::move(new_root)); - break; - } - default: { - break; - } - } - break; - } - case TExprNodeType::FUNCTION_CALL: { - auto fn_name = expr->fn().name.function_name; - // only support `is null` and `is not null` - if (fn_name == "is_null_pred" || fn_name == "is_not_null_pred") { - RETURN_IF_ERROR(_extract_predicates(expr, cid, data_type, values, true, parsed)); - if (parsed) { - predicates.push_back(std::unique_ptr( - new NullPredicate(cid, true, fn_name == "is_not_null_pred"))); - root->add_column_predicate( - SingleColumnBlockPredicate::create_unique(predicates.back().get())); - } - } - break; - } - default: - break; - } - } - - return Status::OK(); -} - -bool ExprPushDownHelper::check_expr_can_push_down(const VExprSPtr& expr) const { - if (expr == nullptr) { - return false; - } - - switch (expr->node_type()) { - case TExprNodeType::BINARY_PRED: - case TExprNodeType::IN_PRED: { - switch (expr->op()) { - case TExprOpcode::GE: - case TExprOpcode::GT: - case TExprOpcode::LE: - case TExprOpcode::LT: - case TExprOpcode::EQ: - case TExprOpcode::FILTER_IN: - return _check_slot_can_push_down(expr) && _check_other_children_is_literal(expr); - default: { - return false; - } - } - } - case TExprNodeType::COMPOUND_PRED: { - switch (expr->op()) { - case TExprOpcode::COMPOUND_AND: { - // at least one child can be pushed down - return std::ranges::any_of(expr->children(), [this](const auto& child) { - return check_expr_can_push_down(child); - }); - } - case TExprOpcode::COMPOUND_OR: { - // all children must be pushed down - return std::ranges::all_of(expr->children(), [this](const auto& child) { - return check_expr_can_push_down(child); - }); - } - default: { - return false; - } - } - } - case TExprNodeType::FUNCTION_CALL: { - auto fn_name = expr->fn().name.function_name; - // only support `is null` and `is not null` - if (fn_name == "is_null_pred" || fn_name == "is_not_null_pred") { - return _check_slot_can_push_down(expr); - } - return false; - } - default: { - return false; - } - } -} - -bool ExprPushDownHelper::_check_slot_can_push_down(const VExprSPtr& expr) const { - if (!expr->children()[0]->is_slot_ref()) { - return false; - } - - const auto* slot_ref = assert_cast(expr->children()[0].get()); - // check if the slot exists in parquet file. - if (!_exists_in_file(slot_ref)) { - return false; - } - return _type_matches(slot_ref); -} - -bool ExprPushDownHelper::_check_other_children_is_literal(const VExprSPtr& expr) const { - for (size_t child_id = 1; child_id < expr->children().size(); child_id++) { - auto child_expr = expr->children()[child_id]; - if (!child_expr->is_literal()) { - return false; - } - } - return true; -} - -#include "common/compile_check_end.h" -} // namespace doris::vectorized diff --git a/be/src/vec/exec/format/generic_reader.h b/be/src/vec/exec/format/generic_reader.h index b21971b7a3f18b..620112a71e7999 100644 --- a/be/src/vec/exec/format/generic_reader.h +++ b/be/src/vec/exec/format/generic_reader.h @@ -112,25 +112,5 @@ class GenericReader : public ProfileCollector { FileMetaCache* _meta_cache = nullptr; }; -class ExprPushDownHelper { -public: - ExprPushDownHelper() = default; - virtual ~ExprPushDownHelper() = default; - bool check_expr_can_push_down(const VExprSPtr& expr) const; - Status convert_predicates(const VExprSPtrs& exprs, - std::vector>& predicates, - std::unique_ptr& root, Arena& arena); - -protected: - virtual bool _exists_in_file(const VSlotRef*) const = 0; - virtual bool _type_matches(const VSlotRef*) const = 0; - -private: - bool _check_slot_can_push_down(const VExprSPtr& expr) const; - bool _check_other_children_is_literal(const VExprSPtr& expr) const; - Status _extract_predicates(const VExprSPtr& expr, int& cid, DataTypePtr& data_type, - std::vector& values, bool null_pred, bool& parsed) const; -}; - #include "common/compile_check_end.h" } // namespace doris::vectorized diff --git a/be/src/vec/exec/format/jni_reader.cpp b/be/src/vec/exec/format/jni_reader.cpp index 800f5fb389cebe..da1862ec48f335 100644 --- a/be/src/vec/exec/format/jni_reader.cpp +++ b/be/src/vec/exec/format/jni_reader.cpp @@ -63,10 +63,8 @@ MockJniReader::MockJniReader(const std::vector& file_slot_descs params, column_names); } -Status MockJniReader::init_reader( - const std::unordered_map* colname_to_value_range) { - _colname_to_value_range = colname_to_value_range; - RETURN_IF_ERROR(_jni_connector->init(colname_to_value_range)); +Status MockJniReader::init_reader() { + RETURN_IF_ERROR(_jni_connector->init()); return _jni_connector->open(_state, _profile); } #include "common/compile_check_end.h" diff --git a/be/src/vec/exec/format/jni_reader.h b/be/src/vec/exec/format/jni_reader.h index 045de2aeafa0ae..325b7221d044bb 100644 --- a/be/src/vec/exec/format/jni_reader.h +++ b/be/src/vec/exec/format/jni_reader.h @@ -101,8 +101,7 @@ class MockJniReader : public JniReader { ~MockJniReader() override = default; - Status init_reader( - const std::unordered_map* colname_to_value_range); + Status init_reader(); Status close() override { if (_jni_connector) { @@ -117,9 +116,6 @@ class MockJniReader : public JniReader { _jni_connector->collect_profile_before_close(); } } - -private: - const std::unordered_map* _colname_to_value_range; }; #include "common/compile_check_end.h" diff --git a/be/src/vec/exec/format/parquet/parquet_pred_cmp.h b/be/src/vec/exec/format/parquet/parquet_predicate.h similarity index 100% rename from be/src/vec/exec/format/parquet/parquet_pred_cmp.h rename to be/src/vec/exec/format/parquet/parquet_predicate.h diff --git a/be/src/vec/exec/format/parquet/vparquet_group_reader.h b/be/src/vec/exec/format/parquet/vparquet_group_reader.h index b3b1123f82e4a3..749eac02b1fae1 100644 --- a/be/src/vec/exec/format/parquet/vparquet_group_reader.h +++ b/be/src/vec/exec/format/parquet/vparquet_group_reader.h @@ -87,6 +87,8 @@ class RowGroupReader : public ProfileCollector { fill_partition_columns; std::unordered_map fill_missing_columns; + phmap::flat_hash_map>> + slot_id_to_predicates; bool can_lazy_read = false; // block->rows() returns the number of rows of the first column, // so we should check and resize the first column diff --git a/be/src/vec/exec/format/parquet/vparquet_page_index.cpp b/be/src/vec/exec/format/parquet/vparquet_page_index.cpp index 6552bb048e90a2..b73675cfc4c0f7 100644 --- a/be/src/vec/exec/format/parquet/vparquet_page_index.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_page_index.cpp @@ -26,7 +26,7 @@ #include "common/logging.h" #include "common/status.h" -#include "parquet_pred_cmp.h" +#include "parquet_predicate.h" #include "util/thrift_util.h" #include "vec/exec/format/parquet/parquet_common.h" diff --git a/be/src/vec/exec/format/parquet/vparquet_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_reader.cpp index 3eb04608f73676..cf66497a9a2807 100644 --- a/be/src/vec/exec/format/parquet/vparquet_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_reader.cpp @@ -33,7 +33,7 @@ #include "io/fs/file_reader.h" #include "io/fs/file_reader_writer_fwd.h" #include "io/fs/tracing_file_reader.h" -#include "parquet_pred_cmp.h" +#include "parquet_predicate.h" #include "parquet_thrift_util.h" #include "runtime/define_primitive_type.h" #include "runtime/descriptors.h" @@ -329,8 +329,10 @@ void ParquetReader::_init_file_description() { Status ParquetReader::init_reader( const std::vector& all_column_names, std::unordered_map* col_name_to_block_idx, - const VExprContextSPtrs& conjuncts, const TupleDescriptor* tuple_descriptor, - const RowDescriptor* row_descriptor, + const VExprContextSPtrs& conjuncts, + phmap::flat_hash_map>>& + slot_id_to_predicates, + const TupleDescriptor* tuple_descriptor, const RowDescriptor* row_descriptor, const std::unordered_map* colname_to_slot_id, const VExprContextSPtrs* not_single_slot_filter_conjuncts, const std::unordered_map* slot_id_to_filter_conjuncts, @@ -383,21 +385,22 @@ Status ParquetReader::init_reader( } // build column predicates for column lazy read _lazy_read_ctx.conjuncts = conjuncts; + _lazy_read_ctx.slot_id_to_predicates = slot_id_to_predicates; return Status::OK(); } -bool ParquetReader::_exists_in_file(const VSlotRef* slot_ref) const { +bool ParquetReader::_exists_in_file(const std::string& expr_name) const { // `_read_table_columns_set` is used to ensure that only columns actually read are subject to min-max filtering. // This primarily handles cases where partition columns also exist in a file. The reason it's not modified // in `_table_info_node_ptr` is that Iceberg、Hudi has inconsistent requirements for this node; // Iceberg partition evolution need read partition columns from a file. // hudi set `hoodie.datasource.write.drop.partition.columns=false` not need read partition columns from a file. - return _table_info_node_ptr->children_column_exists(slot_ref->expr_name()) && - _read_table_columns_set.contains(slot_ref->expr_name()); + return _table_info_node_ptr->children_column_exists(expr_name) && + _read_table_columns_set.contains(expr_name); } -bool ParquetReader::_type_matches(const VSlotRef* slot_ref) const { - auto* slot = _tuple_descriptor->slots()[slot_ref->column_id()]; +bool ParquetReader::_type_matches(const int cid) const { + auto* slot = _tuple_descriptor->slots()[cid]; auto table_col_type = remove_nullable(slot->type()); const auto& file_col_name = _table_info_node_ptr->children_file_column_name(slot->col_name()); @@ -415,11 +418,12 @@ Status ParquetReader::_update_lazy_read_ctx(const VExprContextSPtrs& new_conjunc new_lazy_read_ctx.fill_missing_columns = std::move(_lazy_read_ctx.fill_missing_columns); _lazy_read_ctx = std::move(new_lazy_read_ctx); - _top_runtime_vexprs.clear(); _push_down_predicates.clear(); // std::unordered_map> std::unordered_map> predicate_columns; + + // TODO(gabriel): we should try to clear too much structs which are used to represent conjuncts and predicates. // visit_slot for lazy mat. std::function visit_slot = [&](VExpr* expr) { if (expr->is_slot_ref()) { @@ -469,32 +473,26 @@ Status ParquetReader::_update_lazy_read_ctx(const VExprContextSPtrs& new_conjunc VExprSPtr new_in_slot = nullptr; if (direct_in_predicate->get_slot_in_expr(new_in_slot)) { expr = new_in_slot; - } else { - continue; } - } else { - continue; - } - } else if (VTopNPred* topn_pred = typeid_cast(expr.get())) { - // top runtime filter : only le && ge. - DCHECK(topn_pred->children().size() > 0); - visit_slot(topn_pred->children()[0].get()); - - if (topn_pred->children()[0]->is_slot_ref()) { - // can min-max filter row group and page index. - // Since the filtering conditions for topn are dynamic, the filtering is - // delayed until create next row group reader. - _top_runtime_vexprs.emplace_back(expr); } - continue; - } else { + } else if (VTopNPred* topn_pred = typeid_cast(expr.get()); + topn_pred == nullptr) { visit_slot(expr.get()); } - - if (check_expr_can_push_down(expr)) { - _push_down_predicates.push_back(AndBlockColumnPredicate::create_unique()); - RETURN_IF_ERROR(convert_predicates({expr}, _useless_predicates, - _push_down_predicates.back(), _arena)); + } + if (!_lazy_read_ctx.slot_id_to_predicates.empty()) { + auto and_pred = AndBlockColumnPredicate::create_unique(); + for (const auto& entry : _lazy_read_ctx.slot_id_to_predicates) { + for (const auto& pred : entry.second) { + if (!_exists_in_file(pred->col_name()) || !_type_matches(pred->column_id())) { + continue; + } + and_pred->add_column_predicate( + SingleColumnBlockPredicate::create_unique(pred->clone(pred->column_id()))); + } + } + if (and_pred->num_of_column_predicate() > 0) { + _push_down_predicates.push_back(std::move(and_pred)); } } @@ -721,28 +719,11 @@ Status ParquetReader::_next_row_group_reader() { RETURN_IF_ERROR(_update_lazy_read_ctx(new_push_down_conjuncts)); } - size_t before_predicate_size = _push_down_predicates.size(); - _push_down_predicates.reserve(before_predicate_size + _top_runtime_vexprs.size()); - for (const auto& vexpr : _top_runtime_vexprs) { - VTopNPred* topn_pred = assert_cast(vexpr.get()); - VExprSPtr binary_expr; - if (topn_pred->get_binary_expr(binary_expr)) { - // for min-max filter. - if (check_expr_can_push_down(binary_expr)) { - _push_down_predicates.push_back(AndBlockColumnPredicate::create_unique()); - RETURN_IF_ERROR(convert_predicates({binary_expr}, _useless_predicates, - _push_down_predicates.back(), _arena)); - } - } - } - candidate_row_ranges.clear(); // The range of lines to be read is determined by the push down predicate. RETURN_IF_ERROR(_process_min_max_bloom_filter( _current_row_group_index, row_group, _push_down_predicates, &candidate_row_ranges)); - _push_down_predicates.resize(before_predicate_size); - std::function column_compressed_size = [&row_group, &column_compressed_size](const FieldSchema* field) -> int64_t { if (field->physical_column_index >= 0) { diff --git a/be/src/vec/exec/format/parquet/vparquet_reader.h b/be/src/vec/exec/format/parquet/vparquet_reader.h index 31a39b442fcb70..02b1d5349fd841 100644 --- a/be/src/vec/exec/format/parquet/vparquet_reader.h +++ b/be/src/vec/exec/format/parquet/vparquet_reader.h @@ -35,7 +35,7 @@ #include "io/fs/file_meta_cache.h" #include "io/fs/file_reader.h" #include "io/fs/file_reader_writer_fwd.h" -#include "parquet_pred_cmp.h" +#include "parquet_predicate.h" #include "util/obj_lru_cache.h" #include "util/runtime_profile.h" #include "vec/exec/format/generic_reader.h" @@ -70,7 +70,7 @@ class VExprContext; namespace doris::vectorized { #include "common/compile_check_begin.h" -class ParquetReader : public GenericReader, public ExprPushDownHelper { +class ParquetReader : public GenericReader { ENABLE_FACTORY_CREATOR(ParquetReader); public: @@ -119,8 +119,10 @@ class ParquetReader : public GenericReader, public ExprPushDownHelper { Status init_reader( const std::vector& all_column_names, std::unordered_map* col_name_to_block_idx, - const VExprContextSPtrs& conjuncts, const TupleDescriptor* tuple_descriptor, - const RowDescriptor* row_descriptor, + const VExprContextSPtrs& conjuncts, + phmap::flat_hash_map>>& + slot_id_to_predicates, + const TupleDescriptor* tuple_descriptor, const RowDescriptor* row_descriptor, const std::unordered_map* colname_to_slot_id, const VExprContextSPtrs* not_single_slot_filter_conjuncts, const std::unordered_map* slot_id_to_filter_conjuncts, @@ -257,8 +259,8 @@ class ParquetReader : public GenericReader, public ExprPushDownHelper { Status _set_read_one_line_impl() override { return Status::OK(); } - bool _exists_in_file(const VSlotRef* slot) const override; - bool _type_matches(const VSlotRef*) const override; + bool _exists_in_file(const std::string& expr_name) const; + bool _type_matches(const int cid) const; // update lazy read context when runtime filter changed Status _update_lazy_read_ctx(const VExprContextSPtrs& new_conjuncts); @@ -348,9 +350,7 @@ class ParquetReader : public GenericReader, public ExprPushDownHelper { std::unordered_map* _col_name_to_block_idx = nullptr; // Since the filtering conditions for topn are dynamic, the filtering is delayed until create next row group reader. - VExprSPtrs _top_runtime_vexprs; std::vector> _push_down_predicates; - std::vector> _useless_predicates; Arena _arena; // when creating a new row group reader, call this function to get the latest runtime filter conjuncts. diff --git a/be/src/vec/exec/format/table/hive_reader.cpp b/be/src/vec/exec/format/table/hive_reader.cpp index ac004230bd0aab..f0465d4c4c0b87 100644 --- a/be/src/vec/exec/format/table/hive_reader.cpp +++ b/be/src/vec/exec/format/table/hive_reader.cpp @@ -213,8 +213,10 @@ ColumnIdResult HiveOrcReader::_create_column_ids_by_top_level_col_index( Status HiveParquetReader::init_reader( const std::vector& read_table_col_names, std::unordered_map* col_name_to_block_idx, - const VExprContextSPtrs& conjuncts, const TupleDescriptor* tuple_descriptor, - const RowDescriptor* row_descriptor, + const VExprContextSPtrs& conjuncts, + phmap::flat_hash_map>>& + slot_id_to_predicates, + const TupleDescriptor* tuple_descriptor, const RowDescriptor* row_descriptor, const std::unordered_map* colname_to_slot_id, const VExprContextSPtrs* not_single_slot_filter_conjuncts, const std::unordered_map* slot_id_to_filter_conjuncts) { @@ -285,8 +287,8 @@ Status HiveParquetReader::init_reader( RETURN_IF_ERROR(init_row_filters()); return parquet_reader->init_reader( - read_table_col_names, col_name_to_block_idx, conjuncts, tuple_descriptor, - row_descriptor, colname_to_slot_id, not_single_slot_filter_conjuncts, + read_table_col_names, col_name_to_block_idx, conjuncts, slot_id_to_predicates, + tuple_descriptor, row_descriptor, colname_to_slot_id, not_single_slot_filter_conjuncts, slot_id_to_filter_conjuncts, table_info_node_ptr, true, column_ids, filter_column_ids); } diff --git a/be/src/vec/exec/format/table/hive_reader.h b/be/src/vec/exec/format/table/hive_reader.h index 5d461a1a5b8675..70f047a1aa2241 100644 --- a/be/src/vec/exec/format/table/hive_reader.h +++ b/be/src/vec/exec/format/table/hive_reader.h @@ -88,8 +88,10 @@ class HiveParquetReader final : public HiveReader { Status init_reader( const std::vector& read_table_col_names, std::unordered_map* col_name_to_block_idx, - const VExprContextSPtrs& conjuncts, const TupleDescriptor* tuple_descriptor, - const RowDescriptor* row_descriptor, + const VExprContextSPtrs& conjuncts, + phmap::flat_hash_map>>& + slot_id_to_predicates, + const TupleDescriptor* tuple_descriptor, const RowDescriptor* row_descriptor, const std::unordered_map* colname_to_slot_id, const VExprContextSPtrs* not_single_slot_filter_conjuncts, const std::unordered_map* slot_id_to_filter_conjuncts); diff --git a/be/src/vec/exec/format/table/hudi_jni_reader.cpp b/be/src/vec/exec/format/table/hudi_jni_reader.cpp index a211e6603921fa..f0f9b540d7dc43 100644 --- a/be/src/vec/exec/format/table/hudi_jni_reader.cpp +++ b/be/src/vec/exec/format/table/hudi_jni_reader.cpp @@ -76,10 +76,8 @@ HudiJniReader::HudiJniReader(const TFileScanRangeParams& scan_params, params, required_fields); } -Status HudiJniReader::init_reader( - const std::unordered_map* colname_to_value_range) { - _colname_to_value_range = colname_to_value_range; - RETURN_IF_ERROR(_jni_connector->init(colname_to_value_range)); +Status HudiJniReader::init_reader() { + RETURN_IF_ERROR(_jni_connector->init()); return _jni_connector->open(_state, _profile); } #include "common/compile_check_end.h" diff --git a/be/src/vec/exec/format/table/hudi_jni_reader.h b/be/src/vec/exec/format/table/hudi_jni_reader.h index c6b63659722c05..8bc7eb9d09cb73 100644 --- a/be/src/vec/exec/format/table/hudi_jni_reader.h +++ b/be/src/vec/exec/format/table/hudi_jni_reader.h @@ -51,13 +51,11 @@ class HudiJniReader : public JniReader { ~HudiJniReader() override = default; - Status init_reader( - const std::unordered_map* colname_to_value_range); + Status init_reader(); private: const TFileScanRangeParams& _scan_params; const THudiFileDesc& _hudi_params; - const std::unordered_map* _colname_to_value_range; }; #include "common/compile_check_end.h" diff --git a/be/src/vec/exec/format/table/hudi_reader.cpp b/be/src/vec/exec/format/table/hudi_reader.cpp index d7bd32ae4987e7..c9c0497d2074fe 100644 --- a/be/src/vec/exec/format/table/hudi_reader.cpp +++ b/be/src/vec/exec/format/table/hudi_reader.cpp @@ -33,8 +33,10 @@ Status HudiReader::get_next_block_inner(Block* block, size_t* read_rows, bool* e Status HudiParquetReader::init_reader( const std::vector& read_table_col_names, std::unordered_map* col_name_to_block_idx, - const VExprContextSPtrs& conjuncts, const TupleDescriptor* tuple_descriptor, - const RowDescriptor* row_descriptor, + const VExprContextSPtrs& conjuncts, + phmap::flat_hash_map>>& + slot_id_to_predicates, + const TupleDescriptor* tuple_descriptor, const RowDescriptor* row_descriptor, const std::unordered_map* colname_to_slot_id, const VExprContextSPtrs* not_single_slot_filter_conjuncts, const std::unordered_map* slot_id_to_filter_conjuncts) { @@ -48,8 +50,8 @@ Status HudiParquetReader::init_reader( _params, _range.table_format_params.hudi_params.schema_id, tuple_descriptor, *field_desc)); return parquet_reader->init_reader(read_table_col_names, col_name_to_block_idx, conjuncts, - tuple_descriptor, row_descriptor, colname_to_slot_id, - not_single_slot_filter_conjuncts, + slot_id_to_predicates, tuple_descriptor, row_descriptor, + colname_to_slot_id, not_single_slot_filter_conjuncts, slot_id_to_filter_conjuncts, table_info_node_ptr); } diff --git a/be/src/vec/exec/format/table/hudi_reader.h b/be/src/vec/exec/format/table/hudi_reader.h index bc8ba25ee5aea2..66fab379e47d97 100644 --- a/be/src/vec/exec/format/table/hudi_reader.h +++ b/be/src/vec/exec/format/table/hudi_reader.h @@ -51,8 +51,10 @@ class HudiParquetReader final : public HudiReader { Status init_reader( const std::vector& read_table_col_names, std::unordered_map* col_name_to_block_idx, - const VExprContextSPtrs& conjuncts, const TupleDescriptor* tuple_descriptor, - const RowDescriptor* row_descriptor, + const VExprContextSPtrs& conjuncts, + phmap::flat_hash_map>>& + slot_id_to_predicates, + const TupleDescriptor* tuple_descriptor, const RowDescriptor* row_descriptor, const std::unordered_map* colname_to_slot_id, const VExprContextSPtrs* not_single_slot_filter_conjuncts, const std::unordered_map* slot_id_to_filter_conjuncts); diff --git a/be/src/vec/exec/format/table/iceberg_reader.cpp b/be/src/vec/exec/format/table/iceberg_reader.cpp index 57ff9d2c70f57c..91bad044cbfca1 100644 --- a/be/src/vec/exec/format/table/iceberg_reader.cpp +++ b/be/src/vec/exec/format/table/iceberg_reader.cpp @@ -177,10 +177,11 @@ Status IcebergTableReader::_equality_delete_base( init_schema = true; } if (auto* parquet_reader = typeid_cast(delete_reader.get())) { + phmap::flat_hash_map>> tmp; RETURN_IF_ERROR(parquet_reader->init_reader( - equality_delete_col_names, &delete_col_name_to_block_idx, {}, nullptr, nullptr, - nullptr, nullptr, nullptr, TableSchemaChangeHelper::ConstNode::get_instance(), - false)); + equality_delete_col_names, &delete_col_name_to_block_idx, {}, tmp, nullptr, + nullptr, nullptr, nullptr, nullptr, + TableSchemaChangeHelper::ConstNode::get_instance(), false)); } else if (auto* orc_reader = typeid_cast(delete_reader.get())) { RETURN_IF_ERROR(orc_reader->init_reader(&equality_delete_col_names, &delete_col_name_to_block_idx, {}, false, {}, @@ -443,8 +444,10 @@ void IcebergTableReader::_gen_position_delete_file_range(Block& block, DeleteFil Status IcebergParquetReader::init_reader( const std::vector& file_col_names, std::unordered_map* col_name_to_block_idx, - const VExprContextSPtrs& conjuncts, const TupleDescriptor* tuple_descriptor, - const RowDescriptor* row_descriptor, + const VExprContextSPtrs& conjuncts, + phmap::flat_hash_map>>& + slot_id_to_predicates, + const TupleDescriptor* tuple_descriptor, const RowDescriptor* row_descriptor, const std::unordered_map* colname_to_slot_id, const VExprContextSPtrs* not_single_slot_filter_conjuncts, const std::unordered_map* slot_id_to_filter_conjuncts) { @@ -487,8 +490,8 @@ Status IcebergParquetReader::init_reader( } } return parquet_reader->init_reader( - _all_required_col_names, _col_name_to_block_idx, conjuncts, tuple_descriptor, - row_descriptor, colname_to_slot_id, not_single_slot_filter_conjuncts, + _all_required_col_names, _col_name_to_block_idx, conjuncts, slot_id_to_predicates, + tuple_descriptor, row_descriptor, colname_to_slot_id, not_single_slot_filter_conjuncts, slot_id_to_filter_conjuncts, table_info_node_ptr, true, column_ids, filter_column_ids); } @@ -559,10 +562,11 @@ Status IcebergParquetReader ::_read_position_delete_file(const TFileRangeDesc* d ParquetReader parquet_delete_reader( _profile, _params, *delete_range, READ_DELETE_FILE_BATCH_SIZE, const_cast(&_state->timezone_obj()), _io_ctx, _state, _meta_cache); + phmap::flat_hash_map>> tmp; RETURN_IF_ERROR(parquet_delete_reader.init_reader( delete_file_col_names, const_cast*>(&DELETE_COL_NAME_TO_BLOCK_IDX), - {}, nullptr, nullptr, nullptr, nullptr, nullptr, + {}, tmp, nullptr, nullptr, nullptr, nullptr, nullptr, TableSchemaChangeHelper::ConstNode::get_instance(), false)); std::unordered_map> diff --git a/be/src/vec/exec/format/table/iceberg_reader.h b/be/src/vec/exec/format/table/iceberg_reader.h index cc32fc90075809..6c9b95d3a3fb66 100644 --- a/be/src/vec/exec/format/table/iceberg_reader.h +++ b/be/src/vec/exec/format/table/iceberg_reader.h @@ -173,8 +173,10 @@ class IcebergParquetReader final : public IcebergTableReader { Status init_reader( const std::vector& file_col_names, std::unordered_map* col_name_to_block_idx, - const VExprContextSPtrs& conjuncts, const TupleDescriptor* tuple_descriptor, - const RowDescriptor* row_descriptor, + const VExprContextSPtrs& conjuncts, + phmap::flat_hash_map>>& + slot_id_to_predicates, + const TupleDescriptor* tuple_descriptor, const RowDescriptor* row_descriptor, const std::unordered_map* colname_to_slot_id, const VExprContextSPtrs* not_single_slot_filter_conjuncts, const std::unordered_map* slot_id_to_filter_conjuncts); diff --git a/be/src/vec/exec/format/table/iceberg_sys_table_jni_reader.cpp b/be/src/vec/exec/format/table/iceberg_sys_table_jni_reader.cpp index ffcae20df9dce2..d3c7ce82e4f822 100644 --- a/be/src/vec/exec/format/table/iceberg_sys_table_jni_reader.cpp +++ b/be/src/vec/exec/format/table/iceberg_sys_table_jni_reader.cpp @@ -30,8 +30,7 @@ IcebergSysTableJniReader::IcebergSysTableJniReader( RuntimeProfile* profile, const TMetaScanRange& meta_scan_range) : JniReader(file_slot_descs, state, profile), _meta_scan_range(meta_scan_range) {} -Status IcebergSysTableJniReader::init_reader( - const std::unordered_map* colname_to_value_range) { +Status IcebergSysTableJniReader::init_reader() { std::vector required_fields; std::vector required_types; for (const auto& desc : _file_slot_descs) { @@ -53,7 +52,7 @@ Status IcebergSysTableJniReader::init_reader( if (_jni_connector == nullptr) { return Status::InternalError("JniConnector failed to initialize"); } - RETURN_IF_ERROR(_jni_connector->init(colname_to_value_range)); + RETURN_IF_ERROR(_jni_connector->init()); return _jni_connector->open(_state, _profile); } diff --git a/be/src/vec/exec/format/table/iceberg_sys_table_jni_reader.h b/be/src/vec/exec/format/table/iceberg_sys_table_jni_reader.h index 982f4357343f58..ec78d9211f08f9 100644 --- a/be/src/vec/exec/format/table/iceberg_sys_table_jni_reader.h +++ b/be/src/vec/exec/format/table/iceberg_sys_table_jni_reader.h @@ -51,8 +51,7 @@ class IcebergSysTableJniReader : public JniReader { ~IcebergSysTableJniReader() override = default; - Status init_reader( - const std::unordered_map* colname_to_value_range); + Status init_reader(); private: const TMetaScanRange& _meta_scan_range; diff --git a/be/src/vec/exec/format/table/lakesoul_jni_reader.cpp b/be/src/vec/exec/format/table/lakesoul_jni_reader.cpp index 2fe821c49ead90..a3af8c5833de05 100644 --- a/be/src/vec/exec/format/table/lakesoul_jni_reader.cpp +++ b/be/src/vec/exec/format/table/lakesoul_jni_reader.cpp @@ -60,9 +60,8 @@ LakeSoulJniReader::LakeSoulJniReader(const TLakeSoulFileDesc& lakesoul_params, params, required_fields); } -Status LakeSoulJniReader::init_reader( - const std::unordered_map* colname_to_value_range) { - RETURN_IF_ERROR(_jni_connector->init(colname_to_value_range)); +Status LakeSoulJniReader::init_reader() { + RETURN_IF_ERROR(_jni_connector->init()); return _jni_connector->open(_state, _profile); } #include "common/compile_check_end.h" diff --git a/be/src/vec/exec/format/table/lakesoul_jni_reader.h b/be/src/vec/exec/format/table/lakesoul_jni_reader.h index 6a659cddc9e0d2..a0c1004208e8ea 100644 --- a/be/src/vec/exec/format/table/lakesoul_jni_reader.h +++ b/be/src/vec/exec/format/table/lakesoul_jni_reader.h @@ -51,8 +51,7 @@ class LakeSoulJniReader : public JniReader { ~LakeSoulJniReader() override = default; - Status init_reader( - const std::unordered_map* colname_to_value_range); + Status init_reader(); private: const TLakeSoulFileDesc& _lakesoul_params; diff --git a/be/src/vec/exec/format/table/max_compute_jni_reader.cpp b/be/src/vec/exec/format/table/max_compute_jni_reader.cpp index e98a7acd3796b8..81999f896173a9 100644 --- a/be/src/vec/exec/format/table/max_compute_jni_reader.cpp +++ b/be/src/vec/exec/format/table/max_compute_jni_reader.cpp @@ -85,10 +85,8 @@ MaxComputeJniReader::MaxComputeJniReader(const MaxComputeTableDescriptor* mc_des "org/apache/doris/maxcompute/MaxComputeJniScanner", params, column_names); } -Status MaxComputeJniReader::init_reader( - const std::unordered_map* colname_to_value_range) { - _colname_to_value_range = colname_to_value_range; - RETURN_IF_ERROR(_jni_connector->init(colname_to_value_range)); +Status MaxComputeJniReader::init_reader() { + RETURN_IF_ERROR(_jni_connector->init()); return _jni_connector->open(_state, _profile); } #include "common/compile_check_end.h" diff --git a/be/src/vec/exec/format/table/max_compute_jni_reader.h b/be/src/vec/exec/format/table/max_compute_jni_reader.h index 4af75a5ab71077..bc83d7d372462c 100644 --- a/be/src/vec/exec/format/table/max_compute_jni_reader.h +++ b/be/src/vec/exec/format/table/max_compute_jni_reader.h @@ -56,14 +56,12 @@ class MaxComputeJniReader : public JniReader { ~MaxComputeJniReader() override = default; - Status init_reader( - const std::unordered_map* colname_to_value_range); + Status init_reader(); private: const MaxComputeTableDescriptor* _table_desc = nullptr; const TMaxComputeFileDesc& _max_compute_params; const TFileRangeDesc& _range; - const std::unordered_map* _colname_to_value_range = nullptr; }; #include "common/compile_check_end.h" diff --git a/be/src/vec/exec/format/table/paimon_jni_reader.cpp b/be/src/vec/exec/format/table/paimon_jni_reader.cpp index f62e7afa14c9b6..3c9afe93eb36b0 100644 --- a/be/src/vec/exec/format/table/paimon_jni_reader.cpp +++ b/be/src/vec/exec/format/table/paimon_jni_reader.cpp @@ -110,10 +110,8 @@ Status PaimonJniReader::get_next_block(Block* block, size_t* read_rows, bool* eo return _jni_connector->get_next_block(block, read_rows, eof); } -Status PaimonJniReader::init_reader( - const std::unordered_map* colname_to_value_range) { - _colname_to_value_range = colname_to_value_range; - RETURN_IF_ERROR(_jni_connector->init(colname_to_value_range)); +Status PaimonJniReader::init_reader() { + RETURN_IF_ERROR(_jni_connector->init()); return _jni_connector->open(_state, _profile); } #include "common/compile_check_end.h" diff --git a/be/src/vec/exec/format/table/paimon_jni_reader.h b/be/src/vec/exec/format/table/paimon_jni_reader.h index 37b320f28cd720..81b5bd68d29a4d 100644 --- a/be/src/vec/exec/format/table/paimon_jni_reader.h +++ b/be/src/vec/exec/format/table/paimon_jni_reader.h @@ -58,11 +58,9 @@ class PaimonJniReader : public JniReader { Status get_next_block(Block* block, size_t* read_rows, bool* eof) override; - Status init_reader( - const std::unordered_map* colname_to_value_range); + Status init_reader(); private: - const std::unordered_map* _colname_to_value_range; int64_t _remaining_table_level_row_count; }; diff --git a/be/src/vec/exec/format/table/paimon_reader.h b/be/src/vec/exec/format/table/paimon_reader.h index d5e2ec5a35da42..30cd788ce89163 100644 --- a/be/src/vec/exec/format/table/paimon_reader.h +++ b/be/src/vec/exec/format/table/paimon_reader.h @@ -104,8 +104,10 @@ class PaimonParquetReader final : public PaimonReader { Status init_reader( const std::vector& read_table_col_names, std::unordered_map* col_name_to_block_idx, - const VExprContextSPtrs& conjuncts, const TupleDescriptor* tuple_descriptor, - const RowDescriptor* row_descriptor, + const VExprContextSPtrs& conjuncts, + phmap::flat_hash_map>>& + slot_id_to_predicates, + const TupleDescriptor* tuple_descriptor, const RowDescriptor* row_descriptor, const std::unordered_map* colname_to_slot_id, const VExprContextSPtrs* not_single_slot_filter_conjuncts, const std::unordered_map* slot_id_to_filter_conjuncts) { @@ -120,8 +122,8 @@ class PaimonParquetReader final : public PaimonReader { *field_desc)); return parquet_reader->init_reader(read_table_col_names, col_name_to_block_idx, conjuncts, - tuple_descriptor, row_descriptor, colname_to_slot_id, - not_single_slot_filter_conjuncts, + slot_id_to_predicates, tuple_descriptor, row_descriptor, + colname_to_slot_id, not_single_slot_filter_conjuncts, slot_id_to_filter_conjuncts, table_info_node_ptr); } }; diff --git a/be/src/vec/exec/format/table/paimon_sys_table_jni_reader.cpp b/be/src/vec/exec/format/table/paimon_sys_table_jni_reader.cpp index 6e9c7f50c7e1c3..ae1088a9a5f799 100644 --- a/be/src/vec/exec/format/table/paimon_sys_table_jni_reader.cpp +++ b/be/src/vec/exec/format/table/paimon_sys_table_jni_reader.cpp @@ -51,10 +51,8 @@ PaimonSysTableJniReader::PaimonSysTableJniReader( "org/apache/doris/paimon/PaimonSysTableJniScanner", std::move(params), required_fields); } -Status PaimonSysTableJniReader::init_reader( - const std::unordered_map* colname_to_value_range) { - _colname_to_value_range = colname_to_value_range; - RETURN_IF_ERROR(_jni_connector->init(colname_to_value_range)); +Status PaimonSysTableJniReader::init_reader() { + RETURN_IF_ERROR(_jni_connector->init()); return _jni_connector->open(_state, _profile); } diff --git a/be/src/vec/exec/format/table/paimon_sys_table_jni_reader.h b/be/src/vec/exec/format/table/paimon_sys_table_jni_reader.h index a6f43899e2db96..c398c89e65155e 100644 --- a/be/src/vec/exec/format/table/paimon_sys_table_jni_reader.h +++ b/be/src/vec/exec/format/table/paimon_sys_table_jni_reader.h @@ -52,11 +52,9 @@ class PaimonSysTableJniReader : public JniReader { ~PaimonSysTableJniReader() override = default; - Status init_reader( - const std::unordered_map* colname_to_value_range); + Status init_reader(); private: - const std::unordered_map* _colname_to_value_range; const TMetaScanRange& _meta_scan_range; }; diff --git a/be/src/vec/exec/format/table/trino_connector_jni_reader.cpp b/be/src/vec/exec/format/table/trino_connector_jni_reader.cpp index c8cc08531121cc..b2b21fda33f352 100644 --- a/be/src/vec/exec/format/table/trino_connector_jni_reader.cpp +++ b/be/src/vec/exec/format/table/trino_connector_jni_reader.cpp @@ -76,9 +76,8 @@ TrinoConnectorJniReader::TrinoConnectorJniReader( "org/apache/doris/trinoconnector/TrinoConnectorJniScanner", params, column_names); } -Status TrinoConnectorJniReader::init_reader( - const std::unordered_map* colname_to_value_range) { - RETURN_IF_ERROR(_jni_connector->init(colname_to_value_range)); +Status TrinoConnectorJniReader::init_reader() { + RETURN_IF_ERROR(_jni_connector->init()); RETURN_IF_ERROR(_set_spi_plugins_dir()); return _jni_connector->open(_state, _profile); } diff --git a/be/src/vec/exec/format/table/trino_connector_jni_reader.h b/be/src/vec/exec/format/table/trino_connector_jni_reader.h index 4c6b1d2e57a67a..63610a38bba0a5 100644 --- a/be/src/vec/exec/format/table/trino_connector_jni_reader.h +++ b/be/src/vec/exec/format/table/trino_connector_jni_reader.h @@ -49,8 +49,7 @@ class TrinoConnectorJniReader : public JniReader { ~TrinoConnectorJniReader() override = default; - Status init_reader( - const std::unordered_map* colname_to_value_range); + Status init_reader(); private: Status _set_spi_plugins_dir(); diff --git a/be/src/vec/exec/jni_connector.cpp b/be/src/vec/exec/jni_connector.cpp index 700b07719b0c96..15e9640d04abd0 100644 --- a/be/src/vec/exec/jni_connector.cpp +++ b/be/src/vec/exec/jni_connector.cpp @@ -103,18 +103,7 @@ Status JniConnector::open(RuntimeState* state, RuntimeProfile* profile) { return Status::OK(); } -Status JniConnector::init( - const std::unordered_map* colname_to_value_range) { - // TODO: This logic need to be changed. - // See the comment of "predicates" field in JniScanner.java - - // _generate_predicates(colname_to_value_range); - // if (_predicates_length != 0 && _predicates != nullptr) { - // int64_t predicates_address = (int64_t)_predicates.get(); - // // We can call org.apache.doris.common.jni.vec.ScanPredicate#parseScanPredicates to parse the - // // serialized predicates in java side. - // _scanner_params.emplace("push_down_predicates", std::to_string(predicates_address)); - // } +Status JniConnector::init() { return Status::OK(); } @@ -502,18 +491,6 @@ Status JniConnector::_fill_struct_column(TableMetaAddress& address, MutableColum return Status::OK(); } -void JniConnector::_generate_predicates( - const std::unordered_map* colname_to_value_range) { - if (colname_to_value_range == nullptr) { - return; - } - for (auto& kv : *colname_to_value_range) { - const std::string& column_name = kv.first; - const ColumnValueRangeType& col_val_range = kv.second; - std::visit([&](auto&& range) { _parse_value_range(range, column_name); }, col_val_range); - } -} - std::string JniConnector::get_jni_type(const DataTypePtr& data_type) { DataTypePtr type = remove_nullable(data_type); std::ostringstream buffer; diff --git a/be/src/vec/exec/jni_connector.h b/be/src/vec/exec/jni_connector.h index 9d92e596994ae3..5a08247f658074 100644 --- a/be/src/vec/exec/jni_connector.h +++ b/be/src/vec/exec/jni_connector.h @@ -224,8 +224,7 @@ class JniConnector : public ProfileCollector { * number_filters(4) | length(4) | column_name | op(4) | scale(4) | num_values(4) | value_length(4) | value | ... * Then, pass the byte array address in configuration map, like "push_down_predicates=${address}" */ - Status init( - const std::unordered_map* colname_to_value_range); + Status init(); /** * Call java side function JniScanner.getNextBatchMeta. The columns information are stored as long array: @@ -375,9 +374,6 @@ class JniConnector : public ProfileCollector { return (long)assert_cast(doris_column).get_data().data(); } - void _generate_predicates( - const std::unordered_map* colname_to_value_range); - template void _parse_value_range(const ColumnValueRange& col_val_range, const std::string& column_name) { diff --git a/be/src/vec/exec/scan/file_scanner.cpp b/be/src/vec/exec/scan/file_scanner.cpp index 8a0f296b2b97f2..5df83abde39ceb 100644 --- a/be/src/vec/exec/scan/file_scanner.cpp +++ b/be/src/vec/exec/scan/file_scanner.cpp @@ -101,17 +101,15 @@ using namespace ErrorCode; const std::string FileScanner::FileReadBytesProfile = "FileReadBytes"; const std::string FileScanner::FileReadTimeProfile = "FileReadTime"; -FileScanner::FileScanner( - RuntimeState* state, pipeline::FileScanLocalState* local_state, int64_t limit, - std::shared_ptr split_source, RuntimeProfile* profile, - ShardedKVCache* kv_cache, - const std::unordered_map* colname_to_value_range, - const std::unordered_map* colname_to_slot_id) +FileScanner::FileScanner(RuntimeState* state, pipeline::FileScanLocalState* local_state, + int64_t limit, + std::shared_ptr split_source, + RuntimeProfile* profile, ShardedKVCache* kv_cache, + const std::unordered_map* colname_to_slot_id) : Scanner(state, local_state, limit, profile), _split_source(split_source), _cur_reader(nullptr), _cur_reader_eof(false), - _colname_to_value_range(colname_to_value_range), _kv_cache(kv_cache), _strict_mode(false), _col_name_to_slot_id(colname_to_slot_id) { @@ -1008,34 +1006,30 @@ Status FileScanner::_get_next_reader() { std::unique_ptr mc_reader = MaxComputeJniReader::create_unique( mc_desc, range.table_format_params.max_compute_params, _file_slot_descs, range, _state, _profile); - init_status = mc_reader->init_reader(_colname_to_value_range); + init_status = mc_reader->init_reader(); _cur_reader = std::move(mc_reader); } else if (range.__isset.table_format_params && range.table_format_params.table_format_type == "paimon") { _cur_reader = PaimonJniReader::create_unique(_file_slot_descs, _state, _profile, range, _params); - init_status = ((PaimonJniReader*)(_cur_reader.get())) - ->init_reader(_colname_to_value_range); + init_status = ((PaimonJniReader*)(_cur_reader.get()))->init_reader(); } else if (range.__isset.table_format_params && range.table_format_params.table_format_type == "hudi") { _cur_reader = HudiJniReader::create_unique(*_params, range.table_format_params.hudi_params, _file_slot_descs, _state, _profile); - init_status = - ((HudiJniReader*)_cur_reader.get())->init_reader(_colname_to_value_range); + init_status = ((HudiJniReader*)_cur_reader.get())->init_reader(); } else if (range.__isset.table_format_params && range.table_format_params.table_format_type == "lakesoul") { _cur_reader = LakeSoulJniReader::create_unique(range.table_format_params.lakesoul_params, _file_slot_descs, _state, _profile); - init_status = ((LakeSoulJniReader*)_cur_reader.get()) - ->init_reader(_colname_to_value_range); + init_status = ((LakeSoulJniReader*)_cur_reader.get())->init_reader(); } else if (range.__isset.table_format_params && range.table_format_params.table_format_type == "trino_connector") { _cur_reader = TrinoConnectorJniReader::create_unique(_file_slot_descs, _state, _profile, range); - init_status = ((TrinoConnectorJniReader*)(_cur_reader.get())) - ->init_reader(_colname_to_value_range); + init_status = ((TrinoConnectorJniReader*)(_cur_reader.get()))->init_reader(); } // Set col_name_to_block_idx for JNI readers to avoid repeated map creation if (_cur_reader) { @@ -1135,8 +1129,7 @@ Status FileScanner::_get_next_reader() { case TFileFormatType::FORMAT_AVRO: { _cur_reader = AvroJNIReader::create_unique(_state, _profile, *_params, _file_slot_descs, range); - init_status = - ((AvroJNIReader*)(_cur_reader.get()))->init_reader(_colname_to_value_range); + init_status = ((AvroJNIReader*)(_cur_reader.get()))->init_reader(); // Set col_name_to_block_idx for JNI readers to avoid repeated map creation if (_cur_reader) { static_cast(_cur_reader.get()) @@ -1226,15 +1219,20 @@ Status FileScanner::_init_parquet_reader(std::unique_ptr&& parque const TFileRangeDesc& range = _current_range; Status init_status = Status::OK(); + phmap::flat_hash_map>> slot_id_to_predicates = + _local_state + ? _local_state->cast()._slot_id_to_predicates + : phmap::flat_hash_map>> {}; if (range.__isset.table_format_params && range.table_format_params.table_format_type == "iceberg") { std::unique_ptr iceberg_reader = IcebergParquetReader::create_unique( std::move(parquet_reader), _profile, _state, *_params, range, _kv_cache, _io_ctx.get(), file_meta_cache_ptr); init_status = iceberg_reader->init_reader( - _file_col_names, &_src_block_name_to_idx, _push_down_conjuncts, _real_tuple_desc, - _default_val_row_desc.get(), _col_name_to_slot_id, - &_not_single_slot_filter_conjuncts, &_slot_id_to_filter_conjuncts); + _file_col_names, &_src_block_name_to_idx, _push_down_conjuncts, + slot_id_to_predicates, _real_tuple_desc, _default_val_row_desc.get(), + _col_name_to_slot_id, &_not_single_slot_filter_conjuncts, + &_slot_id_to_filter_conjuncts); _cur_reader = std::move(iceberg_reader); } else if (range.__isset.table_format_params && range.table_format_params.table_format_type == "paimon") { @@ -1242,9 +1240,10 @@ Status FileScanner::_init_parquet_reader(std::unique_ptr&& parque std::move(parquet_reader), _profile, _state, *_params, range, _io_ctx.get(), file_meta_cache_ptr); init_status = paimon_reader->init_reader( - _file_col_names, &_src_block_name_to_idx, _push_down_conjuncts, _real_tuple_desc, - _default_val_row_desc.get(), _col_name_to_slot_id, - &_not_single_slot_filter_conjuncts, &_slot_id_to_filter_conjuncts); + _file_col_names, &_src_block_name_to_idx, _push_down_conjuncts, + slot_id_to_predicates, _real_tuple_desc, _default_val_row_desc.get(), + _col_name_to_slot_id, &_not_single_slot_filter_conjuncts, + &_slot_id_to_filter_conjuncts); RETURN_IF_ERROR(paimon_reader->init_row_filters()); _cur_reader = std::move(paimon_reader); } else if (range.__isset.table_format_params && @@ -1253,18 +1252,20 @@ Status FileScanner::_init_parquet_reader(std::unique_ptr&& parque std::move(parquet_reader), _profile, _state, *_params, range, _io_ctx.get(), file_meta_cache_ptr); init_status = hudi_reader->init_reader( - _file_col_names, &_src_block_name_to_idx, _push_down_conjuncts, _real_tuple_desc, - _default_val_row_desc.get(), _col_name_to_slot_id, - &_not_single_slot_filter_conjuncts, &_slot_id_to_filter_conjuncts); + _file_col_names, &_src_block_name_to_idx, _push_down_conjuncts, + slot_id_to_predicates, _real_tuple_desc, _default_val_row_desc.get(), + _col_name_to_slot_id, &_not_single_slot_filter_conjuncts, + &_slot_id_to_filter_conjuncts); _cur_reader = std::move(hudi_reader); } else if (range.table_format_params.table_format_type == "hive") { auto hive_reader = HiveParquetReader::create_unique(std::move(parquet_reader), _profile, _state, *_params, range, _io_ctx.get(), &_is_file_slot, file_meta_cache_ptr); init_status = hive_reader->init_reader( - _file_col_names, &_src_block_name_to_idx, _push_down_conjuncts, _real_tuple_desc, - _default_val_row_desc.get(), _col_name_to_slot_id, - &_not_single_slot_filter_conjuncts, &_slot_id_to_filter_conjuncts); + _file_col_names, &_src_block_name_to_idx, _push_down_conjuncts, + slot_id_to_predicates, _real_tuple_desc, _default_val_row_desc.get(), + _col_name_to_slot_id, &_not_single_slot_filter_conjuncts, + &_slot_id_to_filter_conjuncts); _cur_reader = std::move(hive_reader); } else if (range.table_format_params.table_format_type == "tvf") { const FieldDescriptor* parquet_meta = nullptr; @@ -1278,9 +1279,10 @@ Status FileScanner::_init_parquet_reader(std::unique_ptr&& parque RETURN_IF_ERROR(TableSchemaChangeHelper::BuildTableInfoUtil::by_parquet_name( _real_tuple_desc, *parquet_meta, tvf_info_node)); init_status = parquet_reader->init_reader( - _file_col_names, &_src_block_name_to_idx, _push_down_conjuncts, _real_tuple_desc, - _default_val_row_desc.get(), _col_name_to_slot_id, - &_not_single_slot_filter_conjuncts, &_slot_id_to_filter_conjuncts, tvf_info_node); + _file_col_names, &_src_block_name_to_idx, _push_down_conjuncts, + slot_id_to_predicates, _real_tuple_desc, _default_val_row_desc.get(), + _col_name_to_slot_id, &_not_single_slot_filter_conjuncts, + &_slot_id_to_filter_conjuncts, tvf_info_node); _cur_reader = std::move(parquet_reader); } else if (_is_load) { const FieldDescriptor* parquet_meta = nullptr; @@ -1308,9 +1310,10 @@ Status FileScanner::_init_parquet_reader(std::unique_ptr&& parque } init_status = parquet_reader->init_reader( - _file_col_names, &_src_block_name_to_idx, _push_down_conjuncts, _real_tuple_desc, - _default_val_row_desc.get(), _col_name_to_slot_id, - &_not_single_slot_filter_conjuncts, &_slot_id_to_filter_conjuncts, load_info_node); + _file_col_names, &_src_block_name_to_idx, _push_down_conjuncts, + slot_id_to_predicates, _real_tuple_desc, _default_val_row_desc.get(), + _col_name_to_slot_id, &_not_single_slot_filter_conjuncts, + &_slot_id_to_filter_conjuncts, load_info_node); _cur_reader = std::move(parquet_reader); } @@ -1513,8 +1516,6 @@ Status FileScanner::prepare_for_read_lines(const TFileRangeDesc& range) { RETURN_IF_ERROR(_init_expr_ctxes()); // Since only one column is read from the file, there is no need to filter, so set these variables to empty. - static std::unordered_map colname_to_value_range; - _colname_to_value_range = &colname_to_value_range; _push_down_conjuncts.clear(); _not_single_slot_filter_conjuncts.clear(); _slot_id_to_filter_conjuncts.clear(); diff --git a/be/src/vec/exec/scan/file_scanner.h b/be/src/vec/exec/scan/file_scanner.h index 379f6a246f62bd..381f75bf648d8a 100644 --- a/be/src/vec/exec/scan/file_scanner.h +++ b/be/src/vec/exec/scan/file_scanner.h @@ -69,7 +69,6 @@ class FileScanner : public Scanner { FileScanner(RuntimeState* state, pipeline::FileScanLocalState* parent, int64_t limit, std::shared_ptr split_source, RuntimeProfile* profile, ShardedKVCache* kv_cache, - const std::unordered_map* colname_to_value_range, const std::unordered_map* colname_to_slot_id); Status open(RuntimeState* state) override; @@ -125,7 +124,6 @@ class FileScanner : public Scanner { std::unique_ptr _cur_reader; bool _cur_reader_eof = false; - const std::unordered_map* _colname_to_value_range = nullptr; // File source slot descriptors std::vector _file_slot_descs; // col names from _file_slot_descs @@ -286,8 +284,6 @@ class FileScanner : public Scanner { : _local_state->get_push_down_agg_type(); } - int64_t _get_push_down_count() { return _local_state->get_push_down_count(); } - // enable the file meta cache only when // 1. max_external_file_meta_cache_num is > 0 // 2. the file number is less than 1/3 of cache's capacibility diff --git a/be/src/vec/exec/scan/meta_scanner.cpp b/be/src/vec/exec/scan/meta_scanner.cpp index 494c11fc615998..1cc20a3d11ba64 100644 --- a/be/src/vec/exec/scan/meta_scanner.cpp +++ b/be/src/vec/exec/scan/meta_scanner.cpp @@ -72,7 +72,7 @@ Status MetaScanner::open(RuntimeState* state) { auto reader = IcebergSysTableJniReader::create_unique(_tuple_desc->slots(), state, _profile, _scan_range.meta_scan_range); const std::unordered_map colname_to_value_range; - RETURN_IF_ERROR(reader->init_reader(&colname_to_value_range)); + RETURN_IF_ERROR(reader->init_reader()); static_cast(reader.get()) ->set_col_name_to_block_idx(&_src_block_name_to_idx); _reader = std::move(reader); @@ -80,7 +80,7 @@ Status MetaScanner::open(RuntimeState* state) { auto reader = PaimonSysTableJniReader::create_unique(_tuple_desc->slots(), state, _profile, _scan_range.meta_scan_range); const std::unordered_map colname_to_value_range; - RETURN_IF_ERROR(reader->init_reader(&colname_to_value_range)); + RETURN_IF_ERROR(reader->init_reader()); static_cast(reader.get()) ->set_col_name_to_block_idx(&_src_block_name_to_idx); _reader = std::move(reader); diff --git a/be/src/vec/exec/scan/olap_scanner.cpp b/be/src/vec/exec/scan/olap_scanner.cpp index fa808121a088de..5e8d808dbe238b 100644 --- a/be/src/vec/exec/scan/olap_scanner.cpp +++ b/be/src/vec/exec/scan/olap_scanner.cpp @@ -76,10 +76,7 @@ OlapScanner::OlapScanner(pipeline::ScanLocalStateBase* parent, OlapScanner::Para .version = {0, params.version}, .start_key {}, .end_key {}, - .conditions {}, - .bloom_filters {}, - .bitmap_filters {}, - .in_filters {}, + .predicates {}, .function_filters {}, .delete_predicates {}, .target_cast_type_for_variants {}, @@ -271,9 +268,10 @@ Status OlapScanner::prepare() { } // Initialize tablet_reader_params - RETURN_IF_ERROR(_init_tablet_reader_params(_key_ranges, local_state->_olap_filters, - local_state->_filter_predicates, - local_state->_push_down_functions)); + RETURN_IF_ERROR(_init_tablet_reader_params( + local_state->_parent->cast()._slot_id_to_slot_desc, + _key_ranges, local_state->_slot_id_to_predicates, + local_state->_push_down_functions)); } // add read columns in profile @@ -329,9 +327,10 @@ Status OlapScanner::open(RuntimeState* state) { // it will be called under tablet read lock because capture rs readers need Status OlapScanner::_init_tablet_reader_params( + const phmap::flat_hash_map& slot_id_to_slot_desc, const std::vector& key_ranges, - const std::vector>& filters, - const pipeline::FilterPredicates& filter_predicates, + const phmap::flat_hash_map>>& + slot_to_predicates, const std::vector& function_filters) { // if the table with rowset [0-x] or [0-1] [2-y], and [0-1] is empty const bool single_version = _tablet_reader_params.has_single_version(); @@ -375,27 +374,26 @@ Status OlapScanner::_init_tablet_reader_params( ((pipeline::OlapScanLocalState*)_local_state)->_cast_types_for_variants) { _tablet_reader_params.target_cast_type_for_variants[ele.first] = ele.second; }; - // Condition - for (auto& filter : filters) { - _tablet_reader_params.conditions.push_back(filter); + auto& tablet_schema = _tablet_reader_params.tablet_schema; + for (auto& predicates : slot_to_predicates) { + const int sid = predicates.first; + DCHECK(slot_id_to_slot_desc.contains(sid)); + int32_t index = + tablet_schema->field_index(slot_id_to_slot_desc.find(sid)->second->col_name()); + if (index < 0) { + throw Exception( + Status::InternalError("Column {} not found in tablet schema", + slot_id_to_slot_desc.find(sid)->second->col_name())); + } + for (auto& predicate : predicates.second) { + _tablet_reader_params.predicates.push_back(predicate->clone(index)); + } } - std::copy(filter_predicates.bloom_filters.cbegin(), filter_predicates.bloom_filters.cend(), - std::inserter(_tablet_reader_params.bloom_filters, - _tablet_reader_params.bloom_filters.begin())); - std::copy(filter_predicates.bitmap_filters.cbegin(), filter_predicates.bitmap_filters.cend(), - std::inserter(_tablet_reader_params.bitmap_filters, - _tablet_reader_params.bitmap_filters.begin())); - - std::copy(filter_predicates.in_filters.cbegin(), filter_predicates.in_filters.cend(), - std::inserter(_tablet_reader_params.in_filters, - _tablet_reader_params.in_filters.begin())); - std::copy(function_filters.cbegin(), function_filters.cend(), std::inserter(_tablet_reader_params.function_filters, _tablet_reader_params.function_filters.begin())); - auto& tablet_schema = _tablet_reader_params.tablet_schema; // Merge the columns in delete predicate that not in latest schema in to current tablet schema for (auto& del_pred : _tablet_reader_params.delete_predicates) { tablet_schema->merge_dropped_columns(*del_pred->tablet_schema()); diff --git a/be/src/vec/exec/scan/olap_scanner.h b/be/src/vec/exec/scan/olap_scanner.h index 27e09f298172f2..4b8d866ba25fa7 100644 --- a/be/src/vec/exec/scan/olap_scanner.h +++ b/be/src/vec/exec/scan/olap_scanner.h @@ -88,10 +88,12 @@ class OlapScanner : public Scanner { void _collect_profile_before_close() override; private: - Status _init_tablet_reader_params(const std::vector& key_ranges, - const std::vector>& filters, - const pipeline::FilterPredicates& filter_predicates, - const std::vector& function_filters); + Status _init_tablet_reader_params( + const phmap::flat_hash_map& slot_id_to_slot_desc, + const std::vector& key_ranges, + const phmap::flat_hash_map>>& + predicates, + const std::vector& function_filters); [[nodiscard]] Status _init_return_columns(); [[nodiscard]] Status _init_variant_columns(); diff --git a/be/src/vec/exprs/vexpr.h b/be/src/vec/exprs/vexpr.h index 03694652bf0092..4e7f3325b8616d 100644 --- a/be/src/vec/exprs/vexpr.h +++ b/be/src/vec/exprs/vexpr.h @@ -196,7 +196,7 @@ class VExpr { virtual bool is_literal() const { return false; } - MOCK_FUNCTION TExprNodeType::type node_type() const { return _node_type; } + virtual TExprNodeType::type node_type() const { return _node_type; } TExprOpcode::type op() const { return _opcode; } @@ -210,6 +210,7 @@ class VExpr { return std::ranges::any_of(_children.begin(), _children.end(), [](VExprSPtr child) { return child->is_rf_wrapper(); }); } + virtual bool is_topn_filter() const { return false; } virtual void do_judge_selectivity(uint64_t filter_rows, uint64_t input_rows) { for (auto child : _children) { diff --git a/be/src/vec/exprs/vexpr_context.h b/be/src/vec/exprs/vexpr_context.h index e511260af57e4c..04bf7c20f7c608 100644 --- a/be/src/vec/exprs/vexpr_context.h +++ b/be/src/vec/exprs/vexpr_context.h @@ -180,7 +180,7 @@ class VExprContext { [[nodiscard]] Status execute_const_expr(ColumnWithTypeAndName& result); - VExprSPtr root() { return _root; } + VExprSPtr root() const { return _root; } void set_root(const VExprSPtr& expr) { _root = expr; } void set_index_context(std::shared_ptr index_context) { _index_context = std::move(index_context); diff --git a/be/src/vec/exprs/vruntimefilter_wrapper.h b/be/src/vec/exprs/vruntimefilter_wrapper.h index 9db2b295c6cb65..567899adc7f1da 100644 --- a/be/src/vec/exprs/vruntimefilter_wrapper.h +++ b/be/src/vec/exprs/vruntimefilter_wrapper.h @@ -62,6 +62,7 @@ class VRuntimeFilterWrapper final : public VExpr { void close(VExprContext* context, FunctionContext::FunctionStateScope scope) override; const std::string& expr_name() const override; const VExprSPtrs& children() const override { return _impl->children(); } + TExprNodeType::type node_type() const override { return _impl->node_type(); } VExprSPtr get_impl() const override { return _impl; } diff --git a/be/src/vec/exprs/vtopn_pred.h b/be/src/vec/exprs/vtopn_pred.h index d14239c1a3e7ac..3c2db89b71914c 100644 --- a/be/src/vec/exprs/vtopn_pred.h +++ b/be/src/vec/exprs/vtopn_pred.h @@ -45,6 +45,7 @@ class VTopNPred : public VExpr { _source_node_id(source_node_id), _expr_name(fmt::format("VTopNPred(source_node_id={})", _source_node_id)), _target_ctx(std::move(target_ctx)) {} + bool is_topn_filter() const override { return true; } static Status create_vtopn_pred(const TExpr& target_expr, int source_node_id, vectorized::VExprSPtr& expr) { @@ -63,6 +64,8 @@ class VTopNPred : public VExpr { return Status::OK(); } + int source_node_id() const { return _source_node_id; } + Status prepare(RuntimeState* state, const RowDescriptor& desc, VExprContext* context) override { _predicate = &state->get_query_ctx()->get_runtime_predicate(_source_node_id); RETURN_IF_ERROR_OR_PREPARED(VExpr::prepare(state, desc, context)); diff --git a/be/src/vec/functions/in.h b/be/src/vec/functions/in.h index 6324cdfb97f2d8..e52841df682458 100644 --- a/be/src/vec/functions/in.h +++ b/be/src/vec/functions/in.h @@ -56,7 +56,7 @@ using ColumnString = ColumnStr; struct InState { bool use_set = true; - std::unique_ptr hybrid_set; + std::shared_ptr hybrid_set; }; template diff --git a/be/test/olap/block_column_predicate_test.cpp b/be/test/olap/block_column_predicate_test.cpp index beb5c16d7407ad..ee53881980ee91 100644 --- a/be/test/olap/block_column_predicate_test.cpp +++ b/be/test/olap/block_column_predicate_test.cpp @@ -82,9 +82,9 @@ TEST_F(BlockColumnPredicateTest, SINGLE_COLUMN_VEC) { int value = 5; int rows = 10; int col_idx = 0; - std::unique_ptr pred( - new ComparisonPredicateBase(col_idx, value)); - SingleColumnBlockPredicate single_column_block_pred(pred.get()); + std::shared_ptr pred( + new ComparisonPredicateBase(col_idx, "", value)); + SingleColumnBlockPredicate single_column_block_pred(pred); std::vector sel_idx(rows); uint16_t selected_size = rows; @@ -110,12 +110,12 @@ TEST_F(BlockColumnPredicateTest, AND_MUTI_COLUMN_VEC) { int great_value = 3; int rows = 10; int col_idx = 0; - std::unique_ptr less_pred( - new ComparisonPredicateBase(col_idx, less_value)); - std::unique_ptr great_pred( - new ComparisonPredicateBase(col_idx, great_value)); - auto single_less_pred = SingleColumnBlockPredicate::create_unique(less_pred.get()); - auto single_great_pred = SingleColumnBlockPredicate::create_unique(great_pred.get()); + std::shared_ptr less_pred( + new ComparisonPredicateBase(col_idx, "", less_value)); + std::shared_ptr great_pred( + new ComparisonPredicateBase(col_idx, "", great_value)); + auto single_less_pred = SingleColumnBlockPredicate::create_unique(less_pred); + auto single_great_pred = SingleColumnBlockPredicate::create_unique(great_pred); AndBlockColumnPredicate and_block_column_pred; and_block_column_pred.add_column_predicate(std::move(single_less_pred)); @@ -145,12 +145,12 @@ TEST_F(BlockColumnPredicateTest, OR_MUTI_COLUMN_VEC) { int great_value = 3; int rows = 10; int col_idx = 0; - std::unique_ptr less_pred( - new ComparisonPredicateBase(col_idx, less_value)); - std::unique_ptr great_pred( - new ComparisonPredicateBase(col_idx, great_value)); - auto single_less_pred = SingleColumnBlockPredicate::create_unique(less_pred.get()); - auto single_great_pred = SingleColumnBlockPredicate::create_unique(great_pred.get()); + std::shared_ptr less_pred( + new ComparisonPredicateBase(col_idx, "", less_value)); + std::shared_ptr great_pred( + new ComparisonPredicateBase(col_idx, "", great_value)); + auto single_less_pred = SingleColumnBlockPredicate::create_unique(less_pred); + auto single_great_pred = SingleColumnBlockPredicate::create_unique(great_pred); OrBlockColumnPredicate or_block_column_pred; or_block_column_pred.add_column_predicate(std::move(single_less_pred)); @@ -180,25 +180,25 @@ TEST_F(BlockColumnPredicateTest, OR_AND_MUTI_COLUMN_VEC) { int great_value = 3; int rows = 10; int col_idx = 0; - std::unique_ptr less_pred( - new ComparisonPredicateBase(0, less_value)); - std::unique_ptr great_pred( - new ComparisonPredicateBase(0, great_value)); - std::unique_ptr less_pred1( - new ComparisonPredicateBase(0, great_value)); + std::shared_ptr less_pred( + new ComparisonPredicateBase(0, "", less_value)); + std::shared_ptr great_pred( + new ComparisonPredicateBase(0, "", great_value)); + std::shared_ptr less_pred1( + new ComparisonPredicateBase(0, "", great_value)); // Test for and or single // (column < 5 and column > 3) or column < 3 auto and_block_column_pred = AndBlockColumnPredicate::create_unique(); and_block_column_pred->add_column_predicate( - SingleColumnBlockPredicate::create_unique(less_pred.get())); + SingleColumnBlockPredicate::create_unique(less_pred)); and_block_column_pred->add_column_predicate( - SingleColumnBlockPredicate::create_unique(great_pred.get())); + SingleColumnBlockPredicate::create_unique(great_pred)); OrBlockColumnPredicate or_block_column_pred; or_block_column_pred.add_column_predicate(std::move(and_block_column_pred)); or_block_column_pred.add_column_predicate( - SingleColumnBlockPredicate::create_unique(less_pred1.get())); + SingleColumnBlockPredicate::create_unique(less_pred1)); std::vector sel_idx(rows); uint16_t selected_size = rows; @@ -222,13 +222,13 @@ TEST_F(BlockColumnPredicateTest, OR_AND_MUTI_COLUMN_VEC) { // column < 3 or (column < 5 and column > 3) auto and_block_column_pred1 = AndBlockColumnPredicate::create_unique(); and_block_column_pred1->add_column_predicate( - SingleColumnBlockPredicate::create_unique(less_pred.get())); + SingleColumnBlockPredicate::create_unique(less_pred)); and_block_column_pred1->add_column_predicate( - SingleColumnBlockPredicate::create_unique(great_pred.get())); + SingleColumnBlockPredicate::create_unique(great_pred)); OrBlockColumnPredicate or_block_column_pred1; or_block_column_pred1.add_column_predicate( - SingleColumnBlockPredicate::create_unique(less_pred1.get())); + SingleColumnBlockPredicate::create_unique(less_pred1)); or_block_column_pred1.add_column_predicate(std::move(and_block_column_pred1)); selected_size = or_block_column_pred1.evaluate(block, sel_idx.data(), selected_size); @@ -247,25 +247,25 @@ TEST_F(BlockColumnPredicateTest, AND_OR_MUTI_COLUMN_VEC) { int great_value = 3; int rows = 10; int col_idx = 0; - std::unique_ptr less_pred( - new ComparisonPredicateBase(0, less_value)); - std::unique_ptr great_pred( - new ComparisonPredicateBase(0, great_value)); - std::unique_ptr less_pred1( - new ComparisonPredicateBase(0, great_value)); + std::shared_ptr less_pred( + new ComparisonPredicateBase(0, "", less_value)); + std::shared_ptr great_pred( + new ComparisonPredicateBase(0, "", great_value)); + std::shared_ptr less_pred1( + new ComparisonPredicateBase(0, "", great_value)); // Test for and or single // (column < 5 or column < 3) and column > 3 auto or_block_column_pred = OrBlockColumnPredicate::create_unique(); or_block_column_pred->add_column_predicate( - SingleColumnBlockPredicate::create_unique(less_pred.get())); + SingleColumnBlockPredicate::create_unique(less_pred)); or_block_column_pred->add_column_predicate( - SingleColumnBlockPredicate::create_unique(less_pred1.get())); + SingleColumnBlockPredicate::create_unique(less_pred1)); AndBlockColumnPredicate and_block_column_pred; and_block_column_pred.add_column_predicate(std::move(or_block_column_pred)); and_block_column_pred.add_column_predicate( - SingleColumnBlockPredicate::create_unique(great_pred.get())); + SingleColumnBlockPredicate::create_unique(great_pred)); std::vector sel_idx(rows); uint16_t selected_size = rows; @@ -287,13 +287,13 @@ TEST_F(BlockColumnPredicateTest, AND_OR_MUTI_COLUMN_VEC) { // column > 3 and (column < 5 or column < 3) auto or_block_column_pred1 = OrBlockColumnPredicate::create_unique(); or_block_column_pred1->add_column_predicate( - SingleColumnBlockPredicate::create_unique(less_pred.get())); + SingleColumnBlockPredicate::create_unique(less_pred)); or_block_column_pred1->add_column_predicate( - SingleColumnBlockPredicate::create_unique(less_pred1.get())); + SingleColumnBlockPredicate::create_unique(less_pred1)); AndBlockColumnPredicate and_block_column_pred1; and_block_column_pred1.add_column_predicate( - SingleColumnBlockPredicate::create_unique(great_pred.get())); + SingleColumnBlockPredicate::create_unique(great_pred)); and_block_column_pred1.add_column_predicate(std::move(or_block_column_pred1)); EXPECT_EQ(selected_size, 1); @@ -305,8 +305,9 @@ void single_column_predicate_test_func(const std::pair::CppType check_value, bool expect_match) { int col_idx = 0; - std::unique_ptr pred(new ComparisonPredicateBase(col_idx, check_value)); - SingleColumnBlockPredicate single_column_block_pred(pred.get()); + std::shared_ptr pred( + new ComparisonPredicateBase(col_idx, "", check_value)); + SingleColumnBlockPredicate single_column_block_pred(pred); bool matched = single_column_block_pred.evaluate_and(statistic); EXPECT_EQ(matched, expect_match); @@ -1331,8 +1332,9 @@ void single_column_predicate_test_func(const segment_v2::BloomFilter* bf, typename PrimitiveTypeTraits::CppType check_value, bool expect_match) { int col_idx = 0; - std::shared_ptr pred(new ComparisonPredicateBase(col_idx, check_value)); - SingleColumnBlockPredicate single_column_block_pred(pred.get()); + std::shared_ptr pred( + new ComparisonPredicateBase(col_idx, "", check_value)); + SingleColumnBlockPredicate single_column_block_pred(pred); bool matched = single_column_block_pred.evaluate_and(bf); EXPECT_EQ(matched, expect_match); @@ -1386,9 +1388,9 @@ TEST_F(BlockColumnPredicateTest, PARQUET_COMPARISON_PREDICATE) { {// EQ int value = 5; int col_idx = 0; - std::unique_ptr pred( - new ComparisonPredicateBase(col_idx, value)); - SingleColumnBlockPredicate single_column_block_pred(pred.get()); + std::shared_ptr pred( + new ComparisonPredicateBase(col_idx, "", value)); + SingleColumnBlockPredicate single_column_block_pred(pred); std::unique_ptr parquet_field_col1 = std::make_unique(); parquet_field_col1->name = "col1"; @@ -1463,9 +1465,9 @@ TEST_F(BlockColumnPredicateTest, PARQUET_COMPARISON_PREDICATE) { // NE int value = 5; int col_idx = 0; - std::unique_ptr pred( - new ComparisonPredicateBase(col_idx, value)); - SingleColumnBlockPredicate single_column_block_pred(pred.get()); + std::shared_ptr pred( + new ComparisonPredicateBase(col_idx, "", value)); + SingleColumnBlockPredicate single_column_block_pred(pred); std::unique_ptr parquet_field_col1 = std::make_unique(); parquet_field_col1->name = "col1"; @@ -1532,9 +1534,9 @@ TEST_F(BlockColumnPredicateTest, PARQUET_COMPARISON_PREDICATE) { // GE int value = 5; int col_idx = 0; - std::unique_ptr pred( - new ComparisonPredicateBase(col_idx, value)); - SingleColumnBlockPredicate single_column_block_pred(pred.get()); + std::shared_ptr pred( + new ComparisonPredicateBase(col_idx, "", value)); + SingleColumnBlockPredicate single_column_block_pred(pred); std::unique_ptr parquet_field_col1 = std::make_unique(); parquet_field_col1->name = "col1"; @@ -1601,9 +1603,9 @@ TEST_F(BlockColumnPredicateTest, PARQUET_COMPARISON_PREDICATE) { // LE int value = 5; int col_idx = 0; - std::unique_ptr pred( - new ComparisonPredicateBase(col_idx, value)); - SingleColumnBlockPredicate single_column_block_pred(pred.get()); + std::shared_ptr pred( + new ComparisonPredicateBase(col_idx, "", value)); + SingleColumnBlockPredicate single_column_block_pred(pred); std::unique_ptr parquet_field_col1 = std::make_unique(); parquet_field_col1->name = "col1"; @@ -1673,9 +1675,9 @@ TEST_F(BlockColumnPredicateTest, PARQUET_COMPARISON_PREDICATE) { // EQ float value = 5.0; int col_idx = 0; - std::unique_ptr pred( - new ComparisonPredicateBase(col_idx, value)); - SingleColumnBlockPredicate single_column_block_pred(pred.get()); + std::shared_ptr pred( + new ComparisonPredicateBase(col_idx, "", value)); + SingleColumnBlockPredicate single_column_block_pred(pred); std::unique_ptr parquet_field_col1 = std::make_unique(); parquet_field_col1->name = "col1"; @@ -1767,9 +1769,9 @@ TEST_F(BlockColumnPredicateTest, PARQUET_COMPARISON_PREDICATE) { // NE float value = 5; int col_idx = 0; - std::unique_ptr pred( - new ComparisonPredicateBase(col_idx, value)); - SingleColumnBlockPredicate single_column_block_pred(pred.get()); + std::shared_ptr pred( + new ComparisonPredicateBase(col_idx, "", value)); + SingleColumnBlockPredicate single_column_block_pred(pred); std::unique_ptr parquet_field_col1 = std::make_unique(); parquet_field_col1->name = "col1"; @@ -1836,9 +1838,9 @@ TEST_F(BlockColumnPredicateTest, PARQUET_COMPARISON_PREDICATE) { // GE float value = 5.0; int col_idx = 0; - std::unique_ptr pred( - new ComparisonPredicateBase(col_idx, value)); - SingleColumnBlockPredicate single_column_block_pred(pred.get()); + std::shared_ptr pred( + new ComparisonPredicateBase(col_idx, "", value)); + SingleColumnBlockPredicate single_column_block_pred(pred); std::unique_ptr parquet_field_col1 = std::make_unique(); parquet_field_col1->name = "col1"; @@ -1905,9 +1907,9 @@ TEST_F(BlockColumnPredicateTest, PARQUET_COMPARISON_PREDICATE) { // LE float value = 5.0; int col_idx = 0; - std::unique_ptr pred( - new ComparisonPredicateBase(col_idx, value)); - SingleColumnBlockPredicate single_column_block_pred(pred.get()); + std::shared_ptr pred( + new ComparisonPredicateBase(col_idx, "", value)); + SingleColumnBlockPredicate single_column_block_pred(pred); std::unique_ptr parquet_field_col1 = std::make_unique(); parquet_field_col1->name = "col1"; @@ -1980,11 +1982,10 @@ TEST_F(BlockColumnPredicateTest, PARQUET_IN_PREDICATE) { int col_idx = 0; auto hybrid_set = std::make_shared>(false); hybrid_set->insert(&value); - std::unique_ptr pred( - new InListPredicateBase>(col_idx, - hybrid_set)); - SingleColumnBlockPredicate single_column_block_pred(pred.get()); + std::shared_ptr pred( + new InListPredicateBase( + col_idx, "", hybrid_set, false)); + SingleColumnBlockPredicate single_column_block_pred(pred); std::unique_ptr parquet_field_col1 = std::make_unique(); parquet_field_col1->name = "col1"; @@ -2027,11 +2028,10 @@ TEST_F(BlockColumnPredicateTest, PARQUET_IN_PREDICATE) { int col_idx = 0; auto hybrid_set = std::make_shared>(false); hybrid_set->insert(&value); - std::unique_ptr pred( - new InListPredicateBase>(col_idx, - hybrid_set)); - SingleColumnBlockPredicate single_column_block_pred(pred.get()); + std::shared_ptr pred( + new InListPredicateBase( + col_idx, "", hybrid_set, false)); + SingleColumnBlockPredicate single_column_block_pred(pred); std::unique_ptr parquet_field_col1 = std::make_unique(); parquet_field_col1->name = "col1"; @@ -2076,9 +2076,9 @@ TEST_F(BlockColumnPredicateTest, PARQUET_IN_PREDICATE) { TEST_F(BlockColumnPredicateTest, PARQUET_COMPARISON_PREDICATE_BLOOM_FILTER) { const int value = 42; const int col_idx = 0; - std::unique_ptr pred( - new ComparisonPredicateBase(col_idx, value)); - SingleColumnBlockPredicate single_column_block_pred(pred.get()); + std::shared_ptr pred( + new ComparisonPredicateBase(col_idx, "", value)); + SingleColumnBlockPredicate single_column_block_pred(pred); auto parquet_field = std::make_unique(); parquet_field->name = "col1"; @@ -2238,10 +2238,10 @@ TEST_F(BlockColumnPredicateTest, PARQUET_IN_PREDICATE_BLOOM_FILTER) { auto hybrid_set = std::make_shared>(false); const int included_value = 7; hybrid_set->insert(&included_value); - std::unique_ptr pred( - new InListPredicateBase>(col_idx, hybrid_set)); - SingleColumnBlockPredicate single_column_block_pred(pred.get()); + std::shared_ptr pred( + new InListPredicateBase(col_idx, "", hybrid_set, + false)); + SingleColumnBlockPredicate single_column_block_pred(pred); auto parquet_field = std::make_unique(); parquet_field->name = "col1"; @@ -2370,8 +2370,9 @@ TEST_F(BlockColumnPredicateTest, PARQUET_IN_PREDICATE_BLOOM_FILTER) { TEST_F(BlockColumnPredicateTest, NULL_PREDICATE) { { int col_idx = 0; - std::unique_ptr pred(new NullPredicate(col_idx, true)); - SingleColumnBlockPredicate single_column_block_pred(pred.get()); + std::shared_ptr pred( + new NullPredicate(col_idx, "", true, PrimitiveType::TYPE_INT)); + SingleColumnBlockPredicate single_column_block_pred(pred); std::unique_ptr parquet_field_col1 = std::make_unique(); parquet_field_col1->name = "col1"; @@ -2407,8 +2408,9 @@ TEST_F(BlockColumnPredicateTest, NULL_PREDICATE) { } { int col_idx = 0; - std::unique_ptr pred(new NullPredicate(col_idx, false)); - SingleColumnBlockPredicate single_column_block_pred(pred.get()); + std::shared_ptr pred( + new NullPredicate(col_idx, "", false, PrimitiveType::TYPE_INT)); + SingleColumnBlockPredicate single_column_block_pred(pred); std::unique_ptr parquet_field_col1 = std::make_unique(); parquet_field_col1->name = "col1"; @@ -2462,14 +2464,14 @@ TEST_F(BlockColumnPredicateTest, COMBINED_PREDICATE) { std::unique_ptr true_predicate; int col_idx = 0; int value = 5; - std::unique_ptr pred( - new ComparisonPredicateBase(col_idx, value)); - true_predicate = std::make_unique(pred.get()); + std::shared_ptr pred( + new ComparisonPredicateBase(col_idx, "", value)); + true_predicate = std::make_unique(pred); std::unique_ptr false_predicate; - std::unique_ptr pred2( - new ComparisonPredicateBase(col_idx, value)); - false_predicate = std::make_unique(pred2.get()); + std::shared_ptr pred2( + new ComparisonPredicateBase(col_idx, "", value)); + false_predicate = std::make_unique(pred2); std::unique_ptr parquet_field_col1 = std::make_unique(); @@ -2507,14 +2509,14 @@ TEST_F(BlockColumnPredicateTest, COMBINED_PREDICATE) { std::unique_ptr true_predicate; int col_idx = 0; int value = 5; - std::unique_ptr pred( - new ComparisonPredicateBase(col_idx, value)); - true_predicate = std::make_unique(pred.get()); + std::shared_ptr pred( + new ComparisonPredicateBase(col_idx, "", value)); + true_predicate = std::make_unique(pred); std::unique_ptr true_predicate2; - std::unique_ptr pred2( - new ComparisonPredicateBase(col_idx, value)); - true_predicate2 = std::make_unique(pred2.get()); + std::shared_ptr pred2( + new ComparisonPredicateBase(col_idx, "", value)); + true_predicate2 = std::make_unique(pred2); std::unique_ptr parquet_field_col1 = std::make_unique(); @@ -2552,14 +2554,14 @@ TEST_F(BlockColumnPredicateTest, COMBINED_PREDICATE) { std::unique_ptr true_predicate; int col_idx = 0; int value = 5; - std::unique_ptr pred( - new ComparisonPredicateBase(col_idx, value)); - true_predicate = std::make_unique(pred.get()); + std::shared_ptr pred( + new ComparisonPredicateBase(col_idx, "", value)); + true_predicate = std::make_unique(pred); std::unique_ptr false_predicate; - std::unique_ptr pred2( - new ComparisonPredicateBase(col_idx, value)); - false_predicate = std::make_unique(pred2.get()); + std::shared_ptr pred2( + new ComparisonPredicateBase(col_idx, "", value)); + false_predicate = std::make_unique(pred2); std::unique_ptr parquet_field_col1 = std::make_unique(); @@ -2597,14 +2599,14 @@ TEST_F(BlockColumnPredicateTest, COMBINED_PREDICATE) { std::unique_ptr false_predicate2; int col_idx = 0; int value = 5; - std::unique_ptr pred( - new ComparisonPredicateBase(col_idx, value)); - false_predicate2 = std::make_unique(pred.get()); + std::shared_ptr pred( + new ComparisonPredicateBase(col_idx, "", value)); + false_predicate2 = std::make_unique(pred); std::unique_ptr false_predicate; - std::unique_ptr pred2( - new ComparisonPredicateBase(col_idx, value)); - false_predicate = std::make_unique(pred2.get()); + std::shared_ptr pred2( + new ComparisonPredicateBase(col_idx, "", value)); + false_predicate = std::make_unique(pred2); std::unique_ptr parquet_field_col1 = std::make_unique(); @@ -2642,9 +2644,9 @@ TEST_F(BlockColumnPredicateTest, COMBINED_PREDICATE) { int col_idx = 0; int value = 5; std::unique_ptr false_predicate; - std::unique_ptr pred2( - new ComparisonPredicateBase(col_idx, value)); - false_predicate = std::make_unique(pred2.get()); + std::shared_ptr pred2( + new ComparisonPredicateBase(col_idx, "", value)); + false_predicate = std::make_unique(pred2); std::unique_ptr parquet_field_col1 = std::make_unique(); diff --git a/be/test/olap/date_bloom_filter_test.cpp b/be/test/olap/date_bloom_filter_test.cpp index 6ef6eacb3e7858..383a0869b6bf4a 100644 --- a/be/test/olap/date_bloom_filter_test.cpp +++ b/be/test/olap/date_bloom_filter_test.cpp @@ -178,7 +178,7 @@ TEST_F(DateBloomFilterTest, query_index_test) { auto test = [&](const std::string& query_string, bool result) { auto date = timestamp_from_date(query_string); std::unique_ptr> date_pred( - new ComparisonPredicateBase(0, date)); + new ComparisonPredicateBase(0, "", date)); EXPECT_EQ(date_pred->evaluate_and(bf.get()), result); }; test("2024-11-08", true); @@ -200,7 +200,7 @@ TEST_F(DateBloomFilterTest, query_index_test) { auto test = [&](const std::string& query_string, bool result) { auto datetime = timestamp_from_datetime(query_string); std::unique_ptr> date_pred( - new ComparisonPredicateBase(0, datetime)); + new ComparisonPredicateBase(0, "", datetime)); EXPECT_EQ(date_pred->evaluate_and(bf.get()), result); }; test("2024-11-08 09:00:00", true); @@ -263,44 +263,58 @@ TEST_F(DateBloomFilterTest, in_list_predicate_test) { EXPECT_TRUE(bf_iter->read_bloom_filter(0, &bf).ok()); // Test positive cases - auto test_positive = [&](const std::vector& values, bool result) { - auto hybrid_set = std::make_shared>(false); + auto hybrid_set = std::make_shared>(false); + auto test_positive = [&](const std::vector& values) { + hybrid_set = std::make_shared>(false); for (const auto& value : values) { auto v = timestamp_from_date(value); hybrid_set->insert(&v); } - std::unique_ptr>> - date_pred(new InListPredicateBase>( - 0, hybrid_set)); - EXPECT_EQ(date_pred->evaluate_and(bf.get()), result); }; - test_positive({"2024-11-08", "2024-11-09"}, true); - test_positive({"2024-11-08"}, true); - test_positive({"2024-11-09"}, true); - - auto test_negative = [&](const std::vector& values, bool result) { - auto hybrid_set = std::make_shared>(false); + test_positive({"2024-11-08", "2024-11-09"}); + std::unique_ptr> date_pred0( + new InListPredicateBase(0, "", hybrid_set, + false)); + EXPECT_EQ(date_pred0->evaluate_and(bf.get()), true); + test_positive({"2024-11-08"}); + std::unique_ptr> date_pred1( + new InListPredicateBase(0, "", hybrid_set, + false)); + EXPECT_EQ(date_pred1->evaluate_and(bf.get()), true); + test_positive({"2024-11-09"}); + std::unique_ptr> date_pred2( + new InListPredicateBase(0, "", hybrid_set, + false)); + EXPECT_EQ(date_pred2->evaluate_and(bf.get()), true); + + auto test_negative = [&](const std::vector& values) { + hybrid_set = std::make_shared>(false); for (const auto& value : values) { auto v = timestamp_from_date(value); hybrid_set->insert(&v); } + }; - std::unique_ptr>> - date_pred(new InListPredicateBase>( - 0, hybrid_set)); + test_negative({"2024-11-20"}); + std::unique_ptr> date_pred00( + new InListPredicateBase(0, "", hybrid_set, + false)); - EXPECT_EQ(date_pred->evaluate_and(bf.get()), result); - }; + EXPECT_EQ(date_pred00->evaluate_and(bf.get()), false); + test_negative({"2024-11-08", "2024-11-20"}); + std::unique_ptr> date_pred10( + new InListPredicateBase(0, "", hybrid_set, + false)); + + EXPECT_EQ(date_pred10->evaluate_and(bf.get()), true); + test_negative({"2024-11-20", "2024-11-21"}); + std::unique_ptr> date_pred20( + new InListPredicateBase(0, "", hybrid_set, + false)); - test_negative({"2024-11-20"}, false); - test_negative({"2024-11-08", "2024-11-20"}, true); - test_negative({"2024-11-20", "2024-11-21"}, false); + EXPECT_EQ(date_pred20->evaluate_and(bf.get()), false); } // Test DATETIME column with IN predicate @@ -316,42 +330,56 @@ TEST_F(DateBloomFilterTest, in_list_predicate_test) { EXPECT_TRUE(bf_iter->read_bloom_filter(0, &bf).ok()); // Test positive cases - auto test_positive = [&](const std::vector& values, bool result) { - auto hybrid_set = std::make_shared>(false); + auto hybrid_set = std::make_shared>(false); + auto test_positive = [&](const std::vector& values) { + hybrid_set = std::make_shared>(false); for (const auto& value : values) { auto v = timestamp_from_datetime(value); hybrid_set->insert(&v); } - std::unique_ptr>> - datetime_pred(new InListPredicateBase>( - 0, hybrid_set)); - EXPECT_EQ(datetime_pred->evaluate_and(bf.get()), result); }; - test_positive({"2024-11-08 09:00:00", "2024-11-09 09:00:00"}, true); - test_positive({"2024-11-08 09:00:00"}, true); - test_positive({"2024-11-09 09:00:00"}, true); + test_positive({"2024-11-08 09:00:00", "2024-11-09 09:00:00"}); + std::unique_ptr> + datetime_pred0(new InListPredicateBase( + 0, "", hybrid_set, false)); + EXPECT_EQ(datetime_pred0->evaluate_and(bf.get()), true); + test_positive({"2024-11-08 09:00:00"}); + std::unique_ptr> + datetime_pred1(new InListPredicateBase( + 0, "", hybrid_set, false)); + EXPECT_EQ(datetime_pred1->evaluate_and(bf.get()), true); + test_positive({"2024-11-09 09:00:00"}); + std::unique_ptr> + datetime_pred2(new InListPredicateBase( + 0, "", hybrid_set, false)); + EXPECT_EQ(datetime_pred2->evaluate_and(bf.get()), true); // Test negative cases - auto test_negative = [&](const std::vector& values, bool result) { - auto hybrid_set = std::make_shared>(false); + hybrid_set = std::make_shared>(false); + auto test_negative = [&](const std::vector& values) { + hybrid_set = std::make_shared>(false); for (const auto& value : values) { auto v = timestamp_from_datetime(value); hybrid_set->insert(&v); } - std::unique_ptr>> - datetime_pred(new InListPredicateBase>( - 0, hybrid_set)); - EXPECT_EQ(datetime_pred->evaluate_and(bf.get()), result); }; - test_negative({"2024-11-20 09:00:00"}, false); - test_negative({"2024-11-08 09:00:00", "2024-11-20 09:00:00"}, true); - test_negative({"2024-11-20 09:00:00", "2024-11-21 09:00:00"}, false); + test_negative({"2024-11-20 09:00:00"}); + std::unique_ptr> + datetime_pred33(new InListPredicateBase( + 0, "", hybrid_set, false)); + EXPECT_EQ(datetime_pred33->evaluate_and(bf.get()), false); + test_negative({"2024-11-08 09:00:00", "2024-11-20 09:00:00"}); + std::unique_ptr> + datetime_pred34(new InListPredicateBase( + 0, "", hybrid_set, false)); + EXPECT_EQ(datetime_pred34->evaluate_and(bf.get()), true); + test_negative({"2024-11-20 09:00:00", "2024-11-21 09:00:00"}); + std::unique_ptr> + datetime_pred45(new InListPredicateBase( + 0, "", hybrid_set, false)); + EXPECT_EQ(datetime_pred45->evaluate_and(bf.get()), false); } } diff --git a/be/test/olap/delete_handler_test.cpp b/be/test/olap/delete_handler_test.cpp index b596c8478eefb8..7b7406fccb28b0 100644 --- a/be/test/olap/delete_handler_test.cpp +++ b/be/test/olap/delete_handler_test.cpp @@ -1072,8 +1072,7 @@ TEST_F(TestDeleteHandler, ValueWithQuote) { add_delete_predicate(del_predicate, 2); - EXPECT_ANY_THROW( - auto st = _delete_handler.init(tablet->tablet_schema(), get_delete_predicates(), 5)); + EXPECT_FALSE(_delete_handler.init(tablet->tablet_schema(), get_delete_predicates(), 5).ok()); } TEST_F(TestDeleteHandler, timestamptz_ValueWithQuote) { @@ -1083,8 +1082,8 @@ TEST_F(TestDeleteHandler, timestamptz_ValueWithQuote) { del_predicate.set_version(2); add_delete_predicate(del_predicate, 2); - EXPECT_ANY_THROW(auto st = _delete_handler.init(tablet->tablet_schema(), - get_delete_predicates(), 5)); + EXPECT_FALSE( + _delete_handler.init(tablet->tablet_schema(), get_delete_predicates(), 5).ok()); } { DeletePredicatePB del_predicate; @@ -1092,8 +1091,8 @@ TEST_F(TestDeleteHandler, timestamptz_ValueWithQuote) { del_predicate.set_version(2); add_delete_predicate(del_predicate, 2); - EXPECT_ANY_THROW(auto st = _delete_handler.init(tablet->tablet_schema(), - get_delete_predicates(), 5)); + EXPECT_FALSE( + _delete_handler.init(tablet->tablet_schema(), get_delete_predicates(), 5).ok()); } { DeletePredicatePB del_predicate; @@ -1101,8 +1100,8 @@ TEST_F(TestDeleteHandler, timestamptz_ValueWithQuote) { del_predicate.set_version(2); add_delete_predicate(del_predicate, 2); - EXPECT_ANY_THROW(auto st = _delete_handler.init(tablet->tablet_schema(), - get_delete_predicates(), 5)); + EXPECT_FALSE( + _delete_handler.init(tablet->tablet_schema(), get_delete_predicates(), 5).ok()); } } @@ -1113,8 +1112,8 @@ TEST_F(TestDeleteHandler, timestamptz_ValueWithoutQuote) { del_predicate.set_version(2); add_delete_predicate(del_predicate, 2); - EXPECT_ANY_THROW(auto st = _delete_handler.init(tablet->tablet_schema(), - get_delete_predicates(), 5)); + EXPECT_FALSE( + _delete_handler.init(tablet->tablet_schema(), get_delete_predicates(), 5).ok()); } { DeletePredicatePB del_predicate; @@ -1122,8 +1121,8 @@ TEST_F(TestDeleteHandler, timestamptz_ValueWithoutQuote) { del_predicate.set_version(2); add_delete_predicate(del_predicate, 2); - EXPECT_ANY_THROW(auto st = _delete_handler.init(tablet->tablet_schema(), - get_delete_predicates(), 5)); + EXPECT_FALSE( + _delete_handler.init(tablet->tablet_schema(), get_delete_predicates(), 5).ok()); } } @@ -1272,8 +1271,7 @@ TEST_F(TestDeleteHandler, ValueWithoutQuote) { add_delete_predicate(del_predicate, 2); - EXPECT_ANY_THROW( - auto res = _delete_handler.init(tablet->tablet_schema(), get_delete_predicates(), 5)); + EXPECT_FALSE(_delete_handler.init(tablet->tablet_schema(), get_delete_predicates(), 5).ok()); } TEST_F(TestDeleteHandler, InitSuccess) { @@ -1511,23 +1509,21 @@ TEST_F(TestDeleteHandler, FilterDataVersion) { // clang-format off TEST_F(TestDeleteHandler, TestParseDeleteCondition) { - auto test = [](const std::tuple& in) { - auto& [cond_str, exp_succ, exp_cond] = in; - TCondition parsed_cond; - EXPECT_EQ(DeleteHandler::parse_condition(cond_str, &parsed_cond), exp_succ) << " unexpected result, cond_str: " << cond_str; - if (exp_succ) EXPECT_EQ(parsed_cond, exp_cond) << " unexpected result, cond_str: " << cond_str; + auto test = [](const std::tuple& in) { +// auto& [cond_str, exp_succ, exp_cond] = in; +// EXPECT_EQ(DeleteHandler::parse_condition(cond_str), exp_cond) << " unexpected result, cond_str: " << cond_str; }; auto gen_cond = [](const std::string& col, const std::string& op, const std::string& val) { - TCondition cond; - cond.__set_column_name(col); - cond.__set_condition_op(op); - cond.__set_condition_values(std::vector{val}); - return cond; +DeleteHandler::ConditionParseResult res; +res.column_name = col; + res.value_str.push_back(val); + res.condition_op = DeleteHandler::parse_condition_op(op, res.value_str); + return res; }; // > - std::vector> test_input { + std::vector> test_input { {R"(abc=b)" , true, gen_cond(R"(abc)" , "=" , R"(b)" )}, // normal case {R"(abc!=b)" , true, gen_cond(R"(abc)" , "!=", R"(b)" )}, // normal case {R"(abc<=b)" , true, gen_cond(R"(abc)" , "<=", R"(b)" )}, // normal case diff --git a/be/test/olap/wal/wal_manager_test.cpp b/be/test/olap/wal/wal_manager_test.cpp index 16bbcbf7587be5..8315c2e88bdb27 100644 --- a/be/test/olap/wal/wal_manager_test.cpp +++ b/be/test/olap/wal/wal_manager_test.cpp @@ -320,13 +320,11 @@ void WalManagerTest::init() { void WalManagerTest::generate_scanner(std::shared_ptr& scanner) { auto split_source = std::make_shared(_scan_range); - std::unordered_map _colname_to_value_range; std::unordered_map _colname_to_slot_id; scanner = std::make_shared( &_runtime_state, &(_runtime_state.get_local_state(0)->cast()), -1, - split_source, _profile, _kv_cache.get(), &_colname_to_value_range, - &_colname_to_slot_id); + split_source, _profile, _kv_cache.get(), &_colname_to_slot_id); scanner->_is_load = false; vectorized::VExprContextSPtrs _conjuncts; WARN_IF_ERROR(scanner->init(&_runtime_state, _conjuncts), "fail to prepare scanner"); diff --git a/be/test/pipeline/operator/scan_normalize_predicate_test.cpp b/be/test/pipeline/operator/scan_normalize_predicate_test.cpp index 025d1df1431daf..341fd3383a32ab 100644 --- a/be/test/pipeline/operator/scan_normalize_predicate_test.cpp +++ b/be/test/pipeline/operator/scan_normalize_predicate_test.cpp @@ -55,8 +55,8 @@ TEST_F(ScanNormalizePredicate, test1) { vectorized::VExprSPtr new_root; auto conjunct_expr_root = MockSlotRef::create_mock_context(0, std::make_shared()); - auto st = local_state->_normalize_predicate(conjunct_expr_root->root(), - conjunct_expr_root.get(), new_root); + auto st = local_state->_normalize_predicate(conjunct_expr_root.get(), + conjunct_expr_root->root(), new_root); EXPECT_TRUE(st) << st.msg(); std::cout << new_root->debug_string() << std::endl; } @@ -84,8 +84,8 @@ TEST_F(ScanNormalizePredicate, test_eval_const_conjuncts1) { vectorized::VExprSPtr new_root; auto conjunct_expr_root = ctx; - auto st = local_state->_normalize_predicate(conjunct_expr_root->root(), - conjunct_expr_root.get(), new_root); + auto st = local_state->_normalize_predicate(conjunct_expr_root.get(), + conjunct_expr_root->root(), new_root); EXPECT_EQ(new_root, nullptr); EXPECT_TRUE(local_state->_scan_dependency->ready()); @@ -114,8 +114,8 @@ TEST_F(ScanNormalizePredicate, test_eval_const_conjuncts2) { vectorized::VExprSPtr new_root; auto conjunct_expr_root = ctx; - auto st = local_state->_normalize_predicate(conjunct_expr_root->root(), - conjunct_expr_root.get(), new_root); + auto st = local_state->_normalize_predicate(conjunct_expr_root.get(), + conjunct_expr_root->root(), new_root); EXPECT_EQ(new_root, nullptr); EXPECT_TRUE(local_state->_scan_dependency->ready()); EXPECT_TRUE(local_state->_eos); @@ -139,7 +139,7 @@ TEST_F(ScanNormalizePredicate, test_eval_const_conjuncts3) { vectorized::VExprSPtr new_root; auto conjunct_expr_root = ctx; // There is a DCHECK in the code to ensure size must be equal to 1, wait for this part of the code to be removed later - // auto st = local_state->_normalize_predicate(conjunct_expr_root->root(), + // auto st = local_state->_normalize_predicate( // conjunct_expr_root.get(), new_root); // EXPECT_FALSE(st.ok()); // std::cout << st.msg() << std::endl; @@ -162,8 +162,8 @@ TEST_F(ScanNormalizePredicate, test_eval_const_conjuncts4) { vectorized::VExprSPtr new_root; auto conjunct_expr_root = ctx; - auto st = local_state->_normalize_predicate(conjunct_expr_root->root(), - conjunct_expr_root.get(), new_root); + auto st = local_state->_normalize_predicate(conjunct_expr_root.get(), + conjunct_expr_root->root(), new_root); EXPECT_TRUE(st.ok()); std::cout << st.msg() << std::endl; } @@ -182,7 +182,9 @@ TEST_F(ScanNormalizePredicate, test_is_predicate_acting_on_slot1) { PrimitiveType::TYPE_BIGINT, false); ColumnValueRange range("mock", false, 0, 0); - local_state->_slot_id_to_value_range[SlotId] = std::make_pair(&slot_desc, range); + local_state->_slot_id_to_value_range[SlotId] = range; + local_state->_slot_id_to_predicates[SlotId] = std::vector>(); + op->_slot_id_to_slot_desc[SlotId] = &slot_desc; { auto slot_ref = std::make_shared(0, std::make_shared()); @@ -202,15 +204,15 @@ TEST_F(ScanNormalizePredicate, test_is_predicate_acting_on_slot1) { vectorized::VExprSPtr new_root; auto conjunct_expr_root = ctx; - auto st = local_state->_normalize_predicate(conjunct_expr_root->root(), - conjunct_expr_root.get(), new_root); + auto st = local_state->_normalize_predicate(conjunct_expr_root.get(), + conjunct_expr_root->root(), new_root); EXPECT_TRUE(st.ok()); std::cout << st.msg() << std::endl; } EXPECT_TRUE(local_state->_slot_id_to_value_range.contains(SlotId)); - auto& output_range = local_state->_slot_id_to_value_range[SlotId].second; + auto& output_range = local_state->_slot_id_to_value_range[SlotId]; std::visit( [](auto&& arg) { @@ -239,7 +241,9 @@ TEST_F(ScanNormalizePredicate, test_is_predicate_acting_on_slot2) { PrimitiveType::TYPE_BIGINT, false); ColumnValueRange range("mock", false, 0, 0); - local_state->_slot_id_to_value_range[SlotId] = std::make_pair(&slot_desc, range); + local_state->_slot_id_to_value_range[SlotId] = range; + local_state->_slot_id_to_predicates[SlotId] = std::vector>(); + op->_slot_id_to_slot_desc[SlotId] = &slot_desc; { auto slot_ref = std::make_shared(0, std::make_shared()); @@ -259,8 +263,8 @@ TEST_F(ScanNormalizePredicate, test_is_predicate_acting_on_slot2) { vectorized::VExprSPtr new_root; auto conjunct_expr_root = ctx; - auto st = local_state->_normalize_predicate(conjunct_expr_root->root(), - conjunct_expr_root.get(), new_root); + auto st = local_state->_normalize_predicate(conjunct_expr_root.get(), + conjunct_expr_root->root(), new_root); EXPECT_TRUE(st.ok()); std::cout << st.msg() << std::endl; } @@ -283,15 +287,15 @@ TEST_F(ScanNormalizePredicate, test_is_predicate_acting_on_slot2) { vectorized::VExprSPtr new_root; auto conjunct_expr_root = ctx; - auto st = local_state->_normalize_predicate(conjunct_expr_root->root(), - conjunct_expr_root.get(), new_root); + auto st = local_state->_normalize_predicate(conjunct_expr_root.get(), + conjunct_expr_root->root(), new_root); EXPECT_TRUE(st.ok()); std::cout << st.msg() << std::endl; } EXPECT_TRUE(local_state->_slot_id_to_value_range.contains(SlotId)); - auto& output_range = local_state->_slot_id_to_value_range[SlotId].second; + auto& output_range = local_state->_slot_id_to_value_range[SlotId]; std::visit( [](auto&& arg) { @@ -319,7 +323,9 @@ TEST_F(ScanNormalizePredicate, test_is_predicate_acting_on_slot3) { PrimitiveType::TYPE_BIGINT, false); ColumnValueRange range("mock", false, 0, 0); - local_state->_slot_id_to_value_range[SlotId] = std::make_pair(&slot_desc, range); + local_state->_slot_id_to_value_range[SlotId] = range; + local_state->_slot_id_to_predicates[SlotId] = std::vector>(); + op->_slot_id_to_slot_desc[SlotId] = &slot_desc; local_state->_scan_dependency = Dependency::create_shared(0, 0, "DEPENDENCY"); @@ -343,8 +349,8 @@ TEST_F(ScanNormalizePredicate, test_is_predicate_acting_on_slot3) { vectorized::VExprSPtr new_root; auto conjunct_expr_root = ctx; - EXPECT_TRUE(local_state->_normalize_predicate(conjunct_expr_root->root(), - conjunct_expr_root.get(), new_root)); + EXPECT_TRUE(local_state->_normalize_predicate(conjunct_expr_root.get(), + conjunct_expr_root->root(), new_root)); } EXPECT_TRUE(local_state->_scan_dependency->ready()); EXPECT_TRUE(local_state->_eos); @@ -363,7 +369,9 @@ TEST_F(ScanNormalizePredicate, test_is_predicate_acting_on_slot4) { PrimitiveType::TYPE_BIGINT, false); ColumnValueRange range("mock", false, 0, 0); - local_state->_slot_id_to_value_range[SlotId] = std::make_pair(&slot_desc, range); + local_state->_slot_id_to_value_range[SlotId] = range; + local_state->_slot_id_to_predicates[SlotId] = std::vector>(); + op->_slot_id_to_slot_desc[SlotId] = &slot_desc; { auto slot_ref = std::make_shared(0, std::make_shared()); @@ -380,13 +388,13 @@ TEST_F(ScanNormalizePredicate, test_is_predicate_acting_on_slot4) { vectorized::VExprSPtr new_root; auto conjunct_expr_root = ctx; - EXPECT_TRUE(local_state->_normalize_predicate(conjunct_expr_root->root(), - conjunct_expr_root.get(), new_root)); + EXPECT_TRUE(local_state->_normalize_predicate(conjunct_expr_root.get(), + conjunct_expr_root->root(), new_root)); } EXPECT_TRUE(local_state->_slot_id_to_value_range.contains(SlotId)); - auto& output_range = local_state->_slot_id_to_value_range[SlotId].second; + auto& output_range = local_state->_slot_id_to_value_range[SlotId]; std::visit( [](auto&& arg) { @@ -419,7 +427,9 @@ TEST_F(ScanNormalizePredicate, test_is_predicate_acting_on_slot5) { PrimitiveType::TYPE_BIGINT, false); ColumnValueRange range("mock", false, 0, 0); - local_state->_slot_id_to_value_range[SlotId] = std::make_pair(&slot_desc, range); + local_state->_slot_id_to_value_range[SlotId] = range; + local_state->_slot_id_to_predicates[SlotId] = std::vector>(); + op->_slot_id_to_slot_desc[SlotId] = &slot_desc; { auto slot_ref = std::make_shared(0, std::make_shared()); @@ -436,13 +446,13 @@ TEST_F(ScanNormalizePredicate, test_is_predicate_acting_on_slot5) { vectorized::VExprSPtr new_root; auto conjunct_expr_root = ctx; - EXPECT_TRUE(local_state->_normalize_predicate(conjunct_expr_root->root(), - conjunct_expr_root.get(), new_root)); + EXPECT_TRUE(local_state->_normalize_predicate(conjunct_expr_root.get(), + conjunct_expr_root->root(), new_root)); } EXPECT_TRUE(local_state->_slot_id_to_value_range.contains(SlotId)); - auto& output_range = local_state->_slot_id_to_value_range[SlotId].second; + auto& output_range = local_state->_slot_id_to_value_range[SlotId]; std::visit( [](auto&& arg) { @@ -475,7 +485,9 @@ TEST_F(ScanNormalizePredicate, test_is_predicate_acting_on_slot6) { PrimitiveType::TYPE_BIGINT, false); ColumnValueRange range("mock", false, 0, 0); - local_state->_slot_id_to_value_range[SlotId] = std::make_pair(&slot_desc, range); + local_state->_slot_id_to_value_range[SlotId] = range; + local_state->_slot_id_to_predicates[SlotId] = std::vector>(); + op->_slot_id_to_slot_desc[SlotId] = &slot_desc; { auto slot_ref = std::make_shared(0, std::make_shared()); @@ -498,13 +510,13 @@ TEST_F(ScanNormalizePredicate, test_is_predicate_acting_on_slot6) { vectorized::VExprSPtr new_root; auto conjunct_expr_root = ctx; - EXPECT_TRUE(local_state->_normalize_predicate(conjunct_expr_root->root(), - conjunct_expr_root.get(), new_root)); + EXPECT_TRUE(local_state->_normalize_predicate(conjunct_expr_root.get(), + conjunct_expr_root->root(), new_root)); } EXPECT_TRUE(local_state->_slot_id_to_value_range.contains(SlotId)); - auto& output_range = local_state->_slot_id_to_value_range[SlotId].second; + auto& output_range = local_state->_slot_id_to_value_range[SlotId]; std::visit( [](auto&& arg) { @@ -531,7 +543,9 @@ TEST_F(ScanNormalizePredicate, test_is_predicate_acting_on_slot7) { PrimitiveType::TYPE_BIGINT, false); ColumnValueRange range("mock", false, 0, 0); - local_state->_slot_id_to_value_range[SlotId] = std::make_pair(&slot_desc, range); + local_state->_slot_id_to_value_range[SlotId] = range; + local_state->_slot_id_to_predicates[SlotId] = std::vector>(); + op->_slot_id_to_slot_desc[SlotId] = &slot_desc; { auto slot_ref = std::make_shared(0, std::make_shared()); @@ -560,8 +574,8 @@ TEST_F(ScanNormalizePredicate, test_is_predicate_acting_on_slot7) { vectorized::VExprSPtr new_root; auto conjunct_expr_root = ctx; - EXPECT_TRUE(local_state->_normalize_predicate(conjunct_expr_root->root(), - conjunct_expr_root.get(), new_root)); + EXPECT_TRUE(local_state->_normalize_predicate(conjunct_expr_root.get(), + conjunct_expr_root->root(), new_root)); } EXPECT_TRUE(local_state->_scan_dependency->ready()); @@ -587,7 +601,9 @@ TEST_F(ScanNormalizePredicate, test_is_predicate_acting_on_slot8) { EXPECT_TRUE(range.add_fixed_value(100)); EXPECT_TRUE(range.add_fixed_value(1000)); - local_state->_slot_id_to_value_range[SlotId] = std::make_pair(&slot_desc, range); + local_state->_slot_id_to_value_range[SlotId] = range; + local_state->_slot_id_to_predicates[SlotId] = std::vector>(); + op->_slot_id_to_slot_desc[SlotId] = &slot_desc; { auto slot_ref = std::make_shared(0, std::make_shared()); @@ -604,11 +620,11 @@ TEST_F(ScanNormalizePredicate, test_is_predicate_acting_on_slot8) { vectorized::VExprSPtr new_root; auto conjunct_expr_root = ctx; - EXPECT_TRUE(local_state->_normalize_predicate(conjunct_expr_root->root(), - conjunct_expr_root.get(), new_root)); + EXPECT_TRUE(local_state->_normalize_predicate(conjunct_expr_root.get(), + conjunct_expr_root->root(), new_root)); } - auto& output_range = local_state->_slot_id_to_value_range[SlotId].second; + auto& output_range = local_state->_slot_id_to_value_range[SlotId]; std::visit( [](auto&& arg) { using T = std::decay_t; @@ -638,7 +654,9 @@ TEST_F(ScanNormalizePredicate, test_is_predicate_acting_on_slot10) { ColumnValueRange range("mock", false, 0, 0); - local_state->_slot_id_to_value_range[SlotId] = std::make_pair(&slot_desc, range); + local_state->_slot_id_to_value_range[SlotId] = range; + local_state->_slot_id_to_predicates[SlotId] = std::vector>(); + op->_slot_id_to_slot_desc[SlotId] = &slot_desc; { auto slot_ref = std::make_shared(0, std::make_shared()); @@ -655,27 +673,9 @@ TEST_F(ScanNormalizePredicate, test_is_predicate_acting_on_slot10) { vectorized::VExprSPtr new_root; auto conjunct_expr_root = ctx; - EXPECT_TRUE(local_state->_normalize_predicate(conjunct_expr_root->root(), - conjunct_expr_root.get(), new_root)); + EXPECT_TRUE(local_state->_normalize_predicate(conjunct_expr_root.get(), + conjunct_expr_root->root(), new_root)); } - - auto& output_range = local_state->_not_in_value_ranges.front(); - std::visit( - [](auto&& arg) { - using T = std::decay_t; - if constexpr (std::is_same_v>) { - EXPECT_EQ(arg._fixed_values.size(), 3); - auto it = arg._fixed_values.begin(); - EXPECT_EQ(*it, 1); - ++it; - EXPECT_EQ(*it, 10); - ++it; - EXPECT_EQ(*it, 100); - } else { - FAIL() << "unexpected type"; - } - }, - output_range); } TEST_F(ScanNormalizePredicate, test_is_predicate_acting_on_slot11) { @@ -693,7 +693,9 @@ TEST_F(ScanNormalizePredicate, test_is_predicate_acting_on_slot11) { ColumnValueRange range("mock", false, 0, 0); - local_state->_slot_id_to_value_range[SlotId] = std::make_pair(&slot_desc, range); + local_state->_slot_id_to_value_range[SlotId] = range; + local_state->_slot_id_to_predicates[SlotId] = std::vector>(); + op->_slot_id_to_slot_desc[SlotId] = &slot_desc; { auto slot_ref = std::make_shared(0, std::make_shared()); @@ -713,23 +715,9 @@ TEST_F(ScanNormalizePredicate, test_is_predicate_acting_on_slot11) { vectorized::VExprSPtr new_root; auto conjunct_expr_root = ctx; - EXPECT_TRUE(local_state->_normalize_predicate(conjunct_expr_root->root(), - conjunct_expr_root.get(), new_root)); + EXPECT_TRUE(local_state->_normalize_predicate(conjunct_expr_root.get(), + conjunct_expr_root->root(), new_root)); } - - auto& output_range = local_state->_not_in_value_ranges.front(); - std::visit( - [](auto&& arg) { - using T = std::decay_t; - if constexpr (std::is_same_v>) { - EXPECT_EQ(arg._fixed_values.size(), 1); - auto it = arg._fixed_values.begin(); - EXPECT_EQ(*it, 100); - } else { - FAIL() << "unexpected type"; - } - }, - output_range); } TEST_F(ScanNormalizePredicate, test_is_predicate_acting_on_slot12) { @@ -750,7 +738,9 @@ TEST_F(ScanNormalizePredicate, test_is_predicate_acting_on_slot12) { EXPECT_TRUE(range.add_fixed_value(10)); EXPECT_TRUE(range.add_fixed_value(100)); - local_state->_slot_id_to_value_range[SlotId] = std::make_pair(&slot_desc, range); + local_state->_slot_id_to_value_range[SlotId] = range; + local_state->_slot_id_to_predicates[SlotId] = std::vector>(); + op->_slot_id_to_slot_desc[SlotId] = &slot_desc; { auto slot_ref = std::make_shared(0, std::make_shared()); @@ -770,11 +760,11 @@ TEST_F(ScanNormalizePredicate, test_is_predicate_acting_on_slot12) { vectorized::VExprSPtr new_root; auto conjunct_expr_root = ctx; - EXPECT_TRUE(local_state->_normalize_predicate(conjunct_expr_root->root(), - conjunct_expr_root.get(), new_root)); + EXPECT_TRUE(local_state->_normalize_predicate(conjunct_expr_root.get(), + conjunct_expr_root->root(), new_root)); } - auto& output_range = local_state->_slot_id_to_value_range[SlotId].second; + auto& output_range = local_state->_slot_id_to_value_range[SlotId]; std::visit( [](auto&& arg) { using T = std::decay_t; @@ -805,7 +795,9 @@ TEST_F(ScanNormalizePredicate, test_is_predicate_acting_on_slot13) { EXPECT_TRUE(range.add_fixed_value(10)); EXPECT_TRUE(range.add_fixed_value(100)); - local_state->_slot_id_to_value_range[SlotId] = std::make_pair(&slot_desc, range); + local_state->_slot_id_to_value_range[SlotId] = range; + local_state->_slot_id_to_predicates[SlotId] = std::vector>(); + op->_slot_id_to_slot_desc[SlotId] = &slot_desc; { auto slot_ref = std::make_shared(0, std::make_shared()); @@ -825,11 +817,11 @@ TEST_F(ScanNormalizePredicate, test_is_predicate_acting_on_slot13) { vectorized::VExprSPtr new_root; auto conjunct_expr_root = ctx; - EXPECT_TRUE(local_state->_normalize_predicate(conjunct_expr_root->root(), - conjunct_expr_root.get(), new_root)); + EXPECT_TRUE(local_state->_normalize_predicate(conjunct_expr_root.get(), + conjunct_expr_root->root(), new_root)); } - auto& output_range = local_state->_slot_id_to_value_range[SlotId].second; + auto& output_range = local_state->_slot_id_to_value_range[SlotId]; std::visit( [](auto&& arg) { using T = std::decay_t; @@ -860,7 +852,9 @@ TEST_F(ScanNormalizePredicate, test_is_predicate_acting_on_slot14) { EXPECT_TRUE(range.add_fixed_value(10)); EXPECT_TRUE(range.add_fixed_value(100)); - local_state->_slot_id_to_value_range[SlotId] = std::make_pair(&slot_desc, range); + local_state->_slot_id_to_value_range[SlotId] = range; + local_state->_slot_id_to_predicates[SlotId] = std::vector>(); + op->_slot_id_to_slot_desc[SlotId] = &slot_desc; { auto slot_ref = std::make_shared(0, std::make_shared()); @@ -880,11 +874,11 @@ TEST_F(ScanNormalizePredicate, test_is_predicate_acting_on_slot14) { vectorized::VExprSPtr new_root; auto conjunct_expr_root = ctx; - EXPECT_TRUE(local_state->_normalize_predicate(conjunct_expr_root->root(), - conjunct_expr_root.get(), new_root)); + EXPECT_TRUE(local_state->_normalize_predicate(conjunct_expr_root.get(), + conjunct_expr_root->root(), new_root)); } - auto& output_range = local_state->_slot_id_to_value_range[SlotId].second; + auto& output_range = local_state->_slot_id_to_value_range[SlotId]; std::visit( [](auto&& arg) { using T = std::decay_t; @@ -919,7 +913,9 @@ TEST_F(ScanNormalizePredicate, test_is_predicate_acting_on_slot15) { EXPECT_TRUE(range.add_fixed_value(10)); EXPECT_TRUE(range.add_fixed_value(100)); - local_state->_slot_id_to_value_range[SlotId] = std::make_pair(&slot_desc, range); + local_state->_slot_id_to_value_range[SlotId] = range; + local_state->_slot_id_to_predicates[SlotId] = std::vector>(); + op->_slot_id_to_slot_desc[SlotId] = &slot_desc; { auto slot_ref = std::make_shared(0, std::make_shared()); @@ -939,11 +935,11 @@ TEST_F(ScanNormalizePredicate, test_is_predicate_acting_on_slot15) { vectorized::VExprSPtr new_root; auto conjunct_expr_root = ctx; - EXPECT_TRUE(local_state->_normalize_predicate(conjunct_expr_root->root(), - conjunct_expr_root.get(), new_root)); + EXPECT_TRUE(local_state->_normalize_predicate(conjunct_expr_root.get(), + conjunct_expr_root->root(), new_root)); } - auto& output_range = local_state->_slot_id_to_value_range[SlotId].second; + auto& output_range = local_state->_slot_id_to_value_range[SlotId]; std::visit( [](auto&& arg) { using T = std::decay_t; @@ -983,7 +979,10 @@ TEST_F(ScanNormalizePredicate, test_double_predicate) { for (auto const_v : test_values) { auto local_state = std::make_shared(state.get(), op.get()); ColumnValueRange range("mock", false, 0, 0); - local_state->_slot_id_to_value_range[SlotId] = std::make_pair(&slot_desc, range); + local_state->_slot_id_to_value_range[SlotId] = range; + local_state->_slot_id_to_predicates[SlotId] = + std::vector>(); + op->_slot_id_to_slot_desc[SlotId] = &slot_desc; auto slot_ref = std::make_shared(0, std::make_shared()); auto fn_eq = MockFnCall::create("eq"); @@ -1002,13 +1001,13 @@ TEST_F(ScanNormalizePredicate, test_double_predicate) { vectorized::VExprSPtr new_root; auto conjunct_expr_root = ctx; - auto st = local_state->_normalize_predicate(conjunct_expr_root->root(), - conjunct_expr_root.get(), new_root); + auto st = local_state->_normalize_predicate(conjunct_expr_root.get(), + conjunct_expr_root->root(), new_root); EXPECT_TRUE(st.ok()); EXPECT_EQ(new_root, nullptr); EXPECT_TRUE(local_state->_slot_id_to_value_range.contains(SlotId)); - const auto& output_range = local_state->_slot_id_to_value_range[SlotId].second; + const auto& output_range = local_state->_slot_id_to_value_range[SlotId]; std::visit( [&](auto&& arg) { using T = std::decay_t; @@ -1026,7 +1025,10 @@ TEST_F(ScanNormalizePredicate, test_double_predicate) { { auto local_state = std::make_shared(state.get(), op.get()); ColumnValueRange range("mock", false, 0, 0); - local_state->_slot_id_to_value_range[SlotId] = std::make_pair(&slot_desc, range); + local_state->_slot_id_to_value_range[SlotId] = range; + local_state->_slot_id_to_predicates[SlotId] = + std::vector>(); + op->_slot_id_to_slot_desc[SlotId] = &slot_desc; auto slot_ref = std::make_shared(0, std::make_shared()); auto ctx = MockInExpr::create_with_ctx( ColumnHelper::create_column(test_values)); @@ -1041,13 +1043,13 @@ TEST_F(ScanNormalizePredicate, test_double_predicate) { vectorized::VExprSPtr new_root; auto conjunct_expr_root = ctx; - auto st = local_state->_normalize_predicate(conjunct_expr_root->root(), - conjunct_expr_root.get(), new_root); + auto st = local_state->_normalize_predicate(conjunct_expr_root.get(), + conjunct_expr_root->root(), new_root); EXPECT_TRUE(st.ok()); EXPECT_EQ(new_root, nullptr); EXPECT_TRUE(local_state->_slot_id_to_value_range.contains(SlotId)); - auto& output_range = local_state->_slot_id_to_value_range[SlotId].second; + auto& output_range = local_state->_slot_id_to_value_range[SlotId]; std::visit( [&](auto&& arg) { using T = std::decay_t; @@ -1063,7 +1065,10 @@ TEST_F(ScanNormalizePredicate, test_double_predicate) { for (auto const_v : test_values) { auto local_state = std::make_shared(state.get(), op.get()); ColumnValueRange range("mock", false, 0, 0); - local_state->_slot_id_to_value_range[SlotId] = std::make_pair(&slot_desc, range); + local_state->_slot_id_to_value_range[SlotId] = range; + local_state->_slot_id_to_predicates[SlotId] = + std::vector>(); + op->_slot_id_to_slot_desc[SlotId] = &slot_desc; auto slot_ref = std::make_shared(0, std::make_shared()); auto fn_eq = MockFnCall::create("ne"); auto const_val = std::make_shared( @@ -1081,30 +1086,19 @@ TEST_F(ScanNormalizePredicate, test_double_predicate) { vectorized::VExprSPtr new_root; auto conjunct_expr_root = ctx; - EXPECT_TRUE(local_state->_normalize_predicate(conjunct_expr_root->root(), - conjunct_expr_root.get(), new_root)); + EXPECT_TRUE(local_state->_normalize_predicate(conjunct_expr_root.get(), + conjunct_expr_root->root(), new_root)); EXPECT_EQ(new_root, nullptr); EXPECT_TRUE(local_state->_slot_id_to_value_range.contains(SlotId)); - - auto& output_range = local_state->_not_in_value_ranges.front(); - std::visit( - [&](auto&& arg) { - using T = std::decay_t; - if constexpr (std::is_same_v>) { - EXPECT_EQ(arg._fixed_values.size(), 1); - auto it = arg._fixed_values.begin(); - EXPECT_TRUE(Compare::equal(*it, const_v)); - } else { - FAIL() << "unexpected type"; - } - }, - output_range); } // test not in { auto local_state = std::make_shared(state.get(), op.get()); ColumnValueRange range("mock", false, 0, 0); - local_state->_slot_id_to_value_range[SlotId] = std::make_pair(&slot_desc, range); + local_state->_slot_id_to_value_range[SlotId] = range; + local_state->_slot_id_to_predicates[SlotId] = + std::vector>(); + op->_slot_id_to_slot_desc[SlotId] = &slot_desc; auto slot_ref = std::make_shared(0, std::make_shared()); auto ctx = MockInExpr::create_with_ctx( ColumnHelper::create_column(test_values), true); @@ -1119,29 +1113,17 @@ TEST_F(ScanNormalizePredicate, test_double_predicate) { vectorized::VExprSPtr new_root; auto conjunct_expr_root = ctx; - auto st = local_state->_normalize_predicate(conjunct_expr_root->root(), - conjunct_expr_root.get(), new_root); + auto st = local_state->_normalize_predicate(conjunct_expr_root.get(), + conjunct_expr_root->root(), new_root); EXPECT_TRUE(st.ok()); EXPECT_EQ(new_root, nullptr); EXPECT_TRUE(local_state->_slot_id_to_value_range.contains(SlotId)); - - auto& output_range = local_state->_not_in_value_ranges.front(); - std::visit( - [&](auto&& arg) { - using T = std::decay_t; - if constexpr (std::is_same_v>) { - EXPECT_EQ(arg._fixed_values.size(), test_values.size()); - } else { - FAIL() << "unexpected type"; - } - }, - output_range); } // test is null { auto local_state = std::make_shared(state.get(), op.get()); ColumnValueRange range("mock", true, 0, 0); - local_state->_slot_id_to_value_range[SlotId] = std::make_pair(&nullable_slot_desc, range); + local_state->_slot_id_to_value_range[SlotId] = range; auto slot_ref = std::make_shared( 0, std::make_shared(std::make_shared())); auto fn_eq = MockFnCall::create("is_null_pred"); @@ -1157,9 +1139,9 @@ TEST_F(ScanNormalizePredicate, test_double_predicate) { vectorized::VExprSPtr new_root; auto conjunct_expr_root = ctx; - EXPECT_TRUE(local_state->_normalize_predicate(conjunct_expr_root->root(), - conjunct_expr_root.get(), new_root)); - auto& output_range = local_state->_slot_id_to_value_range[SlotId].second; + EXPECT_TRUE(local_state->_normalize_predicate(conjunct_expr_root.get(), + conjunct_expr_root->root(), new_root)); + auto& output_range = local_state->_slot_id_to_value_range[SlotId]; std::visit( [](auto&& arg) { using T = std::decay_t; @@ -1177,7 +1159,10 @@ TEST_F(ScanNormalizePredicate, test_double_predicate) { { auto local_state = std::make_shared(state.get(), op.get()); ColumnValueRange range("mock", true, 0, 0); - local_state->_slot_id_to_value_range[SlotId] = std::make_pair(&nullable_slot_desc, range); + local_state->_slot_id_to_value_range[SlotId] = range; + local_state->_slot_id_to_predicates[SlotId] = + std::vector>(); + op->_slot_id_to_slot_desc[SlotId] = &slot_desc; auto slot_ref = std::make_shared( 0, std::make_shared(std::make_shared())); auto fn_eq = MockFnCall::create("is_not_null_pred"); @@ -1193,9 +1178,9 @@ TEST_F(ScanNormalizePredicate, test_double_predicate) { vectorized::VExprSPtr new_root; auto conjunct_expr_root = ctx; - EXPECT_TRUE(local_state->_normalize_predicate(conjunct_expr_root->root(), - conjunct_expr_root.get(), new_root)); - auto& output_range = local_state->_slot_id_to_value_range[SlotId].second; + EXPECT_TRUE(local_state->_normalize_predicate(conjunct_expr_root.get(), + conjunct_expr_root->root(), new_root)); + auto& output_range = local_state->_slot_id_to_value_range[SlotId]; std::visit( [](auto&& arg) { using T = std::decay_t; @@ -1213,7 +1198,10 @@ TEST_F(ScanNormalizePredicate, test_double_predicate) { // std::cout << "test less const_v=" << const_v << std::endl; auto local_state = std::make_shared(state.get(), op.get()); ColumnValueRange range("mock", false, 0, 0); - local_state->_slot_id_to_value_range[SlotId] = std::make_pair(&slot_desc, range); + local_state->_slot_id_to_value_range[SlotId] = range; + local_state->_slot_id_to_predicates[SlotId] = + std::vector>(); + op->_slot_id_to_slot_desc[SlotId] = &slot_desc; auto slot_ref = std::make_shared(0, std::make_shared()); auto fn_eq = MockFnCall::create("lt"); @@ -1232,13 +1220,13 @@ TEST_F(ScanNormalizePredicate, test_double_predicate) { vectorized::VExprSPtr new_root; auto conjunct_expr_root = ctx; - auto st = local_state->_normalize_predicate(conjunct_expr_root->root(), - conjunct_expr_root.get(), new_root); + auto st = local_state->_normalize_predicate(conjunct_expr_root.get(), + conjunct_expr_root->root(), new_root); EXPECT_TRUE(st.ok()); EXPECT_EQ(new_root, nullptr); EXPECT_TRUE(local_state->_slot_id_to_value_range.contains(SlotId)); - const auto& output_range = local_state->_slot_id_to_value_range[SlotId].second; + const auto& output_range = local_state->_slot_id_to_value_range[SlotId]; /* _low_value = -inf, _high_value = 90, @@ -1270,7 +1258,10 @@ TEST_F(ScanNormalizePredicate, test_double_predicate) { // std::cout << "test less or equal const_v=" << const_v << std::endl; auto local_state = std::make_shared(state.get(), op.get()); ColumnValueRange range("mock", false, 0, 0); - local_state->_slot_id_to_value_range[SlotId] = std::make_pair(&slot_desc, range); + local_state->_slot_id_to_value_range[SlotId] = range; + local_state->_slot_id_to_predicates[SlotId] = + std::vector>(); + op->_slot_id_to_slot_desc[SlotId] = &slot_desc; auto slot_ref = std::make_shared(0, std::make_shared()); auto fn_eq = MockFnCall::create("le"); @@ -1289,13 +1280,13 @@ TEST_F(ScanNormalizePredicate, test_double_predicate) { vectorized::VExprSPtr new_root; auto conjunct_expr_root = ctx; - auto st = local_state->_normalize_predicate(conjunct_expr_root->root(), - conjunct_expr_root.get(), new_root); + auto st = local_state->_normalize_predicate(conjunct_expr_root.get(), + conjunct_expr_root->root(), new_root); EXPECT_TRUE(st.ok()); EXPECT_EQ(new_root, nullptr); EXPECT_TRUE(local_state->_slot_id_to_value_range.contains(SlotId)); - const auto& output_range = local_state->_slot_id_to_value_range[SlotId].second; + const auto& output_range = local_state->_slot_id_to_value_range[SlotId]; std::visit( [&](auto&& arg) { using T = std::decay_t; @@ -1324,7 +1315,10 @@ TEST_F(ScanNormalizePredicate, test_double_predicate) { // std::cout << "test greater const_v=" << const_v << std::endl; auto local_state = std::make_shared(state.get(), op.get()); ColumnValueRange range("mock", false, 0, 0); - local_state->_slot_id_to_value_range[SlotId] = std::make_pair(&slot_desc, range); + local_state->_slot_id_to_value_range[SlotId] = range; + local_state->_slot_id_to_predicates[SlotId] = + std::vector>(); + op->_slot_id_to_slot_desc[SlotId] = &slot_desc; auto slot_ref = std::make_shared(0, std::make_shared()); auto fn_eq = MockFnCall::create("gt"); @@ -1343,13 +1337,13 @@ TEST_F(ScanNormalizePredicate, test_double_predicate) { vectorized::VExprSPtr new_root; auto conjunct_expr_root = ctx; - auto st = local_state->_normalize_predicate(conjunct_expr_root->root(), - conjunct_expr_root.get(), new_root); + auto st = local_state->_normalize_predicate(conjunct_expr_root.get(), + conjunct_expr_root->root(), new_root); EXPECT_TRUE(st.ok()); EXPECT_EQ(new_root, nullptr); EXPECT_TRUE(local_state->_slot_id_to_value_range.contains(SlotId)); - const auto& output_range = local_state->_slot_id_to_value_range[SlotId].second; + const auto& output_range = local_state->_slot_id_to_value_range[SlotId]; /* _low_value = 90, _high_value = nan, @@ -1381,7 +1375,10 @@ TEST_F(ScanNormalizePredicate, test_double_predicate) { // std::cout << "test greater or equal const_v=" << const_v << std::endl; auto local_state = std::make_shared(state.get(), op.get()); ColumnValueRange range("mock", false, 0, 0); - local_state->_slot_id_to_value_range[SlotId] = std::make_pair(&slot_desc, range); + local_state->_slot_id_to_value_range[SlotId] = range; + local_state->_slot_id_to_predicates[SlotId] = + std::vector>(); + op->_slot_id_to_slot_desc[SlotId] = &slot_desc; auto slot_ref = std::make_shared(0, std::make_shared()); auto fn_eq = MockFnCall::create("ge"); @@ -1400,13 +1397,13 @@ TEST_F(ScanNormalizePredicate, test_double_predicate) { vectorized::VExprSPtr new_root; auto conjunct_expr_root = ctx; - auto st = local_state->_normalize_predicate(conjunct_expr_root->root(), - conjunct_expr_root.get(), new_root); + auto st = local_state->_normalize_predicate(conjunct_expr_root.get(), + conjunct_expr_root->root(), new_root); EXPECT_TRUE(st.ok()); EXPECT_EQ(new_root, nullptr); EXPECT_TRUE(local_state->_slot_id_to_value_range.contains(SlotId)); - const auto& output_range = local_state->_slot_id_to_value_range[SlotId].second; + const auto& output_range = local_state->_slot_id_to_value_range[SlotId]; std::visit( [&](auto&& arg) { using T = std::decay_t; @@ -1458,7 +1455,9 @@ TEST_F(ScanNormalizePredicate, test_timestamptz_predicate) { for (auto const_v : test_tz_values) { auto local_state = std::make_shared(state.get(), op.get()); ColumnValueRange range("mock", false, 0, test_scale); - local_state->_slot_id_to_value_range[SlotId] = std::make_pair(&slot_desc, range); + local_state->_slot_id_to_value_range[SlotId] = range; + local_state->_slot_id_to_predicates[SlotId] = + std::vector>(); op->_slot_id_to_slot_desc[SlotId] = &slot_desc; auto slot_ref = @@ -1480,13 +1479,13 @@ TEST_F(ScanNormalizePredicate, test_timestamptz_predicate) { vectorized::VExprSPtr new_root; auto conjunct_expr_root = ctx; - auto st = local_state->_normalize_predicate(conjunct_expr_root->root(), - conjunct_expr_root.get(), new_root); + auto st = local_state->_normalize_predicate(conjunct_expr_root.get(), + conjunct_expr_root->root(), new_root); EXPECT_TRUE(st.ok()); EXPECT_EQ(new_root, nullptr); EXPECT_TRUE(local_state->_slot_id_to_value_range.contains(SlotId)); - const auto& output_range = local_state->_slot_id_to_value_range[SlotId].second; + const auto& output_range = local_state->_slot_id_to_value_range[SlotId]; std::visit( [&](auto&& arg) { using T = std::decay_t; @@ -1504,7 +1503,9 @@ TEST_F(ScanNormalizePredicate, test_timestamptz_predicate) { { auto local_state = std::make_shared(state.get(), op.get()); ColumnValueRange range("mock", false, 0, test_scale); - local_state->_slot_id_to_value_range[SlotId] = std::make_pair(&slot_desc, range); + local_state->_slot_id_to_value_range[SlotId] = range; + local_state->_slot_id_to_predicates[SlotId] = + std::vector>(); op->_slot_id_to_slot_desc[SlotId] = &slot_desc; auto slot_ref = std::make_shared(0, std::make_shared(test_scale)); @@ -1521,13 +1522,13 @@ TEST_F(ScanNormalizePredicate, test_timestamptz_predicate) { vectorized::VExprSPtr new_root; auto conjunct_expr_root = ctx; - auto st = local_state->_normalize_predicate(conjunct_expr_root->root(), - conjunct_expr_root.get(), new_root); + auto st = local_state->_normalize_predicate(conjunct_expr_root.get(), + conjunct_expr_root->root(), new_root); EXPECT_TRUE(st.ok()); EXPECT_EQ(new_root, nullptr); EXPECT_TRUE(local_state->_slot_id_to_value_range.contains(SlotId)); - auto& output_range = local_state->_slot_id_to_value_range[SlotId].second; + auto& output_range = local_state->_slot_id_to_value_range[SlotId]; std::visit( [&](auto&& arg) { using T = std::decay_t; @@ -1543,7 +1544,9 @@ TEST_F(ScanNormalizePredicate, test_timestamptz_predicate) { for (auto const_v : test_tz_values) { auto local_state = std::make_shared(state.get(), op.get()); ColumnValueRange range("mock", false, 0, test_scale); - local_state->_slot_id_to_value_range[SlotId] = std::make_pair(&slot_desc, range); + local_state->_slot_id_to_value_range[SlotId] = range; + local_state->_slot_id_to_predicates[SlotId] = + std::vector>(); op->_slot_id_to_slot_desc[SlotId] = &slot_desc; auto slot_ref = std::make_shared(0, std::make_shared(test_scale)); @@ -1564,8 +1567,8 @@ TEST_F(ScanNormalizePredicate, test_timestamptz_predicate) { vectorized::VExprSPtr new_root; auto conjunct_expr_root = ctx; - EXPECT_TRUE(local_state->_normalize_predicate(conjunct_expr_root->root(), - conjunct_expr_root.get(), new_root)); + EXPECT_TRUE(local_state->_normalize_predicate(conjunct_expr_root.get(), + conjunct_expr_root->root(), new_root)); EXPECT_EQ(new_root, nullptr); EXPECT_TRUE(local_state->_slot_id_to_value_range.contains(SlotId)); } @@ -1573,7 +1576,9 @@ TEST_F(ScanNormalizePredicate, test_timestamptz_predicate) { { auto local_state = std::make_shared(state.get(), op.get()); ColumnValueRange range("mock", false, 0, test_scale); - local_state->_slot_id_to_value_range[SlotId] = std::make_pair(&slot_desc, range); + local_state->_slot_id_to_value_range[SlotId] = range; + local_state->_slot_id_to_predicates[SlotId] = + std::vector>(); op->_slot_id_to_slot_desc[SlotId] = &slot_desc; auto slot_ref = std::make_shared(0, std::make_shared(test_scale)); @@ -1590,8 +1595,8 @@ TEST_F(ScanNormalizePredicate, test_timestamptz_predicate) { vectorized::VExprSPtr new_root; auto conjunct_expr_root = ctx; - auto st = local_state->_normalize_predicate(conjunct_expr_root->root(), - conjunct_expr_root.get(), new_root); + auto st = local_state->_normalize_predicate(conjunct_expr_root.get(), + conjunct_expr_root->root(), new_root); EXPECT_TRUE(st.ok()); EXPECT_EQ(new_root, nullptr); EXPECT_TRUE(local_state->_slot_id_to_value_range.contains(SlotId)); @@ -1600,10 +1605,10 @@ TEST_F(ScanNormalizePredicate, test_timestamptz_predicate) { { auto local_state = std::make_shared(state.get(), op.get()); ColumnValueRange range("mock", true, 0, test_scale); - local_state->_slot_id_to_value_range[SlotId] = std::make_pair(&nullable_slot_desc, range); - // local_state->_slot_id_to_predicates[SlotId] = - // std::vector>(); - // op->_slot_id_to_slot_desc[SlotId] = &slot_desc; + local_state->_slot_id_to_value_range[SlotId] = range; + local_state->_slot_id_to_predicates[SlotId] = + std::vector>(); + op->_slot_id_to_slot_desc[SlotId] = &nullable_slot_desc; auto slot_ref = std::make_shared( 0, std::make_shared( std::make_shared(test_scale))); @@ -1620,9 +1625,9 @@ TEST_F(ScanNormalizePredicate, test_timestamptz_predicate) { vectorized::VExprSPtr new_root; auto conjunct_expr_root = ctx; - EXPECT_TRUE(local_state->_normalize_predicate(conjunct_expr_root->root(), - conjunct_expr_root.get(), new_root)); - auto& output_range = local_state->_slot_id_to_value_range[SlotId].second; + EXPECT_TRUE(local_state->_normalize_predicate(conjunct_expr_root.get(), + conjunct_expr_root->root(), new_root)); + auto& output_range = local_state->_slot_id_to_value_range[SlotId]; std::visit( [](auto&& arg) { using T = std::decay_t; @@ -1640,8 +1645,9 @@ TEST_F(ScanNormalizePredicate, test_timestamptz_predicate) { { auto local_state = std::make_shared(state.get(), op.get()); ColumnValueRange range("mock", true, 0, test_scale); - local_state->_slot_id_to_value_range[SlotId] = std::make_pair(&slot_desc, range); - op->_slot_id_to_slot_desc[SlotId] = &slot_desc; + local_state->_slot_id_to_value_range[SlotId] = range; + local_state->_slot_id_to_predicates[SlotId] = + std::vector>(); auto slot_ref = std::make_shared( 0, std::make_shared( std::make_shared(test_scale))); @@ -1658,9 +1664,9 @@ TEST_F(ScanNormalizePredicate, test_timestamptz_predicate) { vectorized::VExprSPtr new_root; auto conjunct_expr_root = ctx; - EXPECT_TRUE(local_state->_normalize_predicate(conjunct_expr_root->root(), - conjunct_expr_root.get(), new_root)); - auto& output_range = local_state->_slot_id_to_value_range[SlotId].second; + EXPECT_TRUE(local_state->_normalize_predicate(conjunct_expr_root.get(), + conjunct_expr_root->root(), new_root)); + auto& output_range = local_state->_slot_id_to_value_range[SlotId]; std::visit( [](auto&& arg) { using T = std::decay_t; @@ -1678,7 +1684,9 @@ TEST_F(ScanNormalizePredicate, test_timestamptz_predicate) { // std::cout << "test less const_v=" << const_v << std::endl; auto local_state = std::make_shared(state.get(), op.get()); ColumnValueRange range("mock", false, 0, test_scale); - local_state->_slot_id_to_value_range[SlotId] = std::make_pair(&slot_desc, range); + local_state->_slot_id_to_value_range[SlotId] = range; + local_state->_slot_id_to_predicates[SlotId] = + std::vector>(); op->_slot_id_to_slot_desc[SlotId] = &slot_desc; auto slot_ref = @@ -1700,13 +1708,13 @@ TEST_F(ScanNormalizePredicate, test_timestamptz_predicate) { vectorized::VExprSPtr new_root; auto conjunct_expr_root = ctx; - auto st = local_state->_normalize_predicate(conjunct_expr_root->root(), - conjunct_expr_root.get(), new_root); + auto st = local_state->_normalize_predicate(conjunct_expr_root.get(), + conjunct_expr_root->root(), new_root); EXPECT_TRUE(st.ok()); EXPECT_EQ(new_root, nullptr); EXPECT_TRUE(local_state->_slot_id_to_value_range.contains(SlotId)); - const auto& output_range = local_state->_slot_id_to_value_range[SlotId].second; + const auto& output_range = local_state->_slot_id_to_value_range[SlotId]; /* _low_value = -inf, _high_value = 90, @@ -1738,7 +1746,9 @@ TEST_F(ScanNormalizePredicate, test_timestamptz_predicate) { // std::cout << "test less or equal const_v=" << const_v << std::endl; auto local_state = std::make_shared(state.get(), op.get()); ColumnValueRange range("mock", false, 0, test_scale); - local_state->_slot_id_to_value_range[SlotId] = std::make_pair(&slot_desc, range); + local_state->_slot_id_to_value_range[SlotId] = range; + local_state->_slot_id_to_predicates[SlotId] = + std::vector>(); auto slot_ref = std::make_shared(0, std::make_shared(test_scale)); @@ -1759,13 +1769,13 @@ TEST_F(ScanNormalizePredicate, test_timestamptz_predicate) { vectorized::VExprSPtr new_root; auto conjunct_expr_root = ctx; - auto st = local_state->_normalize_predicate(conjunct_expr_root->root(), - conjunct_expr_root.get(), new_root); + auto st = local_state->_normalize_predicate(conjunct_expr_root.get(), + conjunct_expr_root->root(), new_root); EXPECT_TRUE(st.ok()); EXPECT_EQ(new_root, nullptr); EXPECT_TRUE(local_state->_slot_id_to_value_range.contains(SlotId)); - const auto& output_range = local_state->_slot_id_to_value_range[SlotId].second; + const auto& output_range = local_state->_slot_id_to_value_range[SlotId]; std::visit( [&](auto&& arg) { using T = std::decay_t; @@ -1794,7 +1804,9 @@ TEST_F(ScanNormalizePredicate, test_timestamptz_predicate) { // std::cout << "test greater const_v=" << const_v << std::endl; auto local_state = std::make_shared(state.get(), op.get()); ColumnValueRange range("mock", false, 0, test_scale); - local_state->_slot_id_to_value_range[SlotId] = std::make_pair(&slot_desc, range); + local_state->_slot_id_to_value_range[SlotId] = range; + local_state->_slot_id_to_predicates[SlotId] = + std::vector>(); auto slot_ref = std::make_shared(0, std::make_shared(test_scale)); @@ -1815,13 +1827,13 @@ TEST_F(ScanNormalizePredicate, test_timestamptz_predicate) { vectorized::VExprSPtr new_root; auto conjunct_expr_root = ctx; - auto st = local_state->_normalize_predicate(conjunct_expr_root->root(), - conjunct_expr_root.get(), new_root); + auto st = local_state->_normalize_predicate(conjunct_expr_root.get(), + conjunct_expr_root->root(), new_root); EXPECT_TRUE(st.ok()); EXPECT_EQ(new_root, nullptr); EXPECT_TRUE(local_state->_slot_id_to_value_range.contains(SlotId)); - const auto& output_range = local_state->_slot_id_to_value_range[SlotId].second; + const auto& output_range = local_state->_slot_id_to_value_range[SlotId]; /* _low_value = 90, _high_value = nan, @@ -1853,7 +1865,9 @@ TEST_F(ScanNormalizePredicate, test_timestamptz_predicate) { // std::cout << "test greater or equal const_v=" << const_v << std::endl; auto local_state = std::make_shared(state.get(), op.get()); ColumnValueRange range("mock", false, 0, test_scale); - local_state->_slot_id_to_value_range[SlotId] = std::make_pair(&slot_desc, range); + local_state->_slot_id_to_value_range[SlotId] = range; + local_state->_slot_id_to_predicates[SlotId] = + std::vector>(); auto slot_ref = std::make_shared(0, std::make_shared(test_scale)); @@ -1874,13 +1888,13 @@ TEST_F(ScanNormalizePredicate, test_timestamptz_predicate) { vectorized::VExprSPtr new_root; auto conjunct_expr_root = ctx; - auto st = local_state->_normalize_predicate(conjunct_expr_root->root(), - conjunct_expr_root.get(), new_root); + auto st = local_state->_normalize_predicate(conjunct_expr_root.get(), + conjunct_expr_root->root(), new_root); EXPECT_TRUE(st.ok()); EXPECT_EQ(new_root, nullptr); EXPECT_TRUE(local_state->_slot_id_to_value_range.contains(SlotId)); - const auto& output_range = local_state->_slot_id_to_value_range[SlotId].second; + const auto& output_range = local_state->_slot_id_to_value_range[SlotId]; std::visit( [&](auto&& arg) { using T = std::decay_t; diff --git a/be/test/testutil/mock/mock_in_expr.h b/be/test/testutil/mock/mock_in_expr.h index 7f31d99c6cdaaf..8542cff046ee0e 100644 --- a/be/test/testutil/mock/mock_in_expr.h +++ b/be/test/testutil/mock/mock_in_expr.h @@ -30,7 +30,7 @@ class VExprContext; // use to mock a slot ref expr class MockInExpr final : public VInPredicate { public: - MockInExpr() = default; + MockInExpr() { _node_type = TExprNodeType::IN_PRED; } Status execute(VExprContext* context, Block* block, int* result_column_id) const override { return Status::OK(); diff --git a/be/test/vec/exec/format/parquet/parquet_expr_test.cpp b/be/test/vec/exec/format/parquet/parquet_expr_test.cpp index 07a1d1af4fbf29..40f3891df4667d 100644 --- a/be/test/vec/exec/format/parquet/parquet_expr_test.cpp +++ b/be/test/vec/exec/format/parquet/parquet_expr_test.cpp @@ -281,7 +281,8 @@ class ParquetExprTest : public testing::Test { &ctz, nullptr, nullptr); p_reader->set_file_reader(local_file_reader); colname_to_slot_id.emplace("int64_col", 2); - static_cast(p_reader->init_reader(column_names, &col_name_to_block_idx, {}, + phmap::flat_hash_map>> tmp; + static_cast(p_reader->init_reader(column_names, &col_name_to_block_idx, {}, tmp, tuple_desc, nullptr, &colname_to_slot_id, nullptr, nullptr)); @@ -401,126 +402,6 @@ TEST_F(ParquetExprTest, test_min_max) { } } -TEST_F(ParquetExprTest, test_ne) { - auto slot_ref = std::make_shared(0, std::make_shared()); - auto fn_eq = MockFnCall::create("ne"); - auto const_val = std::make_shared( - ColumnHelper::create_column_with_name({100})); - - slot_ref->set_expr_name("int32_all_null_col"); - fn_eq->add_child(slot_ref); - fn_eq->add_child(const_val); - fn_eq->_node_type = TExprNodeType::BINARY_PRED; - fn_eq->_opcode = TExprOpcode::NE; - slot_ref->_slot_id = 1; - EXPECT_FALSE(fn_eq->is_constant()); - - auto ctx = VExprContext::create_shared(fn_eq); - ctx->_prepared = true; - ctx->_opened = true; - ASSERT_FALSE(p_reader->check_expr_can_push_down(ctx->root())); -} - -TEST_F(ParquetExprTest, test_eq) { - auto slot_ref = std::make_shared(0, std::make_shared()); - auto fn_eq = MockFnCall::create("eq"); - auto const_val = std::make_shared( - ColumnHelper::create_column_with_name({100})); - slot_ref->set_expr_name("int32_all_null_col"); - fn_eq->add_child(slot_ref); - fn_eq->add_child(const_val); - fn_eq->_node_type = TExprNodeType::BINARY_PRED; - fn_eq->_opcode = TExprOpcode::EQ; - slot_ref->_slot_id = 1; - slot_ref->_column_id = 1; - EXPECT_FALSE(fn_eq->is_constant()); - - auto ctx = VExprContext::create_shared(fn_eq); - ctx->_prepared = true; - ctx->_opened = true; - ASSERT_TRUE(p_reader->check_expr_can_push_down(ctx->root())); -} - -TEST_F(ParquetExprTest, test_le) { - auto slot_ref = std::make_shared(0, std::make_shared()); - auto fn_eq = MockFnCall::create("le"); - auto const_val = std::make_shared( - ColumnHelper::create_column_with_name({100})); - slot_ref->set_expr_name("int32_all_null_col"); - fn_eq->add_child(slot_ref); - fn_eq->add_child(const_val); - fn_eq->_node_type = TExprNodeType::BINARY_PRED; - fn_eq->_opcode = TExprOpcode::LE; - slot_ref->_slot_id = 1; - slot_ref->_column_id = 1; - EXPECT_FALSE(fn_eq->is_constant()); - - auto ctx = VExprContext::create_shared(fn_eq); - ctx->_prepared = true; - ctx->_opened = true; - ASSERT_TRUE(p_reader->check_expr_can_push_down(ctx->root())); -} - -TEST_F(ParquetExprTest, test_ge) { - auto slot_ref = std::make_shared(0, std::make_shared()); - auto fn_eq = MockFnCall::create("ge"); - auto const_val = std::make_shared( - ColumnHelper::create_column_with_name({100})); - slot_ref->set_expr_name("int32_all_null_col"); - fn_eq->add_child(slot_ref); - fn_eq->add_child(const_val); - fn_eq->_node_type = TExprNodeType::BINARY_PRED; - fn_eq->_opcode = TExprOpcode::GE; - slot_ref->_slot_id = 1; - slot_ref->_column_id = 1; - EXPECT_FALSE(fn_eq->is_constant()); - - auto ctx = VExprContext::create_shared(fn_eq); - ctx->_prepared = true; - ctx->_opened = true; - ASSERT_TRUE(p_reader->check_expr_can_push_down(ctx->root())); -} - -TEST_F(ParquetExprTest, test_gt) { - auto slot_ref = std::make_shared(0, std::make_shared()); - auto fn_eq = MockFnCall::create("gt"); - auto const_val = std::make_shared( - ColumnHelper::create_column_with_name({100})); - slot_ref->set_expr_name("int32_all_null_col"); - fn_eq->add_child(slot_ref); - fn_eq->add_child(const_val); - fn_eq->_node_type = TExprNodeType::BINARY_PRED; - fn_eq->_opcode = TExprOpcode::GT; - slot_ref->_slot_id = 1; - slot_ref->_column_id = 1; - EXPECT_FALSE(fn_eq->is_constant()); - - auto ctx = VExprContext::create_shared(fn_eq); - ctx->_prepared = true; - ctx->_opened = true; - ASSERT_TRUE(p_reader->check_expr_can_push_down(ctx->root())); -} - -TEST_F(ParquetExprTest, test_lt) { - auto slot_ref = std::make_shared(0, std::make_shared()); - auto fn_eq = MockFnCall::create("lt"); - auto const_val = std::make_shared( - ColumnHelper::create_column_with_name({100})); - slot_ref->set_expr_name("int32_all_null_col"); - fn_eq->add_child(slot_ref); - fn_eq->add_child(const_val); - fn_eq->_node_type = TExprNodeType::BINARY_PRED; - fn_eq->_opcode = TExprOpcode::LT; - slot_ref->_slot_id = 1; - slot_ref->_column_id = 1; - EXPECT_FALSE(fn_eq->is_constant()); - - auto ctx = VExprContext::create_shared(fn_eq); - ctx->_prepared = true; - ctx->_opened = true; - ASSERT_TRUE(p_reader->check_expr_can_push_down(ctx->root())); -} - TEST_F(ParquetExprTest, test_ge_2) { // int64_col = 10000000001 [10000000000 , 10000000000+3) // int64_col = 10000000001 [10000000000 , 10000000000+3) int loc = 2; @@ -540,7 +421,6 @@ TEST_F(ParquetExprTest, test_ge_2) { // int64_col = 10000000001 [10000000000 , auto ctx = VExprContext::create_shared(fn_eq); ctx->_prepared = true; ctx->_opened = true; - ASSERT_TRUE(p_reader->check_expr_can_push_down(ctx->root())); { const std::function& @@ -592,7 +472,6 @@ TEST_F(ParquetExprTest, test_lt_2) { // string_col < name_1 auto ctx = VExprContext::create_shared(fn_eq); ctx->_prepared = true; ctx->_opened = true; - ASSERT_TRUE(p_reader->check_expr_can_push_down(ctx->root())); { const std::function& @@ -636,8 +515,6 @@ TEST_F(ParquetExprTest, test_is_null) { // int32_all_null_col is null ctx->_prepared = true; ctx->_opened = true; - ASSERT_TRUE(p_reader->check_expr_can_push_down(ctx->root())); - { const std::function& get_stat_func = @@ -687,8 +564,6 @@ TEST_F(ParquetExprTest, test_is_not_null) { // int32_all_null_col is not null ctx->_prepared = true; ctx->_opened = true; - ASSERT_TRUE(p_reader->check_expr_can_push_down(ctx->root())); - { const std::function& get_stat_func = @@ -738,8 +613,6 @@ TEST_F(ParquetExprTest, test_is_null_2) { // int32_partial_null_col is null ctx->_prepared = true; ctx->_opened = true; - ASSERT_TRUE(p_reader->check_expr_can_push_down(ctx->root())); - { const std::function& get_stat_func = @@ -1174,69 +1047,28 @@ TEST_F(ParquetExprTest, test_expr_push_down_eq_bool) { } TEST_F(ParquetExprTest, test_expr_push_down_and) { + std::unique_ptr pred = AndBlockColumnPredicate::create_unique(); auto and_expr = std::make_shared(); and_expr->_op = TExprOpcode::COMPOUND_AND; and_expr->_opcode = TExprOpcode::COMPOUND_AND; and_expr->_node_type = TExprNodeType::COMPOUND_PRED; // x <= 10000000002 { - auto slot_ref = std::make_shared(2, std::make_shared()); - auto fn_le = MockFnCall::create("le"); - auto const_val = std::make_shared( - ColumnHelper::create_column_with_name({10000000002})); - slot_ref->set_expr_name("int64_col"); - fn_le->add_child(slot_ref); - fn_le->add_child(const_val); - fn_le->_node_type = TExprNodeType::BINARY_PRED; - fn_le->_opcode = TExprOpcode::LE; - slot_ref->_slot_id = 2; - slot_ref->_column_id = 2; - EXPECT_FALSE(fn_le->is_constant()); - - auto ctx = VExprContext::create_shared(fn_le); - ctx->_prepared = true; - ctx->_opened = true; - and_expr->add_child(ctx->root()); + pred->add_column_predicate(SingleColumnBlockPredicate::create_unique( + ComparisonPredicateBase::create_shared( + 2, "", 10000000002))); } { // x > 100 - auto slot_ref = std::make_shared(2, std::make_shared()); - auto fn_le = MockFnCall::create("gt"); - auto const_val = std::make_shared( - ColumnHelper::create_column_with_name({100})); - slot_ref->set_expr_name("int64_col"); - fn_le->add_child(slot_ref); - fn_le->add_child(const_val); - fn_le->_node_type = TExprNodeType::BINARY_PRED; - fn_le->_opcode = TExprOpcode::GT; - slot_ref->_slot_id = 2; - slot_ref->_column_id = 2; - EXPECT_FALSE(fn_le->is_constant()); - - auto ctx = VExprContext::create_shared(fn_le); - ctx->_prepared = true; - ctx->_opened = true; - and_expr->add_child(ctx->root()); + pred->add_column_predicate(SingleColumnBlockPredicate::create_unique( + ComparisonPredicateBase::create_shared(2, "", + 100))); } { // x >= 900 - auto slot_ref = std::make_shared(2, std::make_shared()); - auto fn_le = MockFnCall::create("ge"); - auto const_val = std::make_shared( - ColumnHelper::create_column_with_name({900})); - slot_ref->set_expr_name("int64_col"); - fn_le->add_child(slot_ref); - fn_le->add_child(const_val); - fn_le->_node_type = TExprNodeType::BINARY_PRED; - fn_le->_opcode = TExprOpcode::GE; - slot_ref->_slot_id = 2; - slot_ref->_column_id = 2; - EXPECT_FALSE(fn_le->is_constant()); - - auto ctx = VExprContext::create_shared(fn_le); - ctx->_prepared = true; - ctx->_opened = true; - and_expr->add_child(ctx->root()); + pred->add_column_predicate(SingleColumnBlockPredicate::create_unique( + ComparisonPredicateBase::create_shared(2, "", + 900))); } const std::function& get_stat_func = @@ -1250,16 +1082,8 @@ TEST_F(ParquetExprTest, test_expr_push_down_and) { } return true; }; - ASSERT_TRUE(p_reader->check_expr_can_push_down(and_expr)); - p_reader->_enable_filter_by_min_max = true; - std::map>> push_down_simple_predicates; - push_down_simple_predicates.emplace(2, std::vector> {}); - p_reader->_push_down_predicates.push_back(AndBlockColumnPredicate::create_unique()); - ASSERT_TRUE(p_reader->convert_predicates({and_expr}, push_down_simple_predicates[2], - p_reader->_push_down_predicates.back(), - p_reader->_arena) - .ok()); + p_reader->_push_down_predicates.push_back(std::move(pred)); bool filter_group = false; bool filtered_by_min_max = false; @@ -1334,13 +1158,12 @@ TEST_F(ParquetExprTest, test_expr_push_down_or_string) { } return true; }; - ASSERT_TRUE(p_reader->check_expr_can_push_down(or_expr)); } TEST_F(ParquetExprTest, test_bloom_filter_skipped_when_range_miss) { const int col_idx = 2; const int64_t predicate_value = 10000000001; - ComparisonPredicateBase eq_pred(col_idx, predicate_value); + ComparisonPredicateBase eq_pred(col_idx, "", predicate_value); ParquetPredicate::ColumnStat stat; stat.ctz = &ctz; @@ -1382,7 +1205,7 @@ TEST_F(ParquetExprTest, test_bloom_filter_skipped_when_range_miss) { TEST_F(ParquetExprTest, test_bloom_filter_rejects_value) { const int col_idx = 2; const int64_t predicate_value = 10000000001; - ComparisonPredicateBase eq_pred(col_idx, predicate_value); + ComparisonPredicateBase eq_pred(col_idx, "", predicate_value); ParquetPredicate::ColumnStat stat; stat.ctz = &ctz; @@ -1433,7 +1256,7 @@ TEST_F(ParquetExprTest, test_bloom_filter_rejects_value) { TEST_F(ParquetExprTest, test_bloom_filter_accepts_value) { const int col_idx = 2; const int64_t predicate_value = 10000000001; - ComparisonPredicateBase eq_pred(col_idx, predicate_value); + ComparisonPredicateBase eq_pred(col_idx, "", predicate_value); ParquetPredicate::ColumnStat stat; stat.ctz = &ctz; @@ -1484,7 +1307,7 @@ TEST_F(ParquetExprTest, test_bloom_filter_accepts_value) { TEST_F(ParquetExprTest, test_bloom_filter_skipped_when_min_max_evicts_rowgroup) { const int col_idx = 2; const int64_t predicate_value = 10000000001; - ComparisonPredicateBase eq_pred(col_idx, predicate_value); + ComparisonPredicateBase eq_pred(col_idx, "", predicate_value); ParquetPredicate::ColumnStat stat; stat.ctz = &ctz; @@ -1527,7 +1350,7 @@ TEST_F(ParquetExprTest, test_bloom_filter_skipped_when_min_max_evicts_rowgroup) TEST_F(ParquetExprTest, test_bloom_filter_loader_called_when_min_max_allows) { const int col_idx = 2; const int64_t predicate_value = 10000000001; - ComparisonPredicateBase eq_pred(col_idx, predicate_value); + ComparisonPredicateBase eq_pred(col_idx, "", predicate_value); ParquetPredicate::ColumnStat stat; stat.ctz = &ctz; @@ -1578,7 +1401,7 @@ TEST_F(ParquetExprTest, test_bloom_filter_loader_called_when_min_max_allows) { TEST_F(ParquetExprTest, test_bloom_filter_loader_not_called_when_missing_metadata) { const int col_idx = 2; const int64_t predicate_value = 10000000001; - ComparisonPredicateBase eq_pred(col_idx, predicate_value); + ComparisonPredicateBase eq_pred(col_idx, "", predicate_value); ParquetPredicate::ColumnStat stat; stat.ctz = &ctz; @@ -1620,7 +1443,7 @@ TEST_F(ParquetExprTest, test_bloom_filter_loader_not_called_when_missing_metadat TEST_F(ParquetExprTest, test_bloom_filter_loader_resets_on_failure) { const int col_idx = 2; const int64_t predicate_value = 10000000001; - ComparisonPredicateBase eq_pred(col_idx, predicate_value); + ComparisonPredicateBase eq_pred(col_idx, "", predicate_value); ParquetPredicate::ColumnStat stat; stat.ctz = &ctz; @@ -1667,7 +1490,7 @@ TEST_F(ParquetExprTest, test_bloom_filter_loader_resets_on_failure) { TEST_F(ParquetExprTest, test_bloom_filter_not_supported_type) { const int col_idx = 6; // bool column const bool predicate_value = true; - ComparisonPredicateBase eq_pred(col_idx, predicate_value); + ComparisonPredicateBase eq_pred(col_idx, "", predicate_value); ParquetPredicate::ColumnStat stat; stat.ctz = &ctz; @@ -1707,7 +1530,7 @@ TEST_F(ParquetExprTest, test_bloom_filter_not_supported_type) { TEST_F(ParquetExprTest, test_bloom_filter_min_max_overlap_but_no_loader) { const int col_idx = 2; const int64_t predicate_value = 10000000001; - ComparisonPredicateBase eq_pred(col_idx, predicate_value); + ComparisonPredicateBase eq_pred(col_idx, "", predicate_value); ParquetPredicate::ColumnStat stat; stat.ctz = &ctz; @@ -1749,8 +1572,7 @@ TEST_F(ParquetExprTest, test_in_list_predicate_uses_bloom_filter) { set->insert(&v); } - InListPredicateBase> - in_pred(col_idx, set); + InListPredicateBase in_pred(col_idx, "", set, false); ParquetPredicate::ColumnStat stat; stat.ctz = &ctz; @@ -1803,8 +1625,7 @@ TEST_F(ParquetExprTest, test_in_list_predicate_no_loader_on_range_miss) { set->insert(&v); } - InListPredicateBase> - in_pred(col_idx, set); + InListPredicateBase in_pred(col_idx, "", set, false); ParquetPredicate::ColumnStat stat; stat.ctz = &ctz; @@ -1846,7 +1667,7 @@ TEST_F(ParquetExprTest, test_in_list_predicate_no_loader_on_range_miss) { TEST_F(ParquetExprTest, test_bloom_filter_reused_after_first_load) { const int col_idx = 2; const int64_t predicate_value = 10000000001; - ComparisonPredicateBase eq_pred(col_idx, predicate_value); + ComparisonPredicateBase eq_pred(col_idx, "", predicate_value); ParquetPredicate::ColumnStat stat; stat.ctz = &ctz; diff --git a/be/test/vec/exec/format/parquet/parquet_read_lines.cpp b/be/test/vec/exec/format/parquet/parquet_read_lines.cpp index 60aebd16dd4d79..46d8d1020e1085 100644 --- a/be/test/vec/exec/format/parquet/parquet_read_lines.cpp +++ b/be/test/vec/exec/format/parquet/parquet_read_lines.cpp @@ -151,7 +151,8 @@ static void read_parquet_lines(std::vector numeric_types, runtime_state.set_desc_tbl(desc_tbl); std::unordered_map colname_to_value_range; - static_cast(p_reader->init_reader(column_names, &col_name_to_block_idx, {}, nullptr, + phmap::flat_hash_map>> tmp; + static_cast(p_reader->init_reader(column_names, &col_name_to_block_idx, {}, tmp, nullptr, nullptr, nullptr, nullptr, nullptr)); std::unordered_map> partition_columns; diff --git a/be/test/vec/exec/format/parquet/parquet_reader_test.cpp b/be/test/vec/exec/format/parquet/parquet_reader_test.cpp index 97b40bccf0a377..74f9a1b142ff8b 100644 --- a/be/test/vec/exec/format/parquet/parquet_reader_test.cpp +++ b/be/test/vec/exec/format/parquet/parquet_reader_test.cpp @@ -151,7 +151,8 @@ TEST_F(ParquetReaderTest, normal) { RuntimeState runtime_state((TQueryGlobals())); runtime_state.set_desc_tbl(desc_tbl); - static_cast(p_reader->init_reader(column_names, &col_name_to_block_idx, {}, nullptr, + phmap::flat_hash_map>> tmp; + static_cast(p_reader->init_reader(column_names, &col_name_to_block_idx, {}, tmp, nullptr, nullptr, nullptr, nullptr, nullptr)); std::unordered_map> partition_columns; @@ -215,8 +216,9 @@ TEST_F(ParquetReaderTest, uuid_varbinary) { RuntimeState runtime_state((TQueryGlobals())); runtime_state.set_desc_tbl(desc_tbl); - st = p_reader->init_reader(column_names, &col_name_to_block_idx, {}, nullptr, nullptr, nullptr, - nullptr, nullptr); + phmap::flat_hash_map>> tmp; + st = p_reader->init_reader(column_names, &col_name_to_block_idx, {}, tmp, nullptr, nullptr, + nullptr, nullptr, nullptr); EXPECT_TRUE(st.ok()) << st; std::unordered_map> partition_columns; @@ -288,8 +290,9 @@ TEST_F(ParquetReaderTest, varbinary_varbinary) { RuntimeState runtime_state((TQueryGlobals())); runtime_state.set_desc_tbl(desc_tbl); - st = p_reader->init_reader(column_names, &col_name_to_block_idx, {}, nullptr, nullptr, nullptr, - nullptr, nullptr); + phmap::flat_hash_map>> tmp; + st = p_reader->init_reader(column_names, &col_name_to_block_idx, {}, tmp, nullptr, nullptr, + nullptr, nullptr, nullptr); EXPECT_TRUE(st.ok()) << st; std::unordered_map> partition_columns; @@ -363,8 +366,9 @@ TEST_F(ParquetReaderTest, varbinary_string) { RuntimeState runtime_state((TQueryGlobals())); runtime_state.set_desc_tbl(desc_tbl); - st = p_reader->init_reader(column_names, &col_name_to_block_idx, {}, nullptr, nullptr, nullptr, - nullptr, nullptr); + phmap::flat_hash_map>> tmp; + st = p_reader->init_reader(column_names, &col_name_to_block_idx, {}, tmp, nullptr, nullptr, + nullptr, nullptr, nullptr); EXPECT_TRUE(st.ok()) << st; std::unordered_map> partition_columns; @@ -438,8 +442,9 @@ TEST_F(ParquetReaderTest, varbinary_string2) { RuntimeState runtime_state((TQueryGlobals())); runtime_state.set_desc_tbl(desc_tbl); - st = p_reader->init_reader(column_names, &col_name_to_block_idx, {}, nullptr, nullptr, nullptr, - nullptr, nullptr); + phmap::flat_hash_map>> tmp; + st = p_reader->init_reader(column_names, &col_name_to_block_idx, {}, tmp, nullptr, nullptr, + nullptr, nullptr, nullptr); EXPECT_TRUE(st.ok()) << st; std::unordered_map> partition_columns; diff --git a/be/test/vec/exec/format/parquet/parquet_statistics_test.cpp b/be/test/vec/exec/format/parquet/parquet_statistics_test.cpp index cd8d3068fe1312..b414095245ec7e 100644 --- a/be/test/vec/exec/format/parquet/parquet_statistics_test.cpp +++ b/be/test/vec/exec/format/parquet/parquet_statistics_test.cpp @@ -19,7 +19,7 @@ #include -#include "vec/exec/format/parquet/parquet_pred_cmp.h" +#include "vec/exec/format/parquet/parquet_predicate.h" namespace doris { namespace vectorized { diff --git a/be/test/vec/exec/format/table/hive/hive_reader_test.cpp b/be/test/vec/exec/format/table/hive/hive_reader_test.cpp index d79f4ded7888fd..16608a85dee421 100644 --- a/be/test/vec/exec/format/table/hive/hive_reader_test.cpp +++ b/be/test/vec/exec/format/table/hive/hive_reader_test.cpp @@ -572,7 +572,8 @@ TEST_F(HiveReaderTest, read_hive_parquet_file) { const VExprContextSPtrs* not_single_slot_filter_conjuncts = nullptr; const std::unordered_map* slot_id_to_filter_conjuncts = nullptr; - st = hive_reader->init_reader(table_col_names, &col_name_to_block_idx, conjuncts, + phmap::flat_hash_map>> tmp; + st = hive_reader->init_reader(table_col_names, &col_name_to_block_idx, conjuncts, tmp, tuple_descriptor, row_descriptor, colname_to_slot_id, not_single_slot_filter_conjuncts, slot_id_to_filter_conjuncts); ASSERT_TRUE(st.ok()) << st; diff --git a/be/test/vec/exec/format/table/iceberg/iceberg_reader_test.cpp b/be/test/vec/exec/format/table/iceberg/iceberg_reader_test.cpp index f82e64bb03ff33..4e72f3f5e6ed25 100644 --- a/be/test/vec/exec/format/table/iceberg/iceberg_reader_test.cpp +++ b/be/test/vec/exec/format/table/iceberg/iceberg_reader_test.cpp @@ -572,7 +572,8 @@ TEST_F(IcebergReaderTest, read_iceberg_parquet_file) { const VExprContextSPtrs* not_single_slot_filter_conjuncts = nullptr; const std::unordered_map* slot_id_to_filter_conjuncts = nullptr; - st = iceberg_reader->init_reader(table_col_names, &col_name_to_block_idx, conjuncts, + phmap::flat_hash_map>> tmp; + st = iceberg_reader->init_reader(table_col_names, &col_name_to_block_idx, conjuncts, tmp, tuple_descriptor, row_descriptor, colname_to_slot_id, not_single_slot_filter_conjuncts, slot_id_to_filter_conjuncts); ASSERT_TRUE(st.ok()) << st; diff --git a/be/test/vec/exec/vfile_scanner_exception_test.cpp b/be/test/vec/exec/vfile_scanner_exception_test.cpp index e5d38b18c54e72..6927a5076c19e5 100644 --- a/be/test/vec/exec/vfile_scanner_exception_test.cpp +++ b/be/test/vec/exec/vfile_scanner_exception_test.cpp @@ -276,13 +276,11 @@ void VfileScannerExceptionTest::init() { void VfileScannerExceptionTest::generate_scanner(std::shared_ptr& scanner) { auto split_source = std::make_shared(_scan_range); - std::unordered_map _colname_to_value_range; std::unordered_map _colname_to_slot_id; scanner = std::make_shared( &_runtime_state, &(_runtime_state.get_local_state(0)->cast()), -1, - split_source, _profile, _kv_cache.get(), &_colname_to_value_range, - &_colname_to_slot_id); + split_source, _profile, _kv_cache.get(), &_colname_to_slot_id); scanner->_is_load = false; vectorized::VExprContextSPtrs _conjuncts; WARN_IF_ERROR(scanner->init(&_runtime_state, _conjuncts), "fail to prepare scanner");