diff --git a/src/iceberg/CMakeLists.txt b/src/iceberg/CMakeLists.txt index d5429808c..7c71f6510 100644 --- a/src/iceberg/CMakeLists.txt +++ b/src/iceberg/CMakeLists.txt @@ -25,6 +25,7 @@ set(ICEBERG_SOURCES expression/evaluator.cc expression/expression.cc expression/expressions.cc + expression/inclusive_metrics_evaluator.cc expression/literal.cc expression/predicate.cc expression/rewrite_not.cc diff --git a/src/iceberg/expression/expressions.cc b/src/iceberg/expression/expressions.cc index 786cc0ab7..7eef60232 100644 --- a/src/iceberg/expression/expressions.cc +++ b/src/iceberg/expression/expressions.cc @@ -156,56 +156,21 @@ std::shared_ptr> Expressions::IsNull( return IsNull(Ref(std::move(name))); } -template -std::shared_ptr> Expressions::IsNull( - std::shared_ptr> expr) { - ICEBERG_ASSIGN_OR_THROW( - auto pred, - UnboundPredicateImpl::Make(Expression::Operation::kIsNull, std::move(expr))); - return pred; -} - std::shared_ptr> Expressions::NotNull( std::string name) { return NotNull(Ref(std::move(name))); } -template -std::shared_ptr> Expressions::NotNull( - std::shared_ptr> expr) { - ICEBERG_ASSIGN_OR_THROW( - auto pred, - UnboundPredicateImpl::Make(Expression::Operation::kNotNull, std::move(expr))); - return pred; -} - std::shared_ptr> Expressions::IsNaN( std::string name) { return IsNaN(Ref(std::move(name))); } -template -std::shared_ptr> Expressions::IsNaN( - std::shared_ptr> expr) { - ICEBERG_ASSIGN_OR_THROW(auto pred, UnboundPredicateImpl::Make( - Expression::Operation::kIsNan, std::move(expr))); - return pred; -} - std::shared_ptr> Expressions::NotNaN( std::string name) { return NotNaN(Ref(std::move(name))); } -template -std::shared_ptr> Expressions::NotNaN( - std::shared_ptr> expr) { - ICEBERG_ASSIGN_OR_THROW( - auto pred, - UnboundPredicateImpl::Make(Expression::Operation::kNotNan, std::move(expr))); - return pred; -} - // Template implementations for comparison predicates std::shared_ptr> Expressions::LessThan( @@ -213,85 +178,31 @@ std::shared_ptr> 
Expressions::LessThan( return LessThan(Ref(std::move(name)), std::move(value)); } -template -std::shared_ptr> Expressions::LessThan( - std::shared_ptr> expr, Literal value) { - ICEBERG_ASSIGN_OR_THROW( - auto pred, UnboundPredicateImpl::Make(Expression::Operation::kLt, - std::move(expr), std::move(value))); - return pred; -} - std::shared_ptr> Expressions::LessThanOrEqual( std::string name, Literal value) { return LessThanOrEqual(Ref(std::move(name)), std::move(value)); } -template -std::shared_ptr> Expressions::LessThanOrEqual( - std::shared_ptr> expr, Literal value) { - ICEBERG_ASSIGN_OR_THROW( - auto pred, UnboundPredicateImpl::Make(Expression::Operation::kLtEq, - std::move(expr), std::move(value))); - return pred; -} - std::shared_ptr> Expressions::GreaterThan( std::string name, Literal value) { return GreaterThan(Ref(std::move(name)), std::move(value)); } -template -std::shared_ptr> Expressions::GreaterThan( - std::shared_ptr> expr, Literal value) { - ICEBERG_ASSIGN_OR_THROW( - auto pred, UnboundPredicateImpl::Make(Expression::Operation::kGt, - std::move(expr), std::move(value))); - return pred; -} - std::shared_ptr> Expressions::GreaterThanOrEqual( std::string name, Literal value) { return GreaterThanOrEqual(Ref(std::move(name)), std::move(value)); } -template -std::shared_ptr> Expressions::GreaterThanOrEqual( - std::shared_ptr> expr, Literal value) { - ICEBERG_ASSIGN_OR_THROW( - auto pred, UnboundPredicateImpl::Make(Expression::Operation::kGtEq, - std::move(expr), std::move(value))); - return pred; -} - std::shared_ptr> Expressions::Equal(std::string name, Literal value) { return Equal(Ref(std::move(name)), std::move(value)); } -template -std::shared_ptr> Expressions::Equal( - std::shared_ptr> expr, Literal value) { - ICEBERG_ASSIGN_OR_THROW( - auto pred, UnboundPredicateImpl::Make(Expression::Operation::kEq, - std::move(expr), std::move(value))); - return pred; -} - std::shared_ptr> Expressions::NotEqual( std::string name, Literal value) { return 
NotEqual(Ref(std::move(name)), std::move(value)); } -template -std::shared_ptr> Expressions::NotEqual( - std::shared_ptr> expr, Literal value) { - ICEBERG_ASSIGN_OR_THROW( - auto pred, UnboundPredicateImpl::Make(Expression::Operation::kNotEq, - std::move(expr), std::move(value))); - return pred; -} - // String predicates std::shared_ptr> Expressions::StartsWith( @@ -299,31 +210,11 @@ std::shared_ptr> Expressions::StartsWith( return StartsWith(Ref(std::move(name)), std::move(value)); } -template -std::shared_ptr> Expressions::StartsWith( - std::shared_ptr> expr, std::string value) { - ICEBERG_ASSIGN_OR_THROW( - auto pred, - UnboundPredicateImpl::Make(Expression::Operation::kStartsWith, std::move(expr), - Literal::String(std::move(value)))); - return pred; -} - std::shared_ptr> Expressions::NotStartsWith( std::string name, std::string value) { return NotStartsWith(Ref(std::move(name)), std::move(value)); } -template -std::shared_ptr> Expressions::NotStartsWith( - std::shared_ptr> expr, std::string value) { - ICEBERG_ASSIGN_OR_THROW( - auto pred, - UnboundPredicateImpl::Make(Expression::Operation::kNotStartsWith, - std::move(expr), Literal::String(std::move(value)))); - return pred; -} - // Template implementations for set predicates std::shared_ptr> Expressions::In( @@ -331,51 +222,21 @@ std::shared_ptr> Expressions::In( return In(Ref(std::move(name)), std::move(values)); } -template -std::shared_ptr> Expressions::In( - std::shared_ptr> expr, std::vector values) { - ICEBERG_ASSIGN_OR_THROW( - auto pred, UnboundPredicateImpl::Make(Expression::Operation::kIn, - std::move(expr), std::move(values))); - return pred; -} - std::shared_ptr> Expressions::In( std::string name, std::initializer_list values) { return In(Ref(std::move(name)), std::vector(values)); } -template -std::shared_ptr> Expressions::In( - std::shared_ptr> expr, std::initializer_list values) { - return In(std::move(expr), std::vector(values)); -} - std::shared_ptr> Expressions::NotIn( std::string name, 
std::vector values) { return NotIn(Ref(std::move(name)), std::move(values)); } -template -std::shared_ptr> Expressions::NotIn( - std::shared_ptr> expr, std::vector values) { - ICEBERG_ASSIGN_OR_THROW( - auto pred, UnboundPredicateImpl::Make(Expression::Operation::kNotIn, - std::move(expr), std::move(values))); - return pred; -} - std::shared_ptr> Expressions::NotIn( std::string name, std::initializer_list values) { return NotIn(Ref(std::move(name)), std::vector(values)); } -template -std::shared_ptr> Expressions::NotIn( - std::shared_ptr> expr, std::initializer_list values) { - return NotIn(expr, std::vector(values)); -} - // Template implementations for generic predicate factory std::shared_ptr> Expressions::Predicate( @@ -404,29 +265,6 @@ std::shared_ptr> Expressions::Predicate( return pred; } -template -std::shared_ptr> Expressions::Predicate( - Expression::Operation op, std::shared_ptr> expr, - std::vector values) { - ICEBERG_ASSIGN_OR_THROW( - auto pred, UnboundPredicateImpl::Make(op, std::move(expr), std::move(values))); - return pred; -} - -template -std::shared_ptr> Expressions::Predicate( - Expression::Operation op, std::shared_ptr> expr, - std::initializer_list values) { - return Predicate(op, std::move(expr), std::vector(values)); -} - -template -std::shared_ptr> Expressions::Predicate( - Expression::Operation op, std::shared_ptr> expr) { - ICEBERG_ASSIGN_OR_THROW(auto pred, UnboundPredicateImpl::Make(op, std::move(expr))); - return pred; -} - // Constants std::shared_ptr Expressions::AlwaysTrue() { return True::Instance(); } diff --git a/src/iceberg/expression/expressions.h b/src/iceberg/expression/expressions.h index cb1d6df7e..92c523ca7 100644 --- a/src/iceberg/expression/expressions.h +++ b/src/iceberg/expression/expressions.h @@ -27,7 +27,6 @@ #include #include -#include "iceberg/exception.h" #include "iceberg/expression/aggregate.h" #include "iceberg/expression/literal.h" #include "iceberg/expression/predicate.h" @@ -152,7 +151,12 @@ class 
ICEBERG_EXPORT Expressions { /// \brief Create an IS NULL predicate for an unbound term. template static std::shared_ptr> IsNull( - std::shared_ptr> expr); + std::shared_ptr> expr) { + ICEBERG_ASSIGN_OR_THROW( + auto pred, + UnboundPredicateImpl::Make(Expression::Operation::kIsNull, std::move(expr))); + return pred; + } /// \brief Create a NOT NULL predicate for a field name. static std::shared_ptr> NotNull(std::string name); @@ -160,7 +164,12 @@ class ICEBERG_EXPORT Expressions { /// \brief Create a NOT NULL predicate for an unbound term. template static std::shared_ptr> NotNull( - std::shared_ptr> expr); + std::shared_ptr> expr) { + ICEBERG_ASSIGN_OR_THROW( + auto pred, + UnboundPredicateImpl::Make(Expression::Operation::kNotNull, std::move(expr))); + return pred; + } /// \brief Create an IS NaN predicate for a field name. static std::shared_ptr> IsNaN(std::string name); @@ -168,7 +177,12 @@ class ICEBERG_EXPORT Expressions { /// \brief Create an IS NaN predicate for an unbound term. template static std::shared_ptr> IsNaN( - std::shared_ptr> expr); + std::shared_ptr> expr) { + ICEBERG_ASSIGN_OR_THROW( + auto pred, + UnboundPredicateImpl::Make(Expression::Operation::kIsNan, std::move(expr))); + return pred; + } /// \brief Create a NOT NaN predicate for a field name. static std::shared_ptr> NotNaN(std::string name); @@ -176,7 +190,12 @@ class ICEBERG_EXPORT Expressions { /// \brief Create a NOT NaN predicate for an unbound term. template static std::shared_ptr> NotNaN( - std::shared_ptr> expr); + std::shared_ptr> expr) { + ICEBERG_ASSIGN_OR_THROW( + auto pred, + UnboundPredicateImpl::Make(Expression::Operation::kNotNan, std::move(expr))); + return pred; + } // Comparison predicates @@ -187,7 +206,12 @@ class ICEBERG_EXPORT Expressions { /// \brief Create a less than predicate for an unbound term. 
template static std::shared_ptr> LessThan( - std::shared_ptr> expr, Literal value); + std::shared_ptr> expr, Literal value) { + ICEBERG_ASSIGN_OR_THROW( + auto pred, UnboundPredicateImpl::Make(Expression::Operation::kLt, + std::move(expr), std::move(value))); + return pred; + } /// \brief Create a less than or equal predicate for a field name. static std::shared_ptr> LessThanOrEqual( @@ -196,7 +220,12 @@ class ICEBERG_EXPORT Expressions { /// \brief Create a less than or equal predicate for an unbound term. template static std::shared_ptr> LessThanOrEqual( - std::shared_ptr> expr, Literal value); + std::shared_ptr> expr, Literal value) { + ICEBERG_ASSIGN_OR_THROW( + auto pred, UnboundPredicateImpl::Make(Expression::Operation::kLtEq, + std::move(expr), std::move(value))); + return pred; + } /// \brief Create a greater than predicate for a field name. static std::shared_ptr> GreaterThan( @@ -205,7 +234,12 @@ class ICEBERG_EXPORT Expressions { /// \brief Create a greater than predicate for an unbound term. template static std::shared_ptr> GreaterThan( - std::shared_ptr> expr, Literal value); + std::shared_ptr> expr, Literal value) { + ICEBERG_ASSIGN_OR_THROW( + auto pred, UnboundPredicateImpl::Make(Expression::Operation::kGt, + std::move(expr), std::move(value))); + return pred; + } /// \brief Create a greater than or equal predicate for a field name. static std::shared_ptr> GreaterThanOrEqual( @@ -214,7 +248,12 @@ class ICEBERG_EXPORT Expressions { /// \brief Create a greater than or equal predicate for an unbound term. template static std::shared_ptr> GreaterThanOrEqual( - std::shared_ptr> expr, Literal value); + std::shared_ptr> expr, Literal value) { + ICEBERG_ASSIGN_OR_THROW( + auto pred, UnboundPredicateImpl::Make(Expression::Operation::kGtEq, + std::move(expr), std::move(value))); + return pred; + } /// \brief Create an equal predicate for a field name. 
static std::shared_ptr> Equal(std::string name, @@ -223,7 +262,12 @@ class ICEBERG_EXPORT Expressions { /// \brief Create an equal predicate for an unbound term. template static std::shared_ptr> Equal( - std::shared_ptr> expr, Literal value); + std::shared_ptr> expr, Literal value) { + ICEBERG_ASSIGN_OR_THROW( + auto pred, UnboundPredicateImpl::Make(Expression::Operation::kEq, + std::move(expr), std::move(value))); + return pred; + } /// \brief Create a not equal predicate for a field name. static std::shared_ptr> NotEqual(std::string name, @@ -232,7 +276,12 @@ class ICEBERG_EXPORT Expressions { /// \brief Create a not equal predicate for an unbound term. template static std::shared_ptr> NotEqual( - std::shared_ptr> expr, Literal value); + std::shared_ptr> expr, Literal value) { + ICEBERG_ASSIGN_OR_THROW( + auto pred, UnboundPredicateImpl::Make(Expression::Operation::kNotEq, + std::move(expr), std::move(value))); + return pred; + } // String predicates @@ -243,7 +292,13 @@ class ICEBERG_EXPORT Expressions { /// \brief Create a starts with predicate for an unbound term. template static std::shared_ptr> StartsWith( - std::shared_ptr> expr, std::string value); + std::shared_ptr> expr, std::string value) { + ICEBERG_ASSIGN_OR_THROW( + auto pred, + UnboundPredicateImpl::Make(Expression::Operation::kStartsWith, std::move(expr), + Literal::String(std::move(value)))); + return pred; + } /// \brief Create a not starts with predicate for a field name. static std::shared_ptr> NotStartsWith( @@ -252,7 +307,13 @@ class ICEBERG_EXPORT Expressions { /// \brief Create a not starts with predicate for an unbound term. 
template static std::shared_ptr> NotStartsWith( - std::shared_ptr> expr, std::string value); + std::shared_ptr> expr, std::string value) { + ICEBERG_ASSIGN_OR_THROW( + auto pred, UnboundPredicateImpl::Make(Expression::Operation::kNotStartsWith, + std::move(expr), + Literal::String(std::move(value)))); + return pred; + } // Set predicates @@ -263,7 +324,12 @@ class ICEBERG_EXPORT Expressions { /// \brief Create an IN predicate for an unbound term. template static std::shared_ptr> In(std::shared_ptr> expr, - std::vector values); + std::vector values) { + ICEBERG_ASSIGN_OR_THROW( + auto pred, UnboundPredicateImpl::Make(Expression::Operation::kIn, + std::move(expr), std::move(values))); + return pred; + } /// \brief Create an IN predicate for a field name with initializer list. static std::shared_ptr> In( @@ -272,7 +338,9 @@ class ICEBERG_EXPORT Expressions { /// \brief Create an IN predicate for an unbound term with initializer list. template static std::shared_ptr> In( - std::shared_ptr> expr, std::initializer_list values); + std::shared_ptr> expr, std::initializer_list values) { + return In(std::move(expr), std::vector(values)); + } /// \brief Create a NOT IN predicate for a field name. static std::shared_ptr> NotIn( @@ -281,7 +349,12 @@ class ICEBERG_EXPORT Expressions { /// \brief Create a NOT IN predicate for an unbound term. template static std::shared_ptr> NotIn( - std::shared_ptr> expr, std::vector values); + std::shared_ptr> expr, std::vector values) { + ICEBERG_ASSIGN_OR_THROW( + auto pred, UnboundPredicateImpl::Make(Expression::Operation::kNotIn, + std::move(expr), std::move(values))); + return pred; + } /// \brief Create a NOT IN predicate for a field name with initializer list. static std::shared_ptr> NotIn( @@ -290,7 +363,9 @@ class ICEBERG_EXPORT Expressions { /// \brief Create a NOT IN predicate for an unbound term with initializer list. 
template static std::shared_ptr> NotIn( - std::shared_ptr> expr, std::initializer_list values); + std::shared_ptr> expr, std::initializer_list values) { + return NotIn(std::move(expr), std::vector(values)); + } // Generic predicate factory @@ -314,18 +389,28 @@ class ICEBERG_EXPORT Expressions { template static std::shared_ptr> Predicate( Expression::Operation op, std::shared_ptr> expr, - std::vector values); + std::vector values) { + ICEBERG_ASSIGN_OR_THROW( + auto pred, UnboundPredicateImpl::Make(op, std::move(expr), std::move(values))); + return pred; + } /// \brief Create a predicate with operation and multiple values. template static std::shared_ptr> Predicate( Expression::Operation op, std::shared_ptr> expr, - std::initializer_list values); + std::initializer_list values) { + return Predicate(op, std::move(expr), std::vector(values)); + } /// \brief Create a unary predicate for unbound term. template static std::shared_ptr> Predicate( - Expression::Operation op, std::shared_ptr> expr); + Expression::Operation op, std::shared_ptr> expr) { + ICEBERG_ASSIGN_OR_THROW(auto pred, + UnboundPredicateImpl::Make(op, std::move(expr))); + return pred; + } // Constants diff --git a/src/iceberg/expression/inclusive_metrics_evaluator.cc b/src/iceberg/expression/inclusive_metrics_evaluator.cc new file mode 100644 index 000000000..29f5aba24 --- /dev/null +++ b/src/iceberg/expression/inclusive_metrics_evaluator.cc @@ -0,0 +1,521 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/expression/inclusive_metrics_evaluator.h" + +#include "iceberg/expression/binder.h" +#include "iceberg/expression/expression_visitor.h" +#include "iceberg/expression/rewrite_not.h" +#include "iceberg/manifest/manifest_entry.h" +#include "iceberg/schema.h" +#include "iceberg/transform.h" +#include "iceberg/util/macros.h" + +namespace iceberg { + +namespace { +constexpr bool kRowsMightMatch = true; +constexpr bool kRowCannotMatch = false; +constexpr int32_t kInPredicateLimit = 200; +} // namespace + +class InclusiveMetricsVisitor : public BoundVisitor { + public: + explicit InclusiveMetricsVisitor(const DataFile& data_file) : data_file_(data_file) {} + + Result AlwaysTrue() override { return kRowsMightMatch; } + + Result AlwaysFalse() override { return kRowCannotMatch; } + + Result Not(bool child_result) override { return !child_result; } + + Result And(bool left_result, bool right_result) override { + return left_result && right_result; + } + + Result Or(bool left_result, bool right_result) override { + return left_result || right_result; + } + + Result IsNull(const std::shared_ptr& expr) override { + // no need to check whether the field is required because binding evaluates that case + // if the column has no null values, the expression cannot match + if (IsNonNullPreserving(expr)) { + // number of non-nulls is the same as for the ref + int32_t id = expr->reference()->field().field_id(); + if (!MayContainNull(id)) { + return kRowCannotMatch; + } + } + return kRowsMightMatch; + } + + Result NotNull(const std::shared_ptr& 
expr) override { + // no need to check whether the field is required because binding evaluates that case + // if the column has no non-null values, the expression cannot match + + // all terms are null preserving. see #isNullPreserving(Bound) + int32_t id = expr->reference()->field().field_id(); + if (ContainsNullsOnly(id)) { + return kRowCannotMatch; + } + + return kRowsMightMatch; + } + + Result IsNaN(const std::shared_ptr& expr) override { + // when there's no nanCounts information, but we already know the column only contains + // null, it's guaranteed that there's no NaN value + int32_t id = expr->reference()->field().field_id(); + if (ContainsNullsOnly(id)) { + return kRowCannotMatch; + } + if (dynamic_cast(expr.get()) == nullptr) { + return kRowsMightMatch; + } + auto it = data_file_.nan_value_counts.find(id); + if (it != data_file_.nan_value_counts.end() && it->second == 0) { + return kRowCannotMatch; + } + return kRowsMightMatch; + } + + Result NotNaN(const std::shared_ptr& expr) override { + if (dynamic_cast(expr.get()) == nullptr) { + // identity transforms are already removed by this time + return kRowsMightMatch; + } + + int32_t id = expr->reference()->field().field_id(); + if (ContainsNaNsOnly(id)) { + return kRowCannotMatch; + } + + return kRowsMightMatch; + } + + Result Lt(const std::shared_ptr& expr, const Literal& lit) override { + // all terms are null preserving. see #isNullPreserving(Bound) + int32_t id = expr->reference()->field().field_id(); + if (ContainsNullsOnly(id) || ContainsNaNsOnly(id)) { + return kRowCannotMatch; + } + ICEBERG_ASSIGN_OR_RAISE(auto lower, LowerBound(expr)); + if (!lower.has_value() || lower->IsNull() || lower->IsNaN()) { + // NaN indicates unreliable bounds. See the InclusiveMetricsEvaluator docs for more. + return kRowsMightMatch; + } + + // this also works for transforms that are order preserving: + // if a transform f is order preserving, a < b means that f(a) <= f(b). 
+ // because lower <= a for all values of a in the file, f(lower) <= f(a). + // when f(lower) >= X then f(a) >= f(lower) >= X, so there is no a such that f(a) < X + // f(lower) >= X means rows cannot match + if (lower.value() >= lit) { + return kRowCannotMatch; + } + + return kRowsMightMatch; + } + + Result LtEq(const std::shared_ptr& expr, const Literal& lit) override { + // all terms are null preserving. see #isNullPreserving(Bound) + int32_t id = expr->reference()->field().field_id(); + if (ContainsNullsOnly(id) || ContainsNaNsOnly(id)) { + return kRowCannotMatch; + } + + ICEBERG_ASSIGN_OR_RAISE(auto lower, LowerBound(expr)); + if (!lower.has_value() || lower->IsNull() || lower->IsNaN()) { + // NaN indicates unreliable bounds. See the InclusiveMetricsEvaluator docs for more. + return kRowsMightMatch; + } + + // this also works for transforms that are order preserving: + // if a transform f is order preserving, a < b means that f(a) <= f(b). + // because lower <= a for all values of a in the file, f(lower) <= f(a). + // when f(lower) > X then f(a) >= f(lower) > X, so there is no a such that f(a) <= X + // f(lower) > X means rows cannot match + if (lower.value() > lit) { + return kRowCannotMatch; + } + + return kRowsMightMatch; + } + + Result Gt(const std::shared_ptr& expr, const Literal& lit) override { + // all terms are null preserving. see #isNullPreserving(Bound) + int32_t id = expr->reference()->field().field_id(); + if (ContainsNullsOnly(id) || ContainsNaNsOnly(id)) { + return kRowCannotMatch; + } + + ICEBERG_ASSIGN_OR_RAISE(auto upper, UpperBound(expr)); + if (!upper.has_value() || upper->IsNull()) { + return kRowsMightMatch; + } + if (upper.value() <= lit) { + return kRowCannotMatch; + } + + return kRowsMightMatch; + } + + Result GtEq(const std::shared_ptr& expr, const Literal& lit) override { + // all terms are null preserving. 
see #isNullPreserving(Bound) + int32_t id = expr->reference()->field().field_id(); + if (ContainsNullsOnly(id) || ContainsNaNsOnly(id)) { + return kRowCannotMatch; + } + + ICEBERG_ASSIGN_OR_RAISE(auto upper, UpperBound(expr)); + if (!upper.has_value() || upper->IsNull()) { + return kRowsMightMatch; + } + if (upper.value() < lit) { + return kRowCannotMatch; + } + + return kRowsMightMatch; + } + + Result Eq(const std::shared_ptr& expr, const Literal& lit) override { + // all terms are null preserving. see #isNullPreserving(Bound) + int32_t id = expr->reference()->field().field_id(); + if (ContainsNullsOnly(id) || ContainsNaNsOnly(id)) { + return kRowCannotMatch; + } + + ICEBERG_ASSIGN_OR_RAISE(auto lower, LowerBound(expr)); + if (lower.has_value() && !lower->IsNull() && !lower->IsNaN()) { + if (lower.value() > lit) { + return kRowCannotMatch; + } + } + + ICEBERG_ASSIGN_OR_RAISE(auto upper, UpperBound(expr)); + if (!upper.has_value() || upper->IsNull()) { + return kRowsMightMatch; + } + if (upper.value() < lit) { + return kRowCannotMatch; + } + + return kRowsMightMatch; + } + + Result NotEq(const std::shared_ptr& expr, const Literal& lit) override { + // because the bounds are not necessarily a min or max value, this cannot be answered + // using them. notEq(col, X) with (X, Y) doesn't guarantee that X is a value in col. + return kRowsMightMatch; + } + + Result In(const std::shared_ptr& expr, + const BoundSetPredicate::LiteralSet& literal_set) override { + // all terms are null preserving. see #isNullPreserving(Bound) + int32_t id = expr->reference()->field().field_id(); + if (ContainsNullsOnly(id) || ContainsNaNsOnly(id)) { + return kRowCannotMatch; + } + + if (literal_set.size() > kInPredicateLimit) { + // skip evaluating the predicate if the number of values is too big + return kRowsMightMatch; + } + + ICEBERG_ASSIGN_OR_RAISE(auto lower, LowerBound(expr)); + if (!lower.has_value() || lower->IsNull() || lower->IsNaN()) { + // NaN indicates unreliable bounds. 
See the InclusiveMetricsEvaluator docs for more. + return kRowsMightMatch; + } + auto literals_view = literal_set | std::views::filter([&](const Literal& lit) { + return lower.value() <= lit; + }); + // if all values are less than lower bound, rows cannot match + if (literals_view.empty()) { + return kRowCannotMatch; + } + + ICEBERG_ASSIGN_OR_RAISE(auto upper, UpperBound(expr)); + if (!upper.has_value() || upper->IsNull()) { + return kRowsMightMatch; + } + auto filtered_view = literals_view | std::views::filter([&](const Literal& lit) { + return upper.value() >= lit; + }); + // if remaining values are greater than upper bound, rows cannot match + if (filtered_view.empty()) { + return kRowCannotMatch; + } + + return kRowsMightMatch; + } + + Result NotIn(const std::shared_ptr& expr, + const BoundSetPredicate::LiteralSet& literal_set) override { + // because the bounds are not necessarily a min or max value, this cannot be answered + // using them. notIn(col, {X, ...}) with (X, Y) doesn't guarantee that X is a value in + // col. + return kRowsMightMatch; + } + + Result StartsWith(const std::shared_ptr& expr, + const Literal& lit) override { + if (auto transform = dynamic_cast(expr.get()); + transform != nullptr && + transform->transform()->transform_type() != TransformType::kIdentity) { + // truncate must be rewritten in binding. 
the result is either always or never + // compatible + return kRowsMightMatch; + } + + int32_t id = expr->reference()->field().field_id(); + if (ContainsNullsOnly(id)) { + return kRowCannotMatch; + } + if (lit.type()->type_id() != TypeId::kString) { + return kRowCannotMatch; + } + const auto& prefix = std::get(lit.value()); + + ICEBERG_ASSIGN_OR_RAISE(auto lower, LowerBound(expr)); + if (!lower.has_value() || lower->IsNull()) { + return kRowsMightMatch; + } + const auto& lower_str = std::get(lower->value()); + // truncate lower bound so that its length in bytes is not greater than the length of + // prefix + size_t length = std::min(prefix.size(), lower_str.size()); + // if prefix of lower bound is greater than prefix, rows cannot match + if (lower_str.substr(0, length) > prefix) { + return kRowCannotMatch; + } + + ICEBERG_ASSIGN_OR_RAISE(auto upper, UpperBound(expr)); + if (!upper.has_value() || upper->IsNull()) { + return kRowsMightMatch; + } + const auto& upper_str = std::get(upper->value()); + // truncate upper bound so that its length in bytes is not greater than the length of + // prefix + length = std::min(prefix.size(), upper_str.size()); + // if prefix of upper bound is less than prefix, rows cannot match + if (upper_str.substr(0, length) < prefix) { + return kRowCannotMatch; + } + + return kRowsMightMatch; + } + + Result NotStartsWith(const std::shared_ptr& expr, + const Literal& lit) override { + // the only transforms that produce strings are truncate and identity, which work with + // this + int32_t id = expr->reference()->field().field_id(); + if (MayContainNull(id)) { + return kRowsMightMatch; + } + + if (lit.type()->type_id() != TypeId::kString) { + return kRowCannotMatch; + } + const auto& prefix = std::get(lit.value()); + + // notStartsWith will match unless all values must start with the prefix. This happens + // when the lower and upper bounds both start with the prefix. 
+ ICEBERG_ASSIGN_OR_RAISE(auto lower, LowerBound(expr)); + ICEBERG_ASSIGN_OR_RAISE(auto upper, UpperBound(expr)); + if (!lower.has_value() || lower->IsNull() || !upper.has_value() || upper->IsNull()) { + return kRowsMightMatch; + } + const auto& lower_str = std::get(lower->value()); + const auto& upper_str = std::get(upper->value()); + + // if lower is shorter than the prefix then lower doesn't start with the prefix + if (lower_str.size() < prefix.size()) { + return kRowsMightMatch; + } + + if (lower_str.starts_with(prefix)) { + // if upper is shorter than the prefix then upper can't start with the prefix + if (upper_str.size() < prefix.size()) { + return kRowsMightMatch; + } + if (upper_str.starts_with(prefix)) { + // both bounds match the prefix, so all rows must match the prefix and therefore + // do not satisfy the predicate + return kRowCannotMatch; + } + } + + return kRowsMightMatch; + } + + private: + bool MayContainNull(int32_t id) { + return data_file_.null_value_counts.empty() || + !data_file_.null_value_counts.contains(id) || + data_file_.null_value_counts.at(id) != 0; + } + + bool ContainsNullsOnly(int32_t id) { + auto val_it = data_file_.value_counts.find(id); + auto null_it = data_file_.null_value_counts.find(id); + return val_it != data_file_.value_counts.cend() && + null_it != data_file_.null_value_counts.cend() && + val_it->second == null_it->second; + } + + bool ContainsNaNsOnly(int32_t id) { + auto val_it = data_file_.value_counts.find(id); + auto nan_it = data_file_.nan_value_counts.find(id); + return val_it != data_file_.value_counts.cend() && + nan_it != data_file_.nan_value_counts.cend() && + val_it->second == nan_it->second; + } + + Result> LowerBound(const std::shared_ptr& expr) { + if (auto reference = dynamic_cast(expr.get()); + reference != nullptr) { + return ParseLowerBound(*reference); + } else if (auto transform = dynamic_cast(expr.get()); + transform != nullptr) { + return TransformLowerBound(*transform); + } else { + return 
std::nullopt; + } + // TODO(xiao.dong) handle extract lower and upper bounds + } + + Result> UpperBound(const std::shared_ptr& expr) { + if (auto reference = dynamic_cast(expr.get()); + reference != nullptr) { + return ParseUpperBound(*reference); + } else if (auto transform = dynamic_cast(expr.get()); + transform != nullptr) { + return TransformUpperBound(*transform); + } else { + return std::nullopt; + } + // TODO(xiao.dong) handle extract lower and upper bounds + } + + Result> ParseLowerBound(const BoundReference& ref) { + int32_t id = ref.field().field_id(); + auto type = ref.type(); + if (!type->is_primitive()) { + return NotSupported("Lower bound of non-primitive type is not supported."); + } + auto primitive_type = internal::checked_pointer_cast(type); + if (data_file_.lower_bounds.contains(id)) { + return Literal::Deserialize(data_file_.lower_bounds.at(id), primitive_type); + } + + return std::nullopt; + } + + Result> ParseUpperBound(const BoundReference& ref) { + int32_t id = ref.field().field_id(); + auto type = ref.type(); + if (!type->is_primitive()) { + return NotSupported("Upper bound of non-primitive type is not supported."); + } + auto primitive_type = internal::checked_pointer_cast(type); + if (data_file_.upper_bounds.contains(id)) { + return Literal::Deserialize(data_file_.upper_bounds.at(id), primitive_type); + } + + return std::nullopt; + } + + Result> TransformLowerBound(BoundTransform& boundTransform) { + auto transform = boundTransform.transform(); + if (transform->PreservesOrder()) { + ICEBERG_ASSIGN_OR_RAISE(auto lower, ParseLowerBound(*boundTransform.reference())); + if (lower.has_value()) { + ICEBERG_ASSIGN_OR_RAISE(auto transform_func, + transform->Bind(boundTransform.reference()->type())); + return transform_func->Transform(lower.value()); + } + } + + return std::nullopt; + } + + Result> TransformUpperBound(BoundTransform& boundTransform) { + auto transform = boundTransform.transform(); + if (transform->PreservesOrder()) { + 
ICEBERG_ASSIGN_OR_RAISE(auto upper, ParseUpperBound(*boundTransform.reference())); + if (upper.has_value()) { + ICEBERG_ASSIGN_OR_RAISE(auto transform_func, + transform->Bind(boundTransform.reference()->type())); + return transform_func->Transform(upper.value()); + } + } + + return std::nullopt; + } + + /** Returns true if the expression term produces a non-null value for non-null input. */ + bool IsNonNullPreserving(const std::shared_ptr& expr) { + if (auto reference = dynamic_cast(expr.get()); + reference != nullptr) { + return true; + } else if (auto transform = dynamic_cast(expr.get()); + transform != nullptr) { + return transform->transform()->PreservesOrder(); + } + // a non-null variant does not necessarily contain a specific field + // and unknown bound terms are not non-null preserving + return false; + } + + private: + const DataFile& data_file_; +}; + +InclusiveMetricsEvaluator::InclusiveMetricsEvaluator(std::shared_ptr expr) + : expr_(std::move(expr)) {} + +InclusiveMetricsEvaluator::~InclusiveMetricsEvaluator() = default; + +Result> InclusiveMetricsEvaluator::Make( + std::shared_ptr expr, const Schema& schema, bool case_sensitive) { + ICEBERG_ASSIGN_OR_RAISE(auto rewrite_expr, RewriteNot::Visit(std::move(expr))); + ICEBERG_ASSIGN_OR_RAISE(auto bound_expr, + Binder::Bind(schema, rewrite_expr, case_sensitive)); + return std::unique_ptr( + new InclusiveMetricsEvaluator(std::move(bound_expr))); +} + +Result InclusiveMetricsEvaluator::Evaluate(const DataFile& data_file) const { + if (data_file.record_count == 0) { + return kRowCannotMatch; + } + if (data_file.record_count < 0) { + // we haven't implemented parsing record count from avro file and thus set record + // count -1 when importing avro tables to iceberg tables. This should be updated once + // we implemented and set correct record count. 
+ return kRowsMightMatch; + } + InclusiveMetricsVisitor visitor(data_file); + return Visit(expr_, visitor); +} + +} // namespace iceberg diff --git a/src/iceberg/expression/inclusive_metrics_evaluator.h b/src/iceberg/expression/inclusive_metrics_evaluator.h new file mode 100644 index 000000000..1887b3399 --- /dev/null +++ b/src/iceberg/expression/inclusive_metrics_evaluator.h @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +/// \file iceberg/expression/inclusive_metrics_evaluator.h +/// +/// Evaluates an Expression on a DataFile to test whether rows in the file may match. +/// +/// This evaluation is inclusive: it returns true if a file may match and false if it +/// cannot match. +/// +/// Files are passed to Evaluate(const DataFile&), which returns true if the file may contain +/// matching rows and false if the file cannot contain matching rows. Files may be skipped +/// if and only if the return value of Evaluate is false. 
+/// +/// Due to the comparison implementation of ORC stats, for float/double columns in ORC +/// files, if the first value in a file is NaN, metrics of this file will report NaN for +/// both upper and lower bound even though the column could contain non-NaN data. Thus in +/// some scenarios an explicit check for NaN is necessary in order to not skip files that +/// may contain matching data. +/// + +#include + +#include "iceberg/expression/expression.h" +#include "iceberg/iceberg_export.h" +#include "iceberg/result.h" +#include "iceberg/type_fwd.h" + +namespace iceberg { + +class ICEBERG_EXPORT InclusiveMetricsEvaluator { + public: + /// \brief Make an inclusive metrics evaluator + /// + /// \param expr The expression to evaluate + /// \param schema The schema of the table + /// \param case_sensitive Whether field name matching is case-sensitive + static Result> Make( + std::shared_ptr expr, const Schema& schema, bool case_sensitive = true); + + ~InclusiveMetricsEvaluator(); + + /// \brief Evaluate the expression against a DataFile. 
+ /// + /// \param data_file The data file to evaluate + /// \return true if the file may contain matching rows, false if it cannot, or an error + Result Evaluate(const DataFile& data_file) const; + + private: + explicit InclusiveMetricsEvaluator(std::shared_ptr expr); + + std::shared_ptr expr_; +}; + +} // namespace iceberg diff --git a/src/iceberg/expression/literal.cc b/src/iceberg/expression/literal.cc index c1aad90df..cb0a4c6d0 100644 --- a/src/iceberg/expression/literal.cc +++ b/src/iceberg/expression/literal.cc @@ -343,12 +343,32 @@ std::strong_ordering CompareFloat(T lhs, T rhs) { return lhs_is_negative <=> rhs_is_negative; } +namespace { + +bool Comparable(TypeId lhs, TypeId rhs) { + switch (lhs) { + case TypeId::kInt: + case TypeId::kDate: + return rhs == TypeId::kInt || rhs == TypeId::kDate; + case TypeId::kLong: + case TypeId::kTimestamp: + case TypeId::kTimestampTz: + return rhs == TypeId::kLong || rhs == TypeId::kTimestamp || + rhs == TypeId::kTimestampTz; + default: + return lhs == rhs; + } +} + +} // namespace + bool Literal::operator==(const Literal& other) const { return (*this <=> other) == 0; } // Three-way comparison operator std::partial_ordering Literal::operator<=>(const Literal& other) const { // If types are different, comparison is unordered - if (type_->type_id() != other.type_->type_id()) { + // (Int & Date) (Timestamp & Long) were excluded from this check to allow comparison + if (!Comparable(type_->type_id(), other.type_->type_id())) { return std::partial_ordering::unordered; } diff --git a/src/iceberg/meson.build b/src/iceberg/meson.build index d52739be9..053971798 100644 --- a/src/iceberg/meson.build +++ b/src/iceberg/meson.build @@ -47,6 +47,7 @@ iceberg_sources = files( 'expression/evaluator.cc', 'expression/expression.cc', 'expression/expressions.cc', + 'expression/inclusive_metrics_evaluator.cc', 'expression/literal.cc', 'expression/predicate.cc', 'expression/rewrite_not.cc', diff --git a/src/iceberg/test/CMakeLists.txt 
b/src/iceberg/test/CMakeLists.txt index 21ccd4d66..08edbb209 100644 --- a/src/iceberg/test/CMakeLists.txt +++ b/src/iceberg/test/CMakeLists.txt @@ -95,6 +95,8 @@ add_iceberg_test(expression_test expression_test.cc expression_visitor_test.cc literal_test.cc + inclusive_metrics_evaluator_test.cc + inclusive_metrics_evaluator_with_transform_test.cc predicate_test.cc) add_iceberg_test(json_serde_test diff --git a/src/iceberg/test/inclusive_metrics_evaluator_test.cc b/src/iceberg/test/inclusive_metrics_evaluator_test.cc new file mode 100644 index 000000000..27867f1a4 --- /dev/null +++ b/src/iceberg/test/inclusive_metrics_evaluator_test.cc @@ -0,0 +1,948 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include "iceberg/expression/inclusive_metrics_evaluator.h" + +#include + +#include "iceberg/expression/binder.h" +#include "iceberg/expression/expressions.h" +#include "iceberg/manifest/manifest_entry.h" +#include "iceberg/schema.h" +#include "iceberg/test/matchers.h" +#include "iceberg/type.h" +#include "iceberg/util/truncate_util.h" + +namespace iceberg { + +namespace { +constexpr bool kRowsMightMatch = true; +constexpr bool kRowCannotMatch = false; +constexpr int64_t kIntMinValue = 30; +constexpr int64_t kIntMaxValue = 79; +constexpr float kFloatNan = std::numeric_limits::quiet_NaN(); +constexpr double kDoubleNan = std::numeric_limits::quiet_NaN(); +} // namespace +using TestVariant = std::variant; + +class InclusiveMetricsEvaluatorTest : public ::testing::Test { + protected: + Result> Bind(const std::shared_ptr& expr, + bool case_sensitive = true) { + return Binder::Bind(*schema_, expr, case_sensitive); + } + + void SetUp() override { + schema_ = std::make_shared( + std::vector{ + SchemaField::MakeRequired(1, "id", int64()), + SchemaField::MakeOptional(2, "name", string()), + SchemaField::MakeRequired(3, "age", int32()), + SchemaField::MakeOptional(4, "salary", float64()), + SchemaField::MakeRequired(5, "active", boolean()), + SchemaField::MakeRequired(6, "date", string()), + }, + 0); + } + + std::shared_ptr PrepareDataFile( + const std::string& partition, int64_t record_count, int64_t file_size_in_bytes, + const std::map& lower_bounds, + const std::map& upper_bounds, + const std::map& value_counts = {}, + const std::map& null_counts = {}, + const std::map& nan_counts = {}) { + auto parse_bound = [&](const std::map& bounds, + std::map>& bound_values) { + for (const auto& [key, value] : bounds) { + if (key == "id") { + bound_values[1] = Literal::Long(std::get(value)).Serialize().value(); + } else if (key == "name") { + bound_values[2] = + Literal::String(std::get(value)).Serialize().value(); + } else if (key == "age") { + bound_values[3] = 
Literal::Int(std::get(value)).Serialize().value(); + } else if (key == "salary") { + bound_values[4] = Literal::Double(std::get(value)).Serialize().value(); + } else if (key == "active") { + bound_values[5] = Literal::Boolean(std::get(value)).Serialize().value(); + } + } + }; + + auto data_file = std::make_shared(); + data_file->file_path = "test_path"; + data_file->file_format = FileFormatType::kParquet; + data_file->partition.AddValue(Literal::String(partition)); + data_file->record_count = record_count; + data_file->file_size_in_bytes = file_size_in_bytes; + data_file->column_sizes = {}; + data_file->value_counts = value_counts; + data_file->null_value_counts = null_counts; + data_file->nan_value_counts = nan_counts; + data_file->split_offsets = {1}; + data_file->sort_order_id = 0; + parse_bound(upper_bounds, data_file->upper_bounds); + parse_bound(lower_bounds, data_file->lower_bounds); + return data_file; + } + + void TestCase(const std::shared_ptr& unbound, bool expected_result) { + ICEBERG_UNWRAP_OR_FAIL(auto evaluator, + InclusiveMetricsEvaluator::Make(unbound, *schema_, true)); + auto file = PrepareDataFile("20251128", 10, 1024, {{"id", static_cast(100)}}, + {{"id", static_cast(200)}}); + auto result = evaluator->Evaluate(*file); + ASSERT_TRUE(result.has_value()); + ASSERT_EQ(result.value(), expected_result) << unbound->ToString(); + } + + void TestStringCase(const std::shared_ptr& unbound, bool expected_result) { + ICEBERG_UNWRAP_OR_FAIL(auto evaluator, + InclusiveMetricsEvaluator::Make(unbound, *schema_, true)); + auto file = PrepareDataFile("20251128", 10, 1024, {{"name", "123"}}, + {{"name", "456"}}, {{2, 10}}, {{2, 0}}); + auto result = evaluator->Evaluate(*file); + ASSERT_TRUE(result.has_value()); + ASSERT_EQ(result.value(), expected_result) << unbound->ToString(); + } + + protected: + std::shared_ptr schema_; +}; + +TEST_F(InclusiveMetricsEvaluatorTest, CaseSensitiveTest) { + { + auto unbound = Expressions::Equal("id", Literal::Long(300)); + auto 
evaluator = InclusiveMetricsEvaluator::Make(unbound, *schema_, true); + ASSERT_TRUE(evaluator.has_value()); + } + { + auto unbound = Expressions::Equal("ID", Literal::Long(300)); + auto evaluator = InclusiveMetricsEvaluator::Make(unbound, *schema_, true); + ASSERT_FALSE(evaluator.has_value()); + ASSERT_EQ(evaluator.error().kind, ErrorKind::kInvalidExpression); + } + { + auto unbound = Expressions::Equal("ID", Literal::Long(300)); + auto evaluator = InclusiveMetricsEvaluator::Make(unbound, *schema_, false); + ASSERT_TRUE(evaluator.has_value()); + } +} + +TEST_F(InclusiveMetricsEvaluatorTest, IsNullTest) { + { + auto unbound = Expressions::IsNull("name"); + ICEBERG_UNWRAP_OR_FAIL(auto evaluator, + InclusiveMetricsEvaluator::Make(unbound, *schema_, true)); + auto file = PrepareDataFile("20251128", 10, 1024, {{"name", "1"}}, {{"name", "2"}}, + {{2, 10}}, {{2, 5}}, {}); + auto result = evaluator->Evaluate(*file); + ASSERT_TRUE(result.has_value()); + ASSERT_EQ(result.value(), kRowsMightMatch) << unbound->ToString(); + } + { + auto unbound = Expressions::IsNull("name"); + ICEBERG_UNWRAP_OR_FAIL(auto evaluator, + InclusiveMetricsEvaluator::Make(unbound, *schema_, true)); + auto file = PrepareDataFile("20251128", 10, 1024, {{"name", "1"}}, {{"name", "2"}}, + {{2, 10}}, {{2, 0}}, {}); + auto result = evaluator->Evaluate(*file); + ASSERT_TRUE(result.has_value()); + ASSERT_EQ(result.value(), kRowCannotMatch) << unbound->ToString(); + } +} + +TEST_F(InclusiveMetricsEvaluatorTest, NotNullTest) { + { + auto unbound = Expressions::NotNull("name"); + ICEBERG_UNWRAP_OR_FAIL(auto evaluator, + InclusiveMetricsEvaluator::Make(unbound, *schema_, true)); + auto file = PrepareDataFile("20251128", 10, 1024, {{"name", "1"}}, {{"name", "2"}}, + {{2, 10}}, {{2, 5}}, {}); + auto result = evaluator->Evaluate(*file); + ASSERT_TRUE(result.has_value()); + ASSERT_EQ(result.value(), kRowsMightMatch) << unbound->ToString(); + } + { + auto unbound = Expressions::NotNull("name"); + 
ICEBERG_UNWRAP_OR_FAIL(auto evaluator, + InclusiveMetricsEvaluator::Make(unbound, *schema_, true)); + auto file = PrepareDataFile("20251128", 10, 1024, {{"name", "1"}}, {{"name", "2"}}, + {{2, 10}}, {{2, 10}}, {}); + auto result = evaluator->Evaluate(*file); + ASSERT_TRUE(result.has_value()); + ASSERT_EQ(result.value(), kRowCannotMatch) << unbound->ToString(); + } +} + +TEST_F(InclusiveMetricsEvaluatorTest, IsNanTest) { + { + auto unbound = Expressions::IsNaN("salary"); + ICEBERG_UNWRAP_OR_FAIL(auto evaluator, + InclusiveMetricsEvaluator::Make(unbound, *schema_, true)); + auto file = PrepareDataFile("20251128", 10, 1024, {{"salary", 1.0}}, + {{"salary", 2.0}}, {{4, 10}}, {{4, 5}}, {{4, 5}}); + auto result = evaluator->Evaluate(*file); + ASSERT_TRUE(result.has_value()); + ASSERT_EQ(result.value(), kRowsMightMatch) << unbound->ToString(); + } + { + auto unbound = Expressions::IsNaN("salary"); + ICEBERG_UNWRAP_OR_FAIL(auto evaluator, + InclusiveMetricsEvaluator::Make(unbound, *schema_, true)); + auto file = PrepareDataFile("20251128", 10, 1024, {{"salary", 1.0}}, + {{"salary", 2.0}}, {{4, 10}}, {{4, 10}}, {{4, 5}}); + auto result = evaluator->Evaluate(*file); + ASSERT_TRUE(result.has_value()); + ASSERT_EQ(result.value(), kRowCannotMatch) << unbound->ToString(); + } + { + auto unbound = Expressions::IsNaN("salary"); + ICEBERG_UNWRAP_OR_FAIL(auto evaluator, + InclusiveMetricsEvaluator::Make(unbound, *schema_, true)); + auto file = PrepareDataFile("20251128", 10, 1024, {{"salary", 1.0}}, + {{"salary", 2.0}}, {{4, 10}}, {{4, 5}}, {{4, 0}}); + auto result = evaluator->Evaluate(*file); + ASSERT_TRUE(result.has_value()); + ASSERT_EQ(result.value(), kRowCannotMatch) << unbound->ToString(); + } +} + +TEST_F(InclusiveMetricsEvaluatorTest, NotNanTest) { + { + auto unbound = Expressions::NotNaN("salary"); + ICEBERG_UNWRAP_OR_FAIL(auto evaluator, + InclusiveMetricsEvaluator::Make(unbound, *schema_, true)); + auto file = PrepareDataFile("20251128", 10, 1024, {{"salary", 1.0}}, + 
{{"salary", 2.0}}, {{4, 10}}, {}, {{4, 5}}); + auto result = evaluator->Evaluate(*file); + ASSERT_TRUE(result.has_value()); + ASSERT_EQ(result.value(), kRowsMightMatch) << unbound->ToString(); + } + { + auto unbound = Expressions::NotNaN("salary"); + ICEBERG_UNWRAP_OR_FAIL(auto evaluator, + InclusiveMetricsEvaluator::Make(unbound, *schema_, true)); + auto file = PrepareDataFile("20251128", 10, 1024, {{"salary", 1.0}}, + {{"salary", 2.0}}, {{4, 10}}, {}, {{4, 10}}); + auto result = evaluator->Evaluate(*file); + ASSERT_TRUE(result.has_value()); + ASSERT_EQ(result.value(), kRowCannotMatch) << unbound->ToString(); + } +} + +TEST_F(InclusiveMetricsEvaluatorTest, LTTest) { + TestCase(Expressions::LessThan("id", Literal::Long(300)), kRowsMightMatch); + TestCase(Expressions::LessThan("id", Literal::Long(150)), kRowsMightMatch); + TestCase(Expressions::LessThan("id", Literal::Long(100)), kRowCannotMatch); + TestCase(Expressions::LessThan("id", Literal::Long(200)), kRowsMightMatch); + TestCase(Expressions::LessThan("id", Literal::Long(99)), kRowCannotMatch); +} + +TEST_F(InclusiveMetricsEvaluatorTest, LTEQTest) { + TestCase(Expressions::LessThanOrEqual("id", Literal::Long(300)), kRowsMightMatch); + TestCase(Expressions::LessThanOrEqual("id", Literal::Long(150)), kRowsMightMatch); + TestCase(Expressions::LessThanOrEqual("id", Literal::Long(100)), kRowsMightMatch); + TestCase(Expressions::LessThanOrEqual("id", Literal::Long(200)), kRowsMightMatch); + TestCase(Expressions::LessThanOrEqual("id", Literal::Long(99)), kRowCannotMatch); +} + +TEST_F(InclusiveMetricsEvaluatorTest, GTTest) { + TestCase(Expressions::GreaterThan("id", Literal::Long(300)), kRowCannotMatch); + TestCase(Expressions::GreaterThan("id", Literal::Long(150)), kRowsMightMatch); + TestCase(Expressions::GreaterThan("id", Literal::Long(100)), kRowsMightMatch); + TestCase(Expressions::GreaterThan("id", Literal::Long(200)), kRowCannotMatch); + TestCase(Expressions::GreaterThan("id", Literal::Long(99)), 
kRowsMightMatch); +} + +TEST_F(InclusiveMetricsEvaluatorTest, GTEQTest) { + TestCase(Expressions::GreaterThanOrEqual("id", Literal::Long(300)), kRowCannotMatch); + TestCase(Expressions::GreaterThanOrEqual("id", Literal::Long(150)), kRowsMightMatch); + TestCase(Expressions::GreaterThanOrEqual("id", Literal::Long(100)), kRowsMightMatch); + TestCase(Expressions::GreaterThanOrEqual("id", Literal::Long(200)), kRowsMightMatch); + TestCase(Expressions::GreaterThanOrEqual("id", Literal::Long(99)), kRowsMightMatch); +} + +TEST_F(InclusiveMetricsEvaluatorTest, EQTest) { + TestCase(Expressions::Equal("id", Literal::Long(300)), kRowCannotMatch); + TestCase(Expressions::Equal("id", Literal::Long(150)), kRowsMightMatch); + TestCase(Expressions::Equal("id", Literal::Long(100)), kRowsMightMatch); + TestCase(Expressions::Equal("id", Literal::Long(200)), kRowsMightMatch); +} + +TEST_F(InclusiveMetricsEvaluatorTest, NotEqTest) { + TestCase(Expressions::NotEqual("id", Literal::Long(300)), kRowsMightMatch); + TestCase(Expressions::NotEqual("id", Literal::Long(150)), kRowsMightMatch); + TestCase(Expressions::NotEqual("id", Literal::Long(100)), kRowsMightMatch); + TestCase(Expressions::NotEqual("id", Literal::Long(200)), kRowsMightMatch); +} + +TEST_F(InclusiveMetricsEvaluatorTest, InTest) { + TestCase(Expressions::In("id", + { + Literal::Long(300), + Literal::Long(400), + Literal::Long(500), + }), + kRowCannotMatch); + TestCase(Expressions::In("id", + { + Literal::Long(150), + Literal::Long(300), + }), + kRowsMightMatch); + TestCase(Expressions::In("id", {Literal::Long(100)}), kRowsMightMatch); + TestCase(Expressions::In("id", {Literal::Long(200)}), kRowsMightMatch); + TestCase(Expressions::In("id", + { + Literal::Long(99), + Literal::Long(201), + }), + kRowCannotMatch); +} + +TEST_F(InclusiveMetricsEvaluatorTest, NotInTest) { + TestCase(Expressions::NotIn("id", + { + Literal::Long(300), + Literal::Long(400), + Literal::Long(500), + }), + kRowsMightMatch); + 
TestCase(Expressions::NotIn("id", + { + Literal::Long(150), + Literal::Long(300), + }), + kRowsMightMatch); + TestCase(Expressions::NotIn("id", + { + Literal::Long(100), + Literal::Long(200), + }), + kRowsMightMatch); + TestCase(Expressions::NotIn("id", + { + Literal::Long(99), + Literal::Long(201), + }), + kRowsMightMatch); +} + +TEST_F(InclusiveMetricsEvaluatorTest, StartsWithTest) { + TestStringCase(Expressions::StartsWith("name", "1"), kRowsMightMatch); + TestStringCase(Expressions::StartsWith("name", "4"), kRowsMightMatch); + TestStringCase(Expressions::StartsWith("name", "12"), kRowsMightMatch); + TestStringCase(Expressions::StartsWith("name", "45"), kRowsMightMatch); + TestStringCase(Expressions::StartsWith("name", "123"), kRowsMightMatch); + TestStringCase(Expressions::StartsWith("name", "456"), kRowsMightMatch); + TestStringCase(Expressions::StartsWith("name", "1234"), kRowsMightMatch); + TestStringCase(Expressions::StartsWith("name", "4567"), kRowCannotMatch); + TestStringCase(Expressions::StartsWith("name", "78"), kRowCannotMatch); + TestStringCase(Expressions::StartsWith("name", "7"), kRowCannotMatch); + TestStringCase(Expressions::StartsWith("name", "A"), kRowCannotMatch); +} + +TEST_F(InclusiveMetricsEvaluatorTest, NotStartsWithTest) { + TestStringCase(Expressions::NotStartsWith("name", "1"), kRowsMightMatch); + TestStringCase(Expressions::NotStartsWith("name", "4"), kRowsMightMatch); + TestStringCase(Expressions::NotStartsWith("name", "12"), kRowsMightMatch); + TestStringCase(Expressions::NotStartsWith("name", "45"), kRowsMightMatch); + TestStringCase(Expressions::NotStartsWith("name", "123"), kRowsMightMatch); + TestStringCase(Expressions::NotStartsWith("name", "456"), kRowsMightMatch); + TestStringCase(Expressions::NotStartsWith("name", "1234"), kRowsMightMatch); + TestStringCase(Expressions::NotStartsWith("name", "4567"), kRowsMightMatch); + TestStringCase(Expressions::NotStartsWith("name", "78"), kRowsMightMatch); + 
TestStringCase(Expressions::NotStartsWith("name", "7"), kRowsMightMatch); + TestStringCase(Expressions::NotStartsWith("name", "A"), kRowsMightMatch); + + auto RunTest = [&](const std::string& prefix, bool expected_result) { + auto unbound = Expressions::NotStartsWith("name", prefix); + ICEBERG_UNWRAP_OR_FAIL(auto evaluator, + InclusiveMetricsEvaluator::Make(unbound, *schema_, true)); + auto file = PrepareDataFile("20251128", 10, 1024, {{"name", "123"}}, + {{"name", "123"}}, {{2, 10}}, {{2, 0}}); + auto result = evaluator->Evaluate(*file); + ASSERT_TRUE(result.has_value()); + ASSERT_EQ(result.value(), expected_result) << unbound->ToString(); + }; + RunTest("12", kRowCannotMatch); + RunTest("123", kRowCannotMatch); + RunTest("1234", kRowsMightMatch); +} + +class InclusiveMetricsEvaluatorMigratedTest : public InclusiveMetricsEvaluatorTest { + protected: + void SetUp() override { + schema_ = std::make_shared( + std::vector{ + SchemaField::MakeRequired(1, "id", int64()), + SchemaField::MakeOptional(2, "no_stats", int64()), + SchemaField::MakeRequired(3, "required", string()), + SchemaField::MakeOptional(4, "all_nulls", string()), + SchemaField::MakeOptional(5, "some_nulls", string()), + SchemaField::MakeOptional(6, "no_nulls", string()), + SchemaField::MakeOptional(7, "all_nans", float64()), + SchemaField::MakeOptional(8, "some_nans", float32()), + SchemaField::MakeOptional(9, "no_nans", float32()), + SchemaField::MakeOptional(10, "all_nulls_double", float64()), + SchemaField::MakeOptional(11, "all_nans_v1_stats", float32()), + SchemaField::MakeOptional(12, "nan_and_null_only", float64()), + SchemaField::MakeOptional(13, "no_nan_stats", float64()), + SchemaField::MakeOptional(14, "some_empty", string()), + }, + /*schema_id=*/0); + file1_ = PrepareDataFile1(); + file2_ = PrepareDataFile2(); + file3_ = PrepareDataFile3(); + file4_ = PrepareDataFile4(); + file5_ = PrepareDataFile5(); + } + + std::shared_ptr PrepareDataFile1() { + auto data_file = std::make_shared(); + 
data_file->file_path = "test_path1"; + data_file->file_format = FileFormatType::kParquet; + data_file->record_count = 50; + data_file->value_counts = { + {4, 50L}, {5, 50L}, {6, 50L}, {7, 50L}, {8, 50L}, {9, 50L}, + {10, 50L}, {11, 50L}, {12, 50L}, {13, 50L}, {14, 50L}, + }; + data_file->null_value_counts = { + {4, 50L}, {5, 10L}, {6, 0L}, {10, 50L}, {11, 0L}, {12, 1L}, {14, 0L}, + }; + data_file->nan_value_counts = { + {7, 50L}, + {8, 10L}, + {9, 0L}, + }; + data_file->lower_bounds = { + {1, Literal::Long(kIntMinValue).Serialize().value()}, + {11, Literal::Float(kFloatNan).Serialize().value()}, + {12, Literal::Double(kDoubleNan).Serialize().value()}, + {14, Literal::String("").Serialize().value()}, + }; + data_file->upper_bounds = { + {1, Literal::Long(kIntMaxValue).Serialize().value()}, + {11, Literal::Float(kFloatNan).Serialize().value()}, + {12, Literal::Double(kDoubleNan).Serialize().value()}, + {14, Literal::String("房东整租霍营小区二层两居室").Serialize().value()}, + }; + return data_file; + } + + std::shared_ptr PrepareDataFile2() { + auto data_file = std::make_shared(); + data_file->file_path = "test_path2"; + data_file->file_format = FileFormatType::kParquet; + data_file->record_count = 50; + data_file->value_counts = { + {3, 50L}, + }; + data_file->null_value_counts = { + {3, 0L}, + }; + data_file->nan_value_counts = {}; + data_file->lower_bounds = { + {3, Literal::String("aa").Serialize().value()}, + }; + data_file->upper_bounds = { + {3, Literal::String("dC").Serialize().value()}, + }; + return data_file; + } + + std::shared_ptr PrepareDataFile3() { + auto data_file = std::make_shared(); + data_file->file_path = "test_path3"; + data_file->file_format = FileFormatType::kParquet; + data_file->record_count = 50; + data_file->value_counts = { + {3, 50L}, + }; + data_file->null_value_counts = { + {3, 0L}, + }; + data_file->nan_value_counts = {}; + data_file->lower_bounds = { + {3, Literal::String("1str1").Serialize().value()}, + }; + data_file->upper_bounds = { + {3, 
Literal::String("3str3").Serialize().value()}, + }; + return data_file; + } + + std::shared_ptr PrepareDataFile4() { + auto data_file = std::make_shared(); + data_file->file_path = "test_path4"; + data_file->file_format = FileFormatType::kParquet; + data_file->record_count = 50; + data_file->value_counts = { + {3, 50L}, + }; + data_file->null_value_counts = { + {3, 0L}, + }; + data_file->nan_value_counts = {}; + data_file->lower_bounds = { + {3, Literal::String("abc").Serialize().value()}, + }; + data_file->upper_bounds = { + {3, Literal::String("イロハニホヘト").Serialize().value()}, + }; + return data_file; + } + + std::shared_ptr PrepareDataFile5() { + auto data_file = std::make_shared(); + data_file->file_path = "test_path5"; + data_file->file_format = FileFormatType::kParquet; + data_file->record_count = 50; + data_file->value_counts = { + {3, 50L}, + }; + data_file->null_value_counts = { + {3, 0L}, + }; + data_file->nan_value_counts = {}; + data_file->lower_bounds = { + {3, Literal::String("abc").Serialize().value()}, + }; + data_file->upper_bounds = { + {3, Literal::String("abcdefghi").Serialize().value()}, + }; + return data_file; + } + + void RunTest(const std::shared_ptr& expr, bool expected_result, + const std::shared_ptr& file, bool case_sensitive = true) { + ICEBERG_UNWRAP_OR_FAIL( + auto evaluator, InclusiveMetricsEvaluator::Make(expr, *schema_, case_sensitive)); + auto result = evaluator->Evaluate(*file); + ASSERT_TRUE(result.has_value()); + ASSERT_EQ(result.value(), expected_result) << expr->ToString(); + }; + + std::shared_ptr file1_; + std::shared_ptr file2_; + std::shared_ptr file3_; + std::shared_ptr file4_; + std::shared_ptr file5_; +}; + +TEST_F(InclusiveMetricsEvaluatorMigratedTest, CaseSensitiveTest) { + { + auto unbound = Expressions::Equal("id", Literal::Long(300)); + auto evaluator = InclusiveMetricsEvaluator::Make(unbound, *schema_, true); + ASSERT_TRUE(evaluator.has_value()); + } + { + auto unbound = Expressions::Equal("ID", 
Literal::Long(300)); + auto evaluator = InclusiveMetricsEvaluator::Make(unbound, *schema_, true); + ASSERT_FALSE(evaluator.has_value()); + ASSERT_EQ(evaluator.error().kind, ErrorKind::kInvalidExpression); + } + { + auto unbound = Expressions::Equal("ID", Literal::Long(300)); + auto evaluator = InclusiveMetricsEvaluator::Make(unbound, *schema_, false); + ASSERT_TRUE(evaluator.has_value()); + } +} + +TEST_F(InclusiveMetricsEvaluatorMigratedTest, AllNullsTest) { + RunTest(Expressions::NotNull("all_nulls"), kRowCannotMatch, file1_); + RunTest(Expressions::LessThan("all_nulls", Literal::String("a")), kRowCannotMatch, + file1_); + RunTest(Expressions::LessThanOrEqual("all_nulls", Literal::String("a")), + kRowCannotMatch, file1_); + RunTest(Expressions::GreaterThan("all_nulls", Literal::String("a")), kRowCannotMatch, + file1_); + RunTest(Expressions::GreaterThanOrEqual("all_nulls", Literal::String("a")), + kRowCannotMatch, file1_); + RunTest(Expressions::Equal("all_nulls", Literal::String("a")), kRowCannotMatch, file1_); + RunTest(Expressions::StartsWith("all_nulls", "a"), kRowCannotMatch, file1_); + RunTest(Expressions::NotStartsWith("all_nulls", "a"), kRowsMightMatch, file1_); + RunTest(Expressions::NotNull("some_nulls"), kRowsMightMatch, file1_); + RunTest(Expressions::NotNull("no_nulls"), kRowsMightMatch, file1_); +} + +TEST_F(InclusiveMetricsEvaluatorMigratedTest, NoNullsTest) { + RunTest(Expressions::IsNull("all_nulls"), kRowsMightMatch, file1_); + RunTest(Expressions::IsNull("some_nulls"), kRowsMightMatch, file1_); + RunTest(Expressions::IsNull("no_nulls"), kRowCannotMatch, file1_); +} + +TEST_F(InclusiveMetricsEvaluatorMigratedTest, IsNaNTest) { + RunTest(Expressions::IsNaN("all_nans"), kRowsMightMatch, file1_); + RunTest(Expressions::IsNaN("some_nans"), kRowsMightMatch, file1_); + RunTest(Expressions::IsNaN("no_nans"), kRowCannotMatch, file1_); + RunTest(Expressions::IsNaN("all_nulls_double"), kRowCannotMatch, file1_); + 
RunTest(Expressions::IsNaN("no_nan_stats"), kRowsMightMatch, file1_); + RunTest(Expressions::IsNaN("all_nans_v1_stats"), kRowsMightMatch, file1_); + RunTest(Expressions::IsNaN("nan_and_null_only"), kRowsMightMatch, file1_); +} + +TEST_F(InclusiveMetricsEvaluatorMigratedTest, NotNaNTest) { + RunTest(Expressions::NotNaN("all_nans"), kRowCannotMatch, file1_); + RunTest(Expressions::NotNaN("some_nans"), kRowsMightMatch, file1_); + RunTest(Expressions::NotNaN("no_nans"), kRowsMightMatch, file1_); + RunTest(Expressions::NotNaN("all_nulls_double"), kRowsMightMatch, file1_); + RunTest(Expressions::NotNaN("no_nan_stats"), kRowsMightMatch, file1_); + RunTest(Expressions::NotNaN("all_nans_v1_stats"), kRowsMightMatch, file1_); + RunTest(Expressions::NotNaN("nan_and_null_only"), kRowsMightMatch, file1_); +} + +TEST_F(InclusiveMetricsEvaluatorMigratedTest, RequiredColumnTest) { + RunTest(Expressions::NotNull("required"), kRowsMightMatch, file1_); + RunTest(Expressions::IsNull("required"), kRowCannotMatch, file1_); +} + +TEST_F(InclusiveMetricsEvaluatorMigratedTest, MissingColumnTest) { + auto expr = Expressions::LessThan("missing", Literal::Long(5)); + auto result = InclusiveMetricsEvaluator::Make(expr, *schema_, true); + ASSERT_FALSE(result.has_value()) << result.error().message; + ASSERT_TRUE(result.error().message.contains("Cannot find field 'missing' in struct")) + << result.error().message; +} + +TEST_F(InclusiveMetricsEvaluatorMigratedTest, MissingStatsTest) { + auto data_file = std::make_shared(); + data_file->file_path = "test_path"; + data_file->file_format = FileFormatType::kParquet; + data_file->record_count = 50; + + RunTest(Expressions::LessThan("no_stats", Literal::Long(5)), kRowsMightMatch, + data_file); + RunTest(Expressions::LessThanOrEqual("no_stats", Literal::Long(30)), kRowsMightMatch, + data_file); + RunTest(Expressions::Equal("no_stats", Literal::Long(70)), kRowsMightMatch, data_file); + RunTest(Expressions::GreaterThan("no_stats", Literal::Long(78)), 
kRowsMightMatch, + data_file); + RunTest(Expressions::GreaterThanOrEqual("no_stats", Literal::Long(90)), kRowsMightMatch, + data_file); + RunTest(Expressions::NotEqual("no_stats", Literal::Long(101)), kRowsMightMatch, + data_file); + RunTest(Expressions::IsNull("no_stats"), kRowsMightMatch, data_file); + RunTest(Expressions::NotNull("no_stats"), kRowsMightMatch, data_file); + RunTest(Expressions::IsNaN("some_nans"), kRowsMightMatch, data_file); + RunTest(Expressions::NotNaN("some_nans"), kRowsMightMatch, data_file); +} + +TEST_F(InclusiveMetricsEvaluatorMigratedTest, ZeroRecordFileTest) { + auto data_file = std::make_shared(); + data_file->file_path = "test_path"; + data_file->file_format = FileFormatType::kParquet; + data_file->record_count = 0; + + RunTest(Expressions::LessThan("no_stats", Literal::Long(5)), kRowCannotMatch, + data_file); + RunTest(Expressions::LessThanOrEqual("no_stats", Literal::Long(30)), kRowCannotMatch, + data_file); + RunTest(Expressions::Equal("no_stats", Literal::Long(70)), kRowCannotMatch, data_file); + RunTest(Expressions::GreaterThan("no_stats", Literal::Long(78)), kRowCannotMatch, + data_file); + RunTest(Expressions::GreaterThanOrEqual("no_stats", Literal::Long(90)), kRowCannotMatch, + data_file); + RunTest(Expressions::NotEqual("no_stats", Literal::Long(101)), kRowCannotMatch, + data_file); + RunTest(Expressions::IsNull("some_nulls"), kRowCannotMatch, data_file); + RunTest(Expressions::NotNull("some_nulls"), kRowCannotMatch, data_file); + RunTest(Expressions::IsNaN("some_nans"), kRowCannotMatch, data_file); + RunTest(Expressions::NotNaN("some_nans"), kRowCannotMatch, data_file); +} + +TEST_F(InclusiveMetricsEvaluatorMigratedTest, NotTest) { + RunTest(Expressions::Not(Expressions::LessThan("id", Literal::Long(kIntMinValue - 25))), + kRowsMightMatch, file1_); + RunTest( + Expressions::Not(Expressions::GreaterThan("id", Literal::Long(kIntMinValue - 25))), + kRowCannotMatch, file1_); +} + +TEST_F(InclusiveMetricsEvaluatorMigratedTest, 
AndTest) { + RunTest(Expressions::And( + Expressions::LessThan("id", Literal::Long(kIntMinValue - 25)), + Expressions::GreaterThanOrEqual("id", Literal::Long(kIntMinValue - 30))), + kRowCannotMatch, file1_); + RunTest(Expressions::And( + Expressions::LessThan("id", Literal::Long(kIntMinValue - 25)), + Expressions::GreaterThanOrEqual("id", Literal::Long(kIntMaxValue + 1))), + kRowCannotMatch, file1_); + RunTest( + Expressions::And(Expressions::GreaterThan("id", Literal::Long(kIntMinValue - 25)), + Expressions::LessThanOrEqual("id", Literal::Long(kIntMaxValue))), + kRowsMightMatch, file1_); +} + +TEST_F(InclusiveMetricsEvaluatorMigratedTest, OrTest) { + RunTest(Expressions::Or( + Expressions::LessThan("id", Literal::Long(kIntMinValue - 25)), + Expressions::GreaterThanOrEqual("id", Literal::Long(kIntMaxValue + 1))), + kRowCannotMatch, file1_); + RunTest(Expressions::Or( + Expressions::LessThan("id", Literal::Long(kIntMinValue - 25)), + Expressions::GreaterThanOrEqual("id", Literal::Long(kIntMaxValue - 19))), + kRowsMightMatch, file1_); +} + +TEST_F(InclusiveMetricsEvaluatorMigratedTest, IntegerLtTest) { + RunTest(Expressions::LessThan("id", Literal::Long(kIntMinValue - 25)), kRowCannotMatch, + file1_); + RunTest(Expressions::LessThan("id", Literal::Long(kIntMinValue)), kRowCannotMatch, + file1_); + RunTest(Expressions::LessThan("id", Literal::Long(kIntMinValue + 1)), kRowsMightMatch, + file1_); + RunTest(Expressions::LessThan("id", Literal::Long(kIntMaxValue)), kRowsMightMatch, + file1_); +} + +TEST_F(InclusiveMetricsEvaluatorMigratedTest, IntegerLtEqTest) { + RunTest(Expressions::LessThanOrEqual("id", Literal::Long(kIntMinValue - 25)), + kRowCannotMatch, file1_); + RunTest(Expressions::LessThanOrEqual("id", Literal::Long(kIntMinValue - 1)), + kRowCannotMatch, file1_); + RunTest(Expressions::LessThanOrEqual("id", Literal::Long(kIntMinValue)), + kRowsMightMatch, file1_); + RunTest(Expressions::LessThanOrEqual("id", Literal::Long(kIntMaxValue)), + kRowsMightMatch, 
file1_); +} + +TEST_F(InclusiveMetricsEvaluatorMigratedTest, IntegerGtTest) { + RunTest(Expressions::GreaterThan("id", Literal::Long(kIntMaxValue + 6)), + kRowCannotMatch, file1_); + RunTest(Expressions::GreaterThan("id", Literal::Long(kIntMaxValue)), kRowCannotMatch, + file1_); + RunTest(Expressions::GreaterThan("id", Literal::Long(kIntMaxValue - 1)), + kRowsMightMatch, file1_); + RunTest(Expressions::GreaterThan("id", Literal::Long(kIntMaxValue - 4)), + kRowsMightMatch, file1_); +} + +TEST_F(InclusiveMetricsEvaluatorMigratedTest, IntegerGtEqTest) { + RunTest(Expressions::GreaterThanOrEqual("id", Literal::Long(kIntMaxValue + 6)), + kRowCannotMatch, file1_); + RunTest(Expressions::GreaterThanOrEqual("id", Literal::Long(kIntMaxValue + 1)), + kRowCannotMatch, file1_); + RunTest(Expressions::GreaterThanOrEqual("id", Literal::Long(kIntMaxValue)), + kRowsMightMatch, file1_); + RunTest(Expressions::GreaterThanOrEqual("id", Literal::Long(kIntMaxValue - 4)), + kRowsMightMatch, file1_); +} + +TEST_F(InclusiveMetricsEvaluatorMigratedTest, IntegerEqTest) { + RunTest(Expressions::Equal("id", Literal::Long(kIntMinValue - 25)), kRowCannotMatch, + file1_); + RunTest(Expressions::Equal("id", Literal::Long(kIntMinValue - 1)), kRowCannotMatch, + file1_); + RunTest(Expressions::Equal("id", Literal::Long(kIntMinValue)), kRowsMightMatch, file1_); + RunTest(Expressions::Equal("id", Literal::Long(kIntMaxValue - 4)), kRowsMightMatch, + file1_); + RunTest(Expressions::Equal("id", Literal::Long(kIntMaxValue)), kRowsMightMatch, file1_); + RunTest(Expressions::Equal("id", Literal::Long(kIntMaxValue + 1)), kRowCannotMatch, + file1_); + RunTest(Expressions::Equal("id", Literal::Long(kIntMaxValue + 6)), kRowCannotMatch, + file1_); +} + +TEST_F(InclusiveMetricsEvaluatorMigratedTest, IntegerNotEqTest) { + RunTest(Expressions::NotEqual("id", Literal::Long(kIntMinValue - 25)), kRowsMightMatch, + file1_); + RunTest(Expressions::NotEqual("id", Literal::Long(kIntMinValue - 1)), kRowsMightMatch, + 
file1_); + RunTest(Expressions::NotEqual("id", Literal::Long(kIntMinValue)), kRowsMightMatch, + file1_); + RunTest(Expressions::NotEqual("id", Literal::Long(kIntMaxValue - 4)), kRowsMightMatch, + file1_); + RunTest(Expressions::NotEqual("id", Literal::Long(kIntMaxValue)), kRowsMightMatch, + file1_); + RunTest(Expressions::NotEqual("id", Literal::Long(kIntMaxValue + 1)), kRowsMightMatch, + file1_); + RunTest(Expressions::NotEqual("id", Literal::Long(kIntMaxValue + 6)), kRowsMightMatch, + file1_); +} + +TEST_F(InclusiveMetricsEvaluatorMigratedTest, IntegerNotEqRewrittenTest) { + RunTest(Expressions::Not(Expressions::Equal("id", Literal::Long(kIntMinValue - 25))), + kRowsMightMatch, file1_); + RunTest(Expressions::Not(Expressions::Equal("id", Literal::Long(kIntMinValue - 1))), + kRowsMightMatch, file1_); + RunTest(Expressions::Not(Expressions::Equal("id", Literal::Long(kIntMinValue))), + kRowsMightMatch, file1_); + RunTest(Expressions::Not(Expressions::Equal("id", Literal::Long(kIntMaxValue - 4))), + kRowsMightMatch, file1_); + RunTest(Expressions::Not(Expressions::Equal("id", Literal::Long(kIntMaxValue))), + kRowsMightMatch, file1_); + RunTest(Expressions::Not(Expressions::Equal("id", Literal::Long(kIntMaxValue + 1))), + kRowsMightMatch, file1_); + RunTest(Expressions::Not(Expressions::Equal("id", Literal::Long(kIntMaxValue + 6))), + kRowsMightMatch, file1_); +} + +TEST_F(InclusiveMetricsEvaluatorMigratedTest, CaseInsensitiveIntegerNotEqRewrittenTest) { + RunTest(Expressions::Not(Expressions::Equal("ID", Literal::Long(kIntMinValue - 25))), + kRowsMightMatch, file1_, false); + RunTest(Expressions::Not(Expressions::Equal("ID", Literal::Long(kIntMinValue - 1))), + kRowsMightMatch, file1_, false); + RunTest(Expressions::Not(Expressions::Equal("ID", Literal::Long(kIntMinValue))), + kRowsMightMatch, file1_, false); + RunTest(Expressions::Not(Expressions::Equal("ID", Literal::Long(kIntMaxValue - 4))), + kRowsMightMatch, file1_, false); + 
RunTest(Expressions::Not(Expressions::Equal("ID", Literal::Long(kIntMaxValue))), + kRowsMightMatch, file1_, false); + RunTest(Expressions::Not(Expressions::Equal("ID", Literal::Long(kIntMaxValue + 1))), + kRowsMightMatch, file1_, false); + RunTest(Expressions::Not(Expressions::Equal("ID", Literal::Long(kIntMaxValue + 6))), + kRowsMightMatch, file1_, false); +} + +TEST_F(InclusiveMetricsEvaluatorMigratedTest, CaseSensitiveIntegerNotEqRewrittenTest) { + auto expr = Expressions::Not(Expressions::Equal("ID", Literal::Long(5))); + auto result = InclusiveMetricsEvaluator::Make(expr, *schema_, true); + ASSERT_FALSE(result.has_value()) << result.error().message; + ASSERT_TRUE(result.error().message.contains("Cannot find field 'ID' in struct")) + << result.error().message; +} + +TEST_F(InclusiveMetricsEvaluatorMigratedTest, StringStartsWithTest) { + RunTest(Expressions::StartsWith("required", "a"), kRowsMightMatch, file1_); + RunTest(Expressions::StartsWith("required", "a"), kRowsMightMatch, file2_); + RunTest(Expressions::StartsWith("required", "aa"), kRowsMightMatch, file2_); + RunTest(Expressions::StartsWith("required", "aaa"), kRowsMightMatch, file2_); + RunTest(Expressions::StartsWith("required", "1s"), kRowsMightMatch, file3_); + RunTest(Expressions::StartsWith("required", "1str1x"), kRowsMightMatch, file3_); + RunTest(Expressions::StartsWith("required", "ff"), kRowsMightMatch, file4_); + + RunTest(Expressions::StartsWith("required", "aB"), kRowCannotMatch, file2_); + RunTest(Expressions::StartsWith("required", "dWX"), kRowCannotMatch, file2_); + + RunTest(Expressions::StartsWith("required", "5"), kRowCannotMatch, file3_); + RunTest(Expressions::StartsWith("required", "3str3x"), kRowCannotMatch, file3_); + RunTest(Expressions::StartsWith("some_empty", "房东整租霍"), kRowsMightMatch, file1_); + + RunTest(Expressions::StartsWith("all_nulls", ""), kRowCannotMatch, file1_); + auto above_max = TruncateUtils::TruncateLiteral(Literal::String("イロハニホヘト"), 4) + .value() + 
.ToString(); + RunTest(Expressions::StartsWith("required", above_max), kRowCannotMatch, file4_); +} + +TEST_F(InclusiveMetricsEvaluatorMigratedTest, StringNotStartsWithTest) { + RunTest(Expressions::NotStartsWith("required", "a"), kRowsMightMatch, file1_); + RunTest(Expressions::NotStartsWith("required", "a"), kRowsMightMatch, file2_); + RunTest(Expressions::NotStartsWith("required", "aa"), kRowsMightMatch, file2_); + RunTest(Expressions::NotStartsWith("required", "aaa"), kRowsMightMatch, file2_); + RunTest(Expressions::NotStartsWith("required", "1s"), kRowsMightMatch, file3_); + RunTest(Expressions::NotStartsWith("required", "1str1x"), kRowsMightMatch, file3_); + RunTest(Expressions::NotStartsWith("required", "ff"), kRowsMightMatch, file4_); + + RunTest(Expressions::NotStartsWith("required", "aB"), kRowsMightMatch, file2_); + RunTest(Expressions::NotStartsWith("required", "dWX"), kRowsMightMatch, file2_); + + RunTest(Expressions::NotStartsWith("required", "5"), kRowsMightMatch, file3_); + RunTest(Expressions::NotStartsWith("required", "3str3x"), kRowsMightMatch, file3_); + + auto above_max = TruncateUtils::TruncateLiteral(Literal::String("イロハニホヘト"), 4) + .value() + .ToString(); + RunTest(Expressions::NotStartsWith("required", above_max), kRowsMightMatch, file4_); + + RunTest(Expressions::NotStartsWith("required", "abc"), kRowCannotMatch, file5_); + RunTest(Expressions::NotStartsWith("required", "abcd"), kRowsMightMatch, file5_); +} + +TEST_F(InclusiveMetricsEvaluatorMigratedTest, IntegerInTest) { + RunTest(Expressions::In( + "id", {Literal::Long(kIntMinValue - 25), Literal::Long(kIntMinValue - 24)}), + kRowCannotMatch, file1_); + RunTest(Expressions::In( + "id", {Literal::Long(kIntMinValue - 2), Literal::Long(kIntMinValue - 1)}), + kRowCannotMatch, file1_); + RunTest(Expressions::In("id", + {Literal::Long(kIntMinValue - 1), Literal::Long(kIntMinValue)}), + kRowsMightMatch, file1_); + RunTest(Expressions::In( + "id", {Literal::Long(kIntMaxValue - 4), 
Literal::Long(kIntMaxValue - 3)}), + kRowsMightMatch, file1_); + RunTest(Expressions::In("id", + {Literal::Long(kIntMaxValue), Literal::Long(kIntMaxValue + 1)}), + kRowsMightMatch, file1_); + RunTest(Expressions::In( + "id", {Literal::Long(kIntMaxValue + 1), Literal::Long(kIntMaxValue + 2)}), + kRowCannotMatch, file1_); + RunTest(Expressions::In( + "id", {Literal::Long(kIntMaxValue + 6), Literal::Long(kIntMaxValue + 7)}), + kRowCannotMatch, file1_); + + RunTest(Expressions::In("all_nulls", {Literal::String("abc"), Literal::String("def")}), + kRowCannotMatch, file1_); + RunTest(Expressions::In("some_nulls", {Literal::String("abc"), Literal::String("def")}), + kRowsMightMatch, file1_); + RunTest(Expressions::In("no_nulls", {Literal::String("abc"), Literal::String("def")}), + kRowsMightMatch, file1_); + + std::vector ids; + for (int i = -400; i <= 0; i++) { + ids.emplace_back(Literal::Long(i)); + } + RunTest(Expressions::In("id", ids), kRowsMightMatch, file1_); +} + +TEST_F(InclusiveMetricsEvaluatorMigratedTest, IntegerNotInTest) { + RunTest(Expressions::NotIn( + "id", {Literal::Long(kIntMinValue - 25), Literal::Long(kIntMinValue - 24)}), + kRowsMightMatch, file1_); + RunTest(Expressions::NotIn( + "id", {Literal::Long(kIntMinValue - 2), Literal::Long(kIntMinValue - 1)}), + kRowsMightMatch, file1_); + RunTest(Expressions::NotIn( + "id", {Literal::Long(kIntMinValue - 1), Literal::Long(kIntMinValue)}), + kRowsMightMatch, file1_); + RunTest(Expressions::NotIn( + "id", {Literal::Long(kIntMaxValue - 4), Literal::Long(kIntMaxValue - 3)}), + kRowsMightMatch, file1_); + RunTest(Expressions::NotIn( + "id", {Literal::Long(kIntMaxValue), Literal::Long(kIntMaxValue + 1)}), + kRowsMightMatch, file1_); + RunTest(Expressions::NotIn( + "id", {Literal::Long(kIntMaxValue + 1), Literal::Long(kIntMaxValue + 2)}), + kRowsMightMatch, file1_); + RunTest(Expressions::NotIn( + "id", {Literal::Long(kIntMaxValue + 6), Literal::Long(kIntMaxValue + 7)}), + kRowsMightMatch, file1_); + + RunTest( + 
Expressions::NotIn("all_nulls", {Literal::String("abc"), Literal::String("def")}), + kRowsMightMatch, file1_); + RunTest( + Expressions::NotIn("some_nulls", {Literal::String("abc"), Literal::String("def")}), + kRowsMightMatch, file1_); + RunTest( + Expressions::NotIn("no_nulls", {Literal::String("abc"), Literal::String("def")}), + kRowsMightMatch, file1_); + + std::vector ids; + for (int i = -400; i <= 0; i++) { + ids.emplace_back(Literal::Long(i)); + } + RunTest(Expressions::NotIn("id", ids), kRowsMightMatch, file1_); +} + +} // namespace iceberg diff --git a/src/iceberg/test/inclusive_metrics_evaluator_with_transform_test.cc b/src/iceberg/test/inclusive_metrics_evaluator_with_transform_test.cc new file mode 100644 index 000000000..935f3c3ab --- /dev/null +++ b/src/iceberg/test/inclusive_metrics_evaluator_with_transform_test.cc @@ -0,0 +1,485 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include +#include +#include + +#include + +#include "iceberg/expression/expressions.h" +#include "iceberg/expression/inclusive_metrics_evaluator.h" +#include "iceberg/expression/term.h" +#include "iceberg/manifest/manifest_entry.h" +#include "iceberg/schema.h" +#include "iceberg/test/matchers.h" +#include "iceberg/type.h" + +namespace iceberg { + +namespace { +constexpr bool kRowsMightMatch = true; +constexpr bool kRowCannotMatch = false; +constexpr int64_t kIntMinValue = 30; +constexpr int64_t kIntMaxValue = 79; +constexpr int64_t kMicrosPerDay = 86'400'000'000LL; +constexpr int64_t kTsMinValue = 30 * kMicrosPerDay; +constexpr int64_t kTsMaxValue = 79 * kMicrosPerDay; + +std::shared_ptr> ToBoundTransform( + const std::shared_ptr& transform) { + return std::static_pointer_cast>(transform); +} +} // namespace + +class InclusiveMetricsEvaluatorWithTransformTest : public ::testing::Test { + protected: + void SetUp() override { + schema_ = std::make_shared( + std::vector{ + SchemaField::MakeRequired(1, "id", int64()), + SchemaField::MakeRequired(2, "ts", timestamp_tz()), + SchemaField::MakeOptional(3, "all_nulls", int64()), + SchemaField::MakeOptional(4, "all_nulls_str", string()), + SchemaField::MakeOptional(5, "no_stats", int64()), + SchemaField::MakeOptional(6, "str", string()), + }, + /*schema_id=*/0); + + data_file_ = std::make_shared(); + data_file_->file_path = "file.avro"; + data_file_->file_format = FileFormatType::kAvro; + data_file_->record_count = 50; + data_file_->value_counts = { + {1, 50L}, + {2, 50L}, + {3, 50L}, + {4, 50L}, + }; + data_file_->null_value_counts = { + {1, 0L}, + {2, 0L}, + {3, 50L}, + {4, 50L}, + }; + data_file_->nan_value_counts.clear(); + data_file_->lower_bounds = { + {2, Literal::TimestampTz(kTsMinValue).Serialize().value()}, + {6, Literal::String("abc").Serialize().value()}, + }; + data_file_->upper_bounds = { + {2, Literal::TimestampTz(kTsMaxValue).Serialize().value()}, + {6, Literal::String("abe").Serialize().value()}, + 
}; + } + + void ExpectShouldRead(const std::shared_ptr& expr, bool expected_result, + std::shared_ptr file = nullptr, + bool case_sensitive = true) { + auto target_file = file ? file : data_file_; + ICEBERG_UNWRAP_OR_FAIL( + auto evaluator, InclusiveMetricsEvaluator::Make(expr, *schema_, case_sensitive)); + auto eval_result = evaluator->Evaluate(*target_file); + ASSERT_TRUE(eval_result.has_value()); + ASSERT_EQ(eval_result.value(), expected_result) << expr->ToString(); + } + + std::vector> MissingStatsExpressions() const { + auto truncate_no_stats = ToBoundTransform(Expressions::Truncate("no_stats", 10)); + return { + Expressions::LessThan(truncate_no_stats, Literal::Long(5)), + Expressions::LessThanOrEqual(truncate_no_stats, Literal::Long(30)), + Expressions::Equal(truncate_no_stats, Literal::Long(70)), + Expressions::GreaterThan(truncate_no_stats, Literal::Long(78)), + Expressions::GreaterThanOrEqual(truncate_no_stats, Literal::Long(90)), + Expressions::NotEqual(truncate_no_stats, Literal::Long(101)), + Expressions::IsNull(truncate_no_stats), + Expressions::NotNull(truncate_no_stats), + }; + } + + std::shared_ptr schema_; + std::shared_ptr data_file_; +}; + +TEST_F(InclusiveMetricsEvaluatorWithTransformTest, AllNullsWithNonOrderPreserving) { + auto bucket_all_nulls = ToBoundTransform(Expressions::Bucket("all_nulls", 100)); + ExpectShouldRead(Expressions::IsNull(bucket_all_nulls), kRowsMightMatch); + ExpectShouldRead(Expressions::NotNull(bucket_all_nulls), kRowCannotMatch); + ExpectShouldRead(Expressions::LessThan(bucket_all_nulls, Literal::Int(30)), + kRowCannotMatch); + ExpectShouldRead(Expressions::LessThanOrEqual(bucket_all_nulls, Literal::Int(30)), + kRowCannotMatch); + ExpectShouldRead(Expressions::GreaterThan(bucket_all_nulls, Literal::Int(30)), + kRowCannotMatch); + ExpectShouldRead(Expressions::GreaterThanOrEqual(bucket_all_nulls, Literal::Int(30)), + kRowCannotMatch); + ExpectShouldRead(Expressions::Equal(bucket_all_nulls, Literal::Int(30)), + 
kRowCannotMatch); + ExpectShouldRead(Expressions::NotEqual(bucket_all_nulls, Literal::Int(30)), + kRowsMightMatch); + ExpectShouldRead(Expressions::In(bucket_all_nulls, {Literal::Int(1), Literal::Int(2)}), + kRowCannotMatch); + ExpectShouldRead( + Expressions::NotIn(bucket_all_nulls, {Literal::Int(1), Literal::Int(2)}), + kRowsMightMatch); +} + +TEST_F(InclusiveMetricsEvaluatorWithTransformTest, RequiredWithNonOrderPreserving) { + auto bucket_ts = ToBoundTransform(Expressions::Bucket("ts", 100)); + ExpectShouldRead(Expressions::IsNull(bucket_ts), kRowsMightMatch); + ExpectShouldRead(Expressions::NotNull(bucket_ts), kRowsMightMatch); + ExpectShouldRead(Expressions::LessThan(bucket_ts, Literal::Int(30)), kRowsMightMatch); + ExpectShouldRead(Expressions::LessThanOrEqual(bucket_ts, Literal::Int(30)), + kRowsMightMatch); + ExpectShouldRead(Expressions::GreaterThan(bucket_ts, Literal::Int(30)), + kRowsMightMatch); + ExpectShouldRead(Expressions::GreaterThanOrEqual(bucket_ts, Literal::Int(30)), + kRowsMightMatch); + ExpectShouldRead(Expressions::Equal(bucket_ts, Literal::Int(30)), kRowsMightMatch); + ExpectShouldRead(Expressions::NotEqual(bucket_ts, Literal::Int(30)), kRowsMightMatch); + ExpectShouldRead(Expressions::In(bucket_ts, {Literal::Int(1), Literal::Int(2)}), + kRowsMightMatch); + ExpectShouldRead(Expressions::NotIn(bucket_ts, {Literal::Int(1), Literal::Int(2)}), + kRowsMightMatch); +} + +TEST_F(InclusiveMetricsEvaluatorWithTransformTest, AllNulls) { + auto truncate_all_nulls = ToBoundTransform(Expressions::Truncate("all_nulls", 10)); + ExpectShouldRead(Expressions::IsNull(truncate_all_nulls), kRowsMightMatch); + ExpectShouldRead(Expressions::NotNull(truncate_all_nulls), kRowCannotMatch); + ExpectShouldRead(Expressions::LessThan(truncate_all_nulls, Literal::Long(30)), + kRowCannotMatch); + ExpectShouldRead(Expressions::LessThanOrEqual(truncate_all_nulls, Literal::Long(30)), + kRowCannotMatch); + ExpectShouldRead(Expressions::GreaterThan(truncate_all_nulls, 
Literal::Long(30)), + kRowCannotMatch); + ExpectShouldRead(Expressions::GreaterThanOrEqual(truncate_all_nulls, Literal::Long(30)), + kRowCannotMatch); + ExpectShouldRead(Expressions::Equal(truncate_all_nulls, Literal::Long(30)), + kRowCannotMatch); + ExpectShouldRead(Expressions::NotEqual(truncate_all_nulls, Literal::Long(30)), + kRowsMightMatch); + ExpectShouldRead( + Expressions::In(truncate_all_nulls, {Literal::Long(10), Literal::Long(20)}), + kRowCannotMatch); + ExpectShouldRead( + Expressions::NotIn(truncate_all_nulls, {Literal::Long(10), Literal::Long(20)}), + kRowsMightMatch); + + auto truncate_all_nulls_str = + ToBoundTransform(Expressions::Truncate("all_nulls_str", 10)); + ExpectShouldRead(Expressions::StartsWith(truncate_all_nulls_str, "a"), kRowsMightMatch); + ExpectShouldRead(Expressions::NotStartsWith(truncate_all_nulls_str, "a"), + kRowsMightMatch); +} + +TEST_F(InclusiveMetricsEvaluatorWithTransformTest, MissingColumn) { + auto expr = Expressions::LessThan( + ToBoundTransform(Expressions::Truncate("missing", 10)), Literal::Long(20)); + auto result = InclusiveMetricsEvaluator::Make(expr, *schema_, true); + ASSERT_FALSE(result.has_value()) << result.error().message; + ASSERT_TRUE(result.error().message.contains("Cannot find field 'missing'")) + << result.error().message; +} + +TEST_F(InclusiveMetricsEvaluatorWithTransformTest, MissingStats) { + for (const auto& expr : MissingStatsExpressions()) { + ExpectShouldRead(expr, kRowsMightMatch); + } +} + +TEST_F(InclusiveMetricsEvaluatorWithTransformTest, ZeroRecordFile) { + auto zero_record_file = std::make_shared(); + zero_record_file->file_path = "file.parquet"; + zero_record_file->file_format = FileFormatType::kParquet; + zero_record_file->record_count = 0; + + for (const auto& expr : MissingStatsExpressions()) { + ExpectShouldRead(expr, kRowCannotMatch, zero_record_file); + } +} + +TEST_F(InclusiveMetricsEvaluatorWithTransformTest, Not) { + auto day_ts = ToBoundTransform(Expressions::Day("ts")); + 
ExpectShouldRead( + Expressions::Not(Expressions::LessThan(day_ts, Literal::Long(kIntMinValue - 25))), + kRowsMightMatch); + ExpectShouldRead(Expressions::Not(Expressions::GreaterThan( + day_ts, Literal::Long(kIntMinValue - 25))), + kRowCannotMatch); +} + +TEST_F(InclusiveMetricsEvaluatorWithTransformTest, And) { + auto day_ts = ToBoundTransform(Expressions::Day("ts")); + ExpectShouldRead( + Expressions::And( + Expressions::LessThan(day_ts, Literal::Long(kIntMinValue - 25)), + Expressions::GreaterThanOrEqual(day_ts, Literal::Long(kIntMinValue - 30))), + kRowCannotMatch); + ExpectShouldRead( + Expressions::And( + Expressions::LessThan(day_ts, Literal::Long(kIntMinValue - 25)), + Expressions::GreaterThanOrEqual(day_ts, Literal::Long(kIntMaxValue + 1))), + kRowCannotMatch); + ExpectShouldRead( + Expressions::And(Expressions::GreaterThan(day_ts, Literal::Long(kIntMinValue - 25)), + Expressions::LessThanOrEqual(day_ts, Literal::Long(kIntMinValue))), + kRowsMightMatch); +} + +TEST_F(InclusiveMetricsEvaluatorWithTransformTest, Or) { + auto day_ts = ToBoundTransform(Expressions::Day("ts")); + ExpectShouldRead( + Expressions::Or( + Expressions::LessThan(day_ts, Literal::Long(kIntMinValue - 25)), + Expressions::GreaterThanOrEqual(day_ts, Literal::Long(kIntMaxValue + 1))), + kRowCannotMatch); + ExpectShouldRead( + Expressions::Or( + Expressions::LessThan(day_ts, Literal::Long(kIntMinValue - 25)), + Expressions::GreaterThanOrEqual(day_ts, Literal::Long(kIntMaxValue - 19))), + kRowsMightMatch); +} + +TEST_F(InclusiveMetricsEvaluatorWithTransformTest, IntegerLt) { + auto day_ts = ToBoundTransform(Expressions::Day("ts")); + ExpectShouldRead(Expressions::LessThan(day_ts, Literal::Long(kIntMinValue - 25)), + kRowCannotMatch); + ExpectShouldRead(Expressions::LessThan(day_ts, Literal::Long(kIntMinValue)), + kRowCannotMatch); + ExpectShouldRead(Expressions::LessThan(day_ts, Literal::Long(kIntMinValue + 1)), + kRowsMightMatch); + ExpectShouldRead(Expressions::LessThan(day_ts, 
Literal::Long(kIntMaxValue)), + kRowsMightMatch); +} + +TEST_F(InclusiveMetricsEvaluatorWithTransformTest, IntegerLtEq) { + auto day_ts = ToBoundTransform(Expressions::Day("ts")); + ExpectShouldRead(Expressions::LessThanOrEqual(day_ts, Literal::Long(kIntMinValue - 25)), + kRowCannotMatch); + ExpectShouldRead(Expressions::LessThanOrEqual(day_ts, Literal::Long(kIntMinValue - 1)), + kRowCannotMatch); + ExpectShouldRead(Expressions::LessThanOrEqual(day_ts, Literal::Long(kIntMinValue)), + kRowsMightMatch); + ExpectShouldRead(Expressions::LessThanOrEqual(day_ts, Literal::Long(kIntMaxValue)), + kRowsMightMatch); +} + +TEST_F(InclusiveMetricsEvaluatorWithTransformTest, IntegerGt) { + auto day_ts = ToBoundTransform(Expressions::Day("ts")); + ExpectShouldRead(Expressions::GreaterThan(day_ts, Literal::Int(kIntMaxValue + 6)), + kRowCannotMatch); + ExpectShouldRead(Expressions::GreaterThan(day_ts, Literal::Date(kIntMaxValue)), + kRowCannotMatch); + ExpectShouldRead(Expressions::GreaterThan(day_ts, Literal::Date(kIntMaxValue - 1)), + kRowsMightMatch); + ExpectShouldRead(Expressions::GreaterThan(day_ts, Literal::Date(kIntMaxValue - 4)), + kRowsMightMatch); +} + +TEST_F(InclusiveMetricsEvaluatorWithTransformTest, IntegerGtEq) { + auto day_ts = ToBoundTransform(Expressions::Day("ts")); + ExpectShouldRead( + Expressions::GreaterThanOrEqual(day_ts, Literal::Long(kIntMaxValue + 6)), + kRowCannotMatch); + ExpectShouldRead( + Expressions::GreaterThanOrEqual(day_ts, Literal::Long(kIntMaxValue + 1)), + kRowCannotMatch); + ExpectShouldRead(Expressions::GreaterThanOrEqual(day_ts, Literal::Long(kIntMaxValue)), + kRowsMightMatch); + ExpectShouldRead( + Expressions::GreaterThanOrEqual(day_ts, Literal::Long(kIntMaxValue - 4)), + kRowsMightMatch); +} + +TEST_F(InclusiveMetricsEvaluatorWithTransformTest, IntegerEq) { + auto day_ts = ToBoundTransform(Expressions::Day("ts")); + ExpectShouldRead(Expressions::Equal(day_ts, Literal::Long(kIntMinValue - 25)), + kRowCannotMatch); + 
ExpectShouldRead(Expressions::Equal(day_ts, Literal::Long(kIntMinValue - 1)), + kRowCannotMatch); + ExpectShouldRead(Expressions::Equal(day_ts, Literal::Long(kIntMinValue)), + kRowsMightMatch); + ExpectShouldRead(Expressions::Equal(day_ts, Literal::Long(kIntMaxValue - 4)), + kRowsMightMatch); + ExpectShouldRead(Expressions::Equal(day_ts, Literal::Long(kIntMaxValue)), + kRowsMightMatch); + ExpectShouldRead(Expressions::Equal(day_ts, Literal::Long(kIntMaxValue + 1)), + kRowCannotMatch); + ExpectShouldRead(Expressions::Equal(day_ts, Literal::Long(kIntMaxValue + 6)), + kRowCannotMatch); +} + +TEST_F(InclusiveMetricsEvaluatorWithTransformTest, IntegerNotEq) { + auto day_ts = ToBoundTransform(Expressions::Day("ts")); + ExpectShouldRead(Expressions::NotEqual(day_ts, Literal::Long(kIntMinValue - 25)), + kRowsMightMatch); + ExpectShouldRead(Expressions::NotEqual(day_ts, Literal::Long(kIntMinValue - 1)), + kRowsMightMatch); + ExpectShouldRead(Expressions::NotEqual(day_ts, Literal::Long(kIntMinValue)), + kRowsMightMatch); + ExpectShouldRead(Expressions::NotEqual(day_ts, Literal::Long(kIntMaxValue - 4)), + kRowsMightMatch); + ExpectShouldRead(Expressions::NotEqual(day_ts, Literal::Long(kIntMaxValue)), + kRowsMightMatch); + ExpectShouldRead(Expressions::NotEqual(day_ts, Literal::Long(kIntMaxValue + 1)), + kRowsMightMatch); + ExpectShouldRead(Expressions::NotEqual(day_ts, Literal::Long(kIntMaxValue + 6)), + kRowsMightMatch); +} + +TEST_F(InclusiveMetricsEvaluatorWithTransformTest, IntegerNotEqRewritten) { + auto day_ts = ToBoundTransform(Expressions::Day("ts")); + ExpectShouldRead( + Expressions::Not(Expressions::Equal(day_ts, Literal::Long(kIntMinValue - 25))), + kRowsMightMatch); + ExpectShouldRead( + Expressions::Not(Expressions::Equal(day_ts, Literal::Long(kIntMinValue - 1))), + kRowsMightMatch); + ExpectShouldRead( + Expressions::Not(Expressions::Equal(day_ts, Literal::Long(kIntMinValue))), + kRowsMightMatch); + ExpectShouldRead( + 
Expressions::Not(Expressions::Equal(day_ts, Literal::Long(kIntMaxValue - 4))), + kRowsMightMatch); + ExpectShouldRead( + Expressions::Not(Expressions::Equal(day_ts, Literal::Long(kIntMaxValue))), + kRowsMightMatch); + ExpectShouldRead( + Expressions::Not(Expressions::Equal(day_ts, Literal::Long(kIntMaxValue + 1))), + kRowsMightMatch); + ExpectShouldRead( + Expressions::Not(Expressions::Equal(day_ts, Literal::Long(kIntMaxValue + 6))), + kRowsMightMatch); +} + +TEST_F(InclusiveMetricsEvaluatorWithTransformTest, CaseInsensitiveIntegerNotEqRewritten) { + auto day_ts = ToBoundTransform(Expressions::Day("TS")); + ExpectShouldRead( + Expressions::Not(Expressions::Equal(day_ts, Literal::Long(kIntMinValue - 25))), + kRowsMightMatch, nullptr, false); + ExpectShouldRead( + Expressions::Not(Expressions::Equal(day_ts, Literal::Long(kIntMinValue - 1))), + kRowsMightMatch, nullptr, false); + ExpectShouldRead( + Expressions::Not(Expressions::Equal(day_ts, Literal::Long(kIntMinValue))), + kRowsMightMatch, nullptr, false); + ExpectShouldRead( + Expressions::Not(Expressions::Equal(day_ts, Literal::Long(kIntMaxValue - 4))), + kRowsMightMatch, nullptr, false); + ExpectShouldRead( + Expressions::Not(Expressions::Equal(day_ts, Literal::Long(kIntMaxValue))), + kRowsMightMatch, nullptr, false); + ExpectShouldRead( + Expressions::Not(Expressions::Equal(day_ts, Literal::Long(kIntMaxValue + 1))), + kRowsMightMatch, nullptr, false); + ExpectShouldRead( + Expressions::Not(Expressions::Equal(day_ts, Literal::Long(kIntMaxValue + 6))), + kRowsMightMatch, nullptr, false); +} + +TEST_F(InclusiveMetricsEvaluatorWithTransformTest, CaseSensitiveIntegerNotEqRewritten) { + auto day_ts = ToBoundTransform(Expressions::Day("TS")); + auto expr = Expressions::Not(Expressions::Equal(day_ts, Literal::Long(5))); + auto result = InclusiveMetricsEvaluator::Make(expr, *schema_, true); + ASSERT_FALSE(result.has_value()) << result.error().message; + ASSERT_TRUE(result.error().message.contains("Cannot find field 
'TS'")) + << result.error().message; +} + +TEST_F(InclusiveMetricsEvaluatorWithTransformTest, StringStartsWith) { + auto truncate_str = ToBoundTransform(Expressions::Truncate("str", 10)); + ExpectShouldRead(Expressions::StartsWith(truncate_str, "a"), kRowsMightMatch); + ExpectShouldRead(Expressions::StartsWith(truncate_str, "ab"), kRowsMightMatch); + ExpectShouldRead(Expressions::StartsWith(truncate_str, "b"), kRowsMightMatch); +} + +TEST_F(InclusiveMetricsEvaluatorWithTransformTest, StringNotStartsWith) { + auto truncate_str = ToBoundTransform(Expressions::Truncate("str", 10)); + ExpectShouldRead(Expressions::StartsWith(truncate_str, "a"), kRowsMightMatch); + ExpectShouldRead(Expressions::StartsWith(truncate_str, "ab"), kRowsMightMatch); + ExpectShouldRead(Expressions::StartsWith(truncate_str, "b"), kRowsMightMatch); +} + +TEST_F(InclusiveMetricsEvaluatorWithTransformTest, IntegerIn) { + auto day_ts = ToBoundTransform(Expressions::Day("ts")); + ExpectShouldRead(Expressions::In(day_ts, {Literal::Long(kIntMinValue - 25), + Literal::Long(kIntMinValue - 24)}), + kRowCannotMatch); + ExpectShouldRead(Expressions::In(day_ts, {Literal::Long(kIntMinValue - 2), + Literal::Long(kIntMinValue - 1)}), + kRowCannotMatch); + ExpectShouldRead(Expressions::In(day_ts, {Literal::Long(kIntMinValue - 1), + Literal::Long(kIntMinValue)}), + kRowsMightMatch); + ExpectShouldRead(Expressions::In(day_ts, {Literal::Long(kIntMaxValue - 4), + Literal::Long(kIntMaxValue - 3)}), + kRowsMightMatch); + ExpectShouldRead(Expressions::In(day_ts, {Literal::Long(kIntMaxValue), + Literal::Long(kIntMaxValue + 1)}), + kRowsMightMatch); + ExpectShouldRead(Expressions::In(day_ts, {Literal::Long(kIntMaxValue + 1), + Literal::Long(kIntMaxValue + 2)}), + kRowCannotMatch); + ExpectShouldRead(Expressions::In(day_ts, {Literal::Long(kIntMaxValue + 6), + Literal::Long(kIntMaxValue + 7)}), + kRowCannotMatch); + + std::vector ids; + ids.reserve(401); + for (int i = -400; i <= 0; ++i) { + 
ids.emplace_back(Literal::Long(i)); + } + ExpectShouldRead(Expressions::In(day_ts, ids), kRowsMightMatch); +} + +TEST_F(InclusiveMetricsEvaluatorWithTransformTest, IntegerNotIn) { + auto day_ts = ToBoundTransform(Expressions::Day("ts")); + ExpectShouldRead(Expressions::NotIn(day_ts, {Literal::Long(kIntMinValue - 25), + Literal::Long(kIntMinValue - 24)}), + kRowsMightMatch); + ExpectShouldRead(Expressions::NotIn(day_ts, {Literal::Long(kIntMinValue - 2), + Literal::Long(kIntMinValue - 1)}), + kRowsMightMatch); + ExpectShouldRead(Expressions::NotIn(day_ts, {Literal::Long(kIntMinValue - 1), + Literal::Long(kIntMinValue)}), + kRowsMightMatch); + ExpectShouldRead(Expressions::NotIn(day_ts, {Literal::Long(kIntMaxValue - 4), + Literal::Long(kIntMaxValue - 3)}), + kRowsMightMatch); + ExpectShouldRead(Expressions::NotIn(day_ts, {Literal::Long(kIntMaxValue), + Literal::Long(kIntMaxValue + 1)}), + kRowsMightMatch); + ExpectShouldRead(Expressions::NotIn(day_ts, {Literal::Long(kIntMaxValue + 1), + Literal::Long(kIntMaxValue + 2)}), + kRowsMightMatch); + ExpectShouldRead(Expressions::NotIn(day_ts, {Literal::Long(kIntMaxValue + 6), + Literal::Long(kIntMaxValue + 7)}), + kRowsMightMatch); + + std::vector ids; + ids.reserve(401); + for (int i = -400; i <= 0; ++i) { + ids.emplace_back(Literal::Long(i)); + } + ExpectShouldRead(Expressions::NotIn(day_ts, ids), kRowsMightMatch); +} + +} // namespace iceberg diff --git a/src/iceberg/test/meson.build b/src/iceberg/test/meson.build index c3a401b52..b8fd6cef9 100644 --- a/src/iceberg/test/meson.build +++ b/src/iceberg/test/meson.build @@ -60,6 +60,8 @@ iceberg_tests = { 'aggregate_test.cc', 'expression_test.cc', 'expression_visitor_test.cc', + 'inclusive_metrics_evaluator_test.cc', + 'inclusive_metrics_evaluator_with_transform_test.cc', 'literal_test.cc', 'predicate_test.cc', ),