From e60b30d4c195c40b4dc6336cb1da256c5a0779ad Mon Sep 17 00:00:00 2001 From: "xiao.dong" Date: Wed, 26 Nov 2025 09:32:43 +0800 Subject: [PATCH 01/15] feat: add inclusive metrics evaluator --- src/iceberg/CMakeLists.txt | 1 + src/iceberg/expression/expression_visitor.h | 2 +- .../expression/inclusive_metrics_evaluator.cc | 543 ++++++++++++++++++ .../expression/inclusive_metrics_evaluator.h | 74 +++ src/iceberg/result.h | 2 + 5 files changed, 621 insertions(+), 1 deletion(-) create mode 100644 src/iceberg/expression/inclusive_metrics_evaluator.cc create mode 100644 src/iceberg/expression/inclusive_metrics_evaluator.h diff --git a/src/iceberg/CMakeLists.txt b/src/iceberg/CMakeLists.txt index d5429808c..b263d7f17 100644 --- a/src/iceberg/CMakeLists.txt +++ b/src/iceberg/CMakeLists.txt @@ -28,6 +28,7 @@ set(ICEBERG_SOURCES expression/literal.cc expression/predicate.cc expression/rewrite_not.cc + expression/inclusive_metrics_evaluator.cc expression/term.cc file_reader.cc file_writer.cc diff --git a/src/iceberg/expression/expression_visitor.h b/src/iceberg/expression/expression_visitor.h index d66382453..cda5f93ce 100644 --- a/src/iceberg/expression/expression_visitor.h +++ b/src/iceberg/expression/expression_visitor.h @@ -263,7 +263,7 @@ class ICEBERG_EXPORT BoundVisitor : public ExpressionVisitor { /// Bound visitors do not support unbound predicates. /// /// \param pred The unbound predicate - Result Predicate(const std::shared_ptr& pred) final { + Result Predicate(const std::shared_ptr& pred) override { ICEBERG_DCHECK(pred != nullptr, "UnboundPredicate cannot be null"); return NotSupported("Not a bound predicate: {}", pred->ToString()); } diff --git a/src/iceberg/expression/inclusive_metrics_evaluator.cc b/src/iceberg/expression/inclusive_metrics_evaluator.cc new file mode 100644 index 000000000..1015f677b --- /dev/null +++ b/src/iceberg/expression/inclusive_metrics_evaluator.cc @@ -0,0 +1,543 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/expression/inclusive_metrics_evaluator.h" + +#include "iceberg/expression/binder.h" +#include "iceberg/expression/expression_visitor.h" +#include "iceberg/expression/rewrite_not.h" +#include "iceberg/manifest_entry.h" +#include "iceberg/schema.h" +#include "iceberg/transform.h" +#include "iceberg/util/macros.h" + +namespace iceberg { + +namespace { +static const bool ROWS_MIGHT_MATCH = true; +static const bool ROWS_CANNOT_MATCH = false; +static const int32_t IN_PREDICATE_LIMIT = 200; +} // namespace + +class InclusiveMetricsVisitor : public BoundVisitor { + public: + explicit InclusiveMetricsVisitor(const DataFile& data_file) : data_file_(data_file) {} + + Result AlwaysTrue() override { return ROWS_MIGHT_MATCH; } + + Result AlwaysFalse() override { return ROWS_CANNOT_MATCH; } + + Result Not(bool child_result) override { return !child_result; } + + Result And(bool left_result, bool right_result) override { + return left_result && right_result; + } + + Result Or(bool left_result, bool right_result) override { + return left_result || right_result; + } + + Result IsNull(const std::shared_ptr& term) override { + if (IsNonNullPreserving(term)) { + // number of non-nulls is the same as for the ref + int id = term->reference()->field().field_id(); + if (!MayContainNull(id)) { + return ROWS_CANNOT_MATCH; + } + } + return ROWS_MIGHT_MATCH; + } + + Result NotNull(const std::shared_ptr& term) override { + // no need to check whether the field is required because binding evaluates that case + // if the column has no non-null values, the expression cannot match + + // all terms are null preserving. see #isNullPreserving(Bound) + int id = term->reference()->field().field_id(); + if (ContainsNullsOnly(id)) { + return ROWS_CANNOT_MATCH; + } + + return ROWS_MIGHT_MATCH; + } + + Result IsNaN(const std::shared_ptr& term) override { + // when there's no nanCounts information, but we already know the column only contains + // null, it's guaranteed that there's no NaN value + int id = term->reference()->field().field_id(); + if (ContainsNullsOnly(id)) { + return ROWS_CANNOT_MATCH; + } + auto ptr = std::dynamic_pointer_cast(term); + if (ptr == nullptr) { + return ROWS_MIGHT_MATCH; + } + if (!data_file_.nan_value_counts.empty() && + data_file_.nan_value_counts.contains(id) && + data_file_.nan_value_counts.at(id) == 0) { + return ROWS_CANNOT_MATCH; + } + return ROWS_MIGHT_MATCH; + } + + Result NotNaN(const std::shared_ptr& term) override { + auto ptr = std::dynamic_pointer_cast(term); + if (ptr == nullptr) { + // identity transforms are already removed by this time + return ROWS_MIGHT_MATCH; + } + + int id = term->reference()->field().field_id(); + + if (ContainsNaNsOnly(id)) { + return ROWS_CANNOT_MATCH; + } + + return ROWS_MIGHT_MATCH; + } + + Result Lt(const std::shared_ptr& term, const Literal& lit) override { + // all terms are null preserving. see #isNullPreserving(Bound) + int id = term->reference()->field().field_id(); + if (ContainsNullsOnly(id) || ContainsNaNsOnly(id)) { + return ROWS_CANNOT_MATCH; + } + auto lower_result = LowerBound(term); + if (!lower_result.has_value() || lower_result.value().IsNaN()) { + // NaN indicates unreliable bounds. See the InclusiveMetricsEvaluator docs for more. + return ROWS_MIGHT_MATCH; + } + const auto& lower = lower_result.value(); + + // this also works for transforms that are order preserving: + // if a transform f is order preserving, a < b means that f(a) <= f(b). + // because lower <= a for all values of a in the file, f(lower) <= f(a). + // when f(lower) >= X then f(a) >= f(lower) >= X, so there is no a such that f(a) < X + // f(lower) >= X means rows cannot match + if (lit >= lower) { + return ROWS_CANNOT_MATCH; + } + + return ROWS_MIGHT_MATCH; + } + + Result LtEq(const std::shared_ptr& term, const Literal& lit) override { + // all terms are null preserving. see #isNullPreserving(Bound) + int id = term->reference()->field().field_id(); + if (ContainsNullsOnly(id) || ContainsNaNsOnly(id)) { + return ROWS_CANNOT_MATCH; + } + + auto lower_result = LowerBound(term); + if (!lower_result.has_value() || lower_result.value().IsNaN()) { + // NaN indicates unreliable bounds. See the InclusiveMetricsEvaluator docs for more. + return ROWS_MIGHT_MATCH; + } + const auto& lower = lower_result.value(); + + // this also works for transforms that are order preserving: + // if a transform f is order preserving, a < b means that f(a) <= f(b). + // because lower <= a for all values of a in the file, f(lower) <= f(a). + // when f(lower) > X then f(a) >= f(lower) > X, so there is no a such that f(a) <= X + // f(lower) > X means rows cannot match + if (lit > lower) { + return ROWS_CANNOT_MATCH; + } + + return ROWS_MIGHT_MATCH; + } + + Result Gt(const std::shared_ptr& term, const Literal& lit) override { + // all terms are null preserving. see #isNullPreserving(Bound) + int id = term->reference()->field().field_id(); + if (ContainsNullsOnly(id) || ContainsNaNsOnly(id)) { + return ROWS_CANNOT_MATCH; + } + + auto upper_result = UpperBound(term); + if (!upper_result.has_value()) { + return ROWS_MIGHT_MATCH; + } + const auto& upper = upper_result.value(); + + if (lit <= upper) { + return ROWS_CANNOT_MATCH; + } + + return ROWS_MIGHT_MATCH; + } + + Result GtEq(const std::shared_ptr& term, const Literal& lit) override { + // all terms are null preserving. see #isNullPreserving(Bound) + int id = term->reference()->field().field_id(); + if (ContainsNullsOnly(id) || ContainsNaNsOnly(id)) { + return ROWS_CANNOT_MATCH; + } + + auto upper_result = UpperBound(term); + if (!upper_result.has_value()) { + return ROWS_MIGHT_MATCH; + } + const auto& upper = upper_result.value(); + if (lit < upper) { + return ROWS_CANNOT_MATCH; + } + + return ROWS_MIGHT_MATCH; + } + + Result Eq(const std::shared_ptr& term, const Literal& lit) override { + // all terms are null preserving. see #isNullPreserving(Bound) + int id = term->reference()->field().field_id(); + if (ContainsNullsOnly(id) || ContainsNaNsOnly(id)) { + return ROWS_CANNOT_MATCH; + } + + auto lower_result = LowerBound(term); + if (lower_result.has_value()) { + const auto& lower = lower_result.value(); + if (!lower.IsNaN() && lower > lit) { + return ROWS_CANNOT_MATCH; + } + } + + auto upper_result = UpperBound(term); + if (!upper_result.has_value()) { + return ROWS_MIGHT_MATCH; + } + const auto& upper = upper_result.value(); + if (upper != lit) { + return ROWS_CANNOT_MATCH; + } + + return ROWS_MIGHT_MATCH; + } + + Result NotEq(const std::shared_ptr& term, + const Literal& lit) override { + // because the bounds are not necessarily a min or max value, this cannot be answered + // using them. notEq(col, X) with (X, Y) doesn't guarantee that X is a value in col. + return ROWS_MIGHT_MATCH; + } + + Result In(const std::shared_ptr& term, + const BoundSetPredicate::LiteralSet& literal_set) override { + // all terms are null preserving. see #isNullPreserving(Bound) + int id = term->reference()->field().field_id(); + if (ContainsNullsOnly(id) || ContainsNaNsOnly(id)) { + return ROWS_CANNOT_MATCH; + } + + if (literal_set.size() > IN_PREDICATE_LIMIT) { + // skip evaluating the predicate if the number of values is too big + return ROWS_MIGHT_MATCH; + } + + auto lower_result = LowerBound(term); + if (!lower_result.has_value() || lower_result.value().IsNaN()) { + // NaN indicates unreliable bounds. See the InclusiveMetricsEvaluator docs for more. + return ROWS_MIGHT_MATCH; + } + const auto& lower = lower_result.value(); + std::vector literals; + for (const auto& lit : literal_set) { + if (lit >= lower) { + literals.emplace_back(lit); + } + } + // if all values are less than lower bound, rows cannot match + if (literals.empty()) { + return ROWS_CANNOT_MATCH; + } + + auto upper_result = UpperBound(term); + if (!upper_result.has_value()) { + return ROWS_MIGHT_MATCH; + } + const auto& upper = upper_result.value(); + std::erase_if(literals, [&](const Literal& x) { return x > upper; }); + // if remaining values are greater than upper bound, rows cannot match + if (literals.empty()) { + return ROWS_CANNOT_MATCH; + } + + return ROWS_MIGHT_MATCH; + } + + Result NotIn(const std::shared_ptr& term, + const BoundSetPredicate::LiteralSet& literal_set) override { + // because the bounds are not necessarily a min or max value, this cannot be answered + // using them. notIn(col, {X, ...}) with (X, Y) doesn't guarantee that X is a value in + // col. + return ROWS_MIGHT_MATCH; + } + + Result StartsWith(const std::shared_ptr& term, + const Literal& lit) override { + if (term->kind() == BoundTerm::Kind::kTransform && + std::dynamic_pointer_cast(term)->transform()->transform_type() != + TransformType::kIdentity) { + // truncate must be rewritten in binding. the result is either always or never + // compatible + return ROWS_MIGHT_MATCH; + } + + int id = term->reference()->field().field_id(); + if (ContainsNullsOnly(id)) { + return ROWS_CANNOT_MATCH; + } + if (lit.type()->type_id() != TypeId::kString) { + return ROWS_CANNOT_MATCH; + } + std::string prefix = get(lit.value()); + + auto lower_result = LowerBound(term); + if (!lower_result.has_value()) { + return ROWS_MIGHT_MATCH; + } + const auto& lower = lower_result.value(); + auto lower_str = get(lower.value()); + // truncate lower bound so that its length in bytes is not greater than the length of + // prefix + int length = std::min(prefix.size(), lower_str.size()); + // if prefix of lower bound is greater than prefix, rows cannot match + if (lower_str.substr(0, length) > prefix.substr(0, length)) { + return ROWS_CANNOT_MATCH; + } + + auto upper_result = UpperBound(term); + if (!upper_result.has_value()) { + return ROWS_MIGHT_MATCH; + } + const auto& upper = upper_result.value(); + auto upper_str = get(upper.value()); + // truncate upper bound so that its length in bytes is not greater than the length of + // prefix + length = std::min(prefix.size(), upper_str.size()); + // if prefix of upper bound is less than prefix, rows cannot match + if (upper_str.substr(0, length) < prefix.substr(0, length)) { + return ROWS_CANNOT_MATCH; + } + + return ROWS_MIGHT_MATCH; + } + + Result NotStartsWith(const std::shared_ptr& term, + const Literal& lit) override { + // the only transforms that produce strings are truncate and identity, which work with + // this + int id = term->reference()->field().field_id(); + if (MayContainNull(id)) { + return ROWS_MIGHT_MATCH; + } + + if (lit.type()->type_id() != TypeId::kString) { + return ROWS_CANNOT_MATCH; + } + std::string prefix = get(lit.value()); + + // notStartsWith will match unless all values must start with the prefix. This happens + // when the lower and upper bounds both start with the prefix. + auto lower_result = LowerBound(term); + auto upper_result = UpperBound(term); + if (!lower_result.has_value() || !upper_result.has_value()) { + return ROWS_MIGHT_MATCH; + } + const auto& lower = lower_result.value(); + const auto& upper = upper_result.value(); + auto lower_str = get(lower.value()); + auto upper_str = get(upper.value()); + + // if lower is shorter than the prefix then lower doesn't start with the prefix + if (lower_str.size() < prefix.size()) { + return ROWS_MIGHT_MATCH; + } + + if (prefix == lower_str.substr(0, prefix.size())) { + // if upper is shorter than the prefix then upper can't start with the prefix + if (upper_str.size() < prefix.size()) { + return ROWS_MIGHT_MATCH; + } + if (upper_str.substr(0, prefix.size()) == prefix) { + // both bounds match the prefix, so all rows must match the prefix and therefore + // do not satisfy the predicate + return ROWS_CANNOT_MATCH; + } + } + + return ROWS_MIGHT_MATCH; + } + + private: + bool MayContainNull(int32_t id) { + return data_file_.null_value_counts.empty() || + !data_file_.null_value_counts.contains(id) || + data_file_.null_value_counts.at(id) != 0; + } + + bool ContainsNullsOnly(int32_t id) { + return !data_file_.value_counts.empty() && data_file_.value_counts.contains(id) && + !data_file_.null_value_counts.empty() && + data_file_.null_value_counts.contains(id) && + data_file_.value_counts.at(id) - data_file_.null_value_counts.at(id) == 0; + } + + bool ContainsNaNsOnly(int32_t id) { + return !data_file_.nan_value_counts.empty() && + data_file_.nan_value_counts.contains(id) && !data_file_.value_counts.empty() && + data_file_.value_counts.at(id) == data_file_.nan_value_counts.at(id); + } + + Result LowerBound(const std::shared_ptr& term) { + if (term->kind() == BoundTerm::Kind::kReference) { + return ParseLowerBound(*std::dynamic_pointer_cast(term)); + } else if (term->kind() == BoundTerm::Kind::kTransform) { + return TransformLowerBound(*std::dynamic_pointer_cast(term)); + } else if (term->kind() == BoundTerm::Kind::kExtract) { + // TODO(xiao.dong) handle extract lower and upper bounds + return NotImplemented("Extract lower bound not implemented."); + } else { + return NotFound("Lower bound not found."); + } + } + + Result UpperBound(const std::shared_ptr& term) { + if (term->kind() == BoundTerm::Kind::kReference) { + return ParseUpperBound(*std::dynamic_pointer_cast(term)); + } else if (term->kind() == BoundTerm::Kind::kTransform) { + return TransformUpperBound(*std::dynamic_pointer_cast(term)); + } else if (term->kind() == BoundTerm::Kind::kExtract) { + // TODO(xiao.dong) handle extract lower and upper bounds + return NotImplemented("Extract upper bound not implemented."); + } else { + return NotFound("Upper bound not found."); + } + } + + Result ParseLowerBound(const BoundReference& ref) { + int id = ref.field().field_id(); + auto type = ref.type(); + if (!type->is_primitive()) { + return InvalidStats("Lower bound of non-primitive type is not supported."); + } + auto primitive_type = std::dynamic_pointer_cast(type); + if (!data_file_.lower_bounds.empty() && data_file_.lower_bounds.contains(id)) { + ICEBERG_ASSIGN_OR_RAISE( + auto lower, + Literal::Deserialize(data_file_.lower_bounds.at(id), primitive_type)); + return lower; + } + + return NotFound("Lower bound not found."); + } + + Result ParseUpperBound(const BoundReference& ref) { + int id = ref.field().field_id(); + auto type = ref.type(); + if (!type->is_primitive()) { + return InvalidStats("Upper bound of non-primitive type is not supported."); + } + auto primitive_type = std::dynamic_pointer_cast(type); + if (!data_file_.upper_bounds.empty() && data_file_.upper_bounds.contains(id)) { + ICEBERG_ASSIGN_OR_RAISE( + auto upper, + Literal::Deserialize(data_file_.upper_bounds.at(id), primitive_type)); + return upper; + } + + return NotFound("Upper bound not found."); + } + + Result TransformLowerBound(BoundTransform& boundTransform) { + auto transform = boundTransform.transform(); + if (transform->PreservesOrder()) { + ICEBERG_ASSIGN_OR_RAISE(auto lower, ParseLowerBound(*boundTransform.reference())); + ICEBERG_ASSIGN_OR_RAISE(auto transform_func, + transform->Bind(boundTransform.reference()->type())); + return transform_func->Transform(lower); + } + + return NotFound("Transform lower bound not found."); + } + + Result TransformUpperBound(BoundTransform& boundTransform) { + auto transform = boundTransform.transform(); + if (transform->PreservesOrder()) { + ICEBERG_ASSIGN_OR_RAISE(auto upper, ParseLowerBound(*boundTransform.reference())); + ICEBERG_ASSIGN_OR_RAISE(auto transform_func, + transform->Bind(boundTransform.reference()->type())); + return transform_func->Transform(upper); + } + + return NotFound("Transform upper bound not found."); + } + + // TODO(xiao.dong) handle extract lower and upper bounds + /* + Literal extractLowerBound(const BoundExtract& bound) { + + } + + Literal extractUpperBound(const BoundExtract& bound) { + + } +*/ + + /** Returns true if the expression term produces a non-null value for non-null input. */ + bool IsNonNullPreserving(const std::shared_ptr& term) { + if (term->kind() == BoundTerm::Kind::kReference) { + return true; + } else if (term->kind() == BoundTerm::Kind::kTransform) { + return std::dynamic_pointer_cast(term) + ->transform() + ->PreservesOrder(); + } + // a non-null variant does not necessarily contain a specific field + // and unknown bound terms are not non-null preserving + return false; + } + + private: + const DataFile& data_file_; +}; + +InclusiveMetricsEvaluator::InclusiveMetricsEvaluator( + const std::shared_ptr& expr) + : expr_(std::move(expr)) {} + +InclusiveMetricsEvaluator::~InclusiveMetricsEvaluator() = default; + +Result> InclusiveMetricsEvaluator::Make( + std::shared_ptr expr, const Schema& schema, bool case_sensitive) { + ICEBERG_ASSIGN_OR_RAISE(auto rewrite_expr, RewriteNot::Visit(std::move(expr))); + ICEBERG_ASSIGN_OR_RAISE(auto bound_expr, + Binder::Bind(schema, rewrite_expr, case_sensitive)); + return std::unique_ptr( + new InclusiveMetricsEvaluator(std::move(bound_expr))); +} + +Result InclusiveMetricsEvaluator::Eval(const DataFile& data_file) const { + if (data_file.record_count <= 0) { + return ROWS_CANNOT_MATCH; + } + InclusiveMetricsVisitor visitor(data_file); + return Visit(expr_, visitor); +} + +} // namespace iceberg diff --git a/src/iceberg/expression/inclusive_metrics_evaluator.h b/src/iceberg/expression/inclusive_metrics_evaluator.h new file mode 100644 index 000000000..4a5869f91 --- /dev/null +++ b/src/iceberg/expression/inclusive_metrics_evaluator.h @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +/// \file iceberg/expression/inclusive_metrics_evaluator.h +/// +/// Evaluates an Expression on a DataFile to test whether rows in the file may match. +/// +/// This evaluation is inclusive: it returns true if a file may match and false if it +/// cannot match. +/// +/// Files are passed to #eval(ContentFile), which returns true if the file may contain +/// matching rows and false if the file cannot contain matching rows. Files may be skipped +/// if and only if the return value of eval is false. +/// +/// Due to the comparison implementation of ORC stats, for float/double columns in ORC +/// files, if the first value in a file is NaN, metrics of this file will report NaN for +/// both upper and lower bound despite that the column could contain non-NaN data. Thus in +/// some scenarios explicitly checks for NaN is necessary in order to not skip files that +/// may contain matching data. +/// + +#include + +#include "iceberg/expression/expression.h" +#include "iceberg/iceberg_export.h" +#include "iceberg/result.h" +#include "iceberg/type_fwd.h" + +namespace iceberg { + +class ICEBERG_EXPORT InclusiveMetricsEvaluator { + public: + /// \brief Make a inclusive metrics evaluator + /// + /// \param expr The expression to evaluate + /// \param schema The schema of the table + /// \param case_sensitive Whether field name matching is case-sensitive + static Result> Make( + std::shared_ptr expr, const Schema& schema, bool case_sensitive = true); + + ~InclusiveMetricsEvaluator(); + + /// \brief Evaluate the expression against a DataFile. + /// + /// \param data_file The data file to evaluate + /// \return true if the file matches the expression, false otherwise, or error + Result Eval(const DataFile& data_file) const; + + private: + explicit InclusiveMetricsEvaluator(const std::shared_ptr& expr); + + private: + std::shared_ptr expr_; +}; + +} // namespace iceberg diff --git a/src/iceberg/result.h b/src/iceberg/result.h index ddc428a23..1bef45a29 100644 --- a/src/iceberg/result.h +++ b/src/iceberg/result.h @@ -43,6 +43,7 @@ enum class ErrorKind { kInvalidManifest, kInvalidManifestList, kInvalidSchema, + kInvalidStats, kIOError, kJsonParseError, kNamespaceNotEmpty, @@ -104,6 +105,7 @@ DEFINE_ERROR_FUNCTION(InvalidExpression) DEFINE_ERROR_FUNCTION(InvalidManifest) DEFINE_ERROR_FUNCTION(InvalidManifestList) DEFINE_ERROR_FUNCTION(InvalidSchema) +DEFINE_ERROR_FUNCTION(InvalidStats) DEFINE_ERROR_FUNCTION(IOError) DEFINE_ERROR_FUNCTION(JsonParseError) DEFINE_ERROR_FUNCTION(NamespaceNotEmpty) From 84970bbe3f95a68f0146665001387489710647d1 Mon Sep 17 00:00:00 2001 From: "xiao.dong" Date: Thu, 27 Nov 2025 16:04:21 +0800 Subject: [PATCH 02/15] fix cpplint --- src/iceberg/expression/inclusive_metrics_evaluator.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/iceberg/expression/inclusive_metrics_evaluator.cc b/src/iceberg/expression/inclusive_metrics_evaluator.cc index 1015f677b..21e2eecfb 100644 --- a/src/iceberg/expression/inclusive_metrics_evaluator.cc +++ b/src/iceberg/expression/inclusive_metrics_evaluator.cc @@ -369,12 +369,12 @@ class InclusiveMetricsVisitor : public BoundVisitor { return ROWS_MIGHT_MATCH; } - if (prefix == lower_str.substr(0, prefix.size())) { + if (lower_str.starts_with(prefix)) { // if upper is shorter than the prefix then upper can't start with the prefix if (upper_str.size() < prefix.size()) { return ROWS_MIGHT_MATCH; } - if (upper_str.substr(0, prefix.size()) == prefix) { + if (upper_str.starts_with(prefix)) { // both bounds match the prefix, so all rows must match the prefix and therefore // do not satisfy the predicate return ROWS_CANNOT_MATCH; From 77a44639989366553a9b1f4ffc6ea95060268735 Mon Sep 17 00:00:00 2001 From: "xiao.dong" Date: Fri, 28 Nov 2025 15:19:47 +0800 Subject: [PATCH 03/15] add case --- .../expression/inclusive_metrics_evaluator.cc | 21 +- .../expression/inclusive_metrics_evaluator.h | 3 +- src/iceberg/test/CMakeLists.txt | 1 + .../test/inclusive_metrics_evaluator_test.cc | 385 ++++++++++++++++++ 4 files changed, 399 insertions(+), 11 deletions(-) create mode 100644 src/iceberg/test/inclusive_metrics_evaluator_test.cc diff --git a/src/iceberg/expression/inclusive_metrics_evaluator.cc b/src/iceberg/expression/inclusive_metrics_evaluator.cc index 21e2eecfb..14ce029b5 100644 --- a/src/iceberg/expression/inclusive_metrics_evaluator.cc +++ b/src/iceberg/expression/inclusive_metrics_evaluator.cc @@ -130,7 +130,7 @@ class InclusiveMetricsVisitor : public BoundVisitor { // because lower <= a for all values of a in the file, f(lower) <= f(a). // when f(lower) >= X then f(a) >= f(lower) >= X, so there is no a such that f(a) < X // f(lower) >= X means rows cannot match - if (lit >= lower) { + if (lower >= lit) { return ROWS_CANNOT_MATCH; } @@ -156,7 +156,7 @@ class InclusiveMetricsVisitor : public BoundVisitor { // because lower <= a for all values of a in the file, f(lower) <= f(a). // when f(lower) > X then f(a) >= f(lower) > X, so there is no a such that f(a) <= X // f(lower) > X means rows cannot match - if (lit > lower) { + if (lower > lit) { return ROWS_CANNOT_MATCH; } @@ -176,7 +176,7 @@ class InclusiveMetricsVisitor : public BoundVisitor { } const auto& upper = upper_result.value(); - if (lit <= upper) { + if (upper <= lit) { return ROWS_CANNOT_MATCH; } @@ -195,7 +195,7 @@ class InclusiveMetricsVisitor : public BoundVisitor { return ROWS_MIGHT_MATCH; } const auto& upper = upper_result.value(); - if (lit < upper) { + if (upper < lit) { return ROWS_CANNOT_MATCH; } @@ -222,7 +222,7 @@ class InclusiveMetricsVisitor : public BoundVisitor { return ROWS_MIGHT_MATCH; } const auto& upper = upper_result.value(); - if (upper != lit) { + if (upper < lit) { return ROWS_CANNOT_MATCH; } @@ -257,7 +257,7 @@ class InclusiveMetricsVisitor : public BoundVisitor { const auto& lower = lower_result.value(); std::vector literals; for (const auto& lit : literal_set) { - if (lit >= lower) { + if (lower <= lit) { literals.emplace_back(lit); } } @@ -317,7 +317,7 @@ class InclusiveMetricsVisitor : public BoundVisitor { // prefix int length = std::min(prefix.size(), lower_str.size()); // if prefix of lower bound is greater than prefix, rows cannot match - if (lower_str.substr(0, length) > prefix.substr(0, length)) { + if (lower_str.substr(0, length) > prefix) { return ROWS_CANNOT_MATCH; } @@ -331,7 +331,7 @@ class InclusiveMetricsVisitor : public BoundVisitor { // prefix length = std::min(prefix.size(), upper_str.size()); // if prefix of upper bound is less than prefix, rows cannot match - if (upper_str.substr(0, length) < prefix.substr(0, length)) { + if (upper_str.substr(0, length) < prefix) { return ROWS_CANNOT_MATCH; } @@ -524,10 +524,11 @@ InclusiveMetricsEvaluator::InclusiveMetricsEvaluator( InclusiveMetricsEvaluator::~InclusiveMetricsEvaluator() = default; Result> InclusiveMetricsEvaluator::Make( - std::shared_ptr expr, const Schema& schema, bool case_sensitive) { + std::shared_ptr expr, const std::shared_ptr& schema, + bool case_sensitive) { ICEBERG_ASSIGN_OR_RAISE(auto rewrite_expr, RewriteNot::Visit(std::move(expr))); ICEBERG_ASSIGN_OR_RAISE(auto bound_expr, - Binder::Bind(schema, rewrite_expr, case_sensitive)); + Binder::Bind(*schema, rewrite_expr, case_sensitive)); return std::unique_ptr( new InclusiveMetricsEvaluator(std::move(bound_expr))); } diff --git a/src/iceberg/expression/inclusive_metrics_evaluator.h b/src/iceberg/expression/inclusive_metrics_evaluator.h index 4a5869f91..9e2c6d879 100644 --- a/src/iceberg/expression/inclusive_metrics_evaluator.h +++ b/src/iceberg/expression/inclusive_metrics_evaluator.h @@ -54,7 +54,8 @@ class ICEBERG_EXPORT InclusiveMetricsEvaluator { /// \param schema The schema of the table /// \param case_sensitive Whether field name matching is case-sensitive static Result> Make( - std::shared_ptr expr, const Schema& schema, bool case_sensitive = true); + std::shared_ptr expr, const std::shared_ptr& schema, + bool case_sensitive = true); ~InclusiveMetricsEvaluator(); diff --git a/src/iceberg/test/CMakeLists.txt b/src/iceberg/test/CMakeLists.txt index 21ccd4d66..c2ecb5350 100644 --- a/src/iceberg/test/CMakeLists.txt +++ b/src/iceberg/test/CMakeLists.txt @@ -95,6 +95,7 @@ add_iceberg_test(expression_test expression_test.cc expression_visitor_test.cc literal_test.cc + inclusive_metrics_evaluator_test.cc predicate_test.cc) add_iceberg_test(json_serde_test diff --git a/src/iceberg/test/inclusive_metrics_evaluator_test.cc b/src/iceberg/test/inclusive_metrics_evaluator_test.cc new file mode 100644 index 000000000..b0f8cb8d3 --- /dev/null +++ b/src/iceberg/test/inclusive_metrics_evaluator_test.cc @@ -0,0 +1,385 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/expression/inclusive_metrics_evaluator.h" + +#include + +#include "iceberg/expression/binder.h" +#include "iceberg/expression/expressions.h" +#include "iceberg/manifest_entry.h" +#include "iceberg/schema.h" +#include "iceberg/test/matchers.h" +#include "iceberg/type.h" + +namespace iceberg { + +namespace { +static const bool ROWS_MIGHT_MATCH = true; +static const bool ROWS_CANNOT_MATCH = false; +} // namespace +using TestVariant = std::variant; + +class InclusiveMetricsEvaluatorTest : public ::testing::Test { + protected: + void SetUp() override { + schema_ = std::make_shared( + std::vector{ + SchemaField::MakeRequired(1, "id", int64()), + SchemaField::MakeOptional(2, "name", string()), + SchemaField::MakeRequired(3, "age", int32()), + SchemaField::MakeOptional(4, "salary", float64()), + SchemaField::MakeRequired(5, "active", boolean()), + SchemaField::MakeRequired(6, "date", string()), + }, + /*schema_id=*/0); + } + + Result> Bind(const std::shared_ptr& expr, + bool case_sensitive = true) { + return Binder::Bind(*schema_, expr, case_sensitive); + } + + std::shared_ptr PrepareDataFile( + const std::string& partition, int64_t record_count, int64_t file_size_in_bytes, + const std::map& lower_bounds, + const std::map& upper_bounds, + const std::map& value_counts = {}, + const std::map& null_counts = {}, + const std::map& nan_counts = {}) { + auto parse_bound = [&](const std::map& bounds, + std::map>& bound_values) { + for (const auto& [key, value] : bounds) { + if (key == "id") { + bound_values[1] = Literal::Long(std::get(value)).Serialize().value(); + } else if (key == "name") { + bound_values[2] = + Literal::String(std::get(value)).Serialize().value(); + } else if (key == "age") { + bound_values[3] = Literal::Int(std::get(value)).Serialize().value(); + } else if (key == "salary") { + bound_values[4] = Literal::Double(std::get(value)).Serialize().value(); + } else if (key == "active") { + bound_values[5] = Literal::Boolean(std::get(value)).Serialize().value(); + } + } + }; + + auto data_file = std::make_shared(); + data_file->file_path = "test_path"; + data_file->file_format = FileFormatType::kParquet; + data_file->partition.emplace_back(Literal::String(partition)); + data_file->record_count = record_count; + data_file->file_size_in_bytes = file_size_in_bytes; + data_file->column_sizes = {}; + data_file->value_counts = value_counts; + data_file->null_value_counts = null_counts; + data_file->nan_value_counts = nan_counts; + data_file->split_offsets = {1}; + data_file->sort_order_id = 0; + parse_bound(upper_bounds, data_file->upper_bounds); + parse_bound(lower_bounds, data_file->lower_bounds); + return data_file; + } + + void TestCase(const std::shared_ptr& unbound, bool expected_result) { + ICEBERG_UNWRAP_OR_FAIL(auto evaluator, + InclusiveMetricsEvaluator::Make(unbound, schema_, true)); + auto file = PrepareDataFile("20251128", 10, 1024, {{"id", int64_t(100)}}, + {{"id", int64_t(200)}}); + auto result = evaluator->Eval(*file); + ASSERT_TRUE(result.has_value()); + ASSERT_EQ(result.value(), expected_result) << unbound->ToString(); + } + + void TestStringCase(const std::shared_ptr& unbound, bool expected_result) { + ICEBERG_UNWRAP_OR_FAIL(auto evaluator, + InclusiveMetricsEvaluator::Make(unbound, schema_, true)); + auto file = PrepareDataFile("20251128", 10, 1024, {{"name", "123"}}, + {{"name", "456"}}, {{2, 10}}, {{2, 0}}); + auto result = evaluator->Eval(*file); + ASSERT_TRUE(result.has_value()); + ASSERT_EQ(result.value(), expected_result) << unbound->ToString(); + } + + std::shared_ptr schema_; +}; + +TEST_F(InclusiveMetricsEvaluatorTest, CaseSensitiveTest) { + { + auto unbound = Expressions::Equal("id", Literal::Long(300)); + auto evaluator = InclusiveMetricsEvaluator::Make(unbound, schema_, true); + ASSERT_TRUE(evaluator.has_value()); + } + { + auto unbound = Expressions::Equal("ID", Literal::Long(300)); + auto evaluator = InclusiveMetricsEvaluator::Make(unbound, schema_, true); + ASSERT_FALSE(evaluator.has_value()); + ASSERT_EQ(evaluator.error().kind, ErrorKind::kInvalidExpression); + } + { + auto unbound = Expressions::Equal("ID", Literal::Long(300)); + auto evaluator = InclusiveMetricsEvaluator::Make(unbound, schema_, false); + ASSERT_TRUE(evaluator.has_value()); + } +} + +TEST_F(InclusiveMetricsEvaluatorTest, IsNullTest) { + { + auto unbound = Expressions::IsNull("name"); + ICEBERG_UNWRAP_OR_FAIL(auto evaluator, + InclusiveMetricsEvaluator::Make(unbound, schema_, true)); + auto file = PrepareDataFile("20251128", 10, 1024, {{"name", "1"}}, {{"name", "2"}}, + {{2, 10}}, {{2, 5}}, {}); + auto result = evaluator->Eval(*file); + ASSERT_TRUE(result.has_value()); + ASSERT_EQ(result.value(), ROWS_MIGHT_MATCH) << unbound->ToString(); + } + { + auto unbound = Expressions::IsNull("name"); + ICEBERG_UNWRAP_OR_FAIL(auto evaluator, + InclusiveMetricsEvaluator::Make(unbound, schema_, true)); + auto file = PrepareDataFile("20251128", 10, 1024, {{"name", "1"}}, {{"name", "2"}}, + {{2, 10}}, {{2, 0}}, {}); + auto result = evaluator->Eval(*file); + ASSERT_TRUE(result.has_value()); + ASSERT_EQ(result.value(), ROWS_CANNOT_MATCH) << unbound->ToString(); + } +} + +TEST_F(InclusiveMetricsEvaluatorTest, NotNullTest) { + { + auto unbound = Expressions::NotNull("name"); + ICEBERG_UNWRAP_OR_FAIL(auto evaluator, + InclusiveMetricsEvaluator::Make(unbound, schema_, true)); + auto file = PrepareDataFile("20251128", 10, 1024, {{"name", "1"}}, {{"name", "2"}}, + {{2, 10}}, {{2, 5}}, {}); + auto result = evaluator->Eval(*file); + ASSERT_TRUE(result.has_value()); + ASSERT_EQ(result.value(), ROWS_MIGHT_MATCH) << unbound->ToString(); + } + { + auto unbound = Expressions::NotNull("name"); + ICEBERG_UNWRAP_OR_FAIL(auto evaluator, + InclusiveMetricsEvaluator::Make(unbound, schema_, true)); + auto file = PrepareDataFile("20251128", 10, 1024, {{"name", "1"}}, {{"name", "2"}}, + {{2, 10}}, {{2, 10}}, {}); + auto result = evaluator->Eval(*file); + ASSERT_TRUE(result.has_value()); + ASSERT_EQ(result.value(), ROWS_CANNOT_MATCH) << unbound->ToString(); + } +} + +TEST_F(InclusiveMetricsEvaluatorTest, IsNanTest) { + { + auto unbound = Expressions::IsNaN("salary"); + ICEBERG_UNWRAP_OR_FAIL(auto evaluator, + InclusiveMetricsEvaluator::Make(unbound, schema_, true)); + auto file = PrepareDataFile("20251128", 10, 1024, {{"salary", 1.0}}, + {{"salary", 2.0}}, {{4, 10}}, {{4, 5}}, {{4, 5}}); + auto result = evaluator->Eval(*file); + ASSERT_TRUE(result.has_value()); + ASSERT_EQ(result.value(), ROWS_MIGHT_MATCH) << unbound->ToString(); + } + { + auto unbound = Expressions::IsNaN("salary"); + ICEBERG_UNWRAP_OR_FAIL(auto evaluator, + InclusiveMetricsEvaluator::Make(unbound, schema_, true)); + auto file = PrepareDataFile("20251128", 10, 1024, {{"salary", 1.0}}, + {{"salary", 2.0}}, {{4, 10}}, {{4, 10}}, {{4, 5}}); + auto result = evaluator->Eval(*file); + ASSERT_TRUE(result.has_value()); + ASSERT_EQ(result.value(), ROWS_CANNOT_MATCH) << unbound->ToString(); + } + { + auto unbound = Expressions::IsNaN("salary"); + ICEBERG_UNWRAP_OR_FAIL(auto evaluator, + InclusiveMetricsEvaluator::Make(unbound, schema_, true)); + auto file = PrepareDataFile("20251128", 10, 1024, {{"salary", 1.0}}, + {{"salary", 2.0}}, {{4, 10}}, {{4, 5}}, {{4, 0}}); + auto result = evaluator->Eval(*file); + ASSERT_TRUE(result.has_value()); + ASSERT_EQ(result.value(), ROWS_CANNOT_MATCH) << unbound->ToString(); + } +} + +TEST_F(InclusiveMetricsEvaluatorTest, NotNanTest) { + { + auto unbound = Expressions::NotNaN("salary"); + ICEBERG_UNWRAP_OR_FAIL(auto evaluator, + InclusiveMetricsEvaluator::Make(unbound, schema_, true)); + auto file = PrepareDataFile("20251128", 10, 1024, {{"salary", 1.0}}, + {{"salary", 2.0}}, {{4, 10}}, {}, {{4, 5}}); + auto result = evaluator->Eval(*file); + ASSERT_TRUE(result.has_value()); + ASSERT_EQ(result.value(), ROWS_MIGHT_MATCH) << unbound->ToString(); + } + { + auto unbound = Expressions::NotNaN("salary"); + ICEBERG_UNWRAP_OR_FAIL(auto evaluator, + InclusiveMetricsEvaluator::Make(unbound, schema_, true)); + auto file = PrepareDataFile("20251128", 10, 1024, {{"salary", 1.0}}, + {{"salary", 2.0}}, {{4, 10}}, {}, {{4, 10}}); + auto result = evaluator->Eval(*file); + ASSERT_TRUE(result.has_value()); + ASSERT_EQ(result.value(), ROWS_CANNOT_MATCH) << unbound->ToString(); + } +} + +TEST_F(InclusiveMetricsEvaluatorTest, LTTest) { + TestCase(Expressions::LessThan("id", Literal::Long(300)), ROWS_MIGHT_MATCH); + TestCase(Expressions::LessThan("id", Literal::Long(150)), ROWS_MIGHT_MATCH); + TestCase(Expressions::LessThan("id", Literal::Long(100)), ROWS_CANNOT_MATCH); + TestCase(Expressions::LessThan("id", Literal::Long(200)), ROWS_MIGHT_MATCH); + TestCase(Expressions::LessThan("id", Literal::Long(99)), ROWS_CANNOT_MATCH); +} + +TEST_F(InclusiveMetricsEvaluatorTest, LTEQTest) { + TestCase(Expressions::LessThanOrEqual("id", Literal::Long(300)), ROWS_MIGHT_MATCH); + TestCase(Expressions::LessThanOrEqual("id", Literal::Long(150)), ROWS_MIGHT_MATCH); + TestCase(Expressions::LessThanOrEqual("id", Literal::Long(100)), ROWS_MIGHT_MATCH); + TestCase(Expressions::LessThanOrEqual("id", Literal::Long(200)), ROWS_MIGHT_MATCH); + TestCase(Expressions::LessThanOrEqual("id", Literal::Long(99)), ROWS_CANNOT_MATCH); +} + +TEST_F(InclusiveMetricsEvaluatorTest, GTTest) { + TestCase(Expressions::GreaterThan("id", Literal::Long(300)), ROWS_CANNOT_MATCH); + TestCase(Expressions::GreaterThan("id", Literal::Long(150)), ROWS_MIGHT_MATCH); + TestCase(Expressions::GreaterThan("id", Literal::Long(100)), ROWS_MIGHT_MATCH); + TestCase(Expressions::GreaterThan("id", Literal::Long(200)), ROWS_CANNOT_MATCH); + TestCase(Expressions::GreaterThan("id", Literal::Long(99)), ROWS_MIGHT_MATCH); +} + +TEST_F(InclusiveMetricsEvaluatorTest, GTEQTest) { + TestCase(Expressions::GreaterThanOrEqual("id", Literal::Long(300)), ROWS_CANNOT_MATCH); + TestCase(Expressions::GreaterThanOrEqual("id", Literal::Long(150)), ROWS_MIGHT_MATCH); + TestCase(Expressions::GreaterThanOrEqual("id", Literal::Long(100)), ROWS_MIGHT_MATCH); + TestCase(Expressions::GreaterThanOrEqual("id", Literal::Long(200)), ROWS_MIGHT_MATCH); + TestCase(Expressions::GreaterThanOrEqual("id", Literal::Long(99)), ROWS_MIGHT_MATCH); +} + +TEST_F(InclusiveMetricsEvaluatorTest, EQTest) { + TestCase(Expressions::Equal("id", Literal::Long(300)), ROWS_CANNOT_MATCH); + TestCase(Expressions::Equal("id", Literal::Long(150)), ROWS_MIGHT_MATCH); + TestCase(Expressions::Equal("id", Literal::Long(100)), ROWS_MIGHT_MATCH); + TestCase(Expressions::Equal("id", Literal::Long(200)), ROWS_MIGHT_MATCH); +} + +TEST_F(InclusiveMetricsEvaluatorTest, NotEqTest) { + TestCase(Expressions::NotEqual("id", Literal::Long(300)), ROWS_MIGHT_MATCH); + TestCase(Expressions::NotEqual("id", Literal::Long(150)), ROWS_MIGHT_MATCH); + TestCase(Expressions::NotEqual("id", Literal::Long(100)), ROWS_MIGHT_MATCH); + TestCase(Expressions::NotEqual("id", Literal::Long(200)), ROWS_MIGHT_MATCH); +} + +TEST_F(InclusiveMetricsEvaluatorTest, InTest) { + TestCase(Expressions::In("id", + { + Literal::Long(300), + Literal::Long(400), + Literal::Long(500), + }), + ROWS_CANNOT_MATCH); + TestCase(Expressions::In("id", + { + Literal::Long(150), + Literal::Long(300), + }), + ROWS_MIGHT_MATCH); + TestCase(Expressions::In("id", {Literal::Long(100)}), ROWS_MIGHT_MATCH); + TestCase(Expressions::In("id", {Literal::Long(200)}), ROWS_MIGHT_MATCH); + TestCase(Expressions::In("id", + { + Literal::Long(99), + Literal::Long(201), + }), + ROWS_CANNOT_MATCH); +} + +TEST_F(InclusiveMetricsEvaluatorTest, NotInTest) { + TestCase(Expressions::NotIn("id", + { + Literal::Long(300), + Literal::Long(400), + Literal::Long(500), + }), + ROWS_MIGHT_MATCH); + TestCase(Expressions::NotIn("id", + { + Literal::Long(150), + Literal::Long(300), + }), + ROWS_MIGHT_MATCH); + TestCase(Expressions::NotIn("id", + { + Literal::Long(100), + Literal::Long(200), + }), + ROWS_MIGHT_MATCH); + TestCase(Expressions::NotIn("id", + { + Literal::Long(99), + Literal::Long(201), + }), + ROWS_MIGHT_MATCH); +} + +TEST_F(InclusiveMetricsEvaluatorTest, StartsWithTest) { + TestStringCase(Expressions::StartsWith("name", "1"), ROWS_MIGHT_MATCH); + TestStringCase(Expressions::StartsWith("name", "4"), ROWS_MIGHT_MATCH); + TestStringCase(Expressions::StartsWith("name", "12"), ROWS_MIGHT_MATCH); + TestStringCase(Expressions::StartsWith("name", "45"), ROWS_MIGHT_MATCH); + TestStringCase(Expressions::StartsWith("name", "123"), ROWS_MIGHT_MATCH); + TestStringCase(Expressions::StartsWith("name", "456"), ROWS_MIGHT_MATCH); + TestStringCase(Expressions::StartsWith("name", "1234"), ROWS_MIGHT_MATCH); + TestStringCase(Expressions::StartsWith("name", "4567"), ROWS_CANNOT_MATCH); + TestStringCase(Expressions::StartsWith("name", "78"), ROWS_CANNOT_MATCH); + TestStringCase(Expressions::StartsWith("name", "7"), ROWS_CANNOT_MATCH); + TestStringCase(Expressions::StartsWith("name", "A"), ROWS_CANNOT_MATCH); +} + +TEST_F(InclusiveMetricsEvaluatorTest, NotStartsWithTest) { + TestStringCase(Expressions::NotStartsWith("name", "1"), ROWS_MIGHT_MATCH); + TestStringCase(Expressions::NotStartsWith("name", "4"), ROWS_MIGHT_MATCH); + TestStringCase(Expressions::NotStartsWith("name", "12"), ROWS_MIGHT_MATCH); + TestStringCase(Expressions::NotStartsWith("name", "45"), ROWS_MIGHT_MATCH); + TestStringCase(Expressions::NotStartsWith("name", "123"), ROWS_MIGHT_MATCH); + TestStringCase(Expressions::NotStartsWith("name", "456"), ROWS_MIGHT_MATCH); + TestStringCase(Expressions::NotStartsWith("name", "1234"), ROWS_MIGHT_MATCH); + TestStringCase(Expressions::NotStartsWith("name", "4567"), ROWS_MIGHT_MATCH); + TestStringCase(Expressions::NotStartsWith("name", "78"), ROWS_MIGHT_MATCH); + TestStringCase(Expressions::NotStartsWith("name", "7"), ROWS_MIGHT_MATCH); + TestStringCase(Expressions::NotStartsWith("name", "A"), ROWS_MIGHT_MATCH); + + auto test_case = [&](const std::string& prefix, bool expected_result) { + auto unbound = Expressions::NotStartsWith("name", prefix); + ICEBERG_UNWRAP_OR_FAIL(auto evaluator, + InclusiveMetricsEvaluator::Make(unbound, schema_, true)); + auto file = PrepareDataFile("20251128", 10, 1024, {{"name", "123"}}, + {{"name", "123"}}, {{2, 10}}, {{2, 0}}); + auto result = evaluator->Eval(*file); + ASSERT_TRUE(result.has_value()); + ASSERT_EQ(result.value(), expected_result) << unbound->ToString(); + }; + test_case("12", ROWS_CANNOT_MATCH); + test_case("123", ROWS_CANNOT_MATCH); + test_case("1234", ROWS_MIGHT_MATCH); +} + +} // namespace iceberg From e40c09bd0427949eee4589cb9366e23bfaea6d7b Mon Sep 17 00:00:00 2001 From: "xiao.dong" Date: Fri, 28 Nov 2025 15:32:28 +0800 Subject: [PATCH 04/15] fix cpplint --- src/iceberg/test/inclusive_metrics_evaluator_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/iceberg/test/inclusive_metrics_evaluator_test.cc b/src/iceberg/test/inclusive_metrics_evaluator_test.cc index b0f8cb8d3..04b72bf99 100644 --- a/src/iceberg/test/inclusive_metrics_evaluator_test.cc +++ b/src/iceberg/test/inclusive_metrics_evaluator_test.cc @@ -101,8 +101,8 @@ class InclusiveMetricsEvaluatorTest : public ::testing::Test { void TestCase(const std::shared_ptr& unbound, bool expected_result) { ICEBERG_UNWRAP_OR_FAIL(auto evaluator, InclusiveMetricsEvaluator::Make(unbound, schema_, true)); - auto file = PrepareDataFile("20251128", 10, 1024, {{"id", int64_t(100)}}, - {{"id", int64_t(200)}}); + auto file = PrepareDataFile("20251128", 10, 1024, {{"id", static_cast(100)}}, + {{"id", static_cast(200)}}); auto result = evaluator->Eval(*file); ASSERT_TRUE(result.has_value()); ASSERT_EQ(result.value(), expected_result) << unbound->ToString(); From af85e080db4f9dd3396fa231904cc7beee275a35 Mon Sep 17 00:00:00 2001 From: "xiao.dong" Date: Fri, 28 Nov 2025 16:36:28 +0800 Subject: [PATCH 05/15] revert new error code --- src/iceberg/expression/expression_visitor.h | 2 +- src/iceberg/expression/inclusive_metrics_evaluator.cc | 4 ++-- src/iceberg/result.h | 2 -- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/src/iceberg/expression/expression_visitor.h b/src/iceberg/expression/expression_visitor.h index cda5f93ce..d66382453 100644 --- a/src/iceberg/expression/expression_visitor.h +++ b/src/iceberg/expression/expression_visitor.h @@ -263,7 +263,7 @@ class ICEBERG_EXPORT BoundVisitor : public ExpressionVisitor { /// Bound visitors do not support unbound predicates. /// /// \param pred The unbound predicate - Result Predicate(const std::shared_ptr& pred) override { + Result Predicate(const std::shared_ptr& pred) final { ICEBERG_DCHECK(pred != nullptr, "UnboundPredicate cannot be null"); return NotSupported("Not a bound predicate: {}", pred->ToString()); } diff --git a/src/iceberg/expression/inclusive_metrics_evaluator.cc b/src/iceberg/expression/inclusive_metrics_evaluator.cc index 14ce029b5..6938570bb 100644 --- a/src/iceberg/expression/inclusive_metrics_evaluator.cc +++ b/src/iceberg/expression/inclusive_metrics_evaluator.cc @@ -434,7 +434,7 @@ class InclusiveMetricsVisitor : public BoundVisitor { int id = ref.field().field_id(); auto type = ref.type(); if (!type->is_primitive()) { - return InvalidStats("Lower bound of non-primitive type is not supported."); + return Invalid("Lower bound of non-primitive type is not supported."); } auto primitive_type = std::dynamic_pointer_cast(type); if (!data_file_.lower_bounds.empty() && data_file_.lower_bounds.contains(id)) { @@ -451,7 +451,7 @@ class InclusiveMetricsVisitor : public BoundVisitor { int id = ref.field().field_id(); auto type = ref.type(); if (!type->is_primitive()) { - return InvalidStats("Upper bound of non-primitive type is not supported."); + return Invalid("Upper bound of non-primitive type is not supported."); } auto primitive_type = std::dynamic_pointer_cast(type); if (!data_file_.upper_bounds.empty() && data_file_.upper_bounds.contains(id)) { diff --git a/src/iceberg/result.h b/src/iceberg/result.h index 1bef45a29..ddc428a23 100644 --- a/src/iceberg/result.h +++ b/src/iceberg/result.h @@ -43,7 +43,6 @@ enum class ErrorKind { kInvalidManifest, kInvalidManifestList, kInvalidSchema, - kInvalidStats, kIOError, kJsonParseError, kNamespaceNotEmpty, @@ -105,7 +104,6 @@ DEFINE_ERROR_FUNCTION(InvalidExpression) DEFINE_ERROR_FUNCTION(InvalidManifest) DEFINE_ERROR_FUNCTION(InvalidManifestList) DEFINE_ERROR_FUNCTION(InvalidSchema) -DEFINE_ERROR_FUNCTION(InvalidStats) DEFINE_ERROR_FUNCTION(IOError) DEFINE_ERROR_FUNCTION(JsonParseError) DEFINE_ERROR_FUNCTION(NamespaceNotEmpty) From 931cb85424e3fa0e310b7d4d6def0cbb3329aa1c Mon Sep 17 00:00:00 2001 From: "xiao.dong" Date: Mon, 1 Dec 2025 09:51:19 +0800 Subject: [PATCH 06/15] modify meson build files --- src/iceberg/meson.build | 1 + src/iceberg/test/meson.build | 1 + 2 files changed, 2 insertions(+) diff --git a/src/iceberg/meson.build b/src/iceberg/meson.build index d52739be9..053971798 100644 --- a/src/iceberg/meson.build +++ b/src/iceberg/meson.build @@ -47,6 +47,7 @@ iceberg_sources = files( 'expression/evaluator.cc', 'expression/expression.cc', 'expression/expressions.cc', + 'expression/inclusive_metrics_evaluator.cc', 'expression/literal.cc', 'expression/predicate.cc', 'expression/rewrite_not.cc', diff --git a/src/iceberg/test/meson.build b/src/iceberg/test/meson.build index c3a401b52..331273dac 100644 --- a/src/iceberg/test/meson.build +++ b/src/iceberg/test/meson.build @@ -60,6 +60,7 @@ iceberg_tests = { 'aggregate_test.cc', 'expression_test.cc', 'expression_visitor_test.cc', + 'inclusive_metrics_evaluator_test.cc', 'literal_test.cc', 'predicate_test.cc', ), From 6f7c2add16dc6973385ada3f4816b51d200d45d3 Mon Sep 17 00:00:00 2001 From: "xiao.dong" Date: Mon, 1 Dec 2025 10:07:20 +0800 Subject: [PATCH 07/15] resolve conflict --- src/iceberg/test/inclusive_metrics_evaluator_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/iceberg/test/inclusive_metrics_evaluator_test.cc b/src/iceberg/test/inclusive_metrics_evaluator_test.cc index 04b72bf99..6f168b578 100644 --- a/src/iceberg/test/inclusive_metrics_evaluator_test.cc +++ b/src/iceberg/test/inclusive_metrics_evaluator_test.cc @@ -84,7 +84,7 @@ class InclusiveMetricsEvaluatorTest : public ::testing::Test { auto data_file = std::make_shared(); data_file->file_path = "test_path"; data_file->file_format = FileFormatType::kParquet; - data_file->partition.emplace_back(Literal::String(partition)); + data_file->partition.AddValue(Literal::String(partition)); data_file->record_count = record_count; data_file->file_size_in_bytes = file_size_in_bytes; data_file->column_sizes = {}; From 7d407920628985bca0d98716168b99ed3e7add64 Mon Sep 17 00:00:00 2001 From: "xiao.dong" Date: Mon, 1 Dec 2025 14:30:13 +0800 Subject: [PATCH 08/15] fix comments && rebase main --- src/iceberg/CMakeLists.txt | 2 +- .../expression/inclusive_metrics_evaluator.cc | 393 +++++++++--------- .../expression/inclusive_metrics_evaluator.h | 5 +- .../test/inclusive_metrics_evaluator_test.cc | 32 +- 4 files changed, 219 insertions(+), 213 deletions(-) diff --git a/src/iceberg/CMakeLists.txt b/src/iceberg/CMakeLists.txt index b263d7f17..7c71f6510 100644 --- a/src/iceberg/CMakeLists.txt +++ b/src/iceberg/CMakeLists.txt @@ -25,10 +25,10 @@ set(ICEBERG_SOURCES expression/evaluator.cc expression/expression.cc expression/expressions.cc + expression/inclusive_metrics_evaluator.cc expression/literal.cc expression/predicate.cc expression/rewrite_not.cc - expression/inclusive_metrics_evaluator.cc expression/term.cc file_reader.cc file_writer.cc diff --git a/src/iceberg/expression/inclusive_metrics_evaluator.cc b/src/iceberg/expression/inclusive_metrics_evaluator.cc index 6938570bb..1dedda1fa 100644 --- a/src/iceberg/expression/inclusive_metrics_evaluator.cc +++ b/src/iceberg/expression/inclusive_metrics_evaluator.cc @@ -22,7 +22,7 @@ #include "iceberg/expression/binder.h" #include "iceberg/expression/expression_visitor.h" #include "iceberg/expression/rewrite_not.h" -#include "iceberg/manifest_entry.h" +#include "iceberg/manifest/manifest_entry.h" #include "iceberg/schema.h" #include "iceberg/transform.h" #include "iceberg/util/macros.h" @@ -30,18 +30,18 @@ namespace iceberg { namespace { -static const bool ROWS_MIGHT_MATCH = true; -static const bool ROWS_CANNOT_MATCH = false; -static const int32_t IN_PREDICATE_LIMIT = 200; +constexpr bool kRowsMightMatch = true; +constexpr bool kRowCannotMatch = false; +constexpr int32_t kInPredicateLimit = 200; } // namespace class InclusiveMetricsVisitor : public BoundVisitor { public: explicit InclusiveMetricsVisitor(const DataFile& data_file) : data_file_(data_file) {} - Result AlwaysTrue() override { return ROWS_MIGHT_MATCH; } + Result AlwaysTrue() override { return kRowsMightMatch; } - Result AlwaysFalse() override { return ROWS_CANNOT_MATCH; } + Result AlwaysFalse() override { return kRowCannotMatch; } Result Not(bool child_result) override { return !child_result; } @@ -53,75 +53,75 @@ class InclusiveMetricsVisitor : public BoundVisitor { return left_result || right_result; } - Result IsNull(const std::shared_ptr& term) override { - if (IsNonNullPreserving(term)) { + Result IsNull(const std::shared_ptr& expr) override { + // no need to check whether the field is required because binding evaluates that case + // if the column has no null values, the expression cannot match + if (IsNonNullPreserving(expr)) { // number of non-nulls is the same as for the ref - int id = term->reference()->field().field_id(); + int32_t id = expr->reference()->field().field_id(); if (!MayContainNull(id)) { - return ROWS_CANNOT_MATCH; + return kRowCannotMatch; } } - return ROWS_MIGHT_MATCH; + return kRowsMightMatch; } - Result NotNull(const std::shared_ptr& term) override { + Result NotNull(const std::shared_ptr& expr) override { // no need to check whether the field is required because binding evaluates that case // if the column has no non-null values, the expression cannot match // all terms are null preserving. see #isNullPreserving(Bound) - int id = term->reference()->field().field_id(); + int32_t id = expr->reference()->field().field_id(); if (ContainsNullsOnly(id)) { - return ROWS_CANNOT_MATCH; + return kRowCannotMatch; } - return ROWS_MIGHT_MATCH; + return kRowsMightMatch; } - Result IsNaN(const std::shared_ptr& term) override { + Result IsNaN(const std::shared_ptr& expr) override { // when there's no nanCounts information, but we already know the column only contains // null, it's guaranteed that there's no NaN value - int id = term->reference()->field().field_id(); + int32_t id = expr->reference()->field().field_id(); if (ContainsNullsOnly(id)) { - return ROWS_CANNOT_MATCH; + return kRowCannotMatch; } - auto ptr = std::dynamic_pointer_cast(term); - if (ptr == nullptr) { - return ROWS_MIGHT_MATCH; + if (expr->reference()->kind() != BoundTerm::Kind::kReference) { + return kRowsMightMatch; } - if (!data_file_.nan_value_counts.empty() && - data_file_.nan_value_counts.contains(id) && - data_file_.nan_value_counts.at(id) == 0) { - return ROWS_CANNOT_MATCH; + auto it = data_file_.nan_value_counts.find(id); + if (it != data_file_.nan_value_counts.end() && it->second == 0) { + return kRowCannotMatch; } - return ROWS_MIGHT_MATCH; + return kRowsMightMatch; } - Result NotNaN(const std::shared_ptr& term) override { - auto ptr = std::dynamic_pointer_cast(term); - if (ptr == nullptr) { + Result NotNaN(const std::shared_ptr& expr) override { + if (expr->reference()->kind() != BoundTerm::Kind::kReference) { // identity transforms are already removed by this time - return ROWS_MIGHT_MATCH; + return kRowsMightMatch; } - int id = term->reference()->field().field_id(); + int32_t id = expr->reference()->field().field_id(); if (ContainsNaNsOnly(id)) { - return ROWS_CANNOT_MATCH; + return kRowCannotMatch; } - return ROWS_MIGHT_MATCH; + return kRowsMightMatch; } - Result Lt(const std::shared_ptr& term, const Literal& lit) override { + Result Lt(const std::shared_ptr& expr, const Literal& lit) override { // all terms are null preserving. see #isNullPreserving(Bound) - int id = term->reference()->field().field_id(); + int32_t id = expr->reference()->field().field_id(); if (ContainsNullsOnly(id) || ContainsNaNsOnly(id)) { - return ROWS_CANNOT_MATCH; + return kRowCannotMatch; } - auto lower_result = LowerBound(term); - if (!lower_result.has_value() || lower_result.value().IsNaN()) { + ICEBERG_ASSIGN_OR_RAISE(auto lower_result, LowerBound(expr)); + if (!lower_result.has_value() || lower_result.value().IsNull() || + lower_result.value().IsNaN()) { // NaN indicates unreliable bounds. See the InclusiveMetricsEvaluator docs for more. - return ROWS_MIGHT_MATCH; + return kRowsMightMatch; } const auto& lower = lower_result.value(); @@ -131,23 +131,24 @@ class InclusiveMetricsVisitor : public BoundVisitor { // when f(lower) >= X then f(a) >= f(lower) >= X, so there is no a such that f(a) < X // f(lower) >= X means rows cannot match if (lower >= lit) { - return ROWS_CANNOT_MATCH; + return kRowCannotMatch; } - return ROWS_MIGHT_MATCH; + return kRowsMightMatch; } - Result LtEq(const std::shared_ptr& term, const Literal& lit) override { + Result LtEq(const std::shared_ptr& expr, const Literal& lit) override { // all terms are null preserving. see #isNullPreserving(Bound) - int id = term->reference()->field().field_id(); + int32_t id = expr->reference()->field().field_id(); if (ContainsNullsOnly(id) || ContainsNaNsOnly(id)) { - return ROWS_CANNOT_MATCH; + return kRowCannotMatch; } - auto lower_result = LowerBound(term); - if (!lower_result.has_value() || lower_result.value().IsNaN()) { + ICEBERG_ASSIGN_OR_RAISE(auto lower_result, LowerBound(expr)); + if (!lower_result.has_value() || lower_result.value().IsNull() || + lower_result.value().IsNaN()) { // NaN indicates unreliable bounds. See the InclusiveMetricsEvaluator docs for more. - return ROWS_MIGHT_MATCH; + return kRowsMightMatch; } const auto& lower = lower_result.value(); @@ -157,159 +158,160 @@ class InclusiveMetricsVisitor : public BoundVisitor { // when f(lower) > X then f(a) >= f(lower) > X, so there is no a such that f(a) <= X // f(lower) > X means rows cannot match if (lower > lit) { - return ROWS_CANNOT_MATCH; + return kRowCannotMatch; } - return ROWS_MIGHT_MATCH; + return kRowsMightMatch; } - Result Gt(const std::shared_ptr& term, const Literal& lit) override { + Result Gt(const std::shared_ptr& expr, const Literal& lit) override { // all terms are null preserving. see #isNullPreserving(Bound) - int id = term->reference()->field().field_id(); + int32_t id = expr->reference()->field().field_id(); if (ContainsNullsOnly(id) || ContainsNaNsOnly(id)) { - return ROWS_CANNOT_MATCH; + return kRowCannotMatch; } - auto upper_result = UpperBound(term); - if (!upper_result.has_value()) { - return ROWS_MIGHT_MATCH; + ICEBERG_ASSIGN_OR_RAISE(auto upper_result, UpperBound(expr)); + if (!upper_result.has_value() || upper_result.value().IsNull()) { + return kRowsMightMatch; } const auto& upper = upper_result.value(); if (upper <= lit) { - return ROWS_CANNOT_MATCH; + return kRowCannotMatch; } - return ROWS_MIGHT_MATCH; + return kRowsMightMatch; } - Result GtEq(const std::shared_ptr& term, const Literal& lit) override { + Result GtEq(const std::shared_ptr& expr, const Literal& lit) override { // all terms are null preserving. see #isNullPreserving(Bound) - int id = term->reference()->field().field_id(); + int32_t id = expr->reference()->field().field_id(); if (ContainsNullsOnly(id) || ContainsNaNsOnly(id)) { - return ROWS_CANNOT_MATCH; + return kRowCannotMatch; } - auto upper_result = UpperBound(term); - if (!upper_result.has_value()) { - return ROWS_MIGHT_MATCH; + ICEBERG_ASSIGN_OR_RAISE(auto upper_result, UpperBound(expr)); + if (!upper_result.has_value() || upper_result.value().IsNull()) { + return kRowsMightMatch; } const auto& upper = upper_result.value(); if (upper < lit) { - return ROWS_CANNOT_MATCH; + return kRowCannotMatch; } - return ROWS_MIGHT_MATCH; + return kRowsMightMatch; } - Result Eq(const std::shared_ptr& term, const Literal& lit) override { + Result Eq(const std::shared_ptr& expr, const Literal& lit) override { // all terms are null preserving. see #isNullPreserving(Bound) - int id = term->reference()->field().field_id(); + int32_t id = expr->reference()->field().field_id(); if (ContainsNullsOnly(id) || ContainsNaNsOnly(id)) { - return ROWS_CANNOT_MATCH; + return kRowCannotMatch; } - auto lower_result = LowerBound(term); - if (lower_result.has_value()) { + ICEBERG_ASSIGN_OR_RAISE(auto lower_result, LowerBound(expr)); + if (lower_result.has_value() && !lower_result.value().IsNull() && + !lower_result.value().IsNaN()) { const auto& lower = lower_result.value(); if (!lower.IsNaN() && lower > lit) { - return ROWS_CANNOT_MATCH; + return kRowCannotMatch; } } - auto upper_result = UpperBound(term); - if (!upper_result.has_value()) { - return ROWS_MIGHT_MATCH; + ICEBERG_ASSIGN_OR_RAISE(auto upper_result, UpperBound(expr)); + if (!upper_result.has_value() || upper_result.value().IsNull()) { + return kRowsMightMatch; } const auto& upper = upper_result.value(); if (upper < lit) { - return ROWS_CANNOT_MATCH; + return kRowCannotMatch; } - return ROWS_MIGHT_MATCH; + return kRowsMightMatch; } - Result NotEq(const std::shared_ptr& term, - const Literal& lit) override { + Result NotEq(const std::shared_ptr& expr, const Literal& lit) override { // because the bounds are not necessarily a min or max value, this cannot be answered // using them. notEq(col, X) with (X, Y) doesn't guarantee that X is a value in col. - return ROWS_MIGHT_MATCH; + return kRowsMightMatch; } - Result In(const std::shared_ptr& term, + Result In(const std::shared_ptr& expr, const BoundSetPredicate::LiteralSet& literal_set) override { // all terms are null preserving. see #isNullPreserving(Bound) - int id = term->reference()->field().field_id(); + int32_t id = expr->reference()->field().field_id(); if (ContainsNullsOnly(id) || ContainsNaNsOnly(id)) { - return ROWS_CANNOT_MATCH; + return kRowCannotMatch; } - if (literal_set.size() > IN_PREDICATE_LIMIT) { + if (literal_set.size() > kInPredicateLimit) { // skip evaluating the predicate if the number of values is too big - return ROWS_MIGHT_MATCH; + return kRowsMightMatch; } - auto lower_result = LowerBound(term); - if (!lower_result.has_value() || lower_result.value().IsNaN()) { + ICEBERG_ASSIGN_OR_RAISE(auto lower_result, LowerBound(expr)); + if (!lower_result.has_value() || lower_result.value().IsNull() || + lower_result.value().IsNaN()) { // NaN indicates unreliable bounds. See the InclusiveMetricsEvaluator docs for more. - return ROWS_MIGHT_MATCH; + return kRowsMightMatch; } const auto& lower = lower_result.value(); - std::vector literals; - for (const auto& lit : literal_set) { - if (lower <= lit) { - literals.emplace_back(lit); - } - } + auto literals_view = literal_set | std::views::filter([&](const Literal& lit) { + return lower <= lit; + }); // if all values are less than lower bound, rows cannot match - if (literals.empty()) { - return ROWS_CANNOT_MATCH; + if (literals_view.empty()) { + return kRowCannotMatch; } - auto upper_result = UpperBound(term); - if (!upper_result.has_value()) { - return ROWS_MIGHT_MATCH; + ICEBERG_ASSIGN_OR_RAISE(auto upper_result, UpperBound(expr)); + if (!upper_result.has_value() || upper_result.value().IsNull()) { + return kRowsMightMatch; } const auto& upper = upper_result.value(); - std::erase_if(literals, [&](const Literal& x) { return x > upper; }); + auto filtered_view = literals_view | std::views::filter([&](const Literal& lit) { + return upper >= lit; + }); // if remaining values are greater than upper bound, rows cannot match - if (literals.empty()) { - return ROWS_CANNOT_MATCH; + if (filtered_view.empty()) { + return kRowCannotMatch; } - return ROWS_MIGHT_MATCH; + return kRowsMightMatch; } - Result NotIn(const std::shared_ptr& term, + Result NotIn(const std::shared_ptr& expr, const BoundSetPredicate::LiteralSet& literal_set) override { // because the bounds are not necessarily a min or max value, this cannot be answered // using them. notIn(col, {X, ...}) with (X, Y) doesn't guarantee that X is a value in // col. - return ROWS_MIGHT_MATCH; + return kRowsMightMatch; } - Result StartsWith(const std::shared_ptr& term, + Result StartsWith(const std::shared_ptr& expr, const Literal& lit) override { - if (term->kind() == BoundTerm::Kind::kTransform && - std::dynamic_pointer_cast(term)->transform()->transform_type() != - TransformType::kIdentity) { + if (expr->reference()->kind() == BoundTerm::Kind::kTransform && + internal::checked_cast(*expr) + .transform() + ->transform_type() != TransformType::kIdentity) { // truncate must be rewritten in binding. the result is either always or never // compatible - return ROWS_MIGHT_MATCH; + return kRowsMightMatch; } - int id = term->reference()->field().field_id(); + int32_t id = expr->reference()->field().field_id(); if (ContainsNullsOnly(id)) { - return ROWS_CANNOT_MATCH; + return kRowCannotMatch; } if (lit.type()->type_id() != TypeId::kString) { - return ROWS_CANNOT_MATCH; + return kRowCannotMatch; } - std::string prefix = get(lit.value()); + const auto& prefix = get(lit.value()); - auto lower_result = LowerBound(term); - if (!lower_result.has_value()) { - return ROWS_MIGHT_MATCH; + ICEBERG_ASSIGN_OR_RAISE(auto lower_result, LowerBound(expr)); + if (!lower_result.has_value() || lower_result.value().IsNull()) { + return kRowsMightMatch; } const auto& lower = lower_result.value(); auto lower_str = get(lower.value()); @@ -318,12 +320,12 @@ class InclusiveMetricsVisitor : public BoundVisitor { int length = std::min(prefix.size(), lower_str.size()); // if prefix of lower bound is greater than prefix, rows cannot match if (lower_str.substr(0, length) > prefix) { - return ROWS_CANNOT_MATCH; + return kRowCannotMatch; } - auto upper_result = UpperBound(term); - if (!upper_result.has_value()) { - return ROWS_MIGHT_MATCH; + ICEBERG_ASSIGN_OR_RAISE(auto upper_result, UpperBound(expr)); + if (!upper_result.has_value() || upper_result.value().IsNull()) { + return kRowsMightMatch; } const auto& upper = upper_result.value(); auto upper_str = get(upper.value()); @@ -332,32 +334,33 @@ class InclusiveMetricsVisitor : public BoundVisitor { length = std::min(prefix.size(), upper_str.size()); // if prefix of upper bound is less than prefix, rows cannot match if (upper_str.substr(0, length) < prefix) { - return ROWS_CANNOT_MATCH; + return kRowCannotMatch; } - return ROWS_MIGHT_MATCH; + return kRowsMightMatch; } - Result NotStartsWith(const std::shared_ptr& term, + Result NotStartsWith(const std::shared_ptr& expr, const Literal& lit) override { // the only transforms that produce strings are truncate and identity, which work with // this - int id = term->reference()->field().field_id(); + int32_t id = expr->reference()->field().field_id(); if (MayContainNull(id)) { - return ROWS_MIGHT_MATCH; + return kRowsMightMatch; } if (lit.type()->type_id() != TypeId::kString) { - return ROWS_CANNOT_MATCH; + return kRowCannotMatch; } - std::string prefix = get(lit.value()); + const auto& prefix = get(lit.value()); // notStartsWith will match unless all values must start with the prefix. This happens // when the lower and upper bounds both start with the prefix. - auto lower_result = LowerBound(term); - auto upper_result = UpperBound(term); - if (!lower_result.has_value() || !upper_result.has_value()) { - return ROWS_MIGHT_MATCH; + ICEBERG_ASSIGN_OR_RAISE(auto lower_result, LowerBound(expr)); + ICEBERG_ASSIGN_OR_RAISE(auto upper_result, UpperBound(expr)); + if (!lower_result.has_value() || lower_result.value().IsNull() || + !upper_result.has_value() || upper_result.value().IsNull()) { + return kRowsMightMatch; } const auto& lower = lower_result.value(); const auto& upper = upper_result.value(); @@ -366,22 +369,22 @@ class InclusiveMetricsVisitor : public BoundVisitor { // if lower is shorter than the prefix then lower doesn't start with the prefix if (lower_str.size() < prefix.size()) { - return ROWS_MIGHT_MATCH; + return kRowsMightMatch; } if (lower_str.starts_with(prefix)) { // if upper is shorter than the prefix then upper can't start with the prefix if (upper_str.size() < prefix.size()) { - return ROWS_MIGHT_MATCH; + return kRowsMightMatch; } if (upper_str.starts_with(prefix)) { // both bounds match the prefix, so all rows must match the prefix and therefore // do not satisfy the predicate - return ROWS_CANNOT_MATCH; + return kRowCannotMatch; } } - return ROWS_MIGHT_MATCH; + return kRowsMightMatch; } private: @@ -392,51 +395,58 @@ class InclusiveMetricsVisitor : public BoundVisitor { } bool ContainsNullsOnly(int32_t id) { - return !data_file_.value_counts.empty() && data_file_.value_counts.contains(id) && - !data_file_.null_value_counts.empty() && - data_file_.null_value_counts.contains(id) && - data_file_.value_counts.at(id) - data_file_.null_value_counts.at(id) == 0; + auto val_it = data_file_.value_counts.find(id); + auto null_it = data_file_.null_value_counts.find(id); + return val_it != data_file_.value_counts.end() && + null_it != data_file_.null_value_counts.end() && + val_it->second - null_it->second == 0; } bool ContainsNaNsOnly(int32_t id) { - return !data_file_.nan_value_counts.empty() && - data_file_.nan_value_counts.contains(id) && !data_file_.value_counts.empty() && - data_file_.value_counts.at(id) == data_file_.nan_value_counts.at(id); + auto val_it = data_file_.value_counts.find(id); + auto nan_it = data_file_.nan_value_counts.find(id); + return val_it != data_file_.value_counts.end() && + nan_it != data_file_.nan_value_counts.end() && + val_it->second == nan_it->second; } - Result LowerBound(const std::shared_ptr& term) { - if (term->kind() == BoundTerm::Kind::kReference) { - return ParseLowerBound(*std::dynamic_pointer_cast(term)); - } else if (term->kind() == BoundTerm::Kind::kTransform) { - return TransformLowerBound(*std::dynamic_pointer_cast(term)); - } else if (term->kind() == BoundTerm::Kind::kExtract) { + Result> LowerBound(const std::shared_ptr& expr) { + if (expr->reference()->kind() == BoundTerm::Kind::kReference) { + return ParseLowerBound( + internal::checked_cast(*expr->reference())); + } else if (expr->reference()->kind() == BoundTerm::Kind::kTransform) { + return TransformLowerBound( + internal::checked_cast(*expr->reference())); + } else if (expr->reference()->kind() == BoundTerm::Kind::kExtract) { // TODO(xiao.dong) handle extract lower and upper bounds return NotImplemented("Extract lower bound not implemented."); } else { - return NotFound("Lower bound not found."); + return std::nullopt; } } - Result UpperBound(const std::shared_ptr& term) { - if (term->kind() == BoundTerm::Kind::kReference) { - return ParseUpperBound(*std::dynamic_pointer_cast(term)); - } else if (term->kind() == BoundTerm::Kind::kTransform) { - return TransformUpperBound(*std::dynamic_pointer_cast(term)); - } else if (term->kind() == BoundTerm::Kind::kExtract) { + Result> UpperBound(const std::shared_ptr& expr) { + if (expr->reference()->kind() == BoundTerm::Kind::kReference) { + return ParseUpperBound( + internal::checked_cast(*expr->reference())); + } else if (expr->reference()->kind() == BoundTerm::Kind::kTransform) { + return TransformUpperBound( + internal::checked_cast(*expr->reference())); + } else if (expr->reference()->kind() == BoundTerm::Kind::kExtract) { // TODO(xiao.dong) handle extract lower and upper bounds return NotImplemented("Extract upper bound not implemented."); } else { - return NotFound("Upper bound not found."); + return std::nullopt; } } - Result ParseLowerBound(const BoundReference& ref) { - int id = ref.field().field_id(); + Result> ParseLowerBound(const BoundReference& ref) { + int32_t id = ref.field().field_id(); auto type = ref.type(); if (!type->is_primitive()) { return Invalid("Lower bound of non-primitive type is not supported."); } - auto primitive_type = std::dynamic_pointer_cast(type); + auto primitive_type = internal::checked_pointer_cast(type); if (!data_file_.lower_bounds.empty() && data_file_.lower_bounds.contains(id)) { ICEBERG_ASSIGN_OR_RAISE( auto lower, @@ -444,16 +454,16 @@ class InclusiveMetricsVisitor : public BoundVisitor { return lower; } - return NotFound("Lower bound not found."); + return std::nullopt; } - Result ParseUpperBound(const BoundReference& ref) { - int id = ref.field().field_id(); + Result> ParseUpperBound(const BoundReference& ref) { + int32_t id = ref.field().field_id(); auto type = ref.type(); if (!type->is_primitive()) { return Invalid("Upper bound of non-primitive type is not supported."); } - auto primitive_type = std::dynamic_pointer_cast(type); + auto primitive_type = internal::checked_pointer_cast(type); if (!data_file_.upper_bounds.empty() && data_file_.upper_bounds.contains(id)) { ICEBERG_ASSIGN_OR_RAISE( auto upper, @@ -461,51 +471,44 @@ class InclusiveMetricsVisitor : public BoundVisitor { return upper; } - return NotFound("Upper bound not found."); + return std::nullopt; } - Result TransformLowerBound(BoundTransform& boundTransform) { + Result> TransformLowerBound(BoundTransform& boundTransform) { auto transform = boundTransform.transform(); if (transform->PreservesOrder()) { ICEBERG_ASSIGN_OR_RAISE(auto lower, ParseLowerBound(*boundTransform.reference())); ICEBERG_ASSIGN_OR_RAISE(auto transform_func, transform->Bind(boundTransform.reference()->type())); - return transform_func->Transform(lower); + if (lower.has_value()) { + return transform_func->Transform(lower.value()); + } } - return NotFound("Transform lower bound not found."); + return std::nullopt; } - Result TransformUpperBound(BoundTransform& boundTransform) { + Result> TransformUpperBound(BoundTransform& boundTransform) { auto transform = boundTransform.transform(); if (transform->PreservesOrder()) { ICEBERG_ASSIGN_OR_RAISE(auto upper, ParseLowerBound(*boundTransform.reference())); ICEBERG_ASSIGN_OR_RAISE(auto transform_func, transform->Bind(boundTransform.reference()->type())); - return transform_func->Transform(upper); + if (upper.has_value()) { + return transform_func->Transform(upper.value()); + } } - return NotFound("Transform upper bound not found."); - } - - // TODO(xiao.dong) handle extract lower and upper bounds - /* - Literal extractLowerBound(const BoundExtract& bound) { - - } - - Literal extractUpperBound(const BoundExtract& bound) { - + return std::nullopt; } -*/ /** Returns true if the expression term produces a non-null value for non-null input. */ - bool IsNonNullPreserving(const std::shared_ptr& term) { - if (term->kind() == BoundTerm::Kind::kReference) { + bool IsNonNullPreserving(const std::shared_ptr& expr) { + if (expr->reference()->kind() == BoundTerm::Kind::kReference) { return true; - } else if (term->kind() == BoundTerm::Kind::kTransform) { - return std::dynamic_pointer_cast(term) - ->transform() + } else if (expr->reference()->kind() == BoundTerm::Kind::kTransform) { + return internal::checked_cast(*expr->reference()) + .transform() ->PreservesOrder(); } // a non-null variant does not necessarily contain a specific field @@ -517,25 +520,29 @@ class InclusiveMetricsVisitor : public BoundVisitor { const DataFile& data_file_; }; -InclusiveMetricsEvaluator::InclusiveMetricsEvaluator( - const std::shared_ptr& expr) +InclusiveMetricsEvaluator::InclusiveMetricsEvaluator(std::shared_ptr expr) : expr_(std::move(expr)) {} InclusiveMetricsEvaluator::~InclusiveMetricsEvaluator() = default; Result> InclusiveMetricsEvaluator::Make( - std::shared_ptr expr, const std::shared_ptr& schema, - bool case_sensitive) { + std::shared_ptr expr, const Schema& schema, bool case_sensitive) { ICEBERG_ASSIGN_OR_RAISE(auto rewrite_expr, RewriteNot::Visit(std::move(expr))); ICEBERG_ASSIGN_OR_RAISE(auto bound_expr, - Binder::Bind(*schema, rewrite_expr, case_sensitive)); + Binder::Bind(schema, rewrite_expr, case_sensitive)); return std::unique_ptr( new InclusiveMetricsEvaluator(std::move(bound_expr))); } Result InclusiveMetricsEvaluator::Eval(const DataFile& data_file) const { - if (data_file.record_count <= 0) { - return ROWS_CANNOT_MATCH; + if (data_file.record_count == 0) { + return kRowCannotMatch; + } + if (data_file.record_count < 0) { + // we haven't implemented parsing record count from avro file and thus set record + // count -1 when importing avro tables to iceberg tables. This should be updated once + // we implemented and set correct record count. + return kRowsMightMatch; } InclusiveMetricsVisitor visitor(data_file); return Visit(expr_, visitor); diff --git a/src/iceberg/expression/inclusive_metrics_evaluator.h b/src/iceberg/expression/inclusive_metrics_evaluator.h index 9e2c6d879..7ffe98e70 100644 --- a/src/iceberg/expression/inclusive_metrics_evaluator.h +++ b/src/iceberg/expression/inclusive_metrics_evaluator.h @@ -54,8 +54,7 @@ class ICEBERG_EXPORT InclusiveMetricsEvaluator { /// \param schema The schema of the table /// \param case_sensitive Whether field name matching is case-sensitive static Result> Make( - std::shared_ptr expr, const std::shared_ptr& schema, - bool case_sensitive = true); + std::shared_ptr expr, const Schema& schema, bool case_sensitive = true); ~InclusiveMetricsEvaluator(); @@ -66,7 +65,7 @@ class ICEBERG_EXPORT InclusiveMetricsEvaluator { Result Eval(const DataFile& data_file) const; private: - explicit InclusiveMetricsEvaluator(const std::shared_ptr& expr); + explicit InclusiveMetricsEvaluator(std::shared_ptr expr); private: std::shared_ptr expr_; diff --git a/src/iceberg/test/inclusive_metrics_evaluator_test.cc b/src/iceberg/test/inclusive_metrics_evaluator_test.cc index 6f168b578..261c2fb49 100644 --- a/src/iceberg/test/inclusive_metrics_evaluator_test.cc +++ b/src/iceberg/test/inclusive_metrics_evaluator_test.cc @@ -23,7 +23,7 @@ #include "iceberg/expression/binder.h" #include "iceberg/expression/expressions.h" -#include "iceberg/manifest_entry.h" +#include "iceberg/manifest/manifest_entry.h" #include "iceberg/schema.h" #include "iceberg/test/matchers.h" #include "iceberg/type.h" @@ -100,7 +100,7 @@ class InclusiveMetricsEvaluatorTest : public ::testing::Test { void TestCase(const std::shared_ptr& unbound, bool expected_result) { ICEBERG_UNWRAP_OR_FAIL(auto evaluator, - InclusiveMetricsEvaluator::Make(unbound, schema_, true)); + InclusiveMetricsEvaluator::Make(unbound, *schema_, true)); auto file = PrepareDataFile("20251128", 10, 1024, {{"id", static_cast(100)}}, {{"id", static_cast(200)}}); auto result = evaluator->Eval(*file); @@ -110,7 +110,7 @@ class InclusiveMetricsEvaluatorTest : public ::testing::Test { void TestStringCase(const std::shared_ptr& unbound, bool expected_result) { ICEBERG_UNWRAP_OR_FAIL(auto evaluator, - InclusiveMetricsEvaluator::Make(unbound, schema_, true)); + InclusiveMetricsEvaluator::Make(unbound, *schema_, true)); auto file = PrepareDataFile("20251128", 10, 1024, {{"name", "123"}}, {{"name", "456"}}, {{2, 10}}, {{2, 0}}); auto result = evaluator->Eval(*file); @@ -124,18 +124,18 @@ class InclusiveMetricsEvaluatorTest : public ::testing::Test { TEST_F(InclusiveMetricsEvaluatorTest, CaseSensitiveTest) { { auto unbound = Expressions::Equal("id", Literal::Long(300)); - auto evaluator = InclusiveMetricsEvaluator::Make(unbound, schema_, true); + auto evaluator = InclusiveMetricsEvaluator::Make(unbound, *schema_, true); ASSERT_TRUE(evaluator.has_value()); } { auto unbound = Expressions::Equal("ID", Literal::Long(300)); - auto evaluator = InclusiveMetricsEvaluator::Make(unbound, schema_, true); + auto evaluator = InclusiveMetricsEvaluator::Make(unbound, *schema_, true); ASSERT_FALSE(evaluator.has_value()); ASSERT_EQ(evaluator.error().kind, ErrorKind::kInvalidExpression); } { auto unbound = Expressions::Equal("ID", Literal::Long(300)); - auto evaluator = InclusiveMetricsEvaluator::Make(unbound, schema_, false); + auto evaluator = InclusiveMetricsEvaluator::Make(unbound, *schema_, false); ASSERT_TRUE(evaluator.has_value()); } } @@ -144,7 +144,7 @@ TEST_F(InclusiveMetricsEvaluatorTest, IsNullTest) { { auto unbound = Expressions::IsNull("name"); ICEBERG_UNWRAP_OR_FAIL(auto evaluator, - InclusiveMetricsEvaluator::Make(unbound, schema_, true)); + InclusiveMetricsEvaluator::Make(unbound, *schema_, true)); auto file = PrepareDataFile("20251128", 10, 1024, {{"name", "1"}}, {{"name", "2"}}, {{2, 10}}, {{2, 5}}, {}); auto result = evaluator->Eval(*file); @@ -154,7 +154,7 @@ TEST_F(InclusiveMetricsEvaluatorTest, IsNullTest) { { auto unbound = Expressions::IsNull("name"); ICEBERG_UNWRAP_OR_FAIL(auto evaluator, - InclusiveMetricsEvaluator::Make(unbound, schema_, true)); + InclusiveMetricsEvaluator::Make(unbound, *schema_, true)); auto file = PrepareDataFile("20251128", 10, 1024, {{"name", "1"}}, {{"name", "2"}}, {{2, 10}}, {{2, 0}}, {}); auto result = evaluator->Eval(*file); @@ -167,7 +167,7 @@ TEST_F(InclusiveMetricsEvaluatorTest, NotNullTest) { { auto unbound = Expressions::NotNull("name"); ICEBERG_UNWRAP_OR_FAIL(auto evaluator, - InclusiveMetricsEvaluator::Make(unbound, schema_, true)); + InclusiveMetricsEvaluator::Make(unbound, *schema_, true)); auto file = PrepareDataFile("20251128", 10, 1024, {{"name", "1"}}, {{"name", "2"}}, {{2, 10}}, {{2, 5}}, {}); auto result = evaluator->Eval(*file); @@ -177,7 +177,7 @@ TEST_F(InclusiveMetricsEvaluatorTest, NotNullTest) { { auto unbound = Expressions::NotNull("name"); ICEBERG_UNWRAP_OR_FAIL(auto evaluator, - InclusiveMetricsEvaluator::Make(unbound, schema_, true)); + InclusiveMetricsEvaluator::Make(unbound, *schema_, true)); auto file = PrepareDataFile("20251128", 10, 1024, {{"name", "1"}}, {{"name", "2"}}, {{2, 10}}, {{2, 10}}, {}); auto result = evaluator->Eval(*file); @@ -190,7 +190,7 @@ TEST_F(InclusiveMetricsEvaluatorTest, IsNanTest) { { auto unbound = Expressions::IsNaN("salary"); ICEBERG_UNWRAP_OR_FAIL(auto evaluator, - InclusiveMetricsEvaluator::Make(unbound, schema_, true)); + InclusiveMetricsEvaluator::Make(unbound, *schema_, true)); auto file = PrepareDataFile("20251128", 10, 1024, {{"salary", 1.0}}, {{"salary", 2.0}}, {{4, 10}}, {{4, 5}}, {{4, 5}}); auto result = evaluator->Eval(*file); @@ -200,7 +200,7 @@ TEST_F(InclusiveMetricsEvaluatorTest, IsNanTest) { { auto unbound = Expressions::IsNaN("salary"); ICEBERG_UNWRAP_OR_FAIL(auto evaluator, - InclusiveMetricsEvaluator::Make(unbound, schema_, true)); + InclusiveMetricsEvaluator::Make(unbound, *schema_, true)); auto file = PrepareDataFile("20251128", 10, 1024, {{"salary", 1.0}}, {{"salary", 2.0}}, {{4, 10}}, {{4, 10}}, {{4, 5}}); auto result = evaluator->Eval(*file); @@ -210,7 +210,7 @@ TEST_F(InclusiveMetricsEvaluatorTest, IsNanTest) { { auto unbound = Expressions::IsNaN("salary"); ICEBERG_UNWRAP_OR_FAIL(auto evaluator, - InclusiveMetricsEvaluator::Make(unbound, schema_, true)); + InclusiveMetricsEvaluator::Make(unbound, *schema_, true)); auto file = PrepareDataFile("20251128", 10, 1024, {{"salary", 1.0}}, {{"salary", 2.0}}, {{4, 10}}, {{4, 5}}, {{4, 0}}); auto result = evaluator->Eval(*file); @@ -223,7 +223,7 @@ TEST_F(InclusiveMetricsEvaluatorTest, NotNanTest) { { auto unbound = Expressions::NotNaN("salary"); ICEBERG_UNWRAP_OR_FAIL(auto evaluator, - InclusiveMetricsEvaluator::Make(unbound, schema_, true)); + InclusiveMetricsEvaluator::Make(unbound, *schema_, true)); auto file = PrepareDataFile("20251128", 10, 1024, {{"salary", 1.0}}, {{"salary", 2.0}}, {{4, 10}}, {}, {{4, 5}}); auto result = evaluator->Eval(*file); @@ -233,7 +233,7 @@ TEST_F(InclusiveMetricsEvaluatorTest, NotNanTest) { { auto unbound = Expressions::NotNaN("salary"); ICEBERG_UNWRAP_OR_FAIL(auto evaluator, - InclusiveMetricsEvaluator::Make(unbound, schema_, true)); + InclusiveMetricsEvaluator::Make(unbound, *schema_, true)); auto file = PrepareDataFile("20251128", 10, 1024, {{"salary", 1.0}}, {{"salary", 2.0}}, {{4, 10}}, {}, {{4, 10}}); auto result = evaluator->Eval(*file); @@ -370,7 +370,7 @@ TEST_F(InclusiveMetricsEvaluatorTest, NotStartsWithTest) { auto test_case = [&](const std::string& prefix, bool expected_result) { auto unbound = Expressions::NotStartsWith("name", prefix); ICEBERG_UNWRAP_OR_FAIL(auto evaluator, - InclusiveMetricsEvaluator::Make(unbound, schema_, true)); + InclusiveMetricsEvaluator::Make(unbound, *schema_, true)); auto file = PrepareDataFile("20251128", 10, 1024, {{"name", "123"}}, {{"name", "123"}}, {{2, 10}}, {{2, 0}}); auto result = evaluator->Eval(*file); From 28d9397c32ff9638b216f25a0f3abf53a225eb1d Mon Sep 17 00:00:00 2001 From: "xiao.dong" Date: Mon, 1 Dec 2025 15:59:28 +0800 Subject: [PATCH 09/15] refactor kind check --- .../expression/inclusive_metrics_evaluator.cc | 53 +++++++++---------- 1 file changed, 24 insertions(+), 29 deletions(-) diff --git a/src/iceberg/expression/inclusive_metrics_evaluator.cc b/src/iceberg/expression/inclusive_metrics_evaluator.cc index 1dedda1fa..3306a1352 100644 --- a/src/iceberg/expression/inclusive_metrics_evaluator.cc +++ b/src/iceberg/expression/inclusive_metrics_evaluator.cc @@ -86,7 +86,7 @@ class InclusiveMetricsVisitor : public BoundVisitor { if (ContainsNullsOnly(id)) { return kRowCannotMatch; } - if (expr->reference()->kind() != BoundTerm::Kind::kReference) { + if (internal::checked_pointer_cast(expr) == nullptr) { return kRowsMightMatch; } auto it = data_file_.nan_value_counts.find(id); @@ -97,7 +97,7 @@ class InclusiveMetricsVisitor : public BoundVisitor { } Result NotNaN(const std::shared_ptr& expr) override { - if (expr->reference()->kind() != BoundTerm::Kind::kReference) { + if (internal::checked_pointer_cast(expr) == nullptr) { // identity transforms are already removed by this time return kRowsMightMatch; } @@ -291,10 +291,9 @@ class InclusiveMetricsVisitor : public BoundVisitor { Result StartsWith(const std::shared_ptr& expr, const Literal& lit) override { - if (expr->reference()->kind() == BoundTerm::Kind::kTransform && - internal::checked_cast(*expr) - .transform() - ->transform_type() != TransformType::kIdentity) { + auto transform = internal::checked_pointer_cast(expr); + if (transform != nullptr && + transform->transform()->transform_type() != TransformType::kIdentity) { // truncate must be rewritten in binding. the result is either always or never // compatible return kRowsMightMatch; @@ -411,33 +410,29 @@ class InclusiveMetricsVisitor : public BoundVisitor { } Result> LowerBound(const std::shared_ptr& expr) { - if (expr->reference()->kind() == BoundTerm::Kind::kReference) { - return ParseLowerBound( - internal::checked_cast(*expr->reference())); - } else if (expr->reference()->kind() == BoundTerm::Kind::kTransform) { - return TransformLowerBound( - internal::checked_cast(*expr->reference())); - } else if (expr->reference()->kind() == BoundTerm::Kind::kExtract) { - // TODO(xiao.dong) handle extract lower and upper bounds - return NotImplemented("Extract lower bound not implemented."); + if (auto reference = internal::checked_pointer_cast(expr); + reference != nullptr) { + return ParseLowerBound(*reference); + } else if (auto transform = internal::checked_pointer_cast(expr); + transform != nullptr) { + return TransformLowerBound(*transform); } else { return std::nullopt; } + // TODO(xiao.dong) handle extract lower and upper bounds } Result> UpperBound(const std::shared_ptr& expr) { - if (expr->reference()->kind() == BoundTerm::Kind::kReference) { - return ParseUpperBound( - internal::checked_cast(*expr->reference())); - } else if (expr->reference()->kind() == BoundTerm::Kind::kTransform) { - return TransformUpperBound( - internal::checked_cast(*expr->reference())); - } else if (expr->reference()->kind() == BoundTerm::Kind::kExtract) { - // TODO(xiao.dong) handle extract lower and upper bounds - return NotImplemented("Extract upper bound not implemented."); + if (auto reference = internal::checked_pointer_cast(expr); + reference != nullptr) { + return ParseUpperBound(*reference); + } else if (auto transform = internal::checked_pointer_cast(expr); + transform != nullptr) { + return TransformUpperBound(*transform); } else { return std::nullopt; } + // TODO(xiao.dong) handle extract lower and upper bounds } Result> ParseLowerBound(const BoundReference& ref) { @@ -504,12 +499,12 @@ class InclusiveMetricsVisitor : public BoundVisitor { /** Returns true if the expression term produces a non-null value for non-null input. */ bool IsNonNullPreserving(const std::shared_ptr& expr) { - if (expr->reference()->kind() == BoundTerm::Kind::kReference) { + if (auto reference = internal::checked_pointer_cast(expr); + reference != nullptr) { return true; - } else if (expr->reference()->kind() == BoundTerm::Kind::kTransform) { - return internal::checked_cast(*expr->reference()) - .transform() - ->PreservesOrder(); + } else if (auto transform = internal::checked_pointer_cast(expr); + transform != nullptr) { + return transform->transform()->PreservesOrder(); } // a non-null variant does not necessarily contain a specific field // and unknown bound terms are not non-null preserving From 5ec5315c76a9533e6a442c7eba12ce661cc8b0dc Mon Sep 17 00:00:00 2001 From: "xiao.dong" Date: Mon, 1 Dec 2025 19:52:38 +0800 Subject: [PATCH 10/15] migrate all java case --- .../expression/inclusive_metrics_evaluator.cc | 8 +- .../test/inclusive_metrics_evaluator_test.cc | 725 ++++++++++++++++-- 2 files changed, 648 insertions(+), 85 deletions(-) diff --git a/src/iceberg/expression/inclusive_metrics_evaluator.cc b/src/iceberg/expression/inclusive_metrics_evaluator.cc index 3306a1352..d2f513c0a 100644 --- a/src/iceberg/expression/inclusive_metrics_evaluator.cc +++ b/src/iceberg/expression/inclusive_metrics_evaluator.cc @@ -313,7 +313,7 @@ class InclusiveMetricsVisitor : public BoundVisitor { return kRowsMightMatch; } const auto& lower = lower_result.value(); - auto lower_str = get(lower.value()); + const auto& lower_str = get(lower.value()); // truncate lower bound so that its length in bytes is not greater than the length of // prefix int length = std::min(prefix.size(), lower_str.size()); @@ -327,7 +327,7 @@ class InclusiveMetricsVisitor : public BoundVisitor { return kRowsMightMatch; } const auto& upper = upper_result.value(); - auto upper_str = get(upper.value()); + const auto& upper_str = get(upper.value()); // truncate upper bound so that its length in bytes is not greater than the length of // prefix length = std::min(prefix.size(), upper_str.size()); @@ -363,8 +363,8 @@ class InclusiveMetricsVisitor : public BoundVisitor { } const auto& lower = lower_result.value(); const auto& upper = upper_result.value(); - auto lower_str = get(lower.value()); - auto upper_str = get(upper.value()); + const auto& lower_str = get(lower.value()); + const auto& upper_str = get(upper.value()); // if lower is shorter than the prefix then lower doesn't start with the prefix if (lower_str.size() < prefix.size()) { diff --git a/src/iceberg/test/inclusive_metrics_evaluator_test.cc b/src/iceberg/test/inclusive_metrics_evaluator_test.cc index 261c2fb49..f0652edfa 100644 --- a/src/iceberg/test/inclusive_metrics_evaluator_test.cc +++ b/src/iceberg/test/inclusive_metrics_evaluator_test.cc @@ -27,17 +27,27 @@ #include "iceberg/schema.h" #include "iceberg/test/matchers.h" #include "iceberg/type.h" +#include "iceberg/util/truncate_util.h" namespace iceberg { namespace { -static const bool ROWS_MIGHT_MATCH = true; -static const bool ROWS_CANNOT_MATCH = false; +constexpr bool kRowsMightMatch = true; +constexpr bool kRowCannotMatch = false; +constexpr int64_t kIntMinValue = 30; +constexpr int64_t kIntMaxValue = 79; +constexpr float kFloatNan = std::numeric_limits::quiet_NaN(); +constexpr double kDoubleNan = std::numeric_limits::quiet_NaN(); } // namespace using TestVariant = std::variant; class InclusiveMetricsEvaluatorTest : public ::testing::Test { protected: + Result> Bind(const std::shared_ptr& expr, + bool case_sensitive = true) { + return Binder::Bind(*schema_, expr, case_sensitive); + } + void SetUp() override { schema_ = std::make_shared( std::vector{ @@ -48,12 +58,7 @@ class InclusiveMetricsEvaluatorTest : public ::testing::Test { SchemaField::MakeRequired(5, "active", boolean()), SchemaField::MakeRequired(6, "date", string()), }, - /*schema_id=*/0); - } - - Result> Bind(const std::shared_ptr& expr, - bool case_sensitive = true) { - return Binder::Bind(*schema_, expr, case_sensitive); + 0); } std::shared_ptr PrepareDataFile( @@ -118,6 +123,7 @@ class InclusiveMetricsEvaluatorTest : public ::testing::Test { ASSERT_EQ(result.value(), expected_result) << unbound->ToString(); } + protected: std::shared_ptr schema_; }; @@ -149,7 +155,7 @@ TEST_F(InclusiveMetricsEvaluatorTest, IsNullTest) { {{2, 10}}, {{2, 5}}, {}); auto result = evaluator->Eval(*file); ASSERT_TRUE(result.has_value()); - ASSERT_EQ(result.value(), ROWS_MIGHT_MATCH) << unbound->ToString(); + ASSERT_EQ(result.value(), kRowsMightMatch) << unbound->ToString(); } { auto unbound = Expressions::IsNull("name"); @@ -159,7 +165,7 @@ TEST_F(InclusiveMetricsEvaluatorTest, IsNullTest) { {{2, 10}}, {{2, 0}}, {}); auto result = evaluator->Eval(*file); ASSERT_TRUE(result.has_value()); - ASSERT_EQ(result.value(), ROWS_CANNOT_MATCH) << unbound->ToString(); + ASSERT_EQ(result.value(), kRowCannotMatch) << unbound->ToString(); } } @@ -172,7 +178,7 @@ TEST_F(InclusiveMetricsEvaluatorTest, NotNullTest) { {{2, 10}}, {{2, 5}}, {}); auto result = evaluator->Eval(*file); ASSERT_TRUE(result.has_value()); - ASSERT_EQ(result.value(), ROWS_MIGHT_MATCH) << unbound->ToString(); + ASSERT_EQ(result.value(), kRowsMightMatch) << unbound->ToString(); } { auto unbound = Expressions::NotNull("name"); @@ -182,7 +188,7 @@ TEST_F(InclusiveMetricsEvaluatorTest, NotNullTest) { {{2, 10}}, {{2, 10}}, {}); auto result = evaluator->Eval(*file); ASSERT_TRUE(result.has_value()); - ASSERT_EQ(result.value(), ROWS_CANNOT_MATCH) << unbound->ToString(); + ASSERT_EQ(result.value(), kRowCannotMatch) << unbound->ToString(); } } @@ -195,7 +201,7 @@ TEST_F(InclusiveMetricsEvaluatorTest, IsNanTest) { {{"salary", 2.0}}, {{4, 10}}, {{4, 5}}, {{4, 5}}); auto result = evaluator->Eval(*file); ASSERT_TRUE(result.has_value()); - ASSERT_EQ(result.value(), ROWS_MIGHT_MATCH) << unbound->ToString(); + ASSERT_EQ(result.value(), kRowsMightMatch) << unbound->ToString(); } { auto unbound = Expressions::IsNaN("salary"); @@ -205,7 +211,7 @@ TEST_F(InclusiveMetricsEvaluatorTest, IsNanTest) { {{"salary", 2.0}}, {{4, 10}}, {{4, 10}}, {{4, 5}}); auto result = evaluator->Eval(*file); ASSERT_TRUE(result.has_value()); - ASSERT_EQ(result.value(), ROWS_CANNOT_MATCH) << unbound->ToString(); + ASSERT_EQ(result.value(), kRowCannotMatch) << unbound->ToString(); } { auto unbound = Expressions::IsNaN("salary"); @@ -215,7 +221,7 @@ TEST_F(InclusiveMetricsEvaluatorTest, IsNanTest) { {{"salary", 2.0}}, {{4, 10}}, {{4, 5}}, {{4, 0}}); auto result = evaluator->Eval(*file); ASSERT_TRUE(result.has_value()); - ASSERT_EQ(result.value(), ROWS_CANNOT_MATCH) << unbound->ToString(); + ASSERT_EQ(result.value(), kRowCannotMatch) << unbound->ToString(); } } @@ -228,7 +234,7 @@ TEST_F(InclusiveMetricsEvaluatorTest, NotNanTest) { {{"salary", 2.0}}, {{4, 10}}, {}, {{4, 5}}); auto result = evaluator->Eval(*file); ASSERT_TRUE(result.has_value()); - ASSERT_EQ(result.value(), ROWS_MIGHT_MATCH) << unbound->ToString(); + ASSERT_EQ(result.value(), kRowsMightMatch) << unbound->ToString(); } { auto unbound = Expressions::NotNaN("salary"); @@ -238,54 +244,54 @@ TEST_F(InclusiveMetricsEvaluatorTest, NotNanTest) { {{"salary", 2.0}}, {{4, 10}}, {}, {{4, 10}}); auto result = evaluator->Eval(*file); ASSERT_TRUE(result.has_value()); - ASSERT_EQ(result.value(), ROWS_CANNOT_MATCH) << unbound->ToString(); + ASSERT_EQ(result.value(), kRowCannotMatch) << unbound->ToString(); } } TEST_F(InclusiveMetricsEvaluatorTest, LTTest) { - TestCase(Expressions::LessThan("id", Literal::Long(300)), ROWS_MIGHT_MATCH); - TestCase(Expressions::LessThan("id", Literal::Long(150)), ROWS_MIGHT_MATCH); - TestCase(Expressions::LessThan("id", Literal::Long(100)), ROWS_CANNOT_MATCH); - TestCase(Expressions::LessThan("id", Literal::Long(200)), ROWS_MIGHT_MATCH); - TestCase(Expressions::LessThan("id", Literal::Long(99)), ROWS_CANNOT_MATCH); + TestCase(Expressions::LessThan("id", Literal::Long(300)), kRowsMightMatch); + TestCase(Expressions::LessThan("id", Literal::Long(150)), kRowsMightMatch); + TestCase(Expressions::LessThan("id", Literal::Long(100)), kRowCannotMatch); + TestCase(Expressions::LessThan("id", Literal::Long(200)), kRowsMightMatch); + TestCase(Expressions::LessThan("id", Literal::Long(99)), kRowCannotMatch); } TEST_F(InclusiveMetricsEvaluatorTest, LTEQTest) { - TestCase(Expressions::LessThanOrEqual("id", Literal::Long(300)), ROWS_MIGHT_MATCH); - TestCase(Expressions::LessThanOrEqual("id", Literal::Long(150)), ROWS_MIGHT_MATCH); - TestCase(Expressions::LessThanOrEqual("id", Literal::Long(100)), ROWS_MIGHT_MATCH); - TestCase(Expressions::LessThanOrEqual("id", Literal::Long(200)), ROWS_MIGHT_MATCH); - TestCase(Expressions::LessThanOrEqual("id", Literal::Long(99)), ROWS_CANNOT_MATCH); + TestCase(Expressions::LessThanOrEqual("id", Literal::Long(300)), kRowsMightMatch); + TestCase(Expressions::LessThanOrEqual("id", Literal::Long(150)), kRowsMightMatch); + TestCase(Expressions::LessThanOrEqual("id", Literal::Long(100)), kRowsMightMatch); + TestCase(Expressions::LessThanOrEqual("id", Literal::Long(200)), kRowsMightMatch); + TestCase(Expressions::LessThanOrEqual("id", Literal::Long(99)), kRowCannotMatch); } TEST_F(InclusiveMetricsEvaluatorTest, GTTest) { - TestCase(Expressions::GreaterThan("id", Literal::Long(300)), ROWS_CANNOT_MATCH); - TestCase(Expressions::GreaterThan("id", Literal::Long(150)), ROWS_MIGHT_MATCH); - TestCase(Expressions::GreaterThan("id", Literal::Long(100)), ROWS_MIGHT_MATCH); - TestCase(Expressions::GreaterThan("id", Literal::Long(200)), ROWS_CANNOT_MATCH); - TestCase(Expressions::GreaterThan("id", Literal::Long(99)), ROWS_MIGHT_MATCH); + TestCase(Expressions::GreaterThan("id", Literal::Long(300)), kRowCannotMatch); + TestCase(Expressions::GreaterThan("id", Literal::Long(150)), kRowsMightMatch); + TestCase(Expressions::GreaterThan("id", Literal::Long(100)), kRowsMightMatch); + TestCase(Expressions::GreaterThan("id", Literal::Long(200)), kRowCannotMatch); + TestCase(Expressions::GreaterThan("id", Literal::Long(99)), kRowsMightMatch); } TEST_F(InclusiveMetricsEvaluatorTest, GTEQTest) { - TestCase(Expressions::GreaterThanOrEqual("id", Literal::Long(300)), ROWS_CANNOT_MATCH); - TestCase(Expressions::GreaterThanOrEqual("id", Literal::Long(150)), ROWS_MIGHT_MATCH); - TestCase(Expressions::GreaterThanOrEqual("id", Literal::Long(100)), ROWS_MIGHT_MATCH); - TestCase(Expressions::GreaterThanOrEqual("id", Literal::Long(200)), ROWS_MIGHT_MATCH); - TestCase(Expressions::GreaterThanOrEqual("id", Literal::Long(99)), ROWS_MIGHT_MATCH); + TestCase(Expressions::GreaterThanOrEqual("id", Literal::Long(300)), kRowCannotMatch); + TestCase(Expressions::GreaterThanOrEqual("id", Literal::Long(150)), kRowsMightMatch); + TestCase(Expressions::GreaterThanOrEqual("id", Literal::Long(100)), kRowsMightMatch); + TestCase(Expressions::GreaterThanOrEqual("id", Literal::Long(200)), kRowsMightMatch); + TestCase(Expressions::GreaterThanOrEqual("id", Literal::Long(99)), kRowsMightMatch); } TEST_F(InclusiveMetricsEvaluatorTest, EQTest) { - TestCase(Expressions::Equal("id", Literal::Long(300)), ROWS_CANNOT_MATCH); - TestCase(Expressions::Equal("id", Literal::Long(150)), ROWS_MIGHT_MATCH); - TestCase(Expressions::Equal("id", Literal::Long(100)), ROWS_MIGHT_MATCH); - TestCase(Expressions::Equal("id", Literal::Long(200)), ROWS_MIGHT_MATCH); + TestCase(Expressions::Equal("id", Literal::Long(300)), kRowCannotMatch); + TestCase(Expressions::Equal("id", Literal::Long(150)), kRowsMightMatch); + TestCase(Expressions::Equal("id", Literal::Long(100)), kRowsMightMatch); + TestCase(Expressions::Equal("id", Literal::Long(200)), kRowsMightMatch); } TEST_F(InclusiveMetricsEvaluatorTest, NotEqTest) { - TestCase(Expressions::NotEqual("id", Literal::Long(300)), ROWS_MIGHT_MATCH); - TestCase(Expressions::NotEqual("id", Literal::Long(150)), ROWS_MIGHT_MATCH); - TestCase(Expressions::NotEqual("id", Literal::Long(100)), ROWS_MIGHT_MATCH); - TestCase(Expressions::NotEqual("id", Literal::Long(200)), ROWS_MIGHT_MATCH); + TestCase(Expressions::NotEqual("id", Literal::Long(300)), kRowsMightMatch); + TestCase(Expressions::NotEqual("id", Literal::Long(150)), kRowsMightMatch); + TestCase(Expressions::NotEqual("id", Literal::Long(100)), kRowsMightMatch); + TestCase(Expressions::NotEqual("id", Literal::Long(200)), kRowsMightMatch); } TEST_F(InclusiveMetricsEvaluatorTest, InTest) { @@ -295,21 +301,21 @@ TEST_F(InclusiveMetricsEvaluatorTest, InTest) { Literal::Long(400), Literal::Long(500), }), - ROWS_CANNOT_MATCH); + kRowCannotMatch); TestCase(Expressions::In("id", { Literal::Long(150), Literal::Long(300), }), - ROWS_MIGHT_MATCH); - TestCase(Expressions::In("id", {Literal::Long(100)}), ROWS_MIGHT_MATCH); - TestCase(Expressions::In("id", {Literal::Long(200)}), ROWS_MIGHT_MATCH); + kRowsMightMatch); + TestCase(Expressions::In("id", {Literal::Long(100)}), kRowsMightMatch); + TestCase(Expressions::In("id", {Literal::Long(200)}), kRowsMightMatch); TestCase(Expressions::In("id", { Literal::Long(99), Literal::Long(201), }), - ROWS_CANNOT_MATCH); + kRowCannotMatch); } TEST_F(InclusiveMetricsEvaluatorTest, NotInTest) { @@ -319,55 +325,55 @@ TEST_F(InclusiveMetricsEvaluatorTest, NotInTest) { Literal::Long(400), Literal::Long(500), }), - ROWS_MIGHT_MATCH); + kRowsMightMatch); TestCase(Expressions::NotIn("id", { Literal::Long(150), Literal::Long(300), }), - ROWS_MIGHT_MATCH); + kRowsMightMatch); TestCase(Expressions::NotIn("id", { Literal::Long(100), Literal::Long(200), }), - ROWS_MIGHT_MATCH); + kRowsMightMatch); TestCase(Expressions::NotIn("id", { Literal::Long(99), Literal::Long(201), }), - ROWS_MIGHT_MATCH); + kRowsMightMatch); } TEST_F(InclusiveMetricsEvaluatorTest, StartsWithTest) { - TestStringCase(Expressions::StartsWith("name", "1"), ROWS_MIGHT_MATCH); - TestStringCase(Expressions::StartsWith("name", "4"), ROWS_MIGHT_MATCH); - TestStringCase(Expressions::StartsWith("name", "12"), ROWS_MIGHT_MATCH); - TestStringCase(Expressions::StartsWith("name", "45"), ROWS_MIGHT_MATCH); - TestStringCase(Expressions::StartsWith("name", "123"), ROWS_MIGHT_MATCH); - TestStringCase(Expressions::StartsWith("name", "456"), ROWS_MIGHT_MATCH); - TestStringCase(Expressions::StartsWith("name", "1234"), ROWS_MIGHT_MATCH); - TestStringCase(Expressions::StartsWith("name", "4567"), ROWS_CANNOT_MATCH); - TestStringCase(Expressions::StartsWith("name", "78"), ROWS_CANNOT_MATCH); - TestStringCase(Expressions::StartsWith("name", "7"), ROWS_CANNOT_MATCH); - TestStringCase(Expressions::StartsWith("name", "A"), ROWS_CANNOT_MATCH); + TestStringCase(Expressions::StartsWith("name", "1"), kRowsMightMatch); + TestStringCase(Expressions::StartsWith("name", "4"), kRowsMightMatch); + TestStringCase(Expressions::StartsWith("name", "12"), kRowsMightMatch); + TestStringCase(Expressions::StartsWith("name", "45"), kRowsMightMatch); + TestStringCase(Expressions::StartsWith("name", "123"), kRowsMightMatch); + TestStringCase(Expressions::StartsWith("name", "456"), kRowsMightMatch); + TestStringCase(Expressions::StartsWith("name", "1234"), kRowsMightMatch); + TestStringCase(Expressions::StartsWith("name", "4567"), kRowCannotMatch); + TestStringCase(Expressions::StartsWith("name", "78"), kRowCannotMatch); + TestStringCase(Expressions::StartsWith("name", "7"), kRowCannotMatch); + TestStringCase(Expressions::StartsWith("name", "A"), kRowCannotMatch); } TEST_F(InclusiveMetricsEvaluatorTest, NotStartsWithTest) { - TestStringCase(Expressions::NotStartsWith("name", "1"), ROWS_MIGHT_MATCH); - TestStringCase(Expressions::NotStartsWith("name", "4"), ROWS_MIGHT_MATCH); - TestStringCase(Expressions::NotStartsWith("name", "12"), ROWS_MIGHT_MATCH); - TestStringCase(Expressions::NotStartsWith("name", "45"), ROWS_MIGHT_MATCH); - TestStringCase(Expressions::NotStartsWith("name", "123"), ROWS_MIGHT_MATCH); - TestStringCase(Expressions::NotStartsWith("name", "456"), ROWS_MIGHT_MATCH); - TestStringCase(Expressions::NotStartsWith("name", "1234"), ROWS_MIGHT_MATCH); - TestStringCase(Expressions::NotStartsWith("name", "4567"), ROWS_MIGHT_MATCH); - TestStringCase(Expressions::NotStartsWith("name", "78"), ROWS_MIGHT_MATCH); - TestStringCase(Expressions::NotStartsWith("name", "7"), ROWS_MIGHT_MATCH); - TestStringCase(Expressions::NotStartsWith("name", "A"), ROWS_MIGHT_MATCH); - - auto test_case = [&](const std::string& prefix, bool expected_result) { + TestStringCase(Expressions::NotStartsWith("name", "1"), kRowsMightMatch); + TestStringCase(Expressions::NotStartsWith("name", "4"), kRowsMightMatch); + TestStringCase(Expressions::NotStartsWith("name", "12"), kRowsMightMatch); + TestStringCase(Expressions::NotStartsWith("name", "45"), kRowsMightMatch); + TestStringCase(Expressions::NotStartsWith("name", "123"), kRowsMightMatch); + TestStringCase(Expressions::NotStartsWith("name", "456"), kRowsMightMatch); + TestStringCase(Expressions::NotStartsWith("name", "1234"), kRowsMightMatch); + TestStringCase(Expressions::NotStartsWith("name", "4567"), kRowsMightMatch); + TestStringCase(Expressions::NotStartsWith("name", "78"), kRowsMightMatch); + TestStringCase(Expressions::NotStartsWith("name", "7"), kRowsMightMatch); + TestStringCase(Expressions::NotStartsWith("name", "A"), kRowsMightMatch); + + auto RunTest = [&](const std::string& prefix, bool expected_result) { auto unbound = Expressions::NotStartsWith("name", prefix); ICEBERG_UNWRAP_OR_FAIL(auto evaluator, InclusiveMetricsEvaluator::Make(unbound, *schema_, true)); @@ -377,9 +383,566 @@ TEST_F(InclusiveMetricsEvaluatorTest, NotStartsWithTest) { ASSERT_TRUE(result.has_value()); ASSERT_EQ(result.value(), expected_result) << unbound->ToString(); }; - test_case("12", ROWS_CANNOT_MATCH); - test_case("123", ROWS_CANNOT_MATCH); - test_case("1234", ROWS_MIGHT_MATCH); + RunTest("12", kRowCannotMatch); + RunTest("123", kRowCannotMatch); + RunTest("1234", kRowsMightMatch); +} + +class InclusiveMetricsEvaluatorMigratedTest : public InclusiveMetricsEvaluatorTest { + protected: + void SetUp() override { + schema_ = std::make_shared( + std::vector{ + SchemaField::MakeRequired(1, "id", int64()), + SchemaField::MakeOptional(2, "no_stats", int64()), + SchemaField::MakeRequired(3, "required", string()), + SchemaField::MakeOptional(4, "all_nulls", string()), + SchemaField::MakeOptional(5, "some_nulls", string()), + SchemaField::MakeOptional(6, "no_nulls", string()), + SchemaField::MakeOptional(7, "all_nans", float64()), + SchemaField::MakeOptional(8, "some_nans", float32()), + SchemaField::MakeOptional(9, "no_nans", float32()), + SchemaField::MakeOptional(10, "all_nulls_double", float64()), + SchemaField::MakeOptional(11, "all_nans_v1_stats", float32()), + SchemaField::MakeOptional(12, "nan_and_null_only", float64()), + SchemaField::MakeOptional(13, "no_nan_stats", float64()), + SchemaField::MakeOptional(14, "some_empty", string()), + }, + /*schema_id=*/0); + file1_ = PrepareDataFile1(); + file2_ = PrepareDataFile2(); + file3_ = PrepareDataFile3(); + file4_ = PrepareDataFile4(); + file5_ = PrepareDataFile5(); + } + + std::shared_ptr PrepareDataFile1() { + auto data_file = std::make_shared(); + data_file->file_path = "test_path1"; + data_file->file_format = FileFormatType::kParquet; + data_file->record_count = 50; + data_file->value_counts = { + {4, 50L}, {5, 50L}, {6, 50L}, {7, 50L}, {8, 50L}, {9, 50L}, + {10, 50L}, {11, 50L}, {12, 50L}, {13, 50L}, {14, 50L}, + }; + data_file->null_value_counts = { + {4, 50L}, {5, 10L}, {6, 0L}, {10, 50L}, {11, 0L}, {12, 1L}, {14, 0L}, + }; + data_file->nan_value_counts = { + {7, 50L}, + {8, 10L}, + {9, 0L}, + }; + data_file->lower_bounds = { + {1, Literal::Long(kIntMinValue).Serialize().value()}, + {11, Literal::Float(kFloatNan).Serialize().value()}, + {12, Literal::Double(kDoubleNan).Serialize().value()}, + {14, Literal::String("").Serialize().value()}, + }; + data_file->upper_bounds = { + {1, Literal::Long(kIntMaxValue).Serialize().value()}, + {11, Literal::Float(kFloatNan).Serialize().value()}, + {12, Literal::Double(kDoubleNan).Serialize().value()}, + {14, Literal::String("房东整租霍营小区二层两居室").Serialize().value()}, + }; + return data_file; + } + + std::shared_ptr PrepareDataFile2() { + auto data_file = std::make_shared(); + data_file->file_path = "test_path2"; + data_file->file_format = FileFormatType::kParquet; + data_file->record_count = 50; + data_file->value_counts = { + {3, 50L}, + }; + data_file->null_value_counts = { + {3, 0L}, + }; + data_file->nan_value_counts = {}; + data_file->lower_bounds = { + {3, Literal::String("aa").Serialize().value()}, + }; + data_file->upper_bounds = { + {3, Literal::String("dC").Serialize().value()}, + }; + return data_file; + } + + std::shared_ptr PrepareDataFile3() { + auto data_file = std::make_shared(); + data_file->file_path = "test_path3"; + data_file->file_format = FileFormatType::kParquet; + data_file->record_count = 50; + data_file->value_counts = { + {3, 50L}, + }; + data_file->null_value_counts = { + {3, 0L}, + }; + data_file->nan_value_counts = {}; + data_file->lower_bounds = { + {3, Literal::String("1str1").Serialize().value()}, + }; + data_file->upper_bounds = { + {3, Literal::String("3str3").Serialize().value()}, + }; + return data_file; + } + + std::shared_ptr PrepareDataFile4() { + auto data_file = std::make_shared(); + data_file->file_path = "test_path4"; + data_file->file_format = FileFormatType::kParquet; + data_file->record_count = 50; + data_file->value_counts = { + {3, 50L}, + }; + data_file->null_value_counts = { + {3, 0L}, + }; + data_file->nan_value_counts = {}; + data_file->lower_bounds = { + {3, Literal::String("abc").Serialize().value()}, + }; + data_file->upper_bounds = { + {3, Literal::String("イロハニホヘト").Serialize().value()}, + }; + return data_file; + } + + std::shared_ptr PrepareDataFile5() { + auto data_file = std::make_shared(); + data_file->file_path = "test_path5"; + data_file->file_format = FileFormatType::kParquet; + data_file->record_count = 50; + data_file->value_counts = { + {3, 50L}, + }; + data_file->null_value_counts = { + {3, 0L}, + }; + data_file->nan_value_counts = {}; + data_file->lower_bounds = { + {3, Literal::String("abc").Serialize().value()}, + }; + data_file->upper_bounds = { + {3, Literal::String("abcdefghi").Serialize().value()}, + }; + return data_file; + } + + void RunTest(const std::shared_ptr& expr, bool expected_result, + const std::shared_ptr& file, bool case_sensitive = true) { + ICEBERG_UNWRAP_OR_FAIL( + auto evaluator, InclusiveMetricsEvaluator::Make(expr, *schema_, case_sensitive)); + auto result = evaluator->Eval(*file); + ASSERT_TRUE(result.has_value()); + ASSERT_EQ(result.value(), expected_result) << expr->ToString(); + }; + + std::shared_ptr file1_; + std::shared_ptr file2_; + std::shared_ptr file3_; + std::shared_ptr file4_; + std::shared_ptr file5_; +}; + +TEST_F(InclusiveMetricsEvaluatorMigratedTest, CaseSensitiveTest) { + { + auto unbound = Expressions::Equal("id", Literal::Long(300)); + auto evaluator = InclusiveMetricsEvaluator::Make(unbound, *schema_, true); + ASSERT_TRUE(evaluator.has_value()); + } + { + auto unbound = Expressions::Equal("ID", Literal::Long(300)); + auto evaluator = InclusiveMetricsEvaluator::Make(unbound, *schema_, true); + ASSERT_FALSE(evaluator.has_value()); + ASSERT_EQ(evaluator.error().kind, ErrorKind::kInvalidExpression); + } + { + auto unbound = Expressions::Equal("ID", Literal::Long(300)); + auto evaluator = InclusiveMetricsEvaluator::Make(unbound, *schema_, false); + ASSERT_TRUE(evaluator.has_value()); + } +} + +TEST_F(InclusiveMetricsEvaluatorMigratedTest, AllNullsTest) { + RunTest(Expressions::NotNull("all_nulls"), kRowCannotMatch, file1_); + RunTest(Expressions::LessThan("all_nulls", Literal::String("a")), kRowCannotMatch, + file1_); + RunTest(Expressions::LessThanOrEqual("all_nulls", Literal::String("a")), + kRowCannotMatch, file1_); + RunTest(Expressions::GreaterThan("all_nulls", Literal::String("a")), kRowCannotMatch, + file1_); + RunTest(Expressions::GreaterThanOrEqual("all_nulls", Literal::String("a")), + kRowCannotMatch, file1_); + RunTest(Expressions::Equal("all_nulls", Literal::String("a")), kRowCannotMatch, file1_); + RunTest(Expressions::StartsWith("all_nulls", "a"), kRowCannotMatch, file1_); + RunTest(Expressions::NotStartsWith("all_nulls", "a"), kRowsMightMatch, file1_); + RunTest(Expressions::NotNull("some_nulls"), kRowsMightMatch, file1_); + RunTest(Expressions::NotNull("no_nulls"), kRowsMightMatch, file1_); +} + +TEST_F(InclusiveMetricsEvaluatorMigratedTest, NoNullsTest) { + RunTest(Expressions::IsNull("all_nulls"), kRowsMightMatch, file1_); + RunTest(Expressions::IsNull("some_nulls"), kRowsMightMatch, file1_); + RunTest(Expressions::IsNull("no_nulls"), kRowCannotMatch, file1_); +} + +TEST_F(InclusiveMetricsEvaluatorMigratedTest, IsNaNTest) { + RunTest(Expressions::IsNaN("all_nans"), kRowsMightMatch, file1_); + RunTest(Expressions::IsNaN("some_nans"), kRowsMightMatch, file1_); + RunTest(Expressions::IsNaN("no_nans"), kRowCannotMatch, file1_); + RunTest(Expressions::IsNaN("all_nulls_double"), kRowCannotMatch, file1_); + RunTest(Expressions::IsNaN("no_nan_stats"), kRowsMightMatch, file1_); + RunTest(Expressions::IsNaN("all_nans_v1_stats"), kRowsMightMatch, file1_); + RunTest(Expressions::IsNaN("nan_and_null_only"), kRowsMightMatch, file1_); +} + +TEST_F(InclusiveMetricsEvaluatorMigratedTest, NotNaNTest) { + RunTest(Expressions::NotNaN("all_nans"), kRowCannotMatch, file1_); + RunTest(Expressions::NotNaN("some_nans"), kRowsMightMatch, file1_); + RunTest(Expressions::NotNaN("no_nans"), kRowsMightMatch, file1_); + RunTest(Expressions::NotNaN("all_nulls_double"), kRowsMightMatch, file1_); + RunTest(Expressions::NotNaN("no_nan_stats"), kRowsMightMatch, file1_); + RunTest(Expressions::NotNaN("all_nans_v1_stats"), kRowsMightMatch, file1_); + RunTest(Expressions::NotNaN("nan_and_null_only"), kRowsMightMatch, file1_); +} + +TEST_F(InclusiveMetricsEvaluatorMigratedTest, RequiredColumnTest) { + RunTest(Expressions::NotNull("required"), kRowsMightMatch, file1_); + RunTest(Expressions::IsNull("required"), kRowCannotMatch, file1_); +} + +TEST_F(InclusiveMetricsEvaluatorMigratedTest, MissingColumnTest) { + auto expr = Expressions::LessThan("missing", Literal::Long(5)); + auto result = InclusiveMetricsEvaluator::Make(expr, *schema_, true); + ASSERT_FALSE(result.has_value()) << result.error().message; + ASSERT_TRUE(result.error().message.contains("Cannot find field 'missing' in struct")) + << result.error().message; +} + +TEST_F(InclusiveMetricsEvaluatorMigratedTest, MissingStatsTest) { + auto data_file = std::make_shared(); + data_file->file_path = "test_path"; + data_file->file_format = FileFormatType::kParquet; + data_file->record_count = 50; + + RunTest(Expressions::LessThan("no_stats", Literal::Long(5)), kRowsMightMatch, + data_file); + RunTest(Expressions::LessThanOrEqual("no_stats", Literal::Long(30)), kRowsMightMatch, + data_file); + RunTest(Expressions::Equal("no_stats", Literal::Long(70)), kRowsMightMatch, data_file); + RunTest(Expressions::GreaterThan("no_stats", Literal::Long(78)), kRowsMightMatch, + data_file); + RunTest(Expressions::GreaterThanOrEqual("no_stats", Literal::Long(90)), kRowsMightMatch, + data_file); + RunTest(Expressions::NotEqual("no_stats", Literal::Long(101)), kRowsMightMatch, + data_file); + RunTest(Expressions::IsNull("no_stats"), kRowsMightMatch, data_file); + RunTest(Expressions::NotNull("no_stats"), kRowsMightMatch, data_file); + RunTest(Expressions::IsNaN("some_nans"), kRowsMightMatch, data_file); + RunTest(Expressions::NotNaN("some_nans"), kRowsMightMatch, data_file); +} + +TEST_F(InclusiveMetricsEvaluatorMigratedTest, ZeroRecordFileTest) { + auto data_file = std::make_shared(); + data_file->file_path = "test_path"; + data_file->file_format = FileFormatType::kParquet; + data_file->record_count = 0; + + RunTest(Expressions::LessThan("no_stats", Literal::Long(5)), kRowCannotMatch, + data_file); + RunTest(Expressions::LessThanOrEqual("no_stats", Literal::Long(30)), kRowCannotMatch, + data_file); + RunTest(Expressions::Equal("no_stats", Literal::Long(70)), kRowCannotMatch, data_file); + RunTest(Expressions::GreaterThan("no_stats", Literal::Long(78)), kRowCannotMatch, + data_file); + RunTest(Expressions::GreaterThanOrEqual("no_stats", Literal::Long(90)), kRowCannotMatch, + data_file); + RunTest(Expressions::NotEqual("no_stats", Literal::Long(101)), kRowCannotMatch, + data_file); + RunTest(Expressions::IsNull("some_nulls"), kRowCannotMatch, data_file); + RunTest(Expressions::NotNull("some_nulls"), kRowCannotMatch, data_file); + RunTest(Expressions::IsNaN("some_nans"), kRowCannotMatch, data_file); + RunTest(Expressions::NotNaN("some_nans"), kRowCannotMatch, data_file); +} + +TEST_F(InclusiveMetricsEvaluatorMigratedTest, NotTest) { + RunTest(Expressions::Not(Expressions::LessThan("id", Literal::Long(kIntMinValue - 25))), + kRowsMightMatch, file1_); + RunTest( + Expressions::Not(Expressions::GreaterThan("id", Literal::Long(kIntMinValue - 25))), + kRowCannotMatch, file1_); +} + +TEST_F(InclusiveMetricsEvaluatorMigratedTest, AndTest) { + RunTest(Expressions::And( + Expressions::LessThan("id", Literal::Long(kIntMinValue - 25)), + Expressions::GreaterThanOrEqual("id", Literal::Long(kIntMinValue - 30))), + kRowCannotMatch, file1_); + RunTest(Expressions::And( + Expressions::LessThan("id", Literal::Long(kIntMinValue - 25)), + Expressions::GreaterThanOrEqual("id", Literal::Long(kIntMaxValue + 1))), + kRowCannotMatch, file1_); + RunTest( + Expressions::And(Expressions::GreaterThan("id", Literal::Long(kIntMinValue - 25)), + Expressions::LessThanOrEqual("id", Literal::Long(kIntMaxValue))), + kRowsMightMatch, file1_); +} + +TEST_F(InclusiveMetricsEvaluatorMigratedTest, OrTest) { + RunTest(Expressions::Or( + Expressions::LessThan("id", Literal::Long(kIntMinValue - 25)), + Expressions::GreaterThanOrEqual("id", Literal::Long(kIntMaxValue + 1))), + kRowCannotMatch, file1_); + RunTest(Expressions::Or( + Expressions::LessThan("id", Literal::Long(kIntMinValue - 25)), + Expressions::GreaterThanOrEqual("id", Literal::Long(kIntMaxValue - 19))), + kRowsMightMatch, file1_); +} + +TEST_F(InclusiveMetricsEvaluatorMigratedTest, IntegerLtTest) { + RunTest(Expressions::LessThan("id", Literal::Long(kIntMinValue - 25)), kRowCannotMatch, + file1_); + RunTest(Expressions::LessThan("id", Literal::Long(kIntMinValue)), kRowCannotMatch, + file1_); + RunTest(Expressions::LessThan("id", Literal::Long(kIntMinValue + 1)), kRowsMightMatch, + file1_); + RunTest(Expressions::LessThan("id", Literal::Long(kIntMaxValue)), kRowsMightMatch, + file1_); +} + +TEST_F(InclusiveMetricsEvaluatorMigratedTest, IntegerLtEqTest) { + RunTest(Expressions::LessThanOrEqual("id", Literal::Long(kIntMinValue - 25)), + kRowCannotMatch, file1_); + RunTest(Expressions::LessThanOrEqual("id", Literal::Long(kIntMinValue - 1)), + kRowCannotMatch, file1_); + RunTest(Expressions::LessThanOrEqual("id", Literal::Long(kIntMinValue)), + kRowsMightMatch, file1_); + RunTest(Expressions::LessThanOrEqual("id", Literal::Long(kIntMaxValue)), + kRowsMightMatch, file1_); +} + +TEST_F(InclusiveMetricsEvaluatorMigratedTest, IntegerGtTest) { + RunTest(Expressions::GreaterThan("id", Literal::Long(kIntMaxValue + 6)), + kRowCannotMatch, file1_); + RunTest(Expressions::GreaterThan("id", Literal::Long(kIntMaxValue)), kRowCannotMatch, + file1_); + RunTest(Expressions::GreaterThan("id", Literal::Long(kIntMaxValue - 1)), + kRowsMightMatch, file1_); + RunTest(Expressions::GreaterThan("id", Literal::Long(kIntMaxValue - 4)), + kRowsMightMatch, file1_); +} + +TEST_F(InclusiveMetricsEvaluatorMigratedTest, IntegerGtEqTest) { + RunTest(Expressions::GreaterThanOrEqual("id", Literal::Long(kIntMaxValue + 6)), + kRowCannotMatch, file1_); + RunTest(Expressions::GreaterThanOrEqual("id", Literal::Long(kIntMaxValue + 1)), + kRowCannotMatch, file1_); + RunTest(Expressions::GreaterThanOrEqual("id", Literal::Long(kIntMaxValue)), + kRowsMightMatch, file1_); + RunTest(Expressions::GreaterThanOrEqual("id", Literal::Long(kIntMaxValue - 4)), + kRowsMightMatch, file1_); +} + +TEST_F(InclusiveMetricsEvaluatorMigratedTest, IntegerEqTest) { + RunTest(Expressions::Equal("id", Literal::Long(kIntMinValue - 25)), kRowCannotMatch, + file1_); + RunTest(Expressions::Equal("id", Literal::Long(kIntMinValue - 1)), kRowCannotMatch, + file1_); + RunTest(Expressions::Equal("id", Literal::Long(kIntMinValue)), kRowsMightMatch, file1_); + RunTest(Expressions::Equal("id", Literal::Long(kIntMaxValue - 4)), kRowsMightMatch, + file1_); + RunTest(Expressions::Equal("id", Literal::Long(kIntMaxValue)), kRowsMightMatch, file1_); + RunTest(Expressions::Equal("id", Literal::Long(kIntMaxValue + 1)), kRowCannotMatch, + file1_); + RunTest(Expressions::Equal("id", Literal::Long(kIntMaxValue + 6)), kRowCannotMatch, + file1_); +} + +TEST_F(InclusiveMetricsEvaluatorMigratedTest, IntegerNotEqTest) { + RunTest(Expressions::NotEqual("id", Literal::Long(kIntMinValue - 25)), kRowsMightMatch, + file1_); + RunTest(Expressions::NotEqual("id", Literal::Long(kIntMinValue - 1)), kRowsMightMatch, + file1_); + RunTest(Expressions::NotEqual("id", Literal::Long(kIntMinValue)), kRowsMightMatch, + file1_); + RunTest(Expressions::NotEqual("id", Literal::Long(kIntMaxValue - 4)), kRowsMightMatch, + file1_); + RunTest(Expressions::NotEqual("id", Literal::Long(kIntMaxValue)), kRowsMightMatch, + file1_); + RunTest(Expressions::NotEqual("id", Literal::Long(kIntMaxValue + 1)), kRowsMightMatch, + file1_); + RunTest(Expressions::NotEqual("id", Literal::Long(kIntMaxValue + 6)), kRowsMightMatch, + file1_); +} + +TEST_F(InclusiveMetricsEvaluatorMigratedTest, IntegerNotEqRewrittenTest) { + RunTest(Expressions::Not(Expressions::Equal("id", Literal::Long(kIntMinValue - 25))), + kRowsMightMatch, file1_); + RunTest(Expressions::Not(Expressions::Equal("id", Literal::Long(kIntMinValue - 1))), + kRowsMightMatch, file1_); + RunTest(Expressions::Not(Expressions::Equal("id", Literal::Long(kIntMinValue))), + kRowsMightMatch, file1_); + RunTest(Expressions::Not(Expressions::Equal("id", Literal::Long(kIntMaxValue - 4))), + kRowsMightMatch, file1_); + RunTest(Expressions::Not(Expressions::Equal("id", Literal::Long(kIntMaxValue))), + kRowsMightMatch, file1_); + RunTest(Expressions::Not(Expressions::Equal("id", Literal::Long(kIntMaxValue + 1))), + kRowsMightMatch, file1_); + RunTest(Expressions::Not(Expressions::Equal("id", Literal::Long(kIntMaxValue + 6))), + kRowsMightMatch, file1_); +} + +TEST_F(InclusiveMetricsEvaluatorMigratedTest, CaseInsensitiveIntegerNotEqRewrittenTest) { + RunTest(Expressions::Not(Expressions::Equal("ID", Literal::Long(kIntMinValue - 25))), + kRowsMightMatch, file1_, false); + RunTest(Expressions::Not(Expressions::Equal("ID", Literal::Long(kIntMinValue - 1))), + kRowsMightMatch, file1_, false); + RunTest(Expressions::Not(Expressions::Equal("ID", Literal::Long(kIntMinValue))), + kRowsMightMatch, file1_, false); + RunTest(Expressions::Not(Expressions::Equal("ID", Literal::Long(kIntMaxValue - 4))), + kRowsMightMatch, file1_, false); + RunTest(Expressions::Not(Expressions::Equal("ID", Literal::Long(kIntMaxValue))), + kRowsMightMatch, file1_, false); + RunTest(Expressions::Not(Expressions::Equal("ID", Literal::Long(kIntMaxValue + 1))), + kRowsMightMatch, file1_, false); + RunTest(Expressions::Not(Expressions::Equal("ID", Literal::Long(kIntMaxValue + 6))), + kRowsMightMatch, file1_, false); +} + +TEST_F(InclusiveMetricsEvaluatorMigratedTest, CaseSensitiveIntegerNotEqRewrittenTest) { + auto expr = Expressions::Not(Expressions::Equal("ID", Literal::Long(5))); + auto result = InclusiveMetricsEvaluator::Make(expr, *schema_, true); + ASSERT_FALSE(result.has_value()) << result.error().message; + ASSERT_TRUE(result.error().message.contains("Cannot find field 'ID' in struct")) + << result.error().message; +} + +TEST_F(InclusiveMetricsEvaluatorMigratedTest, StringStartsWithTest) { + RunTest(Expressions::StartsWith("required", "a"), kRowsMightMatch, file1_); + RunTest(Expressions::StartsWith("required", "a"), kRowsMightMatch, file2_); + RunTest(Expressions::StartsWith("required", "aa"), kRowsMightMatch, file2_); + RunTest(Expressions::StartsWith("required", "aaa"), kRowsMightMatch, file2_); + RunTest(Expressions::StartsWith("required", "1s"), kRowsMightMatch, file3_); + RunTest(Expressions::StartsWith("required", "1str1x"), kRowsMightMatch, file3_); + RunTest(Expressions::StartsWith("required", "ff"), kRowsMightMatch, file4_); + + RunTest(Expressions::StartsWith("required", "aB"), kRowCannotMatch, file2_); + RunTest(Expressions::StartsWith("required", "dWX"), kRowCannotMatch, file2_); + + RunTest(Expressions::StartsWith("required", "5"), kRowCannotMatch, file3_); + RunTest(Expressions::StartsWith("required", "3str3x"), kRowCannotMatch, file3_); + RunTest(Expressions::StartsWith("some_empty", "房东整租霍"), kRowsMightMatch, file1_); + + RunTest(Expressions::StartsWith("all_nulls", ""), kRowCannotMatch, file1_); + auto above_max = TruncateUtils::TruncateLiteral(Literal::String("イロハニホヘト"), 4) + .value() + .ToString(); + RunTest(Expressions::StartsWith("required", above_max), kRowCannotMatch, file4_); +} + +TEST_F(InclusiveMetricsEvaluatorMigratedTest, StringNotStartsWithTest) { + RunTest(Expressions::NotStartsWith("required", "a"), kRowsMightMatch, file1_); + RunTest(Expressions::NotStartsWith("required", "a"), kRowsMightMatch, file2_); + RunTest(Expressions::NotStartsWith("required", "aa"), kRowsMightMatch, file2_); + RunTest(Expressions::NotStartsWith("required", "aaa"), kRowsMightMatch, file2_); + RunTest(Expressions::NotStartsWith("required", "1s"), kRowsMightMatch, file3_); + RunTest(Expressions::NotStartsWith("required", "1str1x"), kRowsMightMatch, file3_); + RunTest(Expressions::NotStartsWith("required", "ff"), kRowsMightMatch, file4_); + + RunTest(Expressions::NotStartsWith("required", "aB"), kRowsMightMatch, file2_); + RunTest(Expressions::NotStartsWith("required", "dWX"), kRowsMightMatch, file2_); + + RunTest(Expressions::NotStartsWith("required", "5"), kRowsMightMatch, file3_); + RunTest(Expressions::NotStartsWith("required", "3str3x"), kRowsMightMatch, file3_); + + auto above_max = TruncateUtils::TruncateLiteral(Literal::String("イロハニホヘト"), 4) + .value() + .ToString(); + RunTest(Expressions::NotStartsWith("required", above_max), kRowsMightMatch, file4_); + + RunTest(Expressions::NotStartsWith("required", "abc"), kRowCannotMatch, file5_); + RunTest(Expressions::NotStartsWith("required", "abcd"), kRowsMightMatch, file5_); +} + +TEST_F(InclusiveMetricsEvaluatorMigratedTest, IntegerInTest) { + RunTest(Expressions::In( + "id", {Literal::Long(kIntMinValue - 25), Literal::Long(kIntMinValue - 24)}), + kRowCannotMatch, file1_); + RunTest(Expressions::In( + "id", {Literal::Long(kIntMinValue - 2), Literal::Long(kIntMinValue - 1)}), + kRowCannotMatch, file1_); + RunTest(Expressions::In("id", + {Literal::Long(kIntMinValue - 1), Literal::Long(kIntMinValue)}), + kRowsMightMatch, file1_); + RunTest(Expressions::In( + "id", {Literal::Long(kIntMaxValue - 4), Literal::Long(kIntMaxValue - 3)}), + kRowsMightMatch, file1_); + RunTest(Expressions::In("id", + {Literal::Long(kIntMaxValue), Literal::Long(kIntMaxValue + 1)}), + kRowsMightMatch, file1_); + RunTest(Expressions::In( + "id", {Literal::Long(kIntMaxValue + 1), Literal::Long(kIntMaxValue + 2)}), + kRowCannotMatch, file1_); + RunTest(Expressions::In( + "id", {Literal::Long(kIntMaxValue + 6), Literal::Long(kIntMaxValue + 7)}), + kRowCannotMatch, file1_); + + RunTest(Expressions::In("all_nulls", {Literal::String("abc"), Literal::String("def")}), + kRowCannotMatch, file1_); + RunTest(Expressions::In("some_nulls", {Literal::String("abc"), Literal::String("def")}), + kRowsMightMatch, file1_); + RunTest(Expressions::In("no_nulls", {Literal::String("abc"), Literal::String("def")}), + kRowsMightMatch, file1_); + + std::vector ids; + for (int i = -400; i <= 0; i++) { + ids.emplace_back(Literal::Long(i)); + } + RunTest(Expressions::In("id", ids), kRowsMightMatch, file1_); +} + +TEST_F(InclusiveMetricsEvaluatorMigratedTest, IntegerNotInTest) { + RunTest(Expressions::NotIn( + "id", {Literal::Long(kIntMinValue - 25), Literal::Long(kIntMinValue - 24)}), + kRowsMightMatch, file1_); + RunTest(Expressions::NotIn( + "id", {Literal::Long(kIntMinValue - 2), Literal::Long(kIntMinValue - 1)}), + kRowsMightMatch, file1_); + RunTest(Expressions::NotIn( + "id", {Literal::Long(kIntMinValue - 1), Literal::Long(kIntMinValue)}), + kRowsMightMatch, file1_); + RunTest(Expressions::NotIn( + "id", {Literal::Long(kIntMaxValue - 4), Literal::Long(kIntMaxValue - 3)}), + kRowsMightMatch, file1_); + RunTest(Expressions::NotIn( + "id", {Literal::Long(kIntMaxValue), Literal::Long(kIntMaxValue + 1)}), + kRowsMightMatch, file1_); + RunTest(Expressions::NotIn( + "id", {Literal::Long(kIntMaxValue + 1), Literal::Long(kIntMaxValue + 2)}), + kRowsMightMatch, file1_); + RunTest(Expressions::NotIn( + "id", {Literal::Long(kIntMaxValue + 6), Literal::Long(kIntMaxValue + 7)}), + kRowsMightMatch, file1_); + + RunTest( + Expressions::NotIn("all_nulls", {Literal::String("abc"), Literal::String("def")}), + kRowsMightMatch, file1_); + RunTest( + Expressions::NotIn("some_nulls", {Literal::String("abc"), Literal::String("def")}), + kRowsMightMatch, file1_); + RunTest( + Expressions::NotIn("no_nulls", {Literal::String("abc"), Literal::String("def")}), + kRowsMightMatch, file1_); + + std::vector ids; + for (int i = -400; i <= 0; i++) { + ids.emplace_back(Literal::Long(i)); + } + RunTest(Expressions::NotIn("id", ids), kRowsMightMatch, file1_); } } // namespace iceberg From c20b738fb6a03944650066c9f15eb8dc4953114b Mon Sep 17 00:00:00 2001 From: "xiao.dong" Date: Tue, 2 Dec 2025 09:45:03 +0800 Subject: [PATCH 11/15] debug startsWith case in windows --- .../expression/inclusive_metrics_evaluator.cc | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/iceberg/expression/inclusive_metrics_evaluator.cc b/src/iceberg/expression/inclusive_metrics_evaluator.cc index d2f513c0a..ba24a2920 100644 --- a/src/iceberg/expression/inclusive_metrics_evaluator.cc +++ b/src/iceberg/expression/inclusive_metrics_evaluator.cc @@ -291,13 +291,13 @@ class InclusiveMetricsVisitor : public BoundVisitor { Result StartsWith(const std::shared_ptr& expr, const Literal& lit) override { - auto transform = internal::checked_pointer_cast(expr); - if (transform != nullptr && - transform->transform()->transform_type() != TransformType::kIdentity) { - // truncate must be rewritten in binding. the result is either always or never - // compatible - return kRowsMightMatch; - } + // auto transform = internal::checked_pointer_cast(expr); + // if (transform != nullptr && + // transform->transform()->transform_type() != TransformType::kIdentity) { + // // truncate must be rewritten in binding. the result is either always or never + // // compatible + // return kRowsMightMatch; + // } int32_t id = expr->reference()->field().field_id(); if (ContainsNullsOnly(id)) { From 0ce98189deb1f9b068ea256ef00369021a3310bd Mon Sep 17 00:00:00 2001 From: "xiao.dong" Date: Tue, 2 Dec 2025 10:21:02 +0800 Subject: [PATCH 12/15] fix cast --- .../expression/inclusive_metrics_evaluator.cc | 34 +++++++++---------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/src/iceberg/expression/inclusive_metrics_evaluator.cc b/src/iceberg/expression/inclusive_metrics_evaluator.cc index ba24a2920..ac13c4a7b 100644 --- a/src/iceberg/expression/inclusive_metrics_evaluator.cc +++ b/src/iceberg/expression/inclusive_metrics_evaluator.cc @@ -86,7 +86,7 @@ class InclusiveMetricsVisitor : public BoundVisitor { if (ContainsNullsOnly(id)) { return kRowCannotMatch; } - if (internal::checked_pointer_cast(expr) == nullptr) { + if (std::dynamic_pointer_cast(expr) == nullptr) { return kRowsMightMatch; } auto it = data_file_.nan_value_counts.find(id); @@ -97,7 +97,7 @@ class InclusiveMetricsVisitor : public BoundVisitor { } Result NotNaN(const std::shared_ptr& expr) override { - if (internal::checked_pointer_cast(expr) == nullptr) { + if (std::dynamic_pointer_cast(expr) == nullptr) { // identity transforms are already removed by this time return kRowsMightMatch; } @@ -291,13 +291,13 @@ class InclusiveMetricsVisitor : public BoundVisitor { Result StartsWith(const std::shared_ptr& expr, const Literal& lit) override { - // auto transform = internal::checked_pointer_cast(expr); - // if (transform != nullptr && - // transform->transform()->transform_type() != TransformType::kIdentity) { - // // truncate must be rewritten in binding. the result is either always or never - // // compatible - // return kRowsMightMatch; - // } + auto transform = std::dynamic_pointer_cast(expr); + if (transform != nullptr && + transform->transform()->transform_type() != TransformType::kIdentity) { + // truncate must be rewritten in binding. the result is either always or never + // compatible + return kRowsMightMatch; + } int32_t id = expr->reference()->field().field_id(); if (ContainsNullsOnly(id)) { @@ -410,10 +410,10 @@ class InclusiveMetricsVisitor : public BoundVisitor { } Result> LowerBound(const std::shared_ptr& expr) { - if (auto reference = internal::checked_pointer_cast(expr); + if (auto reference = std::dynamic_pointer_cast(expr); reference != nullptr) { return ParseLowerBound(*reference); - } else if (auto transform = internal::checked_pointer_cast(expr); + } else if (auto transform = std::dynamic_pointer_cast(expr); transform != nullptr) { return TransformLowerBound(*transform); } else { @@ -423,10 +423,10 @@ class InclusiveMetricsVisitor : public BoundVisitor { } Result> UpperBound(const std::shared_ptr& expr) { - if (auto reference = internal::checked_pointer_cast(expr); + if (auto reference = std::dynamic_pointer_cast(expr); reference != nullptr) { return ParseUpperBound(*reference); - } else if (auto transform = internal::checked_pointer_cast(expr); + } else if (auto transform = std::dynamic_pointer_cast(expr); transform != nullptr) { return TransformUpperBound(*transform); } else { @@ -439,7 +439,7 @@ class InclusiveMetricsVisitor : public BoundVisitor { int32_t id = ref.field().field_id(); auto type = ref.type(); if (!type->is_primitive()) { - return Invalid("Lower bound of non-primitive type is not supported."); + return NotSupported("Lower bound of non-primitive type is not supported."); } auto primitive_type = internal::checked_pointer_cast(type); if (!data_file_.lower_bounds.empty() && data_file_.lower_bounds.contains(id)) { @@ -456,7 +456,7 @@ class InclusiveMetricsVisitor : public BoundVisitor { int32_t id = ref.field().field_id(); auto type = ref.type(); if (!type->is_primitive()) { - return Invalid("Upper bound of non-primitive type is not supported."); + return NotSupported("Upper bound of non-primitive type is not supported."); } auto primitive_type = internal::checked_pointer_cast(type); if (!data_file_.upper_bounds.empty() && data_file_.upper_bounds.contains(id)) { @@ -499,10 +499,10 @@ class InclusiveMetricsVisitor : public BoundVisitor { /** Returns true if the expression term produces a non-null value for non-null input. */ bool IsNonNullPreserving(const std::shared_ptr& expr) { - if (auto reference = internal::checked_pointer_cast(expr); + if (auto reference = std::dynamic_pointer_cast(expr); reference != nullptr) { return true; - } else if (auto transform = internal::checked_pointer_cast(expr); + } else if (auto transform = std::dynamic_pointer_cast(expr); transform != nullptr) { return transform->transform()->PreservesOrder(); } From 786b5f4f78efb0aec45ab5f64268cfce9506ac0a Mon Sep 17 00:00:00 2001 From: "xiao.dong" Date: Tue, 2 Dec 2025 18:34:03 +0800 Subject: [PATCH 13/15] add transform case --- src/iceberg/expression/expressions.cc | 162 ------ src/iceberg/expression/expressions.h | 125 ++++- .../expression/inclusive_metrics_evaluator.cc | 3 +- src/iceberg/expression/literal.cc | 5 +- src/iceberg/test/CMakeLists.txt | 1 + ...e_metrics_evaluator_with_transform_test.cc | 485 ++++++++++++++++++ src/iceberg/test/meson.build | 1 + 7 files changed, 596 insertions(+), 186 deletions(-) create mode 100644 src/iceberg/test/inclusive_metrics_evaluator_with_transform_test.cc diff --git a/src/iceberg/expression/expressions.cc b/src/iceberg/expression/expressions.cc index 786cc0ab7..7eef60232 100644 --- a/src/iceberg/expression/expressions.cc +++ b/src/iceberg/expression/expressions.cc @@ -156,56 +156,21 @@ std::shared_ptr> Expressions::IsNull( return IsNull(Ref(std::move(name))); } -template -std::shared_ptr> Expressions::IsNull( - std::shared_ptr> expr) { - ICEBERG_ASSIGN_OR_THROW( - auto pred, - UnboundPredicateImpl::Make(Expression::Operation::kIsNull, std::move(expr))); - return pred; -} - std::shared_ptr> Expressions::NotNull( std::string name) { return NotNull(Ref(std::move(name))); } -template -std::shared_ptr> Expressions::NotNull( - std::shared_ptr> expr) { - ICEBERG_ASSIGN_OR_THROW( - auto pred, - UnboundPredicateImpl::Make(Expression::Operation::kNotNull, std::move(expr))); - return pred; -} - std::shared_ptr> Expressions::IsNaN( std::string name) { return IsNaN(Ref(std::move(name))); } -template -std::shared_ptr> Expressions::IsNaN( - std::shared_ptr> expr) { - ICEBERG_ASSIGN_OR_THROW(auto pred, UnboundPredicateImpl::Make( - Expression::Operation::kIsNan, std::move(expr))); - return pred; -} - std::shared_ptr> Expressions::NotNaN( std::string name) { return NotNaN(Ref(std::move(name))); } -template -std::shared_ptr> Expressions::NotNaN( - std::shared_ptr> expr) { - ICEBERG_ASSIGN_OR_THROW( - auto pred, - UnboundPredicateImpl::Make(Expression::Operation::kNotNan, std::move(expr))); - return pred; -} - // Template implementations for comparison predicates std::shared_ptr> Expressions::LessThan( @@ -213,85 +178,31 @@ std::shared_ptr> Expressions::LessThan( return LessThan(Ref(std::move(name)), std::move(value)); } -template -std::shared_ptr> Expressions::LessThan( - std::shared_ptr> expr, Literal value) { - ICEBERG_ASSIGN_OR_THROW( - auto pred, UnboundPredicateImpl::Make(Expression::Operation::kLt, - std::move(expr), std::move(value))); - return pred; -} - std::shared_ptr> Expressions::LessThanOrEqual( std::string name, Literal value) { return LessThanOrEqual(Ref(std::move(name)), std::move(value)); } -template -std::shared_ptr> Expressions::LessThanOrEqual( - std::shared_ptr> expr, Literal value) { - ICEBERG_ASSIGN_OR_THROW( - auto pred, UnboundPredicateImpl::Make(Expression::Operation::kLtEq, - std::move(expr), std::move(value))); - return pred; -} - std::shared_ptr> Expressions::GreaterThan( std::string name, Literal value) { return GreaterThan(Ref(std::move(name)), std::move(value)); } -template -std::shared_ptr> Expressions::GreaterThan( - std::shared_ptr> expr, Literal value) { - ICEBERG_ASSIGN_OR_THROW( - auto pred, UnboundPredicateImpl::Make(Expression::Operation::kGt, - std::move(expr), std::move(value))); - return pred; -} - std::shared_ptr> Expressions::GreaterThanOrEqual( std::string name, Literal value) { return GreaterThanOrEqual(Ref(std::move(name)), std::move(value)); } -template -std::shared_ptr> Expressions::GreaterThanOrEqual( - std::shared_ptr> expr, Literal value) { - ICEBERG_ASSIGN_OR_THROW( - auto pred, UnboundPredicateImpl::Make(Expression::Operation::kGtEq, - std::move(expr), std::move(value))); - return pred; -} - std::shared_ptr> Expressions::Equal(std::string name, Literal value) { return Equal(Ref(std::move(name)), std::move(value)); } -template -std::shared_ptr> Expressions::Equal( - std::shared_ptr> expr, Literal value) { - ICEBERG_ASSIGN_OR_THROW( - auto pred, UnboundPredicateImpl::Make(Expression::Operation::kEq, - std::move(expr), std::move(value))); - return pred; -} - std::shared_ptr> Expressions::NotEqual( std::string name, Literal value) { return NotEqual(Ref(std::move(name)), std::move(value)); } -template -std::shared_ptr> Expressions::NotEqual( - std::shared_ptr> expr, Literal value) { - ICEBERG_ASSIGN_OR_THROW( - auto pred, UnboundPredicateImpl::Make(Expression::Operation::kNotEq, - std::move(expr), std::move(value))); - return pred; -} - // String predicates std::shared_ptr> Expressions::StartsWith( @@ -299,31 +210,11 @@ std::shared_ptr> Expressions::StartsWith( return StartsWith(Ref(std::move(name)), std::move(value)); } -template -std::shared_ptr> Expressions::StartsWith( - std::shared_ptr> expr, std::string value) { - ICEBERG_ASSIGN_OR_THROW( - auto pred, - UnboundPredicateImpl::Make(Expression::Operation::kStartsWith, std::move(expr), - Literal::String(std::move(value)))); - return pred; -} - std::shared_ptr> Expressions::NotStartsWith( std::string name, std::string value) { return NotStartsWith(Ref(std::move(name)), std::move(value)); } -template -std::shared_ptr> Expressions::NotStartsWith( - std::shared_ptr> expr, std::string value) { - ICEBERG_ASSIGN_OR_THROW( - auto pred, - UnboundPredicateImpl::Make(Expression::Operation::kNotStartsWith, - std::move(expr), Literal::String(std::move(value)))); - return pred; -} - // Template implementations for set predicates std::shared_ptr> Expressions::In( @@ -331,51 +222,21 @@ std::shared_ptr> Expressions::In( return In(Ref(std::move(name)), std::move(values)); } -template -std::shared_ptr> Expressions::In( - std::shared_ptr> expr, std::vector values) { - ICEBERG_ASSIGN_OR_THROW( - auto pred, UnboundPredicateImpl::Make(Expression::Operation::kIn, - std::move(expr), std::move(values))); - return pred; -} - std::shared_ptr> Expressions::In( std::string name, std::initializer_list values) { return In(Ref(std::move(name)), std::vector(values)); } -template -std::shared_ptr> Expressions::In( - std::shared_ptr> expr, std::initializer_list values) { - return In(std::move(expr), std::vector(values)); -} - std::shared_ptr> Expressions::NotIn( std::string name, std::vector values) { return NotIn(Ref(std::move(name)), std::move(values)); } -template -std::shared_ptr> Expressions::NotIn( - std::shared_ptr> expr, std::vector values) { - ICEBERG_ASSIGN_OR_THROW( - auto pred, UnboundPredicateImpl::Make(Expression::Operation::kNotIn, - std::move(expr), std::move(values))); - return pred; -} - std::shared_ptr> Expressions::NotIn( std::string name, std::initializer_list values) { return NotIn(Ref(std::move(name)), std::vector(values)); } -template -std::shared_ptr> Expressions::NotIn( - std::shared_ptr> expr, std::initializer_list values) { - return NotIn(expr, std::vector(values)); -} - // Template implementations for generic predicate factory std::shared_ptr> Expressions::Predicate( @@ -404,29 +265,6 @@ std::shared_ptr> Expressions::Predicate( return pred; } -template -std::shared_ptr> Expressions::Predicate( - Expression::Operation op, std::shared_ptr> expr, - std::vector values) { - ICEBERG_ASSIGN_OR_THROW( - auto pred, UnboundPredicateImpl::Make(op, std::move(expr), std::move(values))); - return pred; -} - -template -std::shared_ptr> Expressions::Predicate( - Expression::Operation op, std::shared_ptr> expr, - std::initializer_list values) { - return Predicate(op, std::move(expr), std::vector(values)); -} - -template -std::shared_ptr> Expressions::Predicate( - Expression::Operation op, std::shared_ptr> expr) { - ICEBERG_ASSIGN_OR_THROW(auto pred, UnboundPredicateImpl::Make(op, std::move(expr))); - return pred; -} - // Constants std::shared_ptr Expressions::AlwaysTrue() { return True::Instance(); } diff --git a/src/iceberg/expression/expressions.h b/src/iceberg/expression/expressions.h index cb1d6df7e..92c523ca7 100644 --- a/src/iceberg/expression/expressions.h +++ b/src/iceberg/expression/expressions.h @@ -27,7 +27,6 @@ #include #include -#include "iceberg/exception.h" #include "iceberg/expression/aggregate.h" #include "iceberg/expression/literal.h" #include "iceberg/expression/predicate.h" @@ -152,7 +151,12 @@ class ICEBERG_EXPORT Expressions { /// \brief Create an IS NULL predicate for an unbound term. template static std::shared_ptr> IsNull( - std::shared_ptr> expr); + std::shared_ptr> expr) { + ICEBERG_ASSIGN_OR_THROW( + auto pred, + UnboundPredicateImpl::Make(Expression::Operation::kIsNull, std::move(expr))); + return pred; + } /// \brief Create a NOT NULL predicate for a field name. static std::shared_ptr> NotNull(std::string name); @@ -160,7 +164,12 @@ class ICEBERG_EXPORT Expressions { /// \brief Create a NOT NULL predicate for an unbound term. template static std::shared_ptr> NotNull( - std::shared_ptr> expr); + std::shared_ptr> expr) { + ICEBERG_ASSIGN_OR_THROW( + auto pred, + UnboundPredicateImpl::Make(Expression::Operation::kNotNull, std::move(expr))); + return pred; + } /// \brief Create an IS NaN predicate for a field name. static std::shared_ptr> IsNaN(std::string name); @@ -168,7 +177,12 @@ class ICEBERG_EXPORT Expressions { /// \brief Create an IS NaN predicate for an unbound term. template static std::shared_ptr> IsNaN( - std::shared_ptr> expr); + std::shared_ptr> expr) { + ICEBERG_ASSIGN_OR_THROW( + auto pred, + UnboundPredicateImpl::Make(Expression::Operation::kIsNan, std::move(expr))); + return pred; + } /// \brief Create a NOT NaN predicate for a field name. static std::shared_ptr> NotNaN(std::string name); @@ -176,7 +190,12 @@ class ICEBERG_EXPORT Expressions { /// \brief Create a NOT NaN predicate for an unbound term. template static std::shared_ptr> NotNaN( - std::shared_ptr> expr); + std::shared_ptr> expr) { + ICEBERG_ASSIGN_OR_THROW( + auto pred, + UnboundPredicateImpl::Make(Expression::Operation::kNotNan, std::move(expr))); + return pred; + } // Comparison predicates @@ -187,7 +206,12 @@ class ICEBERG_EXPORT Expressions { /// \brief Create a less than predicate for an unbound term. template static std::shared_ptr> LessThan( - std::shared_ptr> expr, Literal value); + std::shared_ptr> expr, Literal value) { + ICEBERG_ASSIGN_OR_THROW( + auto pred, UnboundPredicateImpl::Make(Expression::Operation::kLt, + std::move(expr), std::move(value))); + return pred; + } /// \brief Create a less than or equal predicate for a field name. static std::shared_ptr> LessThanOrEqual( @@ -196,7 +220,12 @@ class ICEBERG_EXPORT Expressions { /// \brief Create a less than or equal predicate for an unbound term. template static std::shared_ptr> LessThanOrEqual( - std::shared_ptr> expr, Literal value); + std::shared_ptr> expr, Literal value) { + ICEBERG_ASSIGN_OR_THROW( + auto pred, UnboundPredicateImpl::Make(Expression::Operation::kLtEq, + std::move(expr), std::move(value))); + return pred; + } /// \brief Create a greater than predicate for a field name. static std::shared_ptr> GreaterThan( @@ -205,7 +234,12 @@ class ICEBERG_EXPORT Expressions { /// \brief Create a greater than predicate for an unbound term. template static std::shared_ptr> GreaterThan( - std::shared_ptr> expr, Literal value); + std::shared_ptr> expr, Literal value) { + ICEBERG_ASSIGN_OR_THROW( + auto pred, UnboundPredicateImpl::Make(Expression::Operation::kGt, + std::move(expr), std::move(value))); + return pred; + } /// \brief Create a greater than or equal predicate for a field name. static std::shared_ptr> GreaterThanOrEqual( @@ -214,7 +248,12 @@ class ICEBERG_EXPORT Expressions { /// \brief Create a greater than or equal predicate for an unbound term. template static std::shared_ptr> GreaterThanOrEqual( - std::shared_ptr> expr, Literal value); + std::shared_ptr> expr, Literal value) { + ICEBERG_ASSIGN_OR_THROW( + auto pred, UnboundPredicateImpl::Make(Expression::Operation::kGtEq, + std::move(expr), std::move(value))); + return pred; + } /// \brief Create an equal predicate for a field name. static std::shared_ptr> Equal(std::string name, @@ -223,7 +262,12 @@ class ICEBERG_EXPORT Expressions { /// \brief Create an equal predicate for an unbound term. template static std::shared_ptr> Equal( - std::shared_ptr> expr, Literal value); + std::shared_ptr> expr, Literal value) { + ICEBERG_ASSIGN_OR_THROW( + auto pred, UnboundPredicateImpl::Make(Expression::Operation::kEq, + std::move(expr), std::move(value))); + return pred; + } /// \brief Create a not equal predicate for a field name. static std::shared_ptr> NotEqual(std::string name, @@ -232,7 +276,12 @@ class ICEBERG_EXPORT Expressions { /// \brief Create a not equal predicate for an unbound term. template static std::shared_ptr> NotEqual( - std::shared_ptr> expr, Literal value); + std::shared_ptr> expr, Literal value) { + ICEBERG_ASSIGN_OR_THROW( + auto pred, UnboundPredicateImpl::Make(Expression::Operation::kNotEq, + std::move(expr), std::move(value))); + return pred; + } // String predicates @@ -243,7 +292,13 @@ class ICEBERG_EXPORT Expressions { /// \brief Create a starts with predicate for an unbound term. template static std::shared_ptr> StartsWith( - std::shared_ptr> expr, std::string value); + std::shared_ptr> expr, std::string value) { + ICEBERG_ASSIGN_OR_THROW( + auto pred, + UnboundPredicateImpl::Make(Expression::Operation::kStartsWith, std::move(expr), + Literal::String(std::move(value)))); + return pred; + } /// \brief Create a not starts with predicate for a field name. static std::shared_ptr> NotStartsWith( @@ -252,7 +307,13 @@ class ICEBERG_EXPORT Expressions { /// \brief Create a not starts with predicate for an unbound term. template static std::shared_ptr> NotStartsWith( - std::shared_ptr> expr, std::string value); + std::shared_ptr> expr, std::string value) { + ICEBERG_ASSIGN_OR_THROW( + auto pred, UnboundPredicateImpl::Make(Expression::Operation::kNotStartsWith, + std::move(expr), + Literal::String(std::move(value)))); + return pred; + } // Set predicates @@ -263,7 +324,12 @@ class ICEBERG_EXPORT Expressions { /// \brief Create an IN predicate for an unbound term. template static std::shared_ptr> In(std::shared_ptr> expr, - std::vector values); + std::vector values) { + ICEBERG_ASSIGN_OR_THROW( + auto pred, UnboundPredicateImpl::Make(Expression::Operation::kIn, + std::move(expr), std::move(values))); + return pred; + } /// \brief Create an IN predicate for a field name with initializer list. static std::shared_ptr> In( @@ -272,7 +338,9 @@ class ICEBERG_EXPORT Expressions { /// \brief Create an IN predicate for an unbound term with initializer list. template static std::shared_ptr> In( - std::shared_ptr> expr, std::initializer_list values); + std::shared_ptr> expr, std::initializer_list values) { + return In(std::move(expr), std::vector(values)); + } /// \brief Create a NOT IN predicate for a field name. static std::shared_ptr> NotIn( @@ -281,7 +349,12 @@ class ICEBERG_EXPORT Expressions { /// \brief Create a NOT IN predicate for an unbound term. template static std::shared_ptr> NotIn( - std::shared_ptr> expr, std::vector values); + std::shared_ptr> expr, std::vector values) { + ICEBERG_ASSIGN_OR_THROW( + auto pred, UnboundPredicateImpl::Make(Expression::Operation::kNotIn, + std::move(expr), std::move(values))); + return pred; + } /// \brief Create a NOT IN predicate for a field name with initializer list. static std::shared_ptr> NotIn( @@ -290,7 +363,9 @@ class ICEBERG_EXPORT Expressions { /// \brief Create a NOT IN predicate for an unbound term with initializer list. template static std::shared_ptr> NotIn( - std::shared_ptr> expr, std::initializer_list values); + std::shared_ptr> expr, std::initializer_list values) { + return NotIn(expr, std::vector(values)); + } // Generic predicate factory @@ -314,18 +389,28 @@ class ICEBERG_EXPORT Expressions { template static std::shared_ptr> Predicate( Expression::Operation op, std::shared_ptr> expr, - std::vector values); + std::vector values) { + ICEBERG_ASSIGN_OR_THROW( + auto pred, UnboundPredicateImpl::Make(op, std::move(expr), std::move(values))); + return pred; + } /// \brief Create a predicate with operation and multiple values. template static std::shared_ptr> Predicate( Expression::Operation op, std::shared_ptr> expr, - std::initializer_list values); + std::initializer_list values) { + return Predicate(op, std::move(expr), std::vector(values)); + } /// \brief Create a unary predicate for unbound term. template static std::shared_ptr> Predicate( - Expression::Operation op, std::shared_ptr> expr); + Expression::Operation op, std::shared_ptr> expr) { + ICEBERG_ASSIGN_OR_THROW(auto pred, + UnboundPredicateImpl::Make(op, std::move(expr))); + return pred; + } // Constants diff --git a/src/iceberg/expression/inclusive_metrics_evaluator.cc b/src/iceberg/expression/inclusive_metrics_evaluator.cc index ac13c4a7b..4c20e7c81 100644 --- a/src/iceberg/expression/inclusive_metrics_evaluator.cc +++ b/src/iceberg/expression/inclusive_metrics_evaluator.cc @@ -176,7 +176,6 @@ class InclusiveMetricsVisitor : public BoundVisitor { return kRowsMightMatch; } const auto& upper = upper_result.value(); - if (upper <= lit) { return kRowCannotMatch; } @@ -486,7 +485,7 @@ class InclusiveMetricsVisitor : public BoundVisitor { Result> TransformUpperBound(BoundTransform& boundTransform) { auto transform = boundTransform.transform(); if (transform->PreservesOrder()) { - ICEBERG_ASSIGN_OR_RAISE(auto upper, ParseLowerBound(*boundTransform.reference())); + ICEBERG_ASSIGN_OR_RAISE(auto upper, ParseUpperBound(*boundTransform.reference())); ICEBERG_ASSIGN_OR_RAISE(auto transform_func, transform->Bind(boundTransform.reference()->type())); if (upper.has_value()) { diff --git a/src/iceberg/expression/literal.cc b/src/iceberg/expression/literal.cc index c1aad90df..f2955b515 100644 --- a/src/iceberg/expression/literal.cc +++ b/src/iceberg/expression/literal.cc @@ -347,8 +347,9 @@ bool Literal::operator==(const Literal& other) const { return (*this <=> other) // Three-way comparison operator std::partial_ordering Literal::operator<=>(const Literal& other) const { - // If types are different, comparison is unordered - if (type_->type_id() != other.type_->type_id()) { + // If types are different and value type not the same, comparison is unordered + if (value_.index() != other.value_.index() && + type_->type_id() != other.type_->type_id()) { return std::partial_ordering::unordered; } diff --git a/src/iceberg/test/CMakeLists.txt b/src/iceberg/test/CMakeLists.txt index c2ecb5350..08edbb209 100644 --- a/src/iceberg/test/CMakeLists.txt +++ b/src/iceberg/test/CMakeLists.txt @@ -96,6 +96,7 @@ add_iceberg_test(expression_test expression_visitor_test.cc literal_test.cc inclusive_metrics_evaluator_test.cc + inclusive_metrics_evaluator_with_transform_test.cc predicate_test.cc) add_iceberg_test(json_serde_test diff --git a/src/iceberg/test/inclusive_metrics_evaluator_with_transform_test.cc b/src/iceberg/test/inclusive_metrics_evaluator_with_transform_test.cc new file mode 100644 index 000000000..216bfe363 --- /dev/null +++ b/src/iceberg/test/inclusive_metrics_evaluator_with_transform_test.cc @@ -0,0 +1,485 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include +#include + +#include + +#include "iceberg/expression/expressions.h" +#include "iceberg/expression/inclusive_metrics_evaluator.h" +#include "iceberg/expression/term.h" +#include "iceberg/manifest/manifest_entry.h" +#include "iceberg/schema.h" +#include "iceberg/test/matchers.h" +#include "iceberg/type.h" + +namespace iceberg { + +namespace { +constexpr bool kRowsMightMatch = true; +constexpr bool kRowCannotMatch = false; +constexpr int64_t kIntMinValue = 30; +constexpr int64_t kIntMaxValue = 79; +constexpr int64_t kMicrosPerDay = 86'400'000'000LL; +constexpr int64_t kTsMinValue = 30 * kMicrosPerDay; +constexpr int64_t kTsMaxValue = 79 * kMicrosPerDay; + +std::shared_ptr> ToBoundTransform( + const std::shared_ptr& transform) { + return std::static_pointer_cast>(transform); +} +} // namespace + +class InclusiveMetricsEvaluatorWithTransformTest : public ::testing::Test { + protected: + void SetUp() override { + schema_ = std::make_shared( + std::vector{ + SchemaField::MakeRequired(1, "id", int64()), + SchemaField::MakeRequired(2, "ts", timestamp_tz()), + SchemaField::MakeOptional(3, "all_nulls", int64()), + SchemaField::MakeOptional(4, "all_nulls_str", string()), + SchemaField::MakeOptional(5, "no_stats", int64()), + SchemaField::MakeOptional(6, "str", string()), + }, + /*schema_id=*/0); + + data_file_ = std::make_shared(); + data_file_->file_path = "file.avro"; + data_file_->file_format = FileFormatType::kAvro; + data_file_->record_count = 50; + data_file_->value_counts = { + {1, 50L}, + {2, 50L}, + {3, 50L}, + {4, 50L}, + }; + data_file_->null_value_counts = { + {1, 0L}, + {2, 0L}, + {3, 50L}, + {4, 50L}, + }; + data_file_->nan_value_counts.clear(); + data_file_->lower_bounds = { + {2, Literal::TimestampTz(kTsMinValue).Serialize().value()}, + {6, Literal::String("abc").Serialize().value()}, + }; + data_file_->upper_bounds = { + {2, Literal::TimestampTz(kTsMaxValue).Serialize().value()}, + {6, Literal::String("abe").Serialize().value()}, + }; + } + + void ExpectShouldRead(const std::shared_ptr& expr, bool expected_result, + std::shared_ptr file = nullptr, + bool case_sensitive = true) { + auto target_file = file ? file : data_file_; + ICEBERG_UNWRAP_OR_FAIL( + auto evaluator, InclusiveMetricsEvaluator::Make(expr, *schema_, case_sensitive)); + auto eval_result = evaluator->Eval(*target_file); + ASSERT_TRUE(eval_result.has_value()); + ASSERT_EQ(eval_result.value(), expected_result) << expr->ToString(); + } + + std::vector> MissingStatsExpressions() const { + auto truncate_no_stats = ToBoundTransform(Expressions::Truncate("no_stats", 10)); + return { + Expressions::LessThan(truncate_no_stats, Literal::Long(5)), + Expressions::LessThanOrEqual(truncate_no_stats, Literal::Long(30)), + Expressions::Equal(truncate_no_stats, Literal::Long(70)), + Expressions::GreaterThan(truncate_no_stats, Literal::Long(78)), + Expressions::GreaterThanOrEqual(truncate_no_stats, Literal::Long(90)), + Expressions::NotEqual(truncate_no_stats, Literal::Long(101)), + Expressions::IsNull(truncate_no_stats), + Expressions::NotNull(truncate_no_stats), + }; + } + + std::shared_ptr schema_; + std::shared_ptr data_file_; +}; + +TEST_F(InclusiveMetricsEvaluatorWithTransformTest, AllNullsWithNonOrderPreserving) { + auto bucket_all_nulls = ToBoundTransform(Expressions::Bucket("all_nulls", 100)); + ExpectShouldRead(Expressions::IsNull(bucket_all_nulls), kRowsMightMatch); + ExpectShouldRead(Expressions::NotNull(bucket_all_nulls), kRowCannotMatch); + ExpectShouldRead(Expressions::LessThan(bucket_all_nulls, Literal::Int(30)), + kRowCannotMatch); + ExpectShouldRead(Expressions::LessThanOrEqual(bucket_all_nulls, Literal::Int(30)), + kRowCannotMatch); + ExpectShouldRead(Expressions::GreaterThan(bucket_all_nulls, Literal::Int(30)), + kRowCannotMatch); + ExpectShouldRead(Expressions::GreaterThanOrEqual(bucket_all_nulls, Literal::Int(30)), + kRowCannotMatch); + ExpectShouldRead(Expressions::Equal(bucket_all_nulls, Literal::Int(30)), + kRowCannotMatch); + ExpectShouldRead(Expressions::NotEqual(bucket_all_nulls, Literal::Int(30)), + kRowsMightMatch); + ExpectShouldRead(Expressions::In(bucket_all_nulls, {Literal::Int(1), Literal::Int(2)}), + kRowCannotMatch); + ExpectShouldRead( + Expressions::NotIn(bucket_all_nulls, {Literal::Int(1), Literal::Int(2)}), + kRowsMightMatch); +} + +TEST_F(InclusiveMetricsEvaluatorWithTransformTest, RequiredWithNonOrderPreserving) { + auto bucket_ts = ToBoundTransform(Expressions::Bucket("ts", 100)); + ExpectShouldRead(Expressions::IsNull(bucket_ts), kRowsMightMatch); + ExpectShouldRead(Expressions::NotNull(bucket_ts), kRowsMightMatch); + ExpectShouldRead(Expressions::LessThan(bucket_ts, Literal::Int(30)), kRowsMightMatch); + ExpectShouldRead(Expressions::LessThanOrEqual(bucket_ts, Literal::Int(30)), + kRowsMightMatch); + ExpectShouldRead(Expressions::GreaterThan(bucket_ts, Literal::Int(30)), + kRowsMightMatch); + ExpectShouldRead(Expressions::GreaterThanOrEqual(bucket_ts, Literal::Int(30)), + kRowsMightMatch); + ExpectShouldRead(Expressions::Equal(bucket_ts, Literal::Int(30)), kRowsMightMatch); + ExpectShouldRead(Expressions::NotEqual(bucket_ts, Literal::Int(30)), kRowsMightMatch); + ExpectShouldRead(Expressions::In(bucket_ts, {Literal::Int(1), Literal::Int(2)}), + kRowsMightMatch); + ExpectShouldRead(Expressions::NotIn(bucket_ts, {Literal::Int(1), Literal::Int(2)}), + kRowsMightMatch); +} + +TEST_F(InclusiveMetricsEvaluatorWithTransformTest, AllNulls) { + auto truncate_all_nulls = ToBoundTransform(Expressions::Truncate("all_nulls", 10)); + ExpectShouldRead(Expressions::IsNull(truncate_all_nulls), kRowsMightMatch); + ExpectShouldRead(Expressions::NotNull(truncate_all_nulls), kRowCannotMatch); + ExpectShouldRead(Expressions::LessThan(truncate_all_nulls, Literal::Long(30)), + kRowCannotMatch); + ExpectShouldRead(Expressions::LessThanOrEqual(truncate_all_nulls, Literal::Long(30)), + kRowCannotMatch); + ExpectShouldRead(Expressions::GreaterThan(truncate_all_nulls, Literal::Long(30)), + kRowCannotMatch); + ExpectShouldRead(Expressions::GreaterThanOrEqual(truncate_all_nulls, Literal::Long(30)), + kRowCannotMatch); + ExpectShouldRead(Expressions::Equal(truncate_all_nulls, Literal::Long(30)), + kRowCannotMatch); + ExpectShouldRead(Expressions::NotEqual(truncate_all_nulls, Literal::Long(30)), + kRowsMightMatch); + ExpectShouldRead( + Expressions::In(truncate_all_nulls, {Literal::Long(10), Literal::Long(20)}), + kRowCannotMatch); + ExpectShouldRead( + Expressions::NotIn(truncate_all_nulls, {Literal::Long(10), Literal::Long(20)}), + kRowsMightMatch); + + auto truncate_all_nulls_str = + ToBoundTransform(Expressions::Truncate("all_nulls_str", 10)); + ExpectShouldRead(Expressions::StartsWith(truncate_all_nulls_str, "a"), kRowsMightMatch); + ExpectShouldRead(Expressions::NotStartsWith(truncate_all_nulls_str, "a"), + kRowsMightMatch); +} + +TEST_F(InclusiveMetricsEvaluatorWithTransformTest, MissingColumn) { + auto expr = Expressions::LessThan( + ToBoundTransform(Expressions::Truncate("missing", 10)), Literal::Long(20)); + auto result = InclusiveMetricsEvaluator::Make(expr, *schema_, true); + ASSERT_FALSE(result.has_value()) << result.error().message; + ASSERT_TRUE(result.error().message.contains("Cannot find field 'missing'")) + << result.error().message; +} + +TEST_F(InclusiveMetricsEvaluatorWithTransformTest, MissingStats) { + for (const auto& expr : MissingStatsExpressions()) { + ExpectShouldRead(expr, kRowsMightMatch); + } +} + +TEST_F(InclusiveMetricsEvaluatorWithTransformTest, ZeroRecordFile) { + auto zero_record_file = std::make_shared(); + zero_record_file->file_path = "file.parquet"; + zero_record_file->file_format = FileFormatType::kParquet; + zero_record_file->record_count = 0; + + for (const auto& expr : MissingStatsExpressions()) { + ExpectShouldRead(expr, kRowCannotMatch, zero_record_file); + } +} + +TEST_F(InclusiveMetricsEvaluatorWithTransformTest, Not) { + auto day_ts = ToBoundTransform(Expressions::Day("ts")); + ExpectShouldRead( + Expressions::Not(Expressions::LessThan(day_ts, Literal::Long(kIntMinValue - 25))), + kRowsMightMatch); + ExpectShouldRead(Expressions::Not(Expressions::GreaterThan( + day_ts, Literal::Long(kIntMinValue - 25))), + kRowCannotMatch); +} + +TEST_F(InclusiveMetricsEvaluatorWithTransformTest, And) { + auto day_ts = ToBoundTransform(Expressions::Day("ts")); + ExpectShouldRead( + Expressions::And( + Expressions::LessThan(day_ts, Literal::Long(kIntMinValue - 25)), + Expressions::GreaterThanOrEqual(day_ts, Literal::Long(kIntMinValue - 30))), + kRowCannotMatch); + ExpectShouldRead( + Expressions::And( + Expressions::LessThan(day_ts, Literal::Long(kIntMinValue - 25)), + Expressions::GreaterThanOrEqual(day_ts, Literal::Long(kIntMaxValue + 1))), + kRowCannotMatch); + ExpectShouldRead( + Expressions::And(Expressions::GreaterThan(day_ts, Literal::Long(kIntMinValue - 25)), + Expressions::LessThanOrEqual(day_ts, Literal::Long(kIntMinValue))), + kRowsMightMatch); +} + +TEST_F(InclusiveMetricsEvaluatorWithTransformTest, Or) { + auto day_ts = ToBoundTransform(Expressions::Day("ts")); + ExpectShouldRead( + Expressions::Or( + Expressions::LessThan(day_ts, Literal::Long(kIntMinValue - 25)), + Expressions::GreaterThanOrEqual(day_ts, Literal::Long(kIntMaxValue + 1))), + kRowCannotMatch); + ExpectShouldRead( + Expressions::Or( + Expressions::LessThan(day_ts, Literal::Long(kIntMinValue - 25)), + Expressions::GreaterThanOrEqual(day_ts, Literal::Long(kIntMaxValue - 19))), + kRowsMightMatch); +} + +TEST_F(InclusiveMetricsEvaluatorWithTransformTest, IntegerLt) { + auto day_ts = ToBoundTransform(Expressions::Day("ts")); + ExpectShouldRead(Expressions::LessThan(day_ts, Literal::Long(kIntMinValue - 25)), + kRowCannotMatch); + ExpectShouldRead(Expressions::LessThan(day_ts, Literal::Long(kIntMinValue)), + kRowCannotMatch); + ExpectShouldRead(Expressions::LessThan(day_ts, Literal::Long(kIntMinValue + 1)), + kRowsMightMatch); + ExpectShouldRead(Expressions::LessThan(day_ts, Literal::Long(kIntMaxValue)), + kRowsMightMatch); +} + +TEST_F(InclusiveMetricsEvaluatorWithTransformTest, IntegerLtEq) { + auto day_ts = ToBoundTransform(Expressions::Day("ts")); + ExpectShouldRead(Expressions::LessThanOrEqual(day_ts, Literal::Long(kIntMinValue - 25)), + kRowCannotMatch); + ExpectShouldRead(Expressions::LessThanOrEqual(day_ts, Literal::Long(kIntMinValue - 1)), + kRowCannotMatch); + ExpectShouldRead(Expressions::LessThanOrEqual(day_ts, Literal::Long(kIntMinValue)), + kRowsMightMatch); + ExpectShouldRead(Expressions::LessThanOrEqual(day_ts, Literal::Long(kIntMaxValue)), + kRowsMightMatch); +} + +TEST_F(InclusiveMetricsEvaluatorWithTransformTest, IntegerGt) { + auto day_ts = ToBoundTransform(Expressions::Day("ts")); + ExpectShouldRead(Expressions::GreaterThan(day_ts, Literal::Int(kIntMaxValue + 6)), + kRowCannotMatch); + ExpectShouldRead(Expressions::GreaterThan(day_ts, Literal::Date(kIntMaxValue)), + kRowCannotMatch); + ExpectShouldRead(Expressions::GreaterThan(day_ts, Literal::Date(kIntMaxValue - 1)), + kRowsMightMatch); + ExpectShouldRead(Expressions::GreaterThan(day_ts, Literal::Date(kIntMaxValue - 4)), + kRowsMightMatch); +} + +TEST_F(InclusiveMetricsEvaluatorWithTransformTest, IntegerGtEq) { + auto day_ts = ToBoundTransform(Expressions::Day("ts")); + ExpectShouldRead( + Expressions::GreaterThanOrEqual(day_ts, Literal::Long(kIntMaxValue + 6)), + kRowCannotMatch); + ExpectShouldRead( + Expressions::GreaterThanOrEqual(day_ts, Literal::Long(kIntMaxValue + 1)), + kRowCannotMatch); + ExpectShouldRead(Expressions::GreaterThanOrEqual(day_ts, Literal::Long(kIntMaxValue)), + kRowsMightMatch); + ExpectShouldRead( + Expressions::GreaterThanOrEqual(day_ts, Literal::Long(kIntMaxValue - 4)), + kRowsMightMatch); +} + +TEST_F(InclusiveMetricsEvaluatorWithTransformTest, IntegerEq) { + auto day_ts = ToBoundTransform(Expressions::Day("ts")); + ExpectShouldRead(Expressions::Equal(day_ts, Literal::Long(kIntMinValue - 25)), + kRowCannotMatch); + ExpectShouldRead(Expressions::Equal(day_ts, Literal::Long(kIntMinValue - 1)), + kRowCannotMatch); + ExpectShouldRead(Expressions::Equal(day_ts, Literal::Long(kIntMinValue)), + kRowsMightMatch); + ExpectShouldRead(Expressions::Equal(day_ts, Literal::Long(kIntMaxValue - 4)), + kRowsMightMatch); + ExpectShouldRead(Expressions::Equal(day_ts, Literal::Long(kIntMaxValue)), + kRowsMightMatch); + ExpectShouldRead(Expressions::Equal(day_ts, Literal::Long(kIntMaxValue + 1)), + kRowCannotMatch); + ExpectShouldRead(Expressions::Equal(day_ts, Literal::Long(kIntMaxValue + 6)), + kRowCannotMatch); +} + +TEST_F(InclusiveMetricsEvaluatorWithTransformTest, IntegerNotEq) { + auto day_ts = ToBoundTransform(Expressions::Day("ts")); + ExpectShouldRead(Expressions::NotEqual(day_ts, Literal::Long(kIntMinValue - 25)), + kRowsMightMatch); + ExpectShouldRead(Expressions::NotEqual(day_ts, Literal::Long(kIntMinValue - 1)), + kRowsMightMatch); + ExpectShouldRead(Expressions::NotEqual(day_ts, Literal::Long(kIntMinValue)), + kRowsMightMatch); + ExpectShouldRead(Expressions::NotEqual(day_ts, Literal::Long(kIntMaxValue - 4)), + kRowsMightMatch); + ExpectShouldRead(Expressions::NotEqual(day_ts, Literal::Long(kIntMaxValue)), + kRowsMightMatch); + ExpectShouldRead(Expressions::NotEqual(day_ts, Literal::Long(kIntMaxValue + 1)), + kRowsMightMatch); + ExpectShouldRead(Expressions::NotEqual(day_ts, Literal::Long(kIntMaxValue + 6)), + kRowsMightMatch); +} + +TEST_F(InclusiveMetricsEvaluatorWithTransformTest, IntegerNotEqRewritten) { + auto day_ts = ToBoundTransform(Expressions::Day("ts")); + ExpectShouldRead( + Expressions::Not(Expressions::Equal(day_ts, Literal::Long(kIntMinValue - 25))), + kRowsMightMatch); + ExpectShouldRead( + Expressions::Not(Expressions::Equal(day_ts, Literal::Long(kIntMinValue - 1))), + kRowsMightMatch); + ExpectShouldRead( + Expressions::Not(Expressions::Equal(day_ts, Literal::Long(kIntMinValue))), + kRowsMightMatch); + ExpectShouldRead( + Expressions::Not(Expressions::Equal(day_ts, Literal::Long(kIntMaxValue - 4))), + kRowsMightMatch); + ExpectShouldRead( + Expressions::Not(Expressions::Equal(day_ts, Literal::Long(kIntMaxValue))), + kRowsMightMatch); + ExpectShouldRead( + Expressions::Not(Expressions::Equal(day_ts, Literal::Long(kIntMaxValue + 1))), + kRowsMightMatch); + ExpectShouldRead( + Expressions::Not(Expressions::Equal(day_ts, Literal::Long(kIntMaxValue + 6))), + kRowsMightMatch); +} + +TEST_F(InclusiveMetricsEvaluatorWithTransformTest, CaseInsensitiveIntegerNotEqRewritten) { + auto day_ts = ToBoundTransform(Expressions::Day("ts")); + ExpectShouldRead( + Expressions::Not(Expressions::Equal(day_ts, Literal::Long(kIntMinValue - 25))), + kRowsMightMatch, nullptr, false); + ExpectShouldRead( + Expressions::Not(Expressions::Equal(day_ts, Literal::Long(kIntMinValue - 1))), + kRowsMightMatch, nullptr, false); + ExpectShouldRead( + Expressions::Not(Expressions::Equal(day_ts, Literal::Long(kIntMinValue))), + kRowsMightMatch, nullptr, false); + ExpectShouldRead( + Expressions::Not(Expressions::Equal(day_ts, Literal::Long(kIntMaxValue - 4))), + kRowsMightMatch, nullptr, false); + ExpectShouldRead( + Expressions::Not(Expressions::Equal(day_ts, Literal::Long(kIntMaxValue))), + kRowsMightMatch, nullptr, false); + ExpectShouldRead( + Expressions::Not(Expressions::Equal(day_ts, Literal::Long(kIntMaxValue + 1))), + kRowsMightMatch, nullptr, false); + ExpectShouldRead( + Expressions::Not(Expressions::Equal(day_ts, Literal::Long(kIntMaxValue + 6))), + kRowsMightMatch, nullptr, false); +} + +TEST_F(InclusiveMetricsEvaluatorWithTransformTest, CaseSensitiveIntegerNotEqRewritten) { + auto day_ts = ToBoundTransform(Expressions::Day("TS")); + auto expr = Expressions::Not(Expressions::Equal(day_ts, Literal::Long(5))); + auto result = InclusiveMetricsEvaluator::Make(expr, *schema_, true); + ASSERT_FALSE(result.has_value()) << result.error().message; + ASSERT_TRUE(result.error().message.contains("Cannot find field 'TS'")) + << result.error().message; +} + +TEST_F(InclusiveMetricsEvaluatorWithTransformTest, StringStartsWith) { + auto truncate_str = ToBoundTransform(Expressions::Truncate("str", 10)); + ExpectShouldRead(Expressions::StartsWith(truncate_str, "a"), kRowsMightMatch); + ExpectShouldRead(Expressions::StartsWith(truncate_str, "ab"), kRowsMightMatch); + ExpectShouldRead(Expressions::StartsWith(truncate_str, "b"), kRowsMightMatch); +} + +TEST_F(InclusiveMetricsEvaluatorWithTransformTest, StringNotStartsWith) { + auto truncate_str = ToBoundTransform(Expressions::Truncate("str", 10)); + ExpectShouldRead(Expressions::StartsWith(truncate_str, "a"), kRowsMightMatch); + ExpectShouldRead(Expressions::StartsWith(truncate_str, "ab"), kRowsMightMatch); + ExpectShouldRead(Expressions::StartsWith(truncate_str, "b"), kRowsMightMatch); +} + +TEST_F(InclusiveMetricsEvaluatorWithTransformTest, IntegerIn) { + auto day_ts = ToBoundTransform(Expressions::Day("ts")); + ExpectShouldRead(Expressions::In(day_ts, {Literal::Long(kIntMinValue - 25), + Literal::Long(kIntMinValue - 24)}), + kRowCannotMatch); + ExpectShouldRead(Expressions::In(day_ts, {Literal::Long(kIntMinValue - 2), + Literal::Long(kIntMinValue - 1)}), + kRowCannotMatch); + ExpectShouldRead(Expressions::In(day_ts, {Literal::Long(kIntMinValue - 1), + Literal::Long(kIntMinValue)}), + kRowsMightMatch); + ExpectShouldRead(Expressions::In(day_ts, {Literal::Long(kIntMaxValue - 4), + Literal::Long(kIntMaxValue - 3)}), + kRowsMightMatch); + ExpectShouldRead(Expressions::In(day_ts, {Literal::Long(kIntMaxValue), + Literal::Long(kIntMaxValue + 1)}), + kRowsMightMatch); + ExpectShouldRead(Expressions::In(day_ts, {Literal::Long(kIntMaxValue + 1), + Literal::Long(kIntMaxValue + 2)}), + kRowCannotMatch); + ExpectShouldRead(Expressions::In(day_ts, {Literal::Long(kIntMaxValue + 6), + Literal::Long(kIntMaxValue + 7)}), + kRowCannotMatch); + + std::vector ids; + ids.reserve(401); + for (int i = -400; i <= 0; ++i) { + ids.emplace_back(Literal::Long(i)); + } + ExpectShouldRead(Expressions::In(day_ts, ids), kRowsMightMatch); +} + +TEST_F(InclusiveMetricsEvaluatorWithTransformTest, IntegerNotIn) { + auto day_ts = ToBoundTransform(Expressions::Day("ts")); + ExpectShouldRead(Expressions::NotIn(day_ts, {Literal::Long(kIntMinValue - 25), + Literal::Long(kIntMinValue - 24)}), + kRowsMightMatch); + ExpectShouldRead(Expressions::NotIn(day_ts, {Literal::Long(kIntMinValue - 2), + Literal::Long(kIntMinValue - 1)}), + kRowsMightMatch); + ExpectShouldRead(Expressions::NotIn(day_ts, {Literal::Long(kIntMinValue - 1), + Literal::Long(kIntMinValue)}), + kRowsMightMatch); + ExpectShouldRead(Expressions::NotIn(day_ts, {Literal::Long(kIntMaxValue - 4), + Literal::Long(kIntMaxValue - 3)}), + kRowsMightMatch); + ExpectShouldRead(Expressions::NotIn(day_ts, {Literal::Long(kIntMaxValue), + Literal::Long(kIntMaxValue + 1)}), + kRowsMightMatch); + ExpectShouldRead(Expressions::NotIn(day_ts, {Literal::Long(kIntMaxValue + 1), + Literal::Long(kIntMaxValue + 2)}), + kRowsMightMatch); + ExpectShouldRead(Expressions::NotIn(day_ts, {Literal::Long(kIntMaxValue + 6), + Literal::Long(kIntMaxValue + 7)}), + kRowsMightMatch); + + std::vector ids; + ids.reserve(401); + for (int i = -400; i <= 0; ++i) { + ids.emplace_back(Literal::Long(i)); + } + ExpectShouldRead(Expressions::NotIn(day_ts, ids), kRowsMightMatch); +} + +} // namespace iceberg diff --git a/src/iceberg/test/meson.build b/src/iceberg/test/meson.build index 331273dac..b8fd6cef9 100644 --- a/src/iceberg/test/meson.build +++ b/src/iceberg/test/meson.build @@ -61,6 +61,7 @@ iceberg_tests = { 'expression_test.cc', 'expression_visitor_test.cc', 'inclusive_metrics_evaluator_test.cc', + 'inclusive_metrics_evaluator_with_transform_test.cc', 'literal_test.cc', 'predicate_test.cc', ), From 89ad9e93100d17029f89f9d17fddc07014830c05 Mon Sep 17 00:00:00 2001 From: "xiao.dong" Date: Tue, 2 Dec 2025 18:53:35 +0800 Subject: [PATCH 14/15] fix literal comparable --- src/iceberg/expression/literal.cc | 21 ++++++++++++++++--- src/iceberg/expression/literal.h | 2 ++ ...e_metrics_evaluator_with_transform_test.cc | 2 +- 3 files changed, 21 insertions(+), 4 deletions(-) diff --git a/src/iceberg/expression/literal.cc b/src/iceberg/expression/literal.cc index f2955b515..dcaa9eb8b 100644 --- a/src/iceberg/expression/literal.cc +++ b/src/iceberg/expression/literal.cc @@ -343,13 +343,28 @@ std::strong_ordering CompareFloat(T lhs, T rhs) { return lhs_is_negative <=> rhs_is_negative; } +bool Literal::Comparable(TypeId type_id, TypeId other_type_id) { + switch (type_id) { + case TypeId::kInt: + case TypeId::kDate: + return other_type_id == TypeId::kInt || other_type_id == TypeId::kDate; + case TypeId::kLong: + case TypeId::kTimestamp: + case TypeId::kTimestampTz: + return other_type_id == TypeId::kLong || other_type_id == TypeId::kTimestamp || + other_type_id == TypeId::kTimestampTz; + default: + return type_id == other_type_id; + } +} + bool Literal::operator==(const Literal& other) const { return (*this <=> other) == 0; } // Three-way comparison operator std::partial_ordering Literal::operator<=>(const Literal& other) const { - // If types are different and value type not the same, comparison is unordered - if (value_.index() != other.value_.index() && - type_->type_id() != other.type_->type_id()) { + // If types are different, comparison is unordered + // (Int & Date) (Timestamp & Long) were excluded from this check to allow comparison + if (!Comparable(type_->type_id(), other.type_->type_id())) { return std::partial_ordering::unordered; } diff --git a/src/iceberg/expression/literal.h b/src/iceberg/expression/literal.h index b07aaa5e0..3cb1f1b23 100644 --- a/src/iceberg/expression/literal.h +++ b/src/iceberg/expression/literal.h @@ -128,6 +128,8 @@ class ICEBERG_EXPORT Literal : public util::Formattable { /// was not valid Result CastTo(const std::shared_ptr& target_type) const; + static bool Comparable(TypeId type_id, TypeId other_type_id); + bool operator==(const Literal& other) const; /// \brief Compare two literals of the same primitive type. diff --git a/src/iceberg/test/inclusive_metrics_evaluator_with_transform_test.cc b/src/iceberg/test/inclusive_metrics_evaluator_with_transform_test.cc index 216bfe363..3ee877469 100644 --- a/src/iceberg/test/inclusive_metrics_evaluator_with_transform_test.cc +++ b/src/iceberg/test/inclusive_metrics_evaluator_with_transform_test.cc @@ -371,7 +371,7 @@ TEST_F(InclusiveMetricsEvaluatorWithTransformTest, IntegerNotEqRewritten) { } TEST_F(InclusiveMetricsEvaluatorWithTransformTest, CaseInsensitiveIntegerNotEqRewritten) { - auto day_ts = ToBoundTransform(Expressions::Day("ts")); + auto day_ts = ToBoundTransform(Expressions::Day("TS")); ExpectShouldRead( Expressions::Not(Expressions::Equal(day_ts, Literal::Long(kIntMinValue - 25))), kRowsMightMatch, nullptr, false); From bf9822fc5ee1a2320d86a632d111de26b50a0903 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Tue, 2 Dec 2025 21:53:20 +0800 Subject: [PATCH 15/15] polish --- .../expression/inclusive_metrics_evaluator.cc | 148 ++++++++---------- .../expression/inclusive_metrics_evaluator.h | 3 +- src/iceberg/expression/literal.cc | 16 +- src/iceberg/expression/literal.h | 2 - .../test/inclusive_metrics_evaluator_test.cc | 26 +-- ...e_metrics_evaluator_with_transform_test.cc | 2 +- 6 files changed, 87 insertions(+), 110 deletions(-) diff --git a/src/iceberg/expression/inclusive_metrics_evaluator.cc b/src/iceberg/expression/inclusive_metrics_evaluator.cc index 4c20e7c81..29f5aba24 100644 --- a/src/iceberg/expression/inclusive_metrics_evaluator.cc +++ b/src/iceberg/expression/inclusive_metrics_evaluator.cc @@ -86,7 +86,7 @@ class InclusiveMetricsVisitor : public BoundVisitor { if (ContainsNullsOnly(id)) { return kRowCannotMatch; } - if (std::dynamic_pointer_cast(expr) == nullptr) { + if (dynamic_cast(expr.get()) == nullptr) { return kRowsMightMatch; } auto it = data_file_.nan_value_counts.find(id); @@ -97,13 +97,12 @@ class InclusiveMetricsVisitor : public BoundVisitor { } Result NotNaN(const std::shared_ptr& expr) override { - if (std::dynamic_pointer_cast(expr) == nullptr) { + if (dynamic_cast(expr.get()) == nullptr) { // identity transforms are already removed by this time return kRowsMightMatch; } int32_t id = expr->reference()->field().field_id(); - if (ContainsNaNsOnly(id)) { return kRowCannotMatch; } @@ -117,20 +116,18 @@ class InclusiveMetricsVisitor : public BoundVisitor { if (ContainsNullsOnly(id) || ContainsNaNsOnly(id)) { return kRowCannotMatch; } - ICEBERG_ASSIGN_OR_RAISE(auto lower_result, LowerBound(expr)); - if (!lower_result.has_value() || lower_result.value().IsNull() || - lower_result.value().IsNaN()) { + ICEBERG_ASSIGN_OR_RAISE(auto lower, LowerBound(expr)); + if (!lower.has_value() || lower->IsNull() || lower->IsNaN()) { // NaN indicates unreliable bounds. See the InclusiveMetricsEvaluator docs for more. return kRowsMightMatch; } - const auto& lower = lower_result.value(); // this also works for transforms that are order preserving: // if a transform f is order preserving, a < b means that f(a) <= f(b). // because lower <= a for all values of a in the file, f(lower) <= f(a). // when f(lower) >= X then f(a) >= f(lower) >= X, so there is no a such that f(a) < X // f(lower) >= X means rows cannot match - if (lower >= lit) { + if (lower.value() >= lit) { return kRowCannotMatch; } @@ -144,20 +141,18 @@ class InclusiveMetricsVisitor : public BoundVisitor { return kRowCannotMatch; } - ICEBERG_ASSIGN_OR_RAISE(auto lower_result, LowerBound(expr)); - if (!lower_result.has_value() || lower_result.value().IsNull() || - lower_result.value().IsNaN()) { + ICEBERG_ASSIGN_OR_RAISE(auto lower, LowerBound(expr)); + if (!lower.has_value() || lower->IsNull() || lower->IsNaN()) { // NaN indicates unreliable bounds. See the InclusiveMetricsEvaluator docs for more. return kRowsMightMatch; } - const auto& lower = lower_result.value(); // this also works for transforms that are order preserving: // if a transform f is order preserving, a < b means that f(a) <= f(b). // because lower <= a for all values of a in the file, f(lower) <= f(a). // when f(lower) > X then f(a) >= f(lower) > X, so there is no a such that f(a) <= X // f(lower) > X means rows cannot match - if (lower > lit) { + if (lower.value() > lit) { return kRowCannotMatch; } @@ -171,12 +166,11 @@ class InclusiveMetricsVisitor : public BoundVisitor { return kRowCannotMatch; } - ICEBERG_ASSIGN_OR_RAISE(auto upper_result, UpperBound(expr)); - if (!upper_result.has_value() || upper_result.value().IsNull()) { + ICEBERG_ASSIGN_OR_RAISE(auto upper, UpperBound(expr)); + if (!upper.has_value() || upper->IsNull()) { return kRowsMightMatch; } - const auto& upper = upper_result.value(); - if (upper <= lit) { + if (upper.value() <= lit) { return kRowCannotMatch; } @@ -190,12 +184,11 @@ class InclusiveMetricsVisitor : public BoundVisitor { return kRowCannotMatch; } - ICEBERG_ASSIGN_OR_RAISE(auto upper_result, UpperBound(expr)); - if (!upper_result.has_value() || upper_result.value().IsNull()) { + ICEBERG_ASSIGN_OR_RAISE(auto upper, UpperBound(expr)); + if (!upper.has_value() || upper->IsNull()) { return kRowsMightMatch; } - const auto& upper = upper_result.value(); - if (upper < lit) { + if (upper.value() < lit) { return kRowCannotMatch; } @@ -209,21 +202,18 @@ class InclusiveMetricsVisitor : public BoundVisitor { return kRowCannotMatch; } - ICEBERG_ASSIGN_OR_RAISE(auto lower_result, LowerBound(expr)); - if (lower_result.has_value() && !lower_result.value().IsNull() && - !lower_result.value().IsNaN()) { - const auto& lower = lower_result.value(); - if (!lower.IsNaN() && lower > lit) { + ICEBERG_ASSIGN_OR_RAISE(auto lower, LowerBound(expr)); + if (lower.has_value() && !lower->IsNull() && !lower->IsNaN()) { + if (lower.value() > lit) { return kRowCannotMatch; } } - ICEBERG_ASSIGN_OR_RAISE(auto upper_result, UpperBound(expr)); - if (!upper_result.has_value() || upper_result.value().IsNull()) { + ICEBERG_ASSIGN_OR_RAISE(auto upper, UpperBound(expr)); + if (!upper.has_value() || upper->IsNull()) { return kRowsMightMatch; } - const auto& upper = upper_result.value(); - if (upper < lit) { + if (upper.value() < lit) { return kRowCannotMatch; } @@ -249,28 +239,25 @@ class InclusiveMetricsVisitor : public BoundVisitor { return kRowsMightMatch; } - ICEBERG_ASSIGN_OR_RAISE(auto lower_result, LowerBound(expr)); - if (!lower_result.has_value() || lower_result.value().IsNull() || - lower_result.value().IsNaN()) { + ICEBERG_ASSIGN_OR_RAISE(auto lower, LowerBound(expr)); + if (!lower.has_value() || lower->IsNull() || lower->IsNaN()) { // NaN indicates unreliable bounds. See the InclusiveMetricsEvaluator docs for more. return kRowsMightMatch; } - const auto& lower = lower_result.value(); auto literals_view = literal_set | std::views::filter([&](const Literal& lit) { - return lower <= lit; + return lower.value() <= lit; }); // if all values are less than lower bound, rows cannot match if (literals_view.empty()) { return kRowCannotMatch; } - ICEBERG_ASSIGN_OR_RAISE(auto upper_result, UpperBound(expr)); - if (!upper_result.has_value() || upper_result.value().IsNull()) { + ICEBERG_ASSIGN_OR_RAISE(auto upper, UpperBound(expr)); + if (!upper.has_value() || upper->IsNull()) { return kRowsMightMatch; } - const auto& upper = upper_result.value(); auto filtered_view = literals_view | std::views::filter([&](const Literal& lit) { - return upper >= lit; + return upper.value() >= lit; }); // if remaining values are greater than upper bound, rows cannot match if (filtered_view.empty()) { @@ -290,8 +277,8 @@ class InclusiveMetricsVisitor : public BoundVisitor { Result StartsWith(const std::shared_ptr& expr, const Literal& lit) override { - auto transform = std::dynamic_pointer_cast(expr); - if (transform != nullptr && + if (auto transform = dynamic_cast(expr.get()); + transform != nullptr && transform->transform()->transform_type() != TransformType::kIdentity) { // truncate must be rewritten in binding. the result is either always or never // compatible @@ -305,28 +292,26 @@ class InclusiveMetricsVisitor : public BoundVisitor { if (lit.type()->type_id() != TypeId::kString) { return kRowCannotMatch; } - const auto& prefix = get(lit.value()); + const auto& prefix = std::get(lit.value()); - ICEBERG_ASSIGN_OR_RAISE(auto lower_result, LowerBound(expr)); - if (!lower_result.has_value() || lower_result.value().IsNull()) { + ICEBERG_ASSIGN_OR_RAISE(auto lower, LowerBound(expr)); + if (!lower.has_value() || lower->IsNull()) { return kRowsMightMatch; } - const auto& lower = lower_result.value(); - const auto& lower_str = get(lower.value()); + const auto& lower_str = std::get(lower->value()); // truncate lower bound so that its length in bytes is not greater than the length of // prefix - int length = std::min(prefix.size(), lower_str.size()); + size_t length = std::min(prefix.size(), lower_str.size()); // if prefix of lower bound is greater than prefix, rows cannot match if (lower_str.substr(0, length) > prefix) { return kRowCannotMatch; } - ICEBERG_ASSIGN_OR_RAISE(auto upper_result, UpperBound(expr)); - if (!upper_result.has_value() || upper_result.value().IsNull()) { + ICEBERG_ASSIGN_OR_RAISE(auto upper, UpperBound(expr)); + if (!upper.has_value() || upper->IsNull()) { return kRowsMightMatch; } - const auto& upper = upper_result.value(); - const auto& upper_str = get(upper.value()); + const auto& upper_str = std::get(upper->value()); // truncate upper bound so that its length in bytes is not greater than the length of // prefix length = std::min(prefix.size(), upper_str.size()); @@ -350,20 +335,17 @@ class InclusiveMetricsVisitor : public BoundVisitor { if (lit.type()->type_id() != TypeId::kString) { return kRowCannotMatch; } - const auto& prefix = get(lit.value()); + const auto& prefix = std::get(lit.value()); // notStartsWith will match unless all values must start with the prefix. This happens // when the lower and upper bounds both start with the prefix. - ICEBERG_ASSIGN_OR_RAISE(auto lower_result, LowerBound(expr)); - ICEBERG_ASSIGN_OR_RAISE(auto upper_result, UpperBound(expr)); - if (!lower_result.has_value() || lower_result.value().IsNull() || - !upper_result.has_value() || upper_result.value().IsNull()) { + ICEBERG_ASSIGN_OR_RAISE(auto lower, LowerBound(expr)); + ICEBERG_ASSIGN_OR_RAISE(auto upper, UpperBound(expr)); + if (!lower.has_value() || lower->IsNull() || !upper.has_value() || upper->IsNull()) { return kRowsMightMatch; } - const auto& lower = lower_result.value(); - const auto& upper = upper_result.value(); - const auto& lower_str = get(lower.value()); - const auto& upper_str = get(upper.value()); + const auto& lower_str = std::get(lower->value()); + const auto& upper_str = std::get(upper->value()); // if lower is shorter than the prefix then lower doesn't start with the prefix if (lower_str.size() < prefix.size()) { @@ -395,24 +377,24 @@ class InclusiveMetricsVisitor : public BoundVisitor { bool ContainsNullsOnly(int32_t id) { auto val_it = data_file_.value_counts.find(id); auto null_it = data_file_.null_value_counts.find(id); - return val_it != data_file_.value_counts.end() && - null_it != data_file_.null_value_counts.end() && - val_it->second - null_it->second == 0; + return val_it != data_file_.value_counts.cend() && + null_it != data_file_.null_value_counts.cend() && + val_it->second == null_it->second; } bool ContainsNaNsOnly(int32_t id) { auto val_it = data_file_.value_counts.find(id); auto nan_it = data_file_.nan_value_counts.find(id); - return val_it != data_file_.value_counts.end() && - nan_it != data_file_.nan_value_counts.end() && + return val_it != data_file_.value_counts.cend() && + nan_it != data_file_.nan_value_counts.cend() && val_it->second == nan_it->second; } Result> LowerBound(const std::shared_ptr& expr) { - if (auto reference = std::dynamic_pointer_cast(expr); + if (auto reference = dynamic_cast(expr.get()); reference != nullptr) { return ParseLowerBound(*reference); - } else if (auto transform = std::dynamic_pointer_cast(expr); + } else if (auto transform = dynamic_cast(expr.get()); transform != nullptr) { return TransformLowerBound(*transform); } else { @@ -422,10 +404,10 @@ class InclusiveMetricsVisitor : public BoundVisitor { } Result> UpperBound(const std::shared_ptr& expr) { - if (auto reference = std::dynamic_pointer_cast(expr); + if (auto reference = dynamic_cast(expr.get()); reference != nullptr) { return ParseUpperBound(*reference); - } else if (auto transform = std::dynamic_pointer_cast(expr); + } else if (auto transform = dynamic_cast(expr.get()); transform != nullptr) { return TransformUpperBound(*transform); } else { @@ -441,11 +423,8 @@ class InclusiveMetricsVisitor : public BoundVisitor { return NotSupported("Lower bound of non-primitive type is not supported."); } auto primitive_type = internal::checked_pointer_cast(type); - if (!data_file_.lower_bounds.empty() && data_file_.lower_bounds.contains(id)) { - ICEBERG_ASSIGN_OR_RAISE( - auto lower, - Literal::Deserialize(data_file_.lower_bounds.at(id), primitive_type)); - return lower; + if (data_file_.lower_bounds.contains(id)) { + return Literal::Deserialize(data_file_.lower_bounds.at(id), primitive_type); } return std::nullopt; @@ -458,11 +437,8 @@ class InclusiveMetricsVisitor : public BoundVisitor { return NotSupported("Upper bound of non-primitive type is not supported."); } auto primitive_type = internal::checked_pointer_cast(type); - if (!data_file_.upper_bounds.empty() && data_file_.upper_bounds.contains(id)) { - ICEBERG_ASSIGN_OR_RAISE( - auto upper, - Literal::Deserialize(data_file_.upper_bounds.at(id), primitive_type)); - return upper; + if (data_file_.upper_bounds.contains(id)) { + return Literal::Deserialize(data_file_.upper_bounds.at(id), primitive_type); } return std::nullopt; @@ -472,9 +448,9 @@ class InclusiveMetricsVisitor : public BoundVisitor { auto transform = boundTransform.transform(); if (transform->PreservesOrder()) { ICEBERG_ASSIGN_OR_RAISE(auto lower, ParseLowerBound(*boundTransform.reference())); - ICEBERG_ASSIGN_OR_RAISE(auto transform_func, - transform->Bind(boundTransform.reference()->type())); if (lower.has_value()) { + ICEBERG_ASSIGN_OR_RAISE(auto transform_func, + transform->Bind(boundTransform.reference()->type())); return transform_func->Transform(lower.value()); } } @@ -486,9 +462,9 @@ class InclusiveMetricsVisitor : public BoundVisitor { auto transform = boundTransform.transform(); if (transform->PreservesOrder()) { ICEBERG_ASSIGN_OR_RAISE(auto upper, ParseUpperBound(*boundTransform.reference())); - ICEBERG_ASSIGN_OR_RAISE(auto transform_func, - transform->Bind(boundTransform.reference()->type())); if (upper.has_value()) { + ICEBERG_ASSIGN_OR_RAISE(auto transform_func, + transform->Bind(boundTransform.reference()->type())); return transform_func->Transform(upper.value()); } } @@ -498,10 +474,10 @@ class InclusiveMetricsVisitor : public BoundVisitor { /** Returns true if the expression term produces a non-null value for non-null input. */ bool IsNonNullPreserving(const std::shared_ptr& expr) { - if (auto reference = std::dynamic_pointer_cast(expr); + if (auto reference = dynamic_cast(expr.get()); reference != nullptr) { return true; - } else if (auto transform = std::dynamic_pointer_cast(expr); + } else if (auto transform = dynamic_cast(expr.get()); transform != nullptr) { return transform->transform()->PreservesOrder(); } @@ -528,7 +504,7 @@ Result> InclusiveMetricsEvaluator::Ma new InclusiveMetricsEvaluator(std::move(bound_expr))); } -Result InclusiveMetricsEvaluator::Eval(const DataFile& data_file) const { +Result InclusiveMetricsEvaluator::Evaluate(const DataFile& data_file) const { if (data_file.record_count == 0) { return kRowCannotMatch; } diff --git a/src/iceberg/expression/inclusive_metrics_evaluator.h b/src/iceberg/expression/inclusive_metrics_evaluator.h index 7ffe98e70..1887b3399 100644 --- a/src/iceberg/expression/inclusive_metrics_evaluator.h +++ b/src/iceberg/expression/inclusive_metrics_evaluator.h @@ -62,12 +62,11 @@ class ICEBERG_EXPORT InclusiveMetricsEvaluator { /// /// \param data_file The data file to evaluate /// \return true if the file matches the expression, false otherwise, or error - Result Eval(const DataFile& data_file) const; + Result Evaluate(const DataFile& data_file) const; private: explicit InclusiveMetricsEvaluator(std::shared_ptr expr); - private: std::shared_ptr expr_; }; diff --git a/src/iceberg/expression/literal.cc b/src/iceberg/expression/literal.cc index dcaa9eb8b..cb0a4c6d0 100644 --- a/src/iceberg/expression/literal.cc +++ b/src/iceberg/expression/literal.cc @@ -343,21 +343,25 @@ std::strong_ordering CompareFloat(T lhs, T rhs) { return lhs_is_negative <=> rhs_is_negative; } -bool Literal::Comparable(TypeId type_id, TypeId other_type_id) { - switch (type_id) { +namespace { + +bool Comparable(TypeId lhs, TypeId rhs) { + switch (lhs) { case TypeId::kInt: case TypeId::kDate: - return other_type_id == TypeId::kInt || other_type_id == TypeId::kDate; + return rhs == TypeId::kInt || rhs == TypeId::kDate; case TypeId::kLong: case TypeId::kTimestamp: case TypeId::kTimestampTz: - return other_type_id == TypeId::kLong || other_type_id == TypeId::kTimestamp || - other_type_id == TypeId::kTimestampTz; + return rhs == TypeId::kLong || rhs == TypeId::kTimestamp || + rhs == TypeId::kTimestampTz; default: - return type_id == other_type_id; + return lhs == rhs; } } +} // namespace + bool Literal::operator==(const Literal& other) const { return (*this <=> other) == 0; } // Three-way comparison operator diff --git a/src/iceberg/expression/literal.h b/src/iceberg/expression/literal.h index 3cb1f1b23..b07aaa5e0 100644 --- a/src/iceberg/expression/literal.h +++ b/src/iceberg/expression/literal.h @@ -128,8 +128,6 @@ class ICEBERG_EXPORT Literal : public util::Formattable { /// was not valid Result CastTo(const std::shared_ptr& target_type) const; - static bool Comparable(TypeId type_id, TypeId other_type_id); - bool operator==(const Literal& other) const; /// \brief Compare two literals of the same primitive type. diff --git a/src/iceberg/test/inclusive_metrics_evaluator_test.cc b/src/iceberg/test/inclusive_metrics_evaluator_test.cc index f0652edfa..27867f1a4 100644 --- a/src/iceberg/test/inclusive_metrics_evaluator_test.cc +++ b/src/iceberg/test/inclusive_metrics_evaluator_test.cc @@ -108,7 +108,7 @@ class InclusiveMetricsEvaluatorTest : public ::testing::Test { InclusiveMetricsEvaluator::Make(unbound, *schema_, true)); auto file = PrepareDataFile("20251128", 10, 1024, {{"id", static_cast(100)}}, {{"id", static_cast(200)}}); - auto result = evaluator->Eval(*file); + auto result = evaluator->Evaluate(*file); ASSERT_TRUE(result.has_value()); ASSERT_EQ(result.value(), expected_result) << unbound->ToString(); } @@ -118,7 +118,7 @@ class InclusiveMetricsEvaluatorTest : public ::testing::Test { InclusiveMetricsEvaluator::Make(unbound, *schema_, true)); auto file = PrepareDataFile("20251128", 10, 1024, {{"name", "123"}}, {{"name", "456"}}, {{2, 10}}, {{2, 0}}); - auto result = evaluator->Eval(*file); + auto result = evaluator->Evaluate(*file); ASSERT_TRUE(result.has_value()); ASSERT_EQ(result.value(), expected_result) << unbound->ToString(); } @@ -153,7 +153,7 @@ TEST_F(InclusiveMetricsEvaluatorTest, IsNullTest) { InclusiveMetricsEvaluator::Make(unbound, *schema_, true)); auto file = PrepareDataFile("20251128", 10, 1024, {{"name", "1"}}, {{"name", "2"}}, {{2, 10}}, {{2, 5}}, {}); - auto result = evaluator->Eval(*file); + auto result = evaluator->Evaluate(*file); ASSERT_TRUE(result.has_value()); ASSERT_EQ(result.value(), kRowsMightMatch) << unbound->ToString(); } @@ -163,7 +163,7 @@ TEST_F(InclusiveMetricsEvaluatorTest, IsNullTest) { InclusiveMetricsEvaluator::Make(unbound, *schema_, true)); auto file = PrepareDataFile("20251128", 10, 1024, {{"name", "1"}}, {{"name", "2"}}, {{2, 10}}, {{2, 0}}, {}); - auto result = evaluator->Eval(*file); + auto result = evaluator->Evaluate(*file); ASSERT_TRUE(result.has_value()); ASSERT_EQ(result.value(), kRowCannotMatch) << unbound->ToString(); } @@ -176,7 +176,7 @@ TEST_F(InclusiveMetricsEvaluatorTest, NotNullTest) { InclusiveMetricsEvaluator::Make(unbound, *schema_, true)); auto file = PrepareDataFile("20251128", 10, 1024, {{"name", "1"}}, {{"name", "2"}}, {{2, 10}}, {{2, 5}}, {}); - auto result = evaluator->Eval(*file); + auto result = evaluator->Evaluate(*file); ASSERT_TRUE(result.has_value()); ASSERT_EQ(result.value(), kRowsMightMatch) << unbound->ToString(); } @@ -186,7 +186,7 @@ TEST_F(InclusiveMetricsEvaluatorTest, NotNullTest) { InclusiveMetricsEvaluator::Make(unbound, *schema_, true)); auto file = PrepareDataFile("20251128", 10, 1024, {{"name", "1"}}, {{"name", "2"}}, {{2, 10}}, {{2, 10}}, {}); - auto result = evaluator->Eval(*file); + auto result = evaluator->Evaluate(*file); ASSERT_TRUE(result.has_value()); ASSERT_EQ(result.value(), kRowCannotMatch) << unbound->ToString(); } @@ -199,7 +199,7 @@ TEST_F(InclusiveMetricsEvaluatorTest, IsNanTest) { InclusiveMetricsEvaluator::Make(unbound, *schema_, true)); auto file = PrepareDataFile("20251128", 10, 1024, {{"salary", 1.0}}, {{"salary", 2.0}}, {{4, 10}}, {{4, 5}}, {{4, 5}}); - auto result = evaluator->Eval(*file); + auto result = evaluator->Evaluate(*file); ASSERT_TRUE(result.has_value()); ASSERT_EQ(result.value(), kRowsMightMatch) << unbound->ToString(); } @@ -209,7 +209,7 @@ TEST_F(InclusiveMetricsEvaluatorTest, IsNanTest) { InclusiveMetricsEvaluator::Make(unbound, *schema_, true)); auto file = PrepareDataFile("20251128", 10, 1024, {{"salary", 1.0}}, {{"salary", 2.0}}, {{4, 10}}, {{4, 10}}, {{4, 5}}); - auto result = evaluator->Eval(*file); + auto result = evaluator->Evaluate(*file); ASSERT_TRUE(result.has_value()); ASSERT_EQ(result.value(), kRowCannotMatch) << unbound->ToString(); } @@ -219,7 +219,7 @@ TEST_F(InclusiveMetricsEvaluatorTest, IsNanTest) { InclusiveMetricsEvaluator::Make(unbound, *schema_, true)); auto file = PrepareDataFile("20251128", 10, 1024, {{"salary", 1.0}}, {{"salary", 2.0}}, {{4, 10}}, {{4, 5}}, {{4, 0}}); - auto result = evaluator->Eval(*file); + auto result = evaluator->Evaluate(*file); ASSERT_TRUE(result.has_value()); ASSERT_EQ(result.value(), kRowCannotMatch) << unbound->ToString(); } @@ -232,7 +232,7 @@ TEST_F(InclusiveMetricsEvaluatorTest, NotNanTest) { InclusiveMetricsEvaluator::Make(unbound, *schema_, true)); auto file = PrepareDataFile("20251128", 10, 1024, {{"salary", 1.0}}, {{"salary", 2.0}}, {{4, 10}}, {}, {{4, 5}}); - auto result = evaluator->Eval(*file); + auto result = evaluator->Evaluate(*file); ASSERT_TRUE(result.has_value()); ASSERT_EQ(result.value(), kRowsMightMatch) << unbound->ToString(); } @@ -242,7 +242,7 @@ TEST_F(InclusiveMetricsEvaluatorTest, NotNanTest) { InclusiveMetricsEvaluator::Make(unbound, *schema_, true)); auto file = PrepareDataFile("20251128", 10, 1024, {{"salary", 1.0}}, {{"salary", 2.0}}, {{4, 10}}, {}, {{4, 10}}); - auto result = evaluator->Eval(*file); + auto result = evaluator->Evaluate(*file); ASSERT_TRUE(result.has_value()); ASSERT_EQ(result.value(), kRowCannotMatch) << unbound->ToString(); } @@ -379,7 +379,7 @@ TEST_F(InclusiveMetricsEvaluatorTest, NotStartsWithTest) { InclusiveMetricsEvaluator::Make(unbound, *schema_, true)); auto file = PrepareDataFile("20251128", 10, 1024, {{"name", "123"}}, {{"name", "123"}}, {{2, 10}}, {{2, 0}}); - auto result = evaluator->Eval(*file); + auto result = evaluator->Evaluate(*file); ASSERT_TRUE(result.has_value()); ASSERT_EQ(result.value(), expected_result) << unbound->ToString(); }; @@ -536,7 +536,7 @@ class InclusiveMetricsEvaluatorMigratedTest : public InclusiveMetricsEvaluatorTe const std::shared_ptr& file, bool case_sensitive = true) { ICEBERG_UNWRAP_OR_FAIL( auto evaluator, InclusiveMetricsEvaluator::Make(expr, *schema_, case_sensitive)); - auto result = evaluator->Eval(*file); + auto result = evaluator->Evaluate(*file); ASSERT_TRUE(result.has_value()); ASSERT_EQ(result.value(), expected_result) << expr->ToString(); }; diff --git a/src/iceberg/test/inclusive_metrics_evaluator_with_transform_test.cc b/src/iceberg/test/inclusive_metrics_evaluator_with_transform_test.cc index 3ee877469..935f3c3ab 100644 --- a/src/iceberg/test/inclusive_metrics_evaluator_with_transform_test.cc +++ b/src/iceberg/test/inclusive_metrics_evaluator_with_transform_test.cc @@ -95,7 +95,7 @@ class InclusiveMetricsEvaluatorWithTransformTest : public ::testing::Test { auto target_file = file ? file : data_file_; ICEBERG_UNWRAP_OR_FAIL( auto evaluator, InclusiveMetricsEvaluator::Make(expr, *schema_, case_sensitive)); - auto eval_result = evaluator->Eval(*target_file); + auto eval_result = evaluator->Evaluate(*target_file); ASSERT_TRUE(eval_result.has_value()); ASSERT_EQ(eval_result.value(), expected_result) << expr->ToString(); }