From 94dfe90a99bc64a7e83e9158513641b025134c66 Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Wed, 26 Nov 2025 21:55:22 +0800 Subject: [PATCH 1/7] feat: add transform project Add Transform::Project for inclusive predicate projection This PR implements Transform::Project, which transforms a BoundPredicate to an inclusive predicate on partition values. StrictProject will be added in a separate PR to keep the review easier. Move template implementations of Expressions into header file, or the linker will shout out the following: Undefined symbols for architecture x86_64: "std::__1::shared_ptr> iceberg::Expressions::In(std::__1::shared_ptr>, std::initializer_list)", referenced from: iceberg::ProjectionUtil::FixInclusiveTimeProjection(std::__1::shared_ptr> const&) in transform.cc.o --- src/iceberg/expression/predicate.h | 1 + src/iceberg/test/transform_test.cc | 477 ++++++++++++++++++++ src/iceberg/transform.cc | 66 +++ src/iceberg/transform.h | 15 +- src/iceberg/transform_function.h | 3 + src/iceberg/type_fwd.h | 3 + src/iceberg/util/projection_util_internal.h | 476 +++++++++++++++++++ 7 files changed, 1040 insertions(+), 1 deletion(-) create mode 100644 src/iceberg/util/projection_util_internal.h diff --git a/src/iceberg/expression/predicate.h b/src/iceberg/expression/predicate.h index b7ed21ff8..cdd3d1f52 100644 --- a/src/iceberg/expression/predicate.h +++ b/src/iceberg/expression/predicate.h @@ -27,6 +27,7 @@ #include "iceberg/expression/expression.h" #include "iceberg/expression/literal.h" #include "iceberg/expression/term.h" +#include "iceberg/iceberg_export.h" namespace iceberg { diff --git a/src/iceberg/test/transform_test.cc b/src/iceberg/test/transform_test.cc index 529297b27..a9d1ec261 100644 --- a/src/iceberg/test/transform_test.cc +++ b/src/iceberg/test/transform_test.cc @@ -26,12 +26,17 @@ #include #include +#include "iceberg/expression/expressions.h" #include "iceberg/expression/literal.h" +#include "iceberg/expression/predicate.h" +#include "iceberg/schema.h" #include "iceberg/schema_field.h" #include "iceberg/test/matchers.h" #include "iceberg/test/temporal_test_helper.h" #include "iceberg/type.h" +#include "iceberg/util/checked_cast.h" #include "iceberg/util/formatter.h" // IWYU pragma: keep +#include "iceberg/util/macros.h" namespace iceberg { @@ -914,4 +919,476 @@ TEST(TransformCanTransformTest, CanTransform) { } } +// Test fixture for Transform::Project tests +class TransformProjectTest : public ::testing::Test { + protected: + void SetUp() override { + // Create test schemas for different source types + int_schema_ = std::make_shared( + std::vector{SchemaField::MakeRequired(1, "key", int32()), + SchemaField::MakeOptional(2, "value", int32())}, + /*schema_id=*/0); + long_schema_ = std::make_shared( + std::vector{SchemaField::MakeRequired(1, "value", int64())}, + /*schema_id=*/0); + string_schema_ = std::make_shared( + std::vector{SchemaField::MakeRequired(1, "value", string())}, + /*schema_id=*/0); + date_schema_ = std::make_shared( + std::vector{SchemaField::MakeRequired(1, "value", date())}, + /*schema_id=*/0); + timestamp_schema_ = std::make_shared( + std::vector{SchemaField::MakeRequired(1, "value", timestamp())}, + /*schema_id=*/0); + } + + std::shared_ptr int_schema_; + std::shared_ptr long_schema_; + std::shared_ptr string_schema_; + std::shared_ptr date_schema_; + std::shared_ptr timestamp_schema_; +}; + +TEST_F(TransformProjectTest, IdentityProjectEquality) { + auto transform = Transform::Identity(); + + // Test equality predicate + auto unbound = Expressions::Equal("value", Literal::Int(100)); + ICEBERG_ASSIGN_OR_THROW(auto bound, + unbound->Bind(*int_schema_, /*case_sensitive=*/true)); + auto bound_pred = std::dynamic_pointer_cast(bound); + ASSERT_NE(bound_pred, nullptr); + + ICEBERG_ASSIGN_OR_THROW(auto projected, transform->Project("part", bound_pred)); + ASSERT_NE(projected, nullptr); + EXPECT_EQ(projected->op(), Expression::Operation::kEq); + + auto unbound_projected = + internal::checked_pointer_cast>(projected); + ASSERT_NE(unbound_projected, nullptr); + EXPECT_EQ(unbound_projected->op(), Expression::Operation::kEq); + EXPECT_EQ(unbound_projected->literals().size(), 1); + EXPECT_EQ(std::get(unbound_projected->literals().front().value()), 100); +} + +TEST_F(TransformProjectTest, IdentityProjectComparison) { + auto transform = Transform::Identity(); + + // Test less than predicate + auto unbound_lt = Expressions::LessThan("value", Literal::Int(50)); + ICEBERG_ASSIGN_OR_THROW(auto bound_lt, + unbound_lt->Bind(*int_schema_, /*case_sensitive=*/true)); + auto bound_pred_lt = std::dynamic_pointer_cast(bound_lt); + ASSERT_NE(bound_pred_lt, nullptr); + + ICEBERG_ASSIGN_OR_THROW(auto projected_lt, transform->Project("part", bound_pred_lt)); + ASSERT_NE(projected_lt, nullptr); + EXPECT_EQ(projected_lt->op(), Expression::Operation::kLt); + + // Test greater than or equal predicate + auto unbound_gte = Expressions::GreaterThanOrEqual("value", Literal::Int(100)); + ICEBERG_ASSIGN_OR_THROW(auto bound_gte, + unbound_gte->Bind(*int_schema_, /*case_sensitive=*/true)); + auto bound_pred_gte = std::dynamic_pointer_cast(bound_gte); + ASSERT_NE(bound_pred_gte, nullptr); + + ICEBERG_ASSIGN_OR_THROW(auto projected_gte, transform->Project("part", bound_pred_gte)); + ASSERT_NE(projected_gte, nullptr); + EXPECT_EQ(projected_gte->op(), Expression::Operation::kGtEq); +} + +TEST_F(TransformProjectTest, IdentityProjectUnary) { + auto transform = Transform::Identity(); + + // Test IsNull predicate + auto unbound_null = Expressions::IsNull("value"); + ICEBERG_ASSIGN_OR_THROW(auto bound_null, + unbound_null->Bind(*int_schema_, /*case_sensitive=*/true)); + auto bound_pred_null = std::dynamic_pointer_cast(bound_null); + ASSERT_NE(bound_pred_null, nullptr); + + ICEBERG_ASSIGN_OR_THROW(auto projected_null, + transform->Project("part", bound_pred_null)); + ASSERT_NE(projected_null, nullptr); + EXPECT_EQ(projected_null->op(), Expression::Operation::kIsNull); +} + +TEST_F(TransformProjectTest, IdentityProjectSet) { + auto transform = Transform::Identity(); + + // Test IN predicate + auto unbound_in = + Expressions::In("value", {Literal::Int(1), Literal::Int(2), Literal::Int(3)}); + ICEBERG_ASSIGN_OR_THROW(auto bound_in, + unbound_in->Bind(*int_schema_, /*case_sensitive=*/true)); + auto bound_pred_in = std::dynamic_pointer_cast(bound_in); + ASSERT_NE(bound_pred_in, nullptr); + + ICEBERG_ASSIGN_OR_THROW(auto projected_in, transform->Project("part", bound_pred_in)); + ASSERT_NE(projected_in, nullptr); + EXPECT_EQ(projected_in->op(), Expression::Operation::kIn); + auto unbound_projected = + internal::checked_pointer_cast>(projected_in); + ASSERT_NE(unbound_projected, nullptr); + EXPECT_EQ(unbound_projected->op(), Expression::Operation::kIn); + EXPECT_EQ(unbound_projected->literals().size(), 3); + std::vector values; + for (const auto& lit : unbound_projected->literals()) { + values.push_back(std::get(lit.value())); + } + EXPECT_THAT(values, testing::UnorderedElementsAre(1, 2, 3)); +} + +TEST_F(TransformProjectTest, BucketProjectEquality) { + auto transform = Transform::Bucket(4); + + // Bucket can project equality predicates + auto unbound = Expressions::Equal("value", Literal::Int(34)); + ICEBERG_ASSIGN_OR_THROW(auto bound, + unbound->Bind(*int_schema_, /*case_sensitive=*/true)); + auto bound_pred = std::dynamic_pointer_cast(bound); + ASSERT_NE(bound_pred, nullptr); + + ICEBERG_ASSIGN_OR_THROW(auto projected, transform->Project("part", bound_pred)); + ASSERT_NE(projected, nullptr); + EXPECT_EQ(projected->op(), Expression::Operation::kEq); + + auto unbound_projected = + internal::checked_pointer_cast>(projected); + ASSERT_NE(unbound_projected, nullptr); + EXPECT_EQ(unbound_projected->op(), Expression::Operation::kEq); + EXPECT_EQ(unbound_projected->literals().size(), 1); + EXPECT_EQ(std::get(unbound_projected->literals().front().value()), 3); +} + +TEST_F(TransformProjectTest, BucketProjectWithMatchingTransformedChild) { + auto partition_transform = Transform::Bucket(16); + + // Create a predicate like: bucket(value, 16) = 5 + auto bucket_term = Expressions::Bucket("value", 16); + auto unbound = Expressions::Equal(bucket_term, Literal::Int(5)); + ICEBERG_ASSIGN_OR_THROW(auto bound, + unbound->Bind(*int_schema_, /*case_sensitive=*/true)); + auto bound_pred = std::dynamic_pointer_cast(bound); + ASSERT_NE(bound_pred, nullptr); + + // The predicate's term should be a transform + EXPECT_EQ(bound_pred->term()->kind(), Term::Kind::kTransform); + + auto dummy = Expressions::NotEqual(bucket_term, Literal::Int(5)); + + // When the transform matches, Project should use RemoveTransform and return the + // predicate + ICEBERG_ASSIGN_OR_THROW(auto projected, + partition_transform->Project("part", bound_pred)); + ASSERT_NE(projected, nullptr); + EXPECT_EQ(projected->op(), Expression::Operation::kEq); + auto unbound_projected = + internal::checked_pointer_cast>(projected); + ASSERT_NE(unbound_projected, nullptr); + EXPECT_EQ(unbound_projected->op(), Expression::Operation::kEq); + EXPECT_EQ(unbound_projected->literals().size(), 1); + EXPECT_EQ(std::get(unbound_projected->literals().front().value()), 5); +} + +TEST_F(TransformProjectTest, BucketProjectComparisonReturnsNull) { + auto transform = Transform::Bucket(16); + + // Bucket cannot project comparison predicates (they return null) + auto unbound_lt = Expressions::LessThan("value", Literal::Int(50)); + ICEBERG_ASSIGN_OR_THROW(auto bound_lt, + unbound_lt->Bind(*int_schema_, /*case_sensitive=*/true)); + auto bound_pred_lt = std::dynamic_pointer_cast(bound_lt); + ASSERT_NE(bound_pred_lt, nullptr); + + ICEBERG_ASSIGN_OR_THROW(auto projected_lt, transform->Project("part", bound_pred_lt)); + EXPECT_EQ(projected_lt, nullptr); +} + +TEST_F(TransformProjectTest, BucketProjectInSet) { + auto transform = Transform::Bucket(16); + + // Bucket can project IN predicates + auto unbound_in = + Expressions::In("value", {Literal::Int(1), Literal::Int(2), Literal::Int(3)}); + ICEBERG_ASSIGN_OR_THROW(auto bound_in, + unbound_in->Bind(*int_schema_, /*case_sensitive=*/true)); + auto bound_pred_in = std::dynamic_pointer_cast(bound_in); + ASSERT_NE(bound_pred_in, nullptr); + + ICEBERG_ASSIGN_OR_THROW(auto projected_in, transform->Project("part", bound_pred_in)); + ASSERT_NE(projected_in, nullptr); + EXPECT_EQ(projected_in->op(), Expression::Operation::kIn); +} + +TEST_F(TransformProjectTest, BucketProjectNotInReturnsNull) { + auto transform = Transform::Bucket(16); + + // Bucket cannot project NOT IN predicates + auto unbound_not_in = + Expressions::NotIn("value", {Literal::Int(1), Literal::Int(2), Literal::Int(3)}); + ICEBERG_ASSIGN_OR_THROW(auto bound_not_in, + unbound_not_in->Bind(*int_schema_, /*case_sensitive=*/true)); + auto bound_pred_not_in = std::dynamic_pointer_cast(bound_not_in); + ASSERT_NE(bound_pred_not_in, nullptr); + + ICEBERG_ASSIGN_OR_THROW(auto projected_not_in, + transform->Project("part", bound_pred_not_in)); + EXPECT_EQ(projected_not_in, nullptr); +} + +TEST_F(TransformProjectTest, TruncateProjectIntEquality) { + auto transform = Transform::Truncate(10); + + // Truncate can project equality predicates + auto unbound = Expressions::Equal("value", Literal::Int(123)); + ICEBERG_ASSIGN_OR_THROW(auto bound, + unbound->Bind(*int_schema_, /*case_sensitive=*/true)); + auto bound_pred = std::dynamic_pointer_cast(bound); + ASSERT_NE(bound_pred, nullptr); + + ICEBERG_ASSIGN_OR_THROW(auto projected, transform->Project("part", bound_pred)); + ASSERT_NE(projected, nullptr); + EXPECT_EQ(projected->op(), Expression::Operation::kEq); + + auto unbound_projected = + internal::checked_pointer_cast>(projected); + ASSERT_NE(unbound_projected, nullptr); + EXPECT_EQ(unbound_projected->op(), Expression::Operation::kEq); + EXPECT_EQ(unbound_projected->literals().size(), 1); + EXPECT_EQ(std::get(unbound_projected->literals().front().value()), 120); +} + +TEST_F(TransformProjectTest, TruncateProjectIntLessThan) { + auto transform = Transform::Truncate(10); + + // Truncate projects LT as LTE + auto unbound = Expressions::LessThan("value", Literal::Int(25)); + ICEBERG_ASSIGN_OR_THROW(auto bound, + unbound->Bind(*int_schema_, /*case_sensitive=*/true)); + auto bound_pred = std::dynamic_pointer_cast(bound); + ASSERT_NE(bound_pred, nullptr); + + ICEBERG_ASSIGN_OR_THROW(auto projected, transform->Project("part", bound_pred)); + ASSERT_NE(projected, nullptr); + EXPECT_EQ(projected->op(), Expression::Operation::kLtEq); +} + +TEST_F(TransformProjectTest, TruncateProjectIntGreaterThan) { + auto transform = Transform::Truncate(10); + + // Truncate projects GT as GTE + auto unbound = Expressions::GreaterThan("value", Literal::Int(25)); + ICEBERG_ASSIGN_OR_THROW(auto bound, + unbound->Bind(*int_schema_, /*case_sensitive=*/true)); + auto bound_pred = std::dynamic_pointer_cast(bound); + ASSERT_NE(bound_pred, nullptr); + + ICEBERG_ASSIGN_OR_THROW(auto projected, transform->Project("part", bound_pred)); + ASSERT_NE(projected, nullptr); + EXPECT_EQ(projected->op(), Expression::Operation::kGtEq); + + auto unbound_projected = + internal::checked_pointer_cast>(projected); + ASSERT_NE(unbound_projected, nullptr); + EXPECT_EQ(unbound_projected->op(), Expression::Operation::kGtEq); + EXPECT_EQ(unbound_projected->literals().size(), 1); + EXPECT_EQ(std::get(unbound_projected->literals().front().value()), 20); +} + +TEST_F(TransformProjectTest, TruncateProjectStringEquality) { + auto transform = Transform::Truncate(5); + + auto unbound = Expressions::Equal("value", Literal::String("Hello, World!")); + ICEBERG_ASSIGN_OR_THROW(auto bound, + unbound->Bind(*string_schema_, /*case_sensitive=*/true)); + auto bound_pred = std::dynamic_pointer_cast(bound); + ASSERT_NE(bound_pred, nullptr); + + ICEBERG_ASSIGN_OR_THROW(auto projected, transform->Project("part", bound_pred)); + ASSERT_NE(projected, nullptr); + EXPECT_EQ(projected->op(), Expression::Operation::kEq); + + auto unbound_projected = + internal::checked_pointer_cast>(projected); + ASSERT_NE(unbound_projected, nullptr); + EXPECT_EQ(unbound_projected->op(), Expression::Operation::kEq); + EXPECT_EQ(unbound_projected->literals().size(), 1); + EXPECT_EQ(std::get(unbound_projected->literals().front().value()), + "Hello"); +} + +TEST_F(TransformProjectTest, TruncateProjectStringStartsWith) { + auto transform = Transform::Truncate(5); + + // StartsWith with shorter string than width + auto unbound_short = Expressions::StartsWith("value", "Hi"); + ICEBERG_ASSIGN_OR_THROW(auto bound_short, + unbound_short->Bind(*string_schema_, /*case_sensitive=*/true)); + auto bound_pred_short = std::dynamic_pointer_cast(bound_short); + ASSERT_NE(bound_pred_short, nullptr); + + ICEBERG_ASSIGN_OR_THROW(auto projected_short, + transform->Project("part", bound_pred_short)); + ASSERT_NE(projected_short, nullptr); + EXPECT_EQ(projected_short->op(), Expression::Operation::kStartsWith); + + auto unbound_projected_short = + internal::checked_pointer_cast>( + projected_short); + ASSERT_NE(unbound_projected_short, nullptr); + EXPECT_EQ(unbound_projected_short->op(), Expression::Operation::kStartsWith); + EXPECT_EQ(unbound_projected_short->literals().size(), 1); + EXPECT_EQ(std::get(unbound_projected_short->literals().front().value()), + "Hi"); + + // StartsWith with string equal to width + auto unbound_equal = Expressions::StartsWith("value", "Hello"); + ICEBERG_ASSIGN_OR_THROW(auto bound_equal, + unbound_equal->Bind(*string_schema_, /*case_sensitive=*/true)); + auto bound_pred_equal = std::dynamic_pointer_cast(bound_equal); + ASSERT_NE(bound_pred_equal, nullptr); + + ICEBERG_ASSIGN_OR_THROW(auto projected_equal, + transform->Project("part", bound_pred_equal)); + ASSERT_NE(projected_equal, nullptr); + EXPECT_EQ(projected_equal->op(), Expression::Operation::kEq); + + auto unbound_projected_equal = + internal::checked_pointer_cast>( + projected_equal); + ASSERT_NE(unbound_projected_equal, nullptr); + EXPECT_EQ(unbound_projected_equal->op(), Expression::Operation::kEq); + EXPECT_EQ(unbound_projected_equal->literals().size(), 1); + EXPECT_EQ(std::get(unbound_projected_equal->literals().front().value()), + "Hello"); +} + +TEST_F(TransformProjectTest, YearProjectEquality) { + auto transform = Transform::Year(); + + // 2021-06-01 as days from epoch + int32_t date_value = + TemporalTestHelper::CreateDate({.year = 2021, .month = 6, .day = 1}); + auto unbound = Expressions::Equal("value", Literal::Date(date_value)); + ICEBERG_ASSIGN_OR_THROW(auto bound, + unbound->Bind(*date_schema_, /*case_sensitive=*/true)); + auto bound_pred = std::dynamic_pointer_cast(bound); + ASSERT_NE(bound_pred, nullptr); + + ICEBERG_ASSIGN_OR_THROW(auto projected, transform->Project("part", bound_pred)); + ASSERT_NE(projected, nullptr); + EXPECT_EQ(projected->op(), Expression::Operation::kEq); +} + +TEST_F(TransformProjectTest, YearProjectComparison) { + auto transform = Transform::Year(); + + int32_t date_value = + TemporalTestHelper::CreateDate({.year = 2021, .month = 6, .day = 1}); + + // LT projects to LTE + auto unbound_lt = Expressions::LessThan("value", Literal::Date(date_value)); + ICEBERG_ASSIGN_OR_THROW(auto bound_lt, + unbound_lt->Bind(*date_schema_, /*case_sensitive=*/true)); + auto bound_pred_lt = std::dynamic_pointer_cast(bound_lt); + ASSERT_NE(bound_pred_lt, nullptr); + + ICEBERG_ASSIGN_OR_THROW(auto projected_lt, transform->Project("part", bound_pred_lt)); + ASSERT_NE(projected_lt, nullptr); + EXPECT_EQ(projected_lt->op(), Expression::Operation::kLtEq); + + // GT projects to GTE + auto unbound_gt = Expressions::GreaterThan("value", Literal::Date(date_value)); + ICEBERG_ASSIGN_OR_THROW(auto bound_gt, + unbound_gt->Bind(*date_schema_, /*case_sensitive=*/true)); + auto bound_pred_gt = std::dynamic_pointer_cast(bound_gt); + ASSERT_NE(bound_pred_gt, nullptr); + + ICEBERG_ASSIGN_OR_THROW(auto projected_gt, transform->Project("part", bound_pred_gt)); + ASSERT_NE(projected_gt, nullptr); + EXPECT_EQ(projected_gt->op(), Expression::Operation::kGtEq); +} + +TEST_F(TransformProjectTest, MonthProjectEquality) { + auto transform = Transform::Month(); + + int64_t ts_value = + TemporalTestHelper::CreateTimestamp({.year = 2021, .month = 6, .day = 1}); + auto unbound = Expressions::Equal("value", Literal::Timestamp(ts_value)); + ICEBERG_ASSIGN_OR_THROW(auto bound, + unbound->Bind(*timestamp_schema_, /*case_sensitive=*/true)); + auto bound_pred = std::dynamic_pointer_cast(bound); + ASSERT_NE(bound_pred, nullptr); + + ICEBERG_ASSIGN_OR_THROW(auto projected, transform->Project("part", bound_pred)); + ASSERT_NE(projected, nullptr); + EXPECT_EQ(projected->op(), Expression::Operation::kEq); +} + +TEST_F(TransformProjectTest, DayProjectEquality) { + auto transform = Transform::Day(); + + int32_t date_value = + TemporalTestHelper::CreateDate({.year = 2021, .month = 6, .day = 15}); + auto unbound = Expressions::Equal("value", Literal::Date(date_value)); + ICEBERG_ASSIGN_OR_THROW(auto bound, + unbound->Bind(*date_schema_, /*case_sensitive=*/true)); + auto bound_pred = std::dynamic_pointer_cast(bound); + ASSERT_NE(bound_pred, nullptr); + + ICEBERG_ASSIGN_OR_THROW(auto projected, transform->Project("part", bound_pred)); + ASSERT_NE(projected, nullptr); + EXPECT_EQ(projected->op(), Expression::Operation::kEq); +} + +TEST_F(TransformProjectTest, HourProjectEquality) { + auto transform = Transform::Hour(); + + int64_t ts_value = TemporalTestHelper::CreateTimestamp( + {.year = 2021, .month = 6, .day = 1, .hour = 14, .minute = 30}); + auto unbound = Expressions::Equal("value", Literal::Timestamp(ts_value)); + ICEBERG_ASSIGN_OR_THROW(auto bound, + unbound->Bind(*timestamp_schema_, /*case_sensitive=*/true)); + auto bound_pred = std::dynamic_pointer_cast(bound); + ASSERT_NE(bound_pred, nullptr); + + ICEBERG_ASSIGN_OR_THROW(auto projected, transform->Project("part", bound_pred)); + ASSERT_NE(projected, nullptr); + EXPECT_EQ(projected->op(), Expression::Operation::kEq); +} + +TEST_F(TransformProjectTest, VoidProjectReturnsNull) { + auto transform = Transform::Void(); + + auto unbound = Expressions::Equal("value", Literal::Int(100)); + ICEBERG_ASSIGN_OR_THROW(auto bound, + unbound->Bind(*int_schema_, /*case_sensitive=*/true)); + auto bound_pred = std::dynamic_pointer_cast(bound); + ASSERT_NE(bound_pred, nullptr); + + // Void transform always returns null (no projection possible) + ICEBERG_ASSIGN_OR_THROW(auto projected, transform->Project("part", bound_pred)); + EXPECT_EQ(projected, nullptr); +} + +TEST_F(TransformProjectTest, TemporalProjectInSet) { + auto transform = Transform::Year(); + + int32_t date1 = TemporalTestHelper::CreateDate({.year = 2020, .month = 1, .day = 1}); + int32_t date2 = TemporalTestHelper::CreateDate({.year = 2021, .month = 6, .day = 15}); + int32_t date3 = TemporalTestHelper::CreateDate({.year = 2022, .month = 12, .day = 31}); + + auto unbound_in = Expressions::In( + "value", {Literal::Date(date1), Literal::Date(date2), Literal::Date(date3)}); + ICEBERG_ASSIGN_OR_THROW(auto bound_in, + unbound_in->Bind(*date_schema_, /*case_sensitive=*/true)); + auto bound_pred_in = std::dynamic_pointer_cast(bound_in); + ASSERT_NE(bound_pred_in, nullptr); + + ICEBERG_ASSIGN_OR_THROW(auto projected_in, transform->Project("part", bound_pred_in)); + ASSERT_NE(projected_in, nullptr); + EXPECT_EQ(projected_in->op(), Expression::Operation::kIn); +} + } // namespace iceberg diff --git a/src/iceberg/transform.cc b/src/iceberg/transform.cc index 2e39cb06d..857cc7a50 100644 --- a/src/iceberg/transform.cc +++ b/src/iceberg/transform.cc @@ -23,8 +23,14 @@ #include #include +#include "iceberg/expression/predicate.h" +#include "iceberg/expression/term.h" +#include "iceberg/result.h" #include "iceberg/transform_function.h" #include "iceberg/type.h" +#include "iceberg/util/checked_cast.h" +#include "iceberg/util/macros.h" +#include "iceberg/util/projection_util_internal.h" namespace iceberg { namespace { @@ -240,6 +246,66 @@ bool Transform::SatisfiesOrderOf(const Transform& other) const { std::unreachable(); } +Result> Transform::Project( + std::string_view name, const std::shared_ptr& predicate) { + switch (transform_type_) { + case TransformType::kIdentity: + return ProjectionUtil::IdentityProject(name, predicate); + case TransformType::kBucket: { + // If the predicate has a transformed child that matches the given transform, return + // a predicate. + if (predicate->term()->kind() == Term::Kind::kTransform) { + const auto boundTransform = + internal::checked_pointer_cast(predicate->term()); + if (*this == *boundTransform->transform()) { + return ProjectionUtil::RemoveTransform(name, predicate); + } else { + return nullptr; + } + } + ICEBERG_ASSIGN_OR_RAISE(auto func, Bind(predicate->term()->type())); + return ProjectionUtil::BucketProject(name, predicate, func); + } + case TransformType::kTruncate: { + // If the predicate has a transformed child that matches the given transform, return + // a predicate. + if (predicate->term()->kind() == Term::Kind::kTransform) { + const auto boundTransform = + internal::checked_pointer_cast(predicate->term()); + if (*this == *boundTransform->transform()) { + return ProjectionUtil::RemoveTransform(name, predicate); + } else { + return nullptr; + } + } + ICEBERG_ASSIGN_OR_RAISE(auto func, Bind(predicate->term()->type())); + return ProjectionUtil::TruncateProject(name, predicate, func); + } + case TransformType::kYear: + case TransformType::kMonth: + case TransformType::kDay: + case TransformType::kHour: { + // If the predicate has a transformed child that matches the given transform, return + // a predicate. + if (predicate->term()->kind() == Term::Kind::kTransform) { + const auto boundTransform = + internal::checked_pointer_cast(predicate->term()); + if (*this == *boundTransform->transform()) { + return ProjectionUtil::RemoveTransform(name, predicate); + } else { + return nullptr; + } + } + ICEBERG_ASSIGN_OR_RAISE(auto func, Bind(predicate->term()->type())); + return ProjectionUtil::TemporalProject(name, predicate, func); + } + case TransformType::kUnknown: + case TransformType::kVoid: + return nullptr; + } + std::unreachable(); +} + bool TransformFunction::Equals(const TransformFunction& other) const { return transform_type_ == other.transform_type_ && *source_type_ == *other.source_type_; } diff --git a/src/iceberg/transform.h b/src/iceberg/transform.h index 4f608c467..a82a33934 100644 --- a/src/iceberg/transform.h +++ b/src/iceberg/transform.h @@ -23,6 +23,7 @@ #include #include +#include #include #include @@ -164,11 +165,23 @@ class ICEBERG_EXPORT Transform : public util::Formattable { /// For example, sorting by day(ts) will produce an ordering that is also by month(ts) /// or year(ts). However, sorting by day(ts) will not satisfy the order of hour(ts) or /// identity(ts). - /// + /// \param other The other transform to compare with. /// \return true if ordering by this transform is equivalent to ordering by the other /// transform. bool SatisfiesOrderOf(const Transform& other) const; + /// \brief Transforms a BoundPredicate to an inclusive predicate on the partition values + /// produced by the transform. + /// + /// This inclusive transform guarantees that if predicate->Test(value) is true, then + /// Projected(transform(value)) is true. + /// \param name The name of the partition column. + /// \param predicate The predicate to project. + /// \return A Result containing either a shared pointer to the projected predicate or an + /// Error if the projection fails. + Result> Project( + std::string_view name, const std::shared_ptr& predicate); + /// \brief Returns a string representation of this transform (e.g., "bucket[16]"). std::string ToString() const override; diff --git a/src/iceberg/transform_function.h b/src/iceberg/transform_function.h index 33712ca60..b3cfa5a22 100644 --- a/src/iceberg/transform_function.h +++ b/src/iceberg/transform_function.h @@ -83,6 +83,9 @@ class ICEBERG_EXPORT TruncateTransform : public TransformFunction { /// \brief Returns the same type as source_type. std::shared_ptr ResultType() const override; + /// \brief Returns the width to truncate to. + int32_t width() const { return width_; } + /// \brief Create a TruncateTransform. /// \param source_type Type of the input data. /// \param width The width to truncate to. diff --git a/src/iceberg/type_fwd.h b/src/iceberg/type_fwd.h index 68a4543fb..abca91b7e 100644 --- a/src/iceberg/type_fwd.h +++ b/src/iceberg/type_fwd.h @@ -161,6 +161,9 @@ class PendingUpdate; template class PendingUpdateTyped; +class BoundPredicate; +class UnboundPredicate; + /// ---------------------------------------------------------------------------- /// TODO: Forward declarations below are not added yet. /// ---------------------------------------------------------------------------- diff --git a/src/iceberg/util/projection_util_internal.h b/src/iceberg/util/projection_util_internal.h new file mode 100644 index 000000000..57e8a46fd --- /dev/null +++ b/src/iceberg/util/projection_util_internal.h @@ -0,0 +1,476 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "iceberg/expression/expressions.h" +#include "iceberg/expression/literal.h" +#include "iceberg/expression/predicate.h" +#include "iceberg/result.h" +#include "iceberg/transform.h" +#include "iceberg/transform_function.h" +#include "iceberg/util/checked_cast.h" +#include "iceberg/util/decimal.h" +#include "iceberg/util/macros.h" + +namespace iceberg { + +class ProjectionUtil { + private: + static Result> TransformSet( + std::string_view name, const std::shared_ptr& predicate, + const std::shared_ptr& func) { + std::vector transformed; + transformed.reserve(predicate->literal_set().size()); + for (const auto& lit : predicate->literal_set()) { + ICEBERG_ASSIGN_OR_RAISE(auto transformed_lit, func->Transform(lit)); + transformed.push_back(std::move(transformed_lit)); + } + return Expressions::Predicate(predicate->op(), std::string(name), + std::move(transformed)); + } + + static Result> TruncateArray( + std::string_view name, const std::shared_ptr& predicate, + const std::shared_ptr& func) { + const Literal& boundary = predicate->literal(); + + ICEBERG_ASSIGN_OR_RAISE(auto transformed, func->Transform(boundary)); + + switch (predicate->op()) { + case Expression::Operation::kLt: + case Expression::Operation::kLtEq: + return Expressions::Predicate(Expression::Operation::kLtEq, std::string(name), + transformed); + case Expression::Operation::kGt: + case Expression::Operation::kGtEq: + return Expressions::Predicate(Expression::Operation::kGtEq, std::string(name), + transformed); + case Expression::Operation::kEq: + return Expressions::Predicate(Expression::Operation::kEq, std::string(name), + transformed); + case Expression::Operation::kStartsWith: + return Expressions::Predicate(Expression::Operation::kStartsWith, + std::string(name), transformed); + default: + return nullptr; + } + std::unreachable(); + } + + template + requires std::is_same_v || std::is_same_v + static Result> TruncateInteger( + std::string_view name, const std::shared_ptr& predicate, + const std::shared_ptr& func) { + const Literal& literal = predicate->literal(); + bool is_source_date_type = func->source_type()->type_id() == TypeId::kDate; + + switch (predicate->op()) { + case Expression::Operation::kLt: { + // adjust closed and then transform ltEq + ICEBERG_ASSIGN_OR_RAISE( + auto transformed, + func->Transform(is_source_date_type + ? Literal::Date(std::get(literal.value()) - 1) + : Literal::Int(std::get(literal.value()) - 1))); + return Expressions::Predicate(Expression::Operation::kLtEq, std::string(name), + transformed); + } + case Expression::Operation::kLtEq: { + ICEBERG_ASSIGN_OR_RAISE(auto transformed, func->Transform(literal)); + return Expressions::Predicate(Expression::Operation::kLtEq, std::string(name), + transformed); + } + case Expression::Operation::kGt: { + // adjust closed and then transform gtEq + ICEBERG_ASSIGN_OR_RAISE( + auto transformed, + func->Transform(is_source_date_type + ? Literal::Date(std::get(literal.value()) + 1) + : Literal::Int(std::get(literal.value()) + 1))); + return Expressions::Predicate(Expression::Operation::kGtEq, std::string(name), + transformed); + } + case Expression::Operation::kGtEq: { + ICEBERG_ASSIGN_OR_RAISE(auto transformed, func->Transform(literal)); + return Expressions::Predicate(Expression::Operation::kGtEq, std::string(name), + transformed); + } + case Expression::Operation::kEq: { + ICEBERG_ASSIGN_OR_RAISE(auto transformed, func->Transform(literal)); + return Expressions::Predicate(Expression::Operation::kEq, std::string(name), + transformed); + } + default: + return nullptr; + } + } + + static Result> TruncateDecimal( + std::string_view name, const std::shared_ptr& predicate, + const std::shared_ptr& func) { + const Literal& boundary = predicate->literal(); + + // For boundary adjustments, extract type info once + auto make_adjusted_literal = [&boundary](int adjustment) { + const auto& type = internal::checked_pointer_cast(boundary.type()); + Decimal adjusted = std::get(boundary.value()) + Decimal(adjustment); + return Literal::Decimal(adjusted.value(), type->precision(), type->scale()); + }; + + switch (predicate->op()) { + case Expression::Operation::kLt: { + // adjust closed and then transform ltEq + ICEBERG_ASSIGN_OR_RAISE(auto transformed, + func->Transform(make_adjusted_literal(-1))); + return Expressions::Predicate(Expression::Operation::kLtEq, std::string(name), + transformed); + } + case Expression::Operation::kLtEq: { + ICEBERG_ASSIGN_OR_RAISE(auto transformed, func->Transform(boundary)); + return Expressions::Predicate(Expression::Operation::kLtEq, std::string(name), + transformed); + } + case Expression::Operation::kGt: { + // adjust closed and then transform gtEq + ICEBERG_ASSIGN_OR_RAISE(auto transformed, + func->Transform(make_adjusted_literal(1))); + return Expressions::Predicate(Expression::Operation::kGtEq, std::string(name), + transformed); + } + case Expression::Operation::kGtEq: { + ICEBERG_ASSIGN_OR_RAISE(auto transformed, func->Transform(boundary)); + return Expressions::Predicate(Expression::Operation::kGtEq, std::string(name), + transformed); + } + case Expression::Operation::kEq: { + ICEBERG_ASSIGN_OR_RAISE(auto transformed, func->Transform(boundary)); + return Expressions::Predicate(Expression::Operation::kEq, std::string(name), + transformed); + } + default: + return nullptr; + } + } + + static Result> TruncateStringLiteral( + std::string_view name, const std::shared_ptr& predicate, + const std::shared_ptr& func) { + const auto op = predicate->op(); + if (op != Expression::Operation::kStartsWith && + op != Expression::Operation::kNotStartsWith) { + return TruncateArray(name, predicate, func); + } + + const auto& truncate_transform = + internal::checked_pointer_cast(func); + const auto& str_value = std::get(predicate->literal().value()); + const auto width = truncate_transform->width(); + + if (str_value.length() < width) { + return Expressions::Predicate(op, std::string(name), predicate->literal()); + } + + if (str_value.length() == width) { + return op == Expression::Operation::kStartsWith + ? Expressions::Equal(std::string(name), predicate->literal()) + : Expressions::NotEqual(std::string(name), predicate->literal()); + } + + return op == Expression::Operation::kStartsWith ? TruncateArray(name, predicate, func) + : nullptr; + } + + // Fixes an inclusive projection to account for incorrectly transformed values. + // align with Java implementation: + // https://github.com/apache/iceberg/blob/main/api/src/main/java/org/apache/iceberg/transforms/ProjectionUtil.java#L275 + static std::shared_ptr FixInclusiveTimeProjection( + const std::shared_ptr>& projected) { + if (projected == nullptr) { + return nullptr; + } + + // adjust the predicate for values that were 1 larger than the correct transformed + // value + switch (projected->op()) { + case Expression::Operation::kLt: { + ICEBERG_DCHECK(!projected->literals().empty(), "Expected at least one literal"); + const auto& literal = projected->literals().front(); + ICEBERG_DCHECK(std::holds_alternative(literal.value()), + "Expected int32_t"); + auto value = std::get(literal.value()); + if (value < 0) { + return Expressions::LessThan(projected->term(), Literal::Int(value + 1)); + } + + return projected; + } + + case Expression::Operation::kLtEq: { + ICEBERG_DCHECK(!projected->literals().empty(), "Expected at least one literal"); + const auto& literal = projected->literals().front(); + ICEBERG_DCHECK(std::holds_alternative(literal.value()), + "Expected int32_t"); + auto value = std::get(literal.value()); + if (value < 0) { + return Expressions::LessThanOrEqual(projected->term(), Literal::Int(value + 1)); + } + return projected; + } + + case Expression::Operation::kGt: + case Expression::Operation::kGtEq: + // incorrect projected values are already greater than the bound for GT, GT_EQ + return projected; + + case Expression::Operation::kEq: { + ICEBERG_DCHECK(!projected->literals().empty(), "Expected at least one literal"); + const auto& literal = projected->literals().front(); + ICEBERG_DCHECK(std::holds_alternative(literal.value()), + "Expected int32_t"); + auto value = std::get(literal.value()); + if (value < 0) { + // match either the incorrect value (projectedValue + 1) or the correct value + // (projectedValue) + return Expressions::In(projected->term(), {literal, Literal::Int(value + 1)}); + } + return projected; + } + + case Expression::Operation::kIn: { + ICEBERG_DCHECK(!projected->literals().empty(), "Expected at least one literal"); + const auto& literals = projected->literals(); + ICEBERG_DCHECK( + std::ranges::all_of(literals, + [](const auto& lit) { + return std::holds_alternative(lit.value()); + }), + "Expected int32_t"); + std::unordered_set value_set; + bool has_negative_value = false; + for (const auto& lit : literals) { + auto value = std::get(lit.value()); + value_set.insert(value); + if (value < 0) { + value_set.insert(value + 1); + has_negative_value = true; + } + } + if (has_negative_value) { + auto values = + std::views::transform(value_set, + [](int32_t value) { return Literal::Int(value); }) | + std::ranges::to(); + return Expressions::In(projected->term(), std::move(values)); + } + return projected; + } + + case Expression::Operation::kNotIn: + case Expression::Operation::kNotEq: + // there is no inclusive projection for NOT_EQ and NOT_IN + return nullptr; + + default: + return projected; + } + } + + public: + static Result> IdentityProject( + std::string_view name, const std::shared_ptr& predicate) { + switch (predicate->kind()) { + case BoundPredicate::Kind::kUnary: + return Expressions::Predicate(predicate->op(), std::string(name)); + case BoundPredicate::Kind::kLiteral: { + const auto& literalPredicate = + internal::checked_pointer_cast(predicate); + return Expressions::Predicate(predicate->op(), std::string(name), + literalPredicate->literal()); + } + case BoundPredicate::Kind::kSet: { + const auto& setPredicate = + internal::checked_pointer_cast(predicate); + return Expressions::Predicate( + predicate->op(), std::string(name), + {setPredicate->literal_set().begin(), setPredicate->literal_set().end()}); + } + } + + std::unreachable(); + } + + static Result> BucketProject( + std::string_view name, const std::shared_ptr& predicate, + const std::shared_ptr& func) { + switch (predicate->kind()) { + case BoundPredicate::Kind::kUnary: + return Expressions::Predicate(predicate->op(), std::string(name)); + case BoundPredicate::Kind::kLiteral: { + if (predicate->op() == Expression::Operation::kEq) { + const auto& literalPredicate = + internal::checked_pointer_cast(predicate); + ICEBERG_ASSIGN_OR_RAISE(auto transformed, + func->Transform(literalPredicate->literal())); + return Expressions::Predicate(predicate->op(), std::string(name), transformed); + } + break; + } + case BoundPredicate::Kind::kSet: { + // notIn can't be projected + if (predicate->op() == Expression::Operation::kIn) { + const auto& setPredicate = + internal::checked_pointer_cast(predicate); + return TransformSet(name, setPredicate, func); + } + break; + } + } + + // comparison predicates can't be projected, notEq can't be projected + // TODO(anyone): small ranges can be projected. + // for example, (x > 0) and (x < 3) can be turned into in({1, 2}) and projected. + return nullptr; + } + + static Result> TruncateProject( + std::string_view name, const std::shared_ptr& predicate, + const std::shared_ptr& func) { + // Handle unary predicates uniformly for all types + if (predicate->kind() == BoundPredicate::Kind::kUnary) { + return Expressions::Predicate(predicate->op(), std::string(name)); + } + + // Handle set predicates (kIn) uniformly for all types + if (predicate->kind() == BoundPredicate::Kind::kSet) { + if (predicate->op() == Expression::Operation::kIn) { + const auto& setPredicate = + internal::checked_pointer_cast(predicate); + return TransformSet(name, setPredicate, func); + } + return nullptr; + } + + // Handle literal predicates based on source type + const auto& literalPredicate = + internal::checked_pointer_cast(predicate); + + switch (func->source_type()->type_id()) { + case TypeId::kInt: + return TruncateInteger(name, literalPredicate, func); + case TypeId::kLong: + return TruncateInteger(name, literalPredicate, func); + case TypeId::kDecimal: + return TruncateDecimal(name, literalPredicate, func); + case TypeId::kString: + return TruncateStringLiteral(name, literalPredicate, func); + case TypeId::kBinary: + return TruncateArray(name, literalPredicate, func); + default: + return NotSupported("{} is not a valid input type for truncate transform", + func->source_type()->ToString()); + } + } + + static Result> TemporalProject( + std::string_view name, const std::shared_ptr& predicate, + const std::shared_ptr& func) { + if (predicate->kind() == BoundPredicate::Kind::kUnary) { + return Expressions::Predicate(predicate->op(), std::string(name)); + } + + if (predicate->kind() == BoundPredicate::Kind::kSet) { + if (predicate->op() == Expression::Operation::kIn) { + const auto& setPredicate = + internal::checked_pointer_cast(predicate); + ICEBERG_ASSIGN_OR_RAISE(auto projected, TransformSet(name, setPredicate, func)); + if (func->transform_type() != TransformType::kDay) { + return FixInclusiveTimeProjection( + internal::checked_pointer_cast>( + projected)); + } + return projected; + } + return nullptr; + } + + const auto& literalPredicate = + internal::checked_pointer_cast(predicate); + + switch (func->source_type()->type_id()) { + case TypeId::kDate: { + ICEBERG_ASSIGN_OR_RAISE(auto projected, + TruncateInteger(name, literalPredicate, func)); + if (func->transform_type() != TransformType::kDay) { + return FixInclusiveTimeProjection( + internal::checked_pointer_cast>( + projected)); + } + return projected; + } + case TypeId::kTimestamp: + case TypeId::kTimestampTz: { + ICEBERG_ASSIGN_OR_RAISE(auto projected, + TruncateInteger(name, literalPredicate, func)); + if (func->transform_type() != TransformType::kDay) { + return FixInclusiveTimeProjection( + internal::checked_pointer_cast>( + projected)); + } + return projected; + } + + default: + return NotSupported("{} is not a valid input type for temporal transform", + func->source_type()->ToString()); + } + } + + static Result> RemoveTransform( + std::string_view name, const std::shared_ptr& predicate) { + switch (predicate->kind()) { + case BoundPredicate::Kind::kUnary: + return Expressions::Predicate(predicate->op(), std::string(name)); + case BoundPredicate::Kind::kLiteral: { + const auto& literalPredicate = + internal::checked_pointer_cast(predicate); + return Expressions::Predicate(predicate->op(), std::string(name), + literalPredicate->literal()); + } + case BoundPredicate::Kind::kSet: { + const auto& setPredicate = + internal::checked_pointer_cast(predicate); + return Expressions::Predicate( + predicate->op(), std::string(name), + {setPredicate->literal_set().begin(), setPredicate->literal_set().end()}); + } + } + std::unreachable(); + } +}; + +} // namespace iceberg From ad56d179170089c61963b77f40f9dfac94ad7e19 Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Sun, 30 Nov 2025 17:24:59 +0800 Subject: [PATCH 2/7] fix: TruncateInteger should consider Literal type --- src/iceberg/util/projection_util_internal.h | 53 +++++++++++++++------ 1 file changed, 38 insertions(+), 15 deletions(-) diff --git a/src/iceberg/util/projection_util_internal.h b/src/iceberg/util/projection_util_internal.h index 57e8a46fd..08bf76512 100644 --- a/src/iceberg/util/projection_util_internal.h +++ b/src/iceberg/util/projection_util_internal.h @@ -87,18 +87,29 @@ class ProjectionUtil { std::string_view name, const std::shared_ptr& predicate, const std::shared_ptr& func) { const Literal& literal = predicate->literal(); - bool is_source_date_type = func->source_type()->type_id() == TypeId::kDate; switch (predicate->op()) { case Expression::Operation::kLt: { // adjust closed and then transform ltEq - ICEBERG_ASSIGN_OR_RAISE( - auto transformed, - func->Transform(is_source_date_type - ? Literal::Date(std::get(literal.value()) - 1) - : Literal::Int(std::get(literal.value()) - 1))); - return Expressions::Predicate(Expression::Operation::kLtEq, std::string(name), - transformed); + if constexpr (std::is_same_v) { + ICEBERG_ASSIGN_OR_RAISE( + auto transformed, + func->Transform(literal.type()->type_id() == TypeId::kDate + ? Literal::Date(std::get(literal.value()) - 1) + : Literal::Int(std::get(literal.value()) - 1))); + return Expressions::Predicate(Expression::Operation::kLtEq, std::string(name), + transformed); + } else { + ICEBERG_ASSIGN_OR_RAISE( + auto transformed, + func->Transform( + literal.type()->type_id() == TypeId::kTimestamp || + literal.type()->type_id() == TypeId::kTimestampTz + ? Literal::Timestamp(std::get(literal.value()) - 1) + : Literal::Long(std::get(literal.value()) - 1))); + return Expressions::Predicate(Expression::Operation::kLtEq, std::string(name), + transformed); + } } case Expression::Operation::kLtEq: { ICEBERG_ASSIGN_OR_RAISE(auto transformed, func->Transform(literal)); @@ -107,13 +118,25 @@ class ProjectionUtil { } case Expression::Operation::kGt: { // adjust closed and then transform gtEq - ICEBERG_ASSIGN_OR_RAISE( - auto transformed, - func->Transform(is_source_date_type - ? Literal::Date(std::get(literal.value()) + 1) - : Literal::Int(std::get(literal.value()) + 1))); - return Expressions::Predicate(Expression::Operation::kGtEq, std::string(name), - transformed); + if constexpr (std::is_same_v) { + ICEBERG_ASSIGN_OR_RAISE( + auto transformed, + func->Transform(literal.type()->type_id() == TypeId::kDate + ? Literal::Date(std::get(literal.value()) + 1) + : Literal::Int(std::get(literal.value()) + 1))); + return Expressions::Predicate(Expression::Operation::kGtEq, std::string(name), + transformed); + } else { + ICEBERG_ASSIGN_OR_RAISE( + auto transformed, + func->Transform( + literal.type()->type_id() == TypeId::kTimestamp || + literal.type()->type_id() == TypeId::kTimestampTz + ? Literal::Timestamp(std::get(literal.value()) + 1) + : Literal::Long(std::get(literal.value()) + 1))); + return Expressions::Predicate(Expression::Operation::kGtEq, std::string(name), + transformed); + } } case Expression::Operation::kGtEq: { ICEBERG_ASSIGN_OR_RAISE(auto transformed, func->Transform(literal)); From a19bb795553c32b5cf8058afce7fe45d667254e6 Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Mon, 1 Dec 2025 21:57:06 +0800 Subject: [PATCH 3/7] fix: review comments --- src/iceberg/type_fwd.h | 6 +- src/iceberg/util/projection_util_internal.h | 433 +++++++++++++------- 2 files changed, 290 insertions(+), 149 deletions(-) diff --git a/src/iceberg/type_fwd.h b/src/iceberg/type_fwd.h index abca91b7e..7aad3d8d8 100644 --- a/src/iceberg/type_fwd.h +++ b/src/iceberg/type_fwd.h @@ -123,6 +123,9 @@ class Uuid; class Expression; class Literal; +class BoundPredicate; +class UnboundPredicate; + class DataTableScan; class FileScanTask; class ScanTask; @@ -161,9 +164,6 @@ class PendingUpdate; template class PendingUpdateTyped; -class BoundPredicate; -class UnboundPredicate; - /// ---------------------------------------------------------------------------- /// TODO: Forward declarations below are not added yet. /// ---------------------------------------------------------------------------- diff --git a/src/iceberg/util/projection_util_internal.h b/src/iceberg/util/projection_util_internal.h index 08bf76512..6a2debb28 100644 --- a/src/iceberg/util/projection_util_internal.h +++ b/src/iceberg/util/projection_util_internal.h @@ -26,7 +26,6 @@ #include #include -#include "iceberg/expression/expressions.h" #include "iceberg/expression/literal.h" #include "iceberg/expression/predicate.h" #include "iceberg/result.h" @@ -49,36 +48,61 @@ class ProjectionUtil { ICEBERG_ASSIGN_OR_RAISE(auto transformed_lit, func->Transform(lit)); transformed.push_back(std::move(transformed_lit)); } - return Expressions::Predicate(predicate->op(), std::string(name), - std::move(transformed)); + ICEBERG_ASSIGN_OR_RAISE(auto ref, NamedReference::Make(std::string(name))); + ICEBERG_ASSIGN_OR_RAISE(auto pred, + UnboundPredicateImpl::Make( + predicate->op(), std::move(ref), std::move(transformed))); + return pred; } - static Result> TruncateArray( - std::string_view name, const std::shared_ptr& predicate, + // General transform for all literal predicates. This is used as a fallback for special + // cases that are not handled by the other transform functions. + static Result> GenericTransform( + std::unique_ptr ref, + const std::shared_ptr& predicate, const std::shared_ptr& func) { - const Literal& boundary = predicate->literal(); - - ICEBERG_ASSIGN_OR_RAISE(auto transformed, func->Transform(boundary)); - + ICEBERG_ASSIGN_OR_RAISE(auto transformed, func->Transform(predicate->literal())); switch (predicate->op()) { case Expression::Operation::kLt: - case Expression::Operation::kLtEq: - return Expressions::Predicate(Expression::Operation::kLtEq, std::string(name), - transformed); + case Expression::Operation::kLtEq: { + ICEBERG_ASSIGN_OR_RAISE(auto pred, UnboundPredicateImpl::Make( + Expression::Operation::kLtEq, + std::move(ref), std::move(transformed))); + return pred; + } case Expression::Operation::kGt: - case Expression::Operation::kGtEq: - return Expressions::Predicate(Expression::Operation::kGtEq, std::string(name), - transformed); - case Expression::Operation::kEq: - return Expressions::Predicate(Expression::Operation::kEq, std::string(name), - transformed); - case Expression::Operation::kStartsWith: - return Expressions::Predicate(Expression::Operation::kStartsWith, - std::string(name), transformed); + case Expression::Operation::kGtEq: { + ICEBERG_ASSIGN_OR_RAISE(auto pred, UnboundPredicateImpl::Make( + Expression::Operation::kGtEq, + std::move(ref), std::move(transformed))); + return pred; + } + case Expression::Operation::kEq: { + ICEBERG_ASSIGN_OR_RAISE(auto pred, UnboundPredicateImpl::Make( + Expression::Operation::kEq, std::move(ref), + std::move(transformed))); + return pred; + } default: return nullptr; } - std::unreachable(); + } + + static Result> TruncateByteArray( + std::string_view name, const std::shared_ptr& predicate, + const std::shared_ptr& func) { + ICEBERG_ASSIGN_OR_RAISE(auto ref, NamedReference::Make(std::string(name))); + switch (predicate->op()) { + case Expression::Operation::kStartsWith: { + ICEBERG_ASSIGN_OR_RAISE(auto transformed, func->Transform(predicate->literal())); + ICEBERG_ASSIGN_OR_RAISE(auto pred, UnboundPredicateImpl::Make( + Expression::Operation::kStartsWith, + std::move(ref), std::move(transformed))); + return pred; + } + default: + return GenericTransform(std::move(ref), predicate, func); + } } template @@ -87,6 +111,7 @@ class ProjectionUtil { std::string_view name, const std::shared_ptr& predicate, const std::shared_ptr& func) { const Literal& literal = predicate->literal(); + ICEBERG_ASSIGN_OR_RAISE(auto ref, NamedReference::Make(std::string(name))); switch (predicate->op()) { case Expression::Operation::kLt: { @@ -94,62 +119,135 @@ class ProjectionUtil { if constexpr (std::is_same_v) { ICEBERG_ASSIGN_OR_RAISE( auto transformed, - func->Transform(literal.type()->type_id() == TypeId::kDate - ? Literal::Date(std::get(literal.value()) - 1) - : Literal::Int(std::get(literal.value()) - 1))); - return Expressions::Predicate(Expression::Operation::kLtEq, std::string(name), - transformed); + func->Transform(Literal::Int(std::get(literal.value()) - 1))); + ICEBERG_ASSIGN_OR_RAISE(auto pred, UnboundPredicateImpl::Make( + Expression::Operation::kLtEq, + std::move(ref), std::move(transformed))); + return pred; } else { ICEBERG_ASSIGN_OR_RAISE( auto transformed, - func->Transform( - literal.type()->type_id() == TypeId::kTimestamp || - literal.type()->type_id() == TypeId::kTimestampTz - ? Literal::Timestamp(std::get(literal.value()) - 1) - : Literal::Long(std::get(literal.value()) - 1))); - return Expressions::Predicate(Expression::Operation::kLtEq, std::string(name), - transformed); + func->Transform(Literal::Long(std::get(literal.value()) - 1))); + ICEBERG_ASSIGN_OR_RAISE(auto pred, UnboundPredicateImpl::Make( + Expression::Operation::kLtEq, + std::move(ref), std::move(transformed))); + return pred; } } - case Expression::Operation::kLtEq: { - ICEBERG_ASSIGN_OR_RAISE(auto transformed, func->Transform(literal)); - return Expressions::Predicate(Expression::Operation::kLtEq, std::string(name), - transformed); - } case Expression::Operation::kGt: { // adjust closed and then transform gtEq if constexpr (std::is_same_v) { ICEBERG_ASSIGN_OR_RAISE( auto transformed, - func->Transform(literal.type()->type_id() == TypeId::kDate - ? Literal::Date(std::get(literal.value()) + 1) - : Literal::Int(std::get(literal.value()) + 1))); - return Expressions::Predicate(Expression::Operation::kGtEq, std::string(name), - transformed); + func->Transform(Literal::Int(std::get(literal.value()) + 1))); + ICEBERG_ASSIGN_OR_RAISE(auto pred, UnboundPredicateImpl::Make( + Expression::Operation::kGtEq, + std::move(ref), std::move(transformed))); + return pred; } else { ICEBERG_ASSIGN_OR_RAISE( auto transformed, - func->Transform( - literal.type()->type_id() == TypeId::kTimestamp || - literal.type()->type_id() == TypeId::kTimestampTz - ? Literal::Timestamp(std::get(literal.value()) + 1) - : Literal::Long(std::get(literal.value()) + 1))); - return Expressions::Predicate(Expression::Operation::kGtEq, std::string(name), - transformed); + func->Transform(Literal::Long(std::get(literal.value()) + 1))); + ICEBERG_ASSIGN_OR_RAISE(auto pred, UnboundPredicateImpl::Make( + Expression::Operation::kGtEq, + std::move(ref), std::move(transformed))); + return pred; } } - case Expression::Operation::kGtEq: { - ICEBERG_ASSIGN_OR_RAISE(auto transformed, func->Transform(literal)); - return Expressions::Predicate(Expression::Operation::kGtEq, std::string(name), - transformed); + default: + return GenericTransform(std::move(ref), predicate, func); + } + } + + static Result> TransformTemporal( + std::string_view name, const std::shared_ptr& predicate, + const std::shared_ptr& func) { + const Literal& literal = predicate->literal(); + ICEBERG_ASSIGN_OR_RAISE(auto ref, NamedReference::Make(std::string(name))); + + switch (func->source_type()->type_id()) { + case TypeId::kDate: { + switch (predicate->op()) { + case Expression::Operation::kLt: { + ICEBERG_ASSIGN_OR_RAISE( + auto transformed, + func->Transform(Literal::Date(std::get(literal.value()) - 1))); + ICEBERG_ASSIGN_OR_RAISE(auto pred, + UnboundPredicateImpl::Make( + Expression::Operation::kLtEq, std::move(ref), + std::move(transformed))); + return pred; + } + case Expression::Operation::kGt: { + ICEBERG_ASSIGN_OR_RAISE( + auto transformed, + func->Transform(Literal::Date(std::get(literal.value()) + 1))); + ICEBERG_ASSIGN_OR_RAISE(auto pred, + UnboundPredicateImpl::Make( + Expression::Operation::kGtEq, std::move(ref), + std::move(transformed))); + return pred; + } + default: + return GenericTransform(std::move(ref), predicate, func); + } } - case Expression::Operation::kEq: { - ICEBERG_ASSIGN_OR_RAISE(auto transformed, func->Transform(literal)); - return Expressions::Predicate(Expression::Operation::kEq, std::string(name), - transformed); + case TypeId::kTimestamp: { + switch (predicate->op()) { + case Expression::Operation::kLt: { + ICEBERG_ASSIGN_OR_RAISE(auto transformed, + func->Transform(Literal::Timestamp( + std::get(literal.value()) - 1))); + ICEBERG_ASSIGN_OR_RAISE(auto pred, + UnboundPredicateImpl::Make( + Expression::Operation::kLtEq, std::move(ref), + std::move(transformed))); + return pred; + } + case Expression::Operation::kGt: { + ICEBERG_ASSIGN_OR_RAISE(auto transformed, + func->Transform(Literal::Timestamp( + std::get(literal.value()) + 1))); + ICEBERG_ASSIGN_OR_RAISE(auto pred, + UnboundPredicateImpl::Make( + Expression::Operation::kGtEq, std::move(ref), + std::move(transformed))); + return pred; + } + default: + return GenericTransform(std::move(ref), predicate, func); + } + } + case TypeId::kTimestampTz: { + switch (predicate->op()) { + case Expression::Operation::kLt: { + ICEBERG_ASSIGN_OR_RAISE(auto transformed, + func->Transform(Literal::TimestampTz( + std::get(literal.value()) - 1))); + ICEBERG_ASSIGN_OR_RAISE(auto pred, + UnboundPredicateImpl::Make( + Expression::Operation::kLtEq, std::move(ref), + std::move(transformed))); + return pred; + } + case Expression::Operation::kGt: { + ICEBERG_ASSIGN_OR_RAISE(auto transformed, + func->Transform(Literal::TimestampTz( + std::get(literal.value()) + 1))); + ICEBERG_ASSIGN_OR_RAISE(auto pred, + UnboundPredicateImpl::Make( + Expression::Operation::kGtEq, std::move(ref), + std::move(transformed))); + return pred; + } + default: + return GenericTransform(std::move(ref), predicate, func); + } + std::unreachable(); } default: - return nullptr; + return NotSupported("{} is not a valid input type for temporal transform", + func->source_type()->ToString()); } } @@ -157,6 +255,7 @@ class ProjectionUtil { std::string_view name, const std::shared_ptr& predicate, const std::shared_ptr& func) { const Literal& boundary = predicate->literal(); + ICEBERG_ASSIGN_OR_RAISE(auto ref, NamedReference::Make(std::string(name))); // For boundary adjustments, extract type info once auto make_adjusted_literal = [&boundary](int adjustment) { @@ -170,33 +269,22 @@ class ProjectionUtil { // adjust closed and then transform ltEq ICEBERG_ASSIGN_OR_RAISE(auto transformed, func->Transform(make_adjusted_literal(-1))); - return Expressions::Predicate(Expression::Operation::kLtEq, std::string(name), - transformed); - } - case Expression::Operation::kLtEq: { - ICEBERG_ASSIGN_OR_RAISE(auto transformed, func->Transform(boundary)); - return Expressions::Predicate(Expression::Operation::kLtEq, std::string(name), - transformed); + ICEBERG_ASSIGN_OR_RAISE(auto pred, UnboundPredicateImpl::Make( + Expression::Operation::kLtEq, + std::move(ref), std::move(transformed))); + return pred; } case Expression::Operation::kGt: { // adjust closed and then transform gtEq ICEBERG_ASSIGN_OR_RAISE(auto transformed, func->Transform(make_adjusted_literal(1))); - return Expressions::Predicate(Expression::Operation::kGtEq, std::string(name), - transformed); - } - case Expression::Operation::kGtEq: { - ICEBERG_ASSIGN_OR_RAISE(auto transformed, func->Transform(boundary)); - return Expressions::Predicate(Expression::Operation::kGtEq, std::string(name), - transformed); - } - case Expression::Operation::kEq: { - ICEBERG_ASSIGN_OR_RAISE(auto transformed, func->Transform(boundary)); - return Expressions::Predicate(Expression::Operation::kEq, std::string(name), - transformed); + ICEBERG_ASSIGN_OR_RAISE(auto pred, UnboundPredicateImpl::Make( + Expression::Operation::kGtEq, + std::move(ref), std::move(transformed))); + return pred; } default: - return nullptr; + return GenericTransform(std::move(ref), predicate, func); } } @@ -206,32 +294,50 @@ class ProjectionUtil { const auto op = predicate->op(); if (op != Expression::Operation::kStartsWith && op != Expression::Operation::kNotStartsWith) { - return TruncateArray(name, predicate, func); + return TruncateByteArray(name, predicate, func); } const auto& truncate_transform = internal::checked_pointer_cast(func); const auto& str_value = std::get(predicate->literal().value()); const auto width = truncate_transform->width(); + ICEBERG_ASSIGN_OR_RAISE(auto ref, NamedReference::Make(std::string(name))); if (str_value.length() < width) { - return Expressions::Predicate(op, std::string(name), predicate->literal()); + ICEBERG_ASSIGN_OR_RAISE(auto pred, UnboundPredicateImpl::Make( + op, std::move(ref), predicate->literal())); + return pred; } if (str_value.length() == width) { - return op == Expression::Operation::kStartsWith - ? Expressions::Equal(std::string(name), predicate->literal()) - : Expressions::NotEqual(std::string(name), predicate->literal()); + if (op == Expression::Operation::kStartsWith) { + ICEBERG_ASSIGN_OR_RAISE(auto pred, UnboundPredicateImpl::Make( + Expression::Operation::kEq, std::move(ref), + predicate->literal())); + return pred; + } else { + ICEBERG_ASSIGN_OR_RAISE(auto pred, UnboundPredicateImpl::Make( + Expression::Operation::kNotEq, + std::move(ref), predicate->literal())); + return pred; + } } - return op == Expression::Operation::kStartsWith ? TruncateArray(name, predicate, func) - : nullptr; + if (op == Expression::Operation::kStartsWith) { + ICEBERG_ASSIGN_OR_RAISE(auto transformed, func->Transform(predicate->literal())); + ICEBERG_ASSIGN_OR_RAISE(auto pred, UnboundPredicateImpl::Make( + Expression::Operation::kStartsWith, + std::move(ref), std::move(transformed))); + return pred; + } + + return nullptr; } // Fixes an inclusive projection to account for incorrectly transformed values. // align with Java implementation: // https://github.com/apache/iceberg/blob/main/api/src/main/java/org/apache/iceberg/transforms/ProjectionUtil.java#L275 - static std::shared_ptr FixInclusiveTimeProjection( + static Result> FixInclusiveTimeProjection( const std::shared_ptr>& projected) { if (projected == nullptr) { return nullptr; @@ -247,10 +353,14 @@ class ProjectionUtil { "Expected int32_t"); auto value = std::get(literal.value()); if (value < 0) { - return Expressions::LessThan(projected->term(), Literal::Int(value + 1)); + ICEBERG_ASSIGN_OR_RAISE( + auto pred, UnboundPredicateImpl::Make( + Expression::Operation::kLt, std::move(projected->term()), + Literal::Int(value + 1))); + return pred; } - return projected; + return std::move(projected); } case Expression::Operation::kLtEq: { @@ -260,9 +370,13 @@ class ProjectionUtil { "Expected int32_t"); auto value = std::get(literal.value()); if (value < 0) { - return Expressions::LessThanOrEqual(projected->term(), Literal::Int(value + 1)); + ICEBERG_ASSIGN_OR_RAISE( + auto pred, UnboundPredicateImpl::Make( + Expression::Operation::kLtEq, std::move(projected->term()), + Literal::Int(value + 1))); + return pred; } - return projected; + return std::move(projected); } case Expression::Operation::kGt: @@ -279,9 +393,13 @@ class ProjectionUtil { if (value < 0) { // match either the incorrect value (projectedValue + 1) or the correct value // (projectedValue) - return Expressions::In(projected->term(), {literal, Literal::Int(value + 1)}); + ICEBERG_ASSIGN_OR_RAISE( + auto pred, UnboundPredicateImpl::Make( + Expression::Operation::kIn, std::move(projected->term()), + {literal, Literal::Int(value + 1)})); + return pred; } - return projected; + return std::move(projected); } case Expression::Operation::kIn: { @@ -308,9 +426,13 @@ class ProjectionUtil { std::views::transform(value_set, [](int32_t value) { return Literal::Int(value); }) | std::ranges::to(); - return Expressions::In(projected->term(), std::move(values)); + ICEBERG_ASSIGN_OR_RAISE( + auto pred, UnboundPredicateImpl::Make( + Expression::Operation::kIn, std::move(projected->term()), + std::move(values))); + return pred; } - return projected; + return std::move(projected); } case Expression::Operation::kNotIn: @@ -319,28 +441,37 @@ class ProjectionUtil { return nullptr; default: - return projected; + return std::move(projected); } } public: static Result> IdentityProject( std::string_view name, const std::shared_ptr& predicate) { + ICEBERG_ASSIGN_OR_RAISE(auto ref, NamedReference::Make(std::string(name))); switch (predicate->kind()) { - case BoundPredicate::Kind::kUnary: - return Expressions::Predicate(predicate->op(), std::string(name)); + case BoundPredicate::Kind::kUnary: { + ICEBERG_ASSIGN_OR_RAISE(auto pred, UnboundPredicateImpl::Make( + predicate->op(), std::move(ref))); + return pred; + } case BoundPredicate::Kind::kLiteral: { const auto& literalPredicate = internal::checked_pointer_cast(predicate); - return Expressions::Predicate(predicate->op(), std::string(name), - literalPredicate->literal()); + ICEBERG_ASSIGN_OR_RAISE( + auto pred, UnboundPredicateImpl::Make( + predicate->op(), std::move(ref), literalPredicate->literal())); + return pred; } case BoundPredicate::Kind::kSet: { const auto& setPredicate = internal::checked_pointer_cast(predicate); - return Expressions::Predicate( - predicate->op(), std::string(name), - {setPredicate->literal_set().begin(), setPredicate->literal_set().end()}); + ICEBERG_ASSIGN_OR_RAISE( + auto pred, UnboundPredicateImpl::Make( + predicate->op(), std::move(ref), + std::vector(setPredicate->literal_set().begin(), + setPredicate->literal_set().end()))); + return pred; } } @@ -350,16 +481,23 @@ class ProjectionUtil { static Result> BucketProject( std::string_view name, const std::shared_ptr& predicate, const std::shared_ptr& func) { + ICEBERG_ASSIGN_OR_RAISE(auto ref, NamedReference::Make(std::string(name))); switch (predicate->kind()) { - case BoundPredicate::Kind::kUnary: - return Expressions::Predicate(predicate->op(), std::string(name)); + case BoundPredicate::Kind::kUnary: { + ICEBERG_ASSIGN_OR_RAISE(auto pred, UnboundPredicateImpl::Make( + predicate->op(), std::move(ref))); + return pred; + } case BoundPredicate::Kind::kLiteral: { if (predicate->op() == Expression::Operation::kEq) { const auto& literalPredicate = internal::checked_pointer_cast(predicate); ICEBERG_ASSIGN_OR_RAISE(auto transformed, func->Transform(literalPredicate->literal())); - return Expressions::Predicate(predicate->op(), std::string(name), transformed); + ICEBERG_ASSIGN_OR_RAISE( + auto pred, UnboundPredicateImpl::Make( + predicate->op(), std::move(ref), std::move(transformed))); + return pred; } break; } @@ -368,7 +506,8 @@ class ProjectionUtil { if (predicate->op() == Expression::Operation::kIn) { const auto& setPredicate = internal::checked_pointer_cast(predicate); - return TransformSet(name, setPredicate, func); + ICEBERG_ASSIGN_OR_RAISE(auto pred, TransformSet(name, setPredicate, func)); + return pred; } break; } @@ -383,9 +522,12 @@ class ProjectionUtil { static Result> TruncateProject( std::string_view name, const std::shared_ptr& predicate, const std::shared_ptr& func) { + ICEBERG_ASSIGN_OR_RAISE(auto ref, NamedReference::Make(std::string(name))); // Handle unary predicates uniformly for all types if (predicate->kind() == BoundPredicate::Kind::kUnary) { - return Expressions::Predicate(predicate->op(), std::string(name)); + ICEBERG_ASSIGN_OR_RAISE(auto pred, UnboundPredicateImpl::Make( + predicate->op(), std::move(ref))); + return pred; } // Handle set predicates (kIn) uniformly for all types @@ -412,7 +554,7 @@ class ProjectionUtil { case TypeId::kString: return TruncateStringLiteral(name, literalPredicate, func); case TypeId::kBinary: - return TruncateArray(name, literalPredicate, func); + return TruncateByteArray(name, literalPredicate, func); default: return NotSupported("{} is not a valid input type for truncate transform", func->source_type()->ToString()); @@ -422,8 +564,11 @@ class ProjectionUtil { static Result> TemporalProject( std::string_view name, const std::shared_ptr& predicate, const std::shared_ptr& func) { + ICEBERG_ASSIGN_OR_RAISE(auto ref, NamedReference::Make(std::string(name))); if (predicate->kind() == BoundPredicate::Kind::kUnary) { - return Expressions::Predicate(predicate->op(), std::string(name)); + ICEBERG_ASSIGN_OR_RAISE(auto pred, UnboundPredicateImpl::Make( + predicate->op(), std::move(ref))); + return pred; } if (predicate->kind() == BoundPredicate::Kind::kSet) { @@ -432,9 +577,12 @@ class ProjectionUtil { internal::checked_pointer_cast(predicate); ICEBERG_ASSIGN_OR_RAISE(auto projected, TransformSet(name, setPredicate, func)); if (func->transform_type() != TransformType::kDay) { - return FixInclusiveTimeProjection( - internal::checked_pointer_cast>( - projected)); + ICEBERG_ASSIGN_OR_RAISE( + auto fixed, + FixInclusiveTimeProjection( + internal::checked_pointer_cast>( + projected))); + return fixed; } return projected; } @@ -444,52 +592,45 @@ class ProjectionUtil { const auto& literalPredicate = internal::checked_pointer_cast(predicate); - switch (func->source_type()->type_id()) { - case TypeId::kDate: { - ICEBERG_ASSIGN_OR_RAISE(auto projected, - TruncateInteger(name, literalPredicate, func)); - if (func->transform_type() != TransformType::kDay) { - return FixInclusiveTimeProjection( - internal::checked_pointer_cast>( - projected)); - } - return projected; - } - case TypeId::kTimestamp: - case TypeId::kTimestampTz: { - ICEBERG_ASSIGN_OR_RAISE(auto projected, - TruncateInteger(name, literalPredicate, func)); - if (func->transform_type() != TransformType::kDay) { - return FixInclusiveTimeProjection( + ICEBERG_ASSIGN_OR_RAISE(auto projected, + TransformTemporal(name, literalPredicate, func)); + if (func->transform_type() != TransformType::kDay) { + ICEBERG_ASSIGN_OR_RAISE( + auto fixed, + FixInclusiveTimeProjection( internal::checked_pointer_cast>( - projected)); - } - return projected; - } - - default: - return NotSupported("{} is not a valid input type for temporal transform", - func->source_type()->ToString()); + projected))); + return fixed; } + return projected; } static Result> RemoveTransform( std::string_view name, const std::shared_ptr& predicate) { + ICEBERG_ASSIGN_OR_RAISE(auto ref, NamedReference::Make(std::string(name))); switch (predicate->kind()) { - case BoundPredicate::Kind::kUnary: - return Expressions::Predicate(predicate->op(), std::string(name)); + case BoundPredicate::Kind::kUnary: { + ICEBERG_ASSIGN_OR_RAISE(auto pred, UnboundPredicateImpl::Make( + predicate->op(), std::move(ref))); + return pred; + } case BoundPredicate::Kind::kLiteral: { const auto& literalPredicate = internal::checked_pointer_cast(predicate); - return Expressions::Predicate(predicate->op(), std::string(name), - literalPredicate->literal()); + ICEBERG_ASSIGN_OR_RAISE( + auto pred, UnboundPredicateImpl::Make( + predicate->op(), std::move(ref), literalPredicate->literal())); + return pred; } case BoundPredicate::Kind::kSet: { const auto& setPredicate = internal::checked_pointer_cast(predicate); - return Expressions::Predicate( - predicate->op(), std::string(name), - {setPredicate->literal_set().begin(), setPredicate->literal_set().end()}); + ICEBERG_ASSIGN_OR_RAISE( + auto pred, UnboundPredicateImpl::Make( + predicate->op(), std::move(ref), + std::vector(setPredicate->literal_set().begin(), + setPredicate->literal_set().end()))); + return pred; } } std::unreachable(); From da483bdb6ce22a11bc5aae10720dd0f0b73b8c01 Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Mon, 1 Dec 2025 22:07:42 +0800 Subject: [PATCH 4/7] fix: use std::unique_ptr --- src/iceberg/test/transform_test.cc | 25 +++++++++------ src/iceberg/transform.cc | 2 +- src/iceberg/transform.h | 4 +-- src/iceberg/util/projection_util_internal.h | 34 ++++++++++----------- 4 files changed, 36 insertions(+), 29 deletions(-) diff --git a/src/iceberg/test/transform_test.cc b/src/iceberg/test/transform_test.cc index a9d1ec261..b957f370c 100644 --- a/src/iceberg/test/transform_test.cc +++ b/src/iceberg/test/transform_test.cc @@ -964,7 +964,8 @@ TEST_F(TransformProjectTest, IdentityProjectEquality) { EXPECT_EQ(projected->op(), Expression::Operation::kEq); auto unbound_projected = - internal::checked_pointer_cast>(projected); + internal::checked_pointer_cast>( + std::move(projected)); ASSERT_NE(unbound_projected, nullptr); EXPECT_EQ(unbound_projected->op(), Expression::Operation::kEq); EXPECT_EQ(unbound_projected->literals().size(), 1); @@ -1028,7 +1029,8 @@ TEST_F(TransformProjectTest, IdentityProjectSet) { ASSERT_NE(projected_in, nullptr); EXPECT_EQ(projected_in->op(), Expression::Operation::kIn); auto unbound_projected = - internal::checked_pointer_cast>(projected_in); + internal::checked_pointer_cast>( + std::move(projected_in)); ASSERT_NE(unbound_projected, nullptr); EXPECT_EQ(unbound_projected->op(), Expression::Operation::kIn); EXPECT_EQ(unbound_projected->literals().size(), 3); @@ -1054,7 +1056,8 @@ TEST_F(TransformProjectTest, BucketProjectEquality) { EXPECT_EQ(projected->op(), Expression::Operation::kEq); auto unbound_projected = - internal::checked_pointer_cast>(projected); + internal::checked_pointer_cast>( + std::move(projected)); ASSERT_NE(unbound_projected, nullptr); EXPECT_EQ(unbound_projected->op(), Expression::Operation::kEq); EXPECT_EQ(unbound_projected->literals().size(), 1); @@ -1084,7 +1087,8 @@ TEST_F(TransformProjectTest, BucketProjectWithMatchingTransformedChild) { ASSERT_NE(projected, nullptr); EXPECT_EQ(projected->op(), Expression::Operation::kEq); auto unbound_projected = - internal::checked_pointer_cast>(projected); + internal::checked_pointer_cast>( + std::move(projected)); ASSERT_NE(unbound_projected, nullptr); EXPECT_EQ(unbound_projected->op(), Expression::Operation::kEq); EXPECT_EQ(unbound_projected->literals().size(), 1); @@ -1152,7 +1156,8 @@ TEST_F(TransformProjectTest, TruncateProjectIntEquality) { EXPECT_EQ(projected->op(), Expression::Operation::kEq); auto unbound_projected = - internal::checked_pointer_cast>(projected); + internal::checked_pointer_cast>( + std::move(projected)); ASSERT_NE(unbound_projected, nullptr); EXPECT_EQ(unbound_projected->op(), Expression::Operation::kEq); EXPECT_EQ(unbound_projected->literals().size(), 1); @@ -1189,7 +1194,8 @@ TEST_F(TransformProjectTest, TruncateProjectIntGreaterThan) { EXPECT_EQ(projected->op(), Expression::Operation::kGtEq); auto unbound_projected = - internal::checked_pointer_cast>(projected); + internal::checked_pointer_cast>( + std::move(projected)); ASSERT_NE(unbound_projected, nullptr); EXPECT_EQ(unbound_projected->op(), Expression::Operation::kGtEq); EXPECT_EQ(unbound_projected->literals().size(), 1); @@ -1210,7 +1216,8 @@ TEST_F(TransformProjectTest, TruncateProjectStringEquality) { EXPECT_EQ(projected->op(), Expression::Operation::kEq); auto unbound_projected = - internal::checked_pointer_cast>(projected); + internal::checked_pointer_cast>( + std::move(projected)); ASSERT_NE(unbound_projected, nullptr); EXPECT_EQ(unbound_projected->op(), Expression::Operation::kEq); EXPECT_EQ(unbound_projected->literals().size(), 1); @@ -1235,7 +1242,7 @@ TEST_F(TransformProjectTest, TruncateProjectStringStartsWith) { auto unbound_projected_short = internal::checked_pointer_cast>( - projected_short); + std::move(projected_short)); ASSERT_NE(unbound_projected_short, nullptr); EXPECT_EQ(unbound_projected_short->op(), Expression::Operation::kStartsWith); EXPECT_EQ(unbound_projected_short->literals().size(), 1); @@ -1256,7 +1263,7 @@ TEST_F(TransformProjectTest, TruncateProjectStringStartsWith) { auto unbound_projected_equal = internal::checked_pointer_cast>( - projected_equal); + std::move(projected_equal)); ASSERT_NE(unbound_projected_equal, nullptr); EXPECT_EQ(unbound_projected_equal->op(), Expression::Operation::kEq); EXPECT_EQ(unbound_projected_equal->literals().size(), 1); diff --git a/src/iceberg/transform.cc b/src/iceberg/transform.cc index 857cc7a50..f8d2f0655 100644 --- a/src/iceberg/transform.cc +++ b/src/iceberg/transform.cc @@ -246,7 +246,7 @@ bool Transform::SatisfiesOrderOf(const Transform& other) const { std::unreachable(); } -Result> Transform::Project( +Result> Transform::Project( std::string_view name, const std::shared_ptr& predicate) { switch (transform_type_) { case TransformType::kIdentity: diff --git a/src/iceberg/transform.h b/src/iceberg/transform.h index a82a33934..3c83825b2 100644 --- a/src/iceberg/transform.h +++ b/src/iceberg/transform.h @@ -177,9 +177,9 @@ class ICEBERG_EXPORT Transform : public util::Formattable { /// Projected(transform(value)) is true. /// \param name The name of the partition column. /// \param predicate The predicate to project. - /// \return A Result containing either a shared pointer to the projected predicate or an + /// \return A Result containing either a unique pointer to the projected predicate or an /// Error if the projection fails. - Result> Project( + Result> Project( std::string_view name, const std::shared_ptr& predicate); /// \brief Returns a string representation of this transform (e.g., "bucket[16]"). diff --git a/src/iceberg/util/projection_util_internal.h b/src/iceberg/util/projection_util_internal.h index 6a2debb28..91ffb7b92 100644 --- a/src/iceberg/util/projection_util_internal.h +++ b/src/iceberg/util/projection_util_internal.h @@ -39,7 +39,7 @@ namespace iceberg { class ProjectionUtil { private: - static Result> TransformSet( + static Result> TransformSet( std::string_view name, const std::shared_ptr& predicate, const std::shared_ptr& func) { std::vector transformed; @@ -57,7 +57,7 @@ class ProjectionUtil { // General transform for all literal predicates. This is used as a fallback for special // cases that are not handled by the other transform functions. - static Result> GenericTransform( + static Result> GenericTransform( std::unique_ptr ref, const std::shared_ptr& predicate, const std::shared_ptr& func) { @@ -88,7 +88,7 @@ class ProjectionUtil { } } - static Result> TruncateByteArray( + static Result> TruncateByteArray( std::string_view name, const std::shared_ptr& predicate, const std::shared_ptr& func) { ICEBERG_ASSIGN_OR_RAISE(auto ref, NamedReference::Make(std::string(name))); @@ -107,7 +107,7 @@ class ProjectionUtil { template requires std::is_same_v || std::is_same_v - static Result> TruncateInteger( + static Result> TruncateInteger( std::string_view name, const std::shared_ptr& predicate, const std::shared_ptr& func) { const Literal& literal = predicate->literal(); @@ -159,7 +159,7 @@ class ProjectionUtil { } } - static Result> TransformTemporal( + static Result> TransformTemporal( std::string_view name, const std::shared_ptr& predicate, const std::shared_ptr& func) { const Literal& literal = predicate->literal(); @@ -251,7 +251,7 @@ class ProjectionUtil { } } - static Result> TruncateDecimal( + static Result> TruncateDecimal( std::string_view name, const std::shared_ptr& predicate, const std::shared_ptr& func) { const Literal& boundary = predicate->literal(); @@ -288,7 +288,7 @@ class ProjectionUtil { } } - static Result> TruncateStringLiteral( + static Result> TruncateStringLiteral( std::string_view name, const std::shared_ptr& predicate, const std::shared_ptr& func) { const auto op = predicate->op(); @@ -337,8 +337,8 @@ class ProjectionUtil { // Fixes an inclusive projection to account for incorrectly transformed values. // align with Java implementation: // https://github.com/apache/iceberg/blob/main/api/src/main/java/org/apache/iceberg/transforms/ProjectionUtil.java#L275 - static Result> FixInclusiveTimeProjection( - const std::shared_ptr>& projected) { + static Result> FixInclusiveTimeProjection( + std::unique_ptr> projected) { if (projected == nullptr) { return nullptr; } @@ -382,7 +382,7 @@ class ProjectionUtil { case Expression::Operation::kGt: case Expression::Operation::kGtEq: // incorrect projected values are already greater than the bound for GT, GT_EQ - return projected; + return std::move(projected); case Expression::Operation::kEq: { ICEBERG_DCHECK(!projected->literals().empty(), "Expected at least one literal"); @@ -446,7 +446,7 @@ class ProjectionUtil { } public: - static Result> IdentityProject( + static Result> IdentityProject( std::string_view name, const std::shared_ptr& predicate) { ICEBERG_ASSIGN_OR_RAISE(auto ref, NamedReference::Make(std::string(name))); switch (predicate->kind()) { @@ -478,7 +478,7 @@ class ProjectionUtil { std::unreachable(); } - static Result> BucketProject( + static Result> BucketProject( std::string_view name, const std::shared_ptr& predicate, const std::shared_ptr& func) { ICEBERG_ASSIGN_OR_RAISE(auto ref, NamedReference::Make(std::string(name))); @@ -519,7 +519,7 @@ class ProjectionUtil { return nullptr; } - static Result> TruncateProject( + static Result> TruncateProject( std::string_view name, const std::shared_ptr& predicate, const std::shared_ptr& func) { ICEBERG_ASSIGN_OR_RAISE(auto ref, NamedReference::Make(std::string(name))); @@ -561,7 +561,7 @@ class ProjectionUtil { } } - static Result> TemporalProject( + static Result> TemporalProject( std::string_view name, const std::shared_ptr& predicate, const std::shared_ptr& func) { ICEBERG_ASSIGN_OR_RAISE(auto ref, NamedReference::Make(std::string(name))); @@ -581,7 +581,7 @@ class ProjectionUtil { auto fixed, FixInclusiveTimeProjection( internal::checked_pointer_cast>( - projected))); + std::move(projected)))); return fixed; } return projected; @@ -599,13 +599,13 @@ class ProjectionUtil { auto fixed, FixInclusiveTimeProjection( internal::checked_pointer_cast>( - projected))); + std::move(projected)))); return fixed; } return projected; } - static Result> RemoveTransform( + static Result> RemoveTransform( std::string_view name, const std::shared_ptr& predicate) { ICEBERG_ASSIGN_OR_RAISE(auto ref, NamedReference::Make(std::string(name))); switch (predicate->kind()) { From 7ec8a912db9d346ea39b83593bfb52704942fbd8 Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Tue, 2 Dec 2025 22:54:57 +0800 Subject: [PATCH 5/7] fix: review comments --- src/iceberg/test/transform_test.cc | 284 +++++++++++++++++++ src/iceberg/transform.h | 4 +- src/iceberg/util/projection_util_internal.h | 289 +++++++------------- src/iceberg/util/string_util.h | 11 + 4 files changed, 401 insertions(+), 187 deletions(-) diff --git a/src/iceberg/test/transform_test.cc b/src/iceberg/test/transform_test.cc index b957f370c..f557f7787 100644 --- a/src/iceberg/test/transform_test.cc +++ b/src/iceberg/test/transform_test.cc @@ -1271,6 +1271,263 @@ TEST_F(TransformProjectTest, TruncateProjectStringStartsWith) { "Hello"); } +TEST_F(TransformProjectTest, TruncateProjectStringStartsWithCodePointCountLessThanWidth) { + auto transform = Transform::Truncate(5); + + // Code point count < width (multi-byte UTF-8 characters) + // "😜🧐" has 2 code points, width is 5 + auto unbound_emoji_short = Expressions::StartsWith("value", "😜🧐"); + ICEBERG_ASSIGN_OR_THROW( + auto bound_emoji_short, + unbound_emoji_short->Bind(*string_schema_, /*case_sensitive=*/true)); + auto bound_pred_emoji_short = + std::dynamic_pointer_cast(bound_emoji_short); + ASSERT_NE(bound_pred_emoji_short, nullptr); + + ICEBERG_ASSIGN_OR_THROW(auto projected_emoji_short, + transform->Project("part", bound_pred_emoji_short)); + ASSERT_NE(projected_emoji_short, nullptr); + EXPECT_EQ(projected_emoji_short->op(), Expression::Operation::kStartsWith); + + auto unbound_projected_emoji_short = + internal::checked_pointer_cast>( + std::move(projected_emoji_short)); + ASSERT_NE(unbound_projected_emoji_short, nullptr); + EXPECT_EQ(unbound_projected_emoji_short->op(), Expression::Operation::kStartsWith); + EXPECT_EQ(unbound_projected_emoji_short->literals().size(), 1); + EXPECT_EQ( + std::get(unbound_projected_emoji_short->literals().front().value()), + "😜🧐"); +} + +TEST_F(TransformProjectTest, TruncateProjectStringStartsWithCodePointCountEqualToWidth) { + auto transform = Transform::Truncate(5); + + // Code point count == width (exactly 5 code points) + // "πŸ˜œπŸ§πŸ€”πŸ€ͺπŸ₯³" has exactly 5 code points + auto unbound_emoji_equal = Expressions::StartsWith("value", "πŸ˜œπŸ§πŸ€”πŸ€ͺπŸ₯³"); + ICEBERG_ASSIGN_OR_THROW( + auto bound_emoji_equal, + unbound_emoji_equal->Bind(*string_schema_, /*case_sensitive=*/true)); + auto bound_pred_emoji_equal = + std::dynamic_pointer_cast(bound_emoji_equal); + ASSERT_NE(bound_pred_emoji_equal, nullptr); + + ICEBERG_ASSIGN_OR_THROW(auto projected_emoji_equal, + transform->Project("part", bound_pred_emoji_equal)); + ASSERT_NE(projected_emoji_equal, nullptr); + EXPECT_EQ(projected_emoji_equal->op(), Expression::Operation::kEq); + + auto unbound_projected_emoji_equal = + internal::checked_pointer_cast>( + std::move(projected_emoji_equal)); + ASSERT_NE(unbound_projected_emoji_equal, nullptr); + EXPECT_EQ(unbound_projected_emoji_equal->op(), Expression::Operation::kEq); + EXPECT_EQ(unbound_projected_emoji_equal->literals().size(), 1); + EXPECT_EQ( + std::get(unbound_projected_emoji_equal->literals().front().value()), + "πŸ˜œπŸ§πŸ€”πŸ€ͺπŸ₯³"); +} + +TEST_F(TransformProjectTest, + TruncateProjectStringStartsWithCodePointCountGreaterThanWidth) { + auto transform = Transform::Truncate(5); + + // Code point count > width (truncate to 5 code points) + // "πŸ˜œπŸ§πŸ€”πŸ€ͺπŸ₯³πŸ˜΅β€πŸ’«πŸ˜‚" has 7 code points, should truncate to 5 + auto unbound_emoji_long = + Expressions::StartsWith("value", "πŸ˜œπŸ§πŸ€”πŸ€ͺπŸ₯³πŸ˜΅β€πŸ’«πŸ˜‚"); + ICEBERG_ASSIGN_OR_THROW( + auto bound_emoji_long, + unbound_emoji_long->Bind(*string_schema_, /*case_sensitive=*/true)); + auto bound_pred_emoji_long = + std::dynamic_pointer_cast(bound_emoji_long); + ASSERT_NE(bound_pred_emoji_long, nullptr); + + ICEBERG_ASSIGN_OR_THROW(auto projected_emoji_long, + transform->Project("part", bound_pred_emoji_long)); + ASSERT_NE(projected_emoji_long, nullptr); + EXPECT_EQ(projected_emoji_long->op(), Expression::Operation::kStartsWith); + + auto unbound_projected_emoji_long = + internal::checked_pointer_cast>( + std::move(projected_emoji_long)); + ASSERT_NE(unbound_projected_emoji_long, nullptr); + EXPECT_EQ(unbound_projected_emoji_long->op(), Expression::Operation::kStartsWith); + EXPECT_EQ(unbound_projected_emoji_long->literals().size(), 1); + EXPECT_EQ( + std::get(unbound_projected_emoji_long->literals().front().value()), + "πŸ˜œπŸ§πŸ€”πŸ€ͺπŸ₯³"); +} + +TEST_F(TransformProjectTest, TruncateProjectStringStartsWithMixedAsciiAndMultiByte) { + auto transform = Transform::Truncate(5); + + // Mixed ASCII and multi-byte UTF-8 characters + // "a😜b🧐c" has 5 code points (3 ASCII + 2 emojis) + auto unbound_mixed_equal = Expressions::StartsWith("value", "a😜b🧐c"); + ICEBERG_ASSIGN_OR_THROW( + auto bound_mixed_equal, + unbound_mixed_equal->Bind(*string_schema_, /*case_sensitive=*/true)); + auto bound_pred_mixed_equal = + std::dynamic_pointer_cast(bound_mixed_equal); + ASSERT_NE(bound_pred_mixed_equal, nullptr); + + ICEBERG_ASSIGN_OR_THROW(auto projected_mixed_equal, + transform->Project("part", bound_pred_mixed_equal)); + ASSERT_NE(projected_mixed_equal, nullptr); + EXPECT_EQ(projected_mixed_equal->op(), Expression::Operation::kEq); + + auto unbound_projected_mixed_equal = + internal::checked_pointer_cast>( + std::move(projected_mixed_equal)); + ASSERT_NE(unbound_projected_mixed_equal, nullptr); + EXPECT_EQ(unbound_projected_mixed_equal->op(), Expression::Operation::kEq); + EXPECT_EQ(unbound_projected_mixed_equal->literals().size(), 1); + EXPECT_EQ( + std::get(unbound_projected_mixed_equal->literals().front().value()), + "a😜b🧐c"); +} + +TEST_F(TransformProjectTest, TruncateProjectStringStartsWithChineseCharactersShort) { + auto transform = Transform::Truncate(5); + + // Chinese characters (3-byte UTF-8) + // "δ½ ε₯½δΈ–η•Œ" has 4 code points, width is 5 + auto unbound_chinese_short = Expressions::StartsWith("value", "δ½ ε₯½δΈ–η•Œ"); + ICEBERG_ASSIGN_OR_THROW( + auto bound_chinese_short, + unbound_chinese_short->Bind(*string_schema_, /*case_sensitive=*/true)); + auto bound_pred_chinese_short = + std::dynamic_pointer_cast(bound_chinese_short); + ASSERT_NE(bound_pred_chinese_short, nullptr); + + ICEBERG_ASSIGN_OR_THROW(auto projected_chinese_short, + transform->Project("part", bound_pred_chinese_short)); + ASSERT_NE(projected_chinese_short, nullptr); + EXPECT_EQ(projected_chinese_short->op(), Expression::Operation::kStartsWith); + + auto unbound_projected_chinese_short = + internal::checked_pointer_cast>( + std::move(projected_chinese_short)); + ASSERT_NE(unbound_projected_chinese_short, nullptr); + EXPECT_EQ(unbound_projected_chinese_short->op(), Expression::Operation::kStartsWith); + EXPECT_EQ(unbound_projected_chinese_short->literals().size(), 1); + EXPECT_EQ( + std::get(unbound_projected_chinese_short->literals().front().value()), + "δ½ ε₯½δΈ–η•Œ"); +} + +TEST_F(TransformProjectTest, TruncateProjectStringStartsWithChineseCharactersEqualWidth) { + auto transform = Transform::Truncate(5); + + // Chinese characters exactly matching width + // "δ½ ε₯½δΈ–η•Œε₯½" has exactly 5 code points + auto unbound_chinese_equal = Expressions::StartsWith("value", "δ½ ε₯½δΈ–η•Œε₯½"); + ICEBERG_ASSIGN_OR_THROW( + auto bound_chinese_equal, + unbound_chinese_equal->Bind(*string_schema_, /*case_sensitive=*/true)); + auto bound_pred_chinese_equal = + std::dynamic_pointer_cast(bound_chinese_equal); + ASSERT_NE(bound_pred_chinese_equal, nullptr); + + ICEBERG_ASSIGN_OR_THROW(auto projected_chinese_equal, + transform->Project("part", bound_pred_chinese_equal)); + ASSERT_NE(projected_chinese_equal, nullptr); + EXPECT_EQ(projected_chinese_equal->op(), Expression::Operation::kEq); + + auto unbound_projected_chinese_equal = + internal::checked_pointer_cast>( + std::move(projected_chinese_equal)); + ASSERT_NE(unbound_projected_chinese_equal, nullptr); + EXPECT_EQ(unbound_projected_chinese_equal->op(), Expression::Operation::kEq); + EXPECT_EQ(unbound_projected_chinese_equal->literals().size(), 1); + EXPECT_EQ( + std::get(unbound_projected_chinese_equal->literals().front().value()), + "δ½ ε₯½δΈ–η•Œε₯½"); +} + +TEST_F(TransformProjectTest, + TruncateProjectStringNotStartsWithCodePointCountEqualToWidth) { + auto transform = Transform::Truncate(5); + + // NotStartsWith with code point count == width + // Should convert to NotEq + auto unbound_not_starts_equal = Expressions::NotStartsWith("value", "πŸ˜œπŸ§πŸ€”πŸ€ͺπŸ₯³"); + ICEBERG_ASSIGN_OR_THROW( + auto bound_not_starts_equal, + unbound_not_starts_equal->Bind(*string_schema_, /*case_sensitive=*/true)); + auto bound_pred_not_starts_equal = + std::dynamic_pointer_cast(bound_not_starts_equal); + ASSERT_NE(bound_pred_not_starts_equal, nullptr); + + ICEBERG_ASSIGN_OR_THROW(auto projected_not_starts_equal, + transform->Project("part", bound_pred_not_starts_equal)); + ASSERT_NE(projected_not_starts_equal, nullptr); + EXPECT_EQ(projected_not_starts_equal->op(), Expression::Operation::kNotEq); + + auto unbound_projected_not_starts_equal = + internal::checked_pointer_cast>( + std::move(projected_not_starts_equal)); + ASSERT_NE(unbound_projected_not_starts_equal, nullptr); + EXPECT_EQ(unbound_projected_not_starts_equal->op(), Expression::Operation::kNotEq); + EXPECT_EQ(unbound_projected_not_starts_equal->literals().size(), 1); + EXPECT_EQ(std::get( + unbound_projected_not_starts_equal->literals().front().value()), + "πŸ˜œπŸ§πŸ€”πŸ€ͺπŸ₯³"); +} + +TEST_F(TransformProjectTest, + TruncateProjectStringNotStartsWithCodePointCountLessThanWidth) { + auto transform = Transform::Truncate(5); + + // NotStartsWith with code point count < width + // Should remain NotStartsWith + auto unbound_not_starts_short = Expressions::NotStartsWith("value", "😜🧐"); + ICEBERG_ASSIGN_OR_THROW( + auto bound_not_starts_short, + unbound_not_starts_short->Bind(*string_schema_, /*case_sensitive=*/true)); + auto bound_pred_not_starts_short = + std::dynamic_pointer_cast(bound_not_starts_short); + ASSERT_NE(bound_pred_not_starts_short, nullptr); + + ICEBERG_ASSIGN_OR_THROW(auto projected_not_starts_short, + transform->Project("part", bound_pred_not_starts_short)); + ASSERT_NE(projected_not_starts_short, nullptr); + EXPECT_EQ(projected_not_starts_short->op(), Expression::Operation::kNotStartsWith); + + auto unbound_projected_not_starts_short = + internal::checked_pointer_cast>( + std::move(projected_not_starts_short)); + ASSERT_NE(unbound_projected_not_starts_short, nullptr); + EXPECT_EQ(unbound_projected_not_starts_short->op(), + Expression::Operation::kNotStartsWith); + EXPECT_EQ(unbound_projected_not_starts_short->literals().size(), 1); + EXPECT_EQ(std::get( + unbound_projected_not_starts_short->literals().front().value()), + "😜🧐"); +} + +TEST_F(TransformProjectTest, + TruncateProjectStringNotStartsWithCodePointCountGreaterThanWidth) { + auto transform = Transform::Truncate(5); + + // NotStartsWith with code point count > width + // Should return nullptr (cannot project) + auto unbound_not_starts_long = + Expressions::NotStartsWith("value", "πŸ˜œπŸ§πŸ€”πŸ€ͺπŸ₯³πŸ˜΅β€πŸ’«πŸ˜‚"); + ICEBERG_ASSIGN_OR_THROW( + auto bound_not_starts_long, + unbound_not_starts_long->Bind(*string_schema_, /*case_sensitive=*/true)); + auto bound_pred_not_starts_long = + std::dynamic_pointer_cast(bound_not_starts_long); + ASSERT_NE(bound_pred_not_starts_long, nullptr); + + ICEBERG_ASSIGN_OR_THROW(auto projected_not_starts_long, + transform->Project("part", bound_pred_not_starts_long)); + EXPECT_EQ(projected_not_starts_long, nullptr); +} + TEST_F(TransformProjectTest, YearProjectEquality) { auto transform = Transform::Year(); @@ -1398,4 +1655,31 @@ TEST_F(TransformProjectTest, TemporalProjectInSet) { EXPECT_EQ(projected_in->op(), Expression::Operation::kIn); } +TEST_F(TransformProjectTest, DayTimestampProjectionFix) { + auto transform = Transform::Day(); + + // Predicate: value < 1970-01-01 00:00:00 (0) + // This implies value <= -1 micros. + // day(-1 micros) = -1 day (1969-12-31). + // If we don't fix, we project to day <= -1. + // If we fix (for buggy writers), we project to day <= 0. + auto unbound = Expressions::LessThan("value", Literal::Timestamp(0)); + + ICEBERG_ASSIGN_OR_THROW(auto bound, + unbound->Bind(*timestamp_schema_, /*case_sensitive=*/true)); + auto bound_pred = std::dynamic_pointer_cast(bound); + ASSERT_NE(bound_pred, nullptr); + + ICEBERG_ASSIGN_OR_THROW(auto projected, transform->Project("part", bound_pred)); + ASSERT_NE(projected, nullptr); + + auto unbound_projected = + internal::checked_pointer_cast>( + std::move(projected)); + EXPECT_EQ(unbound_projected->op(), Expression::Operation::kLtEq); + ASSERT_EQ(unbound_projected->literals().size(), 1); + int32_t val = std::get(unbound_projected->literals().front().value()); + EXPECT_EQ(val, 0) << "Expected projected value to be 0 (fix applied), but got " << val; +} + } // namespace iceberg diff --git a/src/iceberg/transform.h b/src/iceberg/transform.h index 3c83825b2..64b850725 100644 --- a/src/iceberg/transform.h +++ b/src/iceberg/transform.h @@ -177,8 +177,8 @@ class ICEBERG_EXPORT Transform : public util::Formattable { /// Projected(transform(value)) is true. /// \param name The name of the partition column. /// \param predicate The predicate to project. - /// \return A Result containing either a unique pointer to the projected predicate or an - /// Error if the projection fails. + /// \return A Result containing either a unique pointer to the projected predicate, + /// nullptr if the projection cannot be performed, or an Error if the projection fails. Result> Project( std::string_view name, const std::shared_ptr& predicate); diff --git a/src/iceberg/util/projection_util_internal.h b/src/iceberg/util/projection_util_internal.h index 91ffb7b92..29e9dc7f9 100644 --- a/src/iceberg/util/projection_util_internal.h +++ b/src/iceberg/util/projection_util_internal.h @@ -34,6 +34,7 @@ #include "iceberg/util/checked_cast.h" #include "iceberg/util/decimal.h" #include "iceberg/util/macros.h" +#include "iceberg/util/string_util.h" namespace iceberg { @@ -49,10 +50,8 @@ class ProjectionUtil { transformed.push_back(std::move(transformed_lit)); } ICEBERG_ASSIGN_OR_RAISE(auto ref, NamedReference::Make(std::string(name))); - ICEBERG_ASSIGN_OR_RAISE(auto pred, - UnboundPredicateImpl::Make( - predicate->op(), std::move(ref), std::move(transformed))); - return pred; + return UnboundPredicateImpl::Make(predicate->op(), std::move(ref), + std::move(transformed)); } // General transform for all literal predicates. This is used as a fallback for special @@ -65,23 +64,17 @@ class ProjectionUtil { switch (predicate->op()) { case Expression::Operation::kLt: case Expression::Operation::kLtEq: { - ICEBERG_ASSIGN_OR_RAISE(auto pred, UnboundPredicateImpl::Make( - Expression::Operation::kLtEq, - std::move(ref), std::move(transformed))); - return pred; + return UnboundPredicateImpl::Make( + Expression::Operation::kLtEq, std::move(ref), std::move(transformed)); } case Expression::Operation::kGt: case Expression::Operation::kGtEq: { - ICEBERG_ASSIGN_OR_RAISE(auto pred, UnboundPredicateImpl::Make( - Expression::Operation::kGtEq, - std::move(ref), std::move(transformed))); - return pred; + return UnboundPredicateImpl::Make( + Expression::Operation::kGtEq, std::move(ref), std::move(transformed)); } case Expression::Operation::kEq: { - ICEBERG_ASSIGN_OR_RAISE(auto pred, UnboundPredicateImpl::Make( - Expression::Operation::kEq, std::move(ref), - std::move(transformed))); - return pred; + return UnboundPredicateImpl::Make( + Expression::Operation::kEq, std::move(ref), std::move(transformed)); } default: return nullptr; @@ -95,10 +88,8 @@ class ProjectionUtil { switch (predicate->op()) { case Expression::Operation::kStartsWith: { ICEBERG_ASSIGN_OR_RAISE(auto transformed, func->Transform(predicate->literal())); - ICEBERG_ASSIGN_OR_RAISE(auto pred, UnboundPredicateImpl::Make( - Expression::Operation::kStartsWith, - std::move(ref), std::move(transformed))); - return pred; + return UnboundPredicateImpl::Make( + Expression::Operation::kStartsWith, std::move(ref), std::move(transformed)); } default: return GenericTransform(std::move(ref), predicate, func); @@ -120,18 +111,14 @@ class ProjectionUtil { ICEBERG_ASSIGN_OR_RAISE( auto transformed, func->Transform(Literal::Int(std::get(literal.value()) - 1))); - ICEBERG_ASSIGN_OR_RAISE(auto pred, UnboundPredicateImpl::Make( - Expression::Operation::kLtEq, - std::move(ref), std::move(transformed))); - return pred; + return UnboundPredicateImpl::Make( + Expression::Operation::kLtEq, std::move(ref), std::move(transformed)); } else { ICEBERG_ASSIGN_OR_RAISE( auto transformed, func->Transform(Literal::Long(std::get(literal.value()) - 1))); - ICEBERG_ASSIGN_OR_RAISE(auto pred, UnboundPredicateImpl::Make( - Expression::Operation::kLtEq, - std::move(ref), std::move(transformed))); - return pred; + return UnboundPredicateImpl::Make( + Expression::Operation::kLtEq, std::move(ref), std::move(transformed)); } } case Expression::Operation::kGt: { @@ -140,18 +127,14 @@ class ProjectionUtil { ICEBERG_ASSIGN_OR_RAISE( auto transformed, func->Transform(Literal::Int(std::get(literal.value()) + 1))); - ICEBERG_ASSIGN_OR_RAISE(auto pred, UnboundPredicateImpl::Make( - Expression::Operation::kGtEq, - std::move(ref), std::move(transformed))); - return pred; + return UnboundPredicateImpl::Make( + Expression::Operation::kGtEq, std::move(ref), std::move(transformed)); } else { ICEBERG_ASSIGN_OR_RAISE( auto transformed, func->Transform(Literal::Long(std::get(literal.value()) + 1))); - ICEBERG_ASSIGN_OR_RAISE(auto pred, UnboundPredicateImpl::Make( - Expression::Operation::kGtEq, - std::move(ref), std::move(transformed))); - return pred; + return UnboundPredicateImpl::Make( + Expression::Operation::kGtEq, std::move(ref), std::move(transformed)); } } default: @@ -172,21 +155,15 @@ class ProjectionUtil { ICEBERG_ASSIGN_OR_RAISE( auto transformed, func->Transform(Literal::Date(std::get(literal.value()) - 1))); - ICEBERG_ASSIGN_OR_RAISE(auto pred, - UnboundPredicateImpl::Make( - Expression::Operation::kLtEq, std::move(ref), - std::move(transformed))); - return pred; + return UnboundPredicateImpl::Make( + Expression::Operation::kLtEq, std::move(ref), std::move(transformed)); } case Expression::Operation::kGt: { ICEBERG_ASSIGN_OR_RAISE( auto transformed, func->Transform(Literal::Date(std::get(literal.value()) + 1))); - ICEBERG_ASSIGN_OR_RAISE(auto pred, - UnboundPredicateImpl::Make( - Expression::Operation::kGtEq, std::move(ref), - std::move(transformed))); - return pred; + return UnboundPredicateImpl::Make( + Expression::Operation::kGtEq, std::move(ref), std::move(transformed)); } default: return GenericTransform(std::move(ref), predicate, func); @@ -198,21 +175,15 @@ class ProjectionUtil { ICEBERG_ASSIGN_OR_RAISE(auto transformed, func->Transform(Literal::Timestamp( std::get(literal.value()) - 1))); - ICEBERG_ASSIGN_OR_RAISE(auto pred, - UnboundPredicateImpl::Make( - Expression::Operation::kLtEq, std::move(ref), - std::move(transformed))); - return pred; + return UnboundPredicateImpl::Make( + Expression::Operation::kLtEq, std::move(ref), std::move(transformed)); } case Expression::Operation::kGt: { ICEBERG_ASSIGN_OR_RAISE(auto transformed, func->Transform(Literal::Timestamp( std::get(literal.value()) + 1))); - ICEBERG_ASSIGN_OR_RAISE(auto pred, - UnboundPredicateImpl::Make( - Expression::Operation::kGtEq, std::move(ref), - std::move(transformed))); - return pred; + return UnboundPredicateImpl::Make( + Expression::Operation::kGtEq, std::move(ref), std::move(transformed)); } default: return GenericTransform(std::move(ref), predicate, func); @@ -224,26 +195,19 @@ class ProjectionUtil { ICEBERG_ASSIGN_OR_RAISE(auto transformed, func->Transform(Literal::TimestampTz( std::get(literal.value()) - 1))); - ICEBERG_ASSIGN_OR_RAISE(auto pred, - UnboundPredicateImpl::Make( - Expression::Operation::kLtEq, std::move(ref), - std::move(transformed))); - return pred; + return UnboundPredicateImpl::Make( + Expression::Operation::kLtEq, std::move(ref), std::move(transformed)); } case Expression::Operation::kGt: { ICEBERG_ASSIGN_OR_RAISE(auto transformed, func->Transform(Literal::TimestampTz( std::get(literal.value()) + 1))); - ICEBERG_ASSIGN_OR_RAISE(auto pred, - UnboundPredicateImpl::Make( - Expression::Operation::kGtEq, std::move(ref), - std::move(transformed))); - return pred; + return UnboundPredicateImpl::Make( + Expression::Operation::kGtEq, std::move(ref), std::move(transformed)); } default: return GenericTransform(std::move(ref), predicate, func); } - std::unreachable(); } default: return NotSupported("{} is not a valid input type for temporal transform", @@ -269,19 +233,15 @@ class ProjectionUtil { // adjust closed and then transform ltEq ICEBERG_ASSIGN_OR_RAISE(auto transformed, func->Transform(make_adjusted_literal(-1))); - ICEBERG_ASSIGN_OR_RAISE(auto pred, UnboundPredicateImpl::Make( - Expression::Operation::kLtEq, - std::move(ref), std::move(transformed))); - return pred; + return UnboundPredicateImpl::Make( + Expression::Operation::kLtEq, std::move(ref), std::move(transformed)); } case Expression::Operation::kGt: { // adjust closed and then transform gtEq ICEBERG_ASSIGN_OR_RAISE(auto transformed, func->Transform(make_adjusted_literal(1))); - ICEBERG_ASSIGN_OR_RAISE(auto pred, UnboundPredicateImpl::Make( - Expression::Operation::kGtEq, - std::move(ref), std::move(transformed))); - return pred; + return UnboundPredicateImpl::Make( + Expression::Operation::kGtEq, std::move(ref), std::move(transformed)); } default: return GenericTransform(std::move(ref), predicate, func); @@ -303,32 +263,25 @@ class ProjectionUtil { const auto width = truncate_transform->width(); ICEBERG_ASSIGN_OR_RAISE(auto ref, NamedReference::Make(std::string(name))); - if (str_value.length() < width) { - ICEBERG_ASSIGN_OR_RAISE(auto pred, UnboundPredicateImpl::Make( - op, std::move(ref), predicate->literal())); - return pred; + if (StringUtils::CodePointCount(str_value) < width) { + return UnboundPredicateImpl::Make(op, std::move(ref), + predicate->literal()); } - if (str_value.length() == width) { + if (StringUtils::CodePointCount(str_value) == width) { if (op == Expression::Operation::kStartsWith) { - ICEBERG_ASSIGN_OR_RAISE(auto pred, UnboundPredicateImpl::Make( - Expression::Operation::kEq, std::move(ref), - predicate->literal())); - return pred; + return UnboundPredicateImpl::Make( + Expression::Operation::kEq, std::move(ref), predicate->literal()); } else { - ICEBERG_ASSIGN_OR_RAISE(auto pred, UnboundPredicateImpl::Make( - Expression::Operation::kNotEq, - std::move(ref), predicate->literal())); - return pred; + return UnboundPredicateImpl::Make( + Expression::Operation::kNotEq, std::move(ref), predicate->literal()); } } if (op == Expression::Operation::kStartsWith) { ICEBERG_ASSIGN_OR_RAISE(auto transformed, func->Transform(predicate->literal())); - ICEBERG_ASSIGN_OR_RAISE(auto pred, UnboundPredicateImpl::Make( - Expression::Operation::kStartsWith, - std::move(ref), std::move(transformed))); - return pred; + return UnboundPredicateImpl::Make( + Expression::Operation::kStartsWith, std::move(ref), std::move(transformed)); } return nullptr; @@ -353,11 +306,9 @@ class ProjectionUtil { "Expected int32_t"); auto value = std::get(literal.value()); if (value < 0) { - ICEBERG_ASSIGN_OR_RAISE( - auto pred, UnboundPredicateImpl::Make( - Expression::Operation::kLt, std::move(projected->term()), - Literal::Int(value + 1))); - return pred; + return UnboundPredicateImpl::Make(Expression::Operation::kLt, + std::move(projected->term()), + Literal::Int(value + 1)); } return std::move(projected); @@ -370,11 +321,9 @@ class ProjectionUtil { "Expected int32_t"); auto value = std::get(literal.value()); if (value < 0) { - ICEBERG_ASSIGN_OR_RAISE( - auto pred, UnboundPredicateImpl::Make( - Expression::Operation::kLtEq, std::move(projected->term()), - Literal::Int(value + 1))); - return pred; + return UnboundPredicateImpl::Make(Expression::Operation::kLtEq, + std::move(projected->term()), + Literal::Int(value + 1)); } return std::move(projected); } @@ -393,11 +342,9 @@ class ProjectionUtil { if (value < 0) { // match either the incorrect value (projectedValue + 1) or the correct value // (projectedValue) - ICEBERG_ASSIGN_OR_RAISE( - auto pred, UnboundPredicateImpl::Make( - Expression::Operation::kIn, std::move(projected->term()), - {literal, Literal::Int(value + 1)})); - return pred; + return UnboundPredicateImpl::Make( + Expression::Operation::kIn, std::move(projected->term()), + {literal, Literal::Int(value + 1)}); } return std::move(projected); } @@ -426,11 +373,9 @@ class ProjectionUtil { std::views::transform(value_set, [](int32_t value) { return Literal::Int(value); }) | std::ranges::to(); - ICEBERG_ASSIGN_OR_RAISE( - auto pred, UnboundPredicateImpl::Make( - Expression::Operation::kIn, std::move(projected->term()), - std::move(values))); - return pred; + return UnboundPredicateImpl::Make(Expression::Operation::kIn, + std::move(projected->term()), + std::move(values)); } return std::move(projected); } @@ -451,30 +396,24 @@ class ProjectionUtil { ICEBERG_ASSIGN_OR_RAISE(auto ref, NamedReference::Make(std::string(name))); switch (predicate->kind()) { case BoundPredicate::Kind::kUnary: { - ICEBERG_ASSIGN_OR_RAISE(auto pred, UnboundPredicateImpl::Make( - predicate->op(), std::move(ref))); - return pred; + return UnboundPredicateImpl::Make(predicate->op(), + std::move(ref)); } case BoundPredicate::Kind::kLiteral: { const auto& literalPredicate = internal::checked_pointer_cast(predicate); - ICEBERG_ASSIGN_OR_RAISE( - auto pred, UnboundPredicateImpl::Make( - predicate->op(), std::move(ref), literalPredicate->literal())); - return pred; + return UnboundPredicateImpl::Make(predicate->op(), std::move(ref), + literalPredicate->literal()); } case BoundPredicate::Kind::kSet: { const auto& setPredicate = internal::checked_pointer_cast(predicate); - ICEBERG_ASSIGN_OR_RAISE( - auto pred, UnboundPredicateImpl::Make( - predicate->op(), std::move(ref), - std::vector(setPredicate->literal_set().begin(), - setPredicate->literal_set().end()))); - return pred; + return UnboundPredicateImpl::Make( + predicate->op(), std::move(ref), + std::vector(setPredicate->literal_set().begin(), + setPredicate->literal_set().end())); } } - std::unreachable(); } @@ -484,9 +423,8 @@ class ProjectionUtil { ICEBERG_ASSIGN_OR_RAISE(auto ref, NamedReference::Make(std::string(name))); switch (predicate->kind()) { case BoundPredicate::Kind::kUnary: { - ICEBERG_ASSIGN_OR_RAISE(auto pred, UnboundPredicateImpl::Make( - predicate->op(), std::move(ref))); - return pred; + return UnboundPredicateImpl::Make(predicate->op(), + std::move(ref)); } case BoundPredicate::Kind::kLiteral: { if (predicate->op() == Expression::Operation::kEq) { @@ -494,10 +432,8 @@ class ProjectionUtil { internal::checked_pointer_cast(predicate); ICEBERG_ASSIGN_OR_RAISE(auto transformed, func->Transform(literalPredicate->literal())); - ICEBERG_ASSIGN_OR_RAISE( - auto pred, UnboundPredicateImpl::Make( - predicate->op(), std::move(ref), std::move(transformed))); - return pred; + return UnboundPredicateImpl::Make( + predicate->op(), std::move(ref), std::move(transformed)); } break; } @@ -506,8 +442,7 @@ class ProjectionUtil { if (predicate->op() == Expression::Operation::kIn) { const auto& setPredicate = internal::checked_pointer_cast(predicate); - ICEBERG_ASSIGN_OR_RAISE(auto pred, TransformSet(name, setPredicate, func)); - return pred; + return TransformSet(name, setPredicate, func); } break; } @@ -525,9 +460,7 @@ class ProjectionUtil { ICEBERG_ASSIGN_OR_RAISE(auto ref, NamedReference::Make(std::string(name))); // Handle unary predicates uniformly for all types if (predicate->kind() == BoundPredicate::Kind::kUnary) { - ICEBERG_ASSIGN_OR_RAISE(auto pred, UnboundPredicateImpl::Make( - predicate->op(), std::move(ref))); - return pred; + return UnboundPredicateImpl::Make(predicate->op(), std::move(ref)); } // Handle set predicates (kIn) uniformly for all types @@ -566,43 +499,34 @@ class ProjectionUtil { const std::shared_ptr& func) { ICEBERG_ASSIGN_OR_RAISE(auto ref, NamedReference::Make(std::string(name))); if (predicate->kind() == BoundPredicate::Kind::kUnary) { - ICEBERG_ASSIGN_OR_RAISE(auto pred, UnboundPredicateImpl::Make( - predicate->op(), std::move(ref))); - return pred; - } - - if (predicate->kind() == BoundPredicate::Kind::kSet) { - if (predicate->op() == Expression::Operation::kIn) { - const auto& setPredicate = - internal::checked_pointer_cast(predicate); - ICEBERG_ASSIGN_OR_RAISE(auto projected, TransformSet(name, setPredicate, func)); - if (func->transform_type() != TransformType::kDay) { - ICEBERG_ASSIGN_OR_RAISE( - auto fixed, - FixInclusiveTimeProjection( - internal::checked_pointer_cast>( - std::move(projected)))); - return fixed; - } - return projected; + return UnboundPredicateImpl::Make(predicate->op(), std::move(ref)); + } else if (predicate->kind() == BoundPredicate::Kind::kLiteral) { + const auto& literalPredicate = + internal::checked_pointer_cast(predicate); + ICEBERG_ASSIGN_OR_RAISE(auto projected, + TransformTemporal(name, literalPredicate, func)); + if (func->transform_type() != TransformType::kDay || + func->source_type()->type_id() != TypeId::kDate) { + return FixInclusiveTimeProjection( + internal::checked_pointer_cast>( + std::move(projected))); } - return nullptr; + return projected; + } else if (predicate->kind() == BoundPredicate::Kind::kSet && + predicate->op() == Expression::Operation::kIn) { + const auto& setPredicate = + internal::checked_pointer_cast(predicate); + ICEBERG_ASSIGN_OR_RAISE(auto projected, TransformSet(name, setPredicate, func)); + if (func->transform_type() != TransformType::kDay || + func->source_type()->type_id() != TypeId::kDate) { + return FixInclusiveTimeProjection( + internal::checked_pointer_cast>( + std::move(projected))); + } + return projected; } - const auto& literalPredicate = - internal::checked_pointer_cast(predicate); - - ICEBERG_ASSIGN_OR_RAISE(auto projected, - TransformTemporal(name, literalPredicate, func)); - if (func->transform_type() != TransformType::kDay) { - ICEBERG_ASSIGN_OR_RAISE( - auto fixed, - FixInclusiveTimeProjection( - internal::checked_pointer_cast>( - std::move(projected)))); - return fixed; - } - return projected; + return nullptr; } static Result> RemoveTransform( @@ -610,27 +534,22 @@ class ProjectionUtil { ICEBERG_ASSIGN_OR_RAISE(auto ref, NamedReference::Make(std::string(name))); switch (predicate->kind()) { case BoundPredicate::Kind::kUnary: { - ICEBERG_ASSIGN_OR_RAISE(auto pred, UnboundPredicateImpl::Make( - predicate->op(), std::move(ref))); - return pred; + return UnboundPredicateImpl::Make(predicate->op(), + std::move(ref)); } case BoundPredicate::Kind::kLiteral: { const auto& literalPredicate = internal::checked_pointer_cast(predicate); - ICEBERG_ASSIGN_OR_RAISE( - auto pred, UnboundPredicateImpl::Make( - predicate->op(), std::move(ref), literalPredicate->literal())); - return pred; + return UnboundPredicateImpl::Make(predicate->op(), std::move(ref), + literalPredicate->literal()); } case BoundPredicate::Kind::kSet: { const auto& setPredicate = internal::checked_pointer_cast(predicate); - ICEBERG_ASSIGN_OR_RAISE( - auto pred, UnboundPredicateImpl::Make( - predicate->op(), std::move(ref), - std::vector(setPredicate->literal_set().begin(), - setPredicate->literal_set().end()))); - return pred; + return UnboundPredicateImpl::Make( + predicate->op(), std::move(ref), + std::vector(setPredicate->literal_set().begin(), + setPredicate->literal_set().end())); } } std::unreachable(); diff --git a/src/iceberg/util/string_util.h b/src/iceberg/util/string_util.h index 8eee314f5..de7780369 100644 --- a/src/iceberg/util/string_util.h +++ b/src/iceberg/util/string_util.h @@ -43,6 +43,17 @@ class ICEBERG_EXPORT StringUtils { return std::ranges::equal( lhs, rhs, [](char lc, char rc) { return std::tolower(lc) == std::tolower(rc); }); } + + /// \brief Count the number of code points in a UTF-8 string. + static size_t CodePointCount(std::string_view str) { + size_t count = 0; + for (char i : str) { + if ((i & 0xC0) != 0x80) { + count++; + } + } + return count; + } }; /// \brief Transparent hash function that supports std::string_view as lookup key From 5b9ddaedf4b0ef6bab5b74039f5e144ee9de168c Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Tue, 2 Dec 2025 22:58:36 +0800 Subject: [PATCH 6/7] fix: change to a permanent link --- src/iceberg/util/projection_util_internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/iceberg/util/projection_util_internal.h b/src/iceberg/util/projection_util_internal.h index 29e9dc7f9..3ce2dbf8f 100644 --- a/src/iceberg/util/projection_util_internal.h +++ b/src/iceberg/util/projection_util_internal.h @@ -289,7 +289,7 @@ class ProjectionUtil { // Fixes an inclusive projection to account for incorrectly transformed values. // align with Java implementation: - // https://github.com/apache/iceberg/blob/main/api/src/main/java/org/apache/iceberg/transforms/ProjectionUtil.java#L275 + // https://github.com/apache/iceberg/blob/1.10.x/api/src/main/java/org/apache/iceberg/transforms/ProjectionUtil.java#L275 static Result> FixInclusiveTimeProjection( std::unique_ptr> projected) { if (projected == nullptr) { From 6456788723c89b5b0de8817286f62619a2a42bc7 Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Wed, 3 Dec 2025 11:05:23 +0800 Subject: [PATCH 7/7] fix: remove a unused statement --- src/iceberg/test/transform_test.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/iceberg/test/transform_test.cc b/src/iceberg/test/transform_test.cc index f557f7787..821edac55 100644 --- a/src/iceberg/test/transform_test.cc +++ b/src/iceberg/test/transform_test.cc @@ -1078,8 +1078,6 @@ TEST_F(TransformProjectTest, BucketProjectWithMatchingTransformedChild) { // The predicate's term should be a transform EXPECT_EQ(bound_pred->term()->kind(), Term::Kind::kTransform); - auto dummy = Expressions::NotEqual(bucket_term, Literal::Int(5)); - // When the transform matches, Project should use RemoveTransform and return the // predicate ICEBERG_ASSIGN_OR_THROW(auto projected,