diff --git a/src/iceberg/expression/predicate.h b/src/iceberg/expression/predicate.h index b7ed21ff8..cdd3d1f52 100644 --- a/src/iceberg/expression/predicate.h +++ b/src/iceberg/expression/predicate.h @@ -27,6 +27,7 @@ #include "iceberg/expression/expression.h" #include "iceberg/expression/literal.h" #include "iceberg/expression/term.h" +#include "iceberg/iceberg_export.h" namespace iceberg { diff --git a/src/iceberg/test/transform_test.cc b/src/iceberg/test/transform_test.cc index 529297b27..821edac55 100644 --- a/src/iceberg/test/transform_test.cc +++ b/src/iceberg/test/transform_test.cc @@ -26,12 +26,17 @@ #include #include +#include "iceberg/expression/expressions.h" #include "iceberg/expression/literal.h" +#include "iceberg/expression/predicate.h" +#include "iceberg/schema.h" #include "iceberg/schema_field.h" #include "iceberg/test/matchers.h" #include "iceberg/test/temporal_test_helper.h" #include "iceberg/type.h" +#include "iceberg/util/checked_cast.h" #include "iceberg/util/formatter.h" // IWYU pragma: keep +#include "iceberg/util/macros.h" namespace iceberg { @@ -914,4 +919,765 @@ TEST(TransformCanTransformTest, CanTransform) { } } +// Test fixture for Transform::Project tests +class TransformProjectTest : public ::testing::Test { + protected: + void SetUp() override { + // Create test schemas for different source types + int_schema_ = std::make_shared( + std::vector{SchemaField::MakeRequired(1, "key", int32()), + SchemaField::MakeOptional(2, "value", int32())}, + /*schema_id=*/0); + long_schema_ = std::make_shared( + std::vector{SchemaField::MakeRequired(1, "value", int64())}, + /*schema_id=*/0); + string_schema_ = std::make_shared( + std::vector{SchemaField::MakeRequired(1, "value", string())}, + /*schema_id=*/0); + date_schema_ = std::make_shared( + std::vector{SchemaField::MakeRequired(1, "value", date())}, + /*schema_id=*/0); + timestamp_schema_ = std::make_shared( + std::vector{SchemaField::MakeRequired(1, "value", timestamp())}, + /*schema_id=*/0); + } + + std::shared_ptr int_schema_; + std::shared_ptr long_schema_; + std::shared_ptr string_schema_; + std::shared_ptr date_schema_; + std::shared_ptr timestamp_schema_; +}; + +TEST_F(TransformProjectTest, IdentityProjectEquality) { + auto transform = Transform::Identity(); + + // Test equality predicate + auto unbound = Expressions::Equal("value", Literal::Int(100)); + ICEBERG_ASSIGN_OR_THROW(auto bound, + unbound->Bind(*int_schema_, /*case_sensitive=*/true)); + auto bound_pred = std::dynamic_pointer_cast(bound); + ASSERT_NE(bound_pred, nullptr); + + ICEBERG_ASSIGN_OR_THROW(auto projected, transform->Project("part", bound_pred)); + ASSERT_NE(projected, nullptr); + EXPECT_EQ(projected->op(), Expression::Operation::kEq); + + auto unbound_projected = + internal::checked_pointer_cast>( + std::move(projected)); + ASSERT_NE(unbound_projected, nullptr); + EXPECT_EQ(unbound_projected->op(), Expression::Operation::kEq); + EXPECT_EQ(unbound_projected->literals().size(), 1); + EXPECT_EQ(std::get(unbound_projected->literals().front().value()), 100); +} + +TEST_F(TransformProjectTest, IdentityProjectComparison) { + auto transform = Transform::Identity(); + + // Test less than predicate + auto unbound_lt = Expressions::LessThan("value", Literal::Int(50)); + ICEBERG_ASSIGN_OR_THROW(auto bound_lt, + unbound_lt->Bind(*int_schema_, /*case_sensitive=*/true)); + auto bound_pred_lt = std::dynamic_pointer_cast(bound_lt); + ASSERT_NE(bound_pred_lt, nullptr); + + ICEBERG_ASSIGN_OR_THROW(auto projected_lt, transform->Project("part", bound_pred_lt)); + ASSERT_NE(projected_lt, nullptr); + EXPECT_EQ(projected_lt->op(), Expression::Operation::kLt); + + // Test greater than or equal predicate + auto unbound_gte = Expressions::GreaterThanOrEqual("value", Literal::Int(100)); + ICEBERG_ASSIGN_OR_THROW(auto bound_gte, + unbound_gte->Bind(*int_schema_, /*case_sensitive=*/true)); + auto bound_pred_gte = std::dynamic_pointer_cast(bound_gte); + ASSERT_NE(bound_pred_gte, nullptr); + + ICEBERG_ASSIGN_OR_THROW(auto projected_gte, transform->Project("part", bound_pred_gte)); + ASSERT_NE(projected_gte, nullptr); + EXPECT_EQ(projected_gte->op(), Expression::Operation::kGtEq); +} + +TEST_F(TransformProjectTest, IdentityProjectUnary) { + auto transform = Transform::Identity(); + + // Test IsNull predicate + auto unbound_null = Expressions::IsNull("value"); + ICEBERG_ASSIGN_OR_THROW(auto bound_null, + unbound_null->Bind(*int_schema_, /*case_sensitive=*/true)); + auto bound_pred_null = std::dynamic_pointer_cast(bound_null); + ASSERT_NE(bound_pred_null, nullptr); + + ICEBERG_ASSIGN_OR_THROW(auto projected_null, + transform->Project("part", bound_pred_null)); + ASSERT_NE(projected_null, nullptr); + EXPECT_EQ(projected_null->op(), Expression::Operation::kIsNull); +} + +TEST_F(TransformProjectTest, IdentityProjectSet) { + auto transform = Transform::Identity(); + + // Test IN predicate + auto unbound_in = + Expressions::In("value", {Literal::Int(1), Literal::Int(2), Literal::Int(3)}); + ICEBERG_ASSIGN_OR_THROW(auto bound_in, + unbound_in->Bind(*int_schema_, /*case_sensitive=*/true)); + auto bound_pred_in = std::dynamic_pointer_cast(bound_in); + ASSERT_NE(bound_pred_in, nullptr); + + ICEBERG_ASSIGN_OR_THROW(auto projected_in, transform->Project("part", bound_pred_in)); + ASSERT_NE(projected_in, nullptr); + EXPECT_EQ(projected_in->op(), Expression::Operation::kIn); + auto unbound_projected = + internal::checked_pointer_cast>( + std::move(projected_in)); + ASSERT_NE(unbound_projected, nullptr); + EXPECT_EQ(unbound_projected->op(), Expression::Operation::kIn); + EXPECT_EQ(unbound_projected->literals().size(), 3); + std::vector values; + for (const auto& lit : unbound_projected->literals()) { + values.push_back(std::get(lit.value())); + } + EXPECT_THAT(values, testing::UnorderedElementsAre(1, 2, 3)); +} + +TEST_F(TransformProjectTest, BucketProjectEquality) { + auto transform = Transform::Bucket(4); + + // Bucket can project equality predicates + auto unbound = Expressions::Equal("value", Literal::Int(34)); + ICEBERG_ASSIGN_OR_THROW(auto bound, + unbound->Bind(*int_schema_, /*case_sensitive=*/true)); + auto bound_pred = std::dynamic_pointer_cast(bound); + ASSERT_NE(bound_pred, nullptr); + + ICEBERG_ASSIGN_OR_THROW(auto projected, transform->Project("part", bound_pred)); + ASSERT_NE(projected, nullptr); + EXPECT_EQ(projected->op(), Expression::Operation::kEq); + + auto unbound_projected = + internal::checked_pointer_cast>( + std::move(projected)); + ASSERT_NE(unbound_projected, nullptr); + EXPECT_EQ(unbound_projected->op(), Expression::Operation::kEq); + EXPECT_EQ(unbound_projected->literals().size(), 1); + EXPECT_EQ(std::get(unbound_projected->literals().front().value()), 3); +} + +TEST_F(TransformProjectTest, BucketProjectWithMatchingTransformedChild) { + auto partition_transform = Transform::Bucket(16); + + // Create a predicate like: bucket(value, 16) = 5 + auto bucket_term = Expressions::Bucket("value", 16); + auto unbound = Expressions::Equal(bucket_term, Literal::Int(5)); + ICEBERG_ASSIGN_OR_THROW(auto bound, + unbound->Bind(*int_schema_, /*case_sensitive=*/true)); + auto bound_pred = std::dynamic_pointer_cast(bound); + ASSERT_NE(bound_pred, nullptr); + + // The predicate's term should be a transform + EXPECT_EQ(bound_pred->term()->kind(), Term::Kind::kTransform); + + // When the transform matches, Project should use RemoveTransform and return the + // predicate + ICEBERG_ASSIGN_OR_THROW(auto projected, + partition_transform->Project("part", bound_pred)); + ASSERT_NE(projected, nullptr); + EXPECT_EQ(projected->op(), Expression::Operation::kEq); + auto unbound_projected = + internal::checked_pointer_cast>( + std::move(projected)); + ASSERT_NE(unbound_projected, nullptr); + EXPECT_EQ(unbound_projected->op(), Expression::Operation::kEq); + EXPECT_EQ(unbound_projected->literals().size(), 1); + EXPECT_EQ(std::get(unbound_projected->literals().front().value()), 5); +} + +TEST_F(TransformProjectTest, BucketProjectComparisonReturnsNull) { + auto transform = Transform::Bucket(16); + + // Bucket cannot project comparison predicates (they return null) + auto unbound_lt = Expressions::LessThan("value", Literal::Int(50)); + ICEBERG_ASSIGN_OR_THROW(auto bound_lt, + unbound_lt->Bind(*int_schema_, /*case_sensitive=*/true)); + auto bound_pred_lt = std::dynamic_pointer_cast(bound_lt); + ASSERT_NE(bound_pred_lt, nullptr); + + ICEBERG_ASSIGN_OR_THROW(auto projected_lt, transform->Project("part", bound_pred_lt)); + EXPECT_EQ(projected_lt, nullptr); +} + +TEST_F(TransformProjectTest, BucketProjectInSet) { + auto transform = Transform::Bucket(16); + + // Bucket can project IN predicates + auto unbound_in = + Expressions::In("value", {Literal::Int(1), Literal::Int(2), Literal::Int(3)}); + ICEBERG_ASSIGN_OR_THROW(auto bound_in, + unbound_in->Bind(*int_schema_, /*case_sensitive=*/true)); + auto bound_pred_in = std::dynamic_pointer_cast(bound_in); + ASSERT_NE(bound_pred_in, nullptr); + + ICEBERG_ASSIGN_OR_THROW(auto projected_in, transform->Project("part", bound_pred_in)); + ASSERT_NE(projected_in, nullptr); + EXPECT_EQ(projected_in->op(), Expression::Operation::kIn); +} + +TEST_F(TransformProjectTest, BucketProjectNotInReturnsNull) { + auto transform = Transform::Bucket(16); + + // Bucket cannot project NOT IN predicates + auto unbound_not_in = + Expressions::NotIn("value", {Literal::Int(1), Literal::Int(2), Literal::Int(3)}); + ICEBERG_ASSIGN_OR_THROW(auto bound_not_in, + unbound_not_in->Bind(*int_schema_, /*case_sensitive=*/true)); + auto bound_pred_not_in = std::dynamic_pointer_cast(bound_not_in); + ASSERT_NE(bound_pred_not_in, nullptr); + + ICEBERG_ASSIGN_OR_THROW(auto projected_not_in, + transform->Project("part", bound_pred_not_in)); + EXPECT_EQ(projected_not_in, nullptr); +} + +TEST_F(TransformProjectTest, TruncateProjectIntEquality) { + auto transform = Transform::Truncate(10); + + // Truncate can project equality predicates + auto unbound = Expressions::Equal("value", Literal::Int(123)); + ICEBERG_ASSIGN_OR_THROW(auto bound, + unbound->Bind(*int_schema_, /*case_sensitive=*/true)); + auto bound_pred = std::dynamic_pointer_cast(bound); + ASSERT_NE(bound_pred, nullptr); + + ICEBERG_ASSIGN_OR_THROW(auto projected, transform->Project("part", bound_pred)); + ASSERT_NE(projected, nullptr); + EXPECT_EQ(projected->op(), Expression::Operation::kEq); + + auto unbound_projected = + internal::checked_pointer_cast>( + std::move(projected)); + ASSERT_NE(unbound_projected, nullptr); + EXPECT_EQ(unbound_projected->op(), Expression::Operation::kEq); + EXPECT_EQ(unbound_projected->literals().size(), 1); + EXPECT_EQ(std::get(unbound_projected->literals().front().value()), 120); +} + +TEST_F(TransformProjectTest, TruncateProjectIntLessThan) { + auto transform = Transform::Truncate(10); + + // Truncate projects LT as LTE + auto unbound = Expressions::LessThan("value", Literal::Int(25)); + ICEBERG_ASSIGN_OR_THROW(auto bound, + unbound->Bind(*int_schema_, /*case_sensitive=*/true)); + auto bound_pred = std::dynamic_pointer_cast(bound); + ASSERT_NE(bound_pred, nullptr); + + ICEBERG_ASSIGN_OR_THROW(auto projected, transform->Project("part", bound_pred)); + ASSERT_NE(projected, nullptr); + EXPECT_EQ(projected->op(), Expression::Operation::kLtEq); +} + +TEST_F(TransformProjectTest, TruncateProjectIntGreaterThan) { + auto transform = Transform::Truncate(10); + + // Truncate projects GT as GTE + auto unbound = Expressions::GreaterThan("value", Literal::Int(25)); + ICEBERG_ASSIGN_OR_THROW(auto bound, + unbound->Bind(*int_schema_, /*case_sensitive=*/true)); + auto bound_pred = std::dynamic_pointer_cast(bound); + ASSERT_NE(bound_pred, nullptr); + + ICEBERG_ASSIGN_OR_THROW(auto projected, transform->Project("part", bound_pred)); + ASSERT_NE(projected, nullptr); + EXPECT_EQ(projected->op(), Expression::Operation::kGtEq); + + auto unbound_projected = + internal::checked_pointer_cast>( + std::move(projected)); + ASSERT_NE(unbound_projected, nullptr); + EXPECT_EQ(unbound_projected->op(), Expression::Operation::kGtEq); + EXPECT_EQ(unbound_projected->literals().size(), 1); + EXPECT_EQ(std::get(unbound_projected->literals().front().value()), 20); +} + +TEST_F(TransformProjectTest, TruncateProjectStringEquality) { + auto transform = Transform::Truncate(5); + + auto unbound = Expressions::Equal("value", Literal::String("Hello, World!")); + ICEBERG_ASSIGN_OR_THROW(auto bound, + unbound->Bind(*string_schema_, /*case_sensitive=*/true)); + auto bound_pred = std::dynamic_pointer_cast(bound); + ASSERT_NE(bound_pred, nullptr); + + ICEBERG_ASSIGN_OR_THROW(auto projected, transform->Project("part", bound_pred)); + ASSERT_NE(projected, nullptr); + EXPECT_EQ(projected->op(), Expression::Operation::kEq); + + auto unbound_projected = + internal::checked_pointer_cast>( + std::move(projected)); + ASSERT_NE(unbound_projected, nullptr); + EXPECT_EQ(unbound_projected->op(), Expression::Operation::kEq); + EXPECT_EQ(unbound_projected->literals().size(), 1); + EXPECT_EQ(std::get(unbound_projected->literals().front().value()), + "Hello"); +} + +TEST_F(TransformProjectTest, TruncateProjectStringStartsWith) { + auto transform = Transform::Truncate(5); + + // StartsWith with shorter string than width + auto unbound_short = Expressions::StartsWith("value", "Hi"); + ICEBERG_ASSIGN_OR_THROW(auto bound_short, + unbound_short->Bind(*string_schema_, /*case_sensitive=*/true)); + auto bound_pred_short = std::dynamic_pointer_cast(bound_short); + ASSERT_NE(bound_pred_short, nullptr); + + ICEBERG_ASSIGN_OR_THROW(auto projected_short, + transform->Project("part", bound_pred_short)); + ASSERT_NE(projected_short, nullptr); + EXPECT_EQ(projected_short->op(), Expression::Operation::kStartsWith); + + auto unbound_projected_short = + internal::checked_pointer_cast>( + std::move(projected_short)); + ASSERT_NE(unbound_projected_short, nullptr); + EXPECT_EQ(unbound_projected_short->op(), Expression::Operation::kStartsWith); + EXPECT_EQ(unbound_projected_short->literals().size(), 1); + EXPECT_EQ(std::get(unbound_projected_short->literals().front().value()), + "Hi"); + + // StartsWith with string equal to width + auto unbound_equal = Expressions::StartsWith("value", "Hello"); + ICEBERG_ASSIGN_OR_THROW(auto bound_equal, + unbound_equal->Bind(*string_schema_, /*case_sensitive=*/true)); + auto bound_pred_equal = std::dynamic_pointer_cast(bound_equal); + ASSERT_NE(bound_pred_equal, nullptr); + + ICEBERG_ASSIGN_OR_THROW(auto projected_equal, + transform->Project("part", bound_pred_equal)); + ASSERT_NE(projected_equal, nullptr); + EXPECT_EQ(projected_equal->op(), Expression::Operation::kEq); + + auto unbound_projected_equal = + internal::checked_pointer_cast>( + std::move(projected_equal)); + ASSERT_NE(unbound_projected_equal, nullptr); + EXPECT_EQ(unbound_projected_equal->op(), Expression::Operation::kEq); + EXPECT_EQ(unbound_projected_equal->literals().size(), 1); + EXPECT_EQ(std::get(unbound_projected_equal->literals().front().value()), + "Hello"); +} + +TEST_F(TransformProjectTest, TruncateProjectStringStartsWithCodePointCountLessThanWidth) { + auto transform = Transform::Truncate(5); + + // Code point count < width (multi-byte UTF-8 characters) + // "😜🧐" has 2 code points, width is 5 + auto unbound_emoji_short = Expressions::StartsWith("value", "😜🧐"); + ICEBERG_ASSIGN_OR_THROW( + auto bound_emoji_short, + unbound_emoji_short->Bind(*string_schema_, /*case_sensitive=*/true)); + auto bound_pred_emoji_short = + std::dynamic_pointer_cast(bound_emoji_short); + ASSERT_NE(bound_pred_emoji_short, nullptr); + + ICEBERG_ASSIGN_OR_THROW(auto projected_emoji_short, + transform->Project("part", bound_pred_emoji_short)); + ASSERT_NE(projected_emoji_short, nullptr); + EXPECT_EQ(projected_emoji_short->op(), Expression::Operation::kStartsWith); + + auto unbound_projected_emoji_short = + internal::checked_pointer_cast>( + std::move(projected_emoji_short)); + ASSERT_NE(unbound_projected_emoji_short, nullptr); + EXPECT_EQ(unbound_projected_emoji_short->op(), Expression::Operation::kStartsWith); + EXPECT_EQ(unbound_projected_emoji_short->literals().size(), 1); + EXPECT_EQ( + std::get(unbound_projected_emoji_short->literals().front().value()), + "😜🧐"); +} + +TEST_F(TransformProjectTest, TruncateProjectStringStartsWithCodePointCountEqualToWidth) { + auto transform = Transform::Truncate(5); + + // Code point count == width (exactly 5 code points) + // "πŸ˜œπŸ§πŸ€”πŸ€ͺπŸ₯³" has exactly 5 code points + auto unbound_emoji_equal = Expressions::StartsWith("value", "πŸ˜œπŸ§πŸ€”πŸ€ͺπŸ₯³"); + ICEBERG_ASSIGN_OR_THROW( + auto bound_emoji_equal, + unbound_emoji_equal->Bind(*string_schema_, /*case_sensitive=*/true)); + auto bound_pred_emoji_equal = + std::dynamic_pointer_cast(bound_emoji_equal); + ASSERT_NE(bound_pred_emoji_equal, nullptr); + + ICEBERG_ASSIGN_OR_THROW(auto projected_emoji_equal, + transform->Project("part", bound_pred_emoji_equal)); + ASSERT_NE(projected_emoji_equal, nullptr); + EXPECT_EQ(projected_emoji_equal->op(), Expression::Operation::kEq); + + auto unbound_projected_emoji_equal = + internal::checked_pointer_cast>( + std::move(projected_emoji_equal)); + ASSERT_NE(unbound_projected_emoji_equal, nullptr); + EXPECT_EQ(unbound_projected_emoji_equal->op(), Expression::Operation::kEq); + EXPECT_EQ(unbound_projected_emoji_equal->literals().size(), 1); + EXPECT_EQ( + std::get(unbound_projected_emoji_equal->literals().front().value()), + "πŸ˜œπŸ§πŸ€”πŸ€ͺπŸ₯³"); +} + +TEST_F(TransformProjectTest, + TruncateProjectStringStartsWithCodePointCountGreaterThanWidth) { + auto transform = Transform::Truncate(5); + + // Code point count > width (truncate to 5 code points) + // "πŸ˜œπŸ§πŸ€”πŸ€ͺπŸ₯³πŸ˜΅β€πŸ’«πŸ˜‚" has 7 code points, should truncate to 5 + auto unbound_emoji_long = + Expressions::StartsWith("value", "πŸ˜œπŸ§πŸ€”πŸ€ͺπŸ₯³πŸ˜΅β€πŸ’«πŸ˜‚"); + ICEBERG_ASSIGN_OR_THROW( + auto bound_emoji_long, + unbound_emoji_long->Bind(*string_schema_, /*case_sensitive=*/true)); + auto bound_pred_emoji_long = + std::dynamic_pointer_cast(bound_emoji_long); + ASSERT_NE(bound_pred_emoji_long, nullptr); + + ICEBERG_ASSIGN_OR_THROW(auto projected_emoji_long, + transform->Project("part", bound_pred_emoji_long)); + ASSERT_NE(projected_emoji_long, nullptr); + EXPECT_EQ(projected_emoji_long->op(), Expression::Operation::kStartsWith); + + auto unbound_projected_emoji_long = + internal::checked_pointer_cast>( + std::move(projected_emoji_long)); + ASSERT_NE(unbound_projected_emoji_long, nullptr); + EXPECT_EQ(unbound_projected_emoji_long->op(), Expression::Operation::kStartsWith); + EXPECT_EQ(unbound_projected_emoji_long->literals().size(), 1); + EXPECT_EQ( + std::get(unbound_projected_emoji_long->literals().front().value()), + "πŸ˜œπŸ§πŸ€”πŸ€ͺπŸ₯³"); +} + +TEST_F(TransformProjectTest, TruncateProjectStringStartsWithMixedAsciiAndMultiByte) { + auto transform = Transform::Truncate(5); + + // Mixed ASCII and multi-byte UTF-8 characters + // "a😜b🧐c" has 5 code points (3 ASCII + 2 emojis) + auto unbound_mixed_equal = Expressions::StartsWith("value", "a😜b🧐c"); + ICEBERG_ASSIGN_OR_THROW( + auto bound_mixed_equal, + unbound_mixed_equal->Bind(*string_schema_, /*case_sensitive=*/true)); + auto bound_pred_mixed_equal = + std::dynamic_pointer_cast(bound_mixed_equal); + ASSERT_NE(bound_pred_mixed_equal, nullptr); + + ICEBERG_ASSIGN_OR_THROW(auto projected_mixed_equal, + transform->Project("part", bound_pred_mixed_equal)); + ASSERT_NE(projected_mixed_equal, nullptr); + EXPECT_EQ(projected_mixed_equal->op(), Expression::Operation::kEq); + + auto unbound_projected_mixed_equal = + internal::checked_pointer_cast>( + std::move(projected_mixed_equal)); + ASSERT_NE(unbound_projected_mixed_equal, nullptr); + EXPECT_EQ(unbound_projected_mixed_equal->op(), Expression::Operation::kEq); + EXPECT_EQ(unbound_projected_mixed_equal->literals().size(), 1); + EXPECT_EQ( + std::get(unbound_projected_mixed_equal->literals().front().value()), + "a😜b🧐c"); +} + +TEST_F(TransformProjectTest, TruncateProjectStringStartsWithChineseCharactersShort) { + auto transform = Transform::Truncate(5); + + // Chinese characters (3-byte UTF-8) + // "δ½ ε₯½δΈ–η•Œ" has 4 code points, width is 5 + auto unbound_chinese_short = Expressions::StartsWith("value", "δ½ ε₯½δΈ–η•Œ"); + ICEBERG_ASSIGN_OR_THROW( + auto bound_chinese_short, + unbound_chinese_short->Bind(*string_schema_, /*case_sensitive=*/true)); + auto bound_pred_chinese_short = + std::dynamic_pointer_cast(bound_chinese_short); + ASSERT_NE(bound_pred_chinese_short, nullptr); + + ICEBERG_ASSIGN_OR_THROW(auto projected_chinese_short, + transform->Project("part", bound_pred_chinese_short)); + ASSERT_NE(projected_chinese_short, nullptr); + EXPECT_EQ(projected_chinese_short->op(), Expression::Operation::kStartsWith); + + auto unbound_projected_chinese_short = + internal::checked_pointer_cast>( + std::move(projected_chinese_short)); + ASSERT_NE(unbound_projected_chinese_short, nullptr); + EXPECT_EQ(unbound_projected_chinese_short->op(), Expression::Operation::kStartsWith); + EXPECT_EQ(unbound_projected_chinese_short->literals().size(), 1); + EXPECT_EQ( + std::get(unbound_projected_chinese_short->literals().front().value()), + "δ½ ε₯½δΈ–η•Œ"); +} + +TEST_F(TransformProjectTest, TruncateProjectStringStartsWithChineseCharactersEqualWidth) { + auto transform = Transform::Truncate(5); + + // Chinese characters exactly matching width + // "δ½ ε₯½δΈ–η•Œε₯½" has exactly 5 code points + auto unbound_chinese_equal = Expressions::StartsWith("value", "δ½ ε₯½δΈ–η•Œε₯½"); + ICEBERG_ASSIGN_OR_THROW( + auto bound_chinese_equal, + unbound_chinese_equal->Bind(*string_schema_, /*case_sensitive=*/true)); + auto bound_pred_chinese_equal = + std::dynamic_pointer_cast(bound_chinese_equal); + ASSERT_NE(bound_pred_chinese_equal, nullptr); + + ICEBERG_ASSIGN_OR_THROW(auto projected_chinese_equal, + transform->Project("part", bound_pred_chinese_equal)); + ASSERT_NE(projected_chinese_equal, nullptr); + EXPECT_EQ(projected_chinese_equal->op(), Expression::Operation::kEq); + + auto unbound_projected_chinese_equal = + internal::checked_pointer_cast>( + std::move(projected_chinese_equal)); + ASSERT_NE(unbound_projected_chinese_equal, nullptr); + EXPECT_EQ(unbound_projected_chinese_equal->op(), Expression::Operation::kEq); + EXPECT_EQ(unbound_projected_chinese_equal->literals().size(), 1); + EXPECT_EQ( + std::get(unbound_projected_chinese_equal->literals().front().value()), + "δ½ ε₯½δΈ–η•Œε₯½"); +} + +TEST_F(TransformProjectTest, + TruncateProjectStringNotStartsWithCodePointCountEqualToWidth) { + auto transform = Transform::Truncate(5); + + // NotStartsWith with code point count == width + // Should convert to NotEq + auto unbound_not_starts_equal = Expressions::NotStartsWith("value", "πŸ˜œπŸ§πŸ€”πŸ€ͺπŸ₯³"); + ICEBERG_ASSIGN_OR_THROW( + auto bound_not_starts_equal, + unbound_not_starts_equal->Bind(*string_schema_, /*case_sensitive=*/true)); + auto bound_pred_not_starts_equal = + std::dynamic_pointer_cast(bound_not_starts_equal); + ASSERT_NE(bound_pred_not_starts_equal, nullptr); + + ICEBERG_ASSIGN_OR_THROW(auto projected_not_starts_equal, + transform->Project("part", bound_pred_not_starts_equal)); + ASSERT_NE(projected_not_starts_equal, nullptr); + EXPECT_EQ(projected_not_starts_equal->op(), Expression::Operation::kNotEq); + + auto unbound_projected_not_starts_equal = + internal::checked_pointer_cast>( + std::move(projected_not_starts_equal)); + ASSERT_NE(unbound_projected_not_starts_equal, nullptr); + EXPECT_EQ(unbound_projected_not_starts_equal->op(), Expression::Operation::kNotEq); + EXPECT_EQ(unbound_projected_not_starts_equal->literals().size(), 1); + EXPECT_EQ(std::get( + unbound_projected_not_starts_equal->literals().front().value()), + "πŸ˜œπŸ§πŸ€”πŸ€ͺπŸ₯³"); +} + +TEST_F(TransformProjectTest, + TruncateProjectStringNotStartsWithCodePointCountLessThanWidth) { + auto transform = Transform::Truncate(5); + + // NotStartsWith with code point count < width + // Should remain NotStartsWith + auto unbound_not_starts_short = Expressions::NotStartsWith("value", "😜🧐"); + ICEBERG_ASSIGN_OR_THROW( + auto bound_not_starts_short, + unbound_not_starts_short->Bind(*string_schema_, /*case_sensitive=*/true)); + auto bound_pred_not_starts_short = + std::dynamic_pointer_cast(bound_not_starts_short); + ASSERT_NE(bound_pred_not_starts_short, nullptr); + + ICEBERG_ASSIGN_OR_THROW(auto projected_not_starts_short, + transform->Project("part", bound_pred_not_starts_short)); + ASSERT_NE(projected_not_starts_short, nullptr); + EXPECT_EQ(projected_not_starts_short->op(), Expression::Operation::kNotStartsWith); + + auto unbound_projected_not_starts_short = + internal::checked_pointer_cast>( + std::move(projected_not_starts_short)); + ASSERT_NE(unbound_projected_not_starts_short, nullptr); + EXPECT_EQ(unbound_projected_not_starts_short->op(), + Expression::Operation::kNotStartsWith); + EXPECT_EQ(unbound_projected_not_starts_short->literals().size(), 1); + EXPECT_EQ(std::get( + unbound_projected_not_starts_short->literals().front().value()), + "😜🧐"); +} + +TEST_F(TransformProjectTest, + TruncateProjectStringNotStartsWithCodePointCountGreaterThanWidth) { + auto transform = Transform::Truncate(5); + + // NotStartsWith with code point count > width + // Should return nullptr (cannot project) + auto unbound_not_starts_long = + Expressions::NotStartsWith("value", "πŸ˜œπŸ§πŸ€”πŸ€ͺπŸ₯³πŸ˜΅β€πŸ’«πŸ˜‚"); + ICEBERG_ASSIGN_OR_THROW( + auto bound_not_starts_long, + unbound_not_starts_long->Bind(*string_schema_, /*case_sensitive=*/true)); + auto bound_pred_not_starts_long = + std::dynamic_pointer_cast(bound_not_starts_long); + ASSERT_NE(bound_pred_not_starts_long, nullptr); + + ICEBERG_ASSIGN_OR_THROW(auto projected_not_starts_long, + transform->Project("part", bound_pred_not_starts_long)); + EXPECT_EQ(projected_not_starts_long, nullptr); +} + +TEST_F(TransformProjectTest, YearProjectEquality) { + auto transform = Transform::Year(); + + // 2021-06-01 as days from epoch + int32_t date_value = + TemporalTestHelper::CreateDate({.year = 2021, .month = 6, .day = 1}); + auto unbound = Expressions::Equal("value", Literal::Date(date_value)); + ICEBERG_ASSIGN_OR_THROW(auto bound, + unbound->Bind(*date_schema_, /*case_sensitive=*/true)); + auto bound_pred = std::dynamic_pointer_cast(bound); + ASSERT_NE(bound_pred, nullptr); + + ICEBERG_ASSIGN_OR_THROW(auto projected, transform->Project("part", bound_pred)); + ASSERT_NE(projected, nullptr); + EXPECT_EQ(projected->op(), Expression::Operation::kEq); +} + +TEST_F(TransformProjectTest, YearProjectComparison) { + auto transform = Transform::Year(); + + int32_t date_value = + TemporalTestHelper::CreateDate({.year = 2021, .month = 6, .day = 1}); + + // LT projects to LTE + auto unbound_lt = Expressions::LessThan("value", Literal::Date(date_value)); + ICEBERG_ASSIGN_OR_THROW(auto bound_lt, + unbound_lt->Bind(*date_schema_, /*case_sensitive=*/true)); + auto bound_pred_lt = std::dynamic_pointer_cast(bound_lt); + ASSERT_NE(bound_pred_lt, nullptr); + + ICEBERG_ASSIGN_OR_THROW(auto projected_lt, transform->Project("part", bound_pred_lt)); + ASSERT_NE(projected_lt, nullptr); + EXPECT_EQ(projected_lt->op(), Expression::Operation::kLtEq); + + // GT projects to GTE + auto unbound_gt = Expressions::GreaterThan("value", Literal::Date(date_value)); + ICEBERG_ASSIGN_OR_THROW(auto bound_gt, + unbound_gt->Bind(*date_schema_, /*case_sensitive=*/true)); + auto bound_pred_gt = std::dynamic_pointer_cast(bound_gt); + ASSERT_NE(bound_pred_gt, nullptr); + + ICEBERG_ASSIGN_OR_THROW(auto projected_gt, transform->Project("part", bound_pred_gt)); + ASSERT_NE(projected_gt, nullptr); + EXPECT_EQ(projected_gt->op(), Expression::Operation::kGtEq); +} + +TEST_F(TransformProjectTest, MonthProjectEquality) { + auto transform = Transform::Month(); + + int64_t ts_value = + TemporalTestHelper::CreateTimestamp({.year = 2021, .month = 6, .day = 1}); + auto unbound = Expressions::Equal("value", Literal::Timestamp(ts_value)); + ICEBERG_ASSIGN_OR_THROW(auto bound, + unbound->Bind(*timestamp_schema_, /*case_sensitive=*/true)); + auto bound_pred = std::dynamic_pointer_cast(bound); + ASSERT_NE(bound_pred, nullptr); + + ICEBERG_ASSIGN_OR_THROW(auto projected, transform->Project("part", bound_pred)); + ASSERT_NE(projected, nullptr); + EXPECT_EQ(projected->op(), Expression::Operation::kEq); +} + +TEST_F(TransformProjectTest, DayProjectEquality) { + auto transform = Transform::Day(); + + int32_t date_value = + TemporalTestHelper::CreateDate({.year = 2021, .month = 6, .day = 15}); + auto unbound = Expressions::Equal("value", Literal::Date(date_value)); + ICEBERG_ASSIGN_OR_THROW(auto bound, + unbound->Bind(*date_schema_, /*case_sensitive=*/true)); + auto bound_pred = std::dynamic_pointer_cast(bound); + ASSERT_NE(bound_pred, nullptr); + + ICEBERG_ASSIGN_OR_THROW(auto projected, transform->Project("part", bound_pred)); + ASSERT_NE(projected, nullptr); + EXPECT_EQ(projected->op(), Expression::Operation::kEq); +} + +TEST_F(TransformProjectTest, HourProjectEquality) { + auto transform = Transform::Hour(); + + int64_t ts_value = TemporalTestHelper::CreateTimestamp( + {.year = 2021, .month = 6, .day = 1, .hour = 14, .minute = 30}); + auto unbound = Expressions::Equal("value", Literal::Timestamp(ts_value)); + ICEBERG_ASSIGN_OR_THROW(auto bound, + unbound->Bind(*timestamp_schema_, /*case_sensitive=*/true)); + auto bound_pred = std::dynamic_pointer_cast(bound); + ASSERT_NE(bound_pred, nullptr); + + ICEBERG_ASSIGN_OR_THROW(auto projected, transform->Project("part", bound_pred)); + ASSERT_NE(projected, nullptr); + EXPECT_EQ(projected->op(), Expression::Operation::kEq); +} + +TEST_F(TransformProjectTest, VoidProjectReturnsNull) { + auto transform = Transform::Void(); + + auto unbound = Expressions::Equal("value", Literal::Int(100)); + ICEBERG_ASSIGN_OR_THROW(auto bound, + unbound->Bind(*int_schema_, /*case_sensitive=*/true)); + auto bound_pred = std::dynamic_pointer_cast(bound); + ASSERT_NE(bound_pred, nullptr); + + // Void transform always returns null (no projection possible) + ICEBERG_ASSIGN_OR_THROW(auto projected, transform->Project("part", bound_pred)); + EXPECT_EQ(projected, nullptr); +} + +TEST_F(TransformProjectTest, TemporalProjectInSet) { + auto transform = Transform::Year(); + + int32_t date1 = TemporalTestHelper::CreateDate({.year = 2020, .month = 1, .day = 1}); + int32_t date2 = TemporalTestHelper::CreateDate({.year = 2021, .month = 6, .day = 15}); + int32_t date3 = TemporalTestHelper::CreateDate({.year = 2022, .month = 12, .day = 31}); + + auto unbound_in = Expressions::In( + "value", {Literal::Date(date1), Literal::Date(date2), Literal::Date(date3)}); + ICEBERG_ASSIGN_OR_THROW(auto bound_in, + unbound_in->Bind(*date_schema_, /*case_sensitive=*/true)); + auto bound_pred_in = std::dynamic_pointer_cast(bound_in); + ASSERT_NE(bound_pred_in, nullptr); + + ICEBERG_ASSIGN_OR_THROW(auto projected_in, transform->Project("part", bound_pred_in)); + ASSERT_NE(projected_in, nullptr); + EXPECT_EQ(projected_in->op(), Expression::Operation::kIn); +} + +TEST_F(TransformProjectTest, DayTimestampProjectionFix) { + auto transform = Transform::Day(); + + // Predicate: value < 1970-01-01 00:00:00 (0) + // This implies value <= -1 micros. + // day(-1 micros) = -1 day (1969-12-31). + // If we don't fix, we project to day <= -1. + // If we fix (for buggy writers), we project to day <= 0. + auto unbound = Expressions::LessThan("value", Literal::Timestamp(0)); + + ICEBERG_ASSIGN_OR_THROW(auto bound, + unbound->Bind(*timestamp_schema_, /*case_sensitive=*/true)); + auto bound_pred = std::dynamic_pointer_cast(bound); + ASSERT_NE(bound_pred, nullptr); + + ICEBERG_ASSIGN_OR_THROW(auto projected, transform->Project("part", bound_pred)); + ASSERT_NE(projected, nullptr); + + auto unbound_projected = + internal::checked_pointer_cast>( + std::move(projected)); + EXPECT_EQ(unbound_projected->op(), Expression::Operation::kLtEq); + ASSERT_EQ(unbound_projected->literals().size(), 1); + int32_t val = std::get(unbound_projected->literals().front().value()); + EXPECT_EQ(val, 0) << "Expected projected value to be 0 (fix applied), but got " << val; +} + } // namespace iceberg diff --git a/src/iceberg/transform.cc b/src/iceberg/transform.cc index 2e39cb06d..f8d2f0655 100644 --- a/src/iceberg/transform.cc +++ b/src/iceberg/transform.cc @@ -23,8 +23,14 @@ #include #include +#include "iceberg/expression/predicate.h" +#include "iceberg/expression/term.h" +#include "iceberg/result.h" #include "iceberg/transform_function.h" #include "iceberg/type.h" +#include "iceberg/util/checked_cast.h" +#include "iceberg/util/macros.h" +#include "iceberg/util/projection_util_internal.h" namespace iceberg { namespace { @@ -240,6 +246,66 @@ bool Transform::SatisfiesOrderOf(const Transform& other) const { std::unreachable(); } +Result> Transform::Project( + std::string_view name, const std::shared_ptr& predicate) { + switch (transform_type_) { + case TransformType::kIdentity: + return ProjectionUtil::IdentityProject(name, predicate); + case TransformType::kBucket: { + // If the predicate has a transformed child that matches the given transform, return + // a predicate. + if (predicate->term()->kind() == Term::Kind::kTransform) { + const auto boundTransform = + internal::checked_pointer_cast(predicate->term()); + if (*this == *boundTransform->transform()) { + return ProjectionUtil::RemoveTransform(name, predicate); + } else { + return nullptr; + } + } + ICEBERG_ASSIGN_OR_RAISE(auto func, Bind(predicate->term()->type())); + return ProjectionUtil::BucketProject(name, predicate, func); + } + case TransformType::kTruncate: { + // If the predicate has a transformed child that matches the given transform, return + // a predicate. + if (predicate->term()->kind() == Term::Kind::kTransform) { + const auto boundTransform = + internal::checked_pointer_cast(predicate->term()); + if (*this == *boundTransform->transform()) { + return ProjectionUtil::RemoveTransform(name, predicate); + } else { + return nullptr; + } + } + ICEBERG_ASSIGN_OR_RAISE(auto func, Bind(predicate->term()->type())); + return ProjectionUtil::TruncateProject(name, predicate, func); + } + case TransformType::kYear: + case TransformType::kMonth: + case TransformType::kDay: + case TransformType::kHour: { + // If the predicate has a transformed child that matches the given transform, return + // a predicate. + if (predicate->term()->kind() == Term::Kind::kTransform) { + const auto boundTransform = + internal::checked_pointer_cast(predicate->term()); + if (*this == *boundTransform->transform()) { + return ProjectionUtil::RemoveTransform(name, predicate); + } else { + return nullptr; + } + } + ICEBERG_ASSIGN_OR_RAISE(auto func, Bind(predicate->term()->type())); + return ProjectionUtil::TemporalProject(name, predicate, func); + } + case TransformType::kUnknown: + case TransformType::kVoid: + return nullptr; + } + std::unreachable(); +} + bool TransformFunction::Equals(const TransformFunction& other) const { return transform_type_ == other.transform_type_ && *source_type_ == *other.source_type_; } diff --git a/src/iceberg/transform.h b/src/iceberg/transform.h index 4f608c467..64b850725 100644 --- a/src/iceberg/transform.h +++ b/src/iceberg/transform.h @@ -23,6 +23,7 @@ #include #include +#include #include #include @@ -164,11 +165,23 @@ class ICEBERG_EXPORT Transform : public util::Formattable { /// For example, sorting by day(ts) will produce an ordering that is also by month(ts) /// or year(ts). However, sorting by day(ts) will not satisfy the order of hour(ts) or /// identity(ts). - /// + /// \param other The other transform to compare with. /// \return true if ordering by this transform is equivalent to ordering by the other /// transform. bool SatisfiesOrderOf(const Transform& other) const; + /// \brief Transforms a BoundPredicate to an inclusive predicate on the partition values + /// produced by the transform. + /// + /// This inclusive transform guarantees that if predicate->Test(value) is true, then + /// Projected(transform(value)) is true. + /// \param name The name of the partition column. + /// \param predicate The predicate to project. + /// \return A Result containing either a unique pointer to the projected predicate, + /// nullptr if the projection cannot be performed, or an Error if the projection fails. + Result> Project( + std::string_view name, const std::shared_ptr& predicate); + /// \brief Returns a string representation of this transform (e.g., "bucket[16]"). std::string ToString() const override; diff --git a/src/iceberg/transform_function.h b/src/iceberg/transform_function.h index 33712ca60..b3cfa5a22 100644 --- a/src/iceberg/transform_function.h +++ b/src/iceberg/transform_function.h @@ -83,6 +83,9 @@ class ICEBERG_EXPORT TruncateTransform : public TransformFunction { /// \brief Returns the same type as source_type. std::shared_ptr ResultType() const override; + /// \brief Returns the width to truncate to. + int32_t width() const { return width_; } + /// \brief Create a TruncateTransform. /// \param source_type Type of the input data. /// \param width The width to truncate to. diff --git a/src/iceberg/type_fwd.h b/src/iceberg/type_fwd.h index 68a4543fb..7aad3d8d8 100644 --- a/src/iceberg/type_fwd.h +++ b/src/iceberg/type_fwd.h @@ -123,6 +123,9 @@ class Uuid; class Expression; class Literal; +class BoundPredicate; +class UnboundPredicate; + class DataTableScan; class FileScanTask; class ScanTask; diff --git a/src/iceberg/util/projection_util_internal.h b/src/iceberg/util/projection_util_internal.h new file mode 100644 index 000000000..3ce2dbf8f --- /dev/null +++ b/src/iceberg/util/projection_util_internal.h @@ -0,0 +1,559 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "iceberg/expression/literal.h" +#include "iceberg/expression/predicate.h" +#include "iceberg/result.h" +#include "iceberg/transform.h" +#include "iceberg/transform_function.h" +#include "iceberg/util/checked_cast.h" +#include "iceberg/util/decimal.h" +#include "iceberg/util/macros.h" +#include "iceberg/util/string_util.h" + +namespace iceberg { + +class ProjectionUtil { + private: + static Result> TransformSet( + std::string_view name, const std::shared_ptr& predicate, + const std::shared_ptr& func) { + std::vector transformed; + transformed.reserve(predicate->literal_set().size()); + for (const auto& lit : predicate->literal_set()) { + ICEBERG_ASSIGN_OR_RAISE(auto transformed_lit, func->Transform(lit)); + transformed.push_back(std::move(transformed_lit)); + } + ICEBERG_ASSIGN_OR_RAISE(auto ref, NamedReference::Make(std::string(name))); + return UnboundPredicateImpl::Make(predicate->op(), std::move(ref), + std::move(transformed)); + } + + // General transform for all literal predicates. This is used as a fallback for special + // cases that are not handled by the other transform functions. + static Result> GenericTransform( + std::unique_ptr ref, + const std::shared_ptr& predicate, + const std::shared_ptr& func) { + ICEBERG_ASSIGN_OR_RAISE(auto transformed, func->Transform(predicate->literal())); + switch (predicate->op()) { + case Expression::Operation::kLt: + case Expression::Operation::kLtEq: { + return UnboundPredicateImpl::Make( + Expression::Operation::kLtEq, std::move(ref), std::move(transformed)); + } + case Expression::Operation::kGt: + case Expression::Operation::kGtEq: { + return UnboundPredicateImpl::Make( + Expression::Operation::kGtEq, std::move(ref), std::move(transformed)); + } + case Expression::Operation::kEq: { + return UnboundPredicateImpl::Make( + Expression::Operation::kEq, std::move(ref), std::move(transformed)); + } + default: + return nullptr; + } + } + + static Result> TruncateByteArray( + std::string_view name, const std::shared_ptr& predicate, + const std::shared_ptr& func) { + ICEBERG_ASSIGN_OR_RAISE(auto ref, NamedReference::Make(std::string(name))); + switch (predicate->op()) { + case Expression::Operation::kStartsWith: { + ICEBERG_ASSIGN_OR_RAISE(auto transformed, func->Transform(predicate->literal())); + return UnboundPredicateImpl::Make( + Expression::Operation::kStartsWith, std::move(ref), std::move(transformed)); + } + default: + return GenericTransform(std::move(ref), predicate, func); + } + } + + template + requires std::is_same_v || std::is_same_v + static Result> TruncateInteger( + std::string_view name, const std::shared_ptr& predicate, + const std::shared_ptr& func) { + const Literal& literal = predicate->literal(); + ICEBERG_ASSIGN_OR_RAISE(auto ref, NamedReference::Make(std::string(name))); + + switch (predicate->op()) { + case Expression::Operation::kLt: { + // adjust closed and then transform ltEq + if constexpr (std::is_same_v) { + ICEBERG_ASSIGN_OR_RAISE( + auto transformed, + func->Transform(Literal::Int(std::get(literal.value()) - 1))); + return UnboundPredicateImpl::Make( + Expression::Operation::kLtEq, std::move(ref), std::move(transformed)); + } else { + ICEBERG_ASSIGN_OR_RAISE( + auto transformed, + func->Transform(Literal::Long(std::get(literal.value()) - 1))); + return UnboundPredicateImpl::Make( + Expression::Operation::kLtEq, std::move(ref), std::move(transformed)); + } + } + case Expression::Operation::kGt: { + // adjust closed and then transform gtEq + if constexpr (std::is_same_v) { + ICEBERG_ASSIGN_OR_RAISE( + auto transformed, + func->Transform(Literal::Int(std::get(literal.value()) + 1))); + return UnboundPredicateImpl::Make( + Expression::Operation::kGtEq, std::move(ref), std::move(transformed)); + } else { + ICEBERG_ASSIGN_OR_RAISE( + auto transformed, + func->Transform(Literal::Long(std::get(literal.value()) + 1))); + return UnboundPredicateImpl::Make( + Expression::Operation::kGtEq, std::move(ref), std::move(transformed)); + } + } + default: + return GenericTransform(std::move(ref), predicate, func); + } + } + + static Result> TransformTemporal( + std::string_view name, const std::shared_ptr& predicate, + const std::shared_ptr& func) { + const Literal& literal = predicate->literal(); + ICEBERG_ASSIGN_OR_RAISE(auto ref, NamedReference::Make(std::string(name))); + + switch (func->source_type()->type_id()) { + case TypeId::kDate: { + switch (predicate->op()) { + case Expression::Operation::kLt: { + ICEBERG_ASSIGN_OR_RAISE( + auto transformed, + func->Transform(Literal::Date(std::get(literal.value()) - 1))); + return UnboundPredicateImpl::Make( + Expression::Operation::kLtEq, std::move(ref), std::move(transformed)); + } + case Expression::Operation::kGt: { + ICEBERG_ASSIGN_OR_RAISE( + auto transformed, + func->Transform(Literal::Date(std::get(literal.value()) + 1))); + return UnboundPredicateImpl::Make( + Expression::Operation::kGtEq, std::move(ref), std::move(transformed)); + } + default: + return GenericTransform(std::move(ref), predicate, func); + } + } + case TypeId::kTimestamp: { + switch (predicate->op()) { + case Expression::Operation::kLt: { + ICEBERG_ASSIGN_OR_RAISE(auto transformed, + func->Transform(Literal::Timestamp( + std::get(literal.value()) - 1))); + return UnboundPredicateImpl::Make( + Expression::Operation::kLtEq, std::move(ref), std::move(transformed)); + } + case Expression::Operation::kGt: { + ICEBERG_ASSIGN_OR_RAISE(auto transformed, + func->Transform(Literal::Timestamp( + std::get(literal.value()) + 1))); + return UnboundPredicateImpl::Make( + Expression::Operation::kGtEq, std::move(ref), std::move(transformed)); + } + default: + return GenericTransform(std::move(ref), predicate, func); + } + } + case TypeId::kTimestampTz: { + switch (predicate->op()) { + case Expression::Operation::kLt: { + ICEBERG_ASSIGN_OR_RAISE(auto transformed, + func->Transform(Literal::TimestampTz( + std::get(literal.value()) - 1))); + return UnboundPredicateImpl::Make( + Expression::Operation::kLtEq, std::move(ref), std::move(transformed)); + } + case Expression::Operation::kGt: { + ICEBERG_ASSIGN_OR_RAISE(auto transformed, + func->Transform(Literal::TimestampTz( + std::get(literal.value()) + 1))); + return UnboundPredicateImpl::Make( + Expression::Operation::kGtEq, std::move(ref), std::move(transformed)); + } + default: + return GenericTransform(std::move(ref), predicate, func); + } + } + default: + return NotSupported("{} is not a valid input type for temporal transform", + func->source_type()->ToString()); + } + } + + static Result> TruncateDecimal( + std::string_view name, const std::shared_ptr& predicate, + const std::shared_ptr& func) { + const Literal& boundary = predicate->literal(); + ICEBERG_ASSIGN_OR_RAISE(auto ref, NamedReference::Make(std::string(name))); + + // For boundary adjustments, extract type info once + auto make_adjusted_literal = [&boundary](int adjustment) { + const auto& type = internal::checked_pointer_cast(boundary.type()); + Decimal adjusted = std::get(boundary.value()) + Decimal(adjustment); + return Literal::Decimal(adjusted.value(), type->precision(), type->scale()); + }; + + switch (predicate->op()) { + case Expression::Operation::kLt: { + // adjust closed and then transform ltEq + ICEBERG_ASSIGN_OR_RAISE(auto transformed, + func->Transform(make_adjusted_literal(-1))); + return UnboundPredicateImpl::Make( + Expression::Operation::kLtEq, std::move(ref), std::move(transformed)); + } + case Expression::Operation::kGt: { + // adjust closed and then transform gtEq + ICEBERG_ASSIGN_OR_RAISE(auto transformed, + func->Transform(make_adjusted_literal(1))); + return UnboundPredicateImpl::Make( + Expression::Operation::kGtEq, std::move(ref), std::move(transformed)); + } + default: + return GenericTransform(std::move(ref), predicate, func); + } + } + + static Result> TruncateStringLiteral( + std::string_view name, const std::shared_ptr& predicate, + const std::shared_ptr& func) { + const auto op = predicate->op(); + if (op != Expression::Operation::kStartsWith && + op != Expression::Operation::kNotStartsWith) { + return TruncateByteArray(name, predicate, func); + } + + const auto& truncate_transform = + internal::checked_pointer_cast(func); + const auto& str_value = std::get(predicate->literal().value()); + const auto width = truncate_transform->width(); + ICEBERG_ASSIGN_OR_RAISE(auto ref, NamedReference::Make(std::string(name))); + + if (StringUtils::CodePointCount(str_value) < width) { + return UnboundPredicateImpl::Make(op, std::move(ref), + predicate->literal()); + } + + if (StringUtils::CodePointCount(str_value) == width) { + if (op == Expression::Operation::kStartsWith) { + return UnboundPredicateImpl::Make( + Expression::Operation::kEq, std::move(ref), predicate->literal()); + } else { + return UnboundPredicateImpl::Make( + Expression::Operation::kNotEq, std::move(ref), predicate->literal()); + } + } + + if (op == Expression::Operation::kStartsWith) { + ICEBERG_ASSIGN_OR_RAISE(auto transformed, func->Transform(predicate->literal())); + return UnboundPredicateImpl::Make( + Expression::Operation::kStartsWith, std::move(ref), std::move(transformed)); + } + + return nullptr; + } + + // Fixes an inclusive projection to account for incorrectly transformed values. + // align with Java implementation: + // https://github.com/apache/iceberg/blob/1.10.x/api/src/main/java/org/apache/iceberg/transforms/ProjectionUtil.java#L275 + static Result> FixInclusiveTimeProjection( + std::unique_ptr> projected) { + if (projected == nullptr) { + return nullptr; + } + + // adjust the predicate for values that were 1 larger than the correct transformed + // value + switch (projected->op()) { + case Expression::Operation::kLt: { + ICEBERG_DCHECK(!projected->literals().empty(), "Expected at least one literal"); + const auto& literal = projected->literals().front(); + ICEBERG_DCHECK(std::holds_alternative(literal.value()), + "Expected int32_t"); + auto value = std::get(literal.value()); + if (value < 0) { + return UnboundPredicateImpl::Make(Expression::Operation::kLt, + std::move(projected->term()), + Literal::Int(value + 1)); + } + + return std::move(projected); + } + + case Expression::Operation::kLtEq: { + ICEBERG_DCHECK(!projected->literals().empty(), "Expected at least one literal"); + const auto& literal = projected->literals().front(); + ICEBERG_DCHECK(std::holds_alternative(literal.value()), + "Expected int32_t"); + auto value = std::get(literal.value()); + if (value < 0) { + return UnboundPredicateImpl::Make(Expression::Operation::kLtEq, + std::move(projected->term()), + Literal::Int(value + 1)); + } + return std::move(projected); + } + + case Expression::Operation::kGt: + case Expression::Operation::kGtEq: + // incorrect projected values are already greater than the bound for GT, GT_EQ + return std::move(projected); + + case Expression::Operation::kEq: { + ICEBERG_DCHECK(!projected->literals().empty(), "Expected at least one literal"); + const auto& literal = projected->literals().front(); + ICEBERG_DCHECK(std::holds_alternative(literal.value()), + "Expected int32_t"); + auto value = std::get(literal.value()); + if (value < 0) { + // match either the incorrect value (projectedValue + 1) or the correct value + // (projectedValue) + return UnboundPredicateImpl::Make( + Expression::Operation::kIn, std::move(projected->term()), + {literal, Literal::Int(value + 1)}); + } + return std::move(projected); + } + + case Expression::Operation::kIn: { + ICEBERG_DCHECK(!projected->literals().empty(), "Expected at least one literal"); + const auto& literals = projected->literals(); + ICEBERG_DCHECK( + std::ranges::all_of(literals, + [](const auto& lit) { + return std::holds_alternative(lit.value()); + }), + "Expected int32_t"); + std::unordered_set value_set; + bool has_negative_value = false; + for (const auto& lit : literals) { + auto value = std::get(lit.value()); + value_set.insert(value); + if (value < 0) { + value_set.insert(value + 1); + has_negative_value = true; + } + } + if (has_negative_value) { + auto values = + std::views::transform(value_set, + [](int32_t value) { return Literal::Int(value); }) | + std::ranges::to(); + return UnboundPredicateImpl::Make(Expression::Operation::kIn, + std::move(projected->term()), + std::move(values)); + } + return std::move(projected); + } + + case Expression::Operation::kNotIn: + case Expression::Operation::kNotEq: + // there is no inclusive projection for NOT_EQ and NOT_IN + return nullptr; + + default: + return std::move(projected); + } + } + + public: + static Result> IdentityProject( + std::string_view name, const std::shared_ptr& predicate) { + ICEBERG_ASSIGN_OR_RAISE(auto ref, NamedReference::Make(std::string(name))); + switch (predicate->kind()) { + case BoundPredicate::Kind::kUnary: { + return UnboundPredicateImpl::Make(predicate->op(), + std::move(ref)); + } + case BoundPredicate::Kind::kLiteral: { + const auto& literalPredicate = + internal::checked_pointer_cast(predicate); + return UnboundPredicateImpl::Make(predicate->op(), std::move(ref), + literalPredicate->literal()); + } + case BoundPredicate::Kind::kSet: { + const auto& setPredicate = + internal::checked_pointer_cast(predicate); + return UnboundPredicateImpl::Make( + predicate->op(), std::move(ref), + std::vector(setPredicate->literal_set().begin(), + setPredicate->literal_set().end())); + } + } + std::unreachable(); + } + + static Result> BucketProject( + std::string_view name, const std::shared_ptr& predicate, + const std::shared_ptr& func) { + ICEBERG_ASSIGN_OR_RAISE(auto ref, NamedReference::Make(std::string(name))); + switch (predicate->kind()) { + case BoundPredicate::Kind::kUnary: { + return UnboundPredicateImpl::Make(predicate->op(), + std::move(ref)); + } + case BoundPredicate::Kind::kLiteral: { + if (predicate->op() == Expression::Operation::kEq) { + const auto& literalPredicate = + internal::checked_pointer_cast(predicate); + ICEBERG_ASSIGN_OR_RAISE(auto transformed, + func->Transform(literalPredicate->literal())); + return UnboundPredicateImpl::Make( + predicate->op(), std::move(ref), std::move(transformed)); + } + break; + } + case BoundPredicate::Kind::kSet: { + // notIn can't be projected + if (predicate->op() == Expression::Operation::kIn) { + const auto& setPredicate = + internal::checked_pointer_cast(predicate); + return TransformSet(name, setPredicate, func); + } + break; + } + } + + // comparison predicates can't be projected, notEq can't be projected + // TODO(anyone): small ranges can be projected. + // for example, (x > 0) and (x < 3) can be turned into in({1, 2}) and projected. + return nullptr; + } + + static Result> TruncateProject( + std::string_view name, const std::shared_ptr& predicate, + const std::shared_ptr& func) { + ICEBERG_ASSIGN_OR_RAISE(auto ref, NamedReference::Make(std::string(name))); + // Handle unary predicates uniformly for all types + if (predicate->kind() == BoundPredicate::Kind::kUnary) { + return UnboundPredicateImpl::Make(predicate->op(), std::move(ref)); + } + + // Handle set predicates (kIn) uniformly for all types + if (predicate->kind() == BoundPredicate::Kind::kSet) { + if (predicate->op() == Expression::Operation::kIn) { + const auto& setPredicate = + internal::checked_pointer_cast(predicate); + return TransformSet(name, setPredicate, func); + } + return nullptr; + } + + // Handle literal predicates based on source type + const auto& literalPredicate = + internal::checked_pointer_cast(predicate); + + switch (func->source_type()->type_id()) { + case TypeId::kInt: + return TruncateInteger(name, literalPredicate, func); + case TypeId::kLong: + return TruncateInteger(name, literalPredicate, func); + case TypeId::kDecimal: + return TruncateDecimal(name, literalPredicate, func); + case TypeId::kString: + return TruncateStringLiteral(name, literalPredicate, func); + case TypeId::kBinary: + return TruncateByteArray(name, literalPredicate, func); + default: + return NotSupported("{} is not a valid input type for truncate transform", + func->source_type()->ToString()); + } + } + + static Result> TemporalProject( + std::string_view name, const std::shared_ptr& predicate, + const std::shared_ptr& func) { + ICEBERG_ASSIGN_OR_RAISE(auto ref, NamedReference::Make(std::string(name))); + if (predicate->kind() == BoundPredicate::Kind::kUnary) { + return UnboundPredicateImpl::Make(predicate->op(), std::move(ref)); + } else if (predicate->kind() == BoundPredicate::Kind::kLiteral) { + const auto& literalPredicate = + internal::checked_pointer_cast(predicate); + ICEBERG_ASSIGN_OR_RAISE(auto projected, + TransformTemporal(name, literalPredicate, func)); + if (func->transform_type() != TransformType::kDay || + func->source_type()->type_id() != TypeId::kDate) { + return FixInclusiveTimeProjection( + internal::checked_pointer_cast>( + std::move(projected))); + } + return projected; + } else if (predicate->kind() == BoundPredicate::Kind::kSet && + predicate->op() == Expression::Operation::kIn) { + const auto& setPredicate = + internal::checked_pointer_cast(predicate); + ICEBERG_ASSIGN_OR_RAISE(auto projected, TransformSet(name, setPredicate, func)); + if (func->transform_type() != TransformType::kDay || + func->source_type()->type_id() != TypeId::kDate) { + return FixInclusiveTimeProjection( + internal::checked_pointer_cast>( + std::move(projected))); + } + return projected; + } + + return nullptr; + } + + static Result> RemoveTransform( + std::string_view name, const std::shared_ptr& predicate) { + ICEBERG_ASSIGN_OR_RAISE(auto ref, NamedReference::Make(std::string(name))); + switch (predicate->kind()) { + case BoundPredicate::Kind::kUnary: { + return UnboundPredicateImpl::Make(predicate->op(), + std::move(ref)); + } + case BoundPredicate::Kind::kLiteral: { + const auto& literalPredicate = + internal::checked_pointer_cast(predicate); + return UnboundPredicateImpl::Make(predicate->op(), std::move(ref), + literalPredicate->literal()); + } + case BoundPredicate::Kind::kSet: { + const auto& setPredicate = + internal::checked_pointer_cast(predicate); + return UnboundPredicateImpl::Make( + predicate->op(), std::move(ref), + std::vector(setPredicate->literal_set().begin(), + setPredicate->literal_set().end())); + } + } + std::unreachable(); + } +}; + +} // namespace iceberg diff --git a/src/iceberg/util/string_util.h b/src/iceberg/util/string_util.h index 8eee314f5..de7780369 100644 --- a/src/iceberg/util/string_util.h +++ b/src/iceberg/util/string_util.h @@ -43,6 +43,17 @@ class ICEBERG_EXPORT StringUtils { return std::ranges::equal( lhs, rhs, [](char lc, char rc) { return std::tolower(lc) == std::tolower(rc); }); } + + /// \brief Count the number of code points in a UTF-8 string. + static size_t CodePointCount(std::string_view str) { + size_t count = 0; + for (char i : str) { + if ((i & 0xC0) != 0x80) { + count++; + } + } + return count; + } }; /// \brief Transparent hash function that supports std::string_view as lookup key