diff --git a/src/iceberg/CMakeLists.txt b/src/iceberg/CMakeLists.txt index a13f095aa..e21853ba4 100644 --- a/src/iceberg/CMakeLists.txt +++ b/src/iceberg/CMakeLists.txt @@ -41,6 +41,7 @@ set(ICEBERG_SOURCES partition_spec.cc row/arrow_array_wrapper.cc row/manifest_wrapper.cc + row/struct_like.cc schema.cc schema_field.cc schema_internal.cc diff --git a/src/iceberg/expression/term.cc b/src/iceberg/expression/term.cc index ba6e55ea3..34dfb918b 100644 --- a/src/iceberg/expression/term.cc +++ b/src/iceberg/expression/term.cc @@ -21,8 +21,8 @@ #include -#include "iceberg/exception.h" #include "iceberg/result.h" +#include "iceberg/row/struct_like.h" #include "iceberg/schema.h" #include "iceberg/transform.h" #include "iceberg/util/checked_cast.h" @@ -64,7 +64,11 @@ Result> NamedReference::Bind(const Schema& schem return InvalidExpression("Cannot find field '{}' in struct: {}", field_name_, schema.ToString()); } - return BoundReference::Make(field_opt.value().get()); + + int32_t field_id = field_opt.value().get().field_id(); + ICEBERG_ASSIGN_OR_RAISE(auto accessor, schema.GetAccessorById(field_id)); + + return BoundReference::Make(field_opt.value().get(), std::move(accessor)); } std::string NamedReference::ToString() const { @@ -72,17 +76,25 @@ std::string NamedReference::ToString() const { } // BoundReference implementation -Result> BoundReference::Make(SchemaField field) { +Result> BoundReference::Make( + SchemaField field, std::unique_ptr accessor) { if (auto status = field.Validate(); !status.has_value()) [[unlikely]] { return InvalidExpression("Cannot create BoundReference with invalid field: {}", status.error().message); } - return std::unique_ptr(new BoundReference(std::move(field))); + if (!accessor) [[unlikely]] { + return InvalidExpression("Cannot create BoundReference without accessor"); + } + return std::unique_ptr( + new BoundReference(std::move(field), std::move(accessor))); } -BoundReference::BoundReference(SchemaField field) : field_(std::move(field)) { +BoundReference::BoundReference(SchemaField field, + std::unique_ptr accessor) + : field_(std::move(field)), accessor_(std::move(accessor)) { ICEBERG_DCHECK(field_.Validate().has_value(), "Cannot create BoundReference with invalid field"); + ICEBERG_DCHECK(accessor_ != nullptr, "Cannot create BoundReference without accessor"); } BoundReference::~BoundReference() = default; @@ -92,7 +104,7 @@ std::string BoundReference::ToString() const { } Result BoundReference::Evaluate(const StructLike& data) const { - return NotImplemented("BoundReference::Evaluate(StructLike) not implemented"); + return accessor_->GetLiteral(data); } bool BoundReference::Equals(const BoundTerm& other) const { @@ -167,14 +179,14 @@ std::string BoundTransform::ToString() const { } Result BoundTransform::Evaluate(const StructLike& data) const { - throw IcebergError("BoundTransform::Evaluate(StructLike) not implemented"); + ICEBERG_ASSIGN_OR_RAISE(auto literal, ref_->Evaluate(data)); + return transform_func_->Transform(literal); } bool BoundTransform::MayProduceNull() const { // transforms must produce null for null input values // transforms may produce null for non-null inputs when not order-preserving - // FIXME: add Transform::is_order_preserving() - return ref_->MayProduceNull(); // || !transform_->is_order_preserving(); + return ref_->MayProduceNull() || !transform_->PreservesOrder(); } std::shared_ptr BoundTransform::type() const { diff --git a/src/iceberg/expression/term.h b/src/iceberg/expression/term.h index 6259b826e..e2a378feb 100644 --- a/src/iceberg/expression/term.h +++ b/src/iceberg/expression/term.h @@ -163,7 +163,8 @@ class ICEBERG_EXPORT BoundReference /// \brief Create a bound reference. /// /// \param field The schema field - static Result> Make(SchemaField field); + static Result> Make( + SchemaField field, std::unique_ptr accessor); ~BoundReference() override; @@ -186,9 +187,10 @@ class ICEBERG_EXPORT BoundReference Kind kind() const override { return Kind::kReference; } private: - explicit BoundReference(SchemaField field); + BoundReference(SchemaField field, std::unique_ptr accessor); SchemaField field_; + std::unique_ptr accessor_; }; /// \brief An unbound transform expression. diff --git a/src/iceberg/meson.build b/src/iceberg/meson.build index 1b24f85fb..b3433f485 100644 --- a/src/iceberg/meson.build +++ b/src/iceberg/meson.build @@ -63,6 +63,7 @@ iceberg_sources = files( 'partition_spec.cc', 'row/arrow_array_wrapper.cc', 'row/manifest_wrapper.cc', + 'row/struct_like.cc', 'schema.cc', 'schema_field.cc', 'schema_internal.cc', diff --git a/src/iceberg/row/struct_like.cc b/src/iceberg/row/struct_like.cc new file mode 100644 index 000000000..b0fb67fb4 --- /dev/null +++ b/src/iceberg/row/struct_like.cc @@ -0,0 +1,129 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/row/struct_like.h" + +#include + +#include "iceberg/result.h" +#include "iceberg/util/checked_cast.h" +#include "iceberg/util/formatter_internal.h" +#include "iceberg/util/macros.h" + +namespace iceberg { + +StructLikeAccessor::StructLikeAccessor(std::shared_ptr type, + std::span position_path) + : type_(std::move(type)) { + if (position_path.size() == 1) { + accessor_ = [pos = + position_path[0]](const StructLike& struct_like) -> Result { + return struct_like.GetField(pos); + }; + } else if (position_path.size() == 2) { + accessor_ = [pos0 = position_path[0], pos1 = position_path[1]]( + const StructLike& struct_like) -> Result { + ICEBERG_ASSIGN_OR_RAISE(auto first_level_field, struct_like.GetField(pos0)); + if (!std::holds_alternative>(first_level_field)) { + return InvalidSchema("Encountered non-struct in the position path [{},{}]", pos0, + pos1); + } + return std::get>(first_level_field)->GetField(pos1); + }; + } else if (!position_path.empty()) { + accessor_ = [position_path](const StructLike& struct_like) -> Result { + std::vector> backups; + const StructLike* current_struct_like = &struct_like; + for (size_t i = 0; i < position_path.size() - 1; ++i) { + ICEBERG_ASSIGN_OR_RAISE(auto field, + current_struct_like->GetField(position_path[i])); + if (!std::holds_alternative>(field)) { + return InvalidSchema("Encountered non-struct in the position path [{}]", + position_path); + } + backups.push_back(std::get>(field)); + current_struct_like = backups.back().get(); + } + return current_struct_like->GetField(position_path.back()); + }; + } else { + accessor_ = [](const StructLike&) -> Result { + return Invalid("Cannot read StructLike with empty position path"); + }; + } +} + +Result StructLikeAccessor::GetLiteral(const StructLike& struct_like) const { + if (!type_->is_primitive()) { + return NotSupported("Cannot get literal value for non-primitive type {}", + type_->ToString()); + } + + ICEBERG_ASSIGN_OR_RAISE(auto scalar, Get(struct_like)); + + if (std::holds_alternative(scalar)) { + return Literal::Null(internal::checked_pointer_cast(type_)); + } + + switch (type_->type_id()) { + case TypeId::kBoolean: + return Literal::Boolean(std::get(scalar)); + case TypeId::kInt: + return Literal::Int(std::get(scalar)); + case TypeId::kLong: + return Literal::Long(std::get(scalar)); + case TypeId::kFloat: + return Literal::Float(std::get(scalar)); + case TypeId::kDouble: + return Literal::Double(std::get(scalar)); + case TypeId::kString: + return Literal::String(std::string(std::get(scalar))); + case TypeId::kBinary: { + auto binary_data = std::get(scalar); + return Literal::Binary( + std::vector(binary_data.cbegin(), binary_data.cend())); + } + case TypeId::kDecimal: { + const auto& decimal_type = internal::checked_cast(*type_); + return Literal::Decimal(std::get(scalar).value(), decimal_type.precision(), + decimal_type.scale()); + } + case TypeId::kDate: + return Literal::Date(std::get(scalar)); + case TypeId::kTime: + return Literal::Time(std::get(scalar)); + case TypeId::kTimestamp: + return Literal::Timestamp(std::get(scalar)); + case TypeId::kTimestampTz: + return Literal::TimestampTz(std::get(scalar)); + case TypeId::kFixed: { + const auto& fixed_data = std::get(scalar); + return Literal::Fixed(std::vector(fixed_data.cbegin(), fixed_data.cend())); + } + case TypeId::kUuid: + // TODO(gangwu): Implement UUID type + default: + return NotSupported("Cannot convert scalar to literal of type {}", + type_->ToString()); + } + + std::unreachable(); +} + +} // namespace iceberg diff --git a/src/iceberg/row/struct_like.h b/src/iceberg/row/struct_like.h index 3093f752c..4999da69e 100644 --- a/src/iceberg/row/struct_like.h +++ b/src/iceberg/row/struct_like.h @@ -26,11 +26,13 @@ /// ManifestEntry. Note that they do not carry type information and should be /// used in conjunction with the schema to get the type information. +#include #include +#include #include #include -#include +#include "iceberg/expression/literal.h" #include "iceberg/result.h" #include "iceberg/type_fwd.h" #include "iceberg/util/decimal.h" @@ -96,4 +98,29 @@ class ICEBERG_EXPORT MapLike { virtual size_t size() const = 0; }; +/// \brief An accessor for a struct-like object. +class ICEBERG_EXPORT StructLikeAccessor { + public: + explicit StructLikeAccessor(std::shared_ptr type, + std::span position_path); + + /// \brief Get the scalar value at the given position. + Result Get(const StructLike& struct_like) const { + return accessor_(struct_like); + } + + /// \brief Get the literal value at the given position. + /// + /// \return The literal value at the given position, or an error if it is + /// not a primitive type. + Result GetLiteral(const StructLike& struct_like) const; + + /// \brief Get the type of the value that this accessor is bound to. + const Type& type() const { return *type_; } + + private: + std::shared_ptr type_; + std::function(const StructLike&)> accessor_; +}; + } // namespace iceberg diff --git a/src/iceberg/schema.cc b/src/iceberg/schema.cc index bfb47b306..8719f22b5 100644 --- a/src/iceberg/schema.cc +++ b/src/iceberg/schema.cc @@ -22,9 +22,12 @@ #include #include +#include "iceberg/result.h" +#include "iceberg/row/struct_like.h" #include "iceberg/schema_internal.h" #include "iceberg/type.h" #include "iceberg/util/formatter.h" // IWYU pragma: keep +#include "iceberg/util/formatter_internal.h" #include "iceberg/util/macros.h" #include "iceberg/util/visit_type.h" @@ -69,6 +72,48 @@ class NameToIdVisitor { std::function quoting_func_; }; +class PositionPathVisitor { + public: + Status Visit(const PrimitiveType& type) { + if (current_field_id_ == kUnassignedFieldId) { + return InvalidSchema("Current field id is not assigned, type: {}", type.ToString()); + } + + if (auto ret = position_path_.try_emplace(current_field_id_, current_path_); + !ret.second) { + return InvalidSchema("Duplicate field id found: {}, prev path: {}, curr path: {}", + current_field_id_, ret.first->second, current_path_); + } + + return {}; + } + + Status Visit(const StructType& type) { + for (size_t i = 0; i < type.fields().size(); ++i) { + const auto& field = type.fields()[i]; + current_field_id_ = field.field_id(); + current_path_.push_back(i); + ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*field.type(), this)); + current_path_.pop_back(); + } + return {}; + } + + // Non-struct types are not supported yet, but it is not an error. + Status Visit(const ListType& type) { return {}; } + Status Visit(const MapType& type) { return {}; } + + std::unordered_map> Finish() { + return std::move(position_path_); + } + + private: + constexpr static int32_t kUnassignedFieldId = -1; + int32_t current_field_id_ = kUnassignedFieldId; + std::vector current_path_; + std::unordered_map> position_path_; +}; + Schema::Schema(std::vector fields, std::optional schema_id) : StructType(std::move(fields)), schema_id_(schema_id) {} @@ -144,6 +189,27 @@ Result>> Schema::FindFie return it->second; } +Result>> Schema::InitIdToPositionPath( + const Schema& self) { + PositionPathVisitor visitor; + ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(self, &visitor)); + return visitor.Finish(); +} + +Result> Schema::GetAccessorById( + int32_t field_id) const { + ICEBERG_ASSIGN_OR_RAISE(auto id_to_position_path, id_to_position_path_.Get(*this)); + if (auto it = id_to_position_path.get().find(field_id); + it != id_to_position_path.get().cend()) { + ICEBERG_ASSIGN_OR_RAISE(auto field, FindFieldById(field_id)); + if (!field.has_value()) { + return NotFound("Cannot get accessor for field id: {}", field_id); + } + return std::make_unique(field.value().get().type(), it->second); + } + return NotFound("Cannot get accessor for field id: {}", field_id); +} + IdToFieldVisitor::IdToFieldVisitor( std::unordered_map>& id_to_field) : id_to_field_(id_to_field) {} diff --git a/src/iceberg/schema.h b/src/iceberg/schema.h index 32914bedc..94a8764dc 100644 --- a/src/iceberg/schema.h +++ b/src/iceberg/schema.h @@ -75,6 +75,12 @@ class ICEBERG_EXPORT Schema : public StructType { Result>> FindFieldById( int32_t field_id) const; + /// \brief Get the accessor to access the field by field id. + /// + /// \param field_id The id of the field to get the accessor for. + /// \return The accessor to access the field, or NotFound if the field is not found. + Result> GetAccessorById(int32_t field_id) const; + /// \brief Creates a projected schema from selected field names. /// /// \param names Selected field names and nested names are dot-concatenated. @@ -106,6 +112,8 @@ class ICEBERG_EXPORT Schema : public StructType { InitNameToIdMap(const Schema&); static Result>> InitLowerCaseNameToIdMap(const Schema&); + static Result>> InitIdToPositionPath( + const Schema&); const std::optional schema_id_; /// Mapping from field id to field. @@ -114,6 +122,8 @@ class ICEBERG_EXPORT Schema : public StructType { Lazy name_to_id_; /// Mapping from lowercased field name to field id Lazy lowercase_name_to_id_; + /// Mapping from field id to (nested) position path to access the field. + Lazy id_to_position_path_; }; } // namespace iceberg diff --git a/src/iceberg/test/CMakeLists.txt b/src/iceberg/test/CMakeLists.txt index 544e92f7c..6b90bb62b 100644 --- a/src/iceberg/test/CMakeLists.txt +++ b/src/iceberg/test/CMakeLists.txt @@ -140,6 +140,12 @@ if(ICEBERG_BUILD_BUNDLE) test_common.cc in_memory_catalog_test.cc) + add_iceberg_test(eval_expr_test + USE_BUNDLE + SOURCES + eval_expr_test.cc + test_common.cc) + add_iceberg_test(parquet_test USE_BUNDLE SOURCES diff --git a/src/iceberg/test/eval_expr_test.cc b/src/iceberg/test/eval_expr_test.cc new file mode 100644 index 000000000..880f1ffb6 --- /dev/null +++ b/src/iceberg/test/eval_expr_test.cc @@ -0,0 +1,245 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include +#include +#include +#include + +#include "iceberg/arrow_c_data.h" +#include "iceberg/arrow_c_data_guard_internal.h" +#include "iceberg/expression/expression.h" +#include "iceberg/expression/literal.h" +#include "iceberg/expression/term.h" +#include "iceberg/row/arrow_array_wrapper.h" +#include "iceberg/schema.h" +#include "iceberg/schema_internal.h" +#include "iceberg/test/matchers.h" +#include "iceberg/transform.h" +#include "iceberg/type.h" + +namespace iceberg { + +class BoundExpressionTest : public ::testing::Test { + protected: + void SetUp() override { + schema_ = std::make_unique(std::vector{ + SchemaField::MakeOptional(1, "id", int32()), + SchemaField::MakeOptional(2, "name", string()), + SchemaField::MakeRequired(3, "timestamp_field", timestamp()), + SchemaField::MakeRequired(4, "string_field", string())}); + + arrow_data_type_ = ::arrow::struct_( + {::arrow::field("id", ::arrow::int32()), ::arrow::field("name", ::arrow::utf8()), + ::arrow::field("timestamp_field", ::arrow::timestamp(::arrow::TimeUnit::MICRO)), + ::arrow::field("string_field", ::arrow::utf8())}); + + arrow_array_ = ::arrow::json::ArrayFromJSONString(arrow_data_type_, R"([ + {"id": 1, "name": "Alice", "timestamp_field": 1609459200000000, "string_field": "hello_world"}, + {"id": 2, "name": null, "timestamp_field": 1609459200000000, "string_field": "hello_world"} + ])") + .ValueOrDie(); + + ASSERT_TRUE(::arrow::ExportType(*arrow_data_type_, &arrow_c_schema_).ok()); + ASSERT_TRUE(::arrow::ExportArray(*arrow_array_, &arrow_c_array_).ok()); + } + + void TearDown() override { + if (arrow_c_schema_.release != nullptr) { + ArrowSchemaRelease(&arrow_c_schema_); + } + if (arrow_c_array_.release != nullptr) { + ArrowArrayRelease(&arrow_c_array_); + } + } + + std::unique_ptr schema_; + std::shared_ptr<::arrow::DataType> arrow_data_type_; + std::shared_ptr<::arrow::Array> arrow_array_; + ArrowSchema arrow_c_schema_; + ArrowArray arrow_c_array_; +}; + +TEST_F(BoundExpressionTest, EvaluateBoundReference) { + ICEBERG_UNWRAP_OR_FAIL(auto id_ref, NamedReference::Make("id")); + ICEBERG_UNWRAP_OR_FAIL(auto id_bound_ref, + id_ref->Bind(*schema_, /*case_sensitive=*/true)); + + ICEBERG_UNWRAP_OR_FAIL(auto name_ref, NamedReference::Make("name")); + ICEBERG_UNWRAP_OR_FAIL(auto name_bound_ref, + name_ref->Bind(*schema_, /*case_sensitive=*/true)); + + struct TestCase { + size_t row_index; + Literal expected_id; + Literal expected_name; + }; + + for (const auto& test_case : std::vector{ + {.row_index = 0, + .expected_id = Literal::Int(1), + .expected_name = Literal::String("Alice")}, + {.row_index = 1, + .expected_id = Literal::Int(2), + .expected_name = Literal::Null(string())}, + }) { + ICEBERG_UNWRAP_OR_FAIL( + auto struct_like, + ArrowArrayStructLike::Make(arrow_c_schema_, arrow_c_array_, test_case.row_index)); + + ICEBERG_UNWRAP_OR_FAIL(auto id_literal, id_bound_ref->Evaluate(*struct_like)); + EXPECT_EQ(id_literal, test_case.expected_id); + + ICEBERG_UNWRAP_OR_FAIL(auto name_literal, name_bound_ref->Evaluate(*struct_like)); + if (test_case.expected_name.IsNull()) { + EXPECT_TRUE(name_literal.IsNull()); + } else { + EXPECT_EQ(name_literal, test_case.expected_name); + } + } +} + +TEST_F(BoundExpressionTest, IdentityTransform) { + ICEBERG_UNWRAP_OR_FAIL(auto name_ref, NamedReference::Make("name")); + ICEBERG_UNWRAP_OR_FAIL( + auto name_transform, + UnboundTransform::Make(std::move(name_ref), Transform::Identity())); + ICEBERG_UNWRAP_OR_FAIL(auto bound_transform, + name_transform->Bind(*schema_, /*case_sensitive=*/true)); + + struct TestCase { + size_t row_index; + Literal expected_name; + }; + + for (const auto& test_case : std::vector{ + {.row_index = 0, .expected_name = Literal::String("Alice")}, + {.row_index = 1, .expected_name = Literal::Null(string())}, + }) { + ICEBERG_UNWRAP_OR_FAIL( + auto struct_like, + ArrowArrayStructLike::Make(arrow_c_schema_, arrow_c_array_, test_case.row_index)); + ICEBERG_UNWRAP_OR_FAIL(auto result, bound_transform->Evaluate(*struct_like)); + if (test_case.expected_name.IsNull()) { + EXPECT_TRUE(result.IsNull()); + } else { + EXPECT_EQ(result, test_case.expected_name); + } + } +} + +TEST_F(BoundExpressionTest, YearTransform) { + // Create and bind year transform + ICEBERG_UNWRAP_OR_FAIL(auto timestamp_ref, NamedReference::Make("timestamp_field")); + ICEBERG_UNWRAP_OR_FAIL( + auto unbound_transform, + UnboundTransform::Make(std::move(timestamp_ref), Transform::Year())); + ICEBERG_UNWRAP_OR_FAIL(auto bound_transform, + unbound_transform->Bind(*schema_, /*case_sensitive=*/true)); + + // Test data: 2021-01-01 00:00:00 UTC = 1609459200000000 microseconds + ICEBERG_UNWRAP_OR_FAIL(auto struct_like, + ArrowArrayStructLike::Make(arrow_c_schema_, arrow_c_array_, 0)); + + // Evaluate (2021) + ICEBERG_UNWRAP_OR_FAIL(auto result, bound_transform->Evaluate(*struct_like)); + EXPECT_FALSE(result.IsNull()); + EXPECT_EQ(std::get(result.value()), 2021); // Year value +} + +TEST_F(BoundExpressionTest, MonthTransform) { + // Create and bind month transform + ICEBERG_UNWRAP_OR_FAIL(auto timestamp_ref, NamedReference::Make("timestamp_field")); + ICEBERG_UNWRAP_OR_FAIL( + auto unbound_transform, + UnboundTransform::Make(std::move(timestamp_ref), Transform::Month())); + ICEBERG_UNWRAP_OR_FAIL(auto bound_transform, + unbound_transform->Bind(*schema_, /*case_sensitive=*/true)); + + // Test data: 2021-01-01 + ICEBERG_UNWRAP_OR_FAIL(auto struct_like, + ArrowArrayStructLike::Make(arrow_c_schema_, arrow_c_array_, 0)); + + // Evaluate (2021-01) + ICEBERG_UNWRAP_OR_FAIL(auto result, bound_transform->Evaluate(*struct_like)); + EXPECT_FALSE(result.IsNull()); + EXPECT_EQ(std::get(result.value()), 612); // Months since 1970-01 +} + +TEST_F(BoundExpressionTest, DayTransform) { + // Create and bind day transform + ICEBERG_UNWRAP_OR_FAIL(auto timestamp_ref, NamedReference::Make("timestamp_field")); + ICEBERG_UNWRAP_OR_FAIL( + auto unbound_transform, + UnboundTransform::Make(std::move(timestamp_ref), Transform::Day())); + ICEBERG_UNWRAP_OR_FAIL(auto bound_transform, + unbound_transform->Bind(*schema_, /*case_sensitive=*/true)); + + // Test data: 2021-01-01 + ICEBERG_UNWRAP_OR_FAIL(auto struct_like, + ArrowArrayStructLike::Make(arrow_c_schema_, arrow_c_array_, 0)); + + // Evaluate + ICEBERG_UNWRAP_OR_FAIL(auto result, bound_transform->Evaluate(*struct_like)); + EXPECT_FALSE(result.IsNull()); + EXPECT_EQ(std::get(result.value()), 18628); // Days since 1970-01-01 +} + +TEST_F(BoundExpressionTest, BucketTransform) { + // Create and bind bucket[4] transform + ICEBERG_UNWRAP_OR_FAIL(auto string_ref, NamedReference::Make("string_field")); + ICEBERG_UNWRAP_OR_FAIL( + auto unbound_transform, + UnboundTransform::Make(std::move(string_ref), Transform::Bucket(4))); + ICEBERG_UNWRAP_OR_FAIL(auto bound_transform, + unbound_transform->Bind(*schema_, /*case_sensitive=*/true)); + + // Test data: "hello_world" + ICEBERG_UNWRAP_OR_FAIL(auto struct_like, + ArrowArrayStructLike::Make(arrow_c_schema_, arrow_c_array_, 0)); + + // Evaluate - verify result is in range [0, 3] + ICEBERG_UNWRAP_OR_FAIL(auto result, bound_transform->Evaluate(*struct_like)); + EXPECT_FALSE(result.IsNull()); + auto bucket_value = std::get(result.value()); + EXPECT_GE(bucket_value, 0); + EXPECT_LT(bucket_value, 4); +} + +TEST_F(BoundExpressionTest, TruncateTransform) { + // Create and bind truncate[5] transform + ICEBERG_UNWRAP_OR_FAIL(auto string_ref, NamedReference::Make("string_field")); + ICEBERG_UNWRAP_OR_FAIL( + auto unbound_transform, + UnboundTransform::Make(std::move(string_ref), Transform::Truncate(5))); + ICEBERG_UNWRAP_OR_FAIL(auto bound_transform, + unbound_transform->Bind(*schema_, /*case_sensitive=*/true)); + + // Test data: "hello_world" + ICEBERG_UNWRAP_OR_FAIL(auto struct_like, + ArrowArrayStructLike::Make(arrow_c_schema_, arrow_c_array_, 0)); + + // Evaluate - "hello_world" truncated to 5 chars = "hello" + ICEBERG_UNWRAP_OR_FAIL(auto result, bound_transform->Evaluate(*struct_like)); + EXPECT_FALSE(result.IsNull()); + EXPECT_EQ(std::get(result.value()), "hello"); +} + +} // namespace iceberg diff --git a/src/iceberg/test/struct_like_test.cc b/src/iceberg/test/struct_like_test.cc index b18ab8c0f..3683ed225 100644 --- a/src/iceberg/test/struct_like_test.cc +++ b/src/iceberg/test/struct_like_test.cc @@ -20,6 +20,7 @@ #include #include #include +#include #include #include "iceberg/arrow_c_data_guard_internal.h" @@ -27,8 +28,10 @@ #include "iceberg/manifest_reader_internal.h" #include "iceberg/row/arrow_array_wrapper.h" #include "iceberg/row/manifest_wrapper.h" +#include "iceberg/schema.h" #include "iceberg/schema_internal.h" #include "iceberg/test/matchers.h" +#include "iceberg/type.h" namespace iceberg { @@ -386,4 +389,52 @@ TEST(ArrowArrayStructLike, PrimitiveMap) { } } +TEST(ArrowArrayStructLike, Accessor) { + Schema schema{std::vector{ + SchemaField::MakeOptional(1, "c1", int32()), + SchemaField::MakeOptional( + 2, "c2", + struct_({ + SchemaField::MakeOptional(3, "c3", int32()), + SchemaField::MakeOptional(4, "c4", + struct_({ + SchemaField::MakeOptional(5, "c5", int32()), + })), + })), + }}; + + auto arrow_schema = ::arrow::struct_({ + ::arrow::field("c1", ::arrow::int32()), + ::arrow::field("c2", + ::arrow::struct_({ + ::arrow::field("c3", ::arrow::int32()), + ::arrow::field("c4", ::arrow::struct_({ + ::arrow::field("c5", ::arrow::int32()), + })), + })), + }); + + auto arrow_array = + ::arrow::json::ArrayFromJSONString( + arrow_schema, R"([ {"c1": 1, "c2": {"c3": 3, "c4": {"c5": 5}}} ])") + .ValueOrDie(); + + ArrowSchema c_schema; + ArrowArray c_array; + internal::ArrowSchemaGuard schema_guard(&c_schema); + internal::ArrowArrayGuard array_guard(&c_array); + ASSERT_TRUE(::arrow::ExportType(*arrow_schema, &c_schema).ok()); + ASSERT_TRUE(::arrow::ExportArray(*arrow_array, &c_array).ok()); + + ICEBERG_UNWRAP_OR_FAIL(auto struct_like, ArrowArrayStructLike::Make(c_schema, c_array)); + + // Test nested accessors from 1 to 3 levels deep + for (int32_t field_id : {1, 3, 5}) { + ICEBERG_UNWRAP_OR_FAIL(auto accessor, schema.GetAccessorById(field_id)); + ICEBERG_UNWRAP_OR_FAIL(auto scalar, accessor->Get(*struct_like)); + ASSERT_TRUE(std::holds_alternative(scalar)); + EXPECT_EQ(std::get(scalar), field_id); + } +} + } // namespace iceberg diff --git a/src/iceberg/type_fwd.h b/src/iceberg/type_fwd.h index 5485d83fe..79b43f5fd 100644 --- a/src/iceberg/type_fwd.h +++ b/src/iceberg/type_fwd.h @@ -144,9 +144,10 @@ struct WriterOptions; class Reader; class Writer; -class StructLike; class ArrayLike; class MapLike; +class StructLike; +class StructLikeAccessor; class TableUpdate; class TableRequirement;