From 79d13fe5cb3ffa74ab4b10466840f56d8e9293d0 Mon Sep 17 00:00:00 2001 From: nullccxsy <32149055912@qq.com> Date: Wed, 3 Sep 2025 18:12:42 +0800 Subject: [PATCH 01/10] feat: implement schema selection and projection methods - Added select and project methods to the Schema class for creating projection schemas based on specified field names or IDs. - Introduced PruneColumnVisitor to handle the logic for selecting and projecting fields, including support for nested structures. --- src/iceberg/schema.cc | 255 +++++++++++++ src/iceberg/schema.h | 35 +- test/schema_test.cc | 838 +++++++++++++++++++++++++++++++++++++++--- 3 files changed, 1063 insertions(+), 65 deletions(-) diff --git a/src/iceberg/schema.cc b/src/iceberg/schema.cc index 1df20c60b..ddffb8a83 100644 --- a/src/iceberg/schema.cc +++ b/src/iceberg/schema.cc @@ -260,4 +260,259 @@ void NameToIdVisitor::Finish() { } } +class PruneColumnVisitor { + public: + explicit PruneColumnVisitor(const std::unordered_set& selected_ids, + bool select_full_types = false); + Status Visit(const ListType& type); + Status Visit(const MapType& type); + Status Visit(const StructType& type); + Status Visit(const PrimitiveType& type); + std::shared_ptr GetResult() const; + void SetResult(std::shared_ptr result); + Status ProjectList(const SchemaField& element, + std::shared_ptr element_result); + Status ProjectMap(const SchemaField& key_field, const SchemaField& value_field, + std::shared_ptr value_result); + + private: + const std::unordered_set& selected_ids_; + bool select_full_types_; + std::shared_ptr result_; +}; + +Result> Schema::select( + const std::vector& names, bool case_sensitive) const { + return internalSelect(names, case_sensitive); +} + +Result> Schema::select( + const std::initializer_list& names, bool case_sensitive) const { + return internalSelect(std::vector(names), case_sensitive); +} + +Result> Schema::internalSelect( + const std::vector& names, bool case_sensitive) const { + const std::string ALL_COLUMNS = "*"; + if (std::ranges::find(names, ALL_COLUMNS) != names.end()) { + return shared_from_this(); + } + + std::unordered_set selected_ids; + for (const auto& name : names) { + ICEBERG_ASSIGN_OR_RAISE(auto result, FindFieldByName(name, case_sensitive)); + if (result.has_value()) { + selected_ids.insert(result.value().get().field_id()); + } + } + + PruneColumnVisitor visitor(selected_ids, /*select_full_types=*/true); + ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*this, &visitor)); + + auto projected_type = visitor.GetResult(); + if (!projected_type) { + return std::make_shared(std::vector{}, schema_id_); + } + + if (projected_type->type_id() != TypeId::kStruct) { + return InvalidSchema("Projected type must be a struct type"); + } + + const auto& projected_struct = + internal::checked_cast(*projected_type); + + std::vector fields_vec(projected_struct.fields().begin(), + projected_struct.fields().end()); + return std::make_shared(std::move(fields_vec), schema_id_); +} + +Result> Schema::project( + std::unordered_set& selected_ids) const { + PruneColumnVisitor visitor(selected_ids, /*select_full_types=*/false); + ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*this, &visitor)); + + auto projected_type = visitor.GetResult(); + if (!projected_type) { + return std::make_shared(std::vector{}, schema_id_); + } + + if (projected_type->type_id() != TypeId::kStruct) { + return InvalidSchema("Projected type must be a struct type"); + } + + const auto& projected_struct = + internal::checked_cast(*projected_type); + std::vector fields_vec(projected_struct.fields().begin(), + projected_struct.fields().end()); + return std::make_shared(std::move(fields_vec), schema_id_); +} + +PruneColumnVisitor::PruneColumnVisitor(const std::unordered_set& selected_ids, + bool select_full_types) + : selected_ids_(selected_ids), select_full_types_(select_full_types) {} + +std::shared_ptr PruneColumnVisitor::GetResult() const { return result_; } + +void PruneColumnVisitor::SetResult(std::shared_ptr result) { + result_ = std::move(result); +} + +Status PruneColumnVisitor::Visit(const StructType& type) { + std::vector> selected_types; + const auto& fields = type.fields(); + for (const auto& field : fields) { + PruneColumnVisitor field_visitor(selected_ids_, select_full_types_); + ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*field.type(), &field_visitor)); + auto result = field_visitor.GetResult(); + if (selected_ids_.contains(field.field_id())) { + // select + if (select_full_types_) { + selected_types.emplace_back(field.type()); + } else if (field.type()->type_id() == TypeId::kStruct) { + // project(kstruct) + if (!result) { + result = std::make_shared(std::vector{}); + } + selected_types.emplace_back(std::move(result)); + } else { + // project(list, map, primitive) + if (!field.type()->is_primitive()) { + return InvalidArgument( + "Cannot explicitly project List or Map types, {}:{} of type {} was " + "selected", + field.field_id(), field.name(), field.type()->ToString()); + } + selected_types.emplace_back(field.type()); + } + } else if (result) { + // project, select + selected_types.emplace_back(std::move(result)); + } else { + // project, select + selected_types.emplace_back(nullptr); + } + } + + bool same_types = true; + std::vector selected_fields; + for (size_t i = 0; i < fields.size(); i++) { + if (fields[i].type() == selected_types[i]) { + selected_fields.emplace_back(fields[i]); + } else if (selected_types[i]) { + same_types = false; + selected_fields.emplace_back(fields[i].field_id(), std::string(fields[i].name()), + std::const_pointer_cast(selected_types[i]), + fields[i].optional(), std::string(fields[i].doc())); + } + } + + if (!selected_fields.empty()) { + if (selected_fields.size() == fields.size() && same_types) { + result_ = std::make_shared(type); + } else { + result_ = std::make_shared(std::move(selected_fields)); + } + } + + return {}; +} + +Status PruneColumnVisitor::Visit(const ListType& type) { + const auto& element_field = type.fields()[0]; + + PruneColumnVisitor element_visitor(selected_ids_, select_full_types_); + ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*element_field.type(), &element_visitor)); + + auto element_result = element_visitor.GetResult(); + + if (selected_ids_.contains(element_field.field_id())) { + if (select_full_types_) { + result_ = std::make_shared(element_field); + } else if (element_field.type()->type_id() == TypeId::kStruct) { + ICEBERG_RETURN_UNEXPECTED(ProjectList(element_field, element_result)); + } else { + if (!element_field.type()->is_primitive()) { + return InvalidArgument( + "Cannot explicitly project List or Map types, List element {} of type {} was " + "selected", + element_field.field_id(), element_field.name()); + } + result_ = std::make_shared(element_field); + } + } else if (element_result) { + ICEBERG_RETURN_UNEXPECTED(ProjectList(element_field, element_result)); + } + + return {}; +} + +Status PruneColumnVisitor::Visit(const MapType& type) { + const auto& key_field = type.fields()[0]; + const auto& value_field = type.fields()[1]; + + PruneColumnVisitor key_visitor(selected_ids_, select_full_types_); + ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*key_field.type(), &key_visitor)); + auto key_result = key_visitor.GetResult(); + + PruneColumnVisitor value_visitor(selected_ids_, select_full_types_); + ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*value_field.type(), &value_visitor)); + auto value_result = value_visitor.GetResult(); + + if (selected_ids_.contains(value_field.field_id())) { + if (select_full_types_) { + result_ = std::make_shared(type); + } else if (value_field.type()->type_id() == TypeId::kStruct) { + ICEBERG_RETURN_UNEXPECTED(ProjectMap(key_field, value_field, value_result)); + } else { + if (!value_field.type()->is_primitive()) { + return InvalidArgument( + "Cannot explicitly project List or Map types, Map value {} of type {} was " + "selected", + value_field.field_id(), type.ToString()); + } + result_ = std::make_shared(type); + } + } else if (value_result) { + ICEBERG_RETURN_UNEXPECTED(ProjectMap(key_field, value_field, value_result)); + } else if (selected_ids_.contains(key_field.field_id())) { + result_ = std::make_shared(type); + } + + return {}; +} + +Status PruneColumnVisitor::Visit(const PrimitiveType& type) { return {}; } + +Status PruneColumnVisitor::ProjectList(const SchemaField& element_field, + std::shared_ptr element_result) { + if (!element_result) { + return InvalidArgument("Cannot project a list when the element result is null"); + } + if (element_field.type() == element_result) { + result_ = std::make_shared(element_field); + } else { + result_ = std::make_shared(element_field.field_id(), + std::const_pointer_cast(element_result), + element_field.optional()); + } + return {}; +} + +Status PruneColumnVisitor::ProjectMap(const SchemaField& key_field, + const SchemaField& value_field, + std::shared_ptr value_result) { + if (!value_result) { + return InvalidArgument("Attempted to project a map without a defined map value type"); + } + if (value_field.type() == value_result) { + result_ = std::make_shared(key_field, value_field); + } else { + result_ = std::make_shared( + key_field, + SchemaField(value_field.field_id(), std::string(value_field.name()), + std::const_pointer_cast(value_result), value_field.optional())); + } + return {}; +} + } // namespace iceberg diff --git a/src/iceberg/schema.h b/src/iceberg/schema.h index 260d9d342..9a40fb09a 100644 --- a/src/iceberg/schema.h +++ b/src/iceberg/schema.h @@ -27,6 +27,7 @@ #include #include #include +#include #include #include "iceberg/iceberg_export.h" @@ -42,7 +43,8 @@ namespace iceberg { /// A schema is a list of typed columns, along with a unique integer ID. A /// Table may have different schemas over its lifetime due to schema /// evolution. -class ICEBERG_EXPORT Schema : public StructType { +class ICEBERG_EXPORT Schema : public StructType, + public std::enable_shared_from_this { public: static constexpr int32_t kInitialSchemaId = 0; @@ -53,9 +55,9 @@ class ICEBERG_EXPORT Schema : public StructType { /// /// A schema is identified by a unique ID for the purposes of schema /// evolution. - [[nodiscard]] std::optional schema_id() const; + std::optional schema_id() const; - [[nodiscard]] std::string ToString() const override; + std::string ToString() const override; /// \brief Find the SchemaField by field name. /// @@ -66,12 +68,31 @@ class ICEBERG_EXPORT Schema : public StructType { /// canonical name 'm.value.x' /// FIXME: Currently only handles ASCII lowercase conversion; extend to support /// non-ASCII characters (e.g., using std::towlower or ICU) - [[nodiscard]] Result>> - FindFieldByName(std::string_view name, bool case_sensitive = true) const; + Result>> FindFieldByName( + std::string_view name, bool case_sensitive = true) const; /// \brief Find the SchemaField by field id. - [[nodiscard]] Result>> - FindFieldById(int32_t field_id) const; + Result>> FindFieldById( + int32_t field_id) const; + + /// \brief Creates a projection schema for a subset of columns, selected by name. + Result> select(const std::vector& names, + bool case_sensitive = true) const; + + /// \brief Creates a projection schema for a subset of columns, selected by name. + Result> select( + const std::initializer_list& names, bool case_sensitive = true) const; + + /// \brief Creates a projection schema for a subset of columns, selected by name. + template + Result> select(Args&&... names, + bool case_sensitive = true) const { + static_assert((std::is_convertible_v && ...), + "All arguments must be convertible to std::string"); + return select({std::string(names)...}, case_sensitive); + } + + Result> project(std::unordered_set& ids) const; friend bool operator==(const Schema& lhs, const Schema& rhs) { return lhs.Equals(rhs); } diff --git a/test/schema_test.cc b/test/schema_test.cc index b01ffe9ba..34350551f 100644 --- a/test/schema_test.cc +++ b/test/schema_test.cc @@ -26,6 +26,7 @@ #include #include +#include "iceberg/result.h" #include "iceberg/schema_field.h" #include "iceberg/util/formatter.h" // IWYU pragma: keep #include "matchers.h" @@ -492,70 +493,791 @@ TEST(SchemaTest, NestedDuplicateFieldIdError) { ::testing::HasSubstr("Duplicate field id found: 1")); } -// Thread safety tests for Lazy Init -class SchemaThreadSafetyTest : public ::testing::Test { - protected: - void SetUp() override { - field1_ = std::make_unique(1, "id", iceberg::int32(), true); - field2_ = std::make_unique(2, "name", iceberg::string(), true); - field3_ = std::make_unique(3, "age", iceberg::int32(), true); - schema_ = std::make_unique( - std::vector{*field1_, *field2_, *field3_}, 100); - } - - std::unique_ptr schema_; - std::unique_ptr field1_; - std::unique_ptr field2_; - std::unique_ptr field3_; +struct SelectTestParam { + std::string test_name; + std::function()> create_schema; + std::vector select_fields; + std::function()> expected_schema; + bool should_succeed; + std::string expected_error_message; + bool case_sensitive = true; }; -TEST_F(SchemaThreadSafetyTest, ConcurrentFindFieldById) { - const int num_threads = 10; - const int iterations_per_thread = 100; - std::vector threads; - - for (int i = 0; i < num_threads; ++i) { - threads.emplace_back([this, iterations_per_thread]() { - for (int j = 0; j < iterations_per_thread; ++j) { - ASSERT_THAT(schema_->FindFieldById(1), ::testing::Optional(*field1_)); - ASSERT_THAT(schema_->FindFieldById(999), ::testing::Optional(std::nullopt)); - } - }); - } +class SelectParamTest : public ::testing::TestWithParam {}; + +TEST_P(SelectParamTest, SelectFields) { + const auto& param = GetParam(); + auto input_schema = param.create_schema(); + + auto result = input_schema->select(param.select_fields, param.case_sensitive); + + if (param.should_succeed) { + ASSERT_TRUE(result.has_value()) << "Select should succeed for: " << param.test_name; - for (auto& thread : threads) { - thread.join(); + auto actual_schema = result.value(); + auto expected_schema = param.expected_schema(); + ASSERT_EQ(*actual_schema, *expected_schema) + << "Schema mismatch for: " << param.test_name; + } else { + ASSERT_FALSE(result.has_value()) << "Select should fail for: " << param.test_name; + ASSERT_THAT(result, iceberg::IsError(iceberg::ErrorKind::kInvalidArgument)) + << "Should return InvalidArgument error for: " << param.test_name; + + ASSERT_THAT(result, iceberg::HasErrorMessage(param.expected_error_message)) + << "Error message mismatch for: " << param.test_name; } } -TEST_F(SchemaThreadSafetyTest, MixedConcurrentOperations) { - const int num_threads = 8; - const int iterations_per_thread = 50; - std::vector threads; - - for (int i = 0; i < num_threads; ++i) { - threads.emplace_back([this, iterations_per_thread, i]() { - for (int j = 0; j < iterations_per_thread; ++j) { - if (i % 4 == 0) { - ASSERT_THAT(schema_->FindFieldById(1), ::testing::Optional(*field1_)); - } else if (i % 4 == 1) { - ASSERT_THAT(schema_->FindFieldByName("name", true), - ::testing::Optional(*field2_)); - } else if (i % 4 == 2) { - ASSERT_THAT(schema_->FindFieldByName("AGE", false), - ::testing::Optional(*field3_)); - } else { - ASSERT_THAT(schema_->FindFieldById(2), ::testing::Optional(*field2_)); - ASSERT_THAT(schema_->FindFieldByName("id", true), - ::testing::Optional(*field1_)); - ASSERT_THAT(schema_->FindFieldByName("age", false), - ::testing::Optional(*field3_)); - } - } - }); - } +INSTANTIATE_TEST_SUITE_P( + SelectTestCases, SelectParamTest, + ::testing::Values( + SelectTestParam{.test_name = "SelectAllColumns", + .create_schema = + []() { + auto field1 = std::make_unique( + 1, "id", iceberg::int32(), false); + auto field2 = std::make_unique( + 2, "name", iceberg::string(), true); + auto field3 = std::make_unique( + 3, "age", iceberg::int32(), true); + auto field4 = std::make_unique( + 4, "email", iceberg::string(), true); + return std::make_shared( + std::vector{*field1, *field2, + *field3, *field4}, + 100); + }, + .select_fields = {"*"}, + .expected_schema = + []() { + auto field1 = std::make_unique( + 1, "id", iceberg::int32(), false); + auto field2 = std::make_unique( + 2, "name", iceberg::string(), true); + auto field3 = std::make_unique( + 3, "age", iceberg::int32(), true); + auto field4 = std::make_unique( + 4, "email", iceberg::string(), true); + return std::make_shared( + std::vector{*field1, *field2, + *field3, *field4}, + 100); + }, + .should_succeed = true}, + + SelectTestParam{.test_name = "SelectSingleField", + .create_schema = + []() { + auto field1 = std::make_unique( + 1, "id", iceberg::int32(), false); + auto field2 = std::make_unique( + 2, "name", iceberg::string(), true); + auto field3 = std::make_unique( + 3, "age", iceberg::int32(), true); + auto field4 = std::make_unique( + 4, "email", iceberg::string(), true); + return std::make_shared( + std::vector{*field1, *field2, + *field3, *field4}, + 100); + }, + .select_fields = {"name"}, + .expected_schema = + []() { + auto field = std::make_unique( + 2, "name", iceberg::string(), true); + return std::make_shared( + std::vector{*field}, 100); + }, + .should_succeed = true}, + + SelectTestParam{ + .test_name = "SelectMultipleFields", + .create_schema = + []() { + auto field1 = std::make_unique( + 1, "id", iceberg::int32(), false); + auto field2 = std::make_unique( + 2, "name", iceberg::string(), true); + auto field3 = std::make_unique( + 3, "age", iceberg::int32(), true); + auto field4 = std::make_unique( + 4, "email", iceberg::string(), true); + return std::make_shared( + std::vector{*field1, *field2, *field3, + *field4}, + 100); + }, + .select_fields = {"id", "name", "age"}, + .expected_schema = + []() { + auto field1 = std::make_unique( + 1, "id", iceberg::int32(), false); + auto field2 = std::make_unique( + 2, "name", iceberg::string(), true); + auto field3 = std::make_unique( + 3, "age", iceberg::int32(), true); + return std::make_shared( + std::vector{*field1, *field2, *field3}, 100); + }, + .should_succeed = true}, + + SelectTestParam{.test_name = "SelectNonExistentField", + .create_schema = + []() { + auto field1 = std::make_unique( + 1, "id", iceberg::int32(), false); + auto field2 = std::make_unique( + 2, "name", iceberg::string(), true); + auto field3 = std::make_unique( + 3, "age", iceberg::int32(), true); + auto field4 = std::make_unique( + 4, "email", iceberg::string(), true); + return std::make_shared( + std::vector{*field1, *field2, + *field3, *field4}, + 100); + }, + .select_fields = {"nonexistent"}, + .expected_schema = + []() { + return std::make_shared( + std::vector{}, 100); + }, + .should_succeed = true}, + + SelectTestParam{.test_name = "SelectCaseSensitive", + .create_schema = + []() { + auto field1 = std::make_unique( + 1, "id", iceberg::int32(), false); + auto field2 = std::make_unique( + 2, "name", iceberg::string(), true); + auto field3 = std::make_unique( + 3, "age", iceberg::int32(), true); + auto field4 = std::make_unique( + 4, "email", iceberg::string(), true); + return std::make_shared( + std::vector{*field1, *field2, + *field3, *field4}, + 100); + }, + .select_fields = {"Name"}, // case-sensitive + .expected_schema = + []() { + return std::make_shared( + std::vector{}, 100); + }, + .should_succeed = true}, + + SelectTestParam{.test_name = "SelectCaseInsensitive", + .create_schema = + []() { + auto field1 = std::make_unique( + 1, "id", iceberg::int32(), false); + auto field2 = std::make_unique( + 2, "name", iceberg::string(), true); + auto field3 = std::make_unique( + 3, "age", iceberg::int32(), true); + auto field4 = std::make_unique( + 4, "email", iceberg::string(), true); + return std::make_shared( + std::vector{*field1, *field2, + *field3, *field4}, + 100); + }, + .select_fields = {"Name"}, // case-insensitive + .expected_schema = + []() { + auto field = std::make_unique( + 2, "name", iceberg::string(), true); + return std::make_shared( + std::vector{*field}, 100); + }, + .should_succeed = true, + .case_sensitive = false})); + +INSTANTIATE_TEST_SUITE_P( + SelectNestedTestCases, SelectParamTest, + ::testing::Values( + SelectTestParam{ + .test_name = "SelectTopLevelFields", + .create_schema = + []() { + auto nested_field1 = std::make_unique( + 1, "street", iceberg::string(), true); + auto nested_field2 = std::make_unique( + 2, "city", iceberg::string(), true); + auto nested_field3 = std::make_unique( + 3, "zip", iceberg::int32(), true); + auto address_type = std::make_shared( + std::vector{*nested_field1, *nested_field2, + *nested_field3}); + auto field1 = std::make_unique( + 4, "id", iceberg::int32(), false); + auto field2 = std::make_unique( + 5, "name", iceberg::string(), true); + auto field3 = std::make_unique( + 6, "address", address_type, true); + return std::make_shared( + std::vector{*field1, *field2, *field3}, 200); + }, + .select_fields = {"id", "name"}, + .expected_schema = + []() { + auto field1 = std::make_unique( + 4, "id", iceberg::int32(), false); + auto field2 = std::make_unique( + 5, "name", iceberg::string(), true); + return std::make_shared( + std::vector{*field1, *field2}, 200); + }, + .should_succeed = true}, + + SelectTestParam{ + .test_name = "SelectNestedField", + .create_schema = + []() { + auto nested_field1 = std::make_unique( + 1, "street", iceberg::string(), true); + auto nested_field2 = std::make_unique( + 2, "city", iceberg::string(), true); + auto nested_field3 = std::make_unique( + 3, "zip", iceberg::int32(), true); + auto address_type = std::make_shared( + std::vector{*nested_field1, *nested_field2, + *nested_field3}); + auto field1 = std::make_unique( + 4, "id", iceberg::int32(), false); + auto field2 = std::make_unique( + 5, "name", iceberg::string(), true); + auto field3 = std::make_unique( + 6, "address", address_type, true); + return std::make_shared( + std::vector{*field1, *field2, *field3}, 200); + }, + .select_fields = {"address.street"}, + .expected_schema = + []() { + auto street_field = std::make_unique( + 1, "street", iceberg::string(), true); + auto address_type = std::make_shared( + std::vector{*street_field}); + auto address_field = std::make_unique( + 6, "address", address_type, true); + return std::make_shared( + std::vector{*address_field}, 200); + }, + .should_succeed = true})); + +INSTANTIATE_TEST_SUITE_P( + SelectMultiLevelTestCases, SelectParamTest, + ::testing::Values( + SelectTestParam{ + .test_name = "SelectTopLevelAndNestedFields", + .create_schema = + []() { + // {id: int, user: {name: string, address: {street: string, city: + // string}}} + auto street_field = std::make_unique( + 1, "street", iceberg::string(), true); + auto city_field = std::make_unique( + 2, "city", iceberg::string(), true); + auto address_type = std::make_shared( + std::vector{*street_field, *city_field}); + auto address_field = std::make_unique( + 3, "address", address_type, true); + + auto name_field = std::make_unique( + 4, "name", iceberg::string(), true); + auto user_type = std::make_shared( + std::vector{*name_field, *address_field}); + auto user_field = + std::make_unique(5, "user", user_type, true); + + auto id_field = std::make_unique( + 6, "id", iceberg::int32(), false); + return std::make_shared( + std::vector{*id_field, *user_field}, 300); + }, + .select_fields = {"id", "user.name", "user.address.street"}, + .expected_schema = + []() { + auto id_field = std::make_unique( + 6, "id", iceberg::int32(), false); + auto name_field = std::make_unique( + 4, "name", iceberg::string(), true); + auto street_field = std::make_unique( + 1, "street", iceberg::string(), true); + auto address_type = std::make_shared( + std::vector{*street_field}); + auto address_field = std::make_unique( + 3, "address", address_type, true); + auto user_type = std::make_shared( + std::vector{*name_field, *address_field}); + auto user_field = + std::make_unique(5, "user", user_type, true); + return std::make_shared( + std::vector{*id_field, *user_field}, 300); + }, + .should_succeed = true}, + + SelectTestParam{ + .test_name = "SelectNestedFieldsAtDifferentLevels", + .create_schema = + []() { + // {id: int, user: {profile: {name: string, age: int}, settings: {theme: + // string}}} + auto name_field = std::make_unique( + 1, "name", iceberg::string(), true); + auto age_field = std::make_unique( + 2, "age", iceberg::int32(), true); + auto profile_type = std::make_shared( + std::vector{*name_field, *age_field}); + auto profile_field = std::make_unique( + 3, "profile", profile_type, true); + + auto theme_field = std::make_unique( + 4, "theme", iceberg::string(), true); + auto settings_type = std::make_shared( + std::vector{*theme_field}); + auto settings_field = std::make_unique( + 5, "settings", settings_type, true); + + auto user_type = std::make_shared( + std::vector{*profile_field, *settings_field}); + auto user_field = + std::make_unique(6, "user", user_type, true); + + auto id_field = std::make_unique( + 7, "id", iceberg::int32(), false); + return std::make_shared( + std::vector{*id_field, *user_field}, 400); + }, + .select_fields = {"user.profile.name", "user.settings.theme"}, + .expected_schema = + []() { + auto name_field = std::make_unique( + 1, "name", iceberg::string(), true); + auto profile_type = std::make_shared( + std::vector{*name_field}); + auto profile_field = std::make_unique( + 3, "profile", profile_type, true); + + auto theme_field = std::make_unique( + 4, "theme", iceberg::string(), true); + auto settings_type = std::make_shared( + std::vector{*theme_field}); + auto settings_field = std::make_unique( + 5, "settings", settings_type, true); + + auto user_type = std::make_shared( + std::vector{*profile_field, *settings_field}); + auto user_field = + std::make_unique(6, "user", user_type, true); + return std::make_shared( + std::vector{*user_field}, 400); + }, + .should_succeed = true}, + + SelectTestParam{ + .test_name = "SelectListAndNestedFields", + .create_schema = + []() { + // {id: int, tags: list, user: {name: string, age: int}} + auto list_element = std::make_unique( + 1, "element", iceberg::string(), false); + auto list_type = std::make_shared(*list_element); + auto tags_field = + std::make_unique(2, "tags", list_type, true); + + auto name_field = std::make_unique( + 3, "name", iceberg::string(), true); + auto age_field = std::make_unique( + 4, "age", iceberg::int32(), true); + auto user_type = std::make_shared( + std::vector{*name_field, *age_field}); + auto user_field = + std::make_unique(5, "user", user_type, true); + + auto id_field = std::make_unique( + 6, "id", iceberg::int32(), false); + return std::make_shared( + std::vector{*id_field, *tags_field, + *user_field}, + 500); + }, + .select_fields = {"id", "user.name"}, + .expected_schema = + []() { + auto id_field = std::make_unique( + 6, "id", iceberg::int32(), false); + auto name_field = std::make_unique( + 3, "name", iceberg::string(), true); + auto user_type = std::make_shared( + std::vector{*name_field}); + auto user_field = + std::make_unique(5, "user", user_type, true); + return std::make_shared( + std::vector{*id_field, *user_field}, 500); + }, + .should_succeed = true})); + +struct ProjectTestParam { + std::string test_name; + std::function()> create_schema; + std::unordered_set selected_ids; + std::function()> expected_schema; + bool should_succeed; + std::string expected_error_message; +}; - for (auto& thread : threads) { - thread.join(); +class ProjectParamTest : public ::testing::TestWithParam {}; + +TEST_P(ProjectParamTest, ProjectFields) { + const auto& param = GetParam(); + auto input_schema = param.create_schema(); + auto selected_ids = param.selected_ids; + auto result = input_schema->project(selected_ids); + + if (param.should_succeed) { + ASSERT_TRUE(result.has_value()) << "Project should succeed for: " << param.test_name; + + auto actual_schema = result.value(); + auto expected_schema = param.expected_schema(); + ASSERT_EQ(*actual_schema, *expected_schema) + << "Schema mismatch for: " << param.test_name; + } else { + ASSERT_FALSE(result.has_value()) << "Project should fail for: " << param.test_name; + ASSERT_THAT(result, iceberg::IsError(iceberg::ErrorKind::kInvalidArgument)) + << "Should return InvalidArgument error for: " << param.test_name; + + if (!param.expected_error_message.empty()) { + ASSERT_THAT(result, iceberg::HasErrorMessage(param.expected_error_message)) + << "Error message mismatch for: " << param.test_name; + } } } + +INSTANTIATE_TEST_SUITE_P( + ProjectTestCases, ProjectParamTest, + ::testing::Values( + ProjectTestParam{ + .test_name = "ProjectAllFields", + .create_schema = + []() { + auto field1 = std::make_unique( + 1, "id", iceberg::int32(), false); + auto field2 = std::make_unique( + 2, "name", iceberg::string(), true); + auto field3 = std::make_unique( + 3, "age", iceberg::int32(), true); + return std::make_shared( + std::vector{*field1, *field2, *field3}, 100); + }, + .selected_ids = {1, 2, 3}, + .expected_schema = + []() { + auto field1 = std::make_unique( + 1, "id", iceberg::int32(), false); + auto field2 = std::make_unique( + 2, "name", iceberg::string(), true); + auto field3 = std::make_unique( + 3, "age", iceberg::int32(), true); + return std::make_shared( + std::vector{*field1, *field2, *field3}, 100); + }, + .should_succeed = true}, + + ProjectTestParam{ + .test_name = "ProjectSingleField", + .create_schema = + []() { + auto field1 = std::make_unique( + 1, "id", iceberg::int32(), false); + auto field2 = std::make_unique( + 2, "name", iceberg::string(), true); + auto field3 = std::make_unique( + 3, "age", iceberg::int32(), true); + return std::make_shared( + std::vector{*field1, *field2, *field3}, 100); + }, + .selected_ids = {2}, + .expected_schema = + []() { + auto field = std::make_unique( + 2, "name", iceberg::string(), true); + return std::make_shared( + std::vector{*field}, 100); + }, + .should_succeed = true}, + + ProjectTestParam{.test_name = "ProjectNonExistentFieldId", + .create_schema = + []() { + auto field1 = std::make_unique( + 1, "id", iceberg::int32(), false); + auto field2 = std::make_unique( + 2, "name", iceberg::string(), true); + return std::make_shared( + std::vector{*field1, *field2}, + 100); + }, + .selected_ids = {999}, + .expected_schema = + []() { + return std::make_shared( + std::vector{}, 100); + }, + .should_succeed = true}, + + ProjectTestParam{.test_name = "ProjectEmptySelection", + .create_schema = + []() { + auto field1 = std::make_unique( + 1, "id", iceberg::int32(), false); + auto field2 = std::make_unique( + 2, "name", iceberg::string(), true); + return std::make_shared( + std::vector{*field1, *field2}, + 100); + }, + .selected_ids = {}, + .expected_schema = + []() { + return std::make_shared( + std::vector{}, 100); + }, + .should_succeed = true})); + +INSTANTIATE_TEST_SUITE_P( + ProjectNestedTestCases, ProjectParamTest, + ::testing::Values(ProjectTestParam{ + .test_name = "ProjectNestedStructField", + .create_schema = + []() { + auto nested_field1 = std::make_unique( + 1, "street", iceberg::string(), true); + auto nested_field2 = std::make_unique( + 2, "city", iceberg::string(), true); + auto address_type = std::make_shared( + std::vector{*nested_field1, *nested_field2}); + auto field1 = std::make_unique( + 3, "id", iceberg::int32(), false); + auto field2 = std::make_unique(4, "address", + address_type, true); + return std::make_shared( + std::vector{*field1, *field2}, 200); + }, + .selected_ids = {1}, + .expected_schema = + []() { + auto street_field = std::make_unique( + 1, "street", iceberg::string(), true); + auto address_type = std::make_shared( + std::vector{*street_field}); + auto address_field = std::make_unique( + 4, "address", address_type, true); + return std::make_shared( + std::vector{*address_field}, 200); + }, + .should_succeed = true})); + +INSTANTIATE_TEST_SUITE_P( + ProjectMultiLevelTestCases, ProjectParamTest, + ::testing::Values( + ProjectTestParam{ + .test_name = "ProjectTopLevelAndNestedFields", + .create_schema = + []() { + // {id: int, user: {name: string, address: {street: string, city: + // string}}} + auto street_field = std::make_unique( + 1, "street", iceberg::string(), true); + auto city_field = std::make_unique( + 2, "city", iceberg::string(), true); + auto address_type = std::make_shared( + std::vector{*street_field, *city_field}); + auto address_field = std::make_unique( + 3, "address", address_type, true); + + auto name_field = std::make_unique( + 4, "name", iceberg::string(), true); + auto user_type = std::make_shared( + std::vector{*name_field, *address_field}); + auto user_field = + std::make_unique(5, "user", user_type, true); + + auto id_field = std::make_unique( + 6, "id", iceberg::int32(), false); + return std::make_shared( + std::vector{*id_field, *user_field}, 300); + }, + .selected_ids = {6, 4, 1}, + .expected_schema = + []() { + auto id_field = std::make_unique( + 6, "id", iceberg::int32(), false); + auto name_field = std::make_unique( + 4, "name", iceberg::string(), true); + auto street_field = std::make_unique( + 1, "street", iceberg::string(), true); + auto address_type = std::make_shared( + std::vector{*street_field}); + auto address_field = std::make_unique( + 3, "address", address_type, true); + auto user_type = std::make_shared( + std::vector{*name_field, *address_field}); + auto user_field = + std::make_unique(5, "user", user_type, true); + return std::make_shared( + std::vector{*id_field, *user_field}, 300); + }, + .should_succeed = true}, + + ProjectTestParam{ + .test_name = "ProjectNestedFieldsAtDifferentLevels", + .create_schema = + []() { + // {id: int, user: {profile: {name: string, age: int}, settings: {theme: + // string}}} + auto name_field = std::make_unique( + 1, "name", iceberg::string(), true); + auto age_field = std::make_unique( + 2, "age", iceberg::int32(), true); + auto profile_type = std::make_shared( + std::vector{*name_field, *age_field}); + auto profile_field = std::make_unique( + 3, "profile", profile_type, true); + + auto theme_field = std::make_unique( + 4, "theme", iceberg::string(), true); + auto settings_type = std::make_shared( + std::vector{*theme_field}); + auto settings_field = std::make_unique( + 5, "settings", settings_type, true); + + auto user_type = std::make_shared( + std::vector{*profile_field, *settings_field}); + auto user_field = + std::make_unique(6, "user", user_type, true); + + auto id_field = std::make_unique( + 7, "id", iceberg::int32(), false); + return std::make_shared( + std::vector{*id_field, *user_field}, 400); + }, + .selected_ids = {1, 4}, + .expected_schema = + []() { + auto name_field = std::make_unique( + 1, "name", iceberg::string(), true); + auto profile_type = std::make_shared( + std::vector{*name_field}); + auto profile_field = std::make_unique( + 3, "profile", profile_type, true); + + auto theme_field = std::make_unique( + 4, "theme", iceberg::string(), true); + auto settings_type = std::make_shared( + std::vector{*theme_field}); + auto settings_field = std::make_unique( + 5, "settings", settings_type, true); + + auto user_type = std::make_shared( + std::vector{*profile_field, *settings_field}); + auto user_field = + std::make_unique(6, "user", user_type, true); + return std::make_shared( + std::vector{*user_field}, 400); + }, + .should_succeed = true}, + + ProjectTestParam{ + .test_name = "ProjectListAndNestedFields", + .create_schema = + []() { + // {id: int, tags: list, user: {name: string, age: int}} + auto list_element = std::make_unique( + 1, "element", iceberg::string(), false); + auto list_type = std::make_shared(*list_element); + auto tags_field = + std::make_unique(2, "tags", list_type, true); + + auto name_field = std::make_unique( + 3, "name", iceberg::string(), true); + auto age_field = std::make_unique( + 4, "age", iceberg::int32(), true); + auto user_type = std::make_shared( + std::vector{*name_field, *age_field}); + auto user_field = + std::make_unique(5, "user", user_type, true); + + auto id_field = std::make_unique( + 6, "id", iceberg::int32(), false); + return std::make_shared( + std::vector{*id_field, *tags_field, + *user_field}, + 500); + }, + .selected_ids = {6, 3}, + .expected_schema = + []() { + auto id_field = std::make_unique( + 6, "id", iceberg::int32(), false); + auto name_field = std::make_unique( + 3, "name", iceberg::string(), true); + auto user_type = std::make_shared( + std::vector{*name_field}); + auto user_field = + std::make_unique(5, "user", user_type, true); + return std::make_shared( + std::vector{*id_field, *user_field}, 500); + }, + .should_succeed = true})); + +INSTANTIATE_TEST_SUITE_P( + ProjectMapErrorTestCases, ProjectParamTest, + ::testing::Values( + ProjectTestParam{ + .test_name = "ProjectMapTypeError", + .create_schema = + []() { + auto map_key = std::make_unique( + 1, "key", iceberg::string(), false); + auto map_value = std::make_unique( + 2, "value", iceberg::string(), false); + auto map_type = + std::make_shared(*map_key, *map_value); + auto map_field = std::make_unique( + 3, "string_map", map_type, false); + auto id_field = std::make_unique( + 4, "id", iceberg::int32(), false); + return std::make_shared( + std::vector{*id_field, *map_field}, 100); + }, + .selected_ids = {3}, + .expected_schema = []() { return nullptr; }, + .should_succeed = false, + .expected_error_message = + R"(Cannot explicitly project List or Map types, 3:string_map of type map was selected)"}, + + ProjectTestParam{ + .test_name = "ProjectListTypeError", + .create_schema = + []() { + auto list_element = std::make_unique( + 1, "element", iceberg::int32(), false); + auto list_type = std::make_shared(*list_element); + auto list_field = std::make_unique( + 2, "int_list", list_type, false); + auto id_field = std::make_unique( + 3, "id", iceberg::int32(), false); + return std::make_shared( + std::vector{*id_field, *list_field}, 100); + }, + .selected_ids = {2}, + .expected_schema = []() { return nullptr; }, + .should_succeed = false, + .expected_error_message = + R"(Cannot explicitly project List or Map types, 2:int_list of type list was selected)"})); From 4adee8cd2f6da11f4e34e5a5c8596d94a50972df Mon Sep 17 00:00:00 2001 From: nullccxsy <32149055912@qq.com> Date: Fri, 5 Sep 2025 15:45:17 +0800 Subject: [PATCH 02/10] refactor: update PruneColumnVisitor to use shared pointers for result handling - Modified the PruneColumnVisitor class to pass results as shared pointers, improving memory management and clarity. - Updated Visit methods for ListType, MapType, and StructType to accommodate the new result handling approach. --- src/iceberg/schema.cc | 151 +++++++++++++++++++++--------------------- src/iceberg/schema.h | 2 +- 2 files changed, 77 insertions(+), 76 deletions(-) diff --git a/src/iceberg/schema.cc b/src/iceberg/schema.cc index ddffb8a83..de4b3ea09 100644 --- a/src/iceberg/schema.cc +++ b/src/iceberg/schema.cc @@ -264,21 +264,20 @@ class PruneColumnVisitor { public: explicit PruneColumnVisitor(const std::unordered_set& selected_ids, bool select_full_types = false); - Status Visit(const ListType& type); - Status Visit(const MapType& type); - Status Visit(const StructType& type); - Status Visit(const PrimitiveType& type); - std::shared_ptr GetResult() const; - void SetResult(std::shared_ptr result); + Status Visit(const ListType& type, std::shared_ptr& result); + Status Visit(const MapType& type, std::shared_ptr& result); + Status Visit(const StructType& type, std::shared_ptr& result); + Status Visit(const PrimitiveType& type, std::shared_ptr& result); Status ProjectList(const SchemaField& element, - std::shared_ptr element_result); + std::shared_ptr& child_result, + std::shared_ptr& result); Status ProjectMap(const SchemaField& key_field, const SchemaField& value_field, - std::shared_ptr value_result); + std::shared_ptr& value_result, + std::shared_ptr& result); private: const std::unordered_set& selected_ids_; bool select_full_types_; - std::shared_ptr result_; }; Result> Schema::select( @@ -306,20 +305,19 @@ Result> Schema::internalSelect( } } + std::shared_ptr result; PruneColumnVisitor visitor(selected_ids, /*select_full_types=*/true); - ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*this, &visitor)); + ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*this, &visitor, result)); - auto projected_type = visitor.GetResult(); - if (!projected_type) { + if (!result) { return std::make_shared(std::vector{}, schema_id_); } - if (projected_type->type_id() != TypeId::kStruct) { + if (result->type_id() != TypeId::kStruct) { return InvalidSchema("Projected type must be a struct type"); } - const auto& projected_struct = - internal::checked_cast(*projected_type); + const auto& projected_struct = internal::checked_cast(*result); std::vector fields_vec(projected_struct.fields().begin(), projected_struct.fields().end()); @@ -329,19 +327,19 @@ Result> Schema::internalSelect( Result> Schema::project( std::unordered_set& selected_ids) const { PruneColumnVisitor visitor(selected_ids, /*select_full_types=*/false); - ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*this, &visitor)); - auto projected_type = visitor.GetResult(); - if (!projected_type) { + std::shared_ptr result; + ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*this, &visitor, result)); + + if (!result) { return std::make_shared(std::vector{}, schema_id_); } - if (projected_type->type_id() != TypeId::kStruct) { + if (result->type_id() != TypeId::kStruct) { return InvalidSchema("Projected type must be a struct type"); } - const auto& projected_struct = - internal::checked_cast(*projected_type); + const auto& projected_struct = internal::checked_cast(*result); std::vector fields_vec(projected_struct.fields().begin(), projected_struct.fields().end()); return std::make_shared(std::move(fields_vec), schema_id_); @@ -351,29 +349,23 @@ PruneColumnVisitor::PruneColumnVisitor(const std::unordered_set& select bool select_full_types) : selected_ids_(selected_ids), select_full_types_(select_full_types) {} -std::shared_ptr PruneColumnVisitor::GetResult() const { return result_; } - -void PruneColumnVisitor::SetResult(std::shared_ptr result) { - result_ = std::move(result); -} - -Status PruneColumnVisitor::Visit(const StructType& type) { +Status PruneColumnVisitor::Visit(const StructType& type, + std::shared_ptr& result) { std::vector> selected_types; const auto& fields = type.fields(); for (const auto& field : fields) { - PruneColumnVisitor field_visitor(selected_ids_, select_full_types_); - ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*field.type(), &field_visitor)); - auto result = field_visitor.GetResult(); + std::shared_ptr child_result; + ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*field.type(), this, child_result)); if (selected_ids_.contains(field.field_id())) { // select if (select_full_types_) { selected_types.emplace_back(field.type()); } else if (field.type()->type_id() == TypeId::kStruct) { // project(kstruct) - if (!result) { - result = std::make_shared(std::vector{}); + if (!child_result) { + child_result = std::make_shared(std::vector{}); } - selected_types.emplace_back(std::move(result)); + selected_types.emplace_back(std::move(child_result)); } else { // project(list, map, primitive) if (!field.type()->is_primitive()) { @@ -384,9 +376,9 @@ Status PruneColumnVisitor::Visit(const StructType& type) { } selected_types.emplace_back(field.type()); } - } else if (result) { + } else if (child_result) { // project, select - selected_types.emplace_back(std::move(result)); + selected_types.emplace_back(std::move(child_result)); } else { // project, select selected_types.emplace_back(nullptr); @@ -407,29 +399,31 @@ Status PruneColumnVisitor::Visit(const StructType& type) { } if (!selected_fields.empty()) { - if (selected_fields.size() == fields.size() && same_types) { - result_ = std::make_shared(type); + if (same_types && selected_fields.size() == fields.size()) { + result = std::make_shared(type); } else { - result_ = std::make_shared(std::move(selected_fields)); + result = std::make_shared(std::move(selected_fields)); } } return {}; } -Status PruneColumnVisitor::Visit(const ListType& type) { +Status PruneColumnVisitor::Visit(const ListType& type, + std::shared_ptr& result) { const auto& element_field = type.fields()[0]; - PruneColumnVisitor element_visitor(selected_ids_, select_full_types_); - ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*element_field.type(), &element_visitor)); + if (select_full_types_ and selected_ids_.contains(element_field.field_id())) { + result = std::make_shared(type); + return {}; + } - auto element_result = element_visitor.GetResult(); + std::shared_ptr child_result; + ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*element_field.type(), this, child_result)); if (selected_ids_.contains(element_field.field_id())) { - if (select_full_types_) { - result_ = std::make_shared(element_field); - } else if (element_field.type()->type_id() == TypeId::kStruct) { - ICEBERG_RETURN_UNEXPECTED(ProjectList(element_field, element_result)); + if (element_field.type()->type_id() == TypeId::kStruct) { + ICEBERG_RETURN_UNEXPECTED(ProjectList(element_field, child_result, result)); } else { if (!element_field.type()->is_primitive()) { return InvalidArgument( @@ -437,32 +431,34 @@ Status PruneColumnVisitor::Visit(const ListType& type) { "selected", element_field.field_id(), element_field.name()); } - result_ = std::make_shared(element_field); + result = std::make_shared(element_field); } - } else if (element_result) { - ICEBERG_RETURN_UNEXPECTED(ProjectList(element_field, element_result)); + } else if (child_result) { + ICEBERG_RETURN_UNEXPECTED(ProjectList(element_field, child_result, result)); } return {}; } -Status PruneColumnVisitor::Visit(const MapType& type) { +Status PruneColumnVisitor::Visit(const MapType& type, + std::shared_ptr& result) { const auto& key_field = type.fields()[0]; const auto& value_field = type.fields()[1]; - PruneColumnVisitor key_visitor(selected_ids_, select_full_types_); - ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*key_field.type(), &key_visitor)); - auto key_result = key_visitor.GetResult(); + if (select_full_types_ and selected_ids_.contains(value_field.field_id())) { + result = std::make_shared(type); + return {}; + } - PruneColumnVisitor value_visitor(selected_ids_, select_full_types_); - ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*value_field.type(), &value_visitor)); - auto value_result = value_visitor.GetResult(); + std::shared_ptr key_result; + ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*key_field.type(), this, key_result)); + + std::shared_ptr value_result; + ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*value_field.type(), this, value_result)); if (selected_ids_.contains(value_field.field_id())) { - if (select_full_types_) { - result_ = std::make_shared(type); - } else if (value_field.type()->type_id() == TypeId::kStruct) { - ICEBERG_RETURN_UNEXPECTED(ProjectMap(key_field, value_field, value_result)); + if (value_field.type()->type_id() == TypeId::kStruct) { + ICEBERG_RETURN_UNEXPECTED(ProjectMap(key_field, value_field, value_result, result)); } else { if (!value_field.type()->is_primitive()) { return InvalidArgument( @@ -470,44 +466,49 @@ Status PruneColumnVisitor::Visit(const MapType& type) { "selected", value_field.field_id(), type.ToString()); } - result_ = std::make_shared(type); + result = std::make_shared(type); } } else if (value_result) { - ICEBERG_RETURN_UNEXPECTED(ProjectMap(key_field, value_field, value_result)); + ICEBERG_RETURN_UNEXPECTED(ProjectMap(key_field, value_field, value_result, result)); } else if (selected_ids_.contains(key_field.field_id())) { - result_ = std::make_shared(type); + result = std::make_shared(type); } return {}; } -Status PruneColumnVisitor::Visit(const PrimitiveType& type) { return {}; } +Status PruneColumnVisitor::Visit(const PrimitiveType& type, + std::shared_ptr& result) { + return {}; +} Status PruneColumnVisitor::ProjectList(const SchemaField& element_field, - std::shared_ptr element_result) { - if (!element_result) { + std::shared_ptr& child_result, + std::shared_ptr& result) { + if (!child_result) { return InvalidArgument("Cannot project a list when the element result is null"); } - if (element_field.type() == element_result) { - result_ = std::make_shared(element_field); + if (element_field.type() == child_result) { + result = std::make_shared(element_field); } else { - result_ = std::make_shared(element_field.field_id(), - std::const_pointer_cast(element_result), - element_field.optional()); + result = std::make_shared(element_field.field_id(), + std::const_pointer_cast(child_result), + element_field.optional()); } return {}; } Status PruneColumnVisitor::ProjectMap(const SchemaField& key_field, const SchemaField& value_field, - std::shared_ptr value_result) { + std::shared_ptr& value_result, + std::shared_ptr& result) { if (!value_result) { return InvalidArgument("Attempted to project a map without a defined map value type"); } if (value_field.type() == value_result) { - result_ = std::make_shared(key_field, value_field); + result = std::make_shared(key_field, value_field); } else { - result_ = std::make_shared( + result = std::make_shared( key_field, SchemaField(value_field.field_id(), std::string(value_field.name()), std::const_pointer_cast(value_result), value_field.optional())); diff --git a/src/iceberg/schema.h b/src/iceberg/schema.h index 9a40fb09a..3d15c2ab0 100644 --- a/src/iceberg/schema.h +++ b/src/iceberg/schema.h @@ -98,7 +98,7 @@ class ICEBERG_EXPORT Schema : public StructType, private: /// \brief Compare two schemas for equality. - [[nodiscard]] bool Equals(const Schema& other) const; + bool Equals(const Schema& other) const; Status InitIdToFieldMap() const; Status InitNameToIdMap() const; From a0460769c48410b4bc52bb423d670941aa0f4088 Mon Sep 17 00:00:00 2001 From: nullccxsy <32149055912@qq.com> Date: Sat, 6 Sep 2025 16:54:29 +0800 Subject: [PATCH 03/10] fix review --- src/iceberg/schema.cc | 413 +++++++++++++++++++++--------------------- src/iceberg/schema.h | 91 ++++++++-- test/schema_test.cc | 8 +- 3 files changed, 295 insertions(+), 217 deletions(-) diff --git a/src/iceberg/schema.cc b/src/iceberg/schema.cc index de4b3ea09..20296f131 100644 --- a/src/iceberg/schema.cc +++ b/src/iceberg/schema.cc @@ -22,6 +22,7 @@ #include #include +#include "iceberg/schema_internal.h" #include "iceberg/type.h" #include "iceberg/util/formatter.h" // IWYU pragma: keep #include "iceberg/util/macros.h" @@ -260,260 +261,268 @@ void NameToIdVisitor::Finish() { } } +/// \brief Visitor class for pruning schema columns based on selected field IDs. +/// +/// This visitor traverses a schema and creates a projected version containing only +/// the specified fields. It handles different projection modes: +/// - select_full_types=true: Include entire fields when their ID is selected +/// - select_full_types=false: Recursively project nested fields within selected structs +/// +/// \warning Error conditions that will cause projection to fail: +/// - Attempting to explicitly project List or Map types (returns InvalidArgument) +/// - Projecting a List when element result is null (returns InvalidArgument) +/// - Projecting a Map without a defined map value type (returns InvalidArgument) +/// - Projecting a struct when result is not StructType (returns InvalidArgument) class PruneColumnVisitor { public: - explicit PruneColumnVisitor(const std::unordered_set& selected_ids, - bool select_full_types = false); - Status Visit(const ListType& type, std::shared_ptr& result); - Status Visit(const MapType& type, std::shared_ptr& result); - Status Visit(const StructType& type, std::shared_ptr& result); - Status Visit(const PrimitiveType& type, std::shared_ptr& result); - Status ProjectList(const SchemaField& element, - std::shared_ptr& child_result, - std::shared_ptr& result); - Status ProjectMap(const SchemaField& key_field, const SchemaField& value_field, - std::shared_ptr& value_result, - std::shared_ptr& result); - - private: - const std::unordered_set& selected_ids_; - bool select_full_types_; -}; - -Result> Schema::select( - const std::vector& names, bool case_sensitive) const { - return internalSelect(names, case_sensitive); -} - -Result> Schema::select( - const std::initializer_list& names, bool case_sensitive) const { - return internalSelect(std::vector(names), case_sensitive); -} - -Result> Schema::internalSelect( - const std::vector& names, bool case_sensitive) const { - const std::string ALL_COLUMNS = "*"; - if (std::ranges::find(names, ALL_COLUMNS) != names.end()) { - return shared_from_this(); - } - - std::unordered_set selected_ids; - for (const auto& name : names) { - ICEBERG_ASSIGN_OR_RAISE(auto result, FindFieldByName(name, case_sensitive)); - if (result.has_value()) { - selected_ids.insert(result.value().get().field_id()); + PruneColumnVisitor(const std::unordered_set& selected_ids, + bool select_full_types) + : selected_ids_(selected_ids), select_full_types_(select_full_types) {} + + Status Visit(const StructType& type, std::unique_ptr* out) const { + std::vector> selected_types; + for (const auto& field : type.fields()) { + std::unique_ptr child_result; + if (select_full_types_ and selected_ids_.contains(field.field_id())) { + selected_types.emplace_back(field.type()); + continue; + } + ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*field.type(), this, &child_result)); + if (selected_ids_.contains(field.field_id())) { + // select + if (field.type()->type_id() == TypeId::kStruct) { + // project(kstruct) + ICEBERG_RETURN_UNEXPECTED(ProjectStruct(field, &child_result)); + selected_types.emplace_back(std::move(child_result)); + } else { + // project(list, map, primitive) + if (!field.type()->is_primitive()) { + return InvalidArgument( + "Cannot explicitly project List or Map types, {}:{} of type {} was " + "selected", + field.field_id(), field.name(), field.type()->ToString()); + } + selected_types.emplace_back(field.type()); + } + } else if (child_result) { + // project, select + selected_types.emplace_back(std::move(child_result)); + } else { + // project, select + selected_types.emplace_back(nullptr); + } } - } - std::shared_ptr result; - PruneColumnVisitor visitor(selected_ids, /*select_full_types=*/true); - ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*this, &visitor, result)); + bool same_types = true; + std::vector selected_fields; + const auto& fields = type.fields(); + for (size_t i = 0; i < fields.size(); i++) { + if (fields[i].type() == selected_types[i]) { + selected_fields.emplace_back(fields[i]); + } else if (selected_types[i]) { + same_types = false; + selected_fields.emplace_back(fields[i].field_id(), std::string(fields[i].name()), + std::const_pointer_cast(selected_types[i]), + fields[i].optional(), std::string(fields[i].doc())); + } + } - if (!result) { - return std::make_shared(std::vector{}, schema_id_); - } + if (!selected_fields.empty()) { + if (same_types && selected_fields.size() == fields.size()) { + *out = std::make_unique(type); + } else { + *out = std::make_unique(std::move(selected_fields)); + } + } - if (result->type_id() != TypeId::kStruct) { - return InvalidSchema("Projected type must be a struct type"); + return {}; } - const auto& projected_struct = internal::checked_cast(*result); + Status Visit(const ListType& type, std::unique_ptr* out) const { + const auto& element_field = type.fields()[0]; - std::vector fields_vec(projected_struct.fields().begin(), - projected_struct.fields().end()); - return std::make_shared(std::move(fields_vec), schema_id_); -} + if (select_full_types_ and selected_ids_.contains(element_field.field_id())) { + *out = std::make_unique(type); + return {}; + } -Result> Schema::project( - std::unordered_set& selected_ids) const { - PruneColumnVisitor visitor(selected_ids, /*select_full_types=*/false); + std::unique_ptr child_result; + ICEBERG_RETURN_UNEXPECTED( + VisitTypeInline(*element_field.type(), this, &child_result)); - std::shared_ptr result; - ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*this, &visitor, result)); + if (selected_ids_.contains(element_field.field_id())) { + if (element_field.type()->type_id() == TypeId::kStruct) { + ICEBERG_RETURN_UNEXPECTED(ProjectStruct(element_field, &child_result)); + ICEBERG_RETURN_UNEXPECTED( + ProjectList(element_field, std::move(child_result), out)); + } else { + if (!element_field.type()->is_primitive()) { + return InvalidArgument( + "Cannot explicitly project List or Map types, List element {} of type {} " + "was " + "selected", + element_field.field_id(), element_field.name()); + } + *out = std::make_unique(element_field); + } + } else if (child_result) { + ICEBERG_RETURN_UNEXPECTED(ProjectList(element_field, std::move(child_result), out)); + } - if (!result) { - return std::make_shared(std::vector{}, schema_id_); + return {}; } - if (result->type_id() != TypeId::kStruct) { - return InvalidSchema("Projected type must be a struct type"); - } + Status Visit(const MapType& type, std::unique_ptr* out) const { + const auto& key_field = type.fields()[0]; + const auto& value_field = type.fields()[1]; - const auto& projected_struct = internal::checked_cast(*result); - std::vector fields_vec(projected_struct.fields().begin(), - projected_struct.fields().end()); - return std::make_shared(std::move(fields_vec), schema_id_); -} + if (select_full_types_ and selected_ids_.contains(value_field.field_id())) { + *out = std::make_unique(type); + return {}; + } -PruneColumnVisitor::PruneColumnVisitor(const std::unordered_set& selected_ids, - bool select_full_types) - : selected_ids_(selected_ids), select_full_types_(select_full_types) {} + std::unique_ptr key_result; + ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*key_field.type(), this, &key_result)); -Status PruneColumnVisitor::Visit(const StructType& type, - std::shared_ptr& result) { - std::vector> selected_types; - const auto& fields = type.fields(); - for (const auto& field : fields) { - std::shared_ptr child_result; - ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*field.type(), this, child_result)); - if (selected_ids_.contains(field.field_id())) { - // select - if (select_full_types_) { - selected_types.emplace_back(field.type()); - } else if (field.type()->type_id() == TypeId::kStruct) { - // project(kstruct) - if (!child_result) { - child_result = std::make_shared(std::vector{}); - } - selected_types.emplace_back(std::move(child_result)); + std::unique_ptr value_result; + ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*value_field.type(), this, &value_result)); + + if (selected_ids_.contains(value_field.field_id())) { + if (value_field.type()->type_id() == TypeId::kStruct) { + ICEBERG_RETURN_UNEXPECTED(ProjectStruct(value_field, &value_result)); + ICEBERG_RETURN_UNEXPECTED( + ProjectMap(key_field, value_field, std::move(value_result), out)); } else { - // project(list, map, primitive) - if (!field.type()->is_primitive()) { + if (!value_field.type()->is_primitive()) { return InvalidArgument( - "Cannot explicitly project List or Map types, {}:{} of type {} was " + "Cannot explicitly project List or Map types, Map value {} of type {} was " "selected", - field.field_id(), field.name(), field.type()->ToString()); + value_field.field_id(), type.ToString()); } - selected_types.emplace_back(field.type()); + *out = std::make_unique(type); } - } else if (child_result) { - // project, select - selected_types.emplace_back(std::move(child_result)); - } else { - // project, select - selected_types.emplace_back(nullptr); + } else if (value_result) { + ICEBERG_RETURN_UNEXPECTED( + ProjectMap(key_field, value_field, std::move(value_result), out)); + } else if (selected_ids_.contains(key_field.field_id())) { + *out = std::make_unique(type); } + + return {}; } - bool same_types = true; - std::vector selected_fields; - for (size_t i = 0; i < fields.size(); i++) { - if (fields[i].type() == selected_types[i]) { - selected_fields.emplace_back(fields[i]); - } else if (selected_types[i]) { - same_types = false; - selected_fields.emplace_back(fields[i].field_id(), std::string(fields[i].name()), - std::const_pointer_cast(selected_types[i]), - fields[i].optional(), std::string(fields[i].doc())); - } + Status Visit(const PrimitiveType& type, std::unique_ptr* result) const { + return {}; } - if (!selected_fields.empty()) { - if (same_types && selected_fields.size() == fields.size()) { - result = std::make_shared(type); + Status ProjectList(const SchemaField& element_field, std::shared_ptr child_result, + std::unique_ptr* out) const { + if (!child_result) { + return InvalidArgument("Cannot project a list when the element result is null"); + } + if (element_field.type() == child_result) { + *out = std::make_unique(element_field); } else { - result = std::make_shared(std::move(selected_fields)); + *out = std::make_unique(element_field.field_id(), child_result, + element_field.optional()); } + return {}; } - return {}; -} - -Status PruneColumnVisitor::Visit(const ListType& type, - std::shared_ptr& result) { - const auto& element_field = type.fields()[0]; - - if (select_full_types_ and selected_ids_.contains(element_field.field_id())) { - result = std::make_shared(type); + Status ProjectMap(const SchemaField& key_field, const SchemaField& value_field, + std::shared_ptr value_result, + std::unique_ptr* out) const { + if (!value_result) { + return InvalidArgument( + "Attempted to project a map without a defined map value type"); + } + if (value_field.type() == value_result) { + *out = std::make_unique(key_field, value_field); + } else { + *out = std::make_unique( + key_field, SchemaField(value_field.field_id(), std::string(value_field.name()), + value_result, value_field.optional())); + } return {}; } - std::shared_ptr child_result; - ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*element_field.type(), this, child_result)); - - if (selected_ids_.contains(element_field.field_id())) { - if (element_field.type()->type_id() == TypeId::kStruct) { - ICEBERG_RETURN_UNEXPECTED(ProjectList(element_field, child_result, result)); - } else { - if (!element_field.type()->is_primitive()) { - return InvalidArgument( - "Cannot explicitly project List or Map types, List element {} of type {} was " - "selected", - element_field.field_id(), element_field.name()); - } - result = std::make_shared(element_field); + Status ProjectStruct(const SchemaField& field, std::unique_ptr* out) const { + if (out && (*out)->type_id() != TypeId::kStruct) { + return InvalidArgument("Project struct {}:{}, but result is not StructType", + field.name(), field.field_id()); } - } else if (child_result) { - ICEBERG_RETURN_UNEXPECTED(ProjectList(element_field, child_result, result)); + if (out == nullptr) { + *out = std::make_unique(std::vector{}); + } + return {}; } - return {}; + private: + const std::unordered_set& selected_ids_; + bool select_full_types_; +}; + +Result> Schema::Select(std::span names, + bool case_sensitive) const { + return SelectInternal(names, case_sensitive); } -Status PruneColumnVisitor::Visit(const MapType& type, - std::shared_ptr& result) { - const auto& key_field = type.fields()[0]; - const auto& value_field = type.fields()[1]; +Result> Schema::Select( + const std::initializer_list& names, bool case_sensitive) const { + return SelectInternal(names, case_sensitive); +} - if (select_full_types_ and selected_ids_.contains(value_field.field_id())) { - result = std::make_shared(type); - return {}; +Result> Schema::SelectInternal( + std::span names, bool case_sensitive) const { + const std::string kAllColumns = "*"; + if (std::ranges::find(names, kAllColumns) != names.end()) { + return std::make_unique(*this); } - std::shared_ptr key_result; - ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*key_field.type(), this, key_result)); - - std::shared_ptr value_result; - ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*value_field.type(), this, value_result)); - - if (selected_ids_.contains(value_field.field_id())) { - if (value_field.type()->type_id() == TypeId::kStruct) { - ICEBERG_RETURN_UNEXPECTED(ProjectMap(key_field, value_field, value_result, result)); - } else { - if (!value_field.type()->is_primitive()) { - return InvalidArgument( - "Cannot explicitly project List or Map types, Map value {} of type {} was " - "selected", - value_field.field_id(), type.ToString()); - } - result = std::make_shared(type); + std::unordered_set selected_ids; + for (const auto& name : names) { + ICEBERG_ASSIGN_OR_RAISE(auto result, FindFieldByName(name, case_sensitive)); + if (result.has_value()) { + selected_ids.insert(result.value().get().field_id()); } - } else if (value_result) { - ICEBERG_RETURN_UNEXPECTED(ProjectMap(key_field, value_field, value_result, result)); - } else if (selected_ids_.contains(key_field.field_id())) { - result = std::make_shared(type); } - return {}; -} - -Status PruneColumnVisitor::Visit(const PrimitiveType& type, - std::shared_ptr& result) { - return {}; -} + std::unique_ptr result; + PruneColumnVisitor visitor(selected_ids, /*select_full_types=*/true); + ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*this, &visitor, &result)); -Status PruneColumnVisitor::ProjectList(const SchemaField& element_field, - std::shared_ptr& child_result, - std::shared_ptr& result) { - if (!child_result) { - return InvalidArgument("Cannot project a list when the element result is null"); + if (!result) { + return std::make_unique(std::vector{}, schema_id_); } - if (element_field.type() == child_result) { - result = std::make_shared(element_field); - } else { - result = std::make_shared(element_field.field_id(), - std::const_pointer_cast(child_result), - element_field.optional()); + + if (result->type_id() != TypeId::kStruct) { + return InvalidSchema("Projected type must be a struct type"); } - return {}; + + auto& projected_struct = internal::checked_cast(*result); + + return FromStructType(std::move(const_cast(projected_struct)), schema_id_); } -Status PruneColumnVisitor::ProjectMap(const SchemaField& key_field, - const SchemaField& value_field, - std::shared_ptr& value_result, - std::shared_ptr& result) { - if (!value_result) { - return InvalidArgument("Attempted to project a map without a defined map value type"); +Result> Schema::Project( + std::unordered_set& field_ids) const { + PruneColumnVisitor visitor(field_ids, /*select_full_types=*/false); + + std::unique_ptr result; + ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*this, &visitor, &result)); + + if (!result) { + return std::make_unique(std::vector{}, schema_id_); } - if (value_field.type() == value_result) { - result = std::make_shared(key_field, value_field); - } else { - result = std::make_shared( - key_field, - SchemaField(value_field.field_id(), std::string(value_field.name()), - std::const_pointer_cast(value_result), value_field.optional())); + + if (result->type_id() != TypeId::kStruct) { + return InvalidSchema("Projected type must be a struct type"); } - return {}; + + const auto& projected_struct = internal::checked_cast(*result); + std::vector fields_vec(projected_struct.fields().begin(), + projected_struct.fields().end()); + return std::make_unique(std::move(fields_vec), schema_id_); } } // namespace iceberg diff --git a/src/iceberg/schema.h b/src/iceberg/schema.h index 3d15c2ab0..f79689b49 100644 --- a/src/iceberg/schema.h +++ b/src/iceberg/schema.h @@ -43,8 +43,7 @@ namespace iceberg { /// A schema is a list of typed columns, along with a unique integer ID. A /// Table may have different schemas over its lifetime due to schema /// evolution. -class ICEBERG_EXPORT Schema : public StructType, - public std::enable_shared_from_this { +class ICEBERG_EXPORT Schema : public StructType { public: static constexpr int32_t kInitialSchemaId = 0; @@ -75,24 +74,94 @@ class ICEBERG_EXPORT Schema : public StructType, Result>> FindFieldById( int32_t field_id) const; - /// \brief Creates a projection schema for a subset of columns, selected by name. - Result> select(const std::vector& names, + /// \brief Creates a projected schema from selected field names. + /// + /// Selects fields by their names using dot notation for nested fields. + /// Supports both canonical names (e.g., "user.address.street") and short names + /// (e.g., "user.street" for map values, "list.element" for list elements). + /// + /// \param names Field names to select (supports nested field paths) + /// \param case_sensitive Whether name matching is case-sensitive (default: true) + /// \return Projected schema containing only the specified fields + /// + /// \example + /// \code + /// // Original schema: struct { + /// // id: int, + /// // user: struct { + /// // name: string, + /// // address: struct { street: string, city: string } + /// // } + /// // } + /// + /// // Select by names - you must specify the exact path + /// auto result1 = schema->Select({"id", "user.name"}); + /// // Result: struct { id: int, user: struct { name: string } } + /// + /// auto result2 = schema->Select({"user.address.street"}); + /// // Result: struct { user: struct { address: struct { street: string } } } + /// + /// auto result3 = schema->Select({"user.name"}); + /// Result: struct { user: struct { name: string } } + /// \endcode + Result> Select(std::span names, bool case_sensitive = true) const; - /// \brief Creates a projection schema for a subset of columns, selected by name. - Result> select( + /// \brief Creates a projected schema from selected field names. + Result> Select( const std::initializer_list& names, bool case_sensitive = true) const; - /// \brief Creates a projection schema for a subset of columns, selected by name. + /// \brief Creates a projected schema from selected field names. template - Result> select(Args&&... names, + Result> Select(Args&&... names, bool case_sensitive = true) const { - static_assert((std::is_convertible_v && ...), + static_assert(((std::is_convertible_v || + std::is_convertible_v) && + ...), "All arguments must be convertible to std::string"); - return select({std::string(names)...}, case_sensitive); + return select({(names)...}, case_sensitive); } - Result> project(std::unordered_set& ids) const; + /// \brief Creates a projected schema from selected field IDs. + /// + /// Selects fields by their numeric IDs. More efficient than Select() when you + /// already know the field IDs. Handles recursive projection of nested structs. + /// + /// \param field_ids Set of field IDs to select + /// \return Projected schema containing only the specified fields + /// + /// \note When a struct field ID is specified: + /// - If nested field IDs are also in field_ids, they are recursively projected + /// - If no nested field IDs are in field_ids, an empty struct is included + /// - List/Map types cannot be explicitly projected (returns error) + /// + /// \example + /// \code + /// // Original schema with field IDs: + /// // struct { + /// // 1: id: int, + /// // 2: user: struct { + /// // 3: name: string, + /// // 4: address: struct { 5: street: string, 6: city: string } + /// // } + /// // } + /// + /// // Project by IDs - recursive behavior for structs + /// std::unordered_set ids1 = {1, 2, 3}; // id, user, user.name + /// auto result1 = schema->Project(ids1); + /// // Result: struct { id: int, user: struct { name: string } } + /// + /// std::unordered_set ids2 = {2}; // Only user struct + /// auto result2 = schema->Project(ids2); + /// // Result: struct { user: struct {} } // Empty struct because no nested fields + /// selected + /// + /// std::unordered_set ids3 = {2, 5}; // user struct + street field + /// auto result3 = schema->Project(ids3); + /// // Result: struct { user: struct { address: struct { street: string } } } + /// \endcode + Result> Project( + std::unordered_set& field_ids) const; friend bool operator==(const Schema& lhs, const Schema& rhs) { return lhs.Equals(rhs); } diff --git a/test/schema_test.cc b/test/schema_test.cc index 34350551f..2e79aa1f3 100644 --- a/test/schema_test.cc +++ b/test/schema_test.cc @@ -509,12 +509,12 @@ TEST_P(SelectParamTest, SelectFields) { const auto& param = GetParam(); auto input_schema = param.create_schema(); - auto result = input_schema->select(param.select_fields, param.case_sensitive); + auto result = input_schema->Select(param.select_fields, param.case_sensitive); if (param.should_succeed) { ASSERT_TRUE(result.has_value()) << "Select should succeed for: " << param.test_name; - auto actual_schema = result.value(); + auto actual_schema = std::move(result.value()); auto expected_schema = param.expected_schema(); ASSERT_EQ(*actual_schema, *expected_schema) << "Schema mismatch for: " << param.test_name; @@ -938,12 +938,12 @@ TEST_P(ProjectParamTest, ProjectFields) { const auto& param = GetParam(); auto input_schema = param.create_schema(); auto selected_ids = param.selected_ids; - auto result = input_schema->project(selected_ids); + auto result = input_schema->Project(selected_ids); if (param.should_succeed) { ASSERT_TRUE(result.has_value()) << "Project should succeed for: " << param.test_name; - auto actual_schema = result.value(); + auto actual_schema = std::move(result.value()); auto expected_schema = param.expected_schema(); ASSERT_EQ(*actual_schema, *expected_schema) << "Schema mismatch for: " << param.test_name; From 50ea9c3a2e0772845486e386a2d428a55bd2569f Mon Sep 17 00:00:00 2001 From: nullccxsy <32149055912@qq.com> Date: Sat, 6 Sep 2025 17:51:28 +0800 Subject: [PATCH 04/10] fix cpp-linter --- src/iceberg/schema.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/iceberg/schema.cc b/src/iceberg/schema.cc index 20296f131..54bcc2053 100644 --- a/src/iceberg/schema.cc +++ b/src/iceberg/schema.cc @@ -447,11 +447,11 @@ class PruneColumnVisitor { } Status ProjectStruct(const SchemaField& field, std::unique_ptr* out) const { - if (out && (*out)->type_id() != TypeId::kStruct) { + if (*out && (*out)->type_id() != TypeId::kStruct) { return InvalidArgument("Project struct {}:{}, but result is not StructType", field.name(), field.field_id()); } - if (out == nullptr) { + if (*out == nullptr) { *out = std::make_unique(std::vector{}); } return {}; From 564e933ac8303a5a8eb275d2c957379f3148327b Mon Sep 17 00:00:00 2001 From: nullccxsy <32149055912@qq.com> Date: Tue, 9 Sep 2025 17:51:16 +0800 Subject: [PATCH 05/10] refactor: enhance PruneColumnVisitor for improved type handling and error reporting - Updated the PruneColumnVisitor class to utilize shared pointers for type results, enhancing memory management. - Refined Visit methods for StructType, ListType, and MapType to improve clarity and error handling, particularly for cases involving invalid projections. --- src/iceberg/schema.cc | 236 +++++++++++++++++++----------------------- src/iceberg/schema.h | 47 --------- test/schema_test.cc | 208 +++++++++++++++++++++++++++++++++---- 3 files changed, 290 insertions(+), 201 deletions(-) diff --git a/src/iceberg/schema.cc b/src/iceberg/schema.cc index 54bcc2053..5c4f988e0 100644 --- a/src/iceberg/schema.cc +++ b/src/iceberg/schema.cc @@ -269,192 +269,167 @@ void NameToIdVisitor::Finish() { /// - select_full_types=false: Recursively project nested fields within selected structs /// /// \warning Error conditions that will cause projection to fail: -/// - Attempting to explicitly project List or Map types (returns InvalidArgument) -/// - Projecting a List when element result is null (returns InvalidArgument) -/// - Projecting a Map without a defined map value type (returns InvalidArgument) -/// - Projecting a struct when result is not StructType (returns InvalidArgument) +/// - Project or Select a Map with just key or value (returns InvalidArgument) class PruneColumnVisitor { public: PruneColumnVisitor(const std::unordered_set& selected_ids, bool select_full_types) : selected_ids_(selected_ids), select_full_types_(select_full_types) {} - Status Visit(const StructType& type, std::unique_ptr* out) const { - std::vector> selected_types; - for (const auto& field : type.fields()) { - std::unique_ptr child_result; + Result> Visit(const std::shared_ptr& type) const { + switch (type->type_id()) { + case TypeId::kStruct: { + auto struct_type = std::static_pointer_cast(type); + return Visit(struct_type); + } + case TypeId::kList: { + auto list_type = std::static_pointer_cast(type); + return Visit(list_type); + } + case TypeId::kMap: { + auto map_type = std::static_pointer_cast(type); + return Visit(map_type); + } + default: { + auto primitive_type = std::static_pointer_cast(type); + return Visit(primitive_type); + } + } + } + + Result> Visit(const std::shared_ptr& type) const { + std::vector> selected_types; + for (const auto& field : type->fields()) { if (select_full_types_ and selected_ids_.contains(field.field_id())) { selected_types.emplace_back(field.type()); continue; } - ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*field.type(), this, &child_result)); + ICEBERG_ASSIGN_OR_RAISE(auto child_result, Visit(field.type())); if (selected_ids_.contains(field.field_id())) { - // select - if (field.type()->type_id() == TypeId::kStruct) { - // project(kstruct) - ICEBERG_RETURN_UNEXPECTED(ProjectStruct(field, &child_result)); - selected_types.emplace_back(std::move(child_result)); - } else { - // project(list, map, primitive) - if (!field.type()->is_primitive()) { - return InvalidArgument( - "Cannot explicitly project List or Map types, {}:{} of type {} was " - "selected", - field.field_id(), field.name(), field.type()->ToString()); - } - selected_types.emplace_back(field.type()); - } - } else if (child_result) { - // project, select - selected_types.emplace_back(std::move(child_result)); + selected_types.emplace_back( + field.type()->is_primitive() ? field.type() : std::move(child_result)); } else { - // project, select - selected_types.emplace_back(nullptr); + selected_types.emplace_back(std::move(child_result)); } } bool same_types = true; std::vector selected_fields; - const auto& fields = type.fields(); + const auto& fields = type->fields(); for (size_t i = 0; i < fields.size(); i++) { if (fields[i].type() == selected_types[i]) { - selected_fields.emplace_back(fields[i]); + selected_fields.emplace_back(std::move(fields[i])); } else if (selected_types[i]) { same_types = false; selected_fields.emplace_back(fields[i].field_id(), std::string(fields[i].name()), - std::const_pointer_cast(selected_types[i]), - fields[i].optional(), std::string(fields[i].doc())); + std::move(selected_types[i]), fields[i].optional(), + std::string(fields[i].doc())); } } if (!selected_fields.empty()) { if (same_types && selected_fields.size() == fields.size()) { - *out = std::make_unique(type); + return type; } else { - *out = std::make_unique(std::move(selected_fields)); + return std::make_shared(std::move(selected_fields)); } } - return {}; + return nullptr; } - Status Visit(const ListType& type, std::unique_ptr* out) const { - const auto& element_field = type.fields()[0]; - + Result> Visit(const std::shared_ptr& type) const { + const auto& element_field = type->fields()[0]; if (select_full_types_ and selected_ids_.contains(element_field.field_id())) { - *out = std::make_unique(type); - return {}; + return type; } - std::unique_ptr child_result; - ICEBERG_RETURN_UNEXPECTED( - VisitTypeInline(*element_field.type(), this, &child_result)); + ICEBERG_ASSIGN_OR_RAISE(auto child_result, Visit(element_field.type())); + std::shared_ptr out; if (selected_ids_.contains(element_field.field_id())) { - if (element_field.type()->type_id() == TypeId::kStruct) { - ICEBERG_RETURN_UNEXPECTED(ProjectStruct(element_field, &child_result)); - ICEBERG_RETURN_UNEXPECTED( - ProjectList(element_field, std::move(child_result), out)); + if (element_field.type()->is_primitive()) { + out = std::make_shared(element_field); } else { - if (!element_field.type()->is_primitive()) { - return InvalidArgument( - "Cannot explicitly project List or Map types, List element {} of type {} " - "was " - "selected", - element_field.field_id(), element_field.name()); - } - *out = std::make_unique(element_field); + ICEBERG_ASSIGN_OR_RAISE(out, ProjectList(element_field, std::move(child_result))); } } else if (child_result) { - ICEBERG_RETURN_UNEXPECTED(ProjectList(element_field, std::move(child_result), out)); + ICEBERG_ASSIGN_OR_RAISE(out, ProjectList(element_field, std::move(child_result))); } - - return {}; + return out; } - Status Visit(const MapType& type, std::unique_ptr* out) const { - const auto& key_field = type.fields()[0]; - const auto& value_field = type.fields()[1]; + Result> Visit(const std::shared_ptr& type) const { + const auto& key_field = type->fields()[0]; + const auto& value_field = type->fields()[1]; - if (select_full_types_ and selected_ids_.contains(value_field.field_id())) { - *out = std::make_unique(type); - return {}; + if (select_full_types_ and selected_ids_.contains(key_field.field_id()) and + selected_ids_.contains(value_field.field_id())) { + return type; } - std::unique_ptr key_result; - ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*key_field.type(), this, &key_result)); + ICEBERG_ASSIGN_OR_RAISE(auto key_result, Visit(key_field.type())); + ICEBERG_ASSIGN_OR_RAISE(auto value_result, Visit(value_field.type())); - std::unique_ptr value_result; - ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*value_field.type(), this, &value_result)); + if (selected_ids_.contains(value_field.field_id()) and + value_field.type()->is_primitive()) { + value_result = value_field.type(); + } + if (selected_ids_.contains(key_field.field_id()) and + key_field.type()->is_primitive()) { + key_result = key_field.type(); + } - if (selected_ids_.contains(value_field.field_id())) { - if (value_field.type()->type_id() == TypeId::kStruct) { - ICEBERG_RETURN_UNEXPECTED(ProjectStruct(value_field, &value_result)); - ICEBERG_RETURN_UNEXPECTED( - ProjectMap(key_field, value_field, std::move(value_result), out)); - } else { - if (!value_field.type()->is_primitive()) { - return InvalidArgument( - "Cannot explicitly project List or Map types, Map value {} of type {} was " - "selected", - value_field.field_id(), type.ToString()); - } - *out = std::make_unique(type); - } - } else if (value_result) { - ICEBERG_RETURN_UNEXPECTED( - ProjectMap(key_field, value_field, std::move(value_result), out)); - } else if (selected_ids_.contains(key_field.field_id())) { - *out = std::make_unique(type); + if (!key_result && !value_result) { + return nullptr; } - return {}; + if (!key_result || !value_result) { + return InvalidArgument( + "Cannot project Map with only key or value: key={}, value={}", + key_result ? "present" : "null", value_result ? "present" : "null"); + } + + ICEBERG_ASSIGN_OR_RAISE(auto out, + ProjectMap(key_field, value_field, key_result, value_result)); + return out; } - Status Visit(const PrimitiveType& type, std::unique_ptr* result) const { - return {}; + Result> Visit(const std::shared_ptr& type) const { + return nullptr; } - Status ProjectList(const SchemaField& element_field, std::shared_ptr child_result, - std::unique_ptr* out) const { + Result> ProjectList(const SchemaField& element_field, + std::shared_ptr child_result) const { if (!child_result) { - return InvalidArgument("Cannot project a list when the element result is null"); + return nullptr; } if (element_field.type() == child_result) { - *out = std::make_unique(element_field); - } else { - *out = std::make_unique(element_field.field_id(), child_result, - element_field.optional()); + return std::make_shared(element_field); } - return {}; + return std::make_shared(element_field.field_id(), child_result, + element_field.optional()); } - Status ProjectMap(const SchemaField& key_field, const SchemaField& value_field, - std::shared_ptr value_result, - std::unique_ptr* out) const { - if (!value_result) { - return InvalidArgument( - "Attempted to project a map without a defined map value type"); - } - if (value_field.type() == value_result) { - *out = std::make_unique(key_field, value_field); - } else { - *out = std::make_unique( - key_field, SchemaField(value_field.field_id(), std::string(value_field.name()), - value_result, value_field.optional())); + Result> ProjectMap(const SchemaField& key_field, + const SchemaField& value_field, + std::shared_ptr key_result, + std::shared_ptr value_result) const { + SchemaField projected_key_field = key_field; + if (key_field.type() != key_result) { + projected_key_field = + SchemaField(key_field.field_id(), std::string(key_field.name()), key_result, + key_field.optional()); } - return {}; - } - Status ProjectStruct(const SchemaField& field, std::unique_ptr* out) const { - if (*out && (*out)->type_id() != TypeId::kStruct) { - return InvalidArgument("Project struct {}:{}, but result is not StructType", - field.name(), field.field_id()); + SchemaField projected_value_field = value_field; + if (value_field.type() != value_result) { + projected_value_field = + SchemaField(value_field.field_id(), std::string(value_field.name()), + value_result, value_field.optional()); } - if (*out == nullptr) { - *out = std::make_unique(std::vector{}); - } - return {}; + + return std::make_shared(projected_key_field, projected_value_field); } private: @@ -487,9 +462,10 @@ Result> Schema::SelectInternal( } } - std::unique_ptr result; PruneColumnVisitor visitor(selected_ids, /*select_full_types=*/true); - ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*this, &visitor, &result)); + auto self = std::shared_ptr(this, [](const StructType*) {}); + ICEBERG_ASSIGN_OR_RAISE(auto result, + visitor.Visit(std::const_pointer_cast(self))); if (!result) { return std::make_unique(std::vector{}, schema_id_); @@ -507,9 +483,9 @@ Result> Schema::SelectInternal( Result> Schema::Project( std::unordered_set& field_ids) const { PruneColumnVisitor visitor(field_ids, /*select_full_types=*/false); - - std::unique_ptr result; - ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*this, &visitor, &result)); + auto self = std::shared_ptr(this, [](const StructType*) {}); + ICEBERG_ASSIGN_OR_RAISE(auto result, + visitor.Visit(std::const_pointer_cast(self))); if (!result) { return std::make_unique(std::vector{}, schema_id_); @@ -519,10 +495,8 @@ Result> Schema::Project( return InvalidSchema("Projected type must be a struct type"); } - const auto& projected_struct = internal::checked_cast(*result); - std::vector fields_vec(projected_struct.fields().begin(), - projected_struct.fields().end()); - return std::make_unique(std::move(fields_vec), schema_id_); + auto& projected_struct = internal::checked_cast(*result); + return FromStructType(std::move(const_cast(projected_struct)), schema_id_); } } // namespace iceberg diff --git a/src/iceberg/schema.h b/src/iceberg/schema.h index f79689b49..0c2839379 100644 --- a/src/iceberg/schema.h +++ b/src/iceberg/schema.h @@ -83,27 +83,6 @@ class ICEBERG_EXPORT Schema : public StructType { /// \param names Field names to select (supports nested field paths) /// \param case_sensitive Whether name matching is case-sensitive (default: true) /// \return Projected schema containing only the specified fields - /// - /// \example - /// \code - /// // Original schema: struct { - /// // id: int, - /// // user: struct { - /// // name: string, - /// // address: struct { street: string, city: string } - /// // } - /// // } - /// - /// // Select by names - you must specify the exact path - /// auto result1 = schema->Select({"id", "user.name"}); - /// // Result: struct { id: int, user: struct { name: string } } - /// - /// auto result2 = schema->Select({"user.address.street"}); - /// // Result: struct { user: struct { address: struct { street: string } } } - /// - /// auto result3 = schema->Select({"user.name"}); - /// Result: struct { user: struct { name: string } } - /// \endcode Result> Select(std::span names, bool case_sensitive = true) const; @@ -134,32 +113,6 @@ class ICEBERG_EXPORT Schema : public StructType { /// - If nested field IDs are also in field_ids, they are recursively projected /// - If no nested field IDs are in field_ids, an empty struct is included /// - List/Map types cannot be explicitly projected (returns error) - /// - /// \example - /// \code - /// // Original schema with field IDs: - /// // struct { - /// // 1: id: int, - /// // 2: user: struct { - /// // 3: name: string, - /// // 4: address: struct { 5: street: string, 6: city: string } - /// // } - /// // } - /// - /// // Project by IDs - recursive behavior for structs - /// std::unordered_set ids1 = {1, 2, 3}; // id, user, user.name - /// auto result1 = schema->Project(ids1); - /// // Result: struct { id: int, user: struct { name: string } } - /// - /// std::unordered_set ids2 = {2}; // Only user struct - /// auto result2 = schema->Project(ids2); - /// // Result: struct { user: struct {} } // Empty struct because no nested fields - /// selected - /// - /// std::unordered_set ids3 = {2, 5}; // user struct + street field - /// auto result3 = schema->Project(ids3); - /// // Result: struct { user: struct { address: struct { street: string } } } - /// \endcode Result> Project( std::unordered_set& field_ids) const; diff --git a/test/schema_test.cc b/test/schema_test.cc index 2e79aa1f3..4099972f8 100644 --- a/test/schema_test.cc +++ b/test/schema_test.cc @@ -1240,44 +1240,206 @@ INSTANTIATE_TEST_SUITE_P( ProjectMapErrorTestCases, ProjectParamTest, ::testing::Values( ProjectTestParam{ - .test_name = "ProjectMapTypeError", + .test_name = "ProjectMapWithOnlyKey", .create_schema = []() { - auto map_key = std::make_unique( - 1, "key", iceberg::string(), false); - auto map_value = std::make_unique( + // Create a map with key and value fields + auto key_field = std::make_unique( + 1, "key", iceberg::int32(), false); + auto value_field = std::make_unique( 2, "value", iceberg::string(), false); auto map_type = - std::make_shared(*map_key, *map_value); - auto map_field = std::make_unique( - 3, "string_map", map_type, false); - auto id_field = std::make_unique( - 4, "id", iceberg::int32(), false); + std::make_shared(*key_field, *value_field); + auto map_field = std::make_unique(3, "map_field", + map_type, true); return std::make_shared( - std::vector{*id_field, *map_field}, 100); + std::vector{*map_field}, 100); }, - .selected_ids = {3}, + .selected_ids = {1}, // Only select key field, not value field .expected_schema = []() { return nullptr; }, .should_succeed = false, .expected_error_message = - R"(Cannot explicitly project List or Map types, 3:string_map of type map was selected)"}, + "Cannot project Map with only key or value: key=present, value=null"}, ProjectTestParam{ - .test_name = "ProjectListTypeError", + .test_name = "ProjectMapWithOnlyValue", .create_schema = []() { - auto list_element = std::make_unique( - 1, "element", iceberg::int32(), false); - auto list_type = std::make_shared(*list_element); - auto list_field = std::make_unique( - 2, "int_list", list_type, false); - auto id_field = std::make_unique( - 3, "id", iceberg::int32(), false); + auto key_field = std::make_unique( + 1, "key", iceberg::int32(), false); + auto value_field = std::make_unique( + 2, "value", iceberg::string(), false); + auto map_type = + std::make_shared(*key_field, *value_field); + auto map_field = std::make_unique(3, "map_field", + map_type, true); return std::make_shared( - std::vector{*id_field, *list_field}, 100); + std::vector{*map_field}, 100); }, - .selected_ids = {2}, + .selected_ids = {2}, // Only select value field, not key field .expected_schema = []() { return nullptr; }, .should_succeed = false, .expected_error_message = - R"(Cannot explicitly project List or Map types, 2:int_list of type list was selected)"})); + "Cannot project Map with only key or value: key=null, value=present"})); + +INSTANTIATE_TEST_SUITE_P( + ProjectListAndMapTestCases, ProjectParamTest, + ::testing::Values( + ProjectTestParam{ + .test_name = "ProjectListElement", + .create_schema = + []() { + // Create a list with nested struct element + auto nested_field1 = std::make_unique( + 1, "name", iceberg::string(), true); + auto nested_field2 = std::make_unique( + 2, "age", iceberg::int32(), true); + auto struct_type = std::make_shared( + std::vector{*nested_field1, *nested_field2}); + auto element_field = std::make_unique( + 3, "element", struct_type, false); + auto list_type = std::make_shared(*element_field); + auto list_field = std::make_unique( + 4, "list_field", list_type, true); + return std::make_shared( + std::vector{*list_field}, 100); + }, + .selected_ids = {1}, // Only select name field from list element + .expected_schema = + []() { + auto name_field = std::make_unique( + 1, "name", iceberg::string(), true); + auto struct_type = std::make_shared( + std::vector{*name_field}); + auto element_field = std::make_unique( + 3, "element", struct_type, false); + auto list_type = std::make_shared(*element_field); + auto list_field = std::make_unique( + 4, "list_field", list_type, true); + return std::make_shared( + std::vector{*list_field}, 100); + }, + .should_succeed = true}, + + ProjectTestParam{ + .test_name = "ProjectListOfMap", + .create_schema = + []() { + // Create a list of map with nested fields + auto map_key_field = std::make_unique( + 1, "key", iceberg::string(), false); + auto map_value_field1 = std::make_unique( + 2, "value_name", iceberg::string(), false); + auto map_value_field2 = std::make_unique( + 3, "value_age", iceberg::int32(), false); + auto map_value_struct = std::make_shared( + std::vector{*map_value_field1, + *map_value_field2}); + auto map_value_field = std::make_unique( + 4, "value", map_value_struct, false); + auto map_type = std::make_shared(*map_key_field, + *map_value_field); + auto list_element = std::make_unique( + 5, "element", map_type, false); + auto list_type = std::make_shared(*list_element); + auto list_field = std::make_unique( + 6, "list_field", list_type, true); + return std::make_shared( + std::vector{*list_field}, 100); + }, + .selected_ids = {1, 2}, + .expected_schema = + []() { + auto map_key_field = std::make_unique( + 1, "key", iceberg::string(), false); + auto map_value_field1 = std::make_unique( + 2, "value_name", iceberg::string(), false); + auto map_value_struct = std::make_shared( + std::vector{*map_value_field1}); + auto map_value_field = std::make_unique( + 4, "value", map_value_struct, false); + auto map_type = std::make_shared(*map_key_field, + *map_value_field); + auto list_element = std::make_unique( + 5, "element", map_type, false); + auto list_type = std::make_shared(*list_element); + auto list_field = std::make_unique( + 6, "list_field", list_type, true); + return std::make_shared( + std::vector{*list_field}, 100); + }, + .should_succeed = true}, + + ProjectTestParam{ + .test_name = "ProjectMapKeyAndValue", + .create_schema = + []() { + auto key_field1 = std::make_unique( + 1, "key_id", iceberg::int32(), false); + auto key_field2 = std::make_unique( + 2, "key_name", iceberg::string(), false); + auto key_struct = std::make_shared( + std::vector{*key_field1, *key_field2}); + auto key_field = + std::make_unique(3, "key", key_struct, false); + + auto value_field1 = std::make_unique( + 4, "value_id", iceberg::int32(), false); + auto value_field2 = std::make_unique( + 5, "value_name", iceberg::string(), false); + auto value_struct = std::make_shared( + std::vector{*value_field1, *value_field2}); + auto value_field = std::make_unique( + 6, "value", value_struct, false); + + auto map_type = + std::make_shared(*key_field, *value_field); + auto map_field = std::make_unique(7, "map_field", + map_type, true); + return std::make_shared( + std::vector{*map_field}, 100); + }, + .selected_ids = {1, 4}, + .expected_schema = + []() { + auto key_field1 = std::make_unique( + 1, "key_id", iceberg::int32(), false); + auto key_struct = std::make_shared( + std::vector{*key_field1}); + auto key_field = + std::make_unique(3, "key", key_struct, false); + + auto value_field1 = std::make_unique( + 4, "value_id", iceberg::int32(), false); + auto value_struct = std::make_shared( + std::vector{*value_field1}); + auto value_field = std::make_unique( + 6, "value", value_struct, false); + + auto map_type = + std::make_shared(*key_field, *value_field); + auto map_field = std::make_unique(7, "map_field", + map_type, true); + return std::make_shared( + std::vector{*map_field}, 100); + }, + .should_succeed = true}, + + ProjectTestParam{.test_name = "ProjectEmptyResult", + .create_schema = + []() { + auto field1 = std::make_unique( + 1, "id", iceberg::int32(), false); + auto field2 = std::make_unique( + 2, "name", iceberg::string(), true); + return std::make_shared( + std::vector{*field1, *field2}, + 100); + }, + .selected_ids = {999}, // Select non-existent field + .expected_schema = + []() { + return std::make_shared( + std::vector{}, 100); + }, + .should_succeed = true})); From 619e527da3c66a418c67488a5468c6a684e9ed7b Mon Sep 17 00:00:00 2001 From: nullccxsy <32149055912@qq.com> Date: Wed, 10 Sep 2025 11:59:28 +0800 Subject: [PATCH 06/10] fix review --- src/iceberg/schema.cc | 172 ++++++++++++------------------------------ test/schema_test.cc | 91 +++++++--------------- 2 files changed, 76 insertions(+), 187 deletions(-) diff --git a/src/iceberg/schema.cc b/src/iceberg/schema.cc index 5c4f988e0..670df2050 100644 --- a/src/iceberg/schema.cc +++ b/src/iceberg/schema.cc @@ -264,12 +264,11 @@ void NameToIdVisitor::Finish() { /// \brief Visitor class for pruning schema columns based on selected field IDs. /// /// This visitor traverses a schema and creates a projected version containing only -/// the specified fields. It handles different projection modes: -/// - select_full_types=true: Include entire fields when their ID is selected -/// - select_full_types=false: Recursively project nested fields within selected structs +/// the specified fields. When `select_full_types` is true, a field with all its +/// sub-fields are selected if its field-id has been selected; otherwise, only leaf +/// fields of selected field-ids are selected. /// -/// \warning Error conditions that will cause projection to fail: -/// - Project or Select a Map with just key or value (returns InvalidArgument) +/// \note It returns an error when projection is not successful. class PruneColumnVisitor { public: PruneColumnVisitor(const std::unordered_set& selected_ids, @@ -279,157 +278,80 @@ class PruneColumnVisitor { Result> Visit(const std::shared_ptr& type) const { switch (type->type_id()) { case TypeId::kStruct: { - auto struct_type = std::static_pointer_cast(type); - return Visit(struct_type); + return Visit(internal::checked_pointer_cast(type)); } case TypeId::kList: { - auto list_type = std::static_pointer_cast(type); - return Visit(list_type); + return Visit(internal::checked_pointer_cast(type)); } case TypeId::kMap: { - auto map_type = std::static_pointer_cast(type); - return Visit(map_type); + return Visit(internal::checked_pointer_cast(type)); } default: { - auto primitive_type = std::static_pointer_cast(type); - return Visit(primitive_type); + return nullptr; } } } - Result> Visit(const std::shared_ptr& type) const { - std::vector> selected_types; - for (const auto& field : type->fields()) { - if (select_full_types_ and selected_ids_.contains(field.field_id())) { - selected_types.emplace_back(field.type()); - continue; - } - ICEBERG_ASSIGN_OR_RAISE(auto child_result, Visit(field.type())); - if (selected_ids_.contains(field.field_id())) { - selected_types.emplace_back( - field.type()->is_primitive() ? field.type() : std::move(child_result)); - } else { - selected_types.emplace_back(std::move(child_result)); - } + Result> Visit(const SchemaField& field) const { + if (selected_ids_.contains(field.field_id())) { + return (select_full_types_ || field.type()->is_primitive()) ? field.type() + : Visit(field.type()); } + return Visit(field.type()); + } + static SchemaField MakeField(const SchemaField& field, std::shared_ptr type) { + return {field.field_id(), std::string(field.name()), std::move(type), + field.optional(), std::string(field.doc())}; + } + + Result> Visit(const std::shared_ptr& type) const { bool same_types = true; std::vector selected_fields; - const auto& fields = type->fields(); - for (size_t i = 0; i < fields.size(); i++) { - if (fields[i].type() == selected_types[i]) { - selected_fields.emplace_back(std::move(fields[i])); - } else if (selected_types[i]) { - same_types = false; - selected_fields.emplace_back(fields[i].field_id(), std::string(fields[i].name()), - std::move(selected_types[i]), fields[i].optional(), - std::string(fields[i].doc())); + for (const auto& field : type->fields()) { + ICEBERG_ASSIGN_OR_RAISE(auto child_type, Visit(field)); + if (child_type) { + same_types = same_types && (child_type == field.type()); + selected_fields.emplace_back(MakeField(field, std::move(child_type))); } } - if (!selected_fields.empty()) { - if (same_types && selected_fields.size() == fields.size()) { - return type; - } else { - return std::make_shared(std::move(selected_fields)); - } + if (selected_fields.empty()) { + return nullptr; + } else if (same_types and selected_fields.size() == type->fields().size()) { + return type; } - - return nullptr; + return std::make_shared(std::move(selected_fields)); } Result> Visit(const std::shared_ptr& type) const { - const auto& element_field = type->fields()[0]; - if (select_full_types_ and selected_ids_.contains(element_field.field_id())) { + const auto& elem_field = type->fields()[0]; + ICEBERG_ASSIGN_OR_RAISE(auto elem_type, Visit(elem_field)); + if (elem_type == nullptr) { + return nullptr; + } else if (elem_type == elem_field.type()) { return type; } - - ICEBERG_ASSIGN_OR_RAISE(auto child_result, Visit(element_field.type())); - - std::shared_ptr out; - if (selected_ids_.contains(element_field.field_id())) { - if (element_field.type()->is_primitive()) { - out = std::make_shared(element_field); - } else { - ICEBERG_ASSIGN_OR_RAISE(out, ProjectList(element_field, std::move(child_result))); - } - } else if (child_result) { - ICEBERG_ASSIGN_OR_RAISE(out, ProjectList(element_field, std::move(child_result))); - } - return out; + return std::make_shared(MakeField(elem_field, std::move(elem_type))); } Result> Visit(const std::shared_ptr& type) const { const auto& key_field = type->fields()[0]; const auto& value_field = type->fields()[1]; + ICEBERG_ASSIGN_OR_RAISE(auto key_type, Visit(key_field)); + ICEBERG_ASSIGN_OR_RAISE(auto value_type, Visit(value_field)); - if (select_full_types_ and selected_ids_.contains(key_field.field_id()) and - selected_ids_.contains(value_field.field_id())) { - return type; - } - - ICEBERG_ASSIGN_OR_RAISE(auto key_result, Visit(key_field.type())); - ICEBERG_ASSIGN_OR_RAISE(auto value_result, Visit(value_field.type())); - - if (selected_ids_.contains(value_field.field_id()) and - value_field.type()->is_primitive()) { - value_result = value_field.type(); - } - if (selected_ids_.contains(key_field.field_id()) and - key_field.type()->is_primitive()) { - key_result = key_field.type(); - } - - if (!key_result && !value_result) { + if (key_type == nullptr && value_type == nullptr) { return nullptr; + } else if (value_type == value_field.type() && + (key_type == key_field.type() || key_type == nullptr)) { + return type; + } else if (value_type == nullptr) { + return InvalidArgument("Cannot project Map without value field"); } - - if (!key_result || !value_result) { - return InvalidArgument( - "Cannot project Map with only key or value: key={}, value={}", - key_result ? "present" : "null", value_result ? "present" : "null"); - } - - ICEBERG_ASSIGN_OR_RAISE(auto out, - ProjectMap(key_field, value_field, key_result, value_result)); - return out; - } - - Result> Visit(const std::shared_ptr& type) const { - return nullptr; - } - - Result> ProjectList(const SchemaField& element_field, - std::shared_ptr child_result) const { - if (!child_result) { - return nullptr; - } - if (element_field.type() == child_result) { - return std::make_shared(element_field); - } - return std::make_shared(element_field.field_id(), child_result, - element_field.optional()); - } - - Result> ProjectMap(const SchemaField& key_field, - const SchemaField& value_field, - std::shared_ptr key_result, - std::shared_ptr value_result) const { - SchemaField projected_key_field = key_field; - if (key_field.type() != key_result) { - projected_key_field = - SchemaField(key_field.field_id(), std::string(key_field.name()), key_result, - key_field.optional()); - } - - SchemaField projected_value_field = value_field; - if (value_field.type() != value_result) { - projected_value_field = - SchemaField(value_field.field_id(), std::string(value_field.name()), - value_result, value_field.optional()); - } - - return std::make_shared(projected_key_field, projected_value_field); + return std::make_shared( + (key_type == nullptr ? key_field : MakeField(key_field, std::move(key_type))), + MakeField(value_field, std::move(value_type))); } private: diff --git a/test/schema_test.cc b/test/schema_test.cc index 4099972f8..42e7bd604 100644 --- a/test/schema_test.cc +++ b/test/schema_test.cc @@ -512,19 +512,16 @@ TEST_P(SelectParamTest, SelectFields) { auto result = input_schema->Select(param.select_fields, param.case_sensitive); if (param.should_succeed) { - ASSERT_TRUE(result.has_value()) << "Select should succeed for: " << param.test_name; - + ASSERT_TRUE(result.has_value()); auto actual_schema = std::move(result.value()); auto expected_schema = param.expected_schema(); ASSERT_EQ(*actual_schema, *expected_schema) << "Schema mismatch for: " << param.test_name; } else { - ASSERT_FALSE(result.has_value()) << "Select should fail for: " << param.test_name; - ASSERT_THAT(result, iceberg::IsError(iceberg::ErrorKind::kInvalidArgument)) - << "Should return InvalidArgument error for: " << param.test_name; + ASSERT_FALSE(result.has_value()); + ASSERT_THAT(result, iceberg::IsError(iceberg::ErrorKind::kInvalidArgument)); - ASSERT_THAT(result, iceberg::HasErrorMessage(param.expected_error_message)) - << "Error message mismatch for: " << param.test_name; + ASSERT_THAT(result, iceberg::HasErrorMessage(param.expected_error_message)); } } @@ -941,21 +938,14 @@ TEST_P(ProjectParamTest, ProjectFields) { auto result = input_schema->Project(selected_ids); if (param.should_succeed) { - ASSERT_TRUE(result.has_value()) << "Project should succeed for: " << param.test_name; - + ASSERT_TRUE(result.has_value()); auto actual_schema = std::move(result.value()); auto expected_schema = param.expected_schema(); - ASSERT_EQ(*actual_schema, *expected_schema) - << "Schema mismatch for: " << param.test_name; + ASSERT_EQ(*actual_schema, *expected_schema); } else { - ASSERT_FALSE(result.has_value()) << "Project should fail for: " << param.test_name; - ASSERT_THAT(result, iceberg::IsError(iceberg::ErrorKind::kInvalidArgument)) - << "Should return InvalidArgument error for: " << param.test_name; - - if (!param.expected_error_message.empty()) { - ASSERT_THAT(result, iceberg::HasErrorMessage(param.expected_error_message)) - << "Error message mismatch for: " << param.test_name; - } + ASSERT_FALSE(result.has_value()); + ASSERT_THAT(result, iceberg::IsError(iceberg::ErrorKind::kInvalidArgument)); + ASSERT_THAT(result, iceberg::HasErrorMessage(param.expected_error_message)); } } @@ -1238,49 +1228,26 @@ INSTANTIATE_TEST_SUITE_P( INSTANTIATE_TEST_SUITE_P( ProjectMapErrorTestCases, ProjectParamTest, - ::testing::Values( - ProjectTestParam{ - .test_name = "ProjectMapWithOnlyKey", - .create_schema = - []() { - // Create a map with key and value fields - auto key_field = std::make_unique( - 1, "key", iceberg::int32(), false); - auto value_field = std::make_unique( - 2, "value", iceberg::string(), false); - auto map_type = - std::make_shared(*key_field, *value_field); - auto map_field = std::make_unique(3, "map_field", - map_type, true); - return std::make_shared( - std::vector{*map_field}, 100); - }, - .selected_ids = {1}, // Only select key field, not value field - .expected_schema = []() { return nullptr; }, - .should_succeed = false, - .expected_error_message = - "Cannot project Map with only key or value: key=present, value=null"}, - - ProjectTestParam{ - .test_name = "ProjectMapWithOnlyValue", - .create_schema = - []() { - auto key_field = std::make_unique( - 1, "key", iceberg::int32(), false); - auto value_field = std::make_unique( - 2, "value", iceberg::string(), false); - auto map_type = - std::make_shared(*key_field, *value_field); - auto map_field = std::make_unique(3, "map_field", - map_type, true); - return std::make_shared( - std::vector{*map_field}, 100); - }, - .selected_ids = {2}, // Only select value field, not key field - .expected_schema = []() { return nullptr; }, - .should_succeed = false, - .expected_error_message = - "Cannot project Map with only key or value: key=null, value=present"})); + ::testing::Values(ProjectTestParam{ + .test_name = "ProjectMapWithOnlyKey", + .create_schema = + []() { + // Create a map with key and value fields + auto key_field = std::make_unique( + 1, "key", iceberg::int32(), false); + auto value_field = std::make_unique( + 2, "value", iceberg::string(), false); + auto map_type = + std::make_shared(*key_field, *value_field); + auto map_field = + std::make_unique(3, "map_field", map_type, true); + return std::make_shared( + std::vector{*map_field}, 100); + }, + .selected_ids = {1}, // Only select key field, not value field + .expected_schema = []() { return nullptr; }, + .should_succeed = false, + .expected_error_message = "Cannot project Map without value field"})); INSTANTIATE_TEST_SUITE_P( ProjectListAndMapTestCases, ProjectParamTest, From c02534701d07036c7d01494ceec908133e767a00 Mon Sep 17 00:00:00 2001 From: nullccxsy <32149055912@qq.com> Date: Thu, 11 Sep 2025 15:16:54 +0800 Subject: [PATCH 07/10] fix comments --- src/iceberg/schema.cc | 53 +- src/iceberg/schema.h | 47 +- src/iceberg/schema_internal.cc | 5 + src/iceberg/schema_internal.h | 2 + test/schema_test.cc | 997 +++++++++++++-------------------- 5 files changed, 420 insertions(+), 684 deletions(-) diff --git a/src/iceberg/schema.cc b/src/iceberg/schema.cc index 670df2050..b099c3722 100644 --- a/src/iceberg/schema.cc +++ b/src/iceberg/schema.cc @@ -277,18 +277,14 @@ class PruneColumnVisitor { Result> Visit(const std::shared_ptr& type) const { switch (type->type_id()) { - case TypeId::kStruct: { + case TypeId::kStruct: return Visit(internal::checked_pointer_cast(type)); - } - case TypeId::kList: { + case TypeId::kList: return Visit(internal::checked_pointer_cast(type)); - } - case TypeId::kMap: { + case TypeId::kMap: return Visit(internal::checked_pointer_cast(type)); - } - default: { + default: return nullptr; - } } } @@ -356,21 +352,11 @@ class PruneColumnVisitor { private: const std::unordered_set& selected_ids_; - bool select_full_types_; + const bool select_full_types_; }; -Result> Schema::Select(std::span names, - bool case_sensitive) const { - return SelectInternal(names, case_sensitive); -} - -Result> Schema::Select( - const std::initializer_list& names, bool case_sensitive) const { - return SelectInternal(names, case_sensitive); -} - -Result> Schema::SelectInternal( - std::span names, bool case_sensitive) const { +Result> Schema::Select(std::span names, + bool case_sensitive) const { const std::string kAllColumns = "*"; if (std::ranges::find(names, kAllColumns) != names.end()) { return std::make_unique(*this); @@ -385,9 +371,8 @@ Result> Schema::SelectInternal( } PruneColumnVisitor visitor(selected_ids, /*select_full_types=*/true); - auto self = std::shared_ptr(this, [](const StructType*) {}); - ICEBERG_ASSIGN_OR_RAISE(auto result, - visitor.Visit(std::const_pointer_cast(self))); + ICEBERG_ASSIGN_OR_RAISE( + auto result, visitor.Visit(std::shared_ptr(ToStructType(*this)))); if (!result) { return std::make_unique(std::vector{}, schema_id_); @@ -397,17 +382,16 @@ Result> Schema::SelectInternal( return InvalidSchema("Projected type must be a struct type"); } - auto& projected_struct = internal::checked_cast(*result); - - return FromStructType(std::move(const_cast(projected_struct)), schema_id_); + return FromStructType(std::move(const_cast( + internal::checked_cast(*result))), + schema_id_); } -Result> Schema::Project( - std::unordered_set& field_ids) const { +Result> Schema::Project( + const std::unordered_set& field_ids) const { PruneColumnVisitor visitor(field_ids, /*select_full_types=*/false); - auto self = std::shared_ptr(this, [](const StructType*) {}); - ICEBERG_ASSIGN_OR_RAISE(auto result, - visitor.Visit(std::const_pointer_cast(self))); + ICEBERG_ASSIGN_OR_RAISE( + auto result, visitor.Visit(std::shared_ptr(ToStructType(*this)))); if (!result) { return std::make_unique(std::vector{}, schema_id_); @@ -417,8 +401,9 @@ Result> Schema::Project( return InvalidSchema("Projected type must be a struct type"); } - auto& projected_struct = internal::checked_cast(*result); - return FromStructType(std::move(const_cast(projected_struct)), schema_id_); + return FromStructType(std::move(const_cast( + internal::checked_cast(*result))), + schema_id_); } } // namespace iceberg diff --git a/src/iceberg/schema.h b/src/iceberg/schema.h index 0c2839379..81f9aa394 100644 --- a/src/iceberg/schema.h +++ b/src/iceberg/schema.h @@ -76,45 +76,22 @@ class ICEBERG_EXPORT Schema : public StructType { /// \brief Creates a projected schema from selected field names. /// - /// Selects fields by their names using dot notation for nested fields. - /// Supports both canonical names (e.g., "user.address.street") and short names - /// (e.g., "user.street" for map values, "list.element" for list elements). - /// - /// \param names Field names to select (supports nested field paths) - /// \param case_sensitive Whether name matching is case-sensitive (default: true) - /// \return Projected schema containing only the specified fields - Result> Select(std::span names, - bool case_sensitive = true) const; - - /// \brief Creates a projected schema from selected field names. - Result> Select( - const std::initializer_list& names, bool case_sensitive = true) const; - - /// \brief Creates a projected schema from selected field names. - template - Result> Select(Args&&... names, - bool case_sensitive = true) const { - static_assert(((std::is_convertible_v || - std::is_convertible_v) && - ...), - "All arguments must be convertible to std::string"); - return select({(names)...}, case_sensitive); - } + /// \param names Selected field names and nested names are dot-concatenated. + /// \param case_sensitive Whether name matching is case-sensitive (default: true). + /// \return Projected schema containing only selected fields. + /// \note If the field name of a nested type has been selected, all of its + /// sub-fields will be selected. + Result> Select(std::span names, + bool case_sensitive = true) const; /// \brief Creates a projected schema from selected field IDs. /// - /// Selects fields by their numeric IDs. More efficient than Select() when you - /// already know the field IDs. Handles recursive projection of nested structs. - /// /// \param field_ids Set of field IDs to select - /// \return Projected schema containing only the specified fields - /// - /// \note When a struct field ID is specified: - /// - If nested field IDs are also in field_ids, they are recursively projected - /// - If no nested field IDs are in field_ids, an empty struct is included - /// - List/Map types cannot be explicitly projected (returns error) - Result> Project( - std::unordered_set& field_ids) const; + /// \return Projected schema containing only the specified fields. + /// \note Field ID of a nested field may not be projected unless at least + /// one of its sub-fields has been projected. + Result> Project( + const std::unordered_set& field_ids) const; friend bool operator==(const Schema& lhs, const Schema& rhs) { return lhs.Equals(rhs); } diff --git a/src/iceberg/schema_internal.cc b/src/iceberg/schema_internal.cc index beb973b28..e020a9b7b 100644 --- a/src/iceberg/schema_internal.cc +++ b/src/iceberg/schema_internal.cc @@ -325,4 +325,9 @@ Result> FromArrowSchema(const ArrowSchema& schema, return FromStructType(std::move(struct_type), schema_id); } +std::unique_ptr ToStructType(const Schema& schema) { + std::vector fields(schema.fields().begin(), schema.fields().end()); + return std::make_unique(std::move(fields)); +} + } // namespace iceberg diff --git a/src/iceberg/schema_internal.h b/src/iceberg/schema_internal.h index 8b290852a..5c7209d64 100644 --- a/src/iceberg/schema_internal.h +++ b/src/iceberg/schema_internal.h @@ -53,4 +53,6 @@ Result> FromArrowSchema(const ArrowSchema& schema, std::unique_ptr FromStructType(StructType&& struct_type, std::optional schema_id); +std::unique_ptr ToStructType(const Schema& schema); + } // namespace iceberg diff --git a/test/schema_test.cc b/test/schema_test.cc index 42e7bd604..d14226305 100644 --- a/test/schema_test.cc +++ b/test/schema_test.cc @@ -493,6 +493,232 @@ TEST(SchemaTest, NestedDuplicateFieldIdError) { ::testing::HasSubstr("Duplicate field id found: 1")); } +namespace { + +struct TestFieldFactory { + static std::unique_ptr Id(int32_t field_id = 1, + bool optional = false) { + return std::make_unique(field_id, "id", iceberg::int32(), + optional); + } + + static std::unique_ptr Name(int32_t field_id = 2, + bool optional = true) { + return std::make_unique(field_id, "name", iceberg::string(), + optional); + } + + static std::unique_ptr Age(int32_t field_id = 3, + bool optional = true) { + return std::make_unique(field_id, "age", iceberg::int32(), + optional); + } + + static std::unique_ptr Email(int32_t field_id = 4, + bool optional = true) { + return std::make_unique(field_id, "email", iceberg::string(), + optional); + } + + static std::unique_ptr Street(int32_t field_id = 11, + bool optional = true) { + return std::make_unique(field_id, "street", iceberg::string(), + optional); + } + + static std::unique_ptr City(int32_t field_id = 12, + bool optional = true) { + return std::make_unique(field_id, "city", iceberg::string(), + optional); + } + + static std::unique_ptr Zip(int32_t field_id = 13, + bool optional = true) { + return std::make_unique(field_id, "zip", iceberg::int32(), + optional); + } + + static std::unique_ptr Theme(int32_t field_id = 21, + bool optional = true) { + return std::make_unique(field_id, "theme", iceberg::string(), + optional); + } + + static std::unique_ptr Key(int32_t field_id = 31, + bool optional = false) { + return std::make_unique(field_id, "key", iceberg::int32(), + optional); + } + + static std::unique_ptr Value(int32_t field_id = 32, + bool optional = false) { + return std::make_unique(field_id, "value", iceberg::string(), + optional); + } + + static std::unique_ptr Element(int32_t field_id = 41, + bool optional = false) { + return std::make_unique(field_id, "element", iceberg::string(), + optional); + } +}; + +struct TestSchemaFactory { + static std::shared_ptr BasicSchema(int32_t schema_id = 100) { + auto field1 = TestFieldFactory::Id(1, false); + auto field2 = TestFieldFactory::Name(2, true); + auto field3 = TestFieldFactory::Age(3, true); + auto field4 = TestFieldFactory::Email(4, true); + return std::make_shared( + std::vector{*field1, *field2, *field3, *field4}, schema_id); + } + + static std::shared_ptr AddressSchema(int32_t schema_id = 200) { + auto street = TestFieldFactory::Street(11, true); + auto city = TestFieldFactory::City(12, true); + auto zip = TestFieldFactory::Zip(13, true); + auto address_type = std::make_shared( + std::vector{*street, *city, *zip}); + auto address_field = + std::make_unique(14, "address", address_type, true); + auto id_field = TestFieldFactory::Id(15, false); + auto name_field = TestFieldFactory::Name(16, true); + return std::make_shared( + std::vector{*id_field, *name_field, *address_field}, + schema_id); + } + + static std::shared_ptr NestedUserSchema(int32_t schema_id = 300) { + auto street = TestFieldFactory::Street(11, true); + auto city = TestFieldFactory::City(12, true); + auto address_type = std::make_shared( + std::vector{*street, *city}); + auto address_field = + std::make_unique(13, "address", address_type, true); + + auto name_field = TestFieldFactory::Name(14, true); + auto user_type = std::make_shared( + std::vector{*name_field, *address_field}); + auto user_field = std::make_unique(15, "user", user_type, true); + + auto id_field = TestFieldFactory::Id(16, false); + return std::make_shared( + std::vector{*id_field, *user_field}, schema_id); + } + + static std::shared_ptr MultiLevelSchema(int32_t schema_id = 400) { + auto name_field = TestFieldFactory::Name(21, true); + auto age_field = TestFieldFactory::Age(22, true); + auto profile_type = std::make_shared( + std::vector{*name_field, *age_field}); + auto profile_field = + std::make_unique(23, "profile", profile_type, true); + + auto theme_field = TestFieldFactory::Theme(24, true); + auto settings_type = std::make_shared( + std::vector{*theme_field}); + auto settings_field = + std::make_unique(25, "settings", settings_type, true); + + auto user_type = std::make_shared( + std::vector{*profile_field, *settings_field}); + auto user_field = std::make_unique(26, "user", user_type, true); + + auto id_field = TestFieldFactory::Id(27, false); + return std::make_shared( + std::vector{*id_field, *user_field}, schema_id); + } + + static std::shared_ptr ListSchema(int32_t schema_id = 500) { + auto list_element = TestFieldFactory::Element(41, false); + auto list_type = std::make_shared(*list_element); + auto tags_field = std::make_unique(42, "tags", list_type, true); + + auto name_field = TestFieldFactory::Name(43, true); + auto age_field = TestFieldFactory::Age(44, true); + auto user_type = std::make_shared( + std::vector{*name_field, *age_field}); + auto user_field = std::make_unique(45, "user", user_type, true); + + auto id_field = TestFieldFactory::Id(46, false); + return std::make_shared( + std::vector{*id_field, *tags_field, *user_field}, + schema_id); + } + + static std::shared_ptr MapSchema(int32_t schema_id = 100) { + auto key_field = TestFieldFactory::Key(31, false); + auto value_field = TestFieldFactory::Value(32, false); + auto map_type = std::make_shared(*key_field, *value_field); + auto map_field = + std::make_unique(33, "map_field", map_type, true); + return std::make_shared( + std::vector{*map_field}, schema_id); + } + + static std::shared_ptr ListWithStructElementSchema( + int32_t schema_id = 100) { + auto name_field = TestFieldFactory::Name(51, true); + auto age_field = TestFieldFactory::Age(52, true); + auto struct_type = std::make_shared( + std::vector{*name_field, *age_field}); + auto element_field = + std::make_unique(53, "element", struct_type, false); + auto list_type = std::make_shared(*element_field); + auto list_field = + std::make_unique(54, "list_field", list_type, true); + return std::make_shared( + std::vector{*list_field}, schema_id); + } + + static std::shared_ptr ListOfMapSchema(int32_t schema_id = 100) { + auto map_key_field = TestFieldFactory::Key(61, false); + auto map_value_field1 = std::make_unique( + 62, "value_name", iceberg::string(), false); + auto map_value_field2 = + std::make_unique(63, "value_age", iceberg::int32(), false); + auto map_value_struct = std::make_shared( + std::vector{*map_value_field1, *map_value_field2}); + auto map_value_field = + std::make_unique(64, "value", map_value_struct, false); + auto map_type = std::make_shared(*map_key_field, *map_value_field); + auto list_element = + std::make_unique(65, "element", map_type, false); + auto list_type = std::make_shared(*list_element); + auto list_field = + std::make_unique(66, "list_field", list_type, true); + return std::make_shared( + std::vector{*list_field}, schema_id); + } + + static std::shared_ptr ComplexMapSchema(int32_t schema_id = 100) { + auto key_field1 = + std::make_unique(71, "key_id", iceberg::int32(), false); + auto key_field2 = + std::make_unique(72, "key_name", iceberg::string(), false); + auto key_struct = std::make_shared( + std::vector{*key_field1, *key_field2}); + auto key_field = std::make_unique(73, "key", key_struct, false); + + auto value_field1 = + std::make_unique(74, "value_id", iceberg::int32(), false); + auto value_field2 = std::make_unique(75, "value_name", + iceberg::string(), false); + auto value_struct = std::make_shared( + std::vector{*value_field1, *value_field2}); + auto value_field = + std::make_unique(76, "value", value_struct, false); + + auto map_type = std::make_shared(*key_field, *value_field); + auto map_field = + std::make_unique(77, "map_field", map_type, true); + return std::make_shared( + std::vector{*map_field}, schema_id); + } +}; + +} // namespace + struct SelectTestParam { std::string test_name; std::function()> create_schema; @@ -508,19 +734,16 @@ class SelectParamTest : public ::testing::TestWithParam {}; TEST_P(SelectParamTest, SelectFields) { const auto& param = GetParam(); auto input_schema = param.create_schema(); - auto result = input_schema->Select(param.select_fields, param.case_sensitive); if (param.should_succeed) { ASSERT_TRUE(result.has_value()); auto actual_schema = std::move(result.value()); auto expected_schema = param.expected_schema(); - ASSERT_EQ(*actual_schema, *expected_schema) - << "Schema mismatch for: " << param.test_name; + ASSERT_EQ(*actual_schema, *expected_schema); } else { ASSERT_FALSE(result.has_value()); ASSERT_THAT(result, iceberg::IsError(iceberg::ErrorKind::kInvalidArgument)); - ASSERT_THAT(result, iceberg::HasErrorMessage(param.expected_error_message)); } } @@ -528,204 +751,85 @@ TEST_P(SelectParamTest, SelectFields) { INSTANTIATE_TEST_SUITE_P( SelectTestCases, SelectParamTest, ::testing::Values( - SelectTestParam{.test_name = "SelectAllColumns", - .create_schema = - []() { - auto field1 = std::make_unique( - 1, "id", iceberg::int32(), false); - auto field2 = std::make_unique( - 2, "name", iceberg::string(), true); - auto field3 = std::make_unique( - 3, "age", iceberg::int32(), true); - auto field4 = std::make_unique( - 4, "email", iceberg::string(), true); - return std::make_shared( - std::vector{*field1, *field2, - *field3, *field4}, - 100); - }, - .select_fields = {"*"}, - .expected_schema = - []() { - auto field1 = std::make_unique( - 1, "id", iceberg::int32(), false); - auto field2 = std::make_unique( - 2, "name", iceberg::string(), true); - auto field3 = std::make_unique( - 3, "age", iceberg::int32(), true); - auto field4 = std::make_unique( - 4, "email", iceberg::string(), true); - return std::make_shared( - std::vector{*field1, *field2, - *field3, *field4}, - 100); - }, - .should_succeed = true}, - - SelectTestParam{.test_name = "SelectSingleField", - .create_schema = - []() { - auto field1 = std::make_unique( - 1, "id", iceberg::int32(), false); - auto field2 = std::make_unique( - 2, "name", iceberg::string(), true); - auto field3 = std::make_unique( - 3, "age", iceberg::int32(), true); - auto field4 = std::make_unique( - 4, "email", iceberg::string(), true); - return std::make_shared( - std::vector{*field1, *field2, - *field3, *field4}, - 100); - }, - .select_fields = {"name"}, - .expected_schema = - []() { - auto field = std::make_unique( - 2, "name", iceberg::string(), true); - return std::make_shared( - std::vector{*field}, 100); - }, - .should_succeed = true}, + SelectTestParam{ + .test_name = "SelectAllColumns", + .create_schema = []() { return TestSchemaFactory::BasicSchema(100); }, + .select_fields = {"*"}, + .expected_schema = []() { return TestSchemaFactory::BasicSchema(100); }, + .should_succeed = true}, SelectTestParam{ - .test_name = "SelectMultipleFields", - .create_schema = + .test_name = "SelectSingleField", + .create_schema = []() { return TestSchemaFactory::BasicSchema(100); }, + .select_fields = {"name"}, + .expected_schema = []() { - auto field1 = std::make_unique( - 1, "id", iceberg::int32(), false); - auto field2 = std::make_unique( - 2, "name", iceberg::string(), true); - auto field3 = std::make_unique( - 3, "age", iceberg::int32(), true); - auto field4 = std::make_unique( - 4, "email", iceberg::string(), true); + auto field = TestFieldFactory::Name(2, true); return std::make_shared( - std::vector{*field1, *field2, *field3, - *field4}, - 100); + std::vector{*field}, 100); }, + .should_succeed = true}, + + SelectTestParam{ + .test_name = "SelectMultipleFields", + .create_schema = []() { return TestSchemaFactory::BasicSchema(100); }, .select_fields = {"id", "name", "age"}, .expected_schema = []() { - auto field1 = std::make_unique( - 1, "id", iceberg::int32(), false); - auto field2 = std::make_unique( - 2, "name", iceberg::string(), true); - auto field3 = std::make_unique( - 3, "age", iceberg::int32(), true); + auto field1 = TestFieldFactory::Id(1, false); + auto field2 = TestFieldFactory::Name(2, true); + auto field3 = TestFieldFactory::Age(3, true); return std::make_shared( std::vector{*field1, *field2, *field3}, 100); }, .should_succeed = true}, - SelectTestParam{.test_name = "SelectNonExistentField", - .create_schema = - []() { - auto field1 = std::make_unique( - 1, "id", iceberg::int32(), false); - auto field2 = std::make_unique( - 2, "name", iceberg::string(), true); - auto field3 = std::make_unique( - 3, "age", iceberg::int32(), true); - auto field4 = std::make_unique( - 4, "email", iceberg::string(), true); - return std::make_shared( - std::vector{*field1, *field2, - *field3, *field4}, - 100); - }, - .select_fields = {"nonexistent"}, - .expected_schema = - []() { - return std::make_shared( - std::vector{}, 100); - }, - .should_succeed = true}, - - SelectTestParam{.test_name = "SelectCaseSensitive", - .create_schema = - []() { - auto field1 = std::make_unique( - 1, "id", iceberg::int32(), false); - auto field2 = std::make_unique( - 2, "name", iceberg::string(), true); - auto field3 = std::make_unique( - 3, "age", iceberg::int32(), true); - auto field4 = std::make_unique( - 4, "email", iceberg::string(), true); - return std::make_shared( - std::vector{*field1, *field2, - *field3, *field4}, - 100); - }, - .select_fields = {"Name"}, // case-sensitive - .expected_schema = - []() { - return std::make_shared( - std::vector{}, 100); - }, - .should_succeed = true}, - - SelectTestParam{.test_name = "SelectCaseInsensitive", - .create_schema = - []() { - auto field1 = std::make_unique( - 1, "id", iceberg::int32(), false); - auto field2 = std::make_unique( - 2, "name", iceberg::string(), true); - auto field3 = std::make_unique( - 3, "age", iceberg::int32(), true); - auto field4 = std::make_unique( - 4, "email", iceberg::string(), true); - return std::make_shared( - std::vector{*field1, *field2, - *field3, *field4}, - 100); - }, - .select_fields = {"Name"}, // case-insensitive - .expected_schema = - []() { - auto field = std::make_unique( - 2, "name", iceberg::string(), true); - return std::make_shared( - std::vector{*field}, 100); - }, - .should_succeed = true, - .case_sensitive = false})); + SelectTestParam{ + .test_name = "SelectNonExistentField", + .create_schema = []() { return TestSchemaFactory::BasicSchema(100); }, + .select_fields = {"nonexistent"}, + .expected_schema = + []() { + return std::make_shared( + std::vector{}, 100); + }, + .should_succeed = true}, + + SelectTestParam{ + .test_name = "SelectCaseSensitive", + .create_schema = []() { return TestSchemaFactory::BasicSchema(100); }, + .select_fields = {"Name"}, // case-sensitive + .expected_schema = + []() { + return std::make_shared( + std::vector{}, 100); + }, + .should_succeed = true}, + + SelectTestParam{ + .test_name = "SelectCaseInsensitive", + .create_schema = []() { return TestSchemaFactory::BasicSchema(100); }, + .select_fields = {"Name"}, // case-insensitive + .expected_schema = + []() { + auto field = TestFieldFactory::Name(2, true); + return std::make_shared( + std::vector{*field}, 100); + }, + .should_succeed = true, + .case_sensitive = false})); INSTANTIATE_TEST_SUITE_P( SelectNestedTestCases, SelectParamTest, ::testing::Values( SelectTestParam{ .test_name = "SelectTopLevelFields", - .create_schema = - []() { - auto nested_field1 = std::make_unique( - 1, "street", iceberg::string(), true); - auto nested_field2 = std::make_unique( - 2, "city", iceberg::string(), true); - auto nested_field3 = std::make_unique( - 3, "zip", iceberg::int32(), true); - auto address_type = std::make_shared( - std::vector{*nested_field1, *nested_field2, - *nested_field3}); - auto field1 = std::make_unique( - 4, "id", iceberg::int32(), false); - auto field2 = std::make_unique( - 5, "name", iceberg::string(), true); - auto field3 = std::make_unique( - 6, "address", address_type, true); - return std::make_shared( - std::vector{*field1, *field2, *field3}, 200); - }, + .create_schema = []() { return TestSchemaFactory::AddressSchema(200); }, .select_fields = {"id", "name"}, .expected_schema = []() { - auto field1 = std::make_unique( - 4, "id", iceberg::int32(), false); - auto field2 = std::make_unique( - 5, "name", iceberg::string(), true); + auto field1 = TestFieldFactory::Id(15, false); + auto field2 = TestFieldFactory::Name(16, true); return std::make_shared( std::vector{*field1, *field2}, 200); }, @@ -733,35 +837,15 @@ INSTANTIATE_TEST_SUITE_P( SelectTestParam{ .test_name = "SelectNestedField", - .create_schema = - []() { - auto nested_field1 = std::make_unique( - 1, "street", iceberg::string(), true); - auto nested_field2 = std::make_unique( - 2, "city", iceberg::string(), true); - auto nested_field3 = std::make_unique( - 3, "zip", iceberg::int32(), true); - auto address_type = std::make_shared( - std::vector{*nested_field1, *nested_field2, - *nested_field3}); - auto field1 = std::make_unique( - 4, "id", iceberg::int32(), false); - auto field2 = std::make_unique( - 5, "name", iceberg::string(), true); - auto field3 = std::make_unique( - 6, "address", address_type, true); - return std::make_shared( - std::vector{*field1, *field2, *field3}, 200); - }, + .create_schema = []() { return TestSchemaFactory::AddressSchema(200); }, .select_fields = {"address.street"}, .expected_schema = []() { - auto street_field = std::make_unique( - 1, "street", iceberg::string(), true); + auto street_field = TestFieldFactory::Street(11, true); auto address_type = std::make_shared( std::vector{*street_field}); auto address_field = std::make_unique( - 6, "address", address_type, true); + 14, "address", address_type, true); return std::make_shared( std::vector{*address_field}, 200); }, @@ -772,48 +856,21 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values( SelectTestParam{ .test_name = "SelectTopLevelAndNestedFields", - .create_schema = - []() { - // {id: int, user: {name: string, address: {street: string, city: - // string}}} - auto street_field = std::make_unique( - 1, "street", iceberg::string(), true); - auto city_field = std::make_unique( - 2, "city", iceberg::string(), true); - auto address_type = std::make_shared( - std::vector{*street_field, *city_field}); - auto address_field = std::make_unique( - 3, "address", address_type, true); - - auto name_field = std::make_unique( - 4, "name", iceberg::string(), true); - auto user_type = std::make_shared( - std::vector{*name_field, *address_field}); - auto user_field = - std::make_unique(5, "user", user_type, true); - - auto id_field = std::make_unique( - 6, "id", iceberg::int32(), false); - return std::make_shared( - std::vector{*id_field, *user_field}, 300); - }, + .create_schema = []() { return TestSchemaFactory::NestedUserSchema(300); }, .select_fields = {"id", "user.name", "user.address.street"}, .expected_schema = []() { - auto id_field = std::make_unique( - 6, "id", iceberg::int32(), false); - auto name_field = std::make_unique( - 4, "name", iceberg::string(), true); - auto street_field = std::make_unique( - 1, "street", iceberg::string(), true); + auto id_field = TestFieldFactory::Id(16, false); + auto name_field = TestFieldFactory::Name(14, true); + auto street_field = TestFieldFactory::Street(11, true); auto address_type = std::make_shared( std::vector{*street_field}); auto address_field = std::make_unique( - 3, "address", address_type, true); + 13, "address", address_type, true); auto user_type = std::make_shared( std::vector{*name_field, *address_field}); auto user_field = - std::make_unique(5, "user", user_type, true); + std::make_unique(15, "user", user_type, true); return std::make_shared( std::vector{*id_field, *user_field}, 300); }, @@ -821,57 +878,26 @@ INSTANTIATE_TEST_SUITE_P( SelectTestParam{ .test_name = "SelectNestedFieldsAtDifferentLevels", - .create_schema = - []() { - // {id: int, user: {profile: {name: string, age: int}, settings: {theme: - // string}}} - auto name_field = std::make_unique( - 1, "name", iceberg::string(), true); - auto age_field = std::make_unique( - 2, "age", iceberg::int32(), true); - auto profile_type = std::make_shared( - std::vector{*name_field, *age_field}); - auto profile_field = std::make_unique( - 3, "profile", profile_type, true); - - auto theme_field = std::make_unique( - 4, "theme", iceberg::string(), true); - auto settings_type = std::make_shared( - std::vector{*theme_field}); - auto settings_field = std::make_unique( - 5, "settings", settings_type, true); - - auto user_type = std::make_shared( - std::vector{*profile_field, *settings_field}); - auto user_field = - std::make_unique(6, "user", user_type, true); - - auto id_field = std::make_unique( - 7, "id", iceberg::int32(), false); - return std::make_shared( - std::vector{*id_field, *user_field}, 400); - }, + .create_schema = []() { return TestSchemaFactory::MultiLevelSchema(400); }, .select_fields = {"user.profile.name", "user.settings.theme"}, .expected_schema = []() { - auto name_field = std::make_unique( - 1, "name", iceberg::string(), true); + auto name_field = TestFieldFactory::Name(21, true); auto profile_type = std::make_shared( std::vector{*name_field}); auto profile_field = std::make_unique( - 3, "profile", profile_type, true); + 23, "profile", profile_type, true); - auto theme_field = std::make_unique( - 4, "theme", iceberg::string(), true); + auto theme_field = TestFieldFactory::Theme(24, true); auto settings_type = std::make_shared( std::vector{*theme_field}); auto settings_field = std::make_unique( - 5, "settings", settings_type, true); + 25, "settings", settings_type, true); auto user_type = std::make_shared( std::vector{*profile_field, *settings_field}); auto user_field = - std::make_unique(6, "user", user_type, true); + std::make_unique(26, "user", user_type, true); return std::make_shared( std::vector{*user_field}, 400); }, @@ -879,42 +905,16 @@ INSTANTIATE_TEST_SUITE_P( SelectTestParam{ .test_name = "SelectListAndNestedFields", - .create_schema = - []() { - // {id: int, tags: list, user: {name: string, age: int}} - auto list_element = std::make_unique( - 1, "element", iceberg::string(), false); - auto list_type = std::make_shared(*list_element); - auto tags_field = - std::make_unique(2, "tags", list_type, true); - - auto name_field = std::make_unique( - 3, "name", iceberg::string(), true); - auto age_field = std::make_unique( - 4, "age", iceberg::int32(), true); - auto user_type = std::make_shared( - std::vector{*name_field, *age_field}); - auto user_field = - std::make_unique(5, "user", user_type, true); - - auto id_field = std::make_unique( - 6, "id", iceberg::int32(), false); - return std::make_shared( - std::vector{*id_field, *tags_field, - *user_field}, - 500); - }, + .create_schema = []() { return TestSchemaFactory::ListSchema(500); }, .select_fields = {"id", "user.name"}, .expected_schema = []() { - auto id_field = std::make_unique( - 6, "id", iceberg::int32(), false); - auto name_field = std::make_unique( - 3, "name", iceberg::string(), true); + auto id_field = TestFieldFactory::Id(46, false); + auto name_field = TestFieldFactory::Name(43, true); auto user_type = std::make_shared( std::vector{*name_field}); auto user_field = - std::make_unique(5, "user", user_type, true); + std::make_unique(45, "user", user_type, true); return std::make_shared( std::vector{*id_field, *user_field}, 500); }, @@ -954,120 +954,58 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values( ProjectTestParam{ .test_name = "ProjectAllFields", - .create_schema = - []() { - auto field1 = std::make_unique( - 1, "id", iceberg::int32(), false); - auto field2 = std::make_unique( - 2, "name", iceberg::string(), true); - auto field3 = std::make_unique( - 3, "age", iceberg::int32(), true); - return std::make_shared( - std::vector{*field1, *field2, *field3}, 100); - }, - .selected_ids = {1, 2, 3}, + .create_schema = []() { return TestSchemaFactory::BasicSchema(100); }, + .selected_ids = {1, 2, 3, 4}, + .expected_schema = []() { return TestSchemaFactory::BasicSchema(100); }, + .should_succeed = true}, + + ProjectTestParam{ + .test_name = "ProjectSingleField", + .create_schema = []() { return TestSchemaFactory::BasicSchema(100); }, + .selected_ids = {2}, .expected_schema = []() { - auto field1 = std::make_unique( - 1, "id", iceberg::int32(), false); - auto field2 = std::make_unique( - 2, "name", iceberg::string(), true); - auto field3 = std::make_unique( - 3, "age", iceberg::int32(), true); + auto field = TestFieldFactory::Name(2, true); return std::make_shared( - std::vector{*field1, *field2, *field3}, 100); + std::vector{*field}, 100); }, .should_succeed = true}, ProjectTestParam{ - .test_name = "ProjectSingleField", - .create_schema = + .test_name = "ProjectNonExistentFieldId", + .create_schema = []() { return TestSchemaFactory::BasicSchema(100); }, + .selected_ids = {999}, + .expected_schema = []() { - auto field1 = std::make_unique( - 1, "id", iceberg::int32(), false); - auto field2 = std::make_unique( - 2, "name", iceberg::string(), true); - auto field3 = std::make_unique( - 3, "age", iceberg::int32(), true); return std::make_shared( - std::vector{*field1, *field2, *field3}, 100); + std::vector{}, 100); }, - .selected_ids = {2}, + .should_succeed = true}, + + ProjectTestParam{ + .test_name = "ProjectEmptySelection", + .create_schema = []() { return TestSchemaFactory::BasicSchema(100); }, + .selected_ids = {}, .expected_schema = []() { - auto field = std::make_unique( - 2, "name", iceberg::string(), true); return std::make_shared( - std::vector{*field}, 100); + std::vector{}, 100); }, - .should_succeed = true}, - - ProjectTestParam{.test_name = "ProjectNonExistentFieldId", - .create_schema = - []() { - auto field1 = std::make_unique( - 1, "id", iceberg::int32(), false); - auto field2 = std::make_unique( - 2, "name", iceberg::string(), true); - return std::make_shared( - std::vector{*field1, *field2}, - 100); - }, - .selected_ids = {999}, - .expected_schema = - []() { - return std::make_shared( - std::vector{}, 100); - }, - .should_succeed = true}, - - ProjectTestParam{.test_name = "ProjectEmptySelection", - .create_schema = - []() { - auto field1 = std::make_unique( - 1, "id", iceberg::int32(), false); - auto field2 = std::make_unique( - 2, "name", iceberg::string(), true); - return std::make_shared( - std::vector{*field1, *field2}, - 100); - }, - .selected_ids = {}, - .expected_schema = - []() { - return std::make_shared( - std::vector{}, 100); - }, - .should_succeed = true})); + .should_succeed = true})); INSTANTIATE_TEST_SUITE_P( ProjectNestedTestCases, ProjectParamTest, ::testing::Values(ProjectTestParam{ .test_name = "ProjectNestedStructField", - .create_schema = - []() { - auto nested_field1 = std::make_unique( - 1, "street", iceberg::string(), true); - auto nested_field2 = std::make_unique( - 2, "city", iceberg::string(), true); - auto address_type = std::make_shared( - std::vector{*nested_field1, *nested_field2}); - auto field1 = std::make_unique( - 3, "id", iceberg::int32(), false); - auto field2 = std::make_unique(4, "address", - address_type, true); - return std::make_shared( - std::vector{*field1, *field2}, 200); - }, - .selected_ids = {1}, + .create_schema = []() { return TestSchemaFactory::AddressSchema(200); }, + .selected_ids = {11}, .expected_schema = []() { - auto street_field = std::make_unique( - 1, "street", iceberg::string(), true); + auto street_field = TestFieldFactory::Street(11, true); auto address_type = std::make_shared( std::vector{*street_field}); auto address_field = std::make_unique( - 4, "address", address_type, true); + 14, "address", address_type, true); return std::make_shared( std::vector{*address_field}, 200); }, @@ -1078,48 +1016,21 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values( ProjectTestParam{ .test_name = "ProjectTopLevelAndNestedFields", - .create_schema = - []() { - // {id: int, user: {name: string, address: {street: string, city: - // string}}} - auto street_field = std::make_unique( - 1, "street", iceberg::string(), true); - auto city_field = std::make_unique( - 2, "city", iceberg::string(), true); - auto address_type = std::make_shared( - std::vector{*street_field, *city_field}); - auto address_field = std::make_unique( - 3, "address", address_type, true); - - auto name_field = std::make_unique( - 4, "name", iceberg::string(), true); - auto user_type = std::make_shared( - std::vector{*name_field, *address_field}); - auto user_field = - std::make_unique(5, "user", user_type, true); - - auto id_field = std::make_unique( - 6, "id", iceberg::int32(), false); - return std::make_shared( - std::vector{*id_field, *user_field}, 300); - }, - .selected_ids = {6, 4, 1}, + .create_schema = []() { return TestSchemaFactory::NestedUserSchema(300); }, + .selected_ids = {16, 14, 11}, .expected_schema = []() { - auto id_field = std::make_unique( - 6, "id", iceberg::int32(), false); - auto name_field = std::make_unique( - 4, "name", iceberg::string(), true); - auto street_field = std::make_unique( - 1, "street", iceberg::string(), true); + auto id_field = TestFieldFactory::Id(16, false); + auto name_field = TestFieldFactory::Name(14, true); + auto street_field = TestFieldFactory::Street(11, true); auto address_type = std::make_shared( std::vector{*street_field}); auto address_field = std::make_unique( - 3, "address", address_type, true); + 13, "address", address_type, true); auto user_type = std::make_shared( std::vector{*name_field, *address_field}); auto user_field = - std::make_unique(5, "user", user_type, true); + std::make_unique(15, "user", user_type, true); return std::make_shared( std::vector{*id_field, *user_field}, 300); }, @@ -1127,57 +1038,26 @@ INSTANTIATE_TEST_SUITE_P( ProjectTestParam{ .test_name = "ProjectNestedFieldsAtDifferentLevels", - .create_schema = - []() { - // {id: int, user: {profile: {name: string, age: int}, settings: {theme: - // string}}} - auto name_field = std::make_unique( - 1, "name", iceberg::string(), true); - auto age_field = std::make_unique( - 2, "age", iceberg::int32(), true); - auto profile_type = std::make_shared( - std::vector{*name_field, *age_field}); - auto profile_field = std::make_unique( - 3, "profile", profile_type, true); - - auto theme_field = std::make_unique( - 4, "theme", iceberg::string(), true); - auto settings_type = std::make_shared( - std::vector{*theme_field}); - auto settings_field = std::make_unique( - 5, "settings", settings_type, true); - - auto user_type = std::make_shared( - std::vector{*profile_field, *settings_field}); - auto user_field = - std::make_unique(6, "user", user_type, true); - - auto id_field = std::make_unique( - 7, "id", iceberg::int32(), false); - return std::make_shared( - std::vector{*id_field, *user_field}, 400); - }, - .selected_ids = {1, 4}, + .create_schema = []() { return TestSchemaFactory::MultiLevelSchema(400); }, + .selected_ids = {21, 24}, .expected_schema = []() { - auto name_field = std::make_unique( - 1, "name", iceberg::string(), true); + auto name_field = TestFieldFactory::Name(21, true); auto profile_type = std::make_shared( std::vector{*name_field}); auto profile_field = std::make_unique( - 3, "profile", profile_type, true); + 23, "profile", profile_type, true); - auto theme_field = std::make_unique( - 4, "theme", iceberg::string(), true); + auto theme_field = TestFieldFactory::Theme(24, true); auto settings_type = std::make_shared( std::vector{*theme_field}); auto settings_field = std::make_unique( - 5, "settings", settings_type, true); + 25, "settings", settings_type, true); auto user_type = std::make_shared( std::vector{*profile_field, *settings_field}); auto user_field = - std::make_unique(6, "user", user_type, true); + std::make_unique(26, "user", user_type, true); return std::make_shared( std::vector{*user_field}, 400); }, @@ -1185,42 +1065,16 @@ INSTANTIATE_TEST_SUITE_P( ProjectTestParam{ .test_name = "ProjectListAndNestedFields", - .create_schema = - []() { - // {id: int, tags: list, user: {name: string, age: int}} - auto list_element = std::make_unique( - 1, "element", iceberg::string(), false); - auto list_type = std::make_shared(*list_element); - auto tags_field = - std::make_unique(2, "tags", list_type, true); - - auto name_field = std::make_unique( - 3, "name", iceberg::string(), true); - auto age_field = std::make_unique( - 4, "age", iceberg::int32(), true); - auto user_type = std::make_shared( - std::vector{*name_field, *age_field}); - auto user_field = - std::make_unique(5, "user", user_type, true); - - auto id_field = std::make_unique( - 6, "id", iceberg::int32(), false); - return std::make_shared( - std::vector{*id_field, *tags_field, - *user_field}, - 500); - }, - .selected_ids = {6, 3}, + .create_schema = []() { return TestSchemaFactory::ListSchema(500); }, + .selected_ids = {46, 43}, .expected_schema = []() { - auto id_field = std::make_unique( - 6, "id", iceberg::int32(), false); - auto name_field = std::make_unique( - 3, "name", iceberg::string(), true); + auto id_field = TestFieldFactory::Id(46, false); + auto name_field = TestFieldFactory::Name(43, true); auto user_type = std::make_shared( std::vector{*name_field}); auto user_field = - std::make_unique(5, "user", user_type, true); + std::make_unique(45, "user", user_type, true); return std::make_shared( std::vector{*id_field, *user_field}, 500); }, @@ -1230,21 +1084,8 @@ INSTANTIATE_TEST_SUITE_P( ProjectMapErrorTestCases, ProjectParamTest, ::testing::Values(ProjectTestParam{ .test_name = "ProjectMapWithOnlyKey", - .create_schema = - []() { - // Create a map with key and value fields - auto key_field = std::make_unique( - 1, "key", iceberg::int32(), false); - auto value_field = std::make_unique( - 2, "value", iceberg::string(), false); - auto map_type = - std::make_shared(*key_field, *value_field); - auto map_field = - std::make_unique(3, "map_field", map_type, true); - return std::make_shared( - std::vector{*map_field}, 100); - }, - .selected_ids = {1}, // Only select key field, not value field + .create_schema = []() { return TestSchemaFactory::MapSchema(100); }, + .selected_ids = {31}, // Only select key field, not value field .expected_schema = []() { return nullptr; }, .should_succeed = false, .expected_error_message = "Cannot project Map without value field"})); @@ -1255,34 +1096,18 @@ INSTANTIATE_TEST_SUITE_P( ProjectTestParam{ .test_name = "ProjectListElement", .create_schema = - []() { - // Create a list with nested struct element - auto nested_field1 = std::make_unique( - 1, "name", iceberg::string(), true); - auto nested_field2 = std::make_unique( - 2, "age", iceberg::int32(), true); - auto struct_type = std::make_shared( - std::vector{*nested_field1, *nested_field2}); - auto element_field = std::make_unique( - 3, "element", struct_type, false); - auto list_type = std::make_shared(*element_field); - auto list_field = std::make_unique( - 4, "list_field", list_type, true); - return std::make_shared( - std::vector{*list_field}, 100); - }, - .selected_ids = {1}, // Only select name field from list element + []() { return TestSchemaFactory::ListWithStructElementSchema(100); }, + .selected_ids = {51}, // Only select name field from list element .expected_schema = []() { - auto name_field = std::make_unique( - 1, "name", iceberg::string(), true); + auto name_field = TestFieldFactory::Name(51, true); auto struct_type = std::make_shared( std::vector{*name_field}); auto element_field = std::make_unique( - 3, "element", struct_type, false); + 53, "element", struct_type, false); auto list_type = std::make_shared(*element_field); auto list_field = std::make_unique( - 4, "list_field", list_type, true); + 54, "list_field", list_type, true); return std::make_shared( std::vector{*list_field}, 100); }, @@ -1290,48 +1115,24 @@ INSTANTIATE_TEST_SUITE_P( ProjectTestParam{ .test_name = "ProjectListOfMap", - .create_schema = - []() { - // Create a list of map with nested fields - auto map_key_field = std::make_unique( - 1, "key", iceberg::string(), false); - auto map_value_field1 = std::make_unique( - 2, "value_name", iceberg::string(), false); - auto map_value_field2 = std::make_unique( - 3, "value_age", iceberg::int32(), false); - auto map_value_struct = std::make_shared( - std::vector{*map_value_field1, - *map_value_field2}); - auto map_value_field = std::make_unique( - 4, "value", map_value_struct, false); - auto map_type = std::make_shared(*map_key_field, - *map_value_field); - auto list_element = std::make_unique( - 5, "element", map_type, false); - auto list_type = std::make_shared(*list_element); - auto list_field = std::make_unique( - 6, "list_field", list_type, true); - return std::make_shared( - std::vector{*list_field}, 100); - }, - .selected_ids = {1, 2}, + .create_schema = []() { return TestSchemaFactory::ListOfMapSchema(100); }, + .selected_ids = {61, 62}, .expected_schema = []() { - auto map_key_field = std::make_unique( - 1, "key", iceberg::string(), false); + auto map_key_field = TestFieldFactory::Key(61, false); auto map_value_field1 = std::make_unique( - 2, "value_name", iceberg::string(), false); + 62, "value_name", iceberg::string(), false); auto map_value_struct = std::make_shared( std::vector{*map_value_field1}); auto map_value_field = std::make_unique( - 4, "value", map_value_struct, false); + 64, "value", map_value_struct, false); auto map_type = std::make_shared(*map_key_field, *map_value_field); auto list_element = std::make_unique( - 5, "element", map_type, false); + 65, "element", map_type, false); auto list_type = std::make_shared(*list_element); auto list_field = std::make_unique( - 6, "list_field", list_type, true); + 66, "list_field", list_type, true); return std::make_shared( std::vector{*list_field}, 100); }, @@ -1339,74 +1140,40 @@ INSTANTIATE_TEST_SUITE_P( ProjectTestParam{ .test_name = "ProjectMapKeyAndValue", - .create_schema = - []() { - auto key_field1 = std::make_unique( - 1, "key_id", iceberg::int32(), false); - auto key_field2 = std::make_unique( - 2, "key_name", iceberg::string(), false); - auto key_struct = std::make_shared( - std::vector{*key_field1, *key_field2}); - auto key_field = - std::make_unique(3, "key", key_struct, false); - - auto value_field1 = std::make_unique( - 4, "value_id", iceberg::int32(), false); - auto value_field2 = std::make_unique( - 5, "value_name", iceberg::string(), false); - auto value_struct = std::make_shared( - std::vector{*value_field1, *value_field2}); - auto value_field = std::make_unique( - 6, "value", value_struct, false); - - auto map_type = - std::make_shared(*key_field, *value_field); - auto map_field = std::make_unique(7, "map_field", - map_type, true); - return std::make_shared( - std::vector{*map_field}, 100); - }, - .selected_ids = {1, 4}, + .create_schema = []() { return TestSchemaFactory::ComplexMapSchema(100); }, + .selected_ids = {71, 74}, .expected_schema = []() { auto key_field1 = std::make_unique( - 1, "key_id", iceberg::int32(), false); + 71, "key_id", iceberg::int32(), false); auto key_struct = std::make_shared( std::vector{*key_field1}); - auto key_field = - std::make_unique(3, "key", key_struct, false); + auto key_field = std::make_unique( + 73, "key", key_struct, false); auto value_field1 = std::make_unique( - 4, "value_id", iceberg::int32(), false); + 74, "value_id", iceberg::int32(), false); auto value_struct = std::make_shared( std::vector{*value_field1}); auto value_field = std::make_unique( - 6, "value", value_struct, false); + 76, "value", value_struct, false); auto map_type = std::make_shared(*key_field, *value_field); - auto map_field = std::make_unique(7, "map_field", + auto map_field = std::make_unique(77, "map_field", map_type, true); return std::make_shared( std::vector{*map_field}, 100); }, .should_succeed = true}, - ProjectTestParam{.test_name = "ProjectEmptyResult", - .create_schema = - []() { - auto field1 = std::make_unique( - 1, "id", iceberg::int32(), false); - auto field2 = std::make_unique( - 2, "name", iceberg::string(), true); - return std::make_shared( - std::vector{*field1, *field2}, - 100); - }, - .selected_ids = {999}, // Select non-existent field - .expected_schema = - []() { - return std::make_shared( - std::vector{}, 100); - }, - .should_succeed = true})); + ProjectTestParam{ + .test_name = "ProjectEmptyResult", + .create_schema = []() { return TestSchemaFactory::BasicSchema(100); }, + .selected_ids = {999}, // Select non-existent field + .expected_schema = + []() { + return std::make_shared( + std::vector{}, 100); + }, + .should_succeed = true})); From 388c8b124128e161c17f190b2fcecdcb8cb163ee Mon Sep 17 00:00:00 2001 From: nullccxsy <32149055912@qq.com> Date: Fri, 12 Sep 2025 16:23:55 +0800 Subject: [PATCH 08/10] fix review --- src/iceberg/schema.cc | 30 +-- test/schema_test.cc | 598 ++++++++++++++---------------------------- 2 files changed, 212 insertions(+), 416 deletions(-) diff --git a/src/iceberg/schema.cc b/src/iceberg/schema.cc index b099c3722..2a62023b5 100644 --- a/src/iceberg/schema.cc +++ b/src/iceberg/schema.cc @@ -261,7 +261,7 @@ void NameToIdVisitor::Finish() { } } -/// \brief Visitor class for pruning schema columns based on selected field IDs. +/// \brief Visitor for pruning columns based on selected field IDs. /// /// This visitor traverses a schema and creates a projected version containing only /// the specified fields. When `select_full_types` is true, a field with all its @@ -314,7 +314,7 @@ class PruneColumnVisitor { if (selected_fields.empty()) { return nullptr; - } else if (same_types and selected_fields.size() == type->fields().size()) { + } else if (same_types && selected_fields.size() == type->fields().size()) { return type; } return std::make_shared(std::move(selected_fields)); @@ -372,38 +372,36 @@ Result> Schema::Select(std::span name PruneColumnVisitor visitor(selected_ids, /*select_full_types=*/true); ICEBERG_ASSIGN_OR_RAISE( - auto result, visitor.Visit(std::shared_ptr(ToStructType(*this)))); + auto pruned_type, visitor.Visit(std::shared_ptr(ToStructType(*this)))); - if (!result) { - return std::make_unique(std::vector{}, schema_id_); + if (!pruned_type) { + return std::make_unique(std::vector{}, std::nullopt); } - if (result->type_id() != TypeId::kStruct) { + if (pruned_type->type_id() != TypeId::kStruct) { return InvalidSchema("Projected type must be a struct type"); } - return FromStructType(std::move(const_cast( - internal::checked_cast(*result))), - schema_id_); + return FromStructType(std::move(internal::checked_cast(*pruned_type)), + std::nullopt); } Result> Schema::Project( const std::unordered_set& field_ids) const { PruneColumnVisitor visitor(field_ids, /*select_full_types=*/false); ICEBERG_ASSIGN_OR_RAISE( - auto result, visitor.Visit(std::shared_ptr(ToStructType(*this)))); + auto project_type, visitor.Visit(std::shared_ptr(ToStructType(*this)))); - if (!result) { - return std::make_unique(std::vector{}, schema_id_); + if (!project_type) { + return std::make_unique(std::vector{}, std::nullopt); } - if (result->type_id() != TypeId::kStruct) { + if (project_type->type_id() != TypeId::kStruct) { return InvalidSchema("Projected type must be a struct type"); } - return FromStructType(std::move(const_cast( - internal::checked_cast(*result))), - schema_id_); + return FromStructType(std::move(internal::checked_cast(*project_type)), + std::nullopt); } } // namespace iceberg diff --git a/test/schema_test.cc b/test/schema_test.cc index d14226305..2872b6880 100644 --- a/test/schema_test.cc +++ b/test/schema_test.cc @@ -31,6 +31,18 @@ #include "iceberg/util/formatter.h" // IWYU pragma: keep #include "matchers.h" +template +std::shared_ptr MakeStructType(Args... args) { + return std::make_shared( + std::vector{args...}); +} + +template +std::shared_ptr MakeSchema(Args... args) { + return std::make_shared(std::vector{args...}, + std::nullopt); +} + TEST(SchemaTest, Basics) { { iceberg::SchemaField field1(5, "foo", iceberg::int32(), true); @@ -495,225 +507,106 @@ TEST(SchemaTest, NestedDuplicateFieldIdError) { namespace { -struct TestFieldFactory { - static std::unique_ptr Id(int32_t field_id = 1, - bool optional = false) { - return std::make_unique(field_id, "id", iceberg::int32(), - optional); - } - - static std::unique_ptr Name(int32_t field_id = 2, - bool optional = true) { - return std::make_unique(field_id, "name", iceberg::string(), - optional); - } - - static std::unique_ptr Age(int32_t field_id = 3, - bool optional = true) { - return std::make_unique(field_id, "age", iceberg::int32(), - optional); - } - - static std::unique_ptr Email(int32_t field_id = 4, - bool optional = true) { - return std::make_unique(field_id, "email", iceberg::string(), - optional); - } - - static std::unique_ptr Street(int32_t field_id = 11, - bool optional = true) { - return std::make_unique(field_id, "street", iceberg::string(), - optional); - } - - static std::unique_ptr City(int32_t field_id = 12, - bool optional = true) { - return std::make_unique(field_id, "city", iceberg::string(), - optional); - } - - static std::unique_ptr Zip(int32_t field_id = 13, - bool optional = true) { - return std::make_unique(field_id, "zip", iceberg::int32(), - optional); - } - - static std::unique_ptr Theme(int32_t field_id = 21, - bool optional = true) { - return std::make_unique(field_id, "theme", iceberg::string(), - optional); - } - - static std::unique_ptr Key(int32_t field_id = 31, - bool optional = false) { - return std::make_unique(field_id, "key", iceberg::int32(), - optional); - } - - static std::unique_ptr Value(int32_t field_id = 32, - bool optional = false) { - return std::make_unique(field_id, "value", iceberg::string(), - optional); - } - - static std::unique_ptr Element(int32_t field_id = 41, - bool optional = false) { - return std::make_unique(field_id, "element", iceberg::string(), - optional); - } -}; +namespace TestFields { +iceberg::SchemaField Id() { return {1, "id", iceberg::int32(), true}; } +iceberg::SchemaField Name() { return {2, "name", iceberg::string(), false}; } +iceberg::SchemaField Age() { return {3, "age", iceberg::int32(), true}; } +iceberg::SchemaField Email() { return {4, "email", iceberg::string(), true}; } +iceberg::SchemaField Street() { return {11, "street", iceberg::string(), true}; } +iceberg::SchemaField City() { return {12, "city", iceberg::string(), true}; } +iceberg::SchemaField Zip() { return {13, "zip", iceberg::int32(), true}; } +iceberg::SchemaField Theme() { return {24, "theme", iceberg::string(), true}; } +iceberg::SchemaField Key() { return {31, "key", iceberg::int32(), false}; } +iceberg::SchemaField Value() { return {32, "value", iceberg::string(), false}; } +iceberg::SchemaField Element() { return {41, "element", iceberg::string(), false}; } + +} // namespace TestFields struct TestSchemaFactory { static std::shared_ptr BasicSchema(int32_t schema_id = 100) { - auto field1 = TestFieldFactory::Id(1, false); - auto field2 = TestFieldFactory::Name(2, true); - auto field3 = TestFieldFactory::Age(3, true); - auto field4 = TestFieldFactory::Email(4, true); - return std::make_shared( - std::vector{*field1, *field2, *field3, *field4}, schema_id); + return MakeSchema(TestFields::Id(), TestFields::Name(), TestFields::Age(), + TestFields::Email()); } static std::shared_ptr AddressSchema(int32_t schema_id = 200) { - auto street = TestFieldFactory::Street(11, true); - auto city = TestFieldFactory::City(12, true); - auto zip = TestFieldFactory::Zip(13, true); - auto address_type = std::make_shared( - std::vector{*street, *city, *zip}); - auto address_field = - std::make_unique(14, "address", address_type, true); - auto id_field = TestFieldFactory::Id(15, false); - auto name_field = TestFieldFactory::Name(16, true); - return std::make_shared( - std::vector{*id_field, *name_field, *address_field}, - schema_id); + auto address_type = + MakeStructType(TestFields::Street(), TestFields::City(), TestFields::Zip()); + auto address_field = iceberg::SchemaField{14, "address", address_type, true}; + return MakeSchema(TestFields::Id(), TestFields::Name(), address_field); } static std::shared_ptr NestedUserSchema(int32_t schema_id = 300) { - auto street = TestFieldFactory::Street(11, true); - auto city = TestFieldFactory::City(12, true); - auto address_type = std::make_shared( - std::vector{*street, *city}); - auto address_field = - std::make_unique(13, "address", address_type, true); - - auto name_field = TestFieldFactory::Name(14, true); - auto user_type = std::make_shared( - std::vector{*name_field, *address_field}); - auto user_field = std::make_unique(15, "user", user_type, true); - - auto id_field = TestFieldFactory::Id(16, false); - return std::make_shared( - std::vector{*id_field, *user_field}, schema_id); + auto address_type = MakeStructType(TestFields::Street(), TestFields::City()); + auto address_field = iceberg::SchemaField{16, "address", address_type, true}; + auto user_type = MakeStructType(TestFields::Name(), address_field); + auto user_field = iceberg::SchemaField{17, "user", user_type, true}; + return MakeSchema(TestFields::Id(), user_field); } static std::shared_ptr MultiLevelSchema(int32_t schema_id = 400) { - auto name_field = TestFieldFactory::Name(21, true); - auto age_field = TestFieldFactory::Age(22, true); - auto profile_type = std::make_shared( - std::vector{*name_field, *age_field}); - auto profile_field = - std::make_unique(23, "profile", profile_type, true); - - auto theme_field = TestFieldFactory::Theme(24, true); - auto settings_type = std::make_shared( - std::vector{*theme_field}); - auto settings_field = - std::make_unique(25, "settings", settings_type, true); - - auto user_type = std::make_shared( - std::vector{*profile_field, *settings_field}); - auto user_field = std::make_unique(26, "user", user_type, true); - - auto id_field = TestFieldFactory::Id(27, false); - return std::make_shared( - std::vector{*id_field, *user_field}, schema_id); + auto profile_type = MakeStructType(TestFields::Name(), TestFields::Age()); + auto profile_field = iceberg::SchemaField{23, "profile", profile_type, true}; + + auto settings_type = MakeStructType(TestFields::Theme()); + auto settings_field = iceberg::SchemaField{25, "settings", settings_type, true}; + + auto user_type = MakeStructType(profile_field, settings_field); + auto user_field = iceberg::SchemaField{26, "user", user_type, true}; + + return MakeSchema(TestFields::Id(), user_field); } static std::shared_ptr ListSchema(int32_t schema_id = 500) { - auto list_element = TestFieldFactory::Element(41, false); - auto list_type = std::make_shared(*list_element); - auto tags_field = std::make_unique(42, "tags", list_type, true); - - auto name_field = TestFieldFactory::Name(43, true); - auto age_field = TestFieldFactory::Age(44, true); - auto user_type = std::make_shared( - std::vector{*name_field, *age_field}); - auto user_field = std::make_unique(45, "user", user_type, true); - - auto id_field = TestFieldFactory::Id(46, false); - return std::make_shared( - std::vector{*id_field, *tags_field, *user_field}, - schema_id); + auto list_type = std::make_shared(TestFields::Element()); + auto tags_field = iceberg::SchemaField{42, "tags", list_type, true}; + + auto user_type = MakeStructType(TestFields::Name(), TestFields::Age()); + auto user_field = iceberg::SchemaField{45, "user", user_type, true}; + + return MakeSchema(TestFields::Id(), tags_field, user_field); } - static std::shared_ptr MapSchema(int32_t schema_id = 100) { - auto key_field = TestFieldFactory::Key(31, false); - auto value_field = TestFieldFactory::Value(32, false); - auto map_type = std::make_shared(*key_field, *value_field); - auto map_field = - std::make_unique(33, "map_field", map_type, true); - return std::make_shared( - std::vector{*map_field}, schema_id); + static std::shared_ptr MapSchema(int32_t schema_id = 600) { + auto map_type = + std::make_shared(TestFields::Key(), TestFields::Value()); + auto map_field = iceberg::SchemaField{33, "map_field", map_type, true}; + return MakeSchema(map_field); } static std::shared_ptr ListWithStructElementSchema( - int32_t schema_id = 100) { - auto name_field = TestFieldFactory::Name(51, true); - auto age_field = TestFieldFactory::Age(52, true); - auto struct_type = std::make_shared( - std::vector{*name_field, *age_field}); - auto element_field = - std::make_unique(53, "element", struct_type, false); - auto list_type = std::make_shared(*element_field); - auto list_field = - std::make_unique(54, "list_field", list_type, true); - return std::make_shared( - std::vector{*list_field}, schema_id); + int32_t schema_id = 700) { + auto struct_type = MakeStructType(TestFields::Name(), TestFields::Age()); + auto element_field = iceberg::SchemaField{53, "element", struct_type, false}; + auto list_type = std::make_shared(element_field); + auto list_field = iceberg::SchemaField{54, "list_field", list_type, true}; + return MakeSchema(list_field); } - static std::shared_ptr ListOfMapSchema(int32_t schema_id = 100) { - auto map_key_field = TestFieldFactory::Key(61, false); - auto map_value_field1 = std::make_unique( - 62, "value_name", iceberg::string(), false); - auto map_value_field2 = - std::make_unique(63, "value_age", iceberg::int32(), false); - auto map_value_struct = std::make_shared( - std::vector{*map_value_field1, *map_value_field2}); - auto map_value_field = - std::make_unique(64, "value", map_value_struct, false); - auto map_type = std::make_shared(*map_key_field, *map_value_field); - auto list_element = - std::make_unique(65, "element", map_type, false); - auto list_type = std::make_shared(*list_element); - auto list_field = - std::make_unique(66, "list_field", list_type, true); - return std::make_shared( - std::vector{*list_field}, schema_id); + static std::shared_ptr ListOfMapSchema(int32_t schema_id = 800) { + auto map_value_struct = MakeStructType(TestFields::Name(), TestFields::Age()); + auto map_value_field = iceberg::SchemaField{64, "value", map_value_struct, false}; + auto map_type = + std::make_shared(TestFields::Key(), map_value_field); + auto list_element = iceberg::SchemaField{65, "element", map_type, false}; + auto list_type = std::make_shared(list_element); + auto list_field = iceberg::SchemaField{66, "list_field", list_type, true}; + return MakeSchema(list_field); } - static std::shared_ptr ComplexMapSchema(int32_t schema_id = 100) { - auto key_field1 = - std::make_unique(71, "key_id", iceberg::int32(), false); - auto key_field2 = - std::make_unique(72, "key_name", iceberg::string(), false); - auto key_struct = std::make_shared( - std::vector{*key_field1, *key_field2}); - auto key_field = std::make_unique(73, "key", key_struct, false); - - auto value_field1 = - std::make_unique(74, "value_id", iceberg::int32(), false); - auto value_field2 = std::make_unique(75, "value_name", - iceberg::string(), false); - auto value_struct = std::make_shared( - std::vector{*value_field1, *value_field2}); - auto value_field = - std::make_unique(76, "value", value_struct, false); - - auto map_type = std::make_shared(*key_field, *value_field); - auto map_field = - std::make_unique(77, "map_field", map_type, true); - return std::make_shared( - std::vector{*map_field}, schema_id); + static std::shared_ptr ComplexMapSchema(int32_t schema_id = 900) { + auto key_id_field = iceberg::SchemaField{71, "id", iceberg::int32(), false}; + auto key_name_field = iceberg::SchemaField{72, "name", iceberg::string(), false}; + auto key_struct = MakeStructType(key_id_field, key_name_field); + auto key_field = iceberg::SchemaField{73, "key", key_struct, false}; + + auto value_id_field = iceberg::SchemaField{74, "id", iceberg::int32(), false}; + auto value_name_field = iceberg::SchemaField{75, "name", iceberg::string(), false}; + auto value_struct = MakeStructType(value_id_field, value_name_field); + auto value_field = iceberg::SchemaField{76, "value", value_struct, false}; + + auto map_type = std::make_shared(key_field, value_field); + auto map_field = iceberg::SchemaField{77, "map_field", map_type, true}; + return MakeSchema(map_field); } }; @@ -731,6 +624,14 @@ struct SelectTestParam { class SelectParamTest : public ::testing::TestWithParam {}; +void CompareSchema(std::shared_ptr actual_schema, + std::shared_ptr expected_schema) { + ASSERT_EQ(actual_schema->fields().size(), expected_schema->fields().size()); + for (int i = 0; i < actual_schema->fields().size(); i++) { + ASSERT_EQ(actual_schema->fields()[i], expected_schema->fields()[i]); + } +} + TEST_P(SelectParamTest, SelectFields) { const auto& param = GetParam(); auto input_schema = param.create_schema(); @@ -738,9 +639,7 @@ TEST_P(SelectParamTest, SelectFields) { if (param.should_succeed) { ASSERT_TRUE(result.has_value()); - auto actual_schema = std::move(result.value()); - auto expected_schema = param.expected_schema(); - ASSERT_EQ(*actual_schema, *expected_schema); + CompareSchema(std::move(result.value()), param.expected_schema()); } else { ASSERT_FALSE(result.has_value()); ASSERT_THAT(result, iceberg::IsError(iceberg::ErrorKind::kInvalidArgument)); @@ -762,12 +661,7 @@ INSTANTIATE_TEST_SUITE_P( .test_name = "SelectSingleField", .create_schema = []() { return TestSchemaFactory::BasicSchema(100); }, .select_fields = {"name"}, - .expected_schema = - []() { - auto field = TestFieldFactory::Name(2, true); - return std::make_shared( - std::vector{*field}, 100); - }, + .expected_schema = []() { return MakeSchema(TestFields::Name()); }, .should_succeed = true}, SelectTestParam{ @@ -776,11 +670,8 @@ INSTANTIATE_TEST_SUITE_P( .select_fields = {"id", "name", "age"}, .expected_schema = []() { - auto field1 = TestFieldFactory::Id(1, false); - auto field2 = TestFieldFactory::Name(2, true); - auto field3 = TestFieldFactory::Age(3, true); - return std::make_shared( - std::vector{*field1, *field2, *field3}, 100); + return MakeSchema(TestFields::Id(), TestFields::Name(), + TestFields::Age()); }, .should_succeed = true}, @@ -788,34 +679,21 @@ INSTANTIATE_TEST_SUITE_P( .test_name = "SelectNonExistentField", .create_schema = []() { return TestSchemaFactory::BasicSchema(100); }, .select_fields = {"nonexistent"}, - .expected_schema = - []() { - return std::make_shared( - std::vector{}, 100); - }, + .expected_schema = []() { return MakeSchema(); }, .should_succeed = true}, SelectTestParam{ .test_name = "SelectCaseSensitive", .create_schema = []() { return TestSchemaFactory::BasicSchema(100); }, .select_fields = {"Name"}, // case-sensitive - .expected_schema = - []() { - return std::make_shared( - std::vector{}, 100); - }, + .expected_schema = []() { return MakeSchema(); }, .should_succeed = true}, SelectTestParam{ .test_name = "SelectCaseInsensitive", .create_schema = []() { return TestSchemaFactory::BasicSchema(100); }, .select_fields = {"Name"}, // case-insensitive - .expected_schema = - []() { - auto field = TestFieldFactory::Name(2, true); - return std::make_shared( - std::vector{*field}, 100); - }, + .expected_schema = []() { return MakeSchema(TestFields::Name()); }, .should_succeed = true, .case_sensitive = false})); @@ -827,12 +705,7 @@ INSTANTIATE_TEST_SUITE_P( .create_schema = []() { return TestSchemaFactory::AddressSchema(200); }, .select_fields = {"id", "name"}, .expected_schema = - []() { - auto field1 = TestFieldFactory::Id(15, false); - auto field2 = TestFieldFactory::Name(16, true); - return std::make_shared( - std::vector{*field1, *field2}, 200); - }, + []() { return MakeSchema(TestFields::Id(), TestFields::Name()); }, .should_succeed = true}, SelectTestParam{ @@ -841,13 +714,10 @@ INSTANTIATE_TEST_SUITE_P( .select_fields = {"address.street"}, .expected_schema = []() { - auto street_field = TestFieldFactory::Street(11, true); - auto address_type = std::make_shared( - std::vector{*street_field}); - auto address_field = std::make_unique( - 14, "address", address_type, true); - return std::make_shared( - std::vector{*address_field}, 200); + auto address_type = MakeStructType(TestFields::Street()); + auto address_field = + iceberg::SchemaField{14, "address", address_type, true}; + return MakeSchema(address_field); }, .should_succeed = true})); @@ -860,19 +730,12 @@ INSTANTIATE_TEST_SUITE_P( .select_fields = {"id", "user.name", "user.address.street"}, .expected_schema = []() { - auto id_field = TestFieldFactory::Id(16, false); - auto name_field = TestFieldFactory::Name(14, true); - auto street_field = TestFieldFactory::Street(11, true); - auto address_type = std::make_shared( - std::vector{*street_field}); - auto address_field = std::make_unique( - 13, "address", address_type, true); - auto user_type = std::make_shared( - std::vector{*name_field, *address_field}); - auto user_field = - std::make_unique(15, "user", user_type, true); - return std::make_shared( - std::vector{*id_field, *user_field}, 300); + auto address_type = MakeStructType(TestFields::Street()); + auto address_field = + iceberg::SchemaField{16, "address", address_type, true}; + auto user_type = MakeStructType(TestFields::Name(), address_field); + auto user_field = iceberg::SchemaField{17, "user", user_type, true}; + return MakeSchema(TestFields::Id(), user_field); }, .should_succeed = true}, @@ -882,24 +745,17 @@ INSTANTIATE_TEST_SUITE_P( .select_fields = {"user.profile.name", "user.settings.theme"}, .expected_schema = []() { - auto name_field = TestFieldFactory::Name(21, true); - auto profile_type = std::make_shared( - std::vector{*name_field}); - auto profile_field = std::make_unique( - 23, "profile", profile_type, true); - - auto theme_field = TestFieldFactory::Theme(24, true); - auto settings_type = std::make_shared( - std::vector{*theme_field}); - auto settings_field = std::make_unique( - 25, "settings", settings_type, true); - - auto user_type = std::make_shared( - std::vector{*profile_field, *settings_field}); - auto user_field = - std::make_unique(26, "user", user_type, true); - return std::make_shared( - std::vector{*user_field}, 400); + auto profile_type = MakeStructType(TestFields::Name()); + auto profile_field = + iceberg::SchemaField{23, "profile", profile_type, true}; + + auto settings_type = MakeStructType(TestFields::Theme()); + auto settings_field = + iceberg::SchemaField{25, "settings", settings_type, true}; + + auto user_type = MakeStructType(profile_field, settings_field); + auto user_field = iceberg::SchemaField{26, "user", user_type, true}; + return MakeSchema(user_field); }, .should_succeed = true}, @@ -909,14 +765,9 @@ INSTANTIATE_TEST_SUITE_P( .select_fields = {"id", "user.name"}, .expected_schema = []() { - auto id_field = TestFieldFactory::Id(46, false); - auto name_field = TestFieldFactory::Name(43, true); - auto user_type = std::make_shared( - std::vector{*name_field}); - auto user_field = - std::make_unique(45, "user", user_type, true); - return std::make_shared( - std::vector{*id_field, *user_field}, 500); + auto user_type = MakeStructType(TestFields::Name()); + auto user_field = iceberg::SchemaField{45, "user", user_type, true}; + return MakeSchema(TestFields::Id(), user_field); }, .should_succeed = true})); @@ -939,9 +790,7 @@ TEST_P(ProjectParamTest, ProjectFields) { if (param.should_succeed) { ASSERT_TRUE(result.has_value()); - auto actual_schema = std::move(result.value()); - auto expected_schema = param.expected_schema(); - ASSERT_EQ(*actual_schema, *expected_schema); + CompareSchema(std::move(result.value()), param.expected_schema()); } else { ASSERT_FALSE(result.has_value()); ASSERT_THAT(result, iceberg::IsError(iceberg::ErrorKind::kInvalidArgument)); @@ -963,34 +812,21 @@ INSTANTIATE_TEST_SUITE_P( .test_name = "ProjectSingleField", .create_schema = []() { return TestSchemaFactory::BasicSchema(100); }, .selected_ids = {2}, - .expected_schema = - []() { - auto field = TestFieldFactory::Name(2, true); - return std::make_shared( - std::vector{*field}, 100); - }, + .expected_schema = []() { return MakeSchema(TestFields::Name()); }, .should_succeed = true}, ProjectTestParam{ .test_name = "ProjectNonExistentFieldId", .create_schema = []() { return TestSchemaFactory::BasicSchema(100); }, .selected_ids = {999}, - .expected_schema = - []() { - return std::make_shared( - std::vector{}, 100); - }, + .expected_schema = []() { return MakeSchema(); }, .should_succeed = true}, ProjectTestParam{ .test_name = "ProjectEmptySelection", .create_schema = []() { return TestSchemaFactory::BasicSchema(100); }, .selected_ids = {}, - .expected_schema = - []() { - return std::make_shared( - std::vector{}, 100); - }, + .expected_schema = []() { return MakeSchema(); }, .should_succeed = true})); INSTANTIATE_TEST_SUITE_P( @@ -1001,13 +837,10 @@ INSTANTIATE_TEST_SUITE_P( .selected_ids = {11}, .expected_schema = []() { - auto street_field = TestFieldFactory::Street(11, true); - auto address_type = std::make_shared( - std::vector{*street_field}); - auto address_field = std::make_unique( - 14, "address", address_type, true); - return std::make_shared( - std::vector{*address_field}, 200); + auto address_type = MakeStructType(TestFields::Street()); + auto address_field = + iceberg::SchemaField{14, "address", address_type, true}; + return MakeSchema(address_field); }, .should_succeed = true})); @@ -1017,66 +850,47 @@ INSTANTIATE_TEST_SUITE_P( ProjectTestParam{ .test_name = "ProjectTopLevelAndNestedFields", .create_schema = []() { return TestSchemaFactory::NestedUserSchema(300); }, - .selected_ids = {16, 14, 11}, + .selected_ids = {1, 2, 11}, .expected_schema = []() { - auto id_field = TestFieldFactory::Id(16, false); - auto name_field = TestFieldFactory::Name(14, true); - auto street_field = TestFieldFactory::Street(11, true); - auto address_type = std::make_shared( - std::vector{*street_field}); - auto address_field = std::make_unique( - 13, "address", address_type, true); - auto user_type = std::make_shared( - std::vector{*name_field, *address_field}); - auto user_field = - std::make_unique(15, "user", user_type, true); - return std::make_shared( - std::vector{*id_field, *user_field}, 300); + auto address_type = MakeStructType(TestFields::Street()); + auto address_field = + iceberg::SchemaField{16, "address", address_type, true}; + auto user_type = MakeStructType(TestFields::Name(), address_field); + auto user_field = iceberg::SchemaField{17, "user", user_type, true}; + return MakeSchema(TestFields::Id(), user_field); }, .should_succeed = true}, ProjectTestParam{ .test_name = "ProjectNestedFieldsAtDifferentLevels", .create_schema = []() { return TestSchemaFactory::MultiLevelSchema(400); }, - .selected_ids = {21, 24}, + .selected_ids = {2, 24}, .expected_schema = []() { - auto name_field = TestFieldFactory::Name(21, true); - auto profile_type = std::make_shared( - std::vector{*name_field}); - auto profile_field = std::make_unique( - 23, "profile", profile_type, true); - - auto theme_field = TestFieldFactory::Theme(24, true); - auto settings_type = std::make_shared( - std::vector{*theme_field}); - auto settings_field = std::make_unique( - 25, "settings", settings_type, true); - - auto user_type = std::make_shared( - std::vector{*profile_field, *settings_field}); - auto user_field = - std::make_unique(26, "user", user_type, true); - return std::make_shared( - std::vector{*user_field}, 400); + auto profile_type = MakeStructType(TestFields::Name()); + auto profile_field = + iceberg::SchemaField{23, "profile", profile_type, true}; + + auto settings_type = MakeStructType(TestFields::Theme()); + auto settings_field = + iceberg::SchemaField{25, "settings", settings_type, true}; + + auto user_type = MakeStructType(profile_field, settings_field); + auto user_field = iceberg::SchemaField{26, "user", user_type, true}; + return MakeSchema(user_field); }, .should_succeed = true}, ProjectTestParam{ .test_name = "ProjectListAndNestedFields", .create_schema = []() { return TestSchemaFactory::ListSchema(500); }, - .selected_ids = {46, 43}, + .selected_ids = {1, 2}, .expected_schema = []() { - auto id_field = TestFieldFactory::Id(46, false); - auto name_field = TestFieldFactory::Name(43, true); - auto user_type = std::make_shared( - std::vector{*name_field}); - auto user_field = - std::make_unique(45, "user", user_type, true); - return std::make_shared( - std::vector{*id_field, *user_field}, 500); + auto user_type = MakeStructType(TestFields::Name()); + auto user_field = iceberg::SchemaField{45, "user", user_type, true}; + return MakeSchema(TestFields::Id(), user_field); }, .should_succeed = true})); @@ -1084,7 +898,7 @@ INSTANTIATE_TEST_SUITE_P( ProjectMapErrorTestCases, ProjectParamTest, ::testing::Values(ProjectTestParam{ .test_name = "ProjectMapWithOnlyKey", - .create_schema = []() { return TestSchemaFactory::MapSchema(100); }, + .create_schema = []() { return TestSchemaFactory::MapSchema(600); }, .selected_ids = {31}, // Only select key field, not value field .expected_schema = []() { return nullptr; }, .should_succeed = false, @@ -1096,74 +910,62 @@ INSTANTIATE_TEST_SUITE_P( ProjectTestParam{ .test_name = "ProjectListElement", .create_schema = - []() { return TestSchemaFactory::ListWithStructElementSchema(100); }, - .selected_ids = {51}, // Only select name field from list element + []() { return TestSchemaFactory::ListWithStructElementSchema(700); }, + .selected_ids = {2}, // Only select name field from list element .expected_schema = []() { - auto name_field = TestFieldFactory::Name(51, true); - auto struct_type = std::make_shared( - std::vector{*name_field}); - auto element_field = std::make_unique( - 53, "element", struct_type, false); - auto list_type = std::make_shared(*element_field); - auto list_field = std::make_unique( - 54, "list_field", list_type, true); - return std::make_shared( - std::vector{*list_field}, 100); + auto struct_type = MakeStructType(TestFields::Name()); + auto element_field = + iceberg::SchemaField{53, "element", struct_type, false}; + auto list_type = std::make_shared(element_field); + auto list_field = + iceberg::SchemaField{54, "list_field", list_type, true}; + return MakeSchema(list_field); }, .should_succeed = true}, ProjectTestParam{ .test_name = "ProjectListOfMap", - .create_schema = []() { return TestSchemaFactory::ListOfMapSchema(100); }, - .selected_ids = {61, 62}, + .create_schema = []() { return TestSchemaFactory::ListOfMapSchema(800); }, + .selected_ids = {2, 3}, .expected_schema = []() { - auto map_key_field = TestFieldFactory::Key(61, false); - auto map_value_field1 = std::make_unique( - 62, "value_name", iceberg::string(), false); - auto map_value_struct = std::make_shared( - std::vector{*map_value_field1}); - auto map_value_field = std::make_unique( - 64, "value", map_value_struct, false); - auto map_type = std::make_shared(*map_key_field, - *map_value_field); - auto list_element = std::make_unique( - 65, "element", map_type, false); - auto list_type = std::make_shared(*list_element); - auto list_field = std::make_unique( - 66, "list_field", list_type, true); - return std::make_shared( - std::vector{*list_field}, 100); + auto map_value_struct = + MakeStructType(TestFields::Name(), TestFields::Age()); + auto map_value_field = + iceberg::SchemaField{64, "value", map_value_struct, false}; + auto map_type = std::make_shared(TestFields::Key(), + map_value_field); + auto list_element = + iceberg::SchemaField{65, "element", map_type, false}; + auto list_type = std::make_shared(list_element); + auto list_field = + iceberg::SchemaField{66, "list_field", list_type, true}; + return MakeSchema(list_field); }, .should_succeed = true}, ProjectTestParam{ .test_name = "ProjectMapKeyAndValue", - .create_schema = []() { return TestSchemaFactory::ComplexMapSchema(100); }, + .create_schema = []() { return TestSchemaFactory::ComplexMapSchema(900); }, .selected_ids = {71, 74}, .expected_schema = []() { - auto key_field1 = std::make_unique( - 71, "key_id", iceberg::int32(), false); - auto key_struct = std::make_shared( - std::vector{*key_field1}); - auto key_field = std::make_unique( - 73, "key", key_struct, false); - - auto value_field1 = std::make_unique( - 74, "value_id", iceberg::int32(), false); - auto value_struct = std::make_shared( - std::vector{*value_field1}); - auto value_field = std::make_unique( - 76, "value", value_struct, false); + auto key_id_field = + iceberg::SchemaField{71, "id", iceberg::int32(), false}; + auto key_struct = MakeStructType(key_id_field); + auto key_field = iceberg::SchemaField{73, "key", key_struct, false}; + + auto value_id_field = + iceberg::SchemaField{74, "id", iceberg::int32(), false}; + auto value_struct = MakeStructType(value_id_field); + auto value_field = + iceberg::SchemaField{76, "value", value_struct, false}; auto map_type = - std::make_shared(*key_field, *value_field); - auto map_field = std::make_unique(77, "map_field", - map_type, true); - return std::make_shared( - std::vector{*map_field}, 100); + std::make_shared(key_field, value_field); + auto map_field = iceberg::SchemaField{77, "map_field", map_type, true}; + return MakeSchema(map_field); }, .should_succeed = true}, @@ -1171,9 +973,5 @@ INSTANTIATE_TEST_SUITE_P( .test_name = "ProjectEmptyResult", .create_schema = []() { return TestSchemaFactory::BasicSchema(100); }, .selected_ids = {999}, // Select non-existent field - .expected_schema = - []() { - return std::make_shared( - std::vector{}, 100); - }, + .expected_schema = []() { return MakeSchema(); }, .should_succeed = true})); From 9163c2f707dc24978cde5ec6de704766e71b0a45 Mon Sep 17 00:00:00 2001 From: nullccxsy <32149055912@qq.com> Date: Mon, 15 Sep 2025 13:23:17 +0800 Subject: [PATCH 09/10] fix comments --- src/iceberg/schema.cc | 3 +- test/schema_test.cc | 649 ++++++++++++++++++++---------------------- 2 files changed, 310 insertions(+), 342 deletions(-) diff --git a/src/iceberg/schema.cc b/src/iceberg/schema.cc index 2a62023b5..2ab2f7e70 100644 --- a/src/iceberg/schema.cc +++ b/src/iceberg/schema.cc @@ -359,7 +359,8 @@ Result> Schema::Select(std::span name bool case_sensitive) const { const std::string kAllColumns = "*"; if (std::ranges::find(names, kAllColumns) != names.end()) { - return std::make_unique(*this); + auto struct_type = ToStructType(*this); + return FromStructType(std::move(*struct_type), std::nullopt); } std::unordered_set selected_ids; diff --git a/test/schema_test.cc b/test/schema_test.cc index 2872b6880..54b9a96e2 100644 --- a/test/schema_test.cc +++ b/test/schema_test.cc @@ -32,15 +32,15 @@ #include "matchers.h" template -std::shared_ptr MakeStructType(Args... args) { +std::shared_ptr MakeStructType(Args&&... args) { return std::make_shared( - std::vector{args...}); + std::vector{std::move(args)...}); } template -std::shared_ptr MakeSchema(Args... args) { - return std::make_shared(std::vector{args...}, - std::nullopt); +std::unique_ptr MakeSchema(Args&&... args) { + return std::make_unique( + std::vector{std::move(args)...}, std::nullopt); } TEST(SchemaTest, Basics) { @@ -507,7 +507,6 @@ TEST(SchemaTest, NestedDuplicateFieldIdError) { namespace { -namespace TestFields { iceberg::SchemaField Id() { return {1, "id", iceberg::int32(), true}; } iceberg::SchemaField Name() { return {2, "name", iceberg::string(), false}; } iceberg::SchemaField Age() { return {3, "age", iceberg::int32(), true}; } @@ -520,103 +519,95 @@ iceberg::SchemaField Key() { return {31, "key", iceberg::int32(), false}; } iceberg::SchemaField Value() { return {32, "value", iceberg::string(), false}; } iceberg::SchemaField Element() { return {41, "element", iceberg::string(), false}; } -} // namespace TestFields - -struct TestSchemaFactory { - static std::shared_ptr BasicSchema(int32_t schema_id = 100) { - return MakeSchema(TestFields::Id(), TestFields::Name(), TestFields::Age(), - TestFields::Email()); - } - - static std::shared_ptr AddressSchema(int32_t schema_id = 200) { - auto address_type = - MakeStructType(TestFields::Street(), TestFields::City(), TestFields::Zip()); - auto address_field = iceberg::SchemaField{14, "address", address_type, true}; - return MakeSchema(TestFields::Id(), TestFields::Name(), address_field); - } +static std::unique_ptr BasicSchema() { + return MakeSchema(Id(), Name(), Age(), Email()); +} - static std::shared_ptr NestedUserSchema(int32_t schema_id = 300) { - auto address_type = MakeStructType(TestFields::Street(), TestFields::City()); - auto address_field = iceberg::SchemaField{16, "address", address_type, true}; - auto user_type = MakeStructType(TestFields::Name(), address_field); - auto user_field = iceberg::SchemaField{17, "user", user_type, true}; - return MakeSchema(TestFields::Id(), user_field); - } +static std::unique_ptr AddressSchema() { + auto address_type = MakeStructType(Street(), City(), Zip()); + auto address_field = iceberg::SchemaField{14, "address", std::move(address_type), true}; + return MakeSchema(Id(), Name(), std::move(address_field)); +} - static std::shared_ptr MultiLevelSchema(int32_t schema_id = 400) { - auto profile_type = MakeStructType(TestFields::Name(), TestFields::Age()); - auto profile_field = iceberg::SchemaField{23, "profile", profile_type, true}; +static std::unique_ptr NestedUserSchema() { + auto address_type = MakeStructType(Street(), City()); + auto address_field = iceberg::SchemaField{16, "address", std::move(address_type), true}; + auto user_type = MakeStructType(Name(), address_field); + auto user_field = iceberg::SchemaField{17, "user", std::move(user_type), true}; + return MakeSchema(Id(), user_field); +} - auto settings_type = MakeStructType(TestFields::Theme()); - auto settings_field = iceberg::SchemaField{25, "settings", settings_type, true}; +static std::unique_ptr MultiLevelSchema() { + auto profile_type = MakeStructType(Name(), Age()); + auto profile_field = iceberg::SchemaField{23, "profile", std::move(profile_type), true}; - auto user_type = MakeStructType(profile_field, settings_field); - auto user_field = iceberg::SchemaField{26, "user", user_type, true}; + auto settings_type = MakeStructType(Theme()); + auto settings_field = + iceberg::SchemaField{25, "settings", std::move(settings_type), true}; - return MakeSchema(TestFields::Id(), user_field); - } + auto user_type = MakeStructType(profile_field, settings_field); + auto user_field = iceberg::SchemaField{26, "user", std::move(user_type), true}; - static std::shared_ptr ListSchema(int32_t schema_id = 500) { - auto list_type = std::make_shared(TestFields::Element()); - auto tags_field = iceberg::SchemaField{42, "tags", list_type, true}; + return MakeSchema(Id(), user_field); +} - auto user_type = MakeStructType(TestFields::Name(), TestFields::Age()); - auto user_field = iceberg::SchemaField{45, "user", user_type, true}; +static std::unique_ptr ListSchema() { + auto list_type = std::make_shared(Element()); + auto tags_field = iceberg::SchemaField{42, "tags", std::move(list_type), true}; - return MakeSchema(TestFields::Id(), tags_field, user_field); - } + auto user_type = MakeStructType(Name(), Age()); + auto user_field = iceberg::SchemaField{45, "user", std::move(user_type), true}; - static std::shared_ptr MapSchema(int32_t schema_id = 600) { - auto map_type = - std::make_shared(TestFields::Key(), TestFields::Value()); - auto map_field = iceberg::SchemaField{33, "map_field", map_type, true}; - return MakeSchema(map_field); - } + return MakeSchema(Id(), tags_field, user_field); +} - static std::shared_ptr ListWithStructElementSchema( - int32_t schema_id = 700) { - auto struct_type = MakeStructType(TestFields::Name(), TestFields::Age()); - auto element_field = iceberg::SchemaField{53, "element", struct_type, false}; - auto list_type = std::make_shared(element_field); - auto list_field = iceberg::SchemaField{54, "list_field", list_type, true}; - return MakeSchema(list_field); - } +static std::unique_ptr MapSchema() { + auto map_type = std::make_shared(Key(), Value()); + auto map_field = iceberg::SchemaField{33, "map_field", std::move(map_type), true}; + return MakeSchema(map_field); +} - static std::shared_ptr ListOfMapSchema(int32_t schema_id = 800) { - auto map_value_struct = MakeStructType(TestFields::Name(), TestFields::Age()); - auto map_value_field = iceberg::SchemaField{64, "value", map_value_struct, false}; - auto map_type = - std::make_shared(TestFields::Key(), map_value_field); - auto list_element = iceberg::SchemaField{65, "element", map_type, false}; - auto list_type = std::make_shared(list_element); - auto list_field = iceberg::SchemaField{66, "list_field", list_type, true}; - return MakeSchema(list_field); - } +static std::unique_ptr ListWithStructElementSchema() { + auto struct_type = MakeStructType(Name(), Age()); + auto element_field = iceberg::SchemaField{53, "element", std::move(struct_type), false}; + auto list_type = std::make_shared(element_field); + auto list_field = iceberg::SchemaField{54, "list_field", std::move(list_type), true}; + return MakeSchema(list_field); +} - static std::shared_ptr ComplexMapSchema(int32_t schema_id = 900) { - auto key_id_field = iceberg::SchemaField{71, "id", iceberg::int32(), false}; - auto key_name_field = iceberg::SchemaField{72, "name", iceberg::string(), false}; - auto key_struct = MakeStructType(key_id_field, key_name_field); - auto key_field = iceberg::SchemaField{73, "key", key_struct, false}; +static std::unique_ptr ListOfMapSchema() { + auto map_value_struct = MakeStructType(Name(), Age()); + auto map_value_field = + iceberg::SchemaField{64, "value", std::move(map_value_struct), false}; + auto map_type = std::make_shared(Key(), map_value_field); + auto list_element = iceberg::SchemaField{65, "element", std::move(map_type), false}; + auto list_type = std::make_shared(list_element); + auto list_field = iceberg::SchemaField{66, "list_field", std::move(list_type), true}; + return MakeSchema(list_field); +} - auto value_id_field = iceberg::SchemaField{74, "id", iceberg::int32(), false}; - auto value_name_field = iceberg::SchemaField{75, "name", iceberg::string(), false}; - auto value_struct = MakeStructType(value_id_field, value_name_field); - auto value_field = iceberg::SchemaField{76, "value", value_struct, false}; +static std::unique_ptr ComplexMapSchema() { + auto key_id_field = iceberg::SchemaField{71, "id", iceberg::int32(), false}; + auto key_name_field = iceberg::SchemaField{72, "name", iceberg::string(), false}; + auto key_struct = MakeStructType(key_id_field, key_name_field); + auto key_field = iceberg::SchemaField{73, "key", std::move(key_struct), false}; - auto map_type = std::make_shared(key_field, value_field); - auto map_field = iceberg::SchemaField{77, "map_field", map_type, true}; - return MakeSchema(map_field); - } -}; + auto value_id_field = iceberg::SchemaField{74, "id", iceberg::int32(), false}; + auto value_name_field = iceberg::SchemaField{75, "name", iceberg::string(), false}; + auto value_struct = MakeStructType(value_id_field, value_name_field); + auto value_field = iceberg::SchemaField{76, "value", std::move(value_struct), false}; + auto map_type = std::make_shared(key_field, value_field); + auto map_field = iceberg::SchemaField{77, "map_field", std::move(map_type), true}; + return MakeSchema(map_field); +} } // namespace struct SelectTestParam { std::string test_name; - std::function()> create_schema; + std::function()> create_schema; std::vector select_fields; - std::function()> expected_schema; + std::function()> expected_schema; bool should_succeed; std::string expected_error_message; bool case_sensitive = true; @@ -624,14 +615,6 @@ struct SelectTestParam { class SelectParamTest : public ::testing::TestWithParam {}; -void CompareSchema(std::shared_ptr actual_schema, - std::shared_ptr expected_schema) { - ASSERT_EQ(actual_schema->fields().size(), expected_schema->fields().size()); - for (int i = 0; i < actual_schema->fields().size(); i++) { - ASSERT_EQ(actual_schema->fields()[i], expected_schema->fields()[i]); - } -} - TEST_P(SelectParamTest, SelectFields) { const auto& param = GetParam(); auto input_schema = param.create_schema(); @@ -639,7 +622,7 @@ TEST_P(SelectParamTest, SelectFields) { if (param.should_succeed) { ASSERT_TRUE(result.has_value()); - CompareSchema(std::move(result.value()), param.expected_schema()); + ASSERT_EQ(*result.value(), *param.expected_schema()); } else { ASSERT_FALSE(result.has_value()); ASSERT_THAT(result, iceberg::IsError(iceberg::ErrorKind::kInvalidArgument)); @@ -650,132 +633,122 @@ TEST_P(SelectParamTest, SelectFields) { INSTANTIATE_TEST_SUITE_P( SelectTestCases, SelectParamTest, ::testing::Values( - SelectTestParam{ - .test_name = "SelectAllColumns", - .create_schema = []() { return TestSchemaFactory::BasicSchema(100); }, - .select_fields = {"*"}, - .expected_schema = []() { return TestSchemaFactory::BasicSchema(100); }, - .should_succeed = true}, - - SelectTestParam{ - .test_name = "SelectSingleField", - .create_schema = []() { return TestSchemaFactory::BasicSchema(100); }, - .select_fields = {"name"}, - .expected_schema = []() { return MakeSchema(TestFields::Name()); }, - .should_succeed = true}, + SelectTestParam{.test_name = "SelectAllColumns", + .create_schema = []() { return BasicSchema(); }, + .select_fields = {"*"}, + .expected_schema = []() { return BasicSchema(); }, + .should_succeed = true}, + + SelectTestParam{.test_name = "SelectSingleField", + .create_schema = []() { return BasicSchema(); }, + .select_fields = {"name"}, + .expected_schema = []() { return MakeSchema(Name()); }, + .should_succeed = true}, SelectTestParam{ .test_name = "SelectMultipleFields", - .create_schema = []() { return TestSchemaFactory::BasicSchema(100); }, + .create_schema = []() { return BasicSchema(); }, .select_fields = {"id", "name", "age"}, - .expected_schema = - []() { - return MakeSchema(TestFields::Id(), TestFields::Name(), - TestFields::Age()); - }, - .should_succeed = true}, - - SelectTestParam{ - .test_name = "SelectNonExistentField", - .create_schema = []() { return TestSchemaFactory::BasicSchema(100); }, - .select_fields = {"nonexistent"}, - .expected_schema = []() { return MakeSchema(); }, + .expected_schema = []() { return MakeSchema(Id(), Name(), Age()); }, .should_succeed = true}, - SelectTestParam{ - .test_name = "SelectCaseSensitive", - .create_schema = []() { return TestSchemaFactory::BasicSchema(100); }, - .select_fields = {"Name"}, // case-sensitive - .expected_schema = []() { return MakeSchema(); }, - .should_succeed = true}, - - SelectTestParam{ - .test_name = "SelectCaseInsensitive", - .create_schema = []() { return TestSchemaFactory::BasicSchema(100); }, - .select_fields = {"Name"}, // case-insensitive - .expected_schema = []() { return MakeSchema(TestFields::Name()); }, - .should_succeed = true, - .case_sensitive = false})); + SelectTestParam{.test_name = "SelectNonExistentField", + .create_schema = []() { return BasicSchema(); }, + .select_fields = {"nonexistent"}, + .expected_schema = []() { return MakeSchema(); }, + .should_succeed = true}, + + SelectTestParam{.test_name = "SelectCaseSensitive", + .create_schema = []() { return BasicSchema(); }, + .select_fields = {"Name"}, // case-sensitive + .expected_schema = []() { return MakeSchema(); }, + .should_succeed = true}, + + SelectTestParam{.test_name = "SelectCaseInsensitive", + .create_schema = []() { return BasicSchema(); }, + .select_fields = {"Name"}, // case-insensitive + .expected_schema = []() { return MakeSchema(Name()); }, + .should_succeed = true, + .case_sensitive = false})); INSTANTIATE_TEST_SUITE_P( SelectNestedTestCases, SelectParamTest, - ::testing::Values( - SelectTestParam{ - .test_name = "SelectTopLevelFields", - .create_schema = []() { return TestSchemaFactory::AddressSchema(200); }, - .select_fields = {"id", "name"}, - .expected_schema = - []() { return MakeSchema(TestFields::Id(), TestFields::Name()); }, - .should_succeed = true}, - - SelectTestParam{ - .test_name = "SelectNestedField", - .create_schema = []() { return TestSchemaFactory::AddressSchema(200); }, - .select_fields = {"address.street"}, - .expected_schema = - []() { - auto address_type = MakeStructType(TestFields::Street()); - auto address_field = - iceberg::SchemaField{14, "address", address_type, true}; - return MakeSchema(address_field); - }, - .should_succeed = true})); + ::testing::Values(SelectTestParam{ + .test_name = "SelectTopLevelFields", + .create_schema = []() { return AddressSchema(); }, + .select_fields = {"id", "name"}, + .expected_schema = []() { return MakeSchema(Id(), Name()); }, + .should_succeed = true}, + + SelectTestParam{.test_name = "SelectNestedField", + .create_schema = []() { return AddressSchema(); }, + .select_fields = {"address.street"}, + .expected_schema = + []() { + auto address_type = MakeStructType(Street()); + auto address_field = iceberg::SchemaField{ + 14, "address", std::move(address_type), + true}; + return MakeSchema(address_field); + }, + .should_succeed = true})); INSTANTIATE_TEST_SUITE_P( SelectMultiLevelTestCases, SelectParamTest, ::testing::Values( - SelectTestParam{ - .test_name = "SelectTopLevelAndNestedFields", - .create_schema = []() { return TestSchemaFactory::NestedUserSchema(300); }, - .select_fields = {"id", "user.name", "user.address.street"}, - .expected_schema = - []() { - auto address_type = MakeStructType(TestFields::Street()); - auto address_field = - iceberg::SchemaField{16, "address", address_type, true}; - auto user_type = MakeStructType(TestFields::Name(), address_field); - auto user_field = iceberg::SchemaField{17, "user", user_type, true}; - return MakeSchema(TestFields::Id(), user_field); - }, - .should_succeed = true}, - - SelectTestParam{ - .test_name = "SelectNestedFieldsAtDifferentLevels", - .create_schema = []() { return TestSchemaFactory::MultiLevelSchema(400); }, - .select_fields = {"user.profile.name", "user.settings.theme"}, - .expected_schema = - []() { - auto profile_type = MakeStructType(TestFields::Name()); - auto profile_field = - iceberg::SchemaField{23, "profile", profile_type, true}; - - auto settings_type = MakeStructType(TestFields::Theme()); - auto settings_field = - iceberg::SchemaField{25, "settings", settings_type, true}; - - auto user_type = MakeStructType(profile_field, settings_field); - auto user_field = iceberg::SchemaField{26, "user", user_type, true}; - return MakeSchema(user_field); - }, - .should_succeed = true}, - - SelectTestParam{ - .test_name = "SelectListAndNestedFields", - .create_schema = []() { return TestSchemaFactory::ListSchema(500); }, - .select_fields = {"id", "user.name"}, - .expected_schema = - []() { - auto user_type = MakeStructType(TestFields::Name()); - auto user_field = iceberg::SchemaField{45, "user", user_type, true}; - return MakeSchema(TestFields::Id(), user_field); - }, - .should_succeed = true})); + SelectTestParam{.test_name = "SelectTopLevelAndNestedFields", + .create_schema = []() { return NestedUserSchema(); }, + .select_fields = {"id", "user.name", "user.address.street"}, + .expected_schema = + []() { + auto address_type = MakeStructType(Street()); + auto address_field = iceberg::SchemaField{ + 16, "address", std::move(address_type), true}; + auto user_type = MakeStructType(Name(), address_field); + auto user_field = iceberg::SchemaField{ + 17, "user", std::move(user_type), true}; + return MakeSchema(Id(), user_field); + }, + .should_succeed = true}, + + SelectTestParam{.test_name = "SelectNestedFieldsAtDifferentLevels", + .create_schema = []() { return MultiLevelSchema(); }, + .select_fields = {"user.profile.name", "user.settings.theme"}, + .expected_schema = + []() { + auto profile_type = MakeStructType(Name()); + auto profile_field = iceberg::SchemaField{ + 23, "profile", std::move(profile_type), true}; + + auto settings_type = MakeStructType(Theme()); + auto settings_field = iceberg::SchemaField{ + 25, "settings", std::move(settings_type), true}; + + auto user_type = + MakeStructType(profile_field, settings_field); + auto user_field = iceberg::SchemaField{ + 26, "user", std::move(user_type), true}; + return MakeSchema(user_field); + }, + .should_succeed = true}, + + SelectTestParam{.test_name = "SelectListAndNestedFields", + .create_schema = []() { return ListSchema(); }, + .select_fields = {"id", "user.name"}, + .expected_schema = + []() { + auto user_type = MakeStructType(Name()); + auto user_field = iceberg::SchemaField{ + 45, "user", std::move(user_type), true}; + return MakeSchema(Id(), user_field); + }, + .should_succeed = true})); struct ProjectTestParam { std::string test_name; - std::function()> create_schema; + std::function()> create_schema; std::unordered_set selected_ids; - std::function()> expected_schema; + std::function()> expected_schema; bool should_succeed; std::string expected_error_message; }; @@ -785,12 +758,11 @@ class ProjectParamTest : public ::testing::TestWithParam {}; TEST_P(ProjectParamTest, ProjectFields) { const auto& param = GetParam(); auto input_schema = param.create_schema(); - auto selected_ids = param.selected_ids; - auto result = input_schema->Project(selected_ids); + auto result = input_schema->Project(param.selected_ids); if (param.should_succeed) { ASSERT_TRUE(result.has_value()); - CompareSchema(std::move(result.value()), param.expected_schema()); + ASSERT_THAT(*result.value(), *param.expected_schema()); } else { ASSERT_FALSE(result.has_value()); ASSERT_THAT(result, iceberg::IsError(iceberg::ErrorKind::kInvalidArgument)); @@ -800,105 +772,101 @@ TEST_P(ProjectParamTest, ProjectFields) { INSTANTIATE_TEST_SUITE_P( ProjectTestCases, ProjectParamTest, - ::testing::Values( - ProjectTestParam{ - .test_name = "ProjectAllFields", - .create_schema = []() { return TestSchemaFactory::BasicSchema(100); }, - .selected_ids = {1, 2, 3, 4}, - .expected_schema = []() { return TestSchemaFactory::BasicSchema(100); }, - .should_succeed = true}, - - ProjectTestParam{ - .test_name = "ProjectSingleField", - .create_schema = []() { return TestSchemaFactory::BasicSchema(100); }, - .selected_ids = {2}, - .expected_schema = []() { return MakeSchema(TestFields::Name()); }, - .should_succeed = true}, - - ProjectTestParam{ - .test_name = "ProjectNonExistentFieldId", - .create_schema = []() { return TestSchemaFactory::BasicSchema(100); }, - .selected_ids = {999}, - .expected_schema = []() { return MakeSchema(); }, - .should_succeed = true}, - - ProjectTestParam{ - .test_name = "ProjectEmptySelection", - .create_schema = []() { return TestSchemaFactory::BasicSchema(100); }, - .selected_ids = {}, - .expected_schema = []() { return MakeSchema(); }, - .should_succeed = true})); - -INSTANTIATE_TEST_SUITE_P( - ProjectNestedTestCases, ProjectParamTest, - ::testing::Values(ProjectTestParam{ - .test_name = "ProjectNestedStructField", - .create_schema = []() { return TestSchemaFactory::AddressSchema(200); }, - .selected_ids = {11}, - .expected_schema = - []() { - auto address_type = MakeStructType(TestFields::Street()); - auto address_field = - iceberg::SchemaField{14, "address", address_type, true}; - return MakeSchema(address_field); - }, - .should_succeed = true})); + ::testing::Values(ProjectTestParam{.test_name = "ProjectAllFields", + .create_schema = []() { return BasicSchema(); }, + .selected_ids = {1, 2, 3, 4}, + .expected_schema = []() { return BasicSchema(); }, + .should_succeed = true}, + + ProjectTestParam{ + .test_name = "ProjectSingleField", + .create_schema = []() { return BasicSchema(); }, + .selected_ids = {2}, + .expected_schema = []() { return MakeSchema(Name()); }, + .should_succeed = true}, + + ProjectTestParam{.test_name = "ProjectNonExistentFieldId", + .create_schema = []() { return BasicSchema(); }, + .selected_ids = {999}, + .expected_schema = []() { return MakeSchema(); }, + .should_succeed = true}, + + ProjectTestParam{.test_name = "ProjectEmptySelection", + .create_schema = []() { return BasicSchema(); }, + .selected_ids = {}, + .expected_schema = []() { return MakeSchema(); }, + .should_succeed = true})); + +INSTANTIATE_TEST_SUITE_P(ProjectNestedTestCases, ProjectParamTest, + ::testing::Values(ProjectTestParam{ + .test_name = "ProjectNestedStructField", + .create_schema = []() { return AddressSchema(); }, + .selected_ids = {11}, + .expected_schema = + []() { + auto address_type = MakeStructType(Street()); + auto address_field = iceberg::SchemaField{ + 14, "address", std::move(address_type), true}; + return MakeSchema(address_field); + }, + .should_succeed = true})); INSTANTIATE_TEST_SUITE_P( ProjectMultiLevelTestCases, ProjectParamTest, ::testing::Values( - ProjectTestParam{ - .test_name = "ProjectTopLevelAndNestedFields", - .create_schema = []() { return TestSchemaFactory::NestedUserSchema(300); }, - .selected_ids = {1, 2, 11}, - .expected_schema = - []() { - auto address_type = MakeStructType(TestFields::Street()); - auto address_field = - iceberg::SchemaField{16, "address", address_type, true}; - auto user_type = MakeStructType(TestFields::Name(), address_field); - auto user_field = iceberg::SchemaField{17, "user", user_type, true}; - return MakeSchema(TestFields::Id(), user_field); - }, - .should_succeed = true}, - - ProjectTestParam{ - .test_name = "ProjectNestedFieldsAtDifferentLevels", - .create_schema = []() { return TestSchemaFactory::MultiLevelSchema(400); }, - .selected_ids = {2, 24}, - .expected_schema = - []() { - auto profile_type = MakeStructType(TestFields::Name()); - auto profile_field = - iceberg::SchemaField{23, "profile", profile_type, true}; - - auto settings_type = MakeStructType(TestFields::Theme()); - auto settings_field = - iceberg::SchemaField{25, "settings", settings_type, true}; - - auto user_type = MakeStructType(profile_field, settings_field); - auto user_field = iceberg::SchemaField{26, "user", user_type, true}; - return MakeSchema(user_field); - }, - .should_succeed = true}, - - ProjectTestParam{ - .test_name = "ProjectListAndNestedFields", - .create_schema = []() { return TestSchemaFactory::ListSchema(500); }, - .selected_ids = {1, 2}, - .expected_schema = - []() { - auto user_type = MakeStructType(TestFields::Name()); - auto user_field = iceberg::SchemaField{45, "user", user_type, true}; - return MakeSchema(TestFields::Id(), user_field); - }, - .should_succeed = true})); + ProjectTestParam{.test_name = "ProjectTopLevelAndNestedFields", + .create_schema = []() { return NestedUserSchema(); }, + .selected_ids = {1, 2, 11}, + .expected_schema = + []() { + auto address_type = MakeStructType(Street()); + auto address_field = iceberg::SchemaField{ + 16, "address", std::move(address_type), true}; + auto user_type = MakeStructType(Name(), address_field); + auto user_field = iceberg::SchemaField{ + 17, "user", std::move(user_type), true}; + return MakeSchema(Id(), user_field); + }, + .should_succeed = true}, + + ProjectTestParam{.test_name = "ProjectNestedFieldsAtDifferentLevels", + .create_schema = []() { return MultiLevelSchema(); }, + .selected_ids = {2, 24}, + .expected_schema = + []() { + auto profile_type = MakeStructType(Name()); + auto profile_field = iceberg::SchemaField{ + 23, "profile", std::move(profile_type), true}; + + auto settings_type = MakeStructType(Theme()); + auto settings_field = iceberg::SchemaField{ + 25, "settings", std::move(settings_type), true}; + + auto user_type = + MakeStructType(profile_field, settings_field); + auto user_field = iceberg::SchemaField{ + 26, "user", std::move(user_type), true}; + return MakeSchema(user_field); + }, + .should_succeed = true}, + + ProjectTestParam{.test_name = "ProjectListAndNestedFields", + .create_schema = []() { return ListSchema(); }, + .selected_ids = {1, 2}, + .expected_schema = + []() { + auto user_type = MakeStructType(Name()); + auto user_field = iceberg::SchemaField{ + 45, "user", std::move(user_type), true}; + return MakeSchema(Id(), user_field); + }, + .should_succeed = true})); INSTANTIATE_TEST_SUITE_P( ProjectMapErrorTestCases, ProjectParamTest, ::testing::Values(ProjectTestParam{ .test_name = "ProjectMapWithOnlyKey", - .create_schema = []() { return TestSchemaFactory::MapSchema(600); }, + .create_schema = []() { return MapSchema(); }, .selected_ids = {31}, // Only select key field, not value field .expected_schema = []() { return nullptr; }, .should_succeed = false, @@ -907,71 +875,70 @@ INSTANTIATE_TEST_SUITE_P( INSTANTIATE_TEST_SUITE_P( ProjectListAndMapTestCases, ProjectParamTest, ::testing::Values( - ProjectTestParam{ - .test_name = "ProjectListElement", - .create_schema = - []() { return TestSchemaFactory::ListWithStructElementSchema(700); }, - .selected_ids = {2}, // Only select name field from list element - .expected_schema = - []() { - auto struct_type = MakeStructType(TestFields::Name()); - auto element_field = - iceberg::SchemaField{53, "element", struct_type, false}; - auto list_type = std::make_shared(element_field); - auto list_field = - iceberg::SchemaField{54, "list_field", list_type, true}; - return MakeSchema(list_field); - }, - .should_succeed = true}, - - ProjectTestParam{ - .test_name = "ProjectListOfMap", - .create_schema = []() { return TestSchemaFactory::ListOfMapSchema(800); }, - .selected_ids = {2, 3}, - .expected_schema = - []() { - auto map_value_struct = - MakeStructType(TestFields::Name(), TestFields::Age()); - auto map_value_field = - iceberg::SchemaField{64, "value", map_value_struct, false}; - auto map_type = std::make_shared(TestFields::Key(), - map_value_field); - auto list_element = - iceberg::SchemaField{65, "element", map_type, false}; - auto list_type = std::make_shared(list_element); - auto list_field = - iceberg::SchemaField{66, "list_field", list_type, true}; - return MakeSchema(list_field); - }, - .should_succeed = true}, + ProjectTestParam{.test_name = "ProjectListElement", + .create_schema = []() { return ListWithStructElementSchema(); }, + .selected_ids = {2}, // Only select name field from list element + .expected_schema = + []() { + auto struct_type = MakeStructType(Name()); + auto element_field = iceberg::SchemaField{ + 53, "element", std::move(struct_type), false}; + auto list_type = + std::make_shared(element_field); + auto list_field = iceberg::SchemaField{ + 54, "list_field", std::move(list_type), true}; + return MakeSchema(list_field); + }, + .should_succeed = true}, + + ProjectTestParam{.test_name = "ProjectListOfMap", + .create_schema = []() { return ListOfMapSchema(); }, + .selected_ids = {2, 3}, + .expected_schema = + []() { + auto map_value_struct = MakeStructType(Name(), Age()); + auto map_value_field = iceberg::SchemaField{ + 64, "value", std::move(map_value_struct), false}; + auto map_type = std::make_shared( + Key(), map_value_field); + auto list_element = iceberg::SchemaField{ + 65, "element", std::move(map_type), false}; + auto list_type = + std::make_shared(list_element); + auto list_field = iceberg::SchemaField{ + 66, "list_field", std::move(list_type), true}; + return MakeSchema(list_field); + }, + .should_succeed = true}, ProjectTestParam{ .test_name = "ProjectMapKeyAndValue", - .create_schema = []() { return TestSchemaFactory::ComplexMapSchema(900); }, + .create_schema = []() { return ComplexMapSchema(); }, .selected_ids = {71, 74}, .expected_schema = []() { auto key_id_field = iceberg::SchemaField{71, "id", iceberg::int32(), false}; auto key_struct = MakeStructType(key_id_field); - auto key_field = iceberg::SchemaField{73, "key", key_struct, false}; + auto key_field = + iceberg::SchemaField{73, "key", std::move(key_struct), false}; auto value_id_field = iceberg::SchemaField{74, "id", iceberg::int32(), false}; auto value_struct = MakeStructType(value_id_field); auto value_field = - iceberg::SchemaField{76, "value", value_struct, false}; + iceberg::SchemaField{76, "value", std::move(value_struct), false}; auto map_type = std::make_shared(key_field, value_field); - auto map_field = iceberg::SchemaField{77, "map_field", map_type, true}; + auto map_field = + iceberg::SchemaField{77, "map_field", std::move(map_type), true}; return MakeSchema(map_field); }, .should_succeed = true}, - ProjectTestParam{ - .test_name = "ProjectEmptyResult", - .create_schema = []() { return TestSchemaFactory::BasicSchema(100); }, - .selected_ids = {999}, // Select non-existent field - .expected_schema = []() { return MakeSchema(); }, - .should_succeed = true})); + ProjectTestParam{.test_name = "ProjectEmptyResult", + .create_schema = []() { return BasicSchema(); }, + .selected_ids = {999}, // Select non-existent field + .expected_schema = []() { return MakeSchema(); }, + .should_succeed = true})); From 7e7e2a588dcb712139f1f0ad6077e9b12acfd3ba Mon Sep 17 00:00:00 2001 From: nullccxsy <32149055912@qq.com> Date: Tue, 23 Sep 2025 10:59:53 +0800 Subject: [PATCH 10/10] fix: solve merge conflicts --- test/schema_test.cc | 70 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 69 insertions(+), 1 deletion(-) diff --git a/test/schema_test.cc b/test/schema_test.cc index 54b9a96e2..3d10fb824 100644 --- a/test/schema_test.cc +++ b/test/schema_test.cc @@ -26,6 +26,7 @@ #include #include +#include "gtest/gtest.h" #include "iceberg/result.h" #include "iceberg/schema_field.h" #include "iceberg/util/formatter.h" // IWYU pragma: keep @@ -762,7 +763,7 @@ TEST_P(ProjectParamTest, ProjectFields) { if (param.should_succeed) { ASSERT_TRUE(result.has_value()); - ASSERT_THAT(*result.value(), *param.expected_schema()); + ASSERT_EQ(*result.value(), *param.expected_schema()); } else { ASSERT_FALSE(result.has_value()); ASSERT_THAT(result, iceberg::IsError(iceberg::ErrorKind::kInvalidArgument)); @@ -942,3 +943,70 @@ INSTANTIATE_TEST_SUITE_P( .selected_ids = {999}, // Select non-existent field .expected_schema = []() { return MakeSchema(); }, .should_succeed = true})); + +class SchemaThreadSafetyTest : public ::testing::Test { + protected: + void SetUp() override { + field1_ = std::make_unique(1, "id", iceberg::int32(), true); + field2_ = std::make_unique(2, "name", iceberg::string(), true); + field3_ = std::make_unique(3, "age", iceberg::int32(), true); + schema_ = std::make_unique( + std::vector{*field1_, *field2_, *field3_}, 100); + } + + std::unique_ptr schema_; + std::unique_ptr field1_; + std::unique_ptr field2_; + std::unique_ptr field3_; +}; + +TEST_F(SchemaThreadSafetyTest, ConcurrentFindFieldById) { + const int num_threads = 10; + const int iterations_per_thread = 100; + std::vector threads; + + for (int i = 0; i < num_threads; ++i) { + threads.emplace_back([this, iterations_per_thread]() { + for (int j = 0; j < iterations_per_thread; ++j) { + ASSERT_THAT(schema_->FindFieldById(1), ::testing::Optional(*field1_)); + ASSERT_THAT(schema_->FindFieldById(999), ::testing::Optional(std::nullopt)); + } + }); + } + + for (auto& thread : threads) { + thread.join(); + } +} + +TEST_F(SchemaThreadSafetyTest, MixedConcurrentOperations) { + const int num_threads = 8; + const int iterations_per_thread = 50; + std::vector threads; + + for (int i = 0; i < num_threads; ++i) { + threads.emplace_back([this, iterations_per_thread, i]() { + for (int j = 0; j < iterations_per_thread; ++j) { + if (i % 4 == 0) { + ASSERT_THAT(schema_->FindFieldById(1), ::testing::Optional(*field1_)); + } else if (i % 4 == 1) { + ASSERT_THAT(schema_->FindFieldByName("name", true), + ::testing::Optional(*field2_)); + } else if (i % 4 == 2) { + ASSERT_THAT(schema_->FindFieldByName("AGE", false), + ::testing::Optional(*field3_)); + } else { + ASSERT_THAT(schema_->FindFieldById(2), ::testing::Optional(*field2_)); + ASSERT_THAT(schema_->FindFieldByName("id", true), + ::testing::Optional(*field1_)); + ASSERT_THAT(schema_->FindFieldByName("age", false), + ::testing::Optional(*field3_)); + } + } + }); + } + + for (auto& thread : threads) { + thread.join(); + } +}