From 7999dc46be4997d242e1d6392979eec6a1662c50 Mon Sep 17 00:00:00 2001 From: nullccxsy <32149055912@qq.com> Date: Wed, 20 Aug 2025 18:53:51 +0800 Subject: [PATCH 01/14] feat(schema): implement nested field lookup with name pruning - Add GetFieldByName method to support nested field queries with dot notation - Implement InitNameToIndexMap and InitIdToIndexMap for field mapping - Introduce SchemaFieldVisitor for schema traversal - Implement name pruning logic for element and value in nested structure --- src/iceberg/schema.cc | 144 +++++++++++++++++++++++++++++++++++++++++- src/iceberg/schema.h | 31 +++++++++ src/iceberg/type.cc | 22 ++++++- src/iceberg/type.h | 17 +++-- test/schema_test.cc | 72 +++++++++++++++++++++ 5 files changed, 277 insertions(+), 9 deletions(-) diff --git a/src/iceberg/schema.cc b/src/iceberg/schema.cc index c31554ead..ad4a66bd5 100644 --- a/src/iceberg/schema.cc +++ b/src/iceberg/schema.cc @@ -21,13 +21,15 @@ #include +#include "iceberg/exception.h" #include "iceberg/type.h" #include "iceberg/util/formatter.h" // IWYU pragma: keep - namespace iceberg { Schema::Schema(std::vector fields, std::optional schema_id) - : StructType(std::move(fields)), schema_id_(schema_id) {} + : StructType(std::move(fields)), schema_id_(schema_id) { + InitIdToIndexMap(); +} std::optional Schema::schema_id() const { return schema_id_; } @@ -44,4 +46,142 @@ bool Schema::Equals(const Schema& other) const { return schema_id_ == other.schema_id_ && fields_ == other.fields_; } +std::optional> Schema::GetFieldByName( + std::string_view name, bool case_sensitive) const { + if (case_sensitive) { + InitNameToIndexMap(); + auto it = name_to_index_.find(std::string(name)); + if (it == name_to_index_.end()) return std::nullopt; + return full_schemafield_[it->second]; + } + InitLowerCaseNameToIndexMap(); + std::string lower_name(name); + std::ranges::transform(lower_name, lower_name.begin(), ::tolower); + auto it = lowercase_name_to_index_.find(lower_name); + if (it == lowercase_name_to_index_.end()) return std::nullopt; + return full_schemafield_[it->second]; +} + +std::optional> Schema::GetFieldByName( + std::string_view name) const { + return GetFieldByName(name, true); +} + +void Schema::InitIdToIndexMap() const { + if (!id_to_index_.empty()) { + return; + } + SchemaFieldVisitor visitor; + auto result = VisitTypeInline(*this, &visitor, id_to_index_, full_schemafield_); +} + +void Schema::InitNameToIndexMap() const { + if (!name_to_index_.empty()) { + return; + } + int index = 0; + std::string_view path, short_path; + SchemaFieldVisitor visitor; + std::unordered_map shortname_to_index; + auto tmp = VisitTypeInline(*this, &visitor, name_to_index_, path, shortname_to_index, + short_path, index, true); + if (!tmp.has_value()) { + throw IcebergError("Failed to perform InitNameToIndexMap"); + } + for (const auto& pair : shortname_to_index) { + if (!name_to_index_.count(pair.first)) { + name_to_index_.emplace(pair.first, pair.second); + } + } +} + +void Schema::InitLowerCaseNameToIndexMap() const { + if (!lowercase_name_to_index_.empty()) { + return; + } + int index = 0; + std::string_view path, short_path; + SchemaFieldVisitor visitor; + std::unordered_map shortlowercasename_to_index; + auto tmp = VisitTypeInline(*this, &visitor, lowercase_name_to_index_, path, + shortlowercasename_to_index, short_path, index, false); + if (!tmp.has_value()) { + throw IcebergError("Failed to perform InitLowerCaseNameToIndexMap"); + } + for (const auto& pair : shortlowercasename_to_index) { + if (!lowercase_name_to_index_.count(pair.first)) { + lowercase_name_to_index_.emplace(pair.first, pair.second); + } + } +} + +std::optional> Schema::GetFieldById( + int32_t field_id) const { + InitIdToIndexMap(); + auto it = id_to_index_.find(field_id); + if (it == id_to_index_.end()) { + return std::nullopt; + } + return full_schemafield_[it->second]; +} + +Status SchemaFieldVisitor::Visit(const Type& type, + std::unordered_map& id_to_index, + std::vector& full_schemafield) { + const auto& nested = iceberg::internal::checked_cast(type); + for (const auto& field : nested.fields()) { + id_to_index[field.field_id()] = full_schemafield.size(); + full_schemafield.emplace_back(field); + if (field.type()->is_nested()) { + auto tmp = Visit(*field.type(), id_to_index, full_schemafield); + if (!tmp.has_value()) { + throw IcebergError("Failed to perform visit(id_to_index)"); + } + } + } + return {}; +} +std::string SchemaFieldVisitor::GetPath(const std::string& last_path, + const std::string& field_name, + bool case_sensitive) { + if (case_sensitive) { + return last_path.empty() ? field_name : last_path + "." + field_name; + } + std::string lower_name(field_name); + std::ranges::transform(lower_name, lower_name.begin(), ::tolower); + return last_path.empty() ? lower_name : last_path + "." + lower_name; +} + +Status SchemaFieldVisitor::Visit( + const Type& type, std::unordered_map& name_to_index, + std::string_view path, std::unordered_map& shortname_to_index, + std::string_view short_path, int& index, bool case_sensitive) { + const char dot = '.'; + const auto& nested = iceberg::internal::checked_cast(type); + for (const auto& field : nested.fields()) { + std::string full_path, short_full_path; + full_path = GetPath(std::string(path), std::string(field.name()), case_sensitive); + name_to_index[full_path] = index; + + if (type.type_id() == TypeId::kList and field.type()->type_id() == TypeId::kStruct) { + short_full_path = short_path; + } else if (type.type_id() == TypeId::kMap and field.name() == "value" and + field.type()->type_id() == TypeId::kStruct) { + short_full_path = short_path; + } else { + short_full_path = + GetPath(std::string(short_path), std::string(field.name()), case_sensitive); + } + shortname_to_index[short_full_path] = index++; + if (field.type()->is_nested()) { + auto tmp = Visit(*field.type(), name_to_index, full_path, shortname_to_index, + short_full_path, index, case_sensitive); + if (!tmp.has_value()) { + throw IcebergError("Failed to perform visit(name_to_index)"); + } + } + } + return {}; +} + } // namespace iceberg diff --git a/src/iceberg/schema.h b/src/iceberg/schema.h index 490acb6de..392a724ca 100644 --- a/src/iceberg/schema.h +++ b/src/iceberg/schema.h @@ -23,6 +23,7 @@ /// Schemas for Iceberg tables. This header contains the definition of Schema /// and any utility functions. See iceberg/type.h and iceberg/field.h as well. +#include #include #include #include @@ -31,6 +32,7 @@ #include "iceberg/iceberg_export.h" #include "iceberg/schema_field.h" #include "iceberg/type.h" +#include "iceberg/util/visit_type.h" namespace iceberg { @@ -54,13 +56,42 @@ class ICEBERG_EXPORT Schema : public StructType { [[nodiscard]] std::string ToString() const override; + [[nodiscard]] std::optional> GetFieldByName( + std::string_view name, bool case_sensitive) const override; + + [[nodiscard]] std::optional> GetFieldByName( + std::string_view name) const; + + [[nodiscard]] std::optional> GetFieldById( + int32_t field_id) const override; + friend bool operator==(const Schema& lhs, const Schema& rhs) { return lhs.Equals(rhs); } + mutable std::unordered_map id_to_index_; + mutable std::unordered_map name_to_index_; + mutable std::unordered_map lowercase_name_to_index_; + mutable std::vector full_schemafield_; + private: /// \brief Compare two schemas for equality. [[nodiscard]] bool Equals(const Schema& other) const; const std::optional schema_id_; + + void InitIdToIndexMap() const; + void InitNameToIndexMap() const; + void InitLowerCaseNameToIndexMap() const; }; +class SchemaFieldVisitor { + public: + Status Visit(const Type& type, std::unordered_map& id_to_index, + std::vector& full_schemafield); + std::string GetPath(const std::string& last_path, const std::string& field_name, + bool case_sensitive); + Status Visit(const Type& type, std::unordered_map& name_to_index, + std::string_view path, + std::unordered_map& shortname_to_index, + std::string_view short_path, int& index, bool case_sensitive); +}; } // namespace iceberg diff --git a/src/iceberg/type.cc b/src/iceberg/type.cc index e66f96daf..b8036774e 100644 --- a/src/iceberg/type.cc +++ b/src/iceberg/type.cc @@ -27,6 +27,10 @@ #include "iceberg/util/formatter.h" // IWYU pragma: keep namespace iceberg { +std::optional> NestedType::GetFieldByName( + std::string_view name) const { + return GetFieldByName(name, true); +} StructType::StructType(std::vector fields) : fields_(std::move(fields)) { size_t index = 0; @@ -67,7 +71,7 @@ std::optional> StructType::GetFieldByI return fields_[index]; } std::optional> StructType::GetFieldByName( - std::string_view name) const { + std::string_view name, bool case_sensitive) const { // N.B. duplicate names are not permitted (looking at the Java // implementation) so there is nothing in particular we need to do here for (const auto& field : fields_) { @@ -84,6 +88,10 @@ bool StructType::Equals(const Type& other) const { const auto& struct_ = static_cast(other); return fields_ == struct_.fields_; } +std::optional> StructType::GetFieldByName( + std::string_view name) const { + return GetFieldByName(name, true); +} ListType::ListType(SchemaField element) : element_(std::move(element)) { if (element_.name() != kElementName) { @@ -120,7 +128,7 @@ std::optional> ListType::GetFieldByInd return std::nullopt; } std::optional> ListType::GetFieldByName( - std::string_view name) const { + std::string_view name, bool case_sensitive) const { if (name == element_.name()) { return std::cref(element_); } @@ -133,6 +141,10 @@ bool ListType::Equals(const Type& other) const { const auto& list = static_cast(other); return element_ == list.element_; } +std::optional> ListType::GetFieldByName( + std::string_view name) const { + return GetFieldByName(name, false); +} MapType::MapType(SchemaField key, SchemaField value) : fields_{std::move(key), std::move(value)} { @@ -178,7 +190,7 @@ std::optional> MapType::GetFieldByInde return std::nullopt; } std::optional> MapType::GetFieldByName( - std::string_view name) const { + std::string_view name, bool case_sensitive) const { if (name == kKeyName) { return key(); } else if (name == kValueName) { @@ -193,6 +205,10 @@ bool MapType::Equals(const Type& other) const { const auto& map = static_cast(other); return fields_ == map.fields_; } +std::optional> MapType::GetFieldByName( + std::string_view name) const { + return GetFieldByName(name, false); +} TypeId BooleanType::type_id() const { return kTypeId; } std::string BooleanType::ToString() const { return "boolean"; } diff --git a/src/iceberg/type.h b/src/iceberg/type.h index 78c0141b1..23b483b53 100644 --- a/src/iceberg/type.h +++ b/src/iceberg/type.h @@ -91,7 +91,10 @@ class ICEBERG_EXPORT NestedType : public Type { /// /// \note This is currently O(n) complexity. [[nodiscard]] virtual std::optional> - GetFieldByName(std::string_view name) const = 0; + GetFieldByName(std::string_view name, bool case_sensitive) const = 0; + + std::optional> GetFieldByName( + std::string_view name) const; }; /// \defgroup type-nested Nested Types @@ -114,7 +117,9 @@ class ICEBERG_EXPORT StructType : public NestedType { std::optional> GetFieldByIndex( int32_t index) const override; std::optional> GetFieldByName( - std::string_view name) const override; + std::string_view name, bool case_sensitive) const override; + std::optional> GetFieldByName( + std::string_view name) const; protected: bool Equals(const Type& other) const override; @@ -145,7 +150,9 @@ class ICEBERG_EXPORT ListType : public NestedType { std::optional> GetFieldByIndex( int32_t index) const override; std::optional> GetFieldByName( - std::string_view name) const override; + std::string_view name, bool case_sensitive) const override; + std::optional> GetFieldByName( + std::string_view name) const; protected: bool Equals(const Type& other) const override; @@ -177,7 +184,9 @@ class ICEBERG_EXPORT MapType : public NestedType { std::optional> GetFieldByIndex( int32_t index) const override; std::optional> GetFieldByName( - std::string_view name) const override; + std::string_view name, bool case_sensitive) const override; + std::optional> GetFieldByName( + std::string_view name) const; protected: bool Equals(const Type& other) const override; diff --git a/test/schema_test.cc b/test/schema_test.cc index 932395973..f3c5b7bf5 100644 --- a/test/schema_test.cc +++ b/test/schema_test.cc @@ -81,3 +81,75 @@ TEST(SchemaTest, Equality) { ASSERT_EQ(schema1, schema5); ASSERT_EQ(schema5, schema1); } + +TEST(SchemaTest, NestedType) { + iceberg::SchemaField field1(1, "Foo", iceberg::int32(), true); + iceberg::SchemaField field2(2, "Bar", iceberg::string(), true); + iceberg::SchemaField field3(3, "Foobar", iceberg::int32(), true); + + iceberg::StructType structtype = iceberg::StructType({field1, field2, field3}); + + auto listype = iceberg::ListType(iceberg::SchemaField::MakeRequired( + 4, "element", std::make_shared(structtype))); + + auto maptype = + iceberg::MapType(iceberg::SchemaField::MakeRequired(5, "key", iceberg::int32()), + iceberg::SchemaField::MakeRequired( + 6, "value", std::make_shared(listype))); + + auto field4 = iceberg::SchemaField::MakeRequired( + 4, "element", std::make_shared(structtype)); + auto field5 = iceberg::SchemaField::MakeRequired(5, "key", iceberg::int32()); + auto field6 = iceberg::SchemaField::MakeRequired( + 6, "value", std::make_shared(listype)); + auto field7 = iceberg::SchemaField::MakeRequired( + 7, "Value", std::make_shared(maptype)); + + iceberg::Schema schema({field7}, 1); + + ASSERT_EQ(schema.full_schemafield_.size(), 7); + ASSERT_THAT(schema.GetFieldById(7), ::testing::Optional(field7)); + ASSERT_THAT(schema.GetFieldById(6), ::testing::Optional(field6)); + ASSERT_THAT(schema.GetFieldById(5), ::testing::Optional(field5)); + ASSERT_THAT(schema.GetFieldById(4), ::testing::Optional(field4)); + ASSERT_THAT(schema.GetFieldById(3), ::testing::Optional(field3)); + ASSERT_THAT(schema.GetFieldById(2), ::testing::Optional(field2)); + ASSERT_THAT(schema.GetFieldById(1), ::testing::Optional(field1)); + ASSERT_EQ(schema.full_schemafield_.size(), 7); + + ASSERT_THAT(schema.GetFieldByName("Value"), ::testing::Optional(field7)); + ASSERT_THAT(schema.GetFieldByName("Value.value"), ::testing::Optional(field6)); + ASSERT_THAT(schema.GetFieldByName("Value.key"), ::testing::Optional(field5)); + ASSERT_THAT(schema.GetFieldByName("Value.value.element"), ::testing::Optional(field4)); + ASSERT_THAT(schema.GetFieldByName("Value.value.element.Foobar"), + ::testing::Optional(field3)); + ASSERT_THAT(schema.GetFieldByName("Value.value.element.Bar"), + ::testing::Optional(field2)); + ASSERT_THAT(schema.GetFieldByName("Value.value.element.Foo"), + ::testing::Optional(field1)); + ASSERT_EQ(schema.full_schemafield_.size(), 7); + + ASSERT_THAT(schema.GetFieldByName("vALue", false), ::testing::Optional(field7)); + ASSERT_THAT(schema.GetFieldByName("vALue.VALUE", false), ::testing::Optional(field6)); + ASSERT_THAT(schema.GetFieldByName("valUe.kEy", false), ::testing::Optional(field5)); + ASSERT_THAT(schema.GetFieldByName("vaLue.vAlue.elEment", false), + ::testing::Optional(field4)); + ASSERT_THAT(schema.GetFieldByName("vaLue.vAlue.eLement.fOObar", false), + ::testing::Optional(field3)); + ASSERT_THAT(schema.GetFieldByName("valUe.vaLUe.elemEnt.Bar", false), + ::testing::Optional(field2)); + ASSERT_THAT(schema.GetFieldByName("valUe.valUe.ELEMENT.FOO", false), + ::testing::Optional(field1)); + ASSERT_EQ(schema.full_schemafield_.size(), 7); + + ASSERT_THAT(schema.GetFieldByName("vaLue.value.FOO", false), + ::testing::Optional(field1)); + ASSERT_THAT(schema.GetFieldByName("Value.value.Bar", false), + ::testing::Optional(field2)); + ASSERT_THAT(schema.GetFieldByName("Value.value.FooBAR", false), + ::testing::Optional(field3)); + + ASSERT_THAT(schema.GetFieldByName("Value.value.Foo"), ::testing::Optional(field1)); + ASSERT_THAT(schema.GetFieldByName("Value.value.Bar"), ::testing::Optional(field2)); + ASSERT_THAT(schema.GetFieldByName("Value.value.Foobar"), ::testing::Optional(field3)); +} From f379edceb4ab738a4602978ff5c027647bad585c Mon Sep 17 00:00:00 2001 From: nullccxsy <32149055912@qq.com> Date: Thu, 21 Aug 2025 19:44:13 +0800 Subject: [PATCH 02/14] feat(schema): implement nested field lookup with name pruning - Introduce IdVisitor for schema traversal to init id_to_index_(Map) - Introduce NameVisitor for schema traversal to init name_to_index_(Map), lowercase_name_to_index_(map) --- src/iceberg/schema.cc | 236 +++++++++++++++++++++++++++--------------- src/iceberg/schema.h | 44 ++++---- src/iceberg/type.cc | 3 + test/schema_test.cc | 158 +++++++++++++++++++++++----- 4 files changed, 305 insertions(+), 136 deletions(-) diff --git a/src/iceberg/schema.cc b/src/iceberg/schema.cc index ad4a66bd5..540b336d4 100644 --- a/src/iceberg/schema.cc +++ b/src/iceberg/schema.cc @@ -21,15 +21,49 @@ #include -#include "iceberg/exception.h" #include "iceberg/type.h" #include "iceberg/util/formatter.h" // IWYU pragma: keep namespace iceberg { +class IdVisitor { + public: + explicit IdVisitor(bool has_init_ = false); + Status Visit(const Type& type); -Schema::Schema(std::vector fields, std::optional schema_id) - : StructType(std::move(fields)), schema_id_(schema_id) { - InitIdToIndexMap(); + bool has_init; + int index = 0; + std::unordered_map id_to_index; + std::vector> full_schemafield; +}; + +std::string GetPath(const std::string& last_path, const std::string& field_name, + bool case_sensitive) { + if (case_sensitive) { + return last_path.empty() ? field_name : last_path + "." + field_name; + } + std::string lower_name(field_name); + std::ranges::transform(lower_name, lower_name.begin(), ::tolower); + return last_path.empty() ? lower_name : last_path + "." + lower_name; } +class NameVisitor { + public: + explicit NameVisitor(bool case_sensitive_ = true, bool has_init_ = false); + Status Visit(const ListType& type, const std::string& path, + const std::string& short_path); + Status Visit(const MapType& type, const std::string& path, + const std::string& short_path); + Status Visit(const StructType& type, const std::string& path, + const std::string& short_path); + Status Visit(const PrimitiveType& type, const std::string& path, + const std::string& short_path); + + int index = 0; + bool case_sensitive; + bool has_init; + std::unordered_map name_to_index; + std::vector> full_schemafield; +}; +Schema::Schema(std::vector fields, std::optional schema_id) + : StructType(std::move(fields)), schema_id_(schema_id) {} std::optional Schema::schema_id() const { return schema_id_; } @@ -46,15 +80,15 @@ bool Schema::Equals(const Schema& other) const { return schema_id_ == other.schema_id_ && fields_ == other.fields_; } -std::optional> Schema::GetFieldByName( +Result>> Schema::FindFieldByName( std::string_view name, bool case_sensitive) const { if (case_sensitive) { - InitNameToIndexMap(); + ICEBERG_RETURN_UNEXPECTED(InitNameToIndexMap()); auto it = name_to_index_.find(std::string(name)); if (it == name_to_index_.end()) return std::nullopt; return full_schemafield_[it->second]; } - InitLowerCaseNameToIndexMap(); + ICEBERG_RETURN_UNEXPECTED(InitLowerCaseNameToIndexMap()); std::string lower_name(name); std::ranges::transform(lower_name, lower_name.begin(), ::tolower); auto it = lowercase_name_to_index_.find(lower_name); @@ -62,62 +96,58 @@ std::optional> Schema::GetFieldByName( return full_schemafield_[it->second]; } -std::optional> Schema::GetFieldByName( +Result>> Schema::FindFieldByName( std::string_view name) const { - return GetFieldByName(name, true); + return FindFieldByName(name, /*case_sensitive*/ true); } -void Schema::InitIdToIndexMap() const { +Result Schema::InitIdToIndexMap() const { if (!id_to_index_.empty()) { - return; + return {}; } - SchemaFieldVisitor visitor; - auto result = VisitTypeInline(*this, &visitor, id_to_index_, full_schemafield_); + bool has_init = !full_schemafield_.empty(); + IdVisitor visitor(has_init); + ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*this, &visitor)); + id_to_index_ = std::move(visitor.id_to_index); + if (!has_init) { + full_schemafield_ = std::move(visitor.full_schemafield); + } + return {}; } -void Schema::InitNameToIndexMap() const { +Result Schema::InitNameToIndexMap() const { if (!name_to_index_.empty()) { - return; + return {}; } - int index = 0; - std::string_view path, short_path; - SchemaFieldVisitor visitor; - std::unordered_map shortname_to_index; - auto tmp = VisitTypeInline(*this, &visitor, name_to_index_, path, shortname_to_index, - short_path, index, true); - if (!tmp.has_value()) { - throw IcebergError("Failed to perform InitNameToIndexMap"); - } - for (const auto& pair : shortname_to_index) { - if (!name_to_index_.count(pair.first)) { - name_to_index_.emplace(pair.first, pair.second); - } + bool has_init = !full_schemafield_.empty(); + std::string path, short_path; + NameVisitor visitor(true, has_init); + ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*this, &visitor, path, short_path)); + name_to_index_ = std::move(visitor.name_to_index); + if (!has_init) { + full_schemafield_ = std::move(visitor.full_schemafield); } + return {}; } -void Schema::InitLowerCaseNameToIndexMap() const { +Result Schema::InitLowerCaseNameToIndexMap() const { if (!lowercase_name_to_index_.empty()) { - return; + return {}; } - int index = 0; - std::string_view path, short_path; - SchemaFieldVisitor visitor; - std::unordered_map shortlowercasename_to_index; - auto tmp = VisitTypeInline(*this, &visitor, lowercase_name_to_index_, path, - shortlowercasename_to_index, short_path, index, false); - if (!tmp.has_value()) { - throw IcebergError("Failed to perform InitLowerCaseNameToIndexMap"); - } - for (const auto& pair : shortlowercasename_to_index) { - if (!lowercase_name_to_index_.count(pair.first)) { - lowercase_name_to_index_.emplace(pair.first, pair.second); - } + bool has_init = !full_schemafield_.empty(); + std::string path, short_path; + NameVisitor visitor(false, has_init); + ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*this, &visitor, path, short_path)); + lowercase_name_to_index_ = std::move(visitor.name_to_index); + if (!has_init) { + full_schemafield_ = std::move(visitor.full_schemafield); } + return {}; } -std::optional> Schema::GetFieldById( +Result>> Schema::FindFieldById( int32_t field_id) const { - InitIdToIndexMap(); + ICEBERG_RETURN_UNEXPECTED(InitIdToIndexMap()); auto it = id_to_index_.find(field_id); if (it == id_to_index_.end()) { return std::nullopt; @@ -125,63 +155,97 @@ std::optional> Schema::GetFieldById( return full_schemafield_[it->second]; } -Status SchemaFieldVisitor::Visit(const Type& type, - std::unordered_map& id_to_index, - std::vector& full_schemafield) { +IdVisitor::IdVisitor(bool has_init_) : has_init(has_init_) {} + +Status IdVisitor::Visit(const Type& type) { const auto& nested = iceberg::internal::checked_cast(type); - for (const auto& field : nested.fields()) { - id_to_index[field.field_id()] = full_schemafield.size(); - full_schemafield.emplace_back(field); + const auto& fields = nested.fields(); + for (const auto& field : fields) { + id_to_index[field.field_id()] = index++; + if (!has_init) { + full_schemafield.emplace_back(field); + } if (field.type()->is_nested()) { - auto tmp = Visit(*field.type(), id_to_index, full_schemafield); - if (!tmp.has_value()) { - throw IcebergError("Failed to perform visit(id_to_index)"); - } + ICEBERG_RETURN_UNEXPECTED(Visit(*field.type())); } } return {}; } -std::string SchemaFieldVisitor::GetPath(const std::string& last_path, - const std::string& field_name, - bool case_sensitive) { - if (case_sensitive) { - return last_path.empty() ? field_name : last_path + "." + field_name; + +NameVisitor::NameVisitor(bool case_sensitive_, bool has_init_) + : case_sensitive(case_sensitive_), has_init(has_init_) {} + +Status NameVisitor::Visit(const ListType& type, const std::string& path, + const std::string& short_path) { + const auto& field = type.fields()[0]; + std::string full_path = + iceberg::GetPath(path, std::string(field.name()), case_sensitive); + std::string short_full_path; + if (field.type()->type_id() == TypeId::kStruct) { + short_full_path = short_path; + } else { + short_full_path = + iceberg::GetPath(short_path, std::string(field.name()), case_sensitive); } - std::string lower_name(field_name); - std::ranges::transform(lower_name, lower_name.begin(), ::tolower); - return last_path.empty() ? lower_name : last_path + "." + lower_name; + name_to_index[full_path] = index++; + if (!has_init) { + full_schemafield.emplace_back(field); + } + name_to_index.emplace(short_full_path, index - 1); + if (field.type()->is_nested()) { + ICEBERG_RETURN_UNEXPECTED( + VisitTypeInline(*field.type(), this, full_path, short_full_path)); + } + return {}; } -Status SchemaFieldVisitor::Visit( - const Type& type, std::unordered_map& name_to_index, - std::string_view path, std::unordered_map& shortname_to_index, - std::string_view short_path, int& index, bool case_sensitive) { - const char dot = '.'; - const auto& nested = iceberg::internal::checked_cast(type); - for (const auto& field : nested.fields()) { - std::string full_path, short_full_path; - full_path = GetPath(std::string(path), std::string(field.name()), case_sensitive); - name_to_index[full_path] = index; - - if (type.type_id() == TypeId::kList and field.type()->type_id() == TypeId::kStruct) { - short_full_path = short_path; - } else if (type.type_id() == TypeId::kMap and field.name() == "value" and - field.type()->type_id() == TypeId::kStruct) { +Status NameVisitor::Visit(const MapType& type, const std::string& path, + const std::string& short_path) { + std::string full_path, short_full_path; + for (const auto& field : type.fields()) { + full_path = iceberg::GetPath(path, std::string(field.name()), case_sensitive); + if (field.name() == MapType::kValueName && + field.type()->type_id() == TypeId::kStruct) { short_full_path = short_path; } else { - short_full_path = - GetPath(std::string(short_path), std::string(field.name()), case_sensitive); + short_full_path = iceberg::GetPath(path, std::string(field.name()), case_sensitive); + } + name_to_index[full_path] = index++; + if (!has_init) { + full_schemafield.emplace_back(field); } - shortname_to_index[short_full_path] = index++; + name_to_index.emplace(short_full_path, index - 1); if (field.type()->is_nested()) { - auto tmp = Visit(*field.type(), name_to_index, full_path, shortname_to_index, - short_full_path, index, case_sensitive); - if (!tmp.has_value()) { - throw IcebergError("Failed to perform visit(name_to_index)"); - } + ICEBERG_RETURN_UNEXPECTED( + VisitTypeInline(*field.type(), this, full_path, short_full_path)); } } return {}; } +Status NameVisitor::Visit(const StructType& type, const std::string& path, + const std::string& short_path) { + const auto& fields = type.fields(); + std::string full_path, short_full_path; + for (const auto& field : fields) { + full_path = iceberg::GetPath(path, std::string(field.name()), case_sensitive); + short_full_path = + iceberg::GetPath(short_path, std::string(field.name()), case_sensitive); + name_to_index[full_path] = index++; + if (!has_init) { + full_schemafield.emplace_back(field); + } + name_to_index.emplace(short_full_path, index - 1); + if (field.type()->is_nested()) { + ICEBERG_RETURN_UNEXPECTED( + VisitTypeInline(*field.type(), this, full_path, short_full_path)); + } + } + return {}; +} + +Status NameVisitor::Visit(const PrimitiveType& type, const std::string& path, + const std::string& short_path) { + return {}; +} } // namespace iceberg diff --git a/src/iceberg/schema.h b/src/iceberg/schema.h index 392a724ca..ad3a55ee5 100644 --- a/src/iceberg/schema.h +++ b/src/iceberg/schema.h @@ -32,6 +32,7 @@ #include "iceberg/iceberg_export.h" #include "iceberg/schema_field.h" #include "iceberg/type.h" +#include "iceberg/util/macros.h" #include "iceberg/util/visit_type.h" namespace iceberg { @@ -56,21 +57,32 @@ class ICEBERG_EXPORT Schema : public StructType { [[nodiscard]] std::string ToString() const override; - [[nodiscard]] std::optional> GetFieldByName( - std::string_view name, bool case_sensitive) const override; + ///\brief Get thd SchemaField By Name + /// + /// Short names for maps and lists are included for any name that does not conflict with + /// a canonical name. For example, a list, 'l', of structs with field 'x' will produce + /// short name 'l.x' in addition to canonical name 'l.element.x'. a map 'm', if its + /// value include a structs with field 'x' wil produce short name 'm.x' in addition to + /// canonical name 'm.value.x' + [[nodiscard]] Result>> + FindFieldByName(std::string_view name, bool case_sensitive) const; - [[nodiscard]] std::optional> GetFieldByName( - std::string_view name) const; + [[nodiscard]] Result>> + FindFieldByName(std::string_view name) const; - [[nodiscard]] std::optional> GetFieldById( - int32_t field_id) const override; + [[nodiscard]] Result>> + FindFieldById(int32_t field_id) const; friend bool operator==(const Schema& lhs, const Schema& rhs) { return lhs.Equals(rhs); } + /// Mapping from field id to index of `full_schemafield_`. mutable std::unordered_map id_to_index_; + /// Mapping from field name to index of `full_schemafield_`. mutable std::unordered_map name_to_index_; + /// Mapping from field lowercase_name(suppoert case_insensitive query) to index of + /// `full_schemafield_`. mutable std::unordered_map lowercase_name_to_index_; - mutable std::vector full_schemafield_; + mutable std::vector> full_schemafield_; private: /// \brief Compare two schemas for equality. @@ -78,20 +90,8 @@ class ICEBERG_EXPORT Schema : public StructType { const std::optional schema_id_; - void InitIdToIndexMap() const; - void InitNameToIndexMap() const; - void InitLowerCaseNameToIndexMap() const; -}; - -class SchemaFieldVisitor { - public: - Status Visit(const Type& type, std::unordered_map& id_to_index, - std::vector& full_schemafield); - std::string GetPath(const std::string& last_path, const std::string& field_name, - bool case_sensitive); - Status Visit(const Type& type, std::unordered_map& name_to_index, - std::string_view path, - std::unordered_map& shortname_to_index, - std::string_view short_path, int& index, bool case_sensitive); + Result InitIdToIndexMap() const; + Result InitNameToIndexMap() const; + Result InitLowerCaseNameToIndexMap() const; }; } // namespace iceberg diff --git a/src/iceberg/type.cc b/src/iceberg/type.cc index b8036774e..19ff95723 100644 --- a/src/iceberg/type.cc +++ b/src/iceberg/type.cc @@ -70,6 +70,7 @@ std::optional> StructType::GetFieldByI } return fields_[index]; } +// todo std::optional> StructType::GetFieldByName( std::string_view name, bool case_sensitive) const { // N.B. duplicate names are not permitted (looking at the Java @@ -127,6 +128,7 @@ std::optional> ListType::GetFieldByInd } return std::nullopt; } +// todo std::optional> ListType::GetFieldByName( std::string_view name, bool case_sensitive) const { if (name == element_.name()) { @@ -189,6 +191,7 @@ std::optional> MapType::GetFieldByInde } return std::nullopt; } +// todo std::optional> MapType::GetFieldByName( std::string_view name, bool case_sensitive) const { if (name == kKeyName) { diff --git a/test/schema_test.cc b/test/schema_test.cc index f3c5b7bf5..b67ab0ba8 100644 --- a/test/schema_test.cc +++ b/test/schema_test.cc @@ -107,49 +107,151 @@ TEST(SchemaTest, NestedType) { iceberg::Schema schema({field7}, 1); - ASSERT_EQ(schema.full_schemafield_.size(), 7); - ASSERT_THAT(schema.GetFieldById(7), ::testing::Optional(field7)); - ASSERT_THAT(schema.GetFieldById(6), ::testing::Optional(field6)); - ASSERT_THAT(schema.GetFieldById(5), ::testing::Optional(field5)); - ASSERT_THAT(schema.GetFieldById(4), ::testing::Optional(field4)); - ASSERT_THAT(schema.GetFieldById(3), ::testing::Optional(field3)); - ASSERT_THAT(schema.GetFieldById(2), ::testing::Optional(field2)); - ASSERT_THAT(schema.GetFieldById(1), ::testing::Optional(field1)); + ASSERT_EQ(schema.full_schemafield_.size(), 0); + ASSERT_THAT(schema.FindFieldById(7), ::testing::Optional(field7)); + ASSERT_THAT(schema.FindFieldById(6), ::testing::Optional(field6)); + ASSERT_THAT(schema.FindFieldById(5), ::testing::Optional(field5)); + ASSERT_THAT(schema.FindFieldById(4), ::testing::Optional(field4)); + ASSERT_THAT(schema.FindFieldById(3), ::testing::Optional(field3)); + ASSERT_THAT(schema.FindFieldById(2), ::testing::Optional(field2)); + ASSERT_THAT(schema.FindFieldById(1), ::testing::Optional(field1)); ASSERT_EQ(schema.full_schemafield_.size(), 7); - ASSERT_THAT(schema.GetFieldByName("Value"), ::testing::Optional(field7)); - ASSERT_THAT(schema.GetFieldByName("Value.value"), ::testing::Optional(field6)); - ASSERT_THAT(schema.GetFieldByName("Value.key"), ::testing::Optional(field5)); - ASSERT_THAT(schema.GetFieldByName("Value.value.element"), ::testing::Optional(field4)); - ASSERT_THAT(schema.GetFieldByName("Value.value.element.Foobar"), + ASSERT_THAT(schema.FindFieldByName("Value"), ::testing::Optional(field7)); + ASSERT_THAT(schema.FindFieldByName("Value.value"), ::testing::Optional(field6)); + ASSERT_THAT(schema.FindFieldByName("Value.key"), ::testing::Optional(field5)); + ASSERT_THAT(schema.FindFieldByName("Value.value.element"), ::testing::Optional(field4)); + ASSERT_THAT(schema.FindFieldByName("Value.value.element.Foobar"), ::testing::Optional(field3)); - ASSERT_THAT(schema.GetFieldByName("Value.value.element.Bar"), + ASSERT_THAT(schema.FindFieldByName("Value.value.element.Bar"), ::testing::Optional(field2)); - ASSERT_THAT(schema.GetFieldByName("Value.value.element.Foo"), + ASSERT_THAT(schema.FindFieldByName("Value.value.element.Foo"), ::testing::Optional(field1)); ASSERT_EQ(schema.full_schemafield_.size(), 7); - ASSERT_THAT(schema.GetFieldByName("vALue", false), ::testing::Optional(field7)); - ASSERT_THAT(schema.GetFieldByName("vALue.VALUE", false), ::testing::Optional(field6)); - ASSERT_THAT(schema.GetFieldByName("valUe.kEy", false), ::testing::Optional(field5)); - ASSERT_THAT(schema.GetFieldByName("vaLue.vAlue.elEment", false), + ASSERT_THAT(schema.FindFieldByName("vALue", false), ::testing::Optional(field7)); + ASSERT_THAT(schema.FindFieldByName("vALue.VALUE", false), ::testing::Optional(field6)); + ASSERT_THAT(schema.FindFieldByName("valUe.kEy", false), ::testing::Optional(field5)); + ASSERT_THAT(schema.FindFieldByName("vaLue.vAlue.elEment", false), ::testing::Optional(field4)); - ASSERT_THAT(schema.GetFieldByName("vaLue.vAlue.eLement.fOObar", false), + ASSERT_THAT(schema.FindFieldByName("vaLue.vAlue.eLement.fOObar", false), ::testing::Optional(field3)); - ASSERT_THAT(schema.GetFieldByName("valUe.vaLUe.elemEnt.Bar", false), + ASSERT_THAT(schema.FindFieldByName("valUe.vaLUe.elemEnt.Bar", false), ::testing::Optional(field2)); - ASSERT_THAT(schema.GetFieldByName("valUe.valUe.ELEMENT.FOO", false), + ASSERT_THAT(schema.FindFieldByName("valUe.valUe.ELEMENT.FOO", false), ::testing::Optional(field1)); ASSERT_EQ(schema.full_schemafield_.size(), 7); - ASSERT_THAT(schema.GetFieldByName("vaLue.value.FOO", false), + ASSERT_THAT(schema.FindFieldByName("vaLue.value.FOO", false), + ::testing::Optional(field1)); + ASSERT_THAT(schema.FindFieldByName("Value.value.Bar", false), + ::testing::Optional(field2)); + ASSERT_THAT(schema.FindFieldByName("Value.value.FooBAR", false), + ::testing::Optional(field3)); + + ASSERT_THAT(schema.FindFieldByName("Value.value.Foo"), ::testing::Optional(field1)); + ASSERT_THAT(schema.FindFieldByName("Value.value.Bar"), ::testing::Optional(field2)); + ASSERT_THAT(schema.FindFieldByName("Value.value.Foobar"), ::testing::Optional(field3)); +} + +TEST(SchemaTest, NestType2) { + iceberg::SchemaField field1(1, "Foo", iceberg::int32(), true); + iceberg::SchemaField field2(2, "Bar", iceberg::string(), true); + iceberg::SchemaField field3(3, "Foobar", iceberg::int32(), true); + + iceberg::StructType structtype = iceberg::StructType({field1, field2, field3}); + + auto field4 = iceberg::SchemaField::MakeRequired( + 4, "element", std::make_shared(structtype)); + + auto listype = iceberg::ListType(field4); + + auto structtype2 = iceberg::StructType( + {iceberg::SchemaField::MakeRequired(5, "First_child", iceberg::int32()), + iceberg::SchemaField::MakeRequired(6, "Second_child", + std::make_shared(listype))}); + + auto maptype = iceberg::MapType( + iceberg::SchemaField::MakeRequired(7, "key", iceberg::int32()), + iceberg::SchemaField::MakeRequired( + 8, "value", std::make_shared(structtype2))); + + auto field5 = iceberg::SchemaField::MakeRequired(5, "First_child", iceberg::int32()); + auto field6 = iceberg::SchemaField::MakeRequired( + 6, "Second_child", std::make_shared(listype)); + auto field7 = iceberg::SchemaField::MakeRequired(7, "key", iceberg::int32()); + auto field8 = iceberg::SchemaField::MakeRequired( + 8, "value", std::make_shared(structtype2)); + auto field9 = iceberg::SchemaField::MakeRequired( + 9, "Map", std::make_shared(maptype)); + + iceberg::Schema schema({field9}, 1); + + ASSERT_EQ(schema.full_schemafield_.size(), 0); + ASSERT_THAT(schema.FindFieldById(9), ::testing::Optional(field9)); + ASSERT_THAT(schema.FindFieldById(8), ::testing::Optional(field8)); + ASSERT_THAT(schema.FindFieldById(7), ::testing::Optional(field7)); + ASSERT_THAT(schema.FindFieldById(6), ::testing::Optional(field6)); + ASSERT_THAT(schema.FindFieldById(5), ::testing::Optional(field5)); + ASSERT_THAT(schema.FindFieldById(4), ::testing::Optional(field4)); + ASSERT_THAT(schema.FindFieldById(3), ::testing::Optional(field3)); + ASSERT_THAT(schema.FindFieldById(2), ::testing::Optional(field2)); + ASSERT_THAT(schema.FindFieldById(1), ::testing::Optional(field1)); + ASSERT_EQ(schema.full_schemafield_.size(), 9); + + ASSERT_THAT(schema.FindFieldByName("Map"), ::testing::Optional(field9)); + ASSERT_THAT(schema.FindFieldByName("Map.value"), ::testing::Optional(field8)); + ASSERT_THAT(schema.FindFieldByName("Map.key"), ::testing::Optional(field7)); + ASSERT_THAT(schema.FindFieldByName("Map.value.Second_child"), + ::testing::Optional(field6)); + ASSERT_THAT(schema.FindFieldByName("Map.value.First_child"), + ::testing::Optional(field5)); + ASSERT_THAT(schema.FindFieldByName("Map.value.Second_child.element"), + ::testing::Optional(field4)); + ASSERT_THAT(schema.FindFieldByName("Map.value.Second_child.element.Foobar"), + ::testing::Optional(field3)); + ASSERT_THAT(schema.FindFieldByName("Map.value.Second_child.element.Bar"), + ::testing::Optional(field2)); + ASSERT_THAT(schema.FindFieldByName("Map.value.Second_child.element.Foo"), ::testing::Optional(field1)); - ASSERT_THAT(schema.GetFieldByName("Value.value.Bar", false), + ASSERT_EQ(schema.full_schemafield_.size(), 9); + + ASSERT_THAT(schema.FindFieldByName("map", false), ::testing::Optional(field9)); + ASSERT_THAT(schema.FindFieldByName("map.vALUE", false), ::testing::Optional(field8)); + ASSERT_THAT(schema.FindFieldByName("map.Key", false), ::testing::Optional(field7)); + ASSERT_THAT(schema.FindFieldByName("map.Value.second_Child", false), + ::testing::Optional(field6)); + ASSERT_THAT(schema.FindFieldByName("map.Value.first_chIld", false), + ::testing::Optional(field5)); + ASSERT_THAT(schema.FindFieldByName("map.Value.second_child.Element", false), + ::testing::Optional(field4)); + ASSERT_THAT(schema.FindFieldByName("map.Value.second_child.Element.foobar", false), + ::testing::Optional(field3)); + ASSERT_THAT(schema.FindFieldByName("map.VaLue.second_child.Element.bar", false), ::testing::Optional(field2)); - ASSERT_THAT(schema.GetFieldByName("Value.value.FooBAR", false), + ASSERT_THAT(schema.FindFieldByName("map.value.Second_child.Element.foo", false), + ::testing::Optional(field1)); + ASSERT_EQ(schema.full_schemafield_.size(), 9); + + ASSERT_THAT(schema.FindFieldByName("Map.Second_child"), ::testing::Optional(field6)); + ASSERT_THAT(schema.FindFieldByName("Map.First_child"), ::testing::Optional(field5)); + ASSERT_THAT(schema.FindFieldByName("Map.Second_child.Foobar"), ::testing::Optional(field3)); + ASSERT_THAT(schema.FindFieldByName("Map.Second_child.Bar"), + ::testing::Optional(field2)); + ASSERT_THAT(schema.FindFieldByName("Map.Second_child.Foo"), + ::testing::Optional(field1)); + ASSERT_EQ(schema.full_schemafield_.size(), 9); - ASSERT_THAT(schema.GetFieldByName("Value.value.Foo"), ::testing::Optional(field1)); - ASSERT_THAT(schema.GetFieldByName("Value.value.Bar"), ::testing::Optional(field2)); - ASSERT_THAT(schema.GetFieldByName("Value.value.Foobar"), ::testing::Optional(field3)); + ASSERT_THAT(schema.FindFieldByName("map.second_child", false), + ::testing::Optional(field6)); + ASSERT_THAT(schema.FindFieldByName("map.first_child", false), + ::testing::Optional(field5)); + ASSERT_THAT(schema.FindFieldByName("map.second_child.foobar", false), + ::testing::Optional(field3)); + ASSERT_THAT(schema.FindFieldByName("map.second_child.bar", false), + ::testing::Optional(field2)); + ASSERT_THAT(schema.FindFieldByName("map.second_child.foo", false), + ::testing::Optional(field1)); + ASSERT_EQ(schema.full_schemafield_.size(), 9); } From 6e4e68cdd2e6e0095903e53e43b59b8371ecde95 Mon Sep 17 00:00:00 2001 From: nullccxsy <32149055912@qq.com> Date: Fri, 22 Aug 2025 13:46:17 +0800 Subject: [PATCH 03/14] refactor(schema): optimize IdVisitor & merge id/index maps to align Java impl 1. Refactor IdVisitor: Add VisitNested() to handle NestedType, make Visit() generic for all Type hierarchy (avoid anti-pattern) 2. Merge id_to_index_ + full_schemafield_ into id_to_field_ (std::unordered_map>) to reduce memory footprint 3. Align with Java impl's idToField design (support future alias/identifier fields without refactor) --- src/iceberg/schema.cc | 193 +++++++++++++++++++----------------------- src/iceberg/schema.h | 21 ++--- src/iceberg/type.cc | 25 +----- src/iceberg/type.h | 17 +--- test/schema_test.cc | 14 +-- 5 files changed, 106 insertions(+), 164 deletions(-) diff --git a/src/iceberg/schema.cc b/src/iceberg/schema.cc index 540b336d4..ed832286c 100644 --- a/src/iceberg/schema.cc +++ b/src/iceberg/schema.cc @@ -19,34 +19,29 @@ #include "iceberg/schema.h" +#include #include #include "iceberg/type.h" #include "iceberg/util/formatter.h" // IWYU pragma: keep +#include "iceberg/util/macros.h" +#include "iceberg/util/visit_type.h" namespace iceberg { class IdVisitor { public: - explicit IdVisitor(bool has_init_ = false); + explicit IdVisitor( + std::unordered_map>& + id_to_field); Status Visit(const Type& type); + Status VisitNestedType(const Type& type); - bool has_init; - int index = 0; - std::unordered_map id_to_index; - std::vector> full_schemafield; + private: + std::unordered_map>& id_to_field_; }; - -std::string GetPath(const std::string& last_path, const std::string& field_name, - bool case_sensitive) { - if (case_sensitive) { - return last_path.empty() ? field_name : last_path + "." + field_name; - } - std::string lower_name(field_name); - std::ranges::transform(lower_name, lower_name.begin(), ::tolower); - return last_path.empty() ? lower_name : last_path + "." + lower_name; -} -class NameVisitor { +class NametoIdVisitor { public: - explicit NameVisitor(bool case_sensitive_ = true, bool has_init_ = false); + explicit NametoIdVisitor(std::unordered_map& name_to_id, + bool case_sensitive_ = true); Status Visit(const ListType& type, const std::string& path, const std::string& short_path); Status Visit(const MapType& type, const std::string& path, @@ -55,12 +50,12 @@ class NameVisitor { const std::string& short_path); Status Visit(const PrimitiveType& type, const std::string& path, const std::string& short_path); + static std::string GetPath(const std::string& last_path, const std::string& field_name, + bool case_sensitive = true); - int index = 0; - bool case_sensitive; - bool has_init; - std::unordered_map name_to_index; - std::vector> full_schemafield; + private: + bool case_sensitive_; + std::unordered_map& name_to_id_; }; Schema::Schema(std::vector fields, std::optional schema_id) : StructType(std::move(fields)), schema_id_(schema_id) {} @@ -84,16 +79,16 @@ Result>> Schema::FindFie std::string_view name, bool case_sensitive) const { if (case_sensitive) { ICEBERG_RETURN_UNEXPECTED(InitNameToIndexMap()); - auto it = name_to_index_.find(std::string(name)); - if (it == name_to_index_.end()) return std::nullopt; - return full_schemafield_[it->second]; + auto it = name_to_id_.find(std::string(name)); + if (it == name_to_id_.end()) return std::nullopt; + return FindFieldById(it->second); } ICEBERG_RETURN_UNEXPECTED(InitLowerCaseNameToIndexMap()); std::string lower_name(name); std::ranges::transform(lower_name, lower_name.begin(), ::tolower); - auto it = lowercase_name_to_index_.find(lower_name); - if (it == lowercase_name_to_index_.end()) return std::nullopt; - return full_schemafield_[it->second]; + auto it = lowercase_name_to_id_.find(lower_name); + if (it == lowercase_name_to_id_.end()) return std::nullopt; + return FindFieldById(it->second); } Result>> Schema::FindFieldByName( @@ -102,150 +97,134 @@ Result>> Schema::FindFie } Result Schema::InitIdToIndexMap() const { - if (!id_to_index_.empty()) { + if (!id_to_field_.empty()) { return {}; } - bool has_init = !full_schemafield_.empty(); - IdVisitor visitor(has_init); + IdVisitor visitor(id_to_field_); ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*this, &visitor)); - id_to_index_ = std::move(visitor.id_to_index); - if (!has_init) { - full_schemafield_ = std::move(visitor.full_schemafield); - } return {}; } Result Schema::InitNameToIndexMap() const { - if (!name_to_index_.empty()) { + if (!name_to_id_.empty()) { return {}; } - bool has_init = !full_schemafield_.empty(); std::string path, short_path; - NameVisitor visitor(true, has_init); + NametoIdVisitor visitor(name_to_id_, true); ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*this, &visitor, path, short_path)); - name_to_index_ = std::move(visitor.name_to_index); - if (!has_init) { - full_schemafield_ = std::move(visitor.full_schemafield); - } return {}; } Result Schema::InitLowerCaseNameToIndexMap() const { - if (!lowercase_name_to_index_.empty()) { + if (!lowercase_name_to_id_.empty()) { return {}; } - bool has_init = !full_schemafield_.empty(); std::string path, short_path; - NameVisitor visitor(false, has_init); + NametoIdVisitor visitor(lowercase_name_to_id_, false); ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*this, &visitor, path, short_path)); - lowercase_name_to_index_ = std::move(visitor.name_to_index); - if (!has_init) { - full_schemafield_ = std::move(visitor.full_schemafield); - } return {}; } Result>> Schema::FindFieldById( int32_t field_id) const { ICEBERG_RETURN_UNEXPECTED(InitIdToIndexMap()); - auto it = id_to_index_.find(field_id); - if (it == id_to_index_.end()) { + auto it = id_to_field_.find(field_id); + if (it == id_to_field_.end()) { return std::nullopt; } - return full_schemafield_[it->second]; + return it->second; } -IdVisitor::IdVisitor(bool has_init_) : has_init(has_init_) {} +IdVisitor::IdVisitor( + std::unordered_map>& id_to_field) + : id_to_field_(id_to_field) {} Status IdVisitor::Visit(const Type& type) { + if (type.is_nested()) { + ICEBERG_RETURN_UNEXPECTED(VisitNestedType(type)); + } + return {}; +} + +Status IdVisitor::VisitNestedType(const Type& type) { const auto& nested = iceberg::internal::checked_cast(type); const auto& fields = nested.fields(); for (const auto& field : fields) { - id_to_index[field.field_id()] = index++; - if (!has_init) { - full_schemafield.emplace_back(field); - } - if (field.type()->is_nested()) { - ICEBERG_RETURN_UNEXPECTED(Visit(*field.type())); - } + id_to_field_.emplace(field.field_id(), std::cref(field)); + ICEBERG_RETURN_UNEXPECTED(Visit(*field.type())); } return {}; } -NameVisitor::NameVisitor(bool case_sensitive_, bool has_init_) - : case_sensitive(case_sensitive_), has_init(has_init_) {} +NametoIdVisitor::NametoIdVisitor(std::unordered_map& name_to_id, + bool case_sensitive) + : name_to_id_(name_to_id), case_sensitive_(case_sensitive) {} -Status NameVisitor::Visit(const ListType& type, const std::string& path, - const std::string& short_path) { +Status NametoIdVisitor::Visit(const ListType& type, const std::string& path, + const std::string& short_path) { const auto& field = type.fields()[0]; - std::string full_path = - iceberg::GetPath(path, std::string(field.name()), case_sensitive); + std::string full_path = GetPath(path, std::string(field.name()), case_sensitive_); std::string short_full_path; if (field.type()->type_id() == TypeId::kStruct) { short_full_path = short_path; } else { - short_full_path = - iceberg::GetPath(short_path, std::string(field.name()), case_sensitive); - } - name_to_index[full_path] = index++; - if (!has_init) { - full_schemafield.emplace_back(field); - } - name_to_index.emplace(short_full_path, index - 1); - if (field.type()->is_nested()) { - ICEBERG_RETURN_UNEXPECTED( - VisitTypeInline(*field.type(), this, full_path, short_full_path)); + short_full_path = GetPath(short_path, std::string(field.name()), case_sensitive_); } + name_to_id_[full_path] = field.field_id(); + name_to_id_.emplace(short_full_path, field.field_id()); + ICEBERG_RETURN_UNEXPECTED( + VisitTypeInline(*field.type(), this, full_path, short_full_path)); return {}; } -Status NameVisitor::Visit(const MapType& type, const std::string& path, - const std::string& short_path) { +Status NametoIdVisitor::Visit(const MapType& type, const std::string& path, + const std::string& short_path) { std::string full_path, short_full_path; - for (const auto& field : type.fields()) { - full_path = iceberg::GetPath(path, std::string(field.name()), case_sensitive); + const auto& fields = type.fields(); + for (const auto& field : fields) { + full_path = GetPath(path, std::string(field.name()), case_sensitive_); if (field.name() == MapType::kValueName && field.type()->type_id() == TypeId::kStruct) { short_full_path = short_path; } else { - short_full_path = iceberg::GetPath(path, std::string(field.name()), case_sensitive); - } - name_to_index[full_path] = index++; - if (!has_init) { - full_schemafield.emplace_back(field); - } - name_to_index.emplace(short_full_path, index - 1); - if (field.type()->is_nested()) { - ICEBERG_RETURN_UNEXPECTED( - VisitTypeInline(*field.type(), this, full_path, short_full_path)); + short_full_path = GetPath(path, std::string(field.name()), case_sensitive_); } + name_to_id_[full_path] = field.field_id(); + name_to_id_.emplace(short_full_path, field.field_id()); + ICEBERG_RETURN_UNEXPECTED( + VisitTypeInline(*field.type(), this, full_path, short_full_path)); } return {}; } -Status NameVisitor::Visit(const StructType& type, const std::string& path, - const std::string& short_path) { +Status NametoIdVisitor::Visit(const StructType& type, const std::string& path, + const std::string& short_path) { const auto& fields = type.fields(); std::string full_path, short_full_path; for (const auto& field : fields) { - full_path = iceberg::GetPath(path, std::string(field.name()), case_sensitive); - short_full_path = - iceberg::GetPath(short_path, std::string(field.name()), case_sensitive); - name_to_index[full_path] = index++; - if (!has_init) { - full_schemafield.emplace_back(field); - } - name_to_index.emplace(short_full_path, index - 1); - if (field.type()->is_nested()) { - ICEBERG_RETURN_UNEXPECTED( - VisitTypeInline(*field.type(), this, full_path, short_full_path)); - } + full_path = + NametoIdVisitor::GetPath(path, std::string(field.name()), case_sensitive_); + short_full_path = GetPath(short_path, std::string(field.name()), case_sensitive_); + name_to_id_[full_path] = field.field_id(); + name_to_id_.emplace(short_full_path, field.field_id()); + ICEBERG_RETURN_UNEXPECTED( + VisitTypeInline(*field.type(), this, full_path, short_full_path)); } return {}; } -Status NameVisitor::Visit(const PrimitiveType& type, const std::string& path, - const std::string& short_path) { +Status NametoIdVisitor::Visit(const PrimitiveType& type, const std::string& path, + const std::string& short_path) { return {}; } + +std::string NametoIdVisitor::GetPath(const std::string& last_path, + const std::string& field_name, bool case_sensitive) { + if (case_sensitive) { + return last_path.empty() ? field_name : last_path + "." + field_name; + } + std::string lower_name(field_name); + std::ranges::transform(lower_name, lower_name.begin(), ::tolower); + return last_path.empty() ? lower_name : last_path + "." + lower_name; +} } // namespace iceberg diff --git a/src/iceberg/schema.h b/src/iceberg/schema.h index ad3a55ee5..55d42b0d4 100644 --- a/src/iceberg/schema.h +++ b/src/iceberg/schema.h @@ -23,17 +23,15 @@ /// Schemas for Iceberg tables. This header contains the definition of Schema /// and any utility functions. See iceberg/type.h and iceberg/field.h as well. -#include #include #include #include #include #include "iceberg/iceberg_export.h" +#include "iceberg/result.h" #include "iceberg/schema_field.h" #include "iceberg/type.h" -#include "iceberg/util/macros.h" -#include "iceberg/util/visit_type.h" namespace iceberg { @@ -57,7 +55,7 @@ class ICEBERG_EXPORT Schema : public StructType { [[nodiscard]] std::string ToString() const override; - ///\brief Get thd SchemaField By Name + ///\brief Find thd SchemaField By field name /// /// Short names for maps and lists are included for any name that does not conflict with /// a canonical name. For example, a list, 'l', of structs with field 'x' will produce @@ -75,14 +73,13 @@ class ICEBERG_EXPORT Schema : public StructType { friend bool operator==(const Schema& lhs, const Schema& rhs) { return lhs.Equals(rhs); } - /// Mapping from field id to index of `full_schemafield_`. - mutable std::unordered_map id_to_index_; - /// Mapping from field name to index of `full_schemafield_`. - mutable std::unordered_map name_to_index_; - /// Mapping from field lowercase_name(suppoert case_insensitive query) to index of - /// `full_schemafield_`. - mutable std::unordered_map lowercase_name_to_index_; - mutable std::vector> full_schemafield_; + /// Mapping from field id to field. + mutable std::unordered_map> + id_to_field_; + /// Mapping from field name to id of field. + mutable std::unordered_map name_to_id_; + /// Mapping from field lowercase_name(suppoert case_insensitive query) to id of field + mutable std::unordered_map lowercase_name_to_id_; private: /// \brief Compare two schemas for equality. diff --git a/src/iceberg/type.cc b/src/iceberg/type.cc index 19ff95723..e66f96daf 100644 --- a/src/iceberg/type.cc +++ b/src/iceberg/type.cc @@ -27,10 +27,6 @@ #include "iceberg/util/formatter.h" // IWYU pragma: keep namespace iceberg { -std::optional> NestedType::GetFieldByName( - std::string_view name) const { - return GetFieldByName(name, true); -} StructType::StructType(std::vector fields) : fields_(std::move(fields)) { size_t index = 0; @@ -70,9 +66,8 @@ std::optional> StructType::GetFieldByI } return fields_[index]; } -// todo std::optional> StructType::GetFieldByName( - std::string_view name, bool case_sensitive) const { + std::string_view name) const { // N.B. duplicate names are not permitted (looking at the Java // implementation) so there is nothing in particular we need to do here for (const auto& field : fields_) { @@ -89,10 +84,6 @@ bool StructType::Equals(const Type& other) const { const auto& struct_ = static_cast(other); return fields_ == struct_.fields_; } -std::optional> StructType::GetFieldByName( - std::string_view name) const { - return GetFieldByName(name, true); -} ListType::ListType(SchemaField element) : element_(std::move(element)) { if (element_.name() != kElementName) { @@ -128,9 +119,8 @@ std::optional> ListType::GetFieldByInd } return std::nullopt; } -// todo std::optional> ListType::GetFieldByName( - std::string_view name, bool case_sensitive) const { + std::string_view name) const { if (name == element_.name()) { return std::cref(element_); } @@ -143,10 +133,6 @@ bool ListType::Equals(const Type& other) const { const auto& list = static_cast(other); return element_ == list.element_; } -std::optional> ListType::GetFieldByName( - std::string_view name) const { - return GetFieldByName(name, false); -} MapType::MapType(SchemaField key, SchemaField value) : fields_{std::move(key), std::move(value)} { @@ -191,9 +177,8 @@ std::optional> MapType::GetFieldByInde } return std::nullopt; } -// todo std::optional> MapType::GetFieldByName( - std::string_view name, bool case_sensitive) const { + std::string_view name) const { if (name == kKeyName) { return key(); } else if (name == kValueName) { @@ -208,10 +193,6 @@ bool MapType::Equals(const Type& other) const { const auto& map = static_cast(other); return fields_ == map.fields_; } -std::optional> MapType::GetFieldByName( - std::string_view name) const { - return GetFieldByName(name, false); -} TypeId BooleanType::type_id() const { return kTypeId; } std::string BooleanType::ToString() const { return "boolean"; } diff --git a/src/iceberg/type.h b/src/iceberg/type.h index 23b483b53..78c0141b1 100644 --- a/src/iceberg/type.h +++ b/src/iceberg/type.h @@ -91,10 +91,7 @@ class ICEBERG_EXPORT NestedType : public Type { /// /// \note This is currently O(n) complexity. [[nodiscard]] virtual std::optional> - GetFieldByName(std::string_view name, bool case_sensitive) const = 0; - - std::optional> GetFieldByName( - std::string_view name) const; + GetFieldByName(std::string_view name) const = 0; }; /// \defgroup type-nested Nested Types @@ -117,9 +114,7 @@ class ICEBERG_EXPORT StructType : public NestedType { std::optional> GetFieldByIndex( int32_t index) const override; std::optional> GetFieldByName( - std::string_view name, bool case_sensitive) const override; - std::optional> GetFieldByName( - std::string_view name) const; + std::string_view name) const override; protected: bool Equals(const Type& other) const override; @@ -150,9 +145,7 @@ class ICEBERG_EXPORT ListType : public NestedType { std::optional> GetFieldByIndex( int32_t index) const override; std::optional> GetFieldByName( - std::string_view name, bool case_sensitive) const override; - std::optional> GetFieldByName( - std::string_view name) const; + std::string_view name) const override; protected: bool Equals(const Type& other) const override; @@ -184,9 +177,7 @@ class ICEBERG_EXPORT MapType : public NestedType { std::optional> GetFieldByIndex( int32_t index) const override; std::optional> GetFieldByName( - std::string_view name, bool case_sensitive) const override; - std::optional> GetFieldByName( - std::string_view name) const; + std::string_view name) const override; protected: bool Equals(const Type& other) const override; diff --git a/test/schema_test.cc b/test/schema_test.cc index b67ab0ba8..7dc55b89e 100644 --- a/test/schema_test.cc +++ b/test/schema_test.cc @@ -107,7 +107,7 @@ TEST(SchemaTest, NestedType) { iceberg::Schema schema({field7}, 1); - ASSERT_EQ(schema.full_schemafield_.size(), 0); + ASSERT_EQ(schema.id_to_field_.size(), 0); ASSERT_THAT(schema.FindFieldById(7), ::testing::Optional(field7)); ASSERT_THAT(schema.FindFieldById(6), ::testing::Optional(field6)); ASSERT_THAT(schema.FindFieldById(5), ::testing::Optional(field5)); @@ -115,7 +115,6 @@ TEST(SchemaTest, NestedType) { ASSERT_THAT(schema.FindFieldById(3), ::testing::Optional(field3)); ASSERT_THAT(schema.FindFieldById(2), ::testing::Optional(field2)); ASSERT_THAT(schema.FindFieldById(1), ::testing::Optional(field1)); - ASSERT_EQ(schema.full_schemafield_.size(), 7); ASSERT_THAT(schema.FindFieldByName("Value"), ::testing::Optional(field7)); ASSERT_THAT(schema.FindFieldByName("Value.value"), ::testing::Optional(field6)); @@ -127,7 +126,6 @@ TEST(SchemaTest, NestedType) { ::testing::Optional(field2)); ASSERT_THAT(schema.FindFieldByName("Value.value.element.Foo"), ::testing::Optional(field1)); - ASSERT_EQ(schema.full_schemafield_.size(), 7); ASSERT_THAT(schema.FindFieldByName("vALue", false), ::testing::Optional(field7)); ASSERT_THAT(schema.FindFieldByName("vALue.VALUE", false), ::testing::Optional(field6)); @@ -140,7 +138,6 @@ TEST(SchemaTest, NestedType) { ::testing::Optional(field2)); ASSERT_THAT(schema.FindFieldByName("valUe.valUe.ELEMENT.FOO", false), ::testing::Optional(field1)); - ASSERT_EQ(schema.full_schemafield_.size(), 7); ASSERT_THAT(schema.FindFieldByName("vaLue.value.FOO", false), ::testing::Optional(field1)); @@ -152,6 +149,7 @@ TEST(SchemaTest, NestedType) { ASSERT_THAT(schema.FindFieldByName("Value.value.Foo"), ::testing::Optional(field1)); ASSERT_THAT(schema.FindFieldByName("Value.value.Bar"), ::testing::Optional(field2)); ASSERT_THAT(schema.FindFieldByName("Value.value.Foobar"), ::testing::Optional(field3)); + ASSERT_EQ(schema.id_to_field_.size(), 7); } TEST(SchemaTest, NestType2) { @@ -187,7 +185,7 @@ TEST(SchemaTest, NestType2) { iceberg::Schema schema({field9}, 1); - ASSERT_EQ(schema.full_schemafield_.size(), 0); + ASSERT_EQ(schema.id_to_field_.size(), 0); ASSERT_THAT(schema.FindFieldById(9), ::testing::Optional(field9)); ASSERT_THAT(schema.FindFieldById(8), ::testing::Optional(field8)); ASSERT_THAT(schema.FindFieldById(7), ::testing::Optional(field7)); @@ -197,7 +195,6 @@ TEST(SchemaTest, NestType2) { ASSERT_THAT(schema.FindFieldById(3), ::testing::Optional(field3)); ASSERT_THAT(schema.FindFieldById(2), ::testing::Optional(field2)); ASSERT_THAT(schema.FindFieldById(1), ::testing::Optional(field1)); - ASSERT_EQ(schema.full_schemafield_.size(), 9); ASSERT_THAT(schema.FindFieldByName("Map"), ::testing::Optional(field9)); ASSERT_THAT(schema.FindFieldByName("Map.value"), ::testing::Optional(field8)); @@ -214,7 +211,6 @@ TEST(SchemaTest, NestType2) { ::testing::Optional(field2)); ASSERT_THAT(schema.FindFieldByName("Map.value.Second_child.element.Foo"), ::testing::Optional(field1)); - ASSERT_EQ(schema.full_schemafield_.size(), 9); ASSERT_THAT(schema.FindFieldByName("map", false), ::testing::Optional(field9)); ASSERT_THAT(schema.FindFieldByName("map.vALUE", false), ::testing::Optional(field8)); @@ -231,7 +227,6 @@ TEST(SchemaTest, NestType2) { ::testing::Optional(field2)); ASSERT_THAT(schema.FindFieldByName("map.value.Second_child.Element.foo", false), ::testing::Optional(field1)); - ASSERT_EQ(schema.full_schemafield_.size(), 9); ASSERT_THAT(schema.FindFieldByName("Map.Second_child"), ::testing::Optional(field6)); ASSERT_THAT(schema.FindFieldByName("Map.First_child"), ::testing::Optional(field5)); @@ -241,7 +236,6 @@ TEST(SchemaTest, NestType2) { ::testing::Optional(field2)); ASSERT_THAT(schema.FindFieldByName("Map.Second_child.Foo"), ::testing::Optional(field1)); - ASSERT_EQ(schema.full_schemafield_.size(), 9); ASSERT_THAT(schema.FindFieldByName("map.second_child", false), ::testing::Optional(field6)); @@ -253,5 +247,5 @@ TEST(SchemaTest, NestType2) { ::testing::Optional(field2)); ASSERT_THAT(schema.FindFieldByName("map.second_child.foo", false), ::testing::Optional(field1)); - ASSERT_EQ(schema.full_schemafield_.size(), 9); + ASSERT_EQ(schema.id_to_field_.size(), 9); } From 36f9a96d9a30882697ffc6ae7765175b3c9c410d Mon Sep 17 00:00:00 2001 From: nullccxsy <32149055912@qq.com> Date: Fri, 22 Aug 2025 18:48:01 +0800 Subject: [PATCH 04/14] style(schema): fix code style issues in schema_test - change the type of id from size_t to int_32 - rename GetPath to BuildPath - fix bug new_short_path should build from short_path, not path --- src/iceberg/schema.cc | 80 ++++----- src/iceberg/schema.h | 14 +- test/schema_test.cc | 401 ++++++++++++++++++++++++++---------------- 3 files changed, 290 insertions(+), 205 deletions(-) diff --git a/src/iceberg/schema.cc b/src/iceberg/schema.cc index ed832286c..7051f8fe0 100644 --- a/src/iceberg/schema.cc +++ b/src/iceberg/schema.cc @@ -26,10 +26,11 @@ #include "iceberg/util/formatter.h" // IWYU pragma: keep #include "iceberg/util/macros.h" #include "iceberg/util/visit_type.h" + namespace iceberg { -class IdVisitor { +class IdToFieldVisitor { public: - explicit IdVisitor( + explicit IdToFieldVisitor( std::unordered_map>& id_to_field); Status Visit(const Type& type); @@ -38,9 +39,10 @@ class IdVisitor { private: std::unordered_map>& id_to_field_; }; + class NametoIdVisitor { public: - explicit NametoIdVisitor(std::unordered_map& name_to_id, + explicit NametoIdVisitor(std::unordered_map& name_to_id, bool case_sensitive_ = true); Status Visit(const ListType& type, const std::string& path, const std::string& short_path); @@ -50,13 +52,14 @@ class NametoIdVisitor { const std::string& short_path); Status Visit(const PrimitiveType& type, const std::string& path, const std::string& short_path); - static std::string GetPath(const std::string& last_path, const std::string& field_name, - bool case_sensitive = true); + static std::string BuildPath(std::string_view prefix, std::string_view field_name, + bool case_sensitive); private: bool case_sensitive_; - std::unordered_map& name_to_id_; + std::unordered_map& name_to_id_; }; + Schema::Schema(std::vector fields, std::optional schema_id) : StructType(std::move(fields)), schema_id_(schema_id) {} @@ -91,16 +94,11 @@ Result>> Schema::FindFie return FindFieldById(it->second); } -Result>> Schema::FindFieldByName( - std::string_view name) const { - return FindFieldByName(name, /*case_sensitive*/ true); -} - Result Schema::InitIdToIndexMap() const { if (!id_to_field_.empty()) { return {}; } - IdVisitor visitor(id_to_field_); + IdToFieldVisitor visitor(id_to_field_); ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*this, &visitor)); return {}; } @@ -135,18 +133,18 @@ Result>> Schema::FindFie return it->second; } -IdVisitor::IdVisitor( +IdToFieldVisitor::IdToFieldVisitor( std::unordered_map>& id_to_field) : id_to_field_(id_to_field) {} -Status IdVisitor::Visit(const Type& type) { +Status IdToFieldVisitor::Visit(const Type& type) { if (type.is_nested()) { ICEBERG_RETURN_UNEXPECTED(VisitNestedType(type)); } return {}; } -Status IdVisitor::VisitNestedType(const Type& type) { +Status IdToFieldVisitor::VisitNestedType(const Type& type) { const auto& nested = iceberg::internal::checked_cast(type); const auto& fields = nested.fields(); for (const auto& field : fields) { @@ -156,43 +154,43 @@ Status IdVisitor::VisitNestedType(const Type& type) { return {}; } -NametoIdVisitor::NametoIdVisitor(std::unordered_map& name_to_id, +NametoIdVisitor::NametoIdVisitor(std::unordered_map& name_to_id, bool case_sensitive) : name_to_id_(name_to_id), case_sensitive_(case_sensitive) {} Status NametoIdVisitor::Visit(const ListType& type, const std::string& path, const std::string& short_path) { const auto& field = type.fields()[0]; - std::string full_path = GetPath(path, std::string(field.name()), case_sensitive_); - std::string short_full_path; + std::string new_path = BuildPath(path, field.name(), case_sensitive_); + std::string new_short_path; if (field.type()->type_id() == TypeId::kStruct) { - short_full_path = short_path; + new_short_path = short_path; } else { - short_full_path = GetPath(short_path, std::string(field.name()), case_sensitive_); + new_short_path = BuildPath(short_path, field.name(), case_sensitive_); } - name_to_id_[full_path] = field.field_id(); - name_to_id_.emplace(short_full_path, field.field_id()); + name_to_id_[new_path] = field.field_id(); + name_to_id_.emplace(new_short_path, field.field_id()); ICEBERG_RETURN_UNEXPECTED( - VisitTypeInline(*field.type(), this, full_path, short_full_path)); + VisitTypeInline(*field.type(), this, new_path, new_short_path)); return {}; } Status NametoIdVisitor::Visit(const MapType& type, const std::string& path, const std::string& short_path) { - std::string full_path, short_full_path; + std::string new_path, new_short_path; const auto& fields = type.fields(); for (const auto& field : fields) { - full_path = GetPath(path, std::string(field.name()), case_sensitive_); + new_path = BuildPath(path, field.name(), case_sensitive_); if (field.name() == MapType::kValueName && field.type()->type_id() == TypeId::kStruct) { - short_full_path = short_path; + new_short_path = short_path; } else { - short_full_path = GetPath(path, std::string(field.name()), case_sensitive_); + new_short_path = BuildPath(short_path, field.name(), case_sensitive_); } - name_to_id_[full_path] = field.field_id(); - name_to_id_.emplace(short_full_path, field.field_id()); + name_to_id_[new_path] = field.field_id(); + name_to_id_.emplace(new_short_path, field.field_id()); ICEBERG_RETURN_UNEXPECTED( - VisitTypeInline(*field.type(), this, full_path, short_full_path)); + VisitTypeInline(*field.type(), this, new_path, new_short_path)); } return {}; } @@ -200,15 +198,14 @@ Status NametoIdVisitor::Visit(const MapType& type, const std::string& path, Status NametoIdVisitor::Visit(const StructType& type, const std::string& path, const std::string& short_path) { const auto& fields = type.fields(); - std::string full_path, short_full_path; + std::string new_path, new_short_path; for (const auto& field : fields) { - full_path = - NametoIdVisitor::GetPath(path, std::string(field.name()), case_sensitive_); - short_full_path = GetPath(short_path, std::string(field.name()), case_sensitive_); - name_to_id_[full_path] = field.field_id(); - name_to_id_.emplace(short_full_path, field.field_id()); + new_path = BuildPath(path, field.name(), case_sensitive_); + new_short_path = BuildPath(short_path, field.name(), case_sensitive_); + name_to_id_[new_path] = field.field_id(); + name_to_id_.emplace(new_short_path, field.field_id()); ICEBERG_RETURN_UNEXPECTED( - VisitTypeInline(*field.type(), this, full_path, short_full_path)); + VisitTypeInline(*field.type(), this, new_path, new_short_path)); } return {}; } @@ -218,13 +215,14 @@ Status NametoIdVisitor::Visit(const PrimitiveType& type, const std::string& path return {}; } -std::string NametoIdVisitor::GetPath(const std::string& last_path, - const std::string& field_name, bool case_sensitive) { +std::string NametoIdVisitor::BuildPath(std::string_view prefix, + std::string_view field_name, bool case_sensitive) { if (case_sensitive) { - return last_path.empty() ? field_name : last_path + "." + field_name; + return prefix.empty() ? std::string(field_name) + : std::string(prefix) + "." + std::string(field_name); } std::string lower_name(field_name); std::ranges::transform(lower_name, lower_name.begin(), ::tolower); - return last_path.empty() ? lower_name : last_path + "." + lower_name; + return prefix.empty() ? lower_name : std::string(prefix) + "." + lower_name; } } // namespace iceberg diff --git a/src/iceberg/schema.h b/src/iceberg/schema.h index 55d42b0d4..b9b3f38d1 100644 --- a/src/iceberg/schema.h +++ b/src/iceberg/schema.h @@ -63,23 +63,21 @@ class ICEBERG_EXPORT Schema : public StructType { /// value include a structs with field 'x' wil produce short name 'm.x' in addition to /// canonical name 'm.value.x' [[nodiscard]] Result>> - FindFieldByName(std::string_view name, bool case_sensitive) const; - - [[nodiscard]] Result>> - FindFieldByName(std::string_view name) const; + FindFieldByName(std::string_view name, bool case_sensitive = true) const; [[nodiscard]] Result>> FindFieldById(int32_t field_id) const; friend bool operator==(const Schema& lhs, const Schema& rhs) { return lhs.Equals(rhs); } + private: /// Mapping from field id to field. mutable std::unordered_map> id_to_field_; - /// Mapping from field name to id of field. - mutable std::unordered_map name_to_id_; - /// Mapping from field lowercase_name(suppoert case_insensitive query) to id of field - mutable std::unordered_map lowercase_name_to_id_; + /// Mapping from field name to field id. + mutable std::unordered_map name_to_id_; + /// Mapping from lowercased field name to field id + mutable std::unordered_map lowercase_name_to_id_; private: /// \brief Compare two schemas for equality. diff --git a/test/schema_test.cc b/test/schema_test.cc index 7dc55b89e..5dc44cb5f 100644 --- a/test/schema_test.cc +++ b/test/schema_test.cc @@ -82,170 +82,259 @@ TEST(SchemaTest, Equality) { ASSERT_EQ(schema5, schema1); } -TEST(SchemaTest, NestedType) { - iceberg::SchemaField field1(1, "Foo", iceberg::int32(), true); - iceberg::SchemaField field2(2, "Bar", iceberg::string(), true); - iceberg::SchemaField field3(3, "Foobar", iceberg::int32(), true); +class NestedTypeTest : public ::testing::Test { + protected: + void SetUp() override { + field1_ = iceberg::SchemaField(1, "Foo", iceberg::int32(), true); + field2_ = iceberg::SchemaField(2, "Bar", iceberg::string(), true); + field3_ = iceberg::SchemaField(3, "Foobar", iceberg::int32(), true); + + iceberg::StructType structtype = + iceberg::StructType(std::vector{ + field1_.value(), field2_.value(), field3_.value()}); + + auto listype = iceberg::ListType(iceberg::SchemaField::MakeRequired( + 4, "element", std::make_shared(structtype))); + + auto maptype = + iceberg::MapType(iceberg::SchemaField::MakeRequired(5, "key", iceberg::int32()), + iceberg::SchemaField::MakeRequired( + 6, "value", std::make_shared(listype))); + + field4_ = iceberg::SchemaField::MakeRequired( + 4, "element", std::make_shared(structtype)); + field5_ = iceberg::SchemaField::MakeRequired(5, "key", iceberg::int32()); + field6_ = iceberg::SchemaField::MakeRequired( + 6, "value", std::make_shared(listype)); + field7_ = iceberg::SchemaField::MakeRequired( + 7, "Value", std::make_shared(maptype)); + + schema_ = std::make_shared( + std::vector{field7_.value()}, 1); + } - iceberg::StructType structtype = iceberg::StructType({field1, field2, field3}); + std::shared_ptr schema_; + std::optional field1_; + std::optional field2_; + std::optional field3_; + std::optional field4_; + std::optional field5_; + std::optional field6_; + std::optional field7_; +}; + +TEST_F(NestedTypeTest, TestFindById) { + ASSERT_THAT(schema_->FindFieldById(7), ::testing::Optional(field7_)); + ASSERT_THAT(schema_->FindFieldById(6), ::testing::Optional(field6_)); + ASSERT_THAT(schema_->FindFieldById(5), ::testing::Optional(field5_)); + ASSERT_THAT(schema_->FindFieldById(4), ::testing::Optional(field4_)); + ASSERT_THAT(schema_->FindFieldById(3), ::testing::Optional(field3_)); + ASSERT_THAT(schema_->FindFieldById(2), ::testing::Optional(field2_)); + ASSERT_THAT(schema_->FindFieldById(1), ::testing::Optional(field1_)); + + ASSERT_THAT(schema_->FindFieldById(10), ::testing::Optional(std::nullopt)); +} - auto listype = iceberg::ListType(iceberg::SchemaField::MakeRequired( - 4, "element", std::make_shared(structtype))); +TEST_F(NestedTypeTest, TestFindByName) { + ASSERT_THAT(schema_->FindFieldByName("Value"), ::testing::Optional(field7_)); + ASSERT_THAT(schema_->FindFieldByName("Value.value"), ::testing::Optional(field6_)); + ASSERT_THAT(schema_->FindFieldByName("Value.key"), ::testing::Optional(field5_)); + ASSERT_THAT(schema_->FindFieldByName("Value.value.element"), + ::testing::Optional(field4_)); + ASSERT_THAT(schema_->FindFieldByName("Value.value.element.Foobar"), + ::testing::Optional(field3_)); + ASSERT_THAT(schema_->FindFieldByName("Value.value.element.Bar"), + ::testing::Optional(field2_)); + ASSERT_THAT(schema_->FindFieldByName("Value.value.element.Foo"), + ::testing::Optional(field1_)); + + ASSERT_THAT(schema_->FindFieldByName("Value.value.element.FoO"), + ::testing::Optional(std::nullopt)); +} - auto maptype = - iceberg::MapType(iceberg::SchemaField::MakeRequired(5, "key", iceberg::int32()), - iceberg::SchemaField::MakeRequired( - 6, "value", std::make_shared(listype))); +TEST_F(NestedTypeTest, TestFindByNameCaseInsensitive) { + ASSERT_THAT(schema_->FindFieldByName("vALue", false), ::testing::Optional(field7_)); + ASSERT_THAT(schema_->FindFieldByName("vALue.VALUE", false), + ::testing::Optional(field6_)); + ASSERT_THAT(schema_->FindFieldByName("valUe.kEy", false), ::testing::Optional(field5_)); + ASSERT_THAT(schema_->FindFieldByName("vaLue.vAlue.elEment", false), + ::testing::Optional(field4_)); + ASSERT_THAT(schema_->FindFieldByName("vaLue.vAlue.eLement.fOObar", false), + ::testing::Optional(field3_)); + ASSERT_THAT(schema_->FindFieldByName("valUe.vaLUe.elemEnt.Bar", false), + ::testing::Optional(field2_)); + ASSERT_THAT(schema_->FindFieldByName("valUe.valUe.ELEMENT.FOO", false), + ::testing::Optional(field1_)); + ASSERT_THAT(schema_->FindFieldByName("valUe.valUe.ELEMENT.FO", false), + ::testing::Optional(std::nullopt)); +} - auto field4 = iceberg::SchemaField::MakeRequired( - 4, "element", std::make_shared(structtype)); - auto field5 = iceberg::SchemaField::MakeRequired(5, "key", iceberg::int32()); - auto field6 = iceberg::SchemaField::MakeRequired( - 6, "value", std::make_shared(listype)); - auto field7 = iceberg::SchemaField::MakeRequired( - 7, "Value", std::make_shared(maptype)); - - iceberg::Schema schema({field7}, 1); - - ASSERT_EQ(schema.id_to_field_.size(), 0); - ASSERT_THAT(schema.FindFieldById(7), ::testing::Optional(field7)); - ASSERT_THAT(schema.FindFieldById(6), ::testing::Optional(field6)); - ASSERT_THAT(schema.FindFieldById(5), ::testing::Optional(field5)); - ASSERT_THAT(schema.FindFieldById(4), ::testing::Optional(field4)); - ASSERT_THAT(schema.FindFieldById(3), ::testing::Optional(field3)); - ASSERT_THAT(schema.FindFieldById(2), ::testing::Optional(field2)); - ASSERT_THAT(schema.FindFieldById(1), ::testing::Optional(field1)); - - ASSERT_THAT(schema.FindFieldByName("Value"), ::testing::Optional(field7)); - ASSERT_THAT(schema.FindFieldByName("Value.value"), ::testing::Optional(field6)); - ASSERT_THAT(schema.FindFieldByName("Value.key"), ::testing::Optional(field5)); - ASSERT_THAT(schema.FindFieldByName("Value.value.element"), ::testing::Optional(field4)); - ASSERT_THAT(schema.FindFieldByName("Value.value.element.Foobar"), - ::testing::Optional(field3)); - ASSERT_THAT(schema.FindFieldByName("Value.value.element.Bar"), - ::testing::Optional(field2)); - ASSERT_THAT(schema.FindFieldByName("Value.value.element.Foo"), - ::testing::Optional(field1)); - - ASSERT_THAT(schema.FindFieldByName("vALue", false), ::testing::Optional(field7)); - ASSERT_THAT(schema.FindFieldByName("vALue.VALUE", false), ::testing::Optional(field6)); - ASSERT_THAT(schema.FindFieldByName("valUe.kEy", false), ::testing::Optional(field5)); - ASSERT_THAT(schema.FindFieldByName("vaLue.vAlue.elEment", false), - ::testing::Optional(field4)); - ASSERT_THAT(schema.FindFieldByName("vaLue.vAlue.eLement.fOObar", false), - ::testing::Optional(field3)); - ASSERT_THAT(schema.FindFieldByName("valUe.vaLUe.elemEnt.Bar", false), - ::testing::Optional(field2)); - ASSERT_THAT(schema.FindFieldByName("valUe.valUe.ELEMENT.FOO", false), - ::testing::Optional(field1)); - - ASSERT_THAT(schema.FindFieldByName("vaLue.value.FOO", false), - ::testing::Optional(field1)); - ASSERT_THAT(schema.FindFieldByName("Value.value.Bar", false), - ::testing::Optional(field2)); - ASSERT_THAT(schema.FindFieldByName("Value.value.FooBAR", false), - ::testing::Optional(field3)); - - ASSERT_THAT(schema.FindFieldByName("Value.value.Foo"), ::testing::Optional(field1)); - ASSERT_THAT(schema.FindFieldByName("Value.value.Bar"), ::testing::Optional(field2)); - ASSERT_THAT(schema.FindFieldByName("Value.value.Foobar"), ::testing::Optional(field3)); - ASSERT_EQ(schema.id_to_field_.size(), 7); +TEST_F(NestedTypeTest, TestFindByShortNameCaseInsensitive) { + ASSERT_THAT(schema_->FindFieldByName("vaLue.value.FOO", false), + ::testing::Optional(field1_)); + ASSERT_THAT(schema_->FindFieldByName("Value.value.Bar", false), + ::testing::Optional(field2_)); + ASSERT_THAT(schema_->FindFieldByName("Value.value.FooBAR", false), + ::testing::Optional(field3_)); + ASSERT_THAT(schema_->FindFieldByName("Value.value.FooBAR.a", false), + ::testing::Optional(std::nullopt)); } -TEST(SchemaTest, NestType2) { - iceberg::SchemaField field1(1, "Foo", iceberg::int32(), true); - iceberg::SchemaField field2(2, "Bar", iceberg::string(), true); - iceberg::SchemaField field3(3, "Foobar", iceberg::int32(), true); +class NestType2Test : public ::testing::Test { + protected: + void SetUp() override { + field1_ = iceberg::SchemaField(1, "Foo", iceberg::int32(), true); + field2_ = iceberg::SchemaField(2, "Bar", iceberg::string(), true); + field3_ = iceberg::SchemaField(3, "Foobar", iceberg::int32(), true); + + iceberg::StructType structtype = + iceberg::StructType({field1_.value(), field2_.value(), field3_.value()}); + + field4_ = iceberg::SchemaField::MakeRequired( + 4, "element", std::make_shared(structtype)); + auto listype = iceberg::ListType(field4_.value()); + + iceberg::StructType structtype2 = iceberg::StructType( + {iceberg::SchemaField::MakeRequired(5, "First_child", iceberg::int32()), + iceberg::SchemaField::MakeRequired( + 6, "Second_child", std::make_shared(listype))}); + + auto maptype = iceberg::MapType( + iceberg::SchemaField::MakeRequired(7, "key", iceberg::int32()), + iceberg::SchemaField::MakeRequired( + 8, "value", std::make_shared(structtype2))); + + field5_ = iceberg::SchemaField::MakeRequired(5, "First_child", iceberg::int32()); + field6_ = iceberg::SchemaField::MakeRequired( + 6, "Second_child", std::make_shared(listype)); + field7_ = iceberg::SchemaField::MakeRequired(7, "key", iceberg::int32()); + field8_ = iceberg::SchemaField::MakeRequired( + 8, "value", std::make_shared(structtype2)); + field9_ = iceberg::SchemaField::MakeRequired( + 9, "Map", std::make_shared(maptype)); + + schema_ = std::make_shared( + std::vector{field9_.value()}, 1); + } - iceberg::StructType structtype = iceberg::StructType({field1, field2, field3}); + std::shared_ptr schema_; + std::optional field1_; + std::optional field2_; + std::optional field3_; + std::optional field4_; + std::optional field5_; + std::optional field6_; + std::optional field7_; + std::optional field8_; + std::optional field9_; +}; + +TEST_F(NestType2Test, TestFindById) { + ASSERT_THAT(schema_->FindFieldById(9), ::testing::Optional(field9_)); + ASSERT_THAT(schema_->FindFieldById(8), ::testing::Optional(field8_)); + ASSERT_THAT(schema_->FindFieldById(7), ::testing::Optional(field7_)); + ASSERT_THAT(schema_->FindFieldById(6), ::testing::Optional(field6_)); + ASSERT_THAT(schema_->FindFieldById(5), ::testing::Optional(field5_)); + ASSERT_THAT(schema_->FindFieldById(4), ::testing::Optional(field4_)); + ASSERT_THAT(schema_->FindFieldById(3), ::testing::Optional(field3_)); + ASSERT_THAT(schema_->FindFieldById(2), ::testing::Optional(field2_)); + ASSERT_THAT(schema_->FindFieldById(1), ::testing::Optional(field1_)); + + ASSERT_THAT(schema_->FindFieldById(0), ::testing::Optional(std::nullopt)); +} + +TEST_F(NestType2Test, TestFindByName) { + ASSERT_THAT(schema_->FindFieldByName("Map"), ::testing::Optional(field9_)); + ASSERT_THAT(schema_->FindFieldByName("Map.value"), ::testing::Optional(field8_)); + ASSERT_THAT(schema_->FindFieldByName("Map.key"), ::testing::Optional(field7_)); + ASSERT_THAT(schema_->FindFieldByName("Map.value.Second_child"), + ::testing::Optional(field6_)); + ASSERT_THAT(schema_->FindFieldByName("Map.value.First_child"), + ::testing::Optional(field5_)); + ASSERT_THAT(schema_->FindFieldByName("Map.value.Second_child.element"), + ::testing::Optional(field4_)); + ASSERT_THAT(schema_->FindFieldByName("Map.value.Second_child.element.Foobar"), + ::testing::Optional(field3_)); + ASSERT_THAT(schema_->FindFieldByName("Map.value.Second_child.element.Bar"), + ::testing::Optional(field2_)); + ASSERT_THAT(schema_->FindFieldByName("Map.value.Second_child.element.Foo"), + ::testing::Optional(field1_)); + ASSERT_THAT(schema_->FindFieldByName("Map.value.Second_child.element.Fooo"), + ::testing::Optional(std::nullopt)); +} + +TEST_F(NestType2Test, TestFindByNameCaseInsensitive) { + ASSERT_THAT(schema_->FindFieldByName("map", false), ::testing::Optional(field9_)); + ASSERT_THAT(schema_->FindFieldByName("map.vALUE", false), ::testing::Optional(field8_)); + ASSERT_THAT(schema_->FindFieldByName("map.Key", false), ::testing::Optional(field7_)); + ASSERT_THAT(schema_->FindFieldByName("map.Value.second_Child", false), + ::testing::Optional(field6_)); + ASSERT_THAT(schema_->FindFieldByName("map.Value.first_chIld", false), + ::testing::Optional(field5_)); + ASSERT_THAT(schema_->FindFieldByName("map.Value.second_child.Element", false), + ::testing::Optional(field4_)); + ASSERT_THAT(schema_->FindFieldByName("map.Value.second_child.Element.foobar", false), + ::testing::Optional(field3_)); + ASSERT_THAT(schema_->FindFieldByName("map.VaLue.second_child.Element.bar", false), + ::testing::Optional(field2_)); + ASSERT_THAT(schema_->FindFieldByName("map.value.Second_child.Element.foo", false), + ::testing::Optional(field1_)); + ASSERT_THAT(schema_->FindFieldByName("map.value.Second_child.Element.fooo", false), + ::testing::Optional(std::nullopt)); +} - auto field4 = iceberg::SchemaField::MakeRequired( +TEST_F(NestType2Test, TestFindByShortName) { + ASSERT_THAT(schema_->FindFieldByName("Map.Second_child"), ::testing::Optional(field6_)); + ASSERT_THAT(schema_->FindFieldByName("Map.First_child"), ::testing::Optional(field5_)); + ASSERT_THAT(schema_->FindFieldByName("Map.Second_child.Foobar"), + ::testing::Optional(field3_)); + ASSERT_THAT(schema_->FindFieldByName("Map.Second_child.Bar"), + ::testing::Optional(field2_)); + ASSERT_THAT(schema_->FindFieldByName("Map.Second_child.Foo"), + ::testing::Optional(field1_)); + ASSERT_THAT(schema_->FindFieldByName("Map.Second_child.aaa"), + ::testing::Optional(std::nullopt)); +} + +TEST_F(NestType2Test, TestFindByShortNameCaseInsensitive) { + ASSERT_THAT(schema_->FindFieldByName("map.second_child", false), + ::testing::Optional(field6_)); + ASSERT_THAT(schema_->FindFieldByName("map.first_child", false), + ::testing::Optional(field5_)); + ASSERT_THAT(schema_->FindFieldByName("map.second_child.foobar", false), + ::testing::Optional(field3_)); + ASSERT_THAT(schema_->FindFieldByName("map.second_child.bar", false), + ::testing::Optional(field2_)); + ASSERT_THAT(schema_->FindFieldByName("map.second_child.foo", false), + ::testing::Optional(field1_)); + ASSERT_THAT(schema_->FindFieldByName("Map.Second_child.aaa", false), + ::testing::Optional(std::nullopt)); +} + +TEST_F(NestType2Test, TestMapKey) { + auto field1_ = iceberg::SchemaField(1, "Foo", iceberg::int32(), true); + auto field2_ = iceberg::SchemaField(2, "Bar", iceberg::string(), true); + auto field3_ = iceberg::SchemaField(3, "Foobar", iceberg::int32(), true); + + iceberg::StructType structtype = iceberg::StructType({field1_, field2_, field3_}); + + auto field4_ = iceberg::SchemaField::MakeRequired( 4, "element", std::make_shared(structtype)); - auto listype = iceberg::ListType(field4); - - auto structtype2 = iceberg::StructType( - {iceberg::SchemaField::MakeRequired(5, "First_child", iceberg::int32()), - iceberg::SchemaField::MakeRequired(6, "Second_child", - std::make_shared(listype))}); - - auto maptype = iceberg::MapType( - iceberg::SchemaField::MakeRequired(7, "key", iceberg::int32()), - iceberg::SchemaField::MakeRequired( - 8, "value", std::make_shared(structtype2))); - - auto field5 = iceberg::SchemaField::MakeRequired(5, "First_child", iceberg::int32()); - auto field6 = iceberg::SchemaField::MakeRequired( - 6, "Second_child", std::make_shared(listype)); - auto field7 = iceberg::SchemaField::MakeRequired(7, "key", iceberg::int32()); - auto field8 = iceberg::SchemaField::MakeRequired( - 8, "value", std::make_shared(structtype2)); - auto field9 = iceberg::SchemaField::MakeRequired( - 9, "Map", std::make_shared(maptype)); - - iceberg::Schema schema({field9}, 1); - - ASSERT_EQ(schema.id_to_field_.size(), 0); - ASSERT_THAT(schema.FindFieldById(9), ::testing::Optional(field9)); - ASSERT_THAT(schema.FindFieldById(8), ::testing::Optional(field8)); - ASSERT_THAT(schema.FindFieldById(7), ::testing::Optional(field7)); - ASSERT_THAT(schema.FindFieldById(6), ::testing::Optional(field6)); - ASSERT_THAT(schema.FindFieldById(5), ::testing::Optional(field5)); - ASSERT_THAT(schema.FindFieldById(4), ::testing::Optional(field4)); - ASSERT_THAT(schema.FindFieldById(3), ::testing::Optional(field3)); - ASSERT_THAT(schema.FindFieldById(2), ::testing::Optional(field2)); - ASSERT_THAT(schema.FindFieldById(1), ::testing::Optional(field1)); - - ASSERT_THAT(schema.FindFieldByName("Map"), ::testing::Optional(field9)); - ASSERT_THAT(schema.FindFieldByName("Map.value"), ::testing::Optional(field8)); - ASSERT_THAT(schema.FindFieldByName("Map.key"), ::testing::Optional(field7)); - ASSERT_THAT(schema.FindFieldByName("Map.value.Second_child"), - ::testing::Optional(field6)); - ASSERT_THAT(schema.FindFieldByName("Map.value.First_child"), - ::testing::Optional(field5)); - ASSERT_THAT(schema.FindFieldByName("Map.value.Second_child.element"), - ::testing::Optional(field4)); - ASSERT_THAT(schema.FindFieldByName("Map.value.Second_child.element.Foobar"), - ::testing::Optional(field3)); - ASSERT_THAT(schema.FindFieldByName("Map.value.Second_child.element.Bar"), - ::testing::Optional(field2)); - ASSERT_THAT(schema.FindFieldByName("Map.value.Second_child.element.Foo"), - ::testing::Optional(field1)); - - ASSERT_THAT(schema.FindFieldByName("map", false), ::testing::Optional(field9)); - ASSERT_THAT(schema.FindFieldByName("map.vALUE", false), ::testing::Optional(field8)); - ASSERT_THAT(schema.FindFieldByName("map.Key", false), ::testing::Optional(field7)); - ASSERT_THAT(schema.FindFieldByName("map.Value.second_Child", false), - ::testing::Optional(field6)); - ASSERT_THAT(schema.FindFieldByName("map.Value.first_chIld", false), - ::testing::Optional(field5)); - ASSERT_THAT(schema.FindFieldByName("map.Value.second_child.Element", false), - ::testing::Optional(field4)); - ASSERT_THAT(schema.FindFieldByName("map.Value.second_child.Element.foobar", false), - ::testing::Optional(field3)); - ASSERT_THAT(schema.FindFieldByName("map.VaLue.second_child.Element.bar", false), - ::testing::Optional(field2)); - ASSERT_THAT(schema.FindFieldByName("map.value.Second_child.Element.foo", false), - ::testing::Optional(field1)); - - ASSERT_THAT(schema.FindFieldByName("Map.Second_child"), ::testing::Optional(field6)); - ASSERT_THAT(schema.FindFieldByName("Map.First_child"), ::testing::Optional(field5)); - ASSERT_THAT(schema.FindFieldByName("Map.Second_child.Foobar"), - ::testing::Optional(field3)); - ASSERT_THAT(schema.FindFieldByName("Map.Second_child.Bar"), - ::testing::Optional(field2)); - ASSERT_THAT(schema.FindFieldByName("Map.Second_child.Foo"), - ::testing::Optional(field1)); - - ASSERT_THAT(schema.FindFieldByName("map.second_child", false), - ::testing::Optional(field6)); - ASSERT_THAT(schema.FindFieldByName("map.first_child", false), - ::testing::Optional(field5)); - ASSERT_THAT(schema.FindFieldByName("map.second_child.foobar", false), - ::testing::Optional(field3)); - ASSERT_THAT(schema.FindFieldByName("map.second_child.bar", false), - ::testing::Optional(field2)); - ASSERT_THAT(schema.FindFieldByName("map.second_child.foo", false), - ::testing::Optional(field1)); - ASSERT_EQ(schema.id_to_field_.size(), 9); + iceberg::MapType maptype = + iceberg::MapType(iceberg::SchemaField::MakeRequired( + 5, "key", std::make_shared(structtype)), + iceberg::SchemaField::MakeRequired(6, "value", iceberg::int32())); + + auto field5_ = iceberg::SchemaField::MakeRequired( + 5, "key", std::make_shared(structtype)); + auto field6_ = iceberg::SchemaField::MakeRequired(6, "value", iceberg::int32()); + + auto field7_ = iceberg::SchemaField::MakeRequired( + 7, "Map", std::make_shared(maptype)); + + iceberg::Schema schema({field7_}, 1); + + ASSERT_THAT(schema.FindFieldByName("Map.key.Foo"), ::testing::Optional(field1_)); + ASSERT_THAT(schema.FindFieldByName("Map.Foo"), ::testing::Optional(std::nullopt)); } From 8d2154a7baadafad73f4939bbb205201046a359d Mon Sep 17 00:00:00 2001 From: nullccxsy <32149055912@qq.com> Date: Mon, 25 Aug 2025 16:21:51 +0800 Subject: [PATCH 05/14] Fix: fix comments, add duplicate name check, and resolve duplicate result call in macros - Added logic to detect and handle duplicate names in NameToIdVisitor to prevent invalid schema errors. - Fixed duplicate result call issue in src/iceberg/util/macros.h by optimizing ICEBERG_RETURN_UNEXPECTED macro. --- src/iceberg/schema.cc | 107 +++++++--- src/iceberg/schema.h | 11 +- src/iceberg/util/macros.h | 11 +- test/schema_test.cc | 421 ++++++++++++++++++++++++++------------ 4 files changed, 379 insertions(+), 171 deletions(-) diff --git a/src/iceberg/schema.cc b/src/iceberg/schema.cc index 7051f8fe0..c45324746 100644 --- a/src/iceberg/schema.cc +++ b/src/iceberg/schema.cc @@ -21,6 +21,7 @@ #include #include +#include #include "iceberg/type.h" #include "iceberg/util/formatter.h" // IWYU pragma: keep @@ -28,6 +29,7 @@ #include "iceberg/util/visit_type.h" namespace iceberg { + class IdToFieldVisitor { public: explicit IdToFieldVisitor( @@ -40,10 +42,14 @@ class IdToFieldVisitor { std::unordered_map>& id_to_field_; }; -class NametoIdVisitor { +class NameToIdVisitor { public: - explicit NametoIdVisitor(std::unordered_map& name_to_id, - bool case_sensitive_ = true); + explicit NameToIdVisitor( + std::unordered_map& name_to_id, bool case_sensitive_ = true, + std::function quoting_func_ = + [](std::string_view s) { return std::string(s); }); + ~NameToIdVisitor(); + Status Visit(const ListType& type, const std::string& path, const std::string& short_path); Status Visit(const MapType& type, const std::string& path, @@ -52,12 +58,17 @@ class NametoIdVisitor { const std::string& short_path); Status Visit(const PrimitiveType& type, const std::string& path, const std::string& short_path); - static std::string BuildPath(std::string_view prefix, std::string_view field_name, - bool case_sensitive); + + private: + std::string BuildPath(std::string_view prefix, std::string_view field_name, + bool case_sensitive); + void Merge(); private: bool case_sensitive_; std::unordered_map& name_to_id_; + std::unordered_map shortname_to_id_; + std::function quoting_func_; }; Schema::Schema(std::vector fields, std::optional schema_id) @@ -81,12 +92,12 @@ bool Schema::Equals(const Schema& other) const { Result>> Schema::FindFieldByName( std::string_view name, bool case_sensitive) const { if (case_sensitive) { - ICEBERG_RETURN_UNEXPECTED(InitNameToIndexMap()); + ICEBERG_RETURN_UNEXPECTED(InitNameToIdMap()); auto it = name_to_id_.find(std::string(name)); if (it == name_to_id_.end()) return std::nullopt; return FindFieldById(it->second); } - ICEBERG_RETURN_UNEXPECTED(InitLowerCaseNameToIndexMap()); + ICEBERG_RETURN_UNEXPECTED(InitLowerCaseNameToIdMap()); std::string lower_name(name); std::ranges::transform(lower_name, lower_name.begin(), ::tolower); auto it = lowercase_name_to_id_.find(lower_name); @@ -94,7 +105,7 @@ Result>> Schema::FindFie return FindFieldById(it->second); } -Result Schema::InitIdToIndexMap() const { +Status Schema::InitIdToFieldMap() const { if (!id_to_field_.empty()) { return {}; } @@ -103,29 +114,29 @@ Result Schema::InitIdToIndexMap() const { return {}; } -Result Schema::InitNameToIndexMap() const { +Status Schema::InitNameToIdMap() const { if (!name_to_id_.empty()) { return {}; } - std::string path, short_path; - NametoIdVisitor visitor(name_to_id_, true); - ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*this, &visitor, path, short_path)); + NameToIdVisitor visitor(name_to_id_, /*case_sensitive=*/true); + ICEBERG_RETURN_UNEXPECTED( + VisitTypeInline(*this, &visitor, /*path=*/"", /*short_path=*/"")); return {}; } -Result Schema::InitLowerCaseNameToIndexMap() const { +Status Schema::InitLowerCaseNameToIdMap() const { if (!lowercase_name_to_id_.empty()) { return {}; } - std::string path, short_path; - NametoIdVisitor visitor(lowercase_name_to_id_, false); - ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*this, &visitor, path, short_path)); + NameToIdVisitor visitor(lowercase_name_to_id_, /*case_sensitive=*/false); + ICEBERG_RETURN_UNEXPECTED( + VisitTypeInline(*this, &visitor, /*path=*/"", /*short_path=*/"")); return {}; } Result>> Schema::FindFieldById( int32_t field_id) const { - ICEBERG_RETURN_UNEXPECTED(InitIdToIndexMap()); + ICEBERG_RETURN_UNEXPECTED(InitIdToFieldMap()); auto it = id_to_field_.find(field_id); if (it == id_to_field_.end()) { return std::nullopt; @@ -154,11 +165,16 @@ Status IdToFieldVisitor::VisitNestedType(const Type& type) { return {}; } -NametoIdVisitor::NametoIdVisitor(std::unordered_map& name_to_id, - bool case_sensitive) - : name_to_id_(name_to_id), case_sensitive_(case_sensitive) {} +NameToIdVisitor::NameToIdVisitor( + std::unordered_map& name_to_id, bool case_sensitive, + std::function quoting_func) + : name_to_id_(name_to_id), + case_sensitive_(case_sensitive), + quoting_func_(std::move(quoting_func)) {} + +NameToIdVisitor::~NameToIdVisitor() { Merge(); } -Status NametoIdVisitor::Visit(const ListType& type, const std::string& path, +Status NameToIdVisitor::Visit(const ListType& type, const std::string& path, const std::string& short_path) { const auto& field = type.fields()[0]; std::string new_path = BuildPath(path, field.name(), case_sensitive_); @@ -168,14 +184,17 @@ Status NametoIdVisitor::Visit(const ListType& type, const std::string& path, } else { new_short_path = BuildPath(short_path, field.name(), case_sensitive_); } - name_to_id_[new_path] = field.field_id(); - name_to_id_.emplace(new_short_path, field.field_id()); + auto it = name_to_id_.emplace(new_path, field.field_id()); + if (!it.second) { + return NotSupported("Duplicate path in name_to_id_: {}", new_path); + } + shortname_to_id_.emplace(new_short_path, field.field_id()); ICEBERG_RETURN_UNEXPECTED( VisitTypeInline(*field.type(), this, new_path, new_short_path)); return {}; } -Status NametoIdVisitor::Visit(const MapType& type, const std::string& path, +Status NameToIdVisitor::Visit(const MapType& type, const std::string& path, const std::string& short_path) { std::string new_path, new_short_path; const auto& fields = type.fields(); @@ -187,42 +206,62 @@ Status NametoIdVisitor::Visit(const MapType& type, const std::string& path, } else { new_short_path = BuildPath(short_path, field.name(), case_sensitive_); } - name_to_id_[new_path] = field.field_id(); - name_to_id_.emplace(new_short_path, field.field_id()); + auto it = name_to_id_.emplace(new_path, field.field_id()); + if (!it.second) { + return NotSupported("Duplicate path in name_to_id_: {}", new_path); + } + shortname_to_id_.emplace(new_short_path, field.field_id()); ICEBERG_RETURN_UNEXPECTED( VisitTypeInline(*field.type(), this, new_path, new_short_path)); } return {}; } -Status NametoIdVisitor::Visit(const StructType& type, const std::string& path, +Status NameToIdVisitor::Visit(const StructType& type, const std::string& path, const std::string& short_path) { const auto& fields = type.fields(); std::string new_path, new_short_path; for (const auto& field : fields) { new_path = BuildPath(path, field.name(), case_sensitive_); new_short_path = BuildPath(short_path, field.name(), case_sensitive_); - name_to_id_[new_path] = field.field_id(); - name_to_id_.emplace(new_short_path, field.field_id()); + auto it = name_to_id_.emplace(new_path, field.field_id()); + if (!it.second) { + return NotSupported("Duplicate path in name_to_id_: {}", new_path); + } + shortname_to_id_.emplace(new_short_path, field.field_id()); ICEBERG_RETURN_UNEXPECTED( VisitTypeInline(*field.type(), this, new_path, new_short_path)); } return {}; } -Status NametoIdVisitor::Visit(const PrimitiveType& type, const std::string& path, +Status NameToIdVisitor::Visit(const PrimitiveType& type, const std::string& path, const std::string& short_path) { return {}; } -std::string NametoIdVisitor::BuildPath(std::string_view prefix, +std::string NameToIdVisitor::BuildPath(std::string_view prefix, std::string_view field_name, bool case_sensitive) { + std::string quoted_name; + if (!quoting_func_) { + quoted_name = std::string(field_name); + } else { + quoted_name = quoting_func_(field_name); + } if (case_sensitive) { - return prefix.empty() ? std::string(field_name) - : std::string(prefix) + "." + std::string(field_name); + return prefix.empty() ? quoted_name : std::string(prefix) + "." + quoted_name; } - std::string lower_name(field_name); + std::string lower_name = quoted_name; std::ranges::transform(lower_name, lower_name.begin(), ::tolower); return prefix.empty() ? lower_name : std::string(prefix) + "." + lower_name; } + +void NameToIdVisitor::Merge() { + for (const auto& it : shortname_to_id_) { + if (name_to_id_.find(it.first) == name_to_id_.end()) { + name_to_id_[it.first] = it.second; + } + } +} + } // namespace iceberg diff --git a/src/iceberg/schema.h b/src/iceberg/schema.h index b9b3f38d1..867f10b45 100644 --- a/src/iceberg/schema.h +++ b/src/iceberg/schema.h @@ -62,6 +62,8 @@ class ICEBERG_EXPORT Schema : public StructType { /// short name 'l.x' in addition to canonical name 'l.element.x'. a map 'm', if its /// value include a structs with field 'x' wil produce short name 'm.x' in addition to /// canonical name 'm.value.x' + /// FIXME: Currently only handles ASCII lowercase conversion; extend to support + /// non-ASCII characters (e.g., using std::towlower or ICU) [[nodiscard]] Result>> FindFieldByName(std::string_view name, bool case_sensitive = true) const; @@ -85,8 +87,11 @@ class ICEBERG_EXPORT Schema : public StructType { const std::optional schema_id_; - Result InitIdToIndexMap() const; - Result InitNameToIndexMap() const; - Result InitLowerCaseNameToIndexMap() const; + // TODO(nullccxsy): Address potential concurrency issues in lazy initialization (e.g., + // use std::call_once) + Status InitIdToFieldMap() const; + Status InitNameToIdMap() const; + Status InitLowerCaseNameToIdMap() const; }; + } // namespace iceberg diff --git a/src/iceberg/util/macros.h b/src/iceberg/util/macros.h index 4d687bf51..ad0b61624 100644 --- a/src/iceberg/util/macros.h +++ b/src/iceberg/util/macros.h @@ -19,10 +19,13 @@ #pragma once -#define ICEBERG_RETURN_UNEXPECTED(result) \ - if (!result) [[unlikely]] { \ - return std::unexpected(result.error()); \ - } +#define ICEBERG_RETURN_UNEXPECTED(result) \ + do { \ + auto&& iceberg_temp_result = (result); \ + if (!iceberg_temp_result) [[unlikely]] { \ + return std::unexpected(iceberg_temp_result.error()); \ + } \ + } while (false); #define ICEBERG_ASSIGN_OR_RAISE_IMPL(result_name, lhs, rexpr) \ auto&& result_name = (rexpr); \ diff --git a/test/schema_test.cc b/test/schema_test.cc index 5dc44cb5f..f7a9d55f6 100644 --- a/test/schema_test.cc +++ b/test/schema_test.cc @@ -24,6 +24,7 @@ #include #include +#include #include "iceberg/schema_field.h" #include "iceberg/util/formatter.h" // IWYU pragma: keep @@ -85,13 +86,12 @@ TEST(SchemaTest, Equality) { class NestedTypeTest : public ::testing::Test { protected: void SetUp() override { - field1_ = iceberg::SchemaField(1, "Foo", iceberg::int32(), true); - field2_ = iceberg::SchemaField(2, "Bar", iceberg::string(), true); - field3_ = iceberg::SchemaField(3, "Foobar", iceberg::int32(), true); + field1_ = std::make_unique(1, "Foo", iceberg::int32(), true); + field2_ = std::make_unique(2, "Bar", iceberg::string(), true); + field3_ = std::make_unique(3, "Foobar", iceberg::int32(), true); - iceberg::StructType structtype = - iceberg::StructType(std::vector{ - field1_.value(), field2_.value(), field3_.value()}); + iceberg::StructType structtype = iceberg::StructType( + std::vector{*field1_, *field2_, *field3_}); auto listype = iceberg::ListType(iceberg::SchemaField::MakeRequired( 4, "element", std::make_shared(structtype))); @@ -101,81 +101,82 @@ class NestedTypeTest : public ::testing::Test { iceberg::SchemaField::MakeRequired( 6, "value", std::make_shared(listype))); - field4_ = iceberg::SchemaField::MakeRequired( - 4, "element", std::make_shared(structtype)); - field5_ = iceberg::SchemaField::MakeRequired(5, "key", iceberg::int32()); - field6_ = iceberg::SchemaField::MakeRequired( - 6, "value", std::make_shared(listype)); - field7_ = iceberg::SchemaField::MakeRequired( - 7, "Value", std::make_shared(maptype)); + field4_ = std::make_unique( + 4, "element", std::make_shared(structtype), false); + field5_ = std::make_unique(5, "key", iceberg::int32(), false); + field6_ = std::make_unique( + 6, "value", std::make_shared(listype), false); + field7_ = std::make_unique( + 7, "Value", std::make_shared(maptype), false); - schema_ = std::make_shared( - std::vector{field7_.value()}, 1); + schema_ = + std::make_shared(std::vector{*field7_}, 1); } std::shared_ptr schema_; - std::optional field1_; - std::optional field2_; - std::optional field3_; - std::optional field4_; - std::optional field5_; - std::optional field6_; - std::optional field7_; + std::unique_ptr field1_; + std::unique_ptr field2_; + std::unique_ptr field3_; + std::unique_ptr field4_; + std::unique_ptr field5_; + std::unique_ptr field6_; + std::unique_ptr field7_; }; TEST_F(NestedTypeTest, TestFindById) { - ASSERT_THAT(schema_->FindFieldById(7), ::testing::Optional(field7_)); - ASSERT_THAT(schema_->FindFieldById(6), ::testing::Optional(field6_)); - ASSERT_THAT(schema_->FindFieldById(5), ::testing::Optional(field5_)); - ASSERT_THAT(schema_->FindFieldById(4), ::testing::Optional(field4_)); - ASSERT_THAT(schema_->FindFieldById(3), ::testing::Optional(field3_)); - ASSERT_THAT(schema_->FindFieldById(2), ::testing::Optional(field2_)); - ASSERT_THAT(schema_->FindFieldById(1), ::testing::Optional(field1_)); + ASSERT_THAT(schema_->FindFieldById(7), ::testing::Optional(*field7_)); + ASSERT_THAT(schema_->FindFieldById(6), ::testing::Optional(*field6_)); + ASSERT_THAT(schema_->FindFieldById(5), ::testing::Optional(*field5_)); + ASSERT_THAT(schema_->FindFieldById(4), ::testing::Optional(*field4_)); + ASSERT_THAT(schema_->FindFieldById(3), ::testing::Optional(*field3_)); + ASSERT_THAT(schema_->FindFieldById(2), ::testing::Optional(*field2_)); + ASSERT_THAT(schema_->FindFieldById(1), ::testing::Optional(*field1_)); ASSERT_THAT(schema_->FindFieldById(10), ::testing::Optional(std::nullopt)); } TEST_F(NestedTypeTest, TestFindByName) { - ASSERT_THAT(schema_->FindFieldByName("Value"), ::testing::Optional(field7_)); - ASSERT_THAT(schema_->FindFieldByName("Value.value"), ::testing::Optional(field6_)); - ASSERT_THAT(schema_->FindFieldByName("Value.key"), ::testing::Optional(field5_)); + ASSERT_THAT(schema_->FindFieldByName("Value"), ::testing::Optional(*field7_)); + ASSERT_THAT(schema_->FindFieldByName("Value.value"), ::testing::Optional(*field6_)); + ASSERT_THAT(schema_->FindFieldByName("Value.key"), ::testing::Optional(*field5_)); ASSERT_THAT(schema_->FindFieldByName("Value.value.element"), - ::testing::Optional(field4_)); + ::testing::Optional(*field4_)); ASSERT_THAT(schema_->FindFieldByName("Value.value.element.Foobar"), - ::testing::Optional(field3_)); + ::testing::Optional(*field3_)); ASSERT_THAT(schema_->FindFieldByName("Value.value.element.Bar"), - ::testing::Optional(field2_)); + ::testing::Optional(*field2_)); ASSERT_THAT(schema_->FindFieldByName("Value.value.element.Foo"), - ::testing::Optional(field1_)); + ::testing::Optional(*field1_)); ASSERT_THAT(schema_->FindFieldByName("Value.value.element.FoO"), ::testing::Optional(std::nullopt)); } TEST_F(NestedTypeTest, TestFindByNameCaseInsensitive) { - ASSERT_THAT(schema_->FindFieldByName("vALue", false), ::testing::Optional(field7_)); + ASSERT_THAT(schema_->FindFieldByName("vALue", false), ::testing::Optional(*field7_)); ASSERT_THAT(schema_->FindFieldByName("vALue.VALUE", false), - ::testing::Optional(field6_)); - ASSERT_THAT(schema_->FindFieldByName("valUe.kEy", false), ::testing::Optional(field5_)); + ::testing::Optional(*field6_)); + ASSERT_THAT(schema_->FindFieldByName("valUe.kEy", false), + ::testing::Optional(*field5_)); ASSERT_THAT(schema_->FindFieldByName("vaLue.vAlue.elEment", false), - ::testing::Optional(field4_)); + ::testing::Optional(*field4_)); ASSERT_THAT(schema_->FindFieldByName("vaLue.vAlue.eLement.fOObar", false), - ::testing::Optional(field3_)); + ::testing::Optional(*field3_)); ASSERT_THAT(schema_->FindFieldByName("valUe.vaLUe.elemEnt.Bar", false), - ::testing::Optional(field2_)); + ::testing::Optional(*field2_)); ASSERT_THAT(schema_->FindFieldByName("valUe.valUe.ELEMENT.FOO", false), - ::testing::Optional(field1_)); + ::testing::Optional(*field1_)); ASSERT_THAT(schema_->FindFieldByName("valUe.valUe.ELEMENT.FO", false), ::testing::Optional(std::nullopt)); } TEST_F(NestedTypeTest, TestFindByShortNameCaseInsensitive) { ASSERT_THAT(schema_->FindFieldByName("vaLue.value.FOO", false), - ::testing::Optional(field1_)); + ::testing::Optional(*field1_)); ASSERT_THAT(schema_->FindFieldByName("Value.value.Bar", false), - ::testing::Optional(field2_)); + ::testing::Optional(*field2_)); ASSERT_THAT(schema_->FindFieldByName("Value.value.FooBAR", false), - ::testing::Optional(field3_)); + ::testing::Optional(*field3_)); ASSERT_THAT(schema_->FindFieldByName("Value.value.FooBAR.a", false), ::testing::Optional(std::nullopt)); } @@ -183,16 +184,16 @@ TEST_F(NestedTypeTest, TestFindByShortNameCaseInsensitive) { class NestType2Test : public ::testing::Test { protected: void SetUp() override { - field1_ = iceberg::SchemaField(1, "Foo", iceberg::int32(), true); - field2_ = iceberg::SchemaField(2, "Bar", iceberg::string(), true); - field3_ = iceberg::SchemaField(3, "Foobar", iceberg::int32(), true); + field1_ = std::make_unique(1, "Foo", iceberg::int32(), true); + field2_ = std::make_unique(2, "Bar", iceberg::string(), true); + field3_ = std::make_unique(3, "Foobar", iceberg::int32(), true); + + iceberg::StructType structtype = iceberg::StructType({*field1_, *field2_, *field3_}); - iceberg::StructType structtype = - iceberg::StructType({field1_.value(), field2_.value(), field3_.value()}); + field4_ = std::make_unique( + 4, "element", std::make_shared(structtype), false); - field4_ = iceberg::SchemaField::MakeRequired( - 4, "element", std::make_shared(structtype)); - auto listype = iceberg::ListType(field4_.value()); + auto listype = iceberg::ListType(*field4_); iceberg::StructType structtype2 = iceberg::StructType( {iceberg::SchemaField::MakeRequired(5, "First_child", iceberg::int32()), @@ -204,137 +205,297 @@ class NestType2Test : public ::testing::Test { iceberg::SchemaField::MakeRequired( 8, "value", std::make_shared(structtype2))); - field5_ = iceberg::SchemaField::MakeRequired(5, "First_child", iceberg::int32()); - field6_ = iceberg::SchemaField::MakeRequired( - 6, "Second_child", std::make_shared(listype)); - field7_ = iceberg::SchemaField::MakeRequired(7, "key", iceberg::int32()); - field8_ = iceberg::SchemaField::MakeRequired( - 8, "value", std::make_shared(structtype2)); - field9_ = iceberg::SchemaField::MakeRequired( - 9, "Map", std::make_shared(maptype)); - - schema_ = std::make_shared( - std::vector{field9_.value()}, 1); + field5_ = + std::make_unique(5, "First_child", iceberg::int32(), false); + field6_ = std::make_unique( + 6, "Second_child", std::make_shared(listype), false); + field7_ = std::make_unique(7, "key", iceberg::int32(), false); + field8_ = std::make_unique( + 8, "value", std::make_shared(structtype2), false); + field9_ = std::make_unique( + 9, "Map", std::make_shared(maptype), false); + + schema_ = + std::make_shared(std::vector{*field9_}, 1); } std::shared_ptr schema_; - std::optional field1_; - std::optional field2_; - std::optional field3_; - std::optional field4_; - std::optional field5_; - std::optional field6_; - std::optional field7_; - std::optional field8_; - std::optional field9_; + std::unique_ptr field1_; + std::unique_ptr field2_; + std::unique_ptr field3_; + std::unique_ptr field4_; + std::unique_ptr field5_; + std::unique_ptr field6_; + std::unique_ptr field7_; + std::unique_ptr field8_; + std::unique_ptr field9_; }; TEST_F(NestType2Test, TestFindById) { - ASSERT_THAT(schema_->FindFieldById(9), ::testing::Optional(field9_)); - ASSERT_THAT(schema_->FindFieldById(8), ::testing::Optional(field8_)); - ASSERT_THAT(schema_->FindFieldById(7), ::testing::Optional(field7_)); - ASSERT_THAT(schema_->FindFieldById(6), ::testing::Optional(field6_)); - ASSERT_THAT(schema_->FindFieldById(5), ::testing::Optional(field5_)); - ASSERT_THAT(schema_->FindFieldById(4), ::testing::Optional(field4_)); - ASSERT_THAT(schema_->FindFieldById(3), ::testing::Optional(field3_)); - ASSERT_THAT(schema_->FindFieldById(2), ::testing::Optional(field2_)); - ASSERT_THAT(schema_->FindFieldById(1), ::testing::Optional(field1_)); + ASSERT_THAT(schema_->FindFieldById(9), ::testing::Optional(*field9_)); + ASSERT_THAT(schema_->FindFieldById(8), ::testing::Optional(*field8_)); + ASSERT_THAT(schema_->FindFieldById(7), ::testing::Optional(*field7_)); + ASSERT_THAT(schema_->FindFieldById(6), ::testing::Optional(*field6_)); + ASSERT_THAT(schema_->FindFieldById(5), ::testing::Optional(*field5_)); + ASSERT_THAT(schema_->FindFieldById(4), ::testing::Optional(*field4_)); + ASSERT_THAT(schema_->FindFieldById(3), ::testing::Optional(*field3_)); + ASSERT_THAT(schema_->FindFieldById(2), ::testing::Optional(*field2_)); + ASSERT_THAT(schema_->FindFieldById(1), ::testing::Optional(*field1_)); ASSERT_THAT(schema_->FindFieldById(0), ::testing::Optional(std::nullopt)); } TEST_F(NestType2Test, TestFindByName) { - ASSERT_THAT(schema_->FindFieldByName("Map"), ::testing::Optional(field9_)); - ASSERT_THAT(schema_->FindFieldByName("Map.value"), ::testing::Optional(field8_)); - ASSERT_THAT(schema_->FindFieldByName("Map.key"), ::testing::Optional(field7_)); + ASSERT_THAT(schema_->FindFieldByName("Map"), ::testing::Optional(*field9_)); + ASSERT_THAT(schema_->FindFieldByName("Map.value"), ::testing::Optional(*field8_)); + ASSERT_THAT(schema_->FindFieldByName("Map.key"), ::testing::Optional(*field7_)); ASSERT_THAT(schema_->FindFieldByName("Map.value.Second_child"), - ::testing::Optional(field6_)); + ::testing::Optional(*field6_)); ASSERT_THAT(schema_->FindFieldByName("Map.value.First_child"), - ::testing::Optional(field5_)); + ::testing::Optional(*field5_)); ASSERT_THAT(schema_->FindFieldByName("Map.value.Second_child.element"), - ::testing::Optional(field4_)); + ::testing::Optional(*field4_)); ASSERT_THAT(schema_->FindFieldByName("Map.value.Second_child.element.Foobar"), - ::testing::Optional(field3_)); + ::testing::Optional(*field3_)); ASSERT_THAT(schema_->FindFieldByName("Map.value.Second_child.element.Bar"), - ::testing::Optional(field2_)); + ::testing::Optional(*field2_)); ASSERT_THAT(schema_->FindFieldByName("Map.value.Second_child.element.Foo"), - ::testing::Optional(field1_)); + ::testing::Optional(*field1_)); ASSERT_THAT(schema_->FindFieldByName("Map.value.Second_child.element.Fooo"), ::testing::Optional(std::nullopt)); } TEST_F(NestType2Test, TestFindByNameCaseInsensitive) { - ASSERT_THAT(schema_->FindFieldByName("map", false), ::testing::Optional(field9_)); - ASSERT_THAT(schema_->FindFieldByName("map.vALUE", false), ::testing::Optional(field8_)); - ASSERT_THAT(schema_->FindFieldByName("map.Key", false), ::testing::Optional(field7_)); + ASSERT_THAT(schema_->FindFieldByName("map", false), ::testing::Optional(*field9_)); + ASSERT_THAT(schema_->FindFieldByName("map.vALUE", false), + ::testing::Optional(*field8_)); + ASSERT_THAT(schema_->FindFieldByName("map.Key", false), ::testing::Optional(*field7_)); ASSERT_THAT(schema_->FindFieldByName("map.Value.second_Child", false), - ::testing::Optional(field6_)); + ::testing::Optional(*field6_)); ASSERT_THAT(schema_->FindFieldByName("map.Value.first_chIld", false), - ::testing::Optional(field5_)); + ::testing::Optional(*field5_)); ASSERT_THAT(schema_->FindFieldByName("map.Value.second_child.Element", false), - ::testing::Optional(field4_)); + ::testing::Optional(*field4_)); ASSERT_THAT(schema_->FindFieldByName("map.Value.second_child.Element.foobar", false), - ::testing::Optional(field3_)); + ::testing::Optional(*field3_)); ASSERT_THAT(schema_->FindFieldByName("map.VaLue.second_child.Element.bar", false), - ::testing::Optional(field2_)); + ::testing::Optional(*field2_)); ASSERT_THAT(schema_->FindFieldByName("map.value.Second_child.Element.foo", false), - ::testing::Optional(field1_)); + ::testing::Optional(*field1_)); ASSERT_THAT(schema_->FindFieldByName("map.value.Second_child.Element.fooo", false), ::testing::Optional(std::nullopt)); } TEST_F(NestType2Test, TestFindByShortName) { - ASSERT_THAT(schema_->FindFieldByName("Map.Second_child"), ::testing::Optional(field6_)); - ASSERT_THAT(schema_->FindFieldByName("Map.First_child"), ::testing::Optional(field5_)); + ASSERT_THAT(schema_->FindFieldByName("Map.Second_child"), + ::testing::Optional(*field6_)); + ASSERT_THAT(schema_->FindFieldByName("Map.First_child"), ::testing::Optional(*field5_)); ASSERT_THAT(schema_->FindFieldByName("Map.Second_child.Foobar"), - ::testing::Optional(field3_)); + ::testing::Optional(*field3_)); ASSERT_THAT(schema_->FindFieldByName("Map.Second_child.Bar"), - ::testing::Optional(field2_)); + ::testing::Optional(*field2_)); ASSERT_THAT(schema_->FindFieldByName("Map.Second_child.Foo"), - ::testing::Optional(field1_)); + ::testing::Optional(*field1_)); ASSERT_THAT(schema_->FindFieldByName("Map.Second_child.aaa"), ::testing::Optional(std::nullopt)); } TEST_F(NestType2Test, TestFindByShortNameCaseInsensitive) { ASSERT_THAT(schema_->FindFieldByName("map.second_child", false), - ::testing::Optional(field6_)); + ::testing::Optional(*field6_)); ASSERT_THAT(schema_->FindFieldByName("map.first_child", false), - ::testing::Optional(field5_)); + ::testing::Optional(*field5_)); ASSERT_THAT(schema_->FindFieldByName("map.second_child.foobar", false), - ::testing::Optional(field3_)); + ::testing::Optional(*field3_)); ASSERT_THAT(schema_->FindFieldByName("map.second_child.bar", false), - ::testing::Optional(field2_)); + ::testing::Optional(*field2_)); ASSERT_THAT(schema_->FindFieldByName("map.second_child.foo", false), - ::testing::Optional(field1_)); + ::testing::Optional(*field1_)); ASSERT_THAT(schema_->FindFieldByName("Map.Second_child.aaa", false), ::testing::Optional(std::nullopt)); } -TEST_F(NestType2Test, TestMapKey) { - auto field1_ = iceberg::SchemaField(1, "Foo", iceberg::int32(), true); - auto field2_ = iceberg::SchemaField(2, "Bar", iceberg::string(), true); - auto field3_ = iceberg::SchemaField(3, "Foobar", iceberg::int32(), true); +class ComplexMapStructTest : public ::testing::Test { + protected: + void SetUp() override { + // Separate inner struct for key: {inner_key: int, inner_value: int} + inner_struct_type_key_ = + std::make_shared(std::vector{ + iceberg::SchemaField(10, "inner_key", iceberg::int32(), false), + iceberg::SchemaField(11, "inner_value", iceberg::int32(), false)}); + + exp_inner_key_key_ = + std::make_unique(10, "inner_key", iceberg::int32(), false); + exp_inner_key_value_ = std::make_unique( + 11, "inner_value", iceberg::int32(), false); + + // Separate inner struct for value: {inner_k: int, inner_v: int} + inner_struct_type_value_ = + std::make_shared(std::vector{ + iceberg::SchemaField(12, "inner_k", iceberg::int32(), false), + iceberg::SchemaField(13, "inner_v", iceberg::int32(), false)}); + + exp_inner_value_k_ = + std::make_unique(12, "inner_k", iceberg::int32(), false); + exp_inner_value_v_ = + std::make_unique(13, "inner_v", iceberg::int32(), false); + + // Key struct: {key: int, value: inner_struct_key_} + key_struct_type_ = + std::make_shared(std::vector{ + iceberg::SchemaField(14, "key", iceberg::int32(), false), + iceberg::SchemaField(15, "value", inner_struct_type_key_, false)}); + + exp_key_struct_key_ = + std::make_unique(14, "key", iceberg::int32(), false); + exp_key_struct_value_ = std::make_unique( + 15, "value", inner_struct_type_key_, false); + + // Value struct: {key: int, value: inner_struct_value_} + value_struct_type_ = + std::make_shared(std::vector{ + iceberg::SchemaField(16, "key", iceberg::int32(), false), + iceberg::SchemaField(17, "value", inner_struct_type_value_, false)}); + + exp_value_struct_key_ = + std::make_unique(16, "key", iceberg::int32(), false); + exp_value_struct_value_ = std::make_unique( + 17, "value", inner_struct_type_value_, false); + + // Map type: map + map_type_ = std::make_shared( + iceberg::SchemaField(18, "key", key_struct_type_, false), + iceberg::SchemaField(19, "value", value_struct_type_, false)); + + exp_map_key_ = + std::make_unique(18, "key", key_struct_type_, false); + exp_map_value_ = + std::make_unique(19, "value", value_struct_type_, false); + + // Top-level field: a: map<...> + exp_field_a_ = std::make_unique(20, "a", map_type_, false); + + // Create schema + schema_ = std::make_shared( + std::vector{*exp_field_a_}, 1); + } - iceberg::StructType structtype = iceberg::StructType({field1_, field2_, field3_}); + std::shared_ptr schema_; + std::shared_ptr inner_struct_type_key_; + std::shared_ptr inner_struct_type_value_; + std::shared_ptr key_struct_type_; + std::shared_ptr value_struct_type_; + std::shared_ptr map_type_; + + std::unique_ptr exp_inner_key_key_; + std::unique_ptr exp_inner_key_value_; + std::unique_ptr exp_inner_value_k_; + std::unique_ptr exp_inner_value_v_; + std::unique_ptr exp_key_struct_key_; + std::unique_ptr exp_key_struct_value_; + std::unique_ptr exp_value_struct_key_; + std::unique_ptr exp_value_struct_value_; + std::unique_ptr exp_map_key_; + std::unique_ptr exp_map_value_; + std::unique_ptr exp_field_a_; +}; - auto field4_ = iceberg::SchemaField::MakeRequired( - 4, "element", std::make_shared(structtype)); +TEST_F(ComplexMapStructTest, TestFindById) { + ASSERT_THAT(schema_->FindFieldById(20), ::testing::Optional(*exp_field_a_)); + ASSERT_THAT(schema_->FindFieldById(19), ::testing::Optional(*exp_map_value_)); + ASSERT_THAT(schema_->FindFieldById(18), ::testing::Optional(*exp_map_key_)); + ASSERT_THAT(schema_->FindFieldById(17), ::testing::Optional(*exp_value_struct_value_)); + ASSERT_THAT(schema_->FindFieldById(16), ::testing::Optional(*exp_value_struct_key_)); + ASSERT_THAT(schema_->FindFieldById(15), ::testing::Optional(*exp_key_struct_value_)); + ASSERT_THAT(schema_->FindFieldById(14), ::testing::Optional(*exp_key_struct_key_)); + ASSERT_THAT(schema_->FindFieldById(13), ::testing::Optional(*exp_inner_value_v_)); + ASSERT_THAT(schema_->FindFieldById(12), ::testing::Optional(*exp_inner_value_k_)); + ASSERT_THAT(schema_->FindFieldById(11), ::testing::Optional(*exp_inner_key_value_)); + ASSERT_THAT(schema_->FindFieldById(10), ::testing::Optional(*exp_inner_key_key_)); +} - iceberg::MapType maptype = - iceberg::MapType(iceberg::SchemaField::MakeRequired( - 5, "key", std::make_shared(structtype)), - iceberg::SchemaField::MakeRequired(6, "value", iceberg::int32())); +TEST_F(ComplexMapStructTest, TestFindByName) { + ASSERT_THAT(schema_->FindFieldByName("a"), ::testing::Optional(*exp_field_a_)); + ASSERT_THAT(schema_->FindFieldByName("a.key"), ::testing::Optional(*exp_map_key_)); + ASSERT_THAT(schema_->FindFieldByName("a.value"), ::testing::Optional(*exp_map_value_)); + ASSERT_THAT(schema_->FindFieldByName("a.key.key"), + ::testing::Optional(*exp_key_struct_key_)); + ASSERT_THAT(schema_->FindFieldByName("a.key.value"), + ::testing::Optional(*exp_key_struct_value_)); + ASSERT_THAT(schema_->FindFieldByName("a.key.value.inner_key"), + ::testing::Optional(*exp_inner_key_key_)); + ASSERT_THAT(schema_->FindFieldByName("a.key.value.inner_value"), + ::testing::Optional(*exp_inner_key_value_)); + ASSERT_THAT(schema_->FindFieldByName("a.value.key"), + ::testing::Optional(*exp_value_struct_key_)); + ASSERT_THAT(schema_->FindFieldByName("a.value.value"), + ::testing::Optional(*exp_value_struct_value_)); + ASSERT_THAT(schema_->FindFieldByName("a.value.value.inner_k"), + ::testing::Optional(*exp_inner_value_k_)); + ASSERT_THAT(schema_->FindFieldByName("a.value.value.inner_v"), + ::testing::Optional(*exp_inner_value_v_)); +} - auto field5_ = iceberg::SchemaField::MakeRequired( - 5, "key", std::make_shared(structtype)); - auto field6_ = iceberg::SchemaField::MakeRequired(6, "value", iceberg::int32()); +TEST_F(ComplexMapStructTest, TestFindByNameCaseInsensitive) { + ASSERT_THAT(schema_->FindFieldByName("A", false), ::testing::Optional(*exp_field_a_)); + ASSERT_THAT(schema_->FindFieldByName("A.KEY", false), + ::testing::Optional(*exp_map_key_)); + ASSERT_THAT(schema_->FindFieldByName("A.VALUE", false), + ::testing::Optional(*exp_map_value_)); + ASSERT_THAT(schema_->FindFieldByName("A.KEY.KEY", false), + ::testing::Optional(*exp_key_struct_key_)); + ASSERT_THAT(schema_->FindFieldByName("A.KEY.VALUE", false), + ::testing::Optional(*exp_key_struct_value_)); + ASSERT_THAT(schema_->FindFieldByName("A.KEY.VALUE.INNER_KEY", false), + ::testing::Optional(*exp_inner_key_key_)); + ASSERT_THAT(schema_->FindFieldByName("A.KEY.VALUE.INNER_VALUE", false), + ::testing::Optional(*exp_inner_key_value_)); + ASSERT_THAT(schema_->FindFieldByName("A.VALUE.KEY", false), + ::testing::Optional(*exp_value_struct_key_)); + ASSERT_THAT(schema_->FindFieldByName("A.VALUE.VALUE", false), + ::testing::Optional(*exp_value_struct_value_)); + ASSERT_THAT(schema_->FindFieldByName("A.VALUE.VALUE.INNER_K", false), + ::testing::Optional(*exp_inner_value_k_)); + ASSERT_THAT(schema_->FindFieldByName("A.VALUE.VALUE.INNER_V", false), + ::testing::Optional(*exp_inner_value_v_)); +} - auto field7_ = iceberg::SchemaField::MakeRequired( - 7, "Map", std::make_shared(maptype)); +TEST_F(ComplexMapStructTest, TestInvalidPaths) { + ASSERT_THAT(schema_->FindFieldByName("a.invalid"), ::testing::Optional(std::nullopt)); + ASSERT_THAT(schema_->FindFieldByName("a.key.invalid"), + ::testing::Optional(std::nullopt)); + ASSERT_THAT(schema_->FindFieldByName("a.value.invalid"), + ::testing::Optional(std::nullopt)); + ASSERT_THAT(schema_->FindFieldByName("A.KEY.VALUE.INVALID", false), + ::testing::Optional(std::nullopt)); +} - iceberg::Schema schema({field7_}, 1); +TEST(SchemaTest, DuplicatePathErrorCaseSensitive) { + iceberg::SchemaField nested_b(2, "b", iceberg::int32(), false); + iceberg::StructType nested_struct({nested_b}); + iceberg::SchemaField a(1, "a", std::make_shared(nested_struct), + false); + iceberg::SchemaField duplicate_ab(3, "a.b", iceberg::int32(), false); + iceberg::Schema schema({a, duplicate_ab}, 1); + + auto result = schema.FindFieldByName("a.b", /*case_sensitive=*/true); + ASSERT_FALSE(result.has_value()); + EXPECT_EQ(result.error().kind, iceberg::ErrorKind::kNotSupported); + EXPECT_THAT(result.error().message, + ::testing::HasSubstr("Duplicate path in name_to_id_: a.b")); +} - ASSERT_THAT(schema.FindFieldByName("Map.key.Foo"), ::testing::Optional(field1_)); - ASSERT_THAT(schema.FindFieldByName("Map.Foo"), ::testing::Optional(std::nullopt)); +TEST(SchemaTest, DuplicatePathErrorCaseInsensitive) { + iceberg::SchemaField nested_b(2, "B", iceberg::int32(), false); + iceberg::StructType nested_struct({nested_b}); + iceberg::SchemaField a(1, "A", std::make_shared(nested_struct), + false); + iceberg::SchemaField duplicate_ab(3, "a.b", iceberg::int32(), false); + iceberg::Schema schema({a, duplicate_ab}, 1); + + auto result = schema.FindFieldByName("A.B", /*case_sensitive=*/false); + ASSERT_FALSE(result.has_value()); + EXPECT_EQ(result.error().kind, iceberg::ErrorKind::kNotSupported); + EXPECT_THAT(result.error().message, + ::testing::HasSubstr("Duplicate path in name_to_id_: a.b")); } From 10bb5d46365d2f2852f24d4bbf975a2a83834a16 Mon Sep 17 00:00:00 2001 From: nullccxsy <32149055912@qq.com> Date: Mon, 25 Aug 2025 19:53:39 +0800 Subject: [PATCH 06/14] fix: fix comments, add duplicate field ID detection, update lowercase conversion function - Implemented detection for duplicate field IDs, Names in visitors, returning kInvalidSchema error with message. - Updated lowercase conversion in NameToIdVisitor --- src/iceberg/schema.cc | 59 ++++++++++++++++++------------------ src/iceberg/schema.h | 2 +- test/schema_test.cc | 70 ++++++++++++++++++++++++++++--------------- 3 files changed, 77 insertions(+), 54 deletions(-) diff --git a/src/iceberg/schema.cc b/src/iceberg/schema.cc index c45324746..739e9097b 100644 --- a/src/iceberg/schema.cc +++ b/src/iceberg/schema.cc @@ -23,6 +23,8 @@ #include #include +#include + #include "iceberg/type.h" #include "iceberg/util/formatter.h" // IWYU pragma: keep #include "iceberg/util/macros.h" @@ -46,10 +48,7 @@ class NameToIdVisitor { public: explicit NameToIdVisitor( std::unordered_map& name_to_id, bool case_sensitive_ = true, - std::function quoting_func_ = - [](std::string_view s) { return std::string(s); }); - ~NameToIdVisitor(); - + std::function quoting_func = {}); Status Visit(const ListType& type, const std::string& path, const std::string& short_path); Status Visit(const MapType& type, const std::string& path, @@ -58,16 +57,16 @@ class NameToIdVisitor { const std::string& short_path); Status Visit(const PrimitiveType& type, const std::string& path, const std::string& short_path); + void Finish(); private: std::string BuildPath(std::string_view prefix, std::string_view field_name, bool case_sensitive); - void Merge(); private: bool case_sensitive_; std::unordered_map& name_to_id_; - std::unordered_map shortname_to_id_; + std::unordered_map short_name_to_id_; std::function quoting_func_; }; @@ -98,9 +97,7 @@ Result>> Schema::FindFie return FindFieldById(it->second); } ICEBERG_RETURN_UNEXPECTED(InitLowerCaseNameToIdMap()); - std::string lower_name(name); - std::ranges::transform(lower_name, lower_name.begin(), ::tolower); - auto it = lowercase_name_to_id_.find(lower_name); + auto it = lowercase_name_to_id_.find(StringUtils::ToLower(name)); if (it == lowercase_name_to_id_.end()) return std::nullopt; return FindFieldById(it->second); } @@ -121,6 +118,7 @@ Status Schema::InitNameToIdMap() const { NameToIdVisitor visitor(name_to_id_, /*case_sensitive=*/true); ICEBERG_RETURN_UNEXPECTED( VisitTypeInline(*this, &visitor, /*path=*/"", /*short_path=*/"")); + visitor.Finish(); return {}; } @@ -131,6 +129,7 @@ Status Schema::InitLowerCaseNameToIdMap() const { NameToIdVisitor visitor(lowercase_name_to_id_, /*case_sensitive=*/false); ICEBERG_RETURN_UNEXPECTED( VisitTypeInline(*this, &visitor, /*path=*/"", /*short_path=*/"")); + visitor.Finish(); return {}; } @@ -159,7 +158,10 @@ Status IdToFieldVisitor::VisitNestedType(const Type& type) { const auto& nested = iceberg::internal::checked_cast(type); const auto& fields = nested.fields(); for (const auto& field : fields) { - id_to_field_.emplace(field.field_id(), std::cref(field)); + auto it = id_to_field_.try_emplace(field.field_id(), std::cref(field)); + if (!it.second) { + return InvalidSchema("Duplicate field id found: {}", field.field_id()); + } ICEBERG_RETURN_UNEXPECTED(Visit(*field.type())); } return {}; @@ -172,8 +174,6 @@ NameToIdVisitor::NameToIdVisitor( case_sensitive_(case_sensitive), quoting_func_(std::move(quoting_func)) {} -NameToIdVisitor::~NameToIdVisitor() { Merge(); } - Status NameToIdVisitor::Visit(const ListType& type, const std::string& path, const std::string& short_path) { const auto& field = type.fields()[0]; @@ -184,11 +184,12 @@ Status NameToIdVisitor::Visit(const ListType& type, const std::string& path, } else { new_short_path = BuildPath(short_path, field.name(), case_sensitive_); } - auto it = name_to_id_.emplace(new_path, field.field_id()); + auto it = name_to_id_.try_emplace(new_path, field.field_id()); if (!it.second) { - return NotSupported("Duplicate path in name_to_id_: {}", new_path); + return InvalidSchema("Duplicate path found: {}, prev id: {}, curr id: {}", + it.first->first, it.first->second, field.field_id()); } - shortname_to_id_.emplace(new_short_path, field.field_id()); + short_name_to_id_.try_emplace(new_short_path, field.field_id()); ICEBERG_RETURN_UNEXPECTED( VisitTypeInline(*field.type(), this, new_path, new_short_path)); return {}; @@ -206,11 +207,12 @@ Status NameToIdVisitor::Visit(const MapType& type, const std::string& path, } else { new_short_path = BuildPath(short_path, field.name(), case_sensitive_); } - auto it = name_to_id_.emplace(new_path, field.field_id()); + auto it = name_to_id_.try_emplace(new_path, field.field_id()); if (!it.second) { - return NotSupported("Duplicate path in name_to_id_: {}", new_path); + return InvalidSchema("Duplicate path found: {}, prev id: {}, curr id: {}", + it.first->first, it.first->second, field.field_id()); } - shortname_to_id_.emplace(new_short_path, field.field_id()); + short_name_to_id_.try_emplace(new_short_path, field.field_id()); ICEBERG_RETURN_UNEXPECTED( VisitTypeInline(*field.type(), this, new_path, new_short_path)); } @@ -224,11 +226,12 @@ Status NameToIdVisitor::Visit(const StructType& type, const std::string& path, for (const auto& field : fields) { new_path = BuildPath(path, field.name(), case_sensitive_); new_short_path = BuildPath(short_path, field.name(), case_sensitive_); - auto it = name_to_id_.emplace(new_path, field.field_id()); + auto it = name_to_id_.try_emplace(new_path, field.field_id()); if (!it.second) { - return NotSupported("Duplicate path in name_to_id_: {}", new_path); + return InvalidSchema("Duplicate path found: {}, prev id: {}, curr id: {}", + it.first->first, it.first->second, field.field_id()); } - shortname_to_id_.emplace(new_short_path, field.field_id()); + short_name_to_id_.try_emplace(new_short_path, field.field_id()); ICEBERG_RETURN_UNEXPECTED( VisitTypeInline(*field.type(), this, new_path, new_short_path)); } @@ -251,16 +254,14 @@ std::string NameToIdVisitor::BuildPath(std::string_view prefix, if (case_sensitive) { return prefix.empty() ? quoted_name : std::string(prefix) + "." + quoted_name; } - std::string lower_name = quoted_name; - std::ranges::transform(lower_name, lower_name.begin(), ::tolower); - return prefix.empty() ? lower_name : std::string(prefix) + "." + lower_name; + return prefix.empty() ? StringUtils::ToLower(quoted_name) + : std::string(prefix) + "." + StringUtils::ToLower(quoted_name); + ; } -void NameToIdVisitor::Merge() { - for (const auto& it : shortname_to_id_) { - if (name_to_id_.find(it.first) == name_to_id_.end()) { - name_to_id_[it.first] = it.second; - } +void NameToIdVisitor::Finish() { + for (auto&& it : short_name_to_id_) { + name_to_id_.try_emplace(it.first, it.second); } } diff --git a/src/iceberg/schema.h b/src/iceberg/schema.h index 867f10b45..1df7b2b05 100644 --- a/src/iceberg/schema.h +++ b/src/iceberg/schema.h @@ -55,7 +55,7 @@ class ICEBERG_EXPORT Schema : public StructType { [[nodiscard]] std::string ToString() const override; - ///\brief Find thd SchemaField By field name + /// \brief Find thd SchemaField By field name /// /// Short names for maps and lists are included for any name that does not conflict with /// a canonical name. For example, a list, 'l', of structs with field 'x' will produce diff --git a/test/schema_test.cc b/test/schema_test.cc index f7a9d55f6..6cb525a4d 100644 --- a/test/schema_test.cc +++ b/test/schema_test.cc @@ -24,7 +24,6 @@ #include #include -#include #include "iceberg/schema_field.h" #include "iceberg/util/formatter.h" // IWYU pragma: keep @@ -83,14 +82,14 @@ TEST(SchemaTest, Equality) { ASSERT_EQ(schema5, schema1); } -class NestedTypeTest : public ::testing::Test { +class BasicShortNameTest : public ::testing::Test { protected: void SetUp() override { field1_ = std::make_unique(1, "Foo", iceberg::int32(), true); field2_ = std::make_unique(2, "Bar", iceberg::string(), true); field3_ = std::make_unique(3, "Foobar", iceberg::int32(), true); - iceberg::StructType structtype = iceberg::StructType( + auto structtype = iceberg::StructType( std::vector{*field1_, *field2_, *field3_}); auto listype = iceberg::ListType(iceberg::SchemaField::MakeRequired( @@ -123,7 +122,7 @@ class NestedTypeTest : public ::testing::Test { std::unique_ptr field7_; }; -TEST_F(NestedTypeTest, TestFindById) { +TEST_F(BasicShortNameTest, TestFindById) { ASSERT_THAT(schema_->FindFieldById(7), ::testing::Optional(*field7_)); ASSERT_THAT(schema_->FindFieldById(6), ::testing::Optional(*field6_)); ASSERT_THAT(schema_->FindFieldById(5), ::testing::Optional(*field5_)); @@ -135,7 +134,7 @@ TEST_F(NestedTypeTest, TestFindById) { ASSERT_THAT(schema_->FindFieldById(10), ::testing::Optional(std::nullopt)); } -TEST_F(NestedTypeTest, TestFindByName) { +TEST_F(BasicShortNameTest, TestFindByName) { ASSERT_THAT(schema_->FindFieldByName("Value"), ::testing::Optional(*field7_)); ASSERT_THAT(schema_->FindFieldByName("Value.value"), ::testing::Optional(*field6_)); ASSERT_THAT(schema_->FindFieldByName("Value.key"), ::testing::Optional(*field5_)); @@ -152,7 +151,7 @@ TEST_F(NestedTypeTest, TestFindByName) { ::testing::Optional(std::nullopt)); } -TEST_F(NestedTypeTest, TestFindByNameCaseInsensitive) { +TEST_F(BasicShortNameTest, TestFindByNameCaseInsensitive) { ASSERT_THAT(schema_->FindFieldByName("vALue", false), ::testing::Optional(*field7_)); ASSERT_THAT(schema_->FindFieldByName("vALue.VALUE", false), ::testing::Optional(*field6_)); @@ -170,7 +169,7 @@ TEST_F(NestedTypeTest, TestFindByNameCaseInsensitive) { ::testing::Optional(std::nullopt)); } -TEST_F(NestedTypeTest, TestFindByShortNameCaseInsensitive) { +TEST_F(BasicShortNameTest, TestFindByShortNameCaseInsensitive) { ASSERT_THAT(schema_->FindFieldByName("vaLue.value.FOO", false), ::testing::Optional(*field1_)); ASSERT_THAT(schema_->FindFieldByName("Value.value.Bar", false), @@ -181,21 +180,21 @@ TEST_F(NestedTypeTest, TestFindByShortNameCaseInsensitive) { ::testing::Optional(std::nullopt)); } -class NestType2Test : public ::testing::Test { +class ComplexShortNameTest : public ::testing::Test { protected: void SetUp() override { field1_ = std::make_unique(1, "Foo", iceberg::int32(), true); field2_ = std::make_unique(2, "Bar", iceberg::string(), true); field3_ = std::make_unique(3, "Foobar", iceberg::int32(), true); - iceberg::StructType structtype = iceberg::StructType({*field1_, *field2_, *field3_}); + auto structtype = iceberg::StructType({*field1_, *field2_, *field3_}); field4_ = std::make_unique( 4, "element", std::make_shared(structtype), false); auto listype = iceberg::ListType(*field4_); - iceberg::StructType structtype2 = iceberg::StructType( + auto structtype2 = iceberg::StructType( {iceberg::SchemaField::MakeRequired(5, "First_child", iceberg::int32()), iceberg::SchemaField::MakeRequired( 6, "Second_child", std::make_shared(listype))}); @@ -231,7 +230,7 @@ class NestType2Test : public ::testing::Test { std::unique_ptr field9_; }; -TEST_F(NestType2Test, TestFindById) { +TEST_F(ComplexShortNameTest, TestFindById) { ASSERT_THAT(schema_->FindFieldById(9), ::testing::Optional(*field9_)); ASSERT_THAT(schema_->FindFieldById(8), ::testing::Optional(*field8_)); ASSERT_THAT(schema_->FindFieldById(7), ::testing::Optional(*field7_)); @@ -245,7 +244,7 @@ TEST_F(NestType2Test, TestFindById) { ASSERT_THAT(schema_->FindFieldById(0), ::testing::Optional(std::nullopt)); } -TEST_F(NestType2Test, TestFindByName) { +TEST_F(ComplexShortNameTest, TestFindByName) { ASSERT_THAT(schema_->FindFieldByName("Map"), ::testing::Optional(*field9_)); ASSERT_THAT(schema_->FindFieldByName("Map.value"), ::testing::Optional(*field8_)); ASSERT_THAT(schema_->FindFieldByName("Map.key"), ::testing::Optional(*field7_)); @@ -265,7 +264,7 @@ TEST_F(NestType2Test, TestFindByName) { ::testing::Optional(std::nullopt)); } -TEST_F(NestType2Test, TestFindByNameCaseInsensitive) { +TEST_F(ComplexShortNameTest, TestFindByNameCaseInsensitive) { ASSERT_THAT(schema_->FindFieldByName("map", false), ::testing::Optional(*field9_)); ASSERT_THAT(schema_->FindFieldByName("map.vALUE", false), ::testing::Optional(*field8_)); @@ -286,7 +285,7 @@ TEST_F(NestType2Test, TestFindByNameCaseInsensitive) { ::testing::Optional(std::nullopt)); } -TEST_F(NestType2Test, TestFindByShortName) { +TEST_F(ComplexShortNameTest, TestFindByShortName) { ASSERT_THAT(schema_->FindFieldByName("Map.Second_child"), ::testing::Optional(*field6_)); ASSERT_THAT(schema_->FindFieldByName("Map.First_child"), ::testing::Optional(*field5_)); @@ -300,7 +299,7 @@ TEST_F(NestType2Test, TestFindByShortName) { ::testing::Optional(std::nullopt)); } -TEST_F(NestType2Test, TestFindByShortNameCaseInsensitive) { +TEST_F(ComplexShortNameTest, TestFindByShortNameCaseInsensitive) { ASSERT_THAT(schema_->FindFieldByName("map.second_child", false), ::testing::Optional(*field6_)); ASSERT_THAT(schema_->FindFieldByName("map.first_child", false), @@ -315,7 +314,7 @@ TEST_F(NestType2Test, TestFindByShortNameCaseInsensitive) { ::testing::Optional(std::nullopt)); } -class ComplexMapStructTest : public ::testing::Test { +class ComplexMapStructShortNameTest : public ::testing::Test { protected: void SetUp() override { // Separate inner struct for key: {inner_key: int, inner_value: int} @@ -400,7 +399,7 @@ class ComplexMapStructTest : public ::testing::Test { std::unique_ptr exp_field_a_; }; -TEST_F(ComplexMapStructTest, TestFindById) { +TEST_F(ComplexMapStructShortNameTest, TestFindById) { ASSERT_THAT(schema_->FindFieldById(20), ::testing::Optional(*exp_field_a_)); ASSERT_THAT(schema_->FindFieldById(19), ::testing::Optional(*exp_map_value_)); ASSERT_THAT(schema_->FindFieldById(18), ::testing::Optional(*exp_map_key_)); @@ -414,7 +413,7 @@ TEST_F(ComplexMapStructTest, TestFindById) { ASSERT_THAT(schema_->FindFieldById(10), ::testing::Optional(*exp_inner_key_key_)); } -TEST_F(ComplexMapStructTest, TestFindByName) { +TEST_F(ComplexMapStructShortNameTest, TestFindByName) { ASSERT_THAT(schema_->FindFieldByName("a"), ::testing::Optional(*exp_field_a_)); ASSERT_THAT(schema_->FindFieldByName("a.key"), ::testing::Optional(*exp_map_key_)); ASSERT_THAT(schema_->FindFieldByName("a.value"), ::testing::Optional(*exp_map_value_)); @@ -436,7 +435,7 @@ TEST_F(ComplexMapStructTest, TestFindByName) { ::testing::Optional(*exp_inner_value_v_)); } -TEST_F(ComplexMapStructTest, TestFindByNameCaseInsensitive) { +TEST_F(ComplexMapStructShortNameTest, TestFindByNameCaseInsensitive) { ASSERT_THAT(schema_->FindFieldByName("A", false), ::testing::Optional(*exp_field_a_)); ASSERT_THAT(schema_->FindFieldByName("A.KEY", false), ::testing::Optional(*exp_map_key_)); @@ -460,7 +459,7 @@ TEST_F(ComplexMapStructTest, TestFindByNameCaseInsensitive) { ::testing::Optional(*exp_inner_value_v_)); } -TEST_F(ComplexMapStructTest, TestInvalidPaths) { +TEST_F(ComplexMapStructShortNameTest, TestInvalidPaths) { ASSERT_THAT(schema_->FindFieldByName("a.invalid"), ::testing::Optional(std::nullopt)); ASSERT_THAT(schema_->FindFieldByName("a.key.invalid"), ::testing::Optional(std::nullopt)); @@ -480,9 +479,9 @@ TEST(SchemaTest, DuplicatePathErrorCaseSensitive) { auto result = schema.FindFieldByName("a.b", /*case_sensitive=*/true); ASSERT_FALSE(result.has_value()); - EXPECT_EQ(result.error().kind, iceberg::ErrorKind::kNotSupported); + EXPECT_EQ(result.error().kind, iceberg::ErrorKind::kInvalidSchema); EXPECT_THAT(result.error().message, - ::testing::HasSubstr("Duplicate path in name_to_id_: a.b")); + ::testing::HasSubstr("Duplicate path found: a.b, prev id: 2, curr id: 3")); } TEST(SchemaTest, DuplicatePathErrorCaseInsensitive) { @@ -495,7 +494,30 @@ TEST(SchemaTest, DuplicatePathErrorCaseInsensitive) { auto result = schema.FindFieldByName("A.B", /*case_sensitive=*/false); ASSERT_FALSE(result.has_value()); - EXPECT_EQ(result.error().kind, iceberg::ErrorKind::kNotSupported); + EXPECT_EQ(result.error().kind, iceberg::ErrorKind::kInvalidSchema); EXPECT_THAT(result.error().message, - ::testing::HasSubstr("Duplicate path in name_to_id_: a.b")); + ::testing::HasSubstr("Duplicate path found: a.b, prev id: 2, curr id: 3")); +} + +TEST(SchemaTest, NestedDuplicateFieldIdError) { + // Outer struct with field ID 1 + iceberg::SchemaField outer_field(1, "outer", iceberg::int32(), true); + + // Inner struct with duplicate field ID 1 + iceberg::SchemaField inner_field(1, "inner", iceberg::string(), true); + auto inner_struct = iceberg::StructType({inner_field}); + + // Nested field with inner struct + iceberg::SchemaField nested_field( + 2, "nested", std::make_shared(inner_struct), true); + + // Schema with outer and nested fields + iceberg::Schema schema({outer_field, nested_field}, 1); + + // Attempt to find a field, which should trigger duplicate ID detection + auto result = schema.FindFieldById(1); + ASSERT_FALSE(result.has_value()); + EXPECT_EQ(result.error().kind, iceberg::ErrorKind::kInvalidSchema); + EXPECT_THAT(result.error().message, + ::testing::HasSubstr("Duplicate field id found: 1")); } From 92b6db491561932f74b6f54fce5e98604cd0200f Mon Sep 17 00:00:00 2001 From: nullccxsy <32149055912@qq.com> Date: Tue, 26 Aug 2025 00:29:26 +0800 Subject: [PATCH 07/14] fix: fix comments, optimize smart pointer usage in schema_test.cc to reduce copies and unify initialization - Switched field_ and schema_ members in test classes to std::unique_ptr for unique ownership and to avoid duplicate and avoid creating duplicate objects. --- src/iceberg/schema.cc | 5 +- src/iceberg/schema.h | 3 +- test/schema_test.cc | 168 +++++++++++++++++------------------------- 3 files changed, 73 insertions(+), 103 deletions(-) diff --git a/src/iceberg/schema.cc b/src/iceberg/schema.cc index 739e9097b..7d3dc827a 100644 --- a/src/iceberg/schema.cc +++ b/src/iceberg/schema.cc @@ -23,11 +23,10 @@ #include #include -#include - #include "iceberg/type.h" #include "iceberg/util/formatter.h" // IWYU pragma: keep #include "iceberg/util/macros.h" +#include "iceberg/util/string_utils.h" #include "iceberg/util/visit_type.h" namespace iceberg { @@ -47,7 +46,7 @@ class IdToFieldVisitor { class NameToIdVisitor { public: explicit NameToIdVisitor( - std::unordered_map& name_to_id, bool case_sensitive_ = true, + std::unordered_map& name_to_id, bool case_sensitive = true, std::function quoting_func = {}); Status Visit(const ListType& type, const std::string& path, const std::string& short_path); diff --git a/src/iceberg/schema.h b/src/iceberg/schema.h index 1df7b2b05..1f8db79c2 100644 --- a/src/iceberg/schema.h +++ b/src/iceberg/schema.h @@ -55,7 +55,7 @@ class ICEBERG_EXPORT Schema : public StructType { [[nodiscard]] std::string ToString() const override; - /// \brief Find thd SchemaField By field name + /// \brief Find the SchemaField by field name. /// /// Short names for maps and lists are included for any name that does not conflict with /// a canonical name. For example, a list, 'l', of structs with field 'x' will produce @@ -67,6 +67,7 @@ class ICEBERG_EXPORT Schema : public StructType { [[nodiscard]] Result>> FindFieldByName(std::string_view name, bool case_sensitive = true) const; + /// \brief Find the SchemaField by field id. [[nodiscard]] Result>> FindFieldById(int32_t field_id) const; diff --git a/test/schema_test.cc b/test/schema_test.cc index 6cb525a4d..d282cea9d 100644 --- a/test/schema_test.cc +++ b/test/schema_test.cc @@ -89,30 +89,25 @@ class BasicShortNameTest : public ::testing::Test { field2_ = std::make_unique(2, "Bar", iceberg::string(), true); field3_ = std::make_unique(3, "Foobar", iceberg::int32(), true); - auto structtype = iceberg::StructType( + auto structtype = std::make_shared( std::vector{*field1_, *field2_, *field3_}); - auto listype = iceberg::ListType(iceberg::SchemaField::MakeRequired( - 4, "element", std::make_shared(structtype))); + field4_ = std::make_unique(4, "element", structtype, false); - auto maptype = - iceberg::MapType(iceberg::SchemaField::MakeRequired(5, "key", iceberg::int32()), - iceberg::SchemaField::MakeRequired( - 6, "value", std::make_shared(listype))); + auto listype = std::make_shared(*field4_); - field4_ = std::make_unique( - 4, "element", std::make_shared(structtype), false); field5_ = std::make_unique(5, "key", iceberg::int32(), false); - field6_ = std::make_unique( - 6, "value", std::make_shared(listype), false); - field7_ = std::make_unique( - 7, "Value", std::make_shared(maptype), false); + field6_ = std::make_unique(6, "value", listype, false); + + auto maptype = std::make_shared(*field5_, *field6_); + + field7_ = std::make_unique(7, "Value", maptype, false); schema_ = - std::make_shared(std::vector{*field7_}, 1); + std::make_unique(std::vector{*field7_}, 1); } - std::shared_ptr schema_; + std::unique_ptr schema_; std::unique_ptr field1_; std::unique_ptr field2_; std::unique_ptr field3_; @@ -187,38 +182,32 @@ class ComplexShortNameTest : public ::testing::Test { field2_ = std::make_unique(2, "Bar", iceberg::string(), true); field3_ = std::make_unique(3, "Foobar", iceberg::int32(), true); - auto structtype = iceberg::StructType({*field1_, *field2_, *field3_}); - - field4_ = std::make_unique( - 4, "element", std::make_shared(structtype), false); - - auto listype = iceberg::ListType(*field4_); + auto structtype = std::make_shared( + std::vector{*field1_, *field2_, *field3_}); - auto structtype2 = iceberg::StructType( - {iceberg::SchemaField::MakeRequired(5, "First_child", iceberg::int32()), - iceberg::SchemaField::MakeRequired( - 6, "Second_child", std::make_shared(listype))}); + field4_ = std::make_unique(4, "element", structtype, false); - auto maptype = iceberg::MapType( - iceberg::SchemaField::MakeRequired(7, "key", iceberg::int32()), - iceberg::SchemaField::MakeRequired( - 8, "value", std::make_shared(structtype2))); + auto listype = std::make_shared(*field4_); field5_ = std::make_unique(5, "First_child", iceberg::int32(), false); - field6_ = std::make_unique( - 6, "Second_child", std::make_shared(listype), false); + field6_ = std::make_unique(6, "Second_child", listype, false); + + auto structtype2 = std::make_shared( + std::vector{*field5_, *field6_}); + field7_ = std::make_unique(7, "key", iceberg::int32(), false); - field8_ = std::make_unique( - 8, "value", std::make_shared(structtype2), false); - field9_ = std::make_unique( - 9, "Map", std::make_shared(maptype), false); + field8_ = std::make_unique(8, "value", structtype2, false); + + auto maptype = std::make_shared(*field7_, *field8_); + + field9_ = std::make_unique(9, "Map", maptype, false); schema_ = - std::make_shared(std::vector{*field9_}, 1); + std::make_unique(std::vector{*field9_}, 1); } - std::shared_ptr schema_; + std::unique_ptr schema_; std::unique_ptr field1_; std::unique_ptr field2_; std::unique_ptr field3_; @@ -317,75 +306,48 @@ TEST_F(ComplexShortNameTest, TestFindByShortNameCaseInsensitive) { class ComplexMapStructShortNameTest : public ::testing::Test { protected: void SetUp() override { - // Separate inner struct for key: {inner_key: int, inner_value: int} - inner_struct_type_key_ = - std::make_shared(std::vector{ - iceberg::SchemaField(10, "inner_key", iceberg::int32(), false), - iceberg::SchemaField(11, "inner_value", iceberg::int32(), false)}); - exp_inner_key_key_ = std::make_unique(10, "inner_key", iceberg::int32(), false); exp_inner_key_value_ = std::make_unique( 11, "inner_value", iceberg::int32(), false); - - // Separate inner struct for value: {inner_k: int, inner_v: int} - inner_struct_type_value_ = - std::make_shared(std::vector{ - iceberg::SchemaField(12, "inner_k", iceberg::int32(), false), - iceberg::SchemaField(13, "inner_v", iceberg::int32(), false)}); + auto inner_struct_type_key_ = std::make_shared( + std::vector{*exp_inner_key_key_, *exp_inner_key_value_}); exp_inner_value_k_ = std::make_unique(12, "inner_k", iceberg::int32(), false); exp_inner_value_v_ = std::make_unique(13, "inner_v", iceberg::int32(), false); - - // Key struct: {key: int, value: inner_struct_key_} - key_struct_type_ = - std::make_shared(std::vector{ - iceberg::SchemaField(14, "key", iceberg::int32(), false), - iceberg::SchemaField(15, "value", inner_struct_type_key_, false)}); + auto inner_struct_type_value_ = std::make_shared( + std::vector{*exp_inner_value_k_, *exp_inner_value_v_}); exp_key_struct_key_ = std::make_unique(14, "key", iceberg::int32(), false); exp_key_struct_value_ = std::make_unique( 15, "value", inner_struct_type_key_, false); - - // Value struct: {key: int, value: inner_struct_value_} - value_struct_type_ = - std::make_shared(std::vector{ - iceberg::SchemaField(16, "key", iceberg::int32(), false), - iceberg::SchemaField(17, "value", inner_struct_type_value_, false)}); + auto key_struct_type_ = std::make_shared( + std::vector{*exp_key_struct_key_, *exp_key_struct_value_}); exp_value_struct_key_ = std::make_unique(16, "key", iceberg::int32(), false); exp_value_struct_value_ = std::make_unique( 17, "value", inner_struct_type_value_, false); - - // Map type: map - map_type_ = std::make_shared( - iceberg::SchemaField(18, "key", key_struct_type_, false), - iceberg::SchemaField(19, "value", value_struct_type_, false)); + auto value_struct_type_ = + std::make_shared(std::vector{ + *exp_value_struct_key_, *exp_value_struct_value_}); exp_map_key_ = std::make_unique(18, "key", key_struct_type_, false); exp_map_value_ = std::make_unique(19, "value", value_struct_type_, false); + auto map_type_ = std::make_shared(*exp_map_key_, *exp_map_value_); - // Top-level field: a: map<...> exp_field_a_ = std::make_unique(20, "a", map_type_, false); - // Create schema - schema_ = std::make_shared( + schema_ = std::make_unique( std::vector{*exp_field_a_}, 1); } - std::shared_ptr schema_; - std::shared_ptr inner_struct_type_key_; - std::shared_ptr inner_struct_type_value_; - std::shared_ptr key_struct_type_; - std::shared_ptr value_struct_type_; - std::shared_ptr map_type_; - + std::unique_ptr schema_; std::unique_ptr exp_inner_key_key_; std::unique_ptr exp_inner_key_value_; std::unique_ptr exp_inner_value_k_; @@ -470,14 +432,16 @@ TEST_F(ComplexMapStructShortNameTest, TestInvalidPaths) { } TEST(SchemaTest, DuplicatePathErrorCaseSensitive) { - iceberg::SchemaField nested_b(2, "b", iceberg::int32(), false); - iceberg::StructType nested_struct({nested_b}); - iceberg::SchemaField a(1, "a", std::make_shared(nested_struct), - false); - iceberg::SchemaField duplicate_ab(3, "a.b", iceberg::int32(), false); - iceberg::Schema schema({a, duplicate_ab}, 1); - - auto result = schema.FindFieldByName("a.b", /*case_sensitive=*/true); + auto nested_b = std::make_unique(2, "b", iceberg::int32(), false); + auto nested_struct = + std::make_shared(std::vector{*nested_b}); + auto a = std::make_unique(1, "a", nested_struct, false); + auto duplicate_ab = + std::make_unique(3, "a.b", iceberg::int32(), false); + auto schema = std::make_unique( + std::vector{*a, *duplicate_ab}, 1); + + auto result = schema->FindFieldByName("a.b", /*case_sensitive=*/true); ASSERT_FALSE(result.has_value()); EXPECT_EQ(result.error().kind, iceberg::ErrorKind::kInvalidSchema); EXPECT_THAT(result.error().message, @@ -485,14 +449,16 @@ TEST(SchemaTest, DuplicatePathErrorCaseSensitive) { } TEST(SchemaTest, DuplicatePathErrorCaseInsensitive) { - iceberg::SchemaField nested_b(2, "B", iceberg::int32(), false); - iceberg::StructType nested_struct({nested_b}); - iceberg::SchemaField a(1, "A", std::make_shared(nested_struct), - false); - iceberg::SchemaField duplicate_ab(3, "a.b", iceberg::int32(), false); - iceberg::Schema schema({a, duplicate_ab}, 1); - - auto result = schema.FindFieldByName("A.B", /*case_sensitive=*/false); + auto nested_b = std::make_unique(2, "B", iceberg::int32(), false); + auto nested_struct = + std::make_shared(std::vector{*nested_b}); + auto a = std::make_unique(1, "A", nested_struct, false); + auto duplicate_ab = + std::make_unique(3, "a.b", iceberg::int32(), false); + auto schema = std::make_unique( + std::vector{*a, *duplicate_ab}, 1); + + auto result = schema->FindFieldByName("A.B", /*case_sensitive=*/false); ASSERT_FALSE(result.has_value()); EXPECT_EQ(result.error().kind, iceberg::ErrorKind::kInvalidSchema); EXPECT_THAT(result.error().message, @@ -501,21 +467,25 @@ TEST(SchemaTest, DuplicatePathErrorCaseInsensitive) { TEST(SchemaTest, NestedDuplicateFieldIdError) { // Outer struct with field ID 1 - iceberg::SchemaField outer_field(1, "outer", iceberg::int32(), true); + auto outer_field = + std::make_unique(1, "outer", iceberg::int32(), true); // Inner struct with duplicate field ID 1 - iceberg::SchemaField inner_field(1, "inner", iceberg::string(), true); - auto inner_struct = iceberg::StructType({inner_field}); + auto inner_field = + std::make_unique(1, "inner", iceberg::string(), true); + auto inner_struct = std::make_shared( + std::vector{*inner_field}); // Nested field with inner struct - iceberg::SchemaField nested_field( - 2, "nested", std::make_shared(inner_struct), true); + auto nested_field = + std::make_unique(2, "nested", inner_struct, true); // Schema with outer and nested fields - iceberg::Schema schema({outer_field, nested_field}, 1); + auto schema = std::make_unique( + std::vector{*outer_field, *nested_field}, 1); // Attempt to find a field, which should trigger duplicate ID detection - auto result = schema.FindFieldById(1); + auto result = schema->FindFieldById(1); ASSERT_FALSE(result.has_value()); EXPECT_EQ(result.error().kind, iceberg::ErrorKind::kInvalidSchema); EXPECT_THAT(result.error().message, From ef5c1cfc11f2fc38c11fa125833d25170656c0e1 Mon Sep 17 00:00:00 2001 From: nullccxsy <32149055912@qq.com> Date: Wed, 27 Aug 2025 14:39:50 +0800 Subject: [PATCH 08/14] fix: fix comments, add transparent string hash to eliminate temporary std::string creation --- src/iceberg/schema.cc | 14 ++++++++------ src/iceberg/schema.h | 17 +++++++++++++++-- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/src/iceberg/schema.cc b/src/iceberg/schema.cc index 7d3dc827a..1633e2d35 100644 --- a/src/iceberg/schema.cc +++ b/src/iceberg/schema.cc @@ -46,7 +46,8 @@ class IdToFieldVisitor { class NameToIdVisitor { public: explicit NameToIdVisitor( - std::unordered_map& name_to_id, bool case_sensitive = true, + std::unordered_map>& name_to_id, + bool case_sensitive = true, std::function quoting_func = {}); Status Visit(const ListType& type, const std::string& path, const std::string& short_path); @@ -64,8 +65,9 @@ class NameToIdVisitor { private: bool case_sensitive_; - std::unordered_map& name_to_id_; - std::unordered_map short_name_to_id_; + std::unordered_map>& name_to_id_; + std::unordered_map> + short_name_to_id_; std::function quoting_func_; }; @@ -91,7 +93,7 @@ Result>> Schema::FindFie std::string_view name, bool case_sensitive) const { if (case_sensitive) { ICEBERG_RETURN_UNEXPECTED(InitNameToIdMap()); - auto it = name_to_id_.find(std::string(name)); + auto it = name_to_id_.find(name); if (it == name_to_id_.end()) return std::nullopt; return FindFieldById(it->second); } @@ -167,8 +169,8 @@ Status IdToFieldVisitor::VisitNestedType(const Type& type) { } NameToIdVisitor::NameToIdVisitor( - std::unordered_map& name_to_id, bool case_sensitive, - std::function quoting_func) + std::unordered_map>& name_to_id, + bool case_sensitive, std::function quoting_func) : name_to_id_(name_to_id), case_sensitive_(case_sensitive), quoting_func_(std::move(quoting_func)) {} diff --git a/src/iceberg/schema.h b/src/iceberg/schema.h index 1f8db79c2..f9cb3c834 100644 --- a/src/iceberg/schema.h +++ b/src/iceberg/schema.h @@ -35,6 +35,17 @@ namespace iceberg { +/// \brief Transparent hash function that supports std::string_view as lookup key +/// +/// Enables std::unordered_map to directly accept std::string_view lookup keys +/// without creating temporary std::string objects, using C++20's transparent lookup. +struct string_hash { + using hash_type = std::hash; + using is_transparent = void; + + std::size_t operator()(std::string_view str) const { return hash_type{}(str); } +}; + /// \brief A schema for a Table. /// /// A schema is a list of typed columns, along with a unique integer ID. A @@ -78,9 +89,11 @@ class ICEBERG_EXPORT Schema : public StructType { mutable std::unordered_map> id_to_field_; /// Mapping from field name to field id. - mutable std::unordered_map name_to_id_; + mutable std::unordered_map> + name_to_id_; /// Mapping from lowercased field name to field id - mutable std::unordered_map lowercase_name_to_id_; + mutable std::unordered_map> + lowercase_name_to_id_; private: /// \brief Compare two schemas for equality. From 52b53ca31c6040938759b2dd5832680b4a60c237 Mon Sep 17 00:00:00 2001 From: nullccxsy <32149055912@qq.com> Date: Wed, 27 Aug 2025 18:38:26 +0800 Subject: [PATCH 09/14] fix comments --- src/iceberg/schema.cc | 1 - src/iceberg/schema.h | 12 +----------- src/iceberg/util/string_util.h | 11 +++++++++++ 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/iceberg/schema.cc b/src/iceberg/schema.cc index 1633e2d35..d6652ad7b 100644 --- a/src/iceberg/schema.cc +++ b/src/iceberg/schema.cc @@ -26,7 +26,6 @@ #include "iceberg/type.h" #include "iceberg/util/formatter.h" // IWYU pragma: keep #include "iceberg/util/macros.h" -#include "iceberg/util/string_utils.h" #include "iceberg/util/visit_type.h" namespace iceberg { diff --git a/src/iceberg/schema.h b/src/iceberg/schema.h index f9cb3c834..f20d1a67f 100644 --- a/src/iceberg/schema.h +++ b/src/iceberg/schema.h @@ -32,20 +32,10 @@ #include "iceberg/result.h" #include "iceberg/schema_field.h" #include "iceberg/type.h" +#include "iceberg/util/string_utils.h" namespace iceberg { -/// \brief Transparent hash function that supports std::string_view as lookup key -/// -/// Enables std::unordered_map to directly accept std::string_view lookup keys -/// without creating temporary std::string objects, using C++20's transparent lookup. -struct string_hash { - using hash_type = std::hash; - using is_transparent = void; - - std::size_t operator()(std::string_view str) const { return hash_type{}(str); } -}; - /// \brief A schema for a Table. /// /// A schema is a list of typed columns, along with a unique integer ID. A diff --git a/src/iceberg/util/string_util.h b/src/iceberg/util/string_util.h index 558fc293c..176d7428e 100644 --- a/src/iceberg/util/string_util.h +++ b/src/iceberg/util/string_util.h @@ -46,4 +46,15 @@ class ICEBERG_EXPORT StringUtils { } }; +/// \brief Transparent hash function that supports std::string_view as lookup key +/// +/// Enables std::unordered_map to directly accept std::string_view lookup keys +/// without creating temporary std::string objects, using C++20's transparent lookup. +struct string_hash { + using hash_type = std::hash; + using is_transparent = void; + + std::size_t operator()(std::string_view str) const { return hash_type{}(str); } +}; + } // namespace iceberg From 9aa221d8e54ebf7533fb2956d46517ce93e73894 Mon Sep 17 00:00:00 2001 From: nullccxsy <32149055912@qq.com> Date: Wed, 27 Aug 2025 18:55:18 +0800 Subject: [PATCH 10/14] fix comments --- src/iceberg/schema.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/iceberg/schema.h b/src/iceberg/schema.h index f20d1a67f..c24dbc277 100644 --- a/src/iceberg/schema.h +++ b/src/iceberg/schema.h @@ -32,7 +32,7 @@ #include "iceberg/result.h" #include "iceberg/schema_field.h" #include "iceberg/type.h" -#include "iceberg/util/string_utils.h" +#include "iceberg/util/string_util.h" namespace iceberg { From 0e0d42c5684be853f62df6093db15301805d66cf Mon Sep 17 00:00:00 2001 From: nullccxsy <32149055912@qq.com> Date: Thu, 28 Aug 2025 14:50:22 +0800 Subject: [PATCH 11/14] fix comments --- src/iceberg/util/string_util.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/iceberg/util/string_util.h b/src/iceberg/util/string_util.h index 176d7428e..0ecf8d01d 100644 --- a/src/iceberg/util/string_util.h +++ b/src/iceberg/util/string_util.h @@ -50,7 +50,7 @@ class ICEBERG_EXPORT StringUtils { /// /// Enables std::unordered_map to directly accept std::string_view lookup keys /// without creating temporary std::string objects, using C++20's transparent lookup. -struct string_hash { +struct ICEBERG_EXPORT string_hash { using hash_type = std::hash; using is_transparent = void; From 7223081cce3c73317322c26f251a27ce14fe86a9 Mon Sep 17 00:00:00 2001 From: nullccxsy <32149055912@qq.com> Date: Sun, 31 Aug 2025 17:00:25 +0800 Subject: [PATCH 12/14] fix comments --- src/iceberg/schema.cc | 16 +++++----------- src/iceberg/util/string_util.h | 2 ++ 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/src/iceberg/schema.cc b/src/iceberg/schema.cc index d6652ad7b..6d6a7da54 100644 --- a/src/iceberg/schema.cc +++ b/src/iceberg/schema.cc @@ -35,8 +35,8 @@ class IdToFieldVisitor { explicit IdToFieldVisitor( std::unordered_map>& id_to_field); - Status Visit(const Type& type); - Status VisitNestedType(const Type& type); + Status Visit(const PrimitiveType& type); + Status Visit(const NestedType& type); private: std::unordered_map>& id_to_field_; @@ -147,14 +147,9 @@ IdToFieldVisitor::IdToFieldVisitor( std::unordered_map>& id_to_field) : id_to_field_(id_to_field) {} -Status IdToFieldVisitor::Visit(const Type& type) { - if (type.is_nested()) { - ICEBERG_RETURN_UNEXPECTED(VisitNestedType(type)); - } - return {}; -} +Status IdToFieldVisitor::Visit(const PrimitiveType& type) { return {}; } -Status IdToFieldVisitor::VisitNestedType(const Type& type) { +Status IdToFieldVisitor::Visit(const NestedType& type) { const auto& nested = iceberg::internal::checked_cast(type); const auto& fields = nested.fields(); for (const auto& field : fields) { @@ -162,7 +157,7 @@ Status IdToFieldVisitor::VisitNestedType(const Type& type) { if (!it.second) { return InvalidSchema("Duplicate field id found: {}", field.field_id()); } - ICEBERG_RETURN_UNEXPECTED(Visit(*field.type())); + ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*field.type(), this)); } return {}; } @@ -256,7 +251,6 @@ std::string NameToIdVisitor::BuildPath(std::string_view prefix, } return prefix.empty() ? StringUtils::ToLower(quoted_name) : std::string(prefix) + "." + StringUtils::ToLower(quoted_name); - ; } void NameToIdVisitor::Finish() { diff --git a/src/iceberg/util/string_util.h b/src/iceberg/util/string_util.h index 0ecf8d01d..6b0bfe043 100644 --- a/src/iceberg/util/string_util.h +++ b/src/iceberg/util/string_util.h @@ -55,6 +55,8 @@ struct ICEBERG_EXPORT string_hash { using is_transparent = void; std::size_t operator()(std::string_view str) const { return hash_type{}(str); } + std::size_t operator()(const char* str) const { return hash_type{}(str); } + std::size_t operator()(const std::string& str) const { return hash_type{}(str); } }; } // namespace iceberg From 54b0c041db562cf70fba851eb0f31d0c45070ed8 Mon Sep 17 00:00:00 2001 From: nullccxsy <32149055912@qq.com> Date: Mon, 1 Sep 2025 14:10:03 +0800 Subject: [PATCH 13/14] fix comments --- src/iceberg/schema.cc | 11 +++++------ src/iceberg/schema.h | 24 +++++++++++------------- src/iceberg/util/macros.h | 12 ++++++------ src/iceberg/util/string_util.h | 2 +- 4 files changed, 23 insertions(+), 26 deletions(-) diff --git a/src/iceberg/schema.cc b/src/iceberg/schema.cc index 6d6a7da54..0b7f5f9ae 100644 --- a/src/iceberg/schema.cc +++ b/src/iceberg/schema.cc @@ -45,7 +45,7 @@ class IdToFieldVisitor { class NameToIdVisitor { public: explicit NameToIdVisitor( - std::unordered_map>& name_to_id, + std::unordered_map>& name_to_id, bool case_sensitive = true, std::function quoting_func = {}); Status Visit(const ListType& type, const std::string& path, @@ -64,9 +64,8 @@ class NameToIdVisitor { private: bool case_sensitive_; - std::unordered_map>& name_to_id_; - std::unordered_map> - short_name_to_id_; + std::unordered_map>& name_to_id_; + std::unordered_map> short_name_to_id_; std::function quoting_func_; }; @@ -150,7 +149,7 @@ IdToFieldVisitor::IdToFieldVisitor( Status IdToFieldVisitor::Visit(const PrimitiveType& type) { return {}; } Status IdToFieldVisitor::Visit(const NestedType& type) { - const auto& nested = iceberg::internal::checked_cast(type); + const auto& nested = internal::checked_cast(type); const auto& fields = nested.fields(); for (const auto& field : fields) { auto it = id_to_field_.try_emplace(field.field_id(), std::cref(field)); @@ -163,7 +162,7 @@ Status IdToFieldVisitor::Visit(const NestedType& type) { } NameToIdVisitor::NameToIdVisitor( - std::unordered_map>& name_to_id, + std::unordered_map>& name_to_id, bool case_sensitive, std::function quoting_func) : name_to_id_(name_to_id), case_sensitive_(case_sensitive), diff --git a/src/iceberg/schema.h b/src/iceberg/schema.h index c24dbc277..1de829c80 100644 --- a/src/iceberg/schema.h +++ b/src/iceberg/schema.h @@ -74,28 +74,26 @@ class ICEBERG_EXPORT Schema : public StructType { friend bool operator==(const Schema& lhs, const Schema& rhs) { return lhs.Equals(rhs); } - private: - /// Mapping from field id to field. - mutable std::unordered_map> - id_to_field_; - /// Mapping from field name to field id. - mutable std::unordered_map> - name_to_id_; - /// Mapping from lowercased field name to field id - mutable std::unordered_map> - lowercase_name_to_id_; - private: /// \brief Compare two schemas for equality. [[nodiscard]] bool Equals(const Schema& other) const; - const std::optional schema_id_; - // TODO(nullccxsy): Address potential concurrency issues in lazy initialization (e.g., // use std::call_once) Status InitIdToFieldMap() const; Status InitNameToIdMap() const; Status InitLowerCaseNameToIdMap() const; + + const std::optional schema_id_; + /// Mapping from field id to field. + mutable std::unordered_map> + id_to_field_; + /// Mapping from field name to field id. + mutable std::unordered_map> + name_to_id_; + /// Mapping from lowercased field name to field id + mutable std::unordered_map> + lowercase_name_to_id_; }; } // namespace iceberg diff --git a/src/iceberg/util/macros.h b/src/iceberg/util/macros.h index ad0b61624..3519c9a63 100644 --- a/src/iceberg/util/macros.h +++ b/src/iceberg/util/macros.h @@ -19,12 +19,12 @@ #pragma once -#define ICEBERG_RETURN_UNEXPECTED(result) \ - do { \ - auto&& iceberg_temp_result = (result); \ - if (!iceberg_temp_result) [[unlikely]] { \ - return std::unexpected(iceberg_temp_result.error()); \ - } \ +#define ICEBERG_RETURN_UNEXPECTED(result) \ + do { \ + auto&& result_name = (result); \ + if (!result_name) [[unlikely]] { \ + return std::unexpected(result_name.error()); \ + } \ } while (false); #define ICEBERG_ASSIGN_OR_RAISE_IMPL(result_name, lhs, rexpr) \ diff --git a/src/iceberg/util/string_util.h b/src/iceberg/util/string_util.h index 6b0bfe043..a0fccfd35 100644 --- a/src/iceberg/util/string_util.h +++ b/src/iceberg/util/string_util.h @@ -50,7 +50,7 @@ class ICEBERG_EXPORT StringUtils { /// /// Enables std::unordered_map to directly accept std::string_view lookup keys /// without creating temporary std::string objects, using C++20's transparent lookup. -struct ICEBERG_EXPORT string_hash { +struct ICEBERG_EXPORT StringHash { using hash_type = std::hash; using is_transparent = void; From 071c9ce70cb696849cb090fb46e29719deea89e5 Mon Sep 17 00:00:00 2001 From: nullccxsy <32149055912@qq.com> Date: Mon, 1 Sep 2025 15:05:07 +0800 Subject: [PATCH 14/14] fix: clean headers --- src/iceberg/schema.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/src/iceberg/schema.cc b/src/iceberg/schema.cc index 0b7f5f9ae..0b67dfb0d 100644 --- a/src/iceberg/schema.cc +++ b/src/iceberg/schema.cc @@ -19,7 +19,6 @@ #include "iceberg/schema.h" -#include #include #include