Skip to content

Commit 3b0653a

Browse files
author
nullccxsy
committed
feat(schema): implement nested field lookup with name pruning
- Add GetFieldByName method to support nested field queries with dot notation
- Implement InitNameToIndexMap and InitIdToIndexMap for field mapping
- Introduce SchemaFieldVisitor for schema traversal
- Implement name pruning logic for list `element` and map `value` fields in nested structures
1 parent f2d0abd commit 3b0653a

File tree

5 files changed

+264
-8
lines changed

5 files changed

+264
-8
lines changed

src/iceberg/schema.cc

Lines changed: 129 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,9 @@
2727
namespace iceberg {
2828

2929
Schema::Schema(std::vector<SchemaField> fields, std::optional<int32_t> schema_id)
30-
: StructType(std::move(fields)), schema_id_(schema_id) {}
30+
: StructType(std::move(fields)), schema_id_(schema_id) {
31+
InitIdToIndexMap();
32+
}
3133

3234
std::optional<int32_t> Schema::schema_id() const { return schema_id_; }
3335

@@ -44,4 +46,130 @@ bool Schema::Equals(const Schema& other) const {
4446
return schema_id_ == other.schema_id_ && fields_ == other.fields_;
4547
}
4648

49+
/// \brief Look up a top-level or nested field by its dot-separated name.
///
/// The lookup goes through the lazily built name indexes, which hold both the
/// full dotted paths and the shortened (pruned) paths of every field.
///
/// \param name Field name; nested fields use dot notation, e.g. "a.b.c".
/// \param case_sensitive When false, `name` is lower-cased and matched against
///        the lower-cased name index instead of the exact-name index.
/// \return The matching field, or std::nullopt when the index has no entry
///         for `name`.
std::optional<std::reference_wrapper<const SchemaField>> Schema::GetFieldByName(
    std::string_view name, bool case_sensitive) const {
  if (case_sensitive) {
    InitNameToIndexMap();
    const auto it = name_to_index_.find(std::string(name));
    if (it == name_to_index_.end()) return std::nullopt;
    return full_schemafield_[it->second];
  }

  InitLowerCaseNameToIndexMap();
  std::string lower_name(name);
  // Route every byte through unsigned char: calling tolower with a negative
  // value (any non-ASCII byte where char is signed) is undefined behavior.
  std::ranges::transform(lower_name, lower_name.begin(), [](unsigned char c) {
    return static_cast<char>(::tolower(c));
  });
  const auto it = lowercase_name_to_index_.find(lower_name);
  if (it == lowercase_name_to_index_.end()) return std::nullopt;
  return full_schemafield_[it->second];
}
64+
65+
/// \brief Case-sensitive lookup of a top-level or nested field by name.
///
/// Convenience overload that forwards to the two-argument GetFieldByName with
/// `case_sensitive` set to true.
std::optional<std::reference_wrapper<const SchemaField>> Schema::GetFieldByName(
    std::string_view name) const {
  constexpr bool kCaseSensitive = true;
  return GetFieldByName(name, kCaseSensitive);
}
69+
70+
void Schema::InitIdToIndexMap() const {
71+
if (!id_to_index_.empty()) {
72+
return;
73+
}
74+
SchemaFieldVisitor visitor;
75+
auto result = VisitTypeInline(*this, &visitor, id_to_index_, full_schemafield_);
76+
}
77+
78+
void Schema::InitNameToIndexMap() const {
79+
if (!name_to_index_.empty()) {
80+
return;
81+
}
82+
int index = 0;
83+
std::string_view path, short_path;
84+
SchemaFieldVisitor visitor;
85+
std::unordered_map<std::string, size_t> shortname_to_index;
86+
VisitTypeInline(*this, &visitor, name_to_index_, path, shortname_to_index, short_path,
87+
index, true);
88+
for (const auto& [key, value] : shortname_to_index) {
89+
if (!name_to_index_.count(key)) {
90+
name_to_index_[key] = value;
91+
}
92+
}
93+
}
94+
95+
void Schema::InitLowerCaseNameToIndexMap() const {
96+
if (!lowercase_name_to_index_.empty()) {
97+
return;
98+
}
99+
int index = 0;
100+
std::string_view path, short_path;
101+
SchemaFieldVisitor visitor;
102+
std::unordered_map<std::string, size_t> shortlowercasename_to_index;
103+
VisitTypeInline(*this, &visitor, lowercase_name_to_index_, path,
104+
shortlowercasename_to_index, short_path, index, false);
105+
for (const auto& [key, value] : shortlowercasename_to_index) {
106+
if (!lowercase_name_to_index_.count(key)) {
107+
lowercase_name_to_index_[key] = value;
108+
}
109+
}
110+
}
111+
112+
/// \brief Look up a top-level or nested field by its field id.
///
/// \param field_id Id of the field to find.
/// \return The field stored under `field_id` in the id index, or std::nullopt
///         when the index has no such id.
std::optional<std::reference_wrapper<const SchemaField>> Schema::GetFieldById(
    int32_t field_id) const {
  InitIdToIndexMap();
  if (const auto entry = id_to_index_.find(field_id); entry != id_to_index_.end()) {
    return full_schemafield_[entry->second];
  }
  return std::nullopt;
}
121+
122+
/// \brief Recursively record every field below `type` in the id index.
///
/// Fields are appended to `full_schemafield` in pre-order (a field first, then
/// its nested children), and `id_to_index` maps each field id to the position
/// at which that field was appended.
///
/// \param type Nested type whose fields are visited.
/// \param id_to_index Output map from field id to position in `full_schemafield`.
/// \param full_schemafield Output list receiving a copy of every visited field.
/// \return A default-constructed Status.
Status SchemaFieldVisitor::Visit(const Type& type,
                                 std::unordered_map<int, size_t>& id_to_index,
                                 std::vector<SchemaField>& full_schemafield) {
  const auto& parent = iceberg::internal::checked_cast<const NestedType&>(type);
  for (const auto& child : parent.fields()) {
    const size_t position = full_schemafield.size();
    id_to_index[child.field_id()] = position;
    full_schemafield.emplace_back(child);
    if (!child.type()->is_nested()) continue;
    Visit(*child.type(), id_to_index, full_schemafield);
  }
  return {};
}
135+
/// \brief Join a parent path and a field name with '.'.
///
/// \param last_path Path of the enclosing field; empty for a top-level field.
/// \param field_name Name of the field to append.
/// \param case_sensitive When false, `field_name` is lower-cased before it is
///        appended; `last_path` is used exactly as given.
/// \return The (possibly lower-cased) field name alone when `last_path` is
///         empty, otherwise `last_path`, a '.', and the field name.
std::string SchemaFieldVisitor::GetPath(const std::string& last_path,
                                        const std::string& field_name,
                                        bool case_sensitive) {
  std::string segment(field_name);
  if (!case_sensitive) {
    // Route every byte through unsigned char: calling tolower with a negative
    // value (any non-ASCII byte where char is signed) is undefined behavior.
    std::ranges::transform(segment, segment.begin(), [](unsigned char c) {
      return static_cast<char>(::tolower(c));
    });
  }
  if (last_path.empty()) return segment;
  return last_path + "." + segment;
}
145+
146+
/// \brief Recursively index every field below `type` by dotted name.
///
/// Two names are recorded per field: the full path (every enclosing field name
/// joined with '.') in `name_to_index`, and a short path in
/// `shortname_to_index`. The short path leaves out the field's own name when
/// the field is a struct that is either a child of a list, or the "value"
/// child of a map.
///
/// \param type Nested type whose fields are visited.
/// \param name_to_index Output map from full path to field index.
/// \param path Full path of `type` itself; empty at the root.
/// \param shortname_to_index Output map from short path to field index.
/// \param short_path Short path of `type` itself; empty at the root.
/// \param index Running pre-order counter; the current value is assigned to
///        each visited field and then incremented.
/// \param case_sensitive When false, field names are lower-cased as they are
///        appended to the paths.
/// \return A default-constructed Status.
Status SchemaFieldVisitor::Visit(
    const Type& type, std::unordered_map<std::string, size_t>& name_to_index,
    std::string_view path, std::unordered_map<std::string, size_t>& shortname_to_index,
    std::string_view short_path, int& index, bool case_sensitive) {
  const auto& nested = iceberg::internal::checked_cast<const NestedType&>(type);

  // Loop-invariant: materialize the parent paths and parent kind once rather
  // than once per field.
  const std::string parent_path(path);
  const std::string parent_short_path(short_path);
  const bool parent_is_list = type.type_id() == TypeId::kList;
  const bool parent_is_map = type.type_id() == TypeId::kMap;

  for (const auto& field : nested.fields()) {
    const std::string field_name(field.name());
    const std::string full_path = GetPath(parent_path, field_name, case_sensitive);
    name_to_index[full_path] = static_cast<size_t>(index);

    // Name pruning: a struct nested directly in a list, or used as a map's
    // "value", contributes no segment of its own to the short path.
    const bool is_struct = field.type()->type_id() == TypeId::kStruct;
    const bool prune_own_name =
        is_struct && (parent_is_list || (parent_is_map && field.name() == "value"));
    const std::string short_full_path =
        prune_own_name ? parent_short_path
                       : GetPath(parent_short_path, field_name, case_sensitive);

    shortname_to_index[short_full_path] = static_cast<size_t>(index);
    ++index;

    if (field.type()->is_nested()) {
      Visit(*field.type(), name_to_index, full_path, shortname_to_index,
            short_full_path, index, case_sensitive);
    }
  }
  return {};
}
174+
47175
} // namespace iceberg

src/iceberg/schema.h

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
/// Schemas for Iceberg tables. This header contains the definition of Schema
2424
/// and any utility functions. See iceberg/type.h and iceberg/field.h as well.
2525

26+
#include <algorithm>
2627
#include <cstdint>
2728
#include <optional>
2829
#include <string>
@@ -31,6 +32,7 @@
3132
#include "iceberg/iceberg_export.h"
3233
#include "iceberg/schema_field.h"
3334
#include "iceberg/type.h"
35+
#include "iceberg/util/visit_type.h"
3436

3537
namespace iceberg {
3638

@@ -54,13 +56,42 @@ class ICEBERG_EXPORT Schema : public StructType {
5456

5557
[[nodiscard]] std::string ToString() const override;
5658

59+
[[nodiscard]] std::optional<std::reference_wrapper<const SchemaField>> GetFieldByName(
60+
std::string_view name, bool case_sensitive) const override;
61+
62+
[[nodiscard]] std::optional<std::reference_wrapper<const SchemaField>> GetFieldByName(
63+
std::string_view name) const;
64+
65+
[[nodiscard]] std::optional<std::reference_wrapper<const SchemaField>> GetFieldById(
66+
int32_t field_id) const override;
67+
5768
friend bool operator==(const Schema& lhs, const Schema& rhs) { return lhs.Equals(rhs); }
5869

70+
mutable std::unordered_map<int, size_t> id_to_index_;
71+
mutable std::unordered_map<std::string, size_t> name_to_index_;
72+
mutable std::unordered_map<std::string, size_t> lowercase_name_to_index_;
73+
mutable std::vector<SchemaField> full_schemafield_;
74+
5975
private:
6076
/// \brief Compare two schemas for equality.
6177
[[nodiscard]] bool Equals(const Schema& other) const;
6278

6379
const std::optional<int32_t> schema_id_;
80+
81+
void InitIdToIndexMap() const;
82+
void InitNameToIndexMap() const;
83+
void InitLowerCaseNameToIndexMap() const;
6484
};
6585

86+
class SchemaFieldVisitor {
87+
public:
88+
Status Visit(const Type& type, std::unordered_map<int, size_t>& id_to_index,
89+
std::vector<SchemaField>& full_schemafield);
90+
std::string GetPath(const std::string& last_path, const std::string& field_name,
91+
bool case_sensitive);
92+
Status Visit(const Type& type, std::unordered_map<std::string, size_t>& name_to_index,
93+
std::string_view path,
94+
std::unordered_map<std::string, size_t>& shortname_to_index,
95+
std::string_view short_path, int& index, bool case_sensitive);
96+
};
6697
} // namespace iceberg

src/iceberg/type.cc

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,10 @@
2727
#include "iceberg/util/formatter.h" // IWYU pragma: keep
2828

2929
namespace iceberg {
30+
/// \brief Case-sensitive convenience overload; forwards to the virtual
/// two-argument GetFieldByName with `case_sensitive` set to true.
std::optional<std::reference_wrapper<const SchemaField>> NestedType::GetFieldByName(
    std::string_view name) const {
  constexpr bool kCaseSensitive = true;
  return GetFieldByName(name, kCaseSensitive);
}
3034

3135
StructType::StructType(std::vector<SchemaField> fields) : fields_(std::move(fields)) {
3236
size_t index = 0;
@@ -67,7 +71,7 @@ std::optional<std::reference_wrapper<const SchemaField>> StructType::GetFieldByI
6771
return fields_[index];
6872
}
6973
std::optional<std::reference_wrapper<const SchemaField>> StructType::GetFieldByName(
70-
std::string_view name) const {
74+
std::string_view name, bool case_sensitive) const {
7175
// N.B. duplicate names are not permitted (looking at the Java
7276
// implementation) so there is nothing in particular we need to do here
7377
for (const auto& field : fields_) {
@@ -84,6 +88,10 @@ bool StructType::Equals(const Type& other) const {
8488
const auto& struct_ = static_cast<const StructType&>(other);
8589
return fields_ == struct_.fields_;
8690
}
91+
/// \brief Case-sensitive convenience overload; forwards to the two-argument
/// StructType::GetFieldByName with `case_sensitive` set to true.
std::optional<std::reference_wrapper<const SchemaField>> StructType::GetFieldByName(
    std::string_view name) const {
  constexpr bool kCaseSensitive = true;
  return GetFieldByName(name, kCaseSensitive);
}
8795

8896
ListType::ListType(SchemaField element) : element_(std::move(element)) {
8997
if (element_.name() != kElementName) {
@@ -120,7 +128,7 @@ std::optional<std::reference_wrapper<const SchemaField>> ListType::GetFieldByInd
120128
return std::nullopt;
121129
}
122130
std::optional<std::reference_wrapper<const SchemaField>> ListType::GetFieldByName(
123-
std::string_view name) const {
131+
std::string_view name, bool case_sensitive) const {
124132
if (name == element_.name()) {
125133
return std::cref(element_);
126134
}
@@ -133,6 +141,10 @@ bool ListType::Equals(const Type& other) const {
133141
const auto& list = static_cast<const ListType&>(other);
134142
return element_ == list.element_;
135143
}
144+
/// \brief Case-sensitive convenience overload; forwards to the two-argument
/// ListType::GetFieldByName.
///
/// Passes `case_sensitive` = true so the single-argument lookup keeps exact-name
/// semantics, matching the single-argument overloads on NestedType, StructType
/// and Schema.
std::optional<std::reference_wrapper<const SchemaField>> ListType::GetFieldByName(
    std::string_view name) const {
  return GetFieldByName(name, /*case_sensitive=*/true);
}
136148

137149
MapType::MapType(SchemaField key, SchemaField value)
138150
: fields_{std::move(key), std::move(value)} {
@@ -178,7 +190,7 @@ std::optional<std::reference_wrapper<const SchemaField>> MapType::GetFieldByInde
178190
return std::nullopt;
179191
}
180192
std::optional<std::reference_wrapper<const SchemaField>> MapType::GetFieldByName(
181-
std::string_view name) const {
193+
std::string_view name, bool case_sensitive) const {
182194
if (name == kKeyName) {
183195
return key();
184196
} else if (name == kValueName) {
@@ -193,6 +205,10 @@ bool MapType::Equals(const Type& other) const {
193205
const auto& map = static_cast<const MapType&>(other);
194206
return fields_ == map.fields_;
195207
}
208+
/// \brief Case-sensitive convenience overload; forwards to the two-argument
/// MapType::GetFieldByName.
///
/// Passes `case_sensitive` = true so the single-argument lookup keeps exact-name
/// semantics, matching the single-argument overloads on NestedType, StructType
/// and Schema.
std::optional<std::reference_wrapper<const SchemaField>> MapType::GetFieldByName(
    std::string_view name) const {
  return GetFieldByName(name, /*case_sensitive=*/true);
}
196212

197213
TypeId BooleanType::type_id() const { return kTypeId; }
198214
std::string BooleanType::ToString() const { return "boolean"; }

src/iceberg/type.h

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,10 @@ class ICEBERG_EXPORT NestedType : public Type {
9191
///
9292
/// \note This is currently O(n) complexity.
9393
[[nodiscard]] virtual std::optional<std::reference_wrapper<const SchemaField>>
94-
GetFieldByName(std::string_view name) const = 0;
94+
GetFieldByName(std::string_view name, bool case_sensitive) const = 0;
95+
96+
std::optional<std::reference_wrapper<const SchemaField>> GetFieldByName(
97+
std::string_view name) const;
9598
};
9699

97100
/// \defgroup type-nested Nested Types
@@ -114,7 +117,9 @@ class ICEBERG_EXPORT StructType : public NestedType {
114117
std::optional<std::reference_wrapper<const SchemaField>> GetFieldByIndex(
115118
int32_t index) const override;
116119
std::optional<std::reference_wrapper<const SchemaField>> GetFieldByName(
117-
std::string_view name) const override;
120+
std::string_view name, bool case_sensitive) const override;
121+
std::optional<std::reference_wrapper<const SchemaField>> GetFieldByName(
122+
std::string_view name) const;
118123

119124
protected:
120125
bool Equals(const Type& other) const override;
@@ -145,7 +150,9 @@ class ICEBERG_EXPORT ListType : public NestedType {
145150
std::optional<std::reference_wrapper<const SchemaField>> GetFieldByIndex(
146151
int32_t index) const override;
147152
std::optional<std::reference_wrapper<const SchemaField>> GetFieldByName(
148-
std::string_view name) const override;
153+
std::string_view name, bool case_sensitive) const override;
154+
std::optional<std::reference_wrapper<const SchemaField>> GetFieldByName(
155+
std::string_view name) const;
149156

150157
protected:
151158
bool Equals(const Type& other) const override;
@@ -177,7 +184,9 @@ class ICEBERG_EXPORT MapType : public NestedType {
177184
std::optional<std::reference_wrapper<const SchemaField>> GetFieldByIndex(
178185
int32_t index) const override;
179186
std::optional<std::reference_wrapper<const SchemaField>> GetFieldByName(
180-
std::string_view name) const override;
187+
std::string_view name, bool case_sensitive) const override;
188+
std::optional<std::reference_wrapper<const SchemaField>> GetFieldByName(
189+
std::string_view name) const;
181190

182191
protected:
183192
bool Equals(const Type& other) const override;

test/schema_test.cc

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,3 +81,75 @@ TEST(SchemaTest, Equality) {
8181
ASSERT_EQ(schema1, schema5);
8282
ASSERT_EQ(schema5, schema1);
8383
}
84+
85+
// Builds a schema with one field, Value: map<int32, list<struct<Foo, Bar, Foobar>>>,
// and checks lookup by id, by full dotted name (exact and case-insensitive),
// and by short name with the list "element" segment omitted.
TEST(SchemaTest, NestedType) {
  using ::testing::Optional;
  using iceberg::SchemaField;

  // Leaf fields of the innermost struct.
  SchemaField field1(1, "Foo", iceberg::int32(), true);
  SchemaField field2(2, "Bar", iceberg::string(), true);
  SchemaField field3(3, "Foobar", iceberg::int32(), true);

  iceberg::StructType structtype = iceberg::StructType({field1, field2, field3});

  // list<struct>, then map<int32, list<struct>>.
  auto listype = iceberg::ListType(SchemaField::MakeRequired(
      4, "element", std::make_shared<iceberg::StructType>(structtype)));
  auto maptype = iceberg::MapType(
      SchemaField::MakeRequired(5, "key", iceberg::int32()),
      SchemaField::MakeRequired(6, "value",
                                std::make_shared<iceberg::ListType>(listype)));

  // Expected values for the intermediate fields, built independently of the
  // instances stored inside the nested types above.
  auto field4 = SchemaField::MakeRequired(
      4, "element", std::make_shared<iceberg::StructType>(structtype));
  auto field5 = SchemaField::MakeRequired(5, "key", iceberg::int32());
  auto field6 = SchemaField::MakeRequired(
      6, "value", std::make_shared<iceberg::ListType>(listype));
  auto field7 =
      SchemaField::MakeRequired(7, "Value", std::make_shared<iceberg::MapType>(maptype));

  iceberg::Schema schema({field7}, 1);

  // Lookup by field id; the flattened field list must not grow afterwards.
  ASSERT_EQ(schema.full_schemafield_.size(), 7);
  ASSERT_THAT(schema.GetFieldById(7), Optional(field7));
  ASSERT_THAT(schema.GetFieldById(6), Optional(field6));
  ASSERT_THAT(schema.GetFieldById(5), Optional(field5));
  ASSERT_THAT(schema.GetFieldById(4), Optional(field4));
  ASSERT_THAT(schema.GetFieldById(3), Optional(field3));
  ASSERT_THAT(schema.GetFieldById(2), Optional(field2));
  ASSERT_THAT(schema.GetFieldById(1), Optional(field1));
  ASSERT_EQ(schema.full_schemafield_.size(), 7);

  // Case-sensitive lookup by full dotted path.
  ASSERT_THAT(schema.GetFieldByName("Value"), Optional(field7));
  ASSERT_THAT(schema.GetFieldByName("Value.value"), Optional(field6));
  ASSERT_THAT(schema.GetFieldByName("Value.key"), Optional(field5));
  ASSERT_THAT(schema.GetFieldByName("Value.value.element"), Optional(field4));
  ASSERT_THAT(schema.GetFieldByName("Value.value.element.Foobar"), Optional(field3));
  ASSERT_THAT(schema.GetFieldByName("Value.value.element.Bar"), Optional(field2));
  ASSERT_THAT(schema.GetFieldByName("Value.value.element.Foo"), Optional(field1));
  ASSERT_EQ(schema.full_schemafield_.size(), 7);

  // Case-insensitive lookup by full dotted path.
  ASSERT_THAT(schema.GetFieldByName("vALue", false), Optional(field7));
  ASSERT_THAT(schema.GetFieldByName("vALue.VALUE", false), Optional(field6));
  ASSERT_THAT(schema.GetFieldByName("valUe.kEy", false), Optional(field5));
  ASSERT_THAT(schema.GetFieldByName("vaLue.vAlue.elEment", false), Optional(field4));
  ASSERT_THAT(schema.GetFieldByName("vaLue.vAlue.eLement.fOObar", false),
              Optional(field3));
  ASSERT_THAT(schema.GetFieldByName("valUe.vaLUe.elemEnt.Bar", false),
              Optional(field2));
  ASSERT_THAT(schema.GetFieldByName("valUe.valUe.ELEMENT.FOO", false),
              Optional(field1));
  ASSERT_EQ(schema.full_schemafield_.size(), 7);

  // Case-insensitive lookup by short path ("element" omitted).
  ASSERT_THAT(schema.GetFieldByName("vaLue.value.FOO", false), Optional(field1));
  ASSERT_THAT(schema.GetFieldByName("Value.value.Bar", false), Optional(field2));
  ASSERT_THAT(schema.GetFieldByName("Value.value.FooBAR", false), Optional(field3));

  // Case-sensitive lookup by short path ("element" omitted).
  ASSERT_THAT(schema.GetFieldByName("Value.value.Foo"), Optional(field1));
  ASSERT_THAT(schema.GetFieldByName("Value.value.Bar"), Optional(field2));
  ASSERT_THAT(schema.GetFieldByName("Value.value.Foobar"), Optional(field3));
}

0 commit comments

Comments
 (0)