Skip to content

Commit ae4515e

Browse files
author
nullccxsy
committed
feat(schema): implement nested field lookup with name pruning
- Add GetFieldByName method to support nested field queries with dot notation
- Implement InitNameToIndexMap and InitIdToIndexMap for field mapping
- Introduce SchemaFieldVisitor for schema traversal
- Implement name pruning logic for element and value in nested structures
1 parent f2d0abd commit ae4515e

File tree

5 files changed

+277
-9
lines changed

5 files changed

+277
-9
lines changed

src/iceberg/schema.cc

Lines changed: 142 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,13 +21,15 @@
2121

2222
#include <format>
2323

24+
#include "iceberg/exception.h"
2425
#include "iceberg/type.h"
2526
#include "iceberg/util/formatter.h" // IWYU pragma: keep
26-
2727
namespace iceberg {
2828

2929
Schema::Schema(std::vector<SchemaField> fields, std::optional<int32_t> schema_id)
30-
: StructType(std::move(fields)), schema_id_(schema_id) {}
30+
: StructType(std::move(fields)), schema_id_(schema_id) {
31+
InitIdToIndexMap();
32+
}
3133

3234
std::optional<int32_t> Schema::schema_id() const { return schema_id_; }
3335

@@ -44,4 +46,142 @@ bool Schema::Equals(const Schema& other) const {
4446
return schema_id_ == other.schema_id_ && fields_ == other.fields_;
4547
}
4648

49+
/// \brief Look up a field, top-level or nested, by its dot-separated name.
///
/// The lookup goes through the lazily built name index (case-sensitive) or
/// the lowercase name index (case-insensitive); the index value is a position
/// in full_schemafield_.
///
/// \param name Dotted field name to search for.
/// \param case_sensitive If true, `name` is matched as given; otherwise it is
///   lowercased before being looked up in the lowercase index.
/// \return The matching field, or std::nullopt when no entry exists.
/// \throws IcebergError if building the required name index fails.
std::optional<std::reference_wrapper<const SchemaField>> Schema::GetFieldByName(
    std::string_view name, bool case_sensitive) const {
  std::string key(name);
  if (case_sensitive) {
    InitNameToIndexMap();
  } else {
    InitLowerCaseNameToIndexMap();
    // tolower() requires a value representable as unsigned char; passing a
    // negative plain char (any non-ASCII byte) is undefined behavior.
    std::ranges::transform(key, key.begin(), [](unsigned char c) {
      return static_cast<char>(::tolower(c));
    });
  }

  const auto& index_by_name = case_sensitive ? name_to_index_ : lowercase_name_to_index_;
  const auto it = index_by_name.find(key);
  if (it == index_by_name.end()) return std::nullopt;
  return full_schemafield_[it->second];
}
64+
65+
/// \brief Convenience overload: look up a field by name, case-sensitively.
///
/// \param name Dotted field name to search for.
/// \return Whatever the two-argument overload returns for case_sensitive = true.
std::optional<std::reference_wrapper<const SchemaField>> Schema::GetFieldByName(
    std::string_view name) const {
  constexpr bool kCaseSensitive = true;
  return GetFieldByName(name, kCaseSensitive);
}
69+
70+
void Schema::InitIdToIndexMap() const {
71+
if (!id_to_index_.empty()) {
72+
return;
73+
}
74+
SchemaFieldVisitor visitor;
75+
auto result = VisitTypeInline(*this, &visitor, id_to_index_, full_schemafield_);
76+
}
77+
78+
void Schema::InitNameToIndexMap() const {
79+
if (!name_to_index_.empty()) {
80+
return;
81+
}
82+
int index = 0;
83+
std::string_view path, short_path;
84+
SchemaFieldVisitor visitor;
85+
std::unordered_map<std::string, size_t> shortname_to_index;
86+
auto tmp = VisitTypeInline(*this, &visitor, name_to_index_, path, shortname_to_index,
87+
short_path, index, true);
88+
if (!tmp.has_value()) {
89+
throw IcebergError("Failed to perform InitNameToIndexMap");
90+
}
91+
for (const auto& pair : shortname_to_index) {
92+
if (!name_to_index_.count(pair.first)) {
93+
name_to_index_.emplace(pair.first, pair.second);
94+
}
95+
}
96+
}
97+
98+
void Schema::InitLowerCaseNameToIndexMap() const {
99+
if (!lowercase_name_to_index_.empty()) {
100+
return;
101+
}
102+
int index = 0;
103+
std::string_view path, short_path;
104+
SchemaFieldVisitor visitor;
105+
std::unordered_map<std::string, size_t> shortlowercasename_to_index;
106+
auto tmp = VisitTypeInline(*this, &visitor, lowercase_name_to_index_, path,
107+
shortlowercasename_to_index, short_path, index, false);
108+
if (!tmp.has_value()) {
109+
throw IcebergError("Failed to perform InitLowerCaseNameToIndexMap");
110+
}
111+
for (const auto& pair : shortlowercasename_to_index) {
112+
if (!lowercase_name_to_index_.count(pair.first)) {
113+
lowercase_name_to_index_.emplace(pair.first, pair.second);
114+
}
115+
}
116+
}
117+
118+
/// \brief Look up a field, top-level or nested, by its field id.
///
/// \param field_id Id of the field to search for.
/// \return The matching entry of full_schemafield_, or std::nullopt when the
///   id is not present in the id index.
std::optional<std::reference_wrapper<const SchemaField>> Schema::GetFieldById(
    int32_t field_id) const {
  InitIdToIndexMap();
  if (const auto pos = id_to_index_.find(field_id); pos != id_to_index_.end()) {
    return full_schemafield_[pos->second];
  }
  return std::nullopt;
}
127+
128+
/// \brief Flatten a nested type depth-first, recording every field's id.
///
/// Each field of `type` is appended to `full_schemafield`, and its id is
/// mapped to the position it was appended at. Nested field types are then
/// visited recursively, so children directly follow their parent.
///
/// \param type Type to traverse; it is cast to NestedType.
/// \param id_to_index Output map from field id to position.
/// \param full_schemafield Output list of all visited fields.
/// \return An empty (successful) Status.
/// \throws IcebergError if a recursive visit reports a failure.
Status SchemaFieldVisitor::Visit(const Type& type,
                                 std::unordered_map<int, size_t>& id_to_index,
                                 std::vector<SchemaField>& full_schemafield) {
  const auto& parent = iceberg::internal::checked_cast<const NestedType&>(type);
  for (const auto& child : parent.fields()) {
    const size_t position = full_schemafield.size();
    id_to_index[child.field_id()] = position;
    full_schemafield.emplace_back(child);

    if (!child.type()->is_nested()) continue;

    const auto status = Visit(*child.type(), id_to_index, full_schemafield);
    if (!status.has_value()) {
      throw IcebergError("Failed to perform visit(id_to_index)");
    }
  }
  return {};
}
144+
/// \brief Append a field name to a dotted path prefix.
///
/// \param last_path Prefix to extend; used as given (it is not lowercased).
/// \param field_name Name of the field to append.
/// \param case_sensitive If false, `field_name` is lowercased before joining.
/// \return `field_name` alone when `last_path` is empty, otherwise
///   `last_path + "." + field_name`.
std::string SchemaFieldVisitor::GetPath(const std::string& last_path,
                                        const std::string& field_name,
                                        bool case_sensitive) {
  std::string segment(field_name);
  if (!case_sensitive) {
    // tolower() requires a value representable as unsigned char; passing a
    // negative plain char (any non-ASCII byte) is undefined behavior.
    std::ranges::transform(segment, segment.begin(), [](unsigned char c) {
      return static_cast<char>(::tolower(c));
    });
  }
  if (last_path.empty()) return segment;
  return last_path + "." + segment;
}
154+
155+
/// \brief Index every field of a nested type by full and short dotted name.
///
/// Fields are numbered depth-first through `index`, parent before children.
/// For each field:
///  - the full name (`path` + field name) is stored in `name_to_index`;
///  - the short name (`short_path` + field name) is stored in
///    `shortname_to_index`.
///
/// Name pruning applies to the *children* of a struct that is a list element
/// or a map "value": they inherit the parent's short prefix unchanged, so
/// e.g. `points.element.x` also gets the short name `points.x`. The pruned
/// field itself is still registered under its own short name; registering it
/// under the bare prefix would overwrite the short name of the enclosing
/// list/map field and make that name resolve to the wrong field.
///
/// \param type Type to traverse; it is cast to NestedType.
/// \param name_to_index Output map from full name to field position.
/// \param path Full-name prefix of `type`'s fields (empty at the root).
/// \param shortname_to_index Output map from short name to field position.
/// \param short_path Short-name prefix of `type`'s fields (empty at the root).
/// \param index Next position to assign; advanced once per visited field.
/// \param case_sensitive If false, field names are lowercased when appended.
/// \return An empty (successful) Status.
/// \throws IcebergError if a recursive visit reports a failure.
Status SchemaFieldVisitor::Visit(
    const Type& type, std::unordered_map<std::string, size_t>& name_to_index,
    std::string_view path, std::unordered_map<std::string, size_t>& shortname_to_index,
    std::string_view short_path, int& index, bool case_sensitive) {
  const auto& nested = iceberg::internal::checked_cast<const NestedType&>(type);
  const bool parent_is_list = type.type_id() == TypeId::kList;
  const bool parent_is_map = type.type_id() == TypeId::kMap;

  for (const auto& field : nested.fields()) {
    const std::string field_name(field.name());

    const std::string full_path = GetPath(std::string(path), field_name, case_sensitive);
    name_to_index[full_path] = index;

    // The field itself always keeps its own name in the short form; only the
    // prefix handed down to its children may be pruned (see below).
    const std::string short_name =
        GetPath(std::string(short_path), field_name, case_sensitive);
    shortname_to_index[short_name] = index++;

    if (!field.type()->is_nested()) continue;

    // Children of a list-element struct or a map-value struct skip the
    // "element"/"value" segment in their short names.
    const bool is_struct = field.type()->type_id() == TypeId::kStruct;
    const bool prune_segment =
        is_struct && (parent_is_list || (parent_is_map && field.name() == "value"));
    const std::string child_short_path =
        prune_segment ? std::string(short_path) : short_name;

    const auto status = Visit(*field.type(), name_to_index, full_path, shortname_to_index,
                              child_short_path, index, case_sensitive);
    if (!status.has_value()) {
      throw IcebergError("Failed to perform visit(name_to_index)");
    }
  }
  return {};
}
186+
47187
} // namespace iceberg

src/iceberg/schema.h

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
/// Schemas for Iceberg tables. This header contains the definition of Schema
2424
/// and any utility functions. See iceberg/type.h and iceberg/field.h as well.
2525

26+
#include <algorithm>
2627
#include <cstdint>
2728
#include <optional>
2829
#include <string>
@@ -31,6 +32,7 @@
3132
#include "iceberg/iceberg_export.h"
3233
#include "iceberg/schema_field.h"
3334
#include "iceberg/type.h"
35+
#include "iceberg/util/visit_type.h"
3436

3537
namespace iceberg {
3638

@@ -54,13 +56,42 @@ class ICEBERG_EXPORT Schema : public StructType {
5456

5557
[[nodiscard]] std::string ToString() const override;
5658

59+
[[nodiscard]] std::optional<std::reference_wrapper<const SchemaField>> GetFieldByName(
60+
std::string_view name, bool case_sensitive) const override;
61+
62+
[[nodiscard]] std::optional<std::reference_wrapper<const SchemaField>> GetFieldByName(
63+
std::string_view name) const;
64+
65+
[[nodiscard]] std::optional<std::reference_wrapper<const SchemaField>> GetFieldById(
66+
int32_t field_id) const override;
67+
5768
friend bool operator==(const Schema& lhs, const Schema& rhs) { return lhs.Equals(rhs); }
5869

70+
mutable std::unordered_map<int, size_t> id_to_index_;
71+
mutable std::unordered_map<std::string, size_t> name_to_index_;
72+
mutable std::unordered_map<std::string, size_t> lowercase_name_to_index_;
73+
mutable std::vector<SchemaField> full_schemafield_;
74+
5975
private:
6076
/// \brief Compare two schemas for equality.
6177
[[nodiscard]] bool Equals(const Schema& other) const;
6278

6379
const std::optional<int32_t> schema_id_;
80+
81+
void InitIdToIndexMap() const;
82+
void InitNameToIndexMap() const;
83+
void InitLowerCaseNameToIndexMap() const;
6484
};
6585

86+
class SchemaFieldVisitor {
87+
public:
88+
Status Visit(const Type& type, std::unordered_map<int, size_t>& id_to_index,
89+
std::vector<SchemaField>& full_schemafield);
90+
std::string GetPath(const std::string& last_path, const std::string& field_name,
91+
bool case_sensitive);
92+
Status Visit(const Type& type, std::unordered_map<std::string, size_t>& name_to_index,
93+
std::string_view path,
94+
std::unordered_map<std::string, size_t>& shortname_to_index,
95+
std::string_view short_path, int& index, bool case_sensitive);
96+
};
6697
} // namespace iceberg

src/iceberg/type.cc

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,10 @@
2727
#include "iceberg/util/formatter.h" // IWYU pragma: keep
2828

2929
namespace iceberg {
30+
/// \brief Convenience overload: look up a field by name, case-sensitively.
///
/// \param name Field name to search for.
/// \return Whatever the virtual two-argument overload returns for
///   case_sensitive = true.
std::optional<std::reference_wrapper<const SchemaField>> NestedType::GetFieldByName(
    std::string_view name) const {
  constexpr bool kCaseSensitive = true;
  return GetFieldByName(name, kCaseSensitive);
}
3034

3135
StructType::StructType(std::vector<SchemaField> fields) : fields_(std::move(fields)) {
3236
size_t index = 0;
@@ -67,7 +71,7 @@ std::optional<std::reference_wrapper<const SchemaField>> StructType::GetFieldByI
6771
return fields_[index];
6872
}
6973
std::optional<std::reference_wrapper<const SchemaField>> StructType::GetFieldByName(
70-
std::string_view name) const {
74+
std::string_view name, bool case_sensitive) const {
7175
// N.B. duplicate names are not permitted (looking at the Java
7276
// implementation) so there is nothing in particular we need to do here
7377
for (const auto& field : fields_) {
@@ -84,6 +88,10 @@ bool StructType::Equals(const Type& other) const {
8488
const auto& struct_ = static_cast<const StructType&>(other);
8589
return fields_ == struct_.fields_;
8690
}
91+
/// \brief Convenience overload: look up a struct field by name,
/// case-sensitively.
///
/// \param name Field name to search for.
/// \return Whatever the two-argument overload returns for
///   case_sensitive = true.
std::optional<std::reference_wrapper<const SchemaField>> StructType::GetFieldByName(
    std::string_view name) const {
  constexpr bool kCaseSensitive = true;
  return GetFieldByName(name, kCaseSensitive);
}
8795

8896
ListType::ListType(SchemaField element) : element_(std::move(element)) {
8997
if (element_.name() != kElementName) {
@@ -120,7 +128,7 @@ std::optional<std::reference_wrapper<const SchemaField>> ListType::GetFieldByInd
120128
return std::nullopt;
121129
}
122130
std::optional<std::reference_wrapper<const SchemaField>> ListType::GetFieldByName(
123-
std::string_view name) const {
131+
std::string_view name, bool case_sensitive) const {
124132
if (name == element_.name()) {
125133
return std::cref(element_);
126134
}
@@ -133,6 +141,10 @@ bool ListType::Equals(const Type& other) const {
133141
const auto& list = static_cast<const ListType&>(other);
134142
return element_ == list.element_;
135143
}
144+
/// \brief Convenience overload: look up the list's field by name,
/// case-sensitively.
///
/// Forwards case_sensitive = true so that the one-argument overload has the
/// same default as the NestedType and StructType one-argument overloads.
///
/// \param name Field name to search for.
/// \return Whatever the two-argument overload returns for
///   case_sensitive = true.
std::optional<std::reference_wrapper<const SchemaField>> ListType::GetFieldByName(
    std::string_view name) const {
  return GetFieldByName(name, true);
}
136148

137149
MapType::MapType(SchemaField key, SchemaField value)
138150
: fields_{std::move(key), std::move(value)} {
@@ -178,7 +190,7 @@ std::optional<std::reference_wrapper<const SchemaField>> MapType::GetFieldByInde
178190
return std::nullopt;
179191
}
180192
std::optional<std::reference_wrapper<const SchemaField>> MapType::GetFieldByName(
181-
std::string_view name) const {
193+
std::string_view name, bool case_sensitive) const {
182194
if (name == kKeyName) {
183195
return key();
184196
} else if (name == kValueName) {
@@ -193,6 +205,10 @@ bool MapType::Equals(const Type& other) const {
193205
const auto& map = static_cast<const MapType&>(other);
194206
return fields_ == map.fields_;
195207
}
208+
/// \brief Convenience overload: look up the map's key or value field by name,
/// case-sensitively.
///
/// Forwards case_sensitive = true so that the one-argument overload has the
/// same default as the NestedType and StructType one-argument overloads.
///
/// \param name Field name to search for.
/// \return Whatever the two-argument overload returns for
///   case_sensitive = true.
std::optional<std::reference_wrapper<const SchemaField>> MapType::GetFieldByName(
    std::string_view name) const {
  return GetFieldByName(name, true);
}
196212

197213
TypeId BooleanType::type_id() const { return kTypeId; }
198214
std::string BooleanType::ToString() const { return "boolean"; }

src/iceberg/type.h

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,10 @@ class ICEBERG_EXPORT NestedType : public Type {
9191
///
9292
/// \note This is currently O(n) complexity.
9393
[[nodiscard]] virtual std::optional<std::reference_wrapper<const SchemaField>>
94-
GetFieldByName(std::string_view name) const = 0;
94+
GetFieldByName(std::string_view name, bool case_sensitive) const = 0;
95+
96+
std::optional<std::reference_wrapper<const SchemaField>> GetFieldByName(
97+
std::string_view name) const;
9598
};
9699

97100
/// \defgroup type-nested Nested Types
@@ -114,7 +117,9 @@ class ICEBERG_EXPORT StructType : public NestedType {
114117
std::optional<std::reference_wrapper<const SchemaField>> GetFieldByIndex(
115118
int32_t index) const override;
116119
std::optional<std::reference_wrapper<const SchemaField>> GetFieldByName(
117-
std::string_view name) const override;
120+
std::string_view name, bool case_sensitive) const override;
121+
std::optional<std::reference_wrapper<const SchemaField>> GetFieldByName(
122+
std::string_view name) const;
118123

119124
protected:
120125
bool Equals(const Type& other) const override;
@@ -145,7 +150,9 @@ class ICEBERG_EXPORT ListType : public NestedType {
145150
std::optional<std::reference_wrapper<const SchemaField>> GetFieldByIndex(
146151
int32_t index) const override;
147152
std::optional<std::reference_wrapper<const SchemaField>> GetFieldByName(
148-
std::string_view name) const override;
153+
std::string_view name, bool case_sensitive) const override;
154+
std::optional<std::reference_wrapper<const SchemaField>> GetFieldByName(
155+
std::string_view name) const;
149156

150157
protected:
151158
bool Equals(const Type& other) const override;
@@ -177,7 +184,9 @@ class ICEBERG_EXPORT MapType : public NestedType {
177184
std::optional<std::reference_wrapper<const SchemaField>> GetFieldByIndex(
178185
int32_t index) const override;
179186
std::optional<std::reference_wrapper<const SchemaField>> GetFieldByName(
180-
std::string_view name) const override;
187+
std::string_view name, bool case_sensitive) const override;
188+
std::optional<std::reference_wrapper<const SchemaField>> GetFieldByName(
189+
std::string_view name) const;
181190

182191
protected:
183192
bool Equals(const Type& other) const override;

0 commit comments

Comments
 (0)