Skip to content

Commit 693f670

Browse files
author
nullccxsy
committed
feat: Add nested field query support to NestedType
- Add recursive field lookup methods to the NestedType base class.
- Implement FindFieldById, FindFieldByName, and FindFieldByPath methods.
- Support nested struct, list, and map type field queries.
- Add comprehensive tests for nested field operations.

FIXME: Case-insensitive matching of non-ASCII characters is not implemented.
1 parent 06adc3f commit 693f670

File tree

4 files changed

+228
-102
lines changed

4 files changed

+228
-102
lines changed

src/iceberg/type.cc

Lines changed: 126 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -28,25 +28,99 @@
2828
#include <optional>
2929
#include <ranges>
3030
#include <string_view>
31-
#include <iceberg/schema_field.h>
31+
#include <unordered_map>
32+
33+
#include <iceberg/type_fwd.h>
3234

3335
#include "iceberg/exception.h"
3436
#include "iceberg/util/formatter.h" // IWYU pragma: keep
3537

3638
namespace iceberg {
39+
void NestedType::BuildNameToIndexMap(
40+
std::string_view current_path,
41+
std::unordered_map<std::string, size_t>& name_to_index_, int& index) const {
42+
for (const auto& field : fields()) {
43+
std::string full_path = std::string(current_path) + "." + std::string(field.name());
44+
if (current_path == "") {
45+
full_path = std::string(field.name());
46+
}
47+
if (field.type() && field.type()->is_nested()) {
48+
auto nested_type = std::dynamic_pointer_cast<NestedType>(field.type());
49+
if (nested_type) {
50+
nested_type->BuildNameToIndexMap(full_path, name_to_index_, index);
51+
}
52+
}
53+
name_to_index_[full_path] = index++;
54+
}
55+
}
3756

38-
StructType::StructType(std::vector<SchemaField> fields) : fields_(std::move(fields)) {
39-
size_t index = 0;
40-
for (const auto& field : fields_) {
41-
auto [it, inserted] = field_id_to_index_.try_emplace(field.field_id(), index);
57+
void NestedType::BuildIdToIndexMap(std::unordered_map<int, size_t>& id_to_index,
58+
std::vector<SchemaField>& full_schemafield) const {
59+
for (const auto& field : fields()) {
60+
if (field.type() && field.type()->is_nested()) {
61+
auto nested_type = std::dynamic_pointer_cast<NestedType>(field.type());
62+
if (nested_type) {
63+
nested_type->BuildIdToIndexMap(id_to_index, full_schemafield);
64+
}
65+
}
66+
auto [it, inserted] =
67+
id_to_index.try_emplace(field.field_id(), full_schemafield.size());
4268
if (!inserted) {
4369
throw IcebergError(
4470
std::format("StructType: duplicate field ID {} (field indices {} and {})",
45-
field.field_id(), it->second, index));
71+
field.field_id(), it->second, full_schemafield.size()));
4672
}
73+
full_schemafield.emplace_back(field);
74+
}
75+
}
76+
77+
void NestedType::BuildLowerCaseNameToIndexMap(
78+
std::string_view current_path,
79+
std::unordered_map<std::string, size_t>& lowercase_name_to_index, int& index) const {
80+
for (const auto& field : fields()) {
81+
std::string full_path = std::string(current_path) + "." + std::string(field.name());
82+
if (current_path == "") {
83+
full_path = std::string(field.name());
84+
}
85+
if (field.type() && field.type()->is_nested()) {
86+
auto nested_type = std::dynamic_pointer_cast<NestedType>(field.type());
87+
if (nested_type) {
88+
nested_type->BuildLowerCaseNameToIndexMap(full_path, lowercase_name_to_index,
89+
index);
90+
}
91+
}
92+
lowercase_name_to_index[full_path] = index++;
93+
}
94+
}
4795

48-
++index;
96+
void NestedType::InitNameToIndexMap() const {
97+
if (init_name_to_index_) {
98+
return;
4999
}
100+
int index = 0;
101+
BuildNameToIndexMap("", name_to_index_, index);
102+
init_name_to_index_ = true;
103+
}
104+
105+
void NestedType::InitIdToIndexMap() const {
106+
if (init_id_to_index_) {
107+
return;
108+
}
109+
BuildIdToIndexMap(field_id_to_index_, full_schemafield_);
110+
init_id_to_index_ = true;
111+
}
112+
113+
void NestedType::InitLowerCaseNameToIndexMap() const {
114+
if (init_lowercase_name_to_index_) {
115+
return;
116+
}
117+
int index = 0;
118+
BuildLowerCaseNameToIndexMap("", lowercase_name_to_index_, index);
119+
init_lowercase_name_to_index_ = true;
120+
}
121+
122+
StructType::StructType(std::vector<SchemaField> fields) : fields_(std::move(fields)) {
123+
InitIdToIndexMap();
50124
}
51125

52126
TypeId StructType::type_id() const { return kTypeId; }
@@ -64,7 +138,7 @@ std::optional<std::reference_wrapper<const SchemaField>> StructType::GetFieldByI
64138
int32_t field_id) const {
65139
auto it = field_id_to_index_.find(field_id);
66140
if (it == field_id_to_index_.end()) return std::nullopt;
67-
return fields_[it->second];
141+
return full_schemafield_[it->second];
68142
}
69143
std::optional<std::reference_wrapper<const SchemaField>> StructType::GetFieldByIndex(
70144
int32_t index) const {
@@ -74,22 +148,21 @@ std::optional<std::reference_wrapper<const SchemaField>> StructType::GetFieldByI
74148
return fields_[index];
75149
}
76150
std::optional<std::reference_wrapper<const SchemaField>> StructType::GetFieldByName(
77-
std::string_view name) const {
151+
std::string_view name, bool case_sensitive) const {
78152
// N.B. duplicate names are not permitted (looking at the Java
79153
// implementation) so there is nothing in particular we need to do here
80-
InitNameToIdMap();
81-
auto it = field_name_to_index_.find(std::string(name));
82-
if (it == field_name_to_index_.end()) return std::nullopt;
83-
return fields_[it->second];
84-
}
85-
std::optional<std::reference_wrapper<const SchemaField>> StructType::GetFieldByNameCaseInsensitive(
86-
std::string_view name) const {
87-
InitNameToIdMapCaseInsensitive();
154+
if (case_sensitive) {
155+
InitNameToIndexMap();
156+
auto it = name_to_index_.find(std::string(name));
157+
if (it == name_to_index_.end()) return std::nullopt;
158+
return full_schemafield_[it->second];
159+
}
160+
InitLowerCaseNameToIndexMap();
88161
std::string lower_name(name);
89162
std::ranges::transform(lower_name, lower_name.begin(), ::tolower);
90-
auto it = caseinsensitive_field_name_to_index_.find(lower_name);
91-
if (it == caseinsensitive_field_name_to_index_.end()) return std::nullopt;
92-
return fields_[it->second];
163+
auto it = lowercase_name_to_index_.find(lower_name);
164+
if (it == lowercase_name_to_index_.end()) return std::nullopt;
165+
return full_schemafield_[it->second];
93166
}
94167
bool StructType::Equals(const Type& other) const {
95168
if (other.type_id() != TypeId::kStruct) {
@@ -98,36 +171,19 @@ bool StructType::Equals(const Type& other) const {
98171
const auto& struct_ = static_cast<const StructType&>(other);
99172
return fields_ == struct_.fields_;
100173
}
101-
void StructType::InitNameToIdMap() const {
102-
if (!field_name_to_index_.empty()) {
103-
return;
104-
}
105-
106-
for (int i = 0; i < fields_.size(); i++) {
107-
field_name_to_index_[std::string(fields_[i].name())] = i;
108-
}
109-
}
110-
void StructType::InitNameToIdMapCaseInsensitive() const {
111-
if (!caseinsensitive_field_name_to_index_.empty()) {
112-
return;
113-
}
114-
115-
for (int i = 0; i < fields_.size(); i++) {
116-
std::string lowercase_name(fields_[i].name());
117-
std::ranges::transform(lowercase_name, lowercase_name.begin(), ::tolower);
118-
caseinsensitive_field_name_to_index_[lowercase_name] = i;
119-
}
120-
}
121174

122175
ListType::ListType(SchemaField element) : element_(std::move(element)) {
123176
if (element_.name() != kElementName) {
124177
throw IcebergError(std::format("ListType: child field name should be '{}', was '{}'",
125178
kElementName, element_.name()));
126179
}
180+
InitIdToIndexMap();
127181
}
128182

129183
ListType::ListType(int32_t field_id, std::shared_ptr<Type> type, bool optional)
130-
: element_(field_id, std::string(kElementName), std::move(type), optional) {}
184+
: element_(field_id, std::string(kElementName), std::move(type), optional) {
185+
InitIdToIndexMap();
186+
}
131187

132188
TypeId ListType::type_id() const { return kTypeId; }
133189
std::string ListType::ToString() const {
@@ -141,10 +197,9 @@ std::string ListType::ToString() const {
141197
std::span<const SchemaField> ListType::fields() const { return {&element_, 1}; }
142198
std::optional<std::reference_wrapper<const SchemaField>> ListType::GetFieldById(
143199
int32_t field_id) const {
144-
if (field_id == element_.field_id()) {
145-
return std::cref(element_);
146-
}
147-
return std::nullopt;
200+
auto it = field_id_to_index_.find(field_id);
201+
if (it == field_id_to_index_.end()) return std::nullopt;
202+
return full_schemafield_[it->second];
148203
}
149204
std::optional<std::reference_wrapper<const SchemaField>> ListType::GetFieldByIndex(
150205
int index) const {
@@ -154,19 +209,20 @@ std::optional<std::reference_wrapper<const SchemaField>> ListType::GetFieldByInd
154209
return std::nullopt;
155210
}
156211
std::optional<std::reference_wrapper<const SchemaField>> ListType::GetFieldByName(
157-
std::string_view name) const {
158-
if (name == element_.name()) {
159-
return std::cref(element_);
212+
std::string_view name, bool case_sensitive) const {
213+
if (case_sensitive) {
214+
InitNameToIndexMap();
215+
if (name == element_.name()) {
216+
return std::cref(element_);
217+
}
218+
return std::nullopt;
160219
}
161-
return std::nullopt;
162-
}
163-
std::optional<std::reference_wrapper<const SchemaField>> ListType::GetFieldByNameCaseInsensitive(
164-
std::string_view name) const {
220+
InitLowerCaseNameToIndexMap();
165221
auto lower_name_view = name | std::views::transform(::tolower);
166222
auto lower_field_name = element_.name() | std::views::transform(::tolower);
167223
if (std::ranges::equal(lower_field_name, lower_name_view)) {
168224
return std::cref(element_);
169-
}
225+
}
170226
return std::nullopt;
171227
}
172228
bool ListType::Equals(const Type& other) const {
@@ -187,6 +243,7 @@ MapType::MapType(SchemaField key, SchemaField value)
187243
throw IcebergError(std::format("MapType: value field name should be '{}', was '{}'",
188244
kValueName, this->value().name()));
189245
}
246+
InitIdToIndexMap();
190247
}
191248

192249
const SchemaField& MapType::key() const { return fields_[0]; }
@@ -204,12 +261,9 @@ std::string MapType::ToString() const {
204261
std::span<const SchemaField> MapType::fields() const { return fields_; }
205262
std::optional<std::reference_wrapper<const SchemaField>> MapType::GetFieldById(
206263
int32_t field_id) const {
207-
if (field_id == key().field_id()) {
208-
return key();
209-
} else if (field_id == value().field_id()) {
210-
return value();
211-
}
212-
return std::nullopt;
264+
auto it = field_id_to_index_.find(field_id);
265+
if (it == field_id_to_index_.end()) return std::nullopt;
266+
return full_schemafield_[it->second];
213267
}
214268
std::optional<std::reference_wrapper<const SchemaField>> MapType::GetFieldByIndex(
215269
int32_t index) const {
@@ -221,19 +275,20 @@ std::optional<std::reference_wrapper<const SchemaField>> MapType::GetFieldByInde
221275
return std::nullopt;
222276
}
223277
std::optional<std::reference_wrapper<const SchemaField>> MapType::GetFieldByName(
224-
std::string_view name) const {
225-
if (name == kKeyName) {
226-
return key();
227-
} else if (name == kValueName) {
228-
return value();
278+
std::string_view name, bool case_sensitive) const {
279+
if (case_sensitive) {
280+
InitNameToIndexMap();
281+
if (name == kKeyName) {
282+
return key();
283+
} else if (name == kValueName) {
284+
return value();
285+
}
286+
return std::nullopt;
229287
}
230-
return std::nullopt;
231-
}
232-
std::optional<std::reference_wrapper<const SchemaField>> MapType::GetFieldByNameCaseInsensitive(
233-
std::string_view name) const {
234-
auto lower_name_view = name | std::views::transform(::tolower);
235-
auto lower_key_view = kKeyName | std::views::transform(tolower);
236-
auto lower_value_view = kValueName | std::views::transform(tolower);
288+
InitLowerCaseNameToIndexMap();
289+
auto lower_name_view = name | std::views::transform(::tolower);
290+
auto lower_key_view = kKeyName | std::views::transform(::tolower);
291+
auto lower_value_view = kValueName | std::views::transform(::tolower);
237292
if (std::ranges::equal(lower_key_view, lower_name_view)) {
238293
return key();
239294
} else if (std::ranges::equal(lower_value_view, lower_name_view)) {

src/iceberg/type.h

Lines changed: 34 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -24,15 +24,20 @@
2424
/// iceberg/type_fwd.h for the enum defining the list of types.
2525

2626
#include <array>
27+
#include <cctype>
28+
#include <cstddef>
2729
#include <cstdint>
2830
#include <functional>
2931
#include <memory>
3032
#include <optional>
3133
#include <span>
3234
#include <string>
35+
#include <string_view>
3336
#include <unordered_map>
3437
#include <vector>
3538

39+
#include <iceberg/expression/expression.h>
40+
3641
#include "iceberg/iceberg_export.h"
3742
#include "iceberg/schema_field.h"
3843
#include "iceberg/util/formattable.h"
@@ -94,10 +99,32 @@ class ICEBERG_EXPORT NestedType : public Type {
9499
///
95100
/// \note This is currently O(1) complexity.
96101
[[nodiscard]] virtual std::optional<std::reference_wrapper<const SchemaField>>
97-
GetFieldByName(std::string_view name) const = 0;
102+
GetFieldByName(std::string_view name, bool case_sensitive = true) const = 0;
103+
/// FIXME: Case-insensitive matching of non-ASCII (e.g. non-Latin) field names is
104+
/// unhandled. Deferred to contributors with non-Latin locale expertise. PRs are welcome
105+
/// (must include locale-specific testing for non-Latin scripts).
106+
107+
void InitIdToIndexMap() const;
108+
void InitNameToIndexMap() const;
109+
void InitLowerCaseNameToIndexMap() const;
110+
111+
mutable std::unordered_map<int, size_t> field_id_to_index_;
112+
mutable std::unordered_map<std::string, size_t> name_to_index_;
113+
mutable std::unordered_map<std::string, size_t> lowercase_name_to_index_;
114+
mutable std::vector<SchemaField> full_schemafield_;
115+
mutable bool init_id_to_index_ = false;
116+
mutable bool init_name_to_index_ = false;
117+
mutable bool init_lowercase_name_to_index_ = false;
98118

99-
[[nodiscard]] virtual std::optional<std::reference_wrapper<const SchemaField>>
100-
GetFieldByNameCaseInsensitive(std::string_view name) const = 0;
119+
private:
120+
void BuildIdToIndexMap(std::unordered_map<int, size_t>& id_to_index,
121+
std::vector<SchemaField>& full_schemafield) const;
122+
void BuildNameToIndexMap(std::string_view current_path,
123+
std::unordered_map<std::string, size_t>& name_to_index_,
124+
int& index) const;
125+
void BuildLowerCaseNameToIndexMap(
126+
std::string_view current_path,
127+
std::unordered_map<std::string, size_t>& lowercase_name_to_index, int& index) const;
101128
};
102129

103130
/// \defgroup type-nested Nested Types
@@ -120,19 +147,11 @@ class ICEBERG_EXPORT StructType : public NestedType {
120147
std::optional<std::reference_wrapper<const SchemaField>> GetFieldByIndex(
121148
int32_t index) const override;
122149
std::optional<std::reference_wrapper<const SchemaField>> GetFieldByName(
123-
std::string_view name) const override;
124-
std::optional<std::reference_wrapper<const SchemaField>> GetFieldByNameCaseInsensitive(
125-
std::string_view name) const override;
126-
void InitNameToIdMap() const;
127-
void InitNameToIdMapCaseInsensitive() const;
150+
std::string_view name, bool case_sensitive = true) const override;
128151

129152
protected:
130153
bool Equals(const Type& other) const override;
131-
132154
std::vector<SchemaField> fields_;
133-
std::unordered_map<int32_t, size_t> field_id_to_index_;
134-
mutable std::unordered_map<std::string, size_t> field_name_to_index_;
135-
mutable std::unordered_map<std::string, size_t> caseinsensitive_field_name_to_index_;
136155
};
137156

138157
/// \brief A data type representing a list of values.
@@ -157,9 +176,7 @@ class ICEBERG_EXPORT ListType : public NestedType {
157176
std::optional<std::reference_wrapper<const SchemaField>> GetFieldByIndex(
158177
int32_t index) const override;
159178
std::optional<std::reference_wrapper<const SchemaField>> GetFieldByName(
160-
std::string_view name) const override;
161-
std::optional<std::reference_wrapper<const SchemaField>> GetFieldByNameCaseInsensitive(
162-
std::string_view name) const override;
179+
std::string_view name, bool case_sensitive = true) const override;
163180

164181
protected:
165182
bool Equals(const Type& other) const override;
@@ -191,10 +208,8 @@ class ICEBERG_EXPORT MapType : public NestedType {
191208
std::optional<std::reference_wrapper<const SchemaField>> GetFieldByIndex(
192209
int32_t index) const override;
193210
std::optional<std::reference_wrapper<const SchemaField>> GetFieldByName(
194-
std::string_view name) const override;
195-
std::optional<std::reference_wrapper<const SchemaField>> GetFieldByNameCaseInsensitive(
196-
std::string_view name) const override;
197-
211+
std::string_view name, bool case_sensitive = true) const override;
212+
198213
protected:
199214
bool Equals(const Type& other) const override;
200215

0 commit comments

Comments
 (0)