Skip to content

Commit 13ed89f

Browse files
author
nullccxsy
committed
feat: add case-insensitive field lookup for StructType, ListType, and MapType
- Implemented case-insensitive GetFieldByName in the NestedType subclasses. - Added lazy initialization of the lookup maps in StructType. - Handled duplicate names/IDs by returning a Status instead of throwing.
1 parent 9f13bac commit 13ed89f

File tree

7 files changed

+244
-84
lines changed

7 files changed

+244
-84
lines changed

src/iceberg/manifest_reader_internal.cc

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -222,8 +222,8 @@ Result<std::vector<ManifestFile>> ParseManifestList(ArrowSchema* schema,
222222
if (!field.has_value()) {
223223
return InvalidSchema("Field index {} is not found in schema", idx);
224224
}
225-
auto field_name = field.value().get().name();
226-
bool required = !field.value().get().optional();
225+
auto field_name = field.value()->get().name();
226+
bool required = !field.value()->get().optional();
227227
auto view_of_column = array_view.children[idx];
228228
switch (idx) {
229229
case 0:
@@ -340,8 +340,8 @@ Status ParseDataFile(const std::shared_ptr<StructType>& data_file_schema,
340340
data_file_schema->fields().size(), view_of_column->n_children);
341341
}
342342
for (int64_t col_idx = 0; col_idx < view_of_column->n_children; ++col_idx) {
343-
auto field_name = data_file_schema->GetFieldByIndex(col_idx).value().get().name();
344-
auto required = !data_file_schema->GetFieldByIndex(col_idx).value().get().optional();
343+
auto field_name = data_file_schema->GetFieldByIndex(col_idx).value()->get().name();
344+
auto required = !data_file_schema->GetFieldByIndex(col_idx).value()->get().optional();
345345
auto view_of_file_field = view_of_column->children[col_idx];
346346
auto manifest_entry_count = view_of_file_field->length;
347347

@@ -487,8 +487,8 @@ Result<std::vector<ManifestEntry>> ParseManifestEntry(ArrowSchema* schema,
487487
if (!field.has_value()) {
488488
return InvalidManifest("Field not found in schema: {}", idx);
489489
}
490-
auto field_name = field.value().get().name();
491-
bool required = !field.value().get().optional();
490+
auto field_name = field.value()->get().name();
491+
bool required = !field.value()->get().optional();
492492
auto view_of_column = array_view.children[idx];
493493

494494
switch (idx) {
@@ -510,7 +510,7 @@ Result<std::vector<ManifestEntry>> ParseManifestEntry(ArrowSchema* schema,
510510
break;
511511
case 4: {
512512
auto data_file_schema =
513-
dynamic_pointer_cast<StructType>(field.value().get().type());
513+
dynamic_pointer_cast<StructType>(field.value()->get().type());
514514
ICEBERG_RETURN_UNEXPECTED(
515515
ParseDataFile(data_file_schema, view_of_column, manifest_entries));
516516
break;

src/iceberg/table_scan.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ Result<std::unique_ptr<TableScan>> TableScanBuilder::Build() {
115115
return InvalidArgument("Column {} not found in schema '{}'", column_name,
116116
*schema_id);
117117
}
118-
projected_fields.emplace_back(field_opt.value().get());
118+
projected_fields.emplace_back(field_opt.value()->get());
119119
}
120120
context_.projected_schema =
121121
std::make_shared<Schema>(std::move(projected_fields), schema->schema_id());

src/iceberg/type.cc

Lines changed: 96 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -25,23 +25,18 @@
2525

2626
#include "iceberg/exception.h"
2727
#include "iceberg/util/formatter.h" // IWYU pragma: keep
28+
#include "iceberg/util/macros.h"
29+
#include "iceberg/util/string_utils.h"
2830

2931
namespace iceberg {
3032

31-
StructType::StructType(std::vector<SchemaField> fields) : fields_(std::move(fields)) {
32-
size_t index = 0;
33-
for (const auto& field : fields_) {
34-
auto [it, inserted] = field_id_to_index_.try_emplace(field.field_id(), index);
35-
if (!inserted) {
36-
throw IcebergError(
37-
std::format("StructType: duplicate field ID {} (field indices {} and {})",
38-
field.field_id(), it->second, index));
39-
}
40-
41-
++index;
42-
}
33+
Result<std::optional<std::reference_wrapper<const SchemaField>>>
34+
NestedType::GetFieldByName(std::string_view name) const {
35+
return GetFieldByName(name, true);
4336
}
4437

38+
StructType::StructType(std::vector<SchemaField> fields) : fields_(std::move(fields)) {}
39+
4540
TypeId StructType::type_id() const { return kTypeId; }
4641

4742
std::string StructType::ToString() const {
@@ -53,27 +48,34 @@ std::string StructType::ToString() const {
5348
return repr;
5449
}
5550
std::span<const SchemaField> StructType::fields() const { return fields_; }
56-
std::optional<std::reference_wrapper<const SchemaField>> StructType::GetFieldById(
51+
Result<std::optional<std::reference_wrapper<const SchemaField>>> StructType::GetFieldById(
5752
int32_t field_id) const {
58-
auto it = field_id_to_index_.find(field_id);
59-
if (it == field_id_to_index_.end()) return std::nullopt;
60-
return fields_[it->second];
53+
ICEBERG_RETURN_UNEXPECTED(InitFieldById());
54+
auto it = field_by_id_.find(field_id);
55+
if (it == field_by_id_.end()) return std::nullopt;
56+
return it->second;
6157
}
62-
std::optional<std::reference_wrapper<const SchemaField>> StructType::GetFieldByIndex(
63-
int32_t index) const {
64-
if (index < 0 || index >= static_cast<int32_t>(fields_.size())) {
58+
Result<std::optional<std::reference_wrapper<const SchemaField>>>
59+
StructType::GetFieldByIndex(int32_t index) const {
60+
if (index < 0 || static_cast<size_t>(index) >= fields_.size()) {
6561
return std::nullopt;
6662
}
6763
return fields_[index];
6864
}
69-
std::optional<std::reference_wrapper<const SchemaField>> StructType::GetFieldByName(
70-
std::string_view name) const {
71-
// N.B. duplicate names are not permitted (looking at the Java
72-
// implementation) so there is nothing in particular we need to do here
73-
for (const auto& field : fields_) {
74-
if (field.name() == name) {
75-
return field;
65+
Result<std::optional<std::reference_wrapper<const SchemaField>>>
66+
StructType::GetFieldByName(std::string_view name, bool case_sensitive) const {
67+
if (case_sensitive) {
68+
ICEBERG_RETURN_UNEXPECTED(InitFieldByName());
69+
auto it = field_by_name_.find(name);
70+
if (it != field_by_name_.end()) {
71+
return it->second;
7672
}
73+
return std::nullopt;
74+
}
75+
ICEBERG_RETURN_UNEXPECTED(InitFieldByLowerCaseName());
76+
auto it = field_by_lowercase_name_.find(StringUtils::ToLower(name));
77+
if (it != field_by_lowercase_name_.end()) {
78+
return it->second;
7779
}
7880
return std::nullopt;
7981
}
@@ -84,6 +86,48 @@ bool StructType::Equals(const Type& other) const {
8486
const auto& struct_ = static_cast<const StructType&>(other);
8587
return fields_ == struct_.fields_;
8688
}
89+
Status StructType::InitFieldById() const {
90+
if (!field_by_id_.empty()) {
91+
return {};
92+
}
93+
for (const auto& field : fields_) {
94+
auto it = field_by_id_.try_emplace(field.field_id(), field);
95+
if (!it.second) {
96+
return NotAllowed("Duplicate field id found: {} (prev name: {}, curr name: {})",
97+
field.field_id(), it.first->second.get().name(), field.name());
98+
}
99+
}
100+
return {};
101+
}
102+
Status StructType::InitFieldByName() const {
103+
if (!field_by_name_.empty()) {
104+
return {};
105+
}
106+
for (const auto& field : fields_) {
107+
auto it = field_by_name_.try_emplace(field.name(), field);
108+
if (!it.second) {
109+
return NotAllowed("Duplicate field name found: {} (prev id: {}, curr id: {})",
110+
it.first->first, it.first->second.get().field_id(),
111+
field.field_id());
112+
}
113+
}
114+
return {};
115+
}
116+
Status StructType::InitFieldByLowerCaseName() const {
117+
if (!field_by_lowercase_name_.empty()) {
118+
return {};
119+
}
120+
for (const auto& field : fields_) {
121+
auto it =
122+
field_by_lowercase_name_.try_emplace(StringUtils::ToLower(field.name()), field);
123+
if (!it.second) {
124+
return NotAllowed("Duplicate field name found: {} (prev id: {}, curr id: {})",
125+
it.first->first, it.first->second.get().field_id(),
126+
field.field_id());
127+
}
128+
}
129+
return {};
130+
}
87131

88132
ListType::ListType(SchemaField element) : element_(std::move(element)) {
89133
if (element_.name() != kElementName) {
@@ -105,23 +149,29 @@ std::string ListType::ToString() const {
105149
return repr;
106150
}
107151
std::span<const SchemaField> ListType::fields() const { return {&element_, 1}; }
108-
std::optional<std::reference_wrapper<const SchemaField>> ListType::GetFieldById(
152+
Result<std::optional<std::reference_wrapper<const SchemaField>>> ListType::GetFieldById(
109153
int32_t field_id) const {
110154
if (field_id == element_.field_id()) {
111155
return std::cref(element_);
112156
}
113157
return std::nullopt;
114158
}
115-
std::optional<std::reference_wrapper<const SchemaField>> ListType::GetFieldByIndex(
116-
int index) const {
159+
Result<std::optional<std::reference_wrapper<const SchemaField>>>
160+
ListType::GetFieldByIndex(int index) const {
117161
if (index == 0) {
118162
return std::cref(element_);
119163
}
120164
return std::nullopt;
121165
}
122-
std::optional<std::reference_wrapper<const SchemaField>> ListType::GetFieldByName(
123-
std::string_view name) const {
124-
if (name == element_.name()) {
166+
Result<std::optional<std::reference_wrapper<const SchemaField>>> ListType::GetFieldByName(
167+
std::string_view name, bool case_sensitive) const {
168+
if (case_sensitive) {
169+
if (name == element_.name()) {
170+
return std::cref(element_);
171+
}
172+
return std::nullopt;
173+
}
174+
if (StringUtils::ToLower(name) == StringUtils::ToLower(element_.name())) {
125175
return std::cref(element_);
126176
}
127177
return std::nullopt;
@@ -159,7 +209,7 @@ std::string MapType::ToString() const {
159209
return repr;
160210
}
161211
std::span<const SchemaField> MapType::fields() const { return fields_; }
162-
std::optional<std::reference_wrapper<const SchemaField>> MapType::GetFieldById(
212+
Result<std::optional<std::reference_wrapper<const SchemaField>>> MapType::GetFieldById(
163213
int32_t field_id) const {
164214
if (field_id == key().field_id()) {
165215
return key();
@@ -168,7 +218,7 @@ std::optional<std::reference_wrapper<const SchemaField>> MapType::GetFieldById(
168218
}
169219
return std::nullopt;
170220
}
171-
std::optional<std::reference_wrapper<const SchemaField>> MapType::GetFieldByIndex(
221+
Result<std::optional<std::reference_wrapper<const SchemaField>>> MapType::GetFieldByIndex(
172222
int32_t index) const {
173223
if (index == 0) {
174224
return key();
@@ -177,11 +227,19 @@ std::optional<std::reference_wrapper<const SchemaField>> MapType::GetFieldByInde
177227
}
178228
return std::nullopt;
179229
}
180-
std::optional<std::reference_wrapper<const SchemaField>> MapType::GetFieldByName(
181-
std::string_view name) const {
182-
if (name == kKeyName) {
230+
Result<std::optional<std::reference_wrapper<const SchemaField>>> MapType::GetFieldByName(
231+
std::string_view name, bool case_sensitive) const {
232+
if (case_sensitive) {
233+
if (name == kKeyName) {
234+
return key();
235+
} else if (name == kValueName) {
236+
return value();
237+
}
238+
return std::nullopt;
239+
}
240+
if (StringUtils::ToLower(name) == StringUtils::ToLower(kKeyName)) {
183241
return key();
184-
} else if (name == kValueName) {
242+
} else if (StringUtils::ToLower(name) == StringUtils::ToLower(kValueName)) {
185243
return value();
186244
}
187245
return std::nullopt;

src/iceberg/type.h

Lines changed: 38 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
#include <vector>
3434

3535
#include "iceberg/iceberg_export.h"
36+
#include "iceberg/result.h"
3637
#include "iceberg/schema_field.h"
3738
#include "iceberg/util/formattable.h"
3839

@@ -78,20 +79,23 @@ class ICEBERG_EXPORT NestedType : public Type {
7879
/// \brief Get a field by field ID.
7980
///
8081
/// \note This is O(1) complexity.
81-
[[nodiscard]] virtual std::optional<std::reference_wrapper<const SchemaField>>
82+
[[nodiscard]] virtual Result<std::optional<std::reference_wrapper<const SchemaField>>>
8283
GetFieldById(int32_t field_id) const = 0;
8384
/// \brief Get a field by index.
8485
///
8586
/// \note This is O(1) complexity.
86-
[[nodiscard]] virtual std::optional<std::reference_wrapper<const SchemaField>>
87+
[[nodiscard]] virtual Result<std::optional<std::reference_wrapper<const SchemaField>>>
8788
GetFieldByIndex(int32_t index) const = 0;
88-
/// \brief Get a field by name (case-sensitive). Behavior is undefined if
89+
/// \brief Get a field by name. An error is returned if
8990
/// the field name is not unique; prefer GetFieldById or GetFieldByIndex
9091
/// when possible.
9192
///
92-
/// \note This is currently O(n) complexity.
93-
[[nodiscard]] virtual std::optional<std::reference_wrapper<const SchemaField>>
94-
GetFieldByName(std::string_view name) const = 0;
93+
/// \note This is currently O(1) complexity.
94+
[[nodiscard]] virtual Result<std::optional<std::reference_wrapper<const SchemaField>>>
95+
GetFieldByName(std::string_view name, bool case_sensitive) const = 0;
96+
/// \brief Get a field by name (case-sensitive).
97+
[[nodiscard]] Result<std::optional<std::reference_wrapper<const SchemaField>>>
98+
GetFieldByName(std::string_view name) const;
9599
};
96100

97101
/// \defgroup type-nested Nested Types
@@ -109,18 +113,29 @@ class ICEBERG_EXPORT StructType : public NestedType {
109113
std::string ToString() const override;
110114

111115
std::span<const SchemaField> fields() const override;
112-
std::optional<std::reference_wrapper<const SchemaField>> GetFieldById(
116+
Result<std::optional<std::reference_wrapper<const SchemaField>>> GetFieldById(
113117
int32_t field_id) const override;
114-
std::optional<std::reference_wrapper<const SchemaField>> GetFieldByIndex(
118+
Result<std::optional<std::reference_wrapper<const SchemaField>>> GetFieldByIndex(
115119
int32_t index) const override;
116-
std::optional<std::reference_wrapper<const SchemaField>> GetFieldByName(
117-
std::string_view name) const override;
120+
Result<std::optional<std::reference_wrapper<const SchemaField>>> GetFieldByName(
121+
std::string_view name, bool case_sensitive) const override;
122+
123+
using NestedType::GetFieldByName;
118124

119125
protected:
120126
bool Equals(const Type& other) const override;
127+
Status InitFieldById() const;
128+
Status InitFieldByName() const;
129+
Status InitFieldByLowerCaseName() const;
121130

131+
protected:
122132
std::vector<SchemaField> fields_;
123-
std::unordered_map<int32_t, size_t> field_id_to_index_;
133+
mutable std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>>
134+
field_by_id_;
135+
mutable std::unordered_map<std::string_view, std::reference_wrapper<const SchemaField>>
136+
field_by_name_;
137+
mutable std::unordered_map<std::string, std::reference_wrapper<const SchemaField>>
138+
field_by_lowercase_name_;
124139
};
125140

126141
/// \brief A data type representing a list of values.
@@ -140,12 +155,14 @@ class ICEBERG_EXPORT ListType : public NestedType {
140155
std::string ToString() const override;
141156

142157
std::span<const SchemaField> fields() const override;
143-
std::optional<std::reference_wrapper<const SchemaField>> GetFieldById(
158+
Result<std::optional<std::reference_wrapper<const SchemaField>>> GetFieldById(
144159
int32_t field_id) const override;
145-
std::optional<std::reference_wrapper<const SchemaField>> GetFieldByIndex(
160+
Result<std::optional<std::reference_wrapper<const SchemaField>>> GetFieldByIndex(
146161
int32_t index) const override;
147-
std::optional<std::reference_wrapper<const SchemaField>> GetFieldByName(
148-
std::string_view name) const override;
162+
Result<std::optional<std::reference_wrapper<const SchemaField>>> GetFieldByName(
163+
std::string_view name, bool case_sensitive) const override;
164+
165+
using NestedType::GetFieldByName;
149166

150167
protected:
151168
bool Equals(const Type& other) const override;
@@ -172,12 +189,14 @@ class ICEBERG_EXPORT MapType : public NestedType {
172189
std::string ToString() const override;
173190

174191
std::span<const SchemaField> fields() const override;
175-
std::optional<std::reference_wrapper<const SchemaField>> GetFieldById(
192+
Result<std::optional<std::reference_wrapper<const SchemaField>>> GetFieldById(
176193
int32_t field_id) const override;
177-
std::optional<std::reference_wrapper<const SchemaField>> GetFieldByIndex(
194+
Result<std::optional<std::reference_wrapper<const SchemaField>>> GetFieldByIndex(
178195
int32_t index) const override;
179-
std::optional<std::reference_wrapper<const SchemaField>> GetFieldByName(
180-
std::string_view name) const override;
196+
Result<std::optional<std::reference_wrapper<const SchemaField>>> GetFieldByName(
197+
std::string_view name, bool case_sensitive) const override;
198+
199+
using NestedType::GetFieldByName;
181200

182201
protected:
183202
bool Equals(const Type& other) const override;

src/iceberg/util/macros.h

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,13 @@
1919

2020
#pragma once
2121

22-
#define ICEBERG_RETURN_UNEXPECTED(result) \
23-
if (!result) [[unlikely]] { \
24-
return std::unexpected<Error>(result.error()); \
25-
}
22+
#define ICEBERG_RETURN_UNEXPECTED(result) \
23+
do { \
24+
auto&& iceberg_temp_result = (result); \
25+
if (!iceberg_temp_result) [[unlikely]] { \
26+
return std::unexpected<Error>(iceberg_temp_result.error()); \
27+
} \
28+
} while (false);
2629

2730
#define ICEBERG_ASSIGN_OR_RAISE_IMPL(result_name, lhs, rexpr) \
2831
auto&& result_name = (rexpr); \

0 commit comments

Comments
 (0)