Skip to content

Commit b80bf8d

Browse files
nullccxsy and nullccxsy authored
feat: add find field (by name) support to NestedType (#194)
… MapType
- Implemented case-insensitive GetFieldByName in NestedType subclasses.
- Added lazy initialization for maps in StructType.
- Handled duplicate names/IDs with Status returns instead of throws.
---------
Co-authored-by: nullccxsy <[email protected]>
1 parent 88f5520 commit b80bf8d

File tree

7 files changed

+279
-100
lines changed

7 files changed

+279
-100
lines changed

src/iceberg/manifest_reader_internal.cc

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -222,8 +222,8 @@ Result<std::vector<ManifestFile>> ParseManifestList(ArrowSchema* schema,
222222
if (!field.has_value()) {
223223
return InvalidSchema("Field index {} is not found in schema", idx);
224224
}
225-
auto field_name = field.value().get().name();
226-
bool required = !field.value().get().optional();
225+
auto field_name = field.value()->get().name();
226+
bool required = !field.value()->get().optional();
227227
auto view_of_column = array_view.children[idx];
228228
switch (idx) {
229229
case 0:
@@ -340,8 +340,8 @@ Status ParseDataFile(const std::shared_ptr<StructType>& data_file_schema,
340340
data_file_schema->fields().size(), view_of_column->n_children);
341341
}
342342
for (int64_t col_idx = 0; col_idx < view_of_column->n_children; ++col_idx) {
343-
auto field_name = data_file_schema->GetFieldByIndex(col_idx).value().get().name();
344-
auto required = !data_file_schema->GetFieldByIndex(col_idx).value().get().optional();
343+
auto field_name = data_file_schema->GetFieldByIndex(col_idx).value()->get().name();
344+
auto required = !data_file_schema->GetFieldByIndex(col_idx).value()->get().optional();
345345
auto view_of_file_field = view_of_column->children[col_idx];
346346
auto manifest_entry_count = view_of_file_field->length;
347347

@@ -487,8 +487,8 @@ Result<std::vector<ManifestEntry>> ParseManifestEntry(ArrowSchema* schema,
487487
if (!field.has_value()) {
488488
return InvalidManifest("Field not found in schema: {}", idx);
489489
}
490-
auto field_name = field.value().get().name();
491-
bool required = !field.value().get().optional();
490+
auto field_name = field.value()->get().name();
491+
bool required = !field.value()->get().optional();
492492
auto view_of_column = array_view.children[idx];
493493

494494
switch (idx) {
@@ -510,7 +510,7 @@ Result<std::vector<ManifestEntry>> ParseManifestEntry(ArrowSchema* schema,
510510
break;
511511
case 4: {
512512
auto data_file_schema =
513-
dynamic_pointer_cast<StructType>(field.value().get().type());
513+
dynamic_pointer_cast<StructType>(field.value()->get().type());
514514
ICEBERG_RETURN_UNEXPECTED(
515515
ParseDataFile(data_file_schema, view_of_column, manifest_entries));
516516
break;

src/iceberg/table_scan.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ Result<std::unique_ptr<TableScan>> TableScanBuilder::Build() {
115115
return InvalidArgument("Column {} not found in schema '{}'", column_name,
116116
*schema_id);
117117
}
118-
projected_fields.emplace_back(field_opt.value().get());
118+
projected_fields.emplace_back(field_opt.value()->get());
119119
}
120120
context_.projected_schema =
121121
std::make_shared<Schema>(std::move(projected_fields), schema->schema_id());

src/iceberg/type.cc

Lines changed: 98 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -25,23 +25,18 @@
2525

2626
#include "iceberg/exception.h"
2727
#include "iceberg/util/formatter.h" // IWYU pragma: keep
28+
#include "iceberg/util/macros.h"
29+
#include "iceberg/util/string_util.h"
2830

2931
namespace iceberg {
3032

31-
StructType::StructType(std::vector<SchemaField> fields) : fields_(std::move(fields)) {
32-
size_t index = 0;
33-
for (const auto& field : fields_) {
34-
auto [it, inserted] = field_id_to_index_.try_emplace(field.field_id(), index);
35-
if (!inserted) {
36-
throw IcebergError(
37-
std::format("StructType: duplicate field ID {} (field indices {} and {})",
38-
field.field_id(), it->second, index));
39-
}
40-
41-
++index;
42-
}
33+
Result<std::optional<NestedType::SchemaFieldConstRef>> NestedType::GetFieldByName(
34+
std::string_view name) const {
35+
return GetFieldByName(name, /*case_sensitive=*/true);
4336
}
4437

38+
StructType::StructType(std::vector<SchemaField> fields) : fields_(std::move(fields)) {}
39+
4540
TypeId StructType::type_id() const { return kTypeId; }
4641

4742
std::string StructType::ToString() const {
@@ -53,27 +48,34 @@ std::string StructType::ToString() const {
5348
return repr;
5449
}
5550
std::span<const SchemaField> StructType::fields() const { return fields_; }
56-
std::optional<std::reference_wrapper<const SchemaField>> StructType::GetFieldById(
51+
Result<std::optional<NestedType::SchemaFieldConstRef>> StructType::GetFieldById(
5752
int32_t field_id) const {
58-
auto it = field_id_to_index_.find(field_id);
59-
if (it == field_id_to_index_.end()) return std::nullopt;
60-
return fields_[it->second];
53+
ICEBERG_RETURN_UNEXPECTED(InitFieldById());
54+
auto it = field_by_id_.find(field_id);
55+
if (it == field_by_id_.end()) return std::nullopt;
56+
return it->second;
6157
}
62-
std::optional<std::reference_wrapper<const SchemaField>> StructType::GetFieldByIndex(
58+
Result<std::optional<NestedType::SchemaFieldConstRef>> StructType::GetFieldByIndex(
6359
int32_t index) const {
64-
if (index < 0 || index >= static_cast<int32_t>(fields_.size())) {
65-
return std::nullopt;
60+
if (index < 0 || static_cast<size_t>(index) >= fields_.size()) {
61+
return InvalidArgument("Invalid index {} to get field from struct", index);
6662
}
6763
return fields_[index];
6864
}
69-
std::optional<std::reference_wrapper<const SchemaField>> StructType::GetFieldByName(
70-
std::string_view name) const {
71-
// N.B. duplicate names are not permitted (looking at the Java
72-
// implementation) so there is nothing in particular we need to do here
73-
for (const auto& field : fields_) {
74-
if (field.name() == name) {
75-
return field;
65+
Result<std::optional<NestedType::SchemaFieldConstRef>> StructType::GetFieldByName(
66+
std::string_view name, bool case_sensitive) const {
67+
if (case_sensitive) {
68+
ICEBERG_RETURN_UNEXPECTED(InitFieldByName());
69+
auto it = field_by_name_.find(name);
70+
if (it != field_by_name_.end()) {
71+
return it->second;
7672
}
73+
return std::nullopt;
74+
}
75+
ICEBERG_RETURN_UNEXPECTED(InitFieldByLowerCaseName());
76+
auto it = field_by_lowercase_name_.find(StringUtils::ToLower(name));
77+
if (it != field_by_lowercase_name_.end()) {
78+
return it->second;
7779
}
7880
return std::nullopt;
7981
}
@@ -84,6 +86,48 @@ bool StructType::Equals(const Type& other) const {
8486
const auto& struct_ = static_cast<const StructType&>(other);
8587
return fields_ == struct_.fields_;
8688
}
89+
Status StructType::InitFieldById() const {
90+
if (!field_by_id_.empty()) {
91+
return {};
92+
}
93+
for (const auto& field : fields_) {
94+
auto it = field_by_id_.try_emplace(field.field_id(), field);
95+
if (!it.second) {
96+
return InvalidSchema("Duplicate field id found: {} (prev name: {}, curr name: {})",
97+
field.field_id(), it.first->second.get().name(), field.name());
98+
}
99+
}
100+
return {};
101+
}
102+
Status StructType::InitFieldByName() const {
103+
if (!field_by_name_.empty()) {
104+
return {};
105+
}
106+
for (const auto& field : fields_) {
107+
auto it = field_by_name_.try_emplace(field.name(), field);
108+
if (!it.second) {
109+
return InvalidSchema("Duplicate field name found: {} (prev id: {}, curr id: {})",
110+
it.first->first, it.first->second.get().field_id(),
111+
field.field_id());
112+
}
113+
}
114+
return {};
115+
}
116+
Status StructType::InitFieldByLowerCaseName() const {
117+
if (!field_by_lowercase_name_.empty()) {
118+
return {};
119+
}
120+
for (const auto& field : fields_) {
121+
auto it =
122+
field_by_lowercase_name_.try_emplace(StringUtils::ToLower(field.name()), field);
123+
if (!it.second) {
124+
return InvalidSchema(
125+
"Duplicate lowercase field name found: {} (prev id: {}, curr id: {})",
126+
it.first->first, it.first->second.get().field_id(), field.field_id());
127+
}
128+
}
129+
return {};
130+
}
87131

88132
ListType::ListType(SchemaField element) : element_(std::move(element)) {
89133
if (element_.name() != kElementName) {
@@ -105,23 +149,29 @@ std::string ListType::ToString() const {
105149
return repr;
106150
}
107151
std::span<const SchemaField> ListType::fields() const { return {&element_, 1}; }
108-
std::optional<std::reference_wrapper<const SchemaField>> ListType::GetFieldById(
152+
Result<std::optional<NestedType::SchemaFieldConstRef>> ListType::GetFieldById(
109153
int32_t field_id) const {
110154
if (field_id == element_.field_id()) {
111155
return std::cref(element_);
112156
}
113157
return std::nullopt;
114158
}
115-
std::optional<std::reference_wrapper<const SchemaField>> ListType::GetFieldByIndex(
159+
Result<std::optional<NestedType::SchemaFieldConstRef>> ListType::GetFieldByIndex(
116160
int index) const {
117161
if (index == 0) {
118162
return std::cref(element_);
119163
}
120-
return std::nullopt;
164+
return InvalidArgument("Invalid index {} to get field from list", index);
121165
}
122-
std::optional<std::reference_wrapper<const SchemaField>> ListType::GetFieldByName(
123-
std::string_view name) const {
124-
if (name == element_.name()) {
166+
Result<std::optional<NestedType::SchemaFieldConstRef>> ListType::GetFieldByName(
167+
std::string_view name, bool case_sensitive) const {
168+
if (case_sensitive) {
169+
if (name == kElementName) {
170+
return std::cref(element_);
171+
}
172+
return std::nullopt;
173+
}
174+
if (StringUtils::ToLower(name) == kElementName) {
125175
return std::cref(element_);
126176
}
127177
return std::nullopt;
@@ -159,7 +209,7 @@ std::string MapType::ToString() const {
159209
return repr;
160210
}
161211
std::span<const SchemaField> MapType::fields() const { return fields_; }
162-
std::optional<std::reference_wrapper<const SchemaField>> MapType::GetFieldById(
212+
Result<std::optional<NestedType::SchemaFieldConstRef>> MapType::GetFieldById(
163213
int32_t field_id) const {
164214
if (field_id == key().field_id()) {
165215
return key();
@@ -168,20 +218,29 @@ std::optional<std::reference_wrapper<const SchemaField>> MapType::GetFieldById(
168218
}
169219
return std::nullopt;
170220
}
171-
std::optional<std::reference_wrapper<const SchemaField>> MapType::GetFieldByIndex(
221+
Result<std::optional<NestedType::SchemaFieldConstRef>> MapType::GetFieldByIndex(
172222
int32_t index) const {
173223
if (index == 0) {
174224
return key();
175225
} else if (index == 1) {
176226
return value();
177227
}
178-
return std::nullopt;
228+
return InvalidArgument("Invalid index {} to get field from map", index);
179229
}
180-
std::optional<std::reference_wrapper<const SchemaField>> MapType::GetFieldByName(
181-
std::string_view name) const {
182-
if (name == kKeyName) {
230+
Result<std::optional<NestedType::SchemaFieldConstRef>> MapType::GetFieldByName(
231+
std::string_view name, bool case_sensitive) const {
232+
if (case_sensitive) {
233+
if (name == kKeyName) {
234+
return key();
235+
} else if (name == kValueName) {
236+
return value();
237+
}
238+
return std::nullopt;
239+
}
240+
const auto lower_case_name = StringUtils::ToLower(name);
241+
if (lower_case_name == kKeyName) {
183242
return key();
184-
} else if (name == kValueName) {
243+
} else if (lower_case_name == kValueName) {
185244
return value();
186245
}
187246
return std::nullopt;

src/iceberg/type.h

Lines changed: 36 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
#include <vector>
3434

3535
#include "iceberg/iceberg_export.h"
36+
#include "iceberg/result.h"
3637
#include "iceberg/schema_field.h"
3738
#include "iceberg/util/formattable.h"
3839

@@ -75,23 +76,27 @@ class ICEBERG_EXPORT NestedType : public Type {
7576

7677
/// \brief Get a view of the child fields.
7778
[[nodiscard]] virtual std::span<const SchemaField> fields() const = 0;
79+
using SchemaFieldConstRef = std::reference_wrapper<const SchemaField>;
7880
/// \brief Get a field by field ID.
7981
///
8082
/// \note This is O(1) complexity.
81-
[[nodiscard]] virtual std::optional<std::reference_wrapper<const SchemaField>>
82-
GetFieldById(int32_t field_id) const = 0;
83+
[[nodiscard]] virtual Result<std::optional<SchemaFieldConstRef>> GetFieldById(
84+
int32_t field_id) const = 0;
8385
/// \brief Get a field by index.
8486
///
8587
/// \note This is O(1) complexity.
86-
[[nodiscard]] virtual std::optional<std::reference_wrapper<const SchemaField>>
87-
GetFieldByIndex(int32_t index) const = 0;
88-
/// \brief Get a field by name (case-sensitive). Behavior is undefined if
88+
[[nodiscard]] virtual Result<std::optional<SchemaFieldConstRef>> GetFieldByIndex(
89+
int32_t index) const = 0;
90+
/// \brief Get a field by name. Return an error Status if
8991
/// the field name is not unique; prefer GetFieldById or GetFieldByIndex
9092
/// when possible.
9193
///
92-
/// \note This is currently O(n) complexity.
93-
[[nodiscard]] virtual std::optional<std::reference_wrapper<const SchemaField>>
94-
GetFieldByName(std::string_view name) const = 0;
94+
/// \note This is O(1) complexity.
95+
[[nodiscard]] virtual Result<std::optional<SchemaFieldConstRef>> GetFieldByName(
96+
std::string_view name, bool case_sensitive) const = 0;
97+
/// \brief Get a field by name (case-sensitive).
98+
[[nodiscard]] Result<std::optional<SchemaFieldConstRef>> GetFieldByName(
99+
std::string_view name) const;
95100
};
96101

97102
/// \defgroup type-nested Nested Types
@@ -109,18 +114,26 @@ class ICEBERG_EXPORT StructType : public NestedType {
109114
std::string ToString() const override;
110115

111116
std::span<const SchemaField> fields() const override;
112-
std::optional<std::reference_wrapper<const SchemaField>> GetFieldById(
117+
Result<std::optional<SchemaFieldConstRef>> GetFieldById(
113118
int32_t field_id) const override;
114-
std::optional<std::reference_wrapper<const SchemaField>> GetFieldByIndex(
119+
Result<std::optional<SchemaFieldConstRef>> GetFieldByIndex(
115120
int32_t index) const override;
116-
std::optional<std::reference_wrapper<const SchemaField>> GetFieldByName(
117-
std::string_view name) const override;
121+
Result<std::optional<SchemaFieldConstRef>> GetFieldByName(
122+
std::string_view name, bool case_sensitive) const override;
123+
using NestedType::GetFieldByName;
118124

119125
protected:
120126
bool Equals(const Type& other) const override;
127+
// TODO(nullccxsy): Lazy initialization has concurrency issues, need to add proper
128+
// synchronization mechanism
129+
Status InitFieldById() const;
130+
Status InitFieldByName() const;
131+
Status InitFieldByLowerCaseName() const;
121132

122133
std::vector<SchemaField> fields_;
123-
std::unordered_map<int32_t, size_t> field_id_to_index_;
134+
mutable std::unordered_map<int32_t, SchemaFieldConstRef> field_by_id_;
135+
mutable std::unordered_map<std::string_view, SchemaFieldConstRef> field_by_name_;
136+
mutable std::unordered_map<std::string, SchemaFieldConstRef> field_by_lowercase_name_;
124137
};
125138

126139
/// \brief A data type representing a list of values.
@@ -140,12 +153,13 @@ class ICEBERG_EXPORT ListType : public NestedType {
140153
std::string ToString() const override;
141154

142155
std::span<const SchemaField> fields() const override;
143-
std::optional<std::reference_wrapper<const SchemaField>> GetFieldById(
156+
Result<std::optional<SchemaFieldConstRef>> GetFieldById(
144157
int32_t field_id) const override;
145-
std::optional<std::reference_wrapper<const SchemaField>> GetFieldByIndex(
158+
Result<std::optional<SchemaFieldConstRef>> GetFieldByIndex(
146159
int32_t index) const override;
147-
std::optional<std::reference_wrapper<const SchemaField>> GetFieldByName(
148-
std::string_view name) const override;
160+
Result<std::optional<SchemaFieldConstRef>> GetFieldByName(
161+
std::string_view name, bool case_sensitive) const override;
162+
using NestedType::GetFieldByName;
149163

150164
protected:
151165
bool Equals(const Type& other) const override;
@@ -172,12 +186,13 @@ class ICEBERG_EXPORT MapType : public NestedType {
172186
std::string ToString() const override;
173187

174188
std::span<const SchemaField> fields() const override;
175-
std::optional<std::reference_wrapper<const SchemaField>> GetFieldById(
189+
Result<std::optional<SchemaFieldConstRef>> GetFieldById(
176190
int32_t field_id) const override;
177-
std::optional<std::reference_wrapper<const SchemaField>> GetFieldByIndex(
191+
Result<std::optional<SchemaFieldConstRef>> GetFieldByIndex(
178192
int32_t index) const override;
179-
std::optional<std::reference_wrapper<const SchemaField>> GetFieldByName(
180-
std::string_view name) const override;
193+
Result<std::optional<SchemaFieldConstRef>> GetFieldByName(
194+
std::string_view name, bool case_sensitive) const override;
195+
using NestedType::GetFieldByName;
181196

182197
protected:
183198
bool Equals(const Type& other) const override;

src/iceberg/util/macros.h

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,10 @@
1919

2020
#pragma once
2121

22-
#define ICEBERG_RETURN_UNEXPECTED(result) \
23-
do { \
24-
auto&& result_name = (result); \
25-
if (!result_name) [[unlikely]] { \
26-
return std::unexpected<Error>(result_name.error()); \
27-
} \
28-
} while (false);
22+
#define ICEBERG_RETURN_UNEXPECTED(result) \
23+
if (auto&& result_name = result; !result_name) [[unlikely]] { \
24+
return std::unexpected<Error>(result_name.error()); \
25+
}
2926

3027
#define ICEBERG_ASSIGN_OR_RAISE_IMPL(result_name, lhs, rexpr) \
3128
auto&& result_name = (rexpr); \

0 commit comments

Comments
 (0)