Skip to content

Commit 777190b

Browse files
author
nullccxsy
committed
feat(schema): implement nested field lookup with name pruning
- Introduce IdVisitor for schema traversal to initialize id_to_index_ (map)
- Introduce NameVisitor for schema traversal to initialize name_to_index_ (map) and lowercase_name_to_index_ (map)
1 parent ae4515e commit 777190b

File tree

4 files changed

+305
-136
lines changed

4 files changed

+305
-136
lines changed

src/iceberg/schema.cc

Lines changed: 150 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -21,15 +21,49 @@
2121

2222
#include <format>
2323

24-
#include "iceberg/exception.h"
2524
#include "iceberg/type.h"
2625
#include "iceberg/util/formatter.h" // IWYU pragma: keep
2726
namespace iceberg {
27+
class IdVisitor {
28+
public:
29+
explicit IdVisitor(bool has_init_ = false);
30+
Status Visit(const Type& type);
2831

29-
Schema::Schema(std::vector<SchemaField> fields, std::optional<int32_t> schema_id)
30-
: StructType(std::move(fields)), schema_id_(schema_id) {
31-
InitIdToIndexMap();
32+
bool has_init;
33+
int index = 0;
34+
std::unordered_map<int, size_t> id_to_index;
35+
std::vector<std::reference_wrapper<const SchemaField>> full_schemafield;
36+
};
37+
38+
std::string GetPath(const std::string& last_path, const std::string& field_name,
39+
bool case_sensitive) {
40+
if (case_sensitive) {
41+
return last_path.empty() ? field_name : last_path + "." + field_name;
42+
}
43+
std::string lower_name(field_name);
44+
std::ranges::transform(lower_name, lower_name.begin(), ::tolower);
45+
return last_path.empty() ? lower_name : last_path + "." + lower_name;
3246
}
47+
class NameVisitor {
48+
public:
49+
explicit NameVisitor(bool case_sensitive_ = true, bool has_init_ = false);
50+
Status Visit(const ListType& type, const std::string& path,
51+
const std::string& short_path);
52+
Status Visit(const MapType& type, const std::string& path,
53+
const std::string& short_path);
54+
Status Visit(const StructType& type, const std::string& path,
55+
const std::string& short_path);
56+
Status Visit(const PrimitiveType& type, const std::string& path,
57+
const std::string& short_path);
58+
59+
int index = 0;
60+
bool case_sensitive;
61+
bool has_init;
62+
std::unordered_map<std::string, size_t> name_to_index;
63+
std::vector<std::reference_wrapper<const SchemaField>> full_schemafield;
64+
};
65+
Schema::Schema(std::vector<SchemaField> fields, std::optional<int32_t> schema_id)
66+
: StructType(std::move(fields)), schema_id_(schema_id) {}
3367

3468
std::optional<int32_t> Schema::schema_id() const { return schema_id_; }
3569

@@ -46,142 +80,172 @@ bool Schema::Equals(const Schema& other) const {
4680
return schema_id_ == other.schema_id_ && fields_ == other.fields_;
4781
}
4882

49-
std::optional<std::reference_wrapper<const SchemaField>> Schema::GetFieldByName(
83+
Result<std::optional<std::reference_wrapper<const SchemaField>>> Schema::FindFieldByName(
5084
std::string_view name, bool case_sensitive) const {
5185
if (case_sensitive) {
52-
InitNameToIndexMap();
86+
ICEBERG_RETURN_UNEXPECTED(InitNameToIndexMap());
5387
auto it = name_to_index_.find(std::string(name));
5488
if (it == name_to_index_.end()) return std::nullopt;
5589
return full_schemafield_[it->second];
5690
}
57-
InitLowerCaseNameToIndexMap();
91+
ICEBERG_RETURN_UNEXPECTED(InitLowerCaseNameToIndexMap());
5892
std::string lower_name(name);
5993
std::ranges::transform(lower_name, lower_name.begin(), ::tolower);
6094
auto it = lowercase_name_to_index_.find(lower_name);
6195
if (it == lowercase_name_to_index_.end()) return std::nullopt;
6296
return full_schemafield_[it->second];
6397
}
6498

65-
std::optional<std::reference_wrapper<const SchemaField>> Schema::GetFieldByName(
99+
Result<std::optional<std::reference_wrapper<const SchemaField>>> Schema::FindFieldByName(
66100
std::string_view name) const {
67-
return GetFieldByName(name, true);
101+
return FindFieldByName(name, /*case_sensitive*/ true);
68102
}
69103

70-
void Schema::InitIdToIndexMap() const {
104+
Result<Status> Schema::InitIdToIndexMap() const {
71105
if (!id_to_index_.empty()) {
72-
return;
106+
return {};
73107
}
74-
SchemaFieldVisitor visitor;
75-
auto result = VisitTypeInline(*this, &visitor, id_to_index_, full_schemafield_);
108+
bool has_init = !full_schemafield_.empty();
109+
IdVisitor visitor(has_init);
110+
ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*this, &visitor));
111+
id_to_index_ = std::move(visitor.id_to_index);
112+
if (!has_init) {
113+
full_schemafield_ = std::move(visitor.full_schemafield);
114+
}
115+
return {};
76116
}
77117

78-
void Schema::InitNameToIndexMap() const {
118+
Result<Status> Schema::InitNameToIndexMap() const {
79119
if (!name_to_index_.empty()) {
80-
return;
120+
return {};
81121
}
82-
int index = 0;
83-
std::string_view path, short_path;
84-
SchemaFieldVisitor visitor;
85-
std::unordered_map<std::string, size_t> shortname_to_index;
86-
auto tmp = VisitTypeInline(*this, &visitor, name_to_index_, path, shortname_to_index,
87-
short_path, index, true);
88-
if (!tmp.has_value()) {
89-
throw IcebergError("Failed to perform InitNameToIndexMap");
90-
}
91-
for (const auto& pair : shortname_to_index) {
92-
if (!name_to_index_.count(pair.first)) {
93-
name_to_index_.emplace(pair.first, pair.second);
94-
}
122+
bool has_init = !full_schemafield_.empty();
123+
std::string path, short_path;
124+
NameVisitor visitor(true, has_init);
125+
ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*this, &visitor, path, short_path));
126+
name_to_index_ = std::move(visitor.name_to_index);
127+
if (!has_init) {
128+
full_schemafield_ = std::move(visitor.full_schemafield);
95129
}
130+
return {};
96131
}
97132

98-
void Schema::InitLowerCaseNameToIndexMap() const {
133+
Result<Status> Schema::InitLowerCaseNameToIndexMap() const {
99134
if (!lowercase_name_to_index_.empty()) {
100-
return;
135+
return {};
101136
}
102-
int index = 0;
103-
std::string_view path, short_path;
104-
SchemaFieldVisitor visitor;
105-
std::unordered_map<std::string, size_t> shortlowercasename_to_index;
106-
auto tmp = VisitTypeInline(*this, &visitor, lowercase_name_to_index_, path,
107-
shortlowercasename_to_index, short_path, index, false);
108-
if (!tmp.has_value()) {
109-
throw IcebergError("Failed to perform InitLowerCaseNameToIndexMap");
110-
}
111-
for (const auto& pair : shortlowercasename_to_index) {
112-
if (!lowercase_name_to_index_.count(pair.first)) {
113-
lowercase_name_to_index_.emplace(pair.first, pair.second);
114-
}
137+
bool has_init = !full_schemafield_.empty();
138+
std::string path, short_path;
139+
NameVisitor visitor(false, has_init);
140+
ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*this, &visitor, path, short_path));
141+
lowercase_name_to_index_ = std::move(visitor.name_to_index);
142+
if (!has_init) {
143+
full_schemafield_ = std::move(visitor.full_schemafield);
115144
}
145+
return {};
116146
}
117147

118-
std::optional<std::reference_wrapper<const SchemaField>> Schema::GetFieldById(
148+
Result<std::optional<std::reference_wrapper<const SchemaField>>> Schema::FindFieldById(
119149
int32_t field_id) const {
120-
InitIdToIndexMap();
150+
ICEBERG_RETURN_UNEXPECTED(InitIdToIndexMap());
121151
auto it = id_to_index_.find(field_id);
122152
if (it == id_to_index_.end()) {
123153
return std::nullopt;
124154
}
125155
return full_schemafield_[it->second];
126156
}
127157

128-
Status SchemaFieldVisitor::Visit(const Type& type,
129-
std::unordered_map<int, size_t>& id_to_index,
130-
std::vector<SchemaField>& full_schemafield) {
158+
IdVisitor::IdVisitor(bool has_init_) : has_init(has_init_) {}
159+
160+
Status IdVisitor::Visit(const Type& type) {
131161
const auto& nested = iceberg::internal::checked_cast<const NestedType&>(type);
132-
for (const auto& field : nested.fields()) {
133-
id_to_index[field.field_id()] = full_schemafield.size();
134-
full_schemafield.emplace_back(field);
162+
const auto& fields = nested.fields();
163+
for (const auto& field : fields) {
164+
id_to_index[field.field_id()] = index++;
165+
if (!has_init) {
166+
full_schemafield.emplace_back(field);
167+
}
135168
if (field.type()->is_nested()) {
136-
auto tmp = Visit(*field.type(), id_to_index, full_schemafield);
137-
if (!tmp.has_value()) {
138-
throw IcebergError("Failed to perform visit(id_to_index)");
139-
}
169+
ICEBERG_RETURN_UNEXPECTED(Visit(*field.type()));
140170
}
141171
}
142172
return {};
143173
}
144-
std::string SchemaFieldVisitor::GetPath(const std::string& last_path,
145-
const std::string& field_name,
146-
bool case_sensitive) {
147-
if (case_sensitive) {
148-
return last_path.empty() ? field_name : last_path + "." + field_name;
174+
175+
NameVisitor::NameVisitor(bool case_sensitive_, bool has_init_)
176+
: case_sensitive(case_sensitive_), has_init(has_init_) {}
177+
178+
Status NameVisitor::Visit(const ListType& type, const std::string& path,
179+
const std::string& short_path) {
180+
const auto& field = type.fields()[0];
181+
std::string full_path =
182+
iceberg::GetPath(path, std::string(field.name()), case_sensitive);
183+
std::string short_full_path;
184+
if (field.type()->type_id() == TypeId::kStruct) {
185+
short_full_path = short_path;
186+
} else {
187+
short_full_path =
188+
iceberg::GetPath(short_path, std::string(field.name()), case_sensitive);
149189
}
150-
std::string lower_name(field_name);
151-
std::ranges::transform(lower_name, lower_name.begin(), ::tolower);
152-
return last_path.empty() ? lower_name : last_path + "." + lower_name;
190+
name_to_index[full_path] = index++;
191+
if (!has_init) {
192+
full_schemafield.emplace_back(field);
193+
}
194+
name_to_index.emplace(short_full_path, index - 1);
195+
if (field.type()->is_nested()) {
196+
ICEBERG_RETURN_UNEXPECTED(
197+
VisitTypeInline(*field.type(), this, full_path, short_full_path));
198+
}
199+
return {};
153200
}
154201

155-
Status SchemaFieldVisitor::Visit(
156-
const Type& type, std::unordered_map<std::string, size_t>& name_to_index,
157-
std::string_view path, std::unordered_map<std::string, size_t>& shortname_to_index,
158-
std::string_view short_path, int& index, bool case_sensitive) {
159-
const char dot = '.';
160-
const auto& nested = iceberg::internal::checked_cast<const NestedType&>(type);
161-
for (const auto& field : nested.fields()) {
162-
std::string full_path, short_full_path;
163-
full_path = GetPath(std::string(path), std::string(field.name()), case_sensitive);
164-
name_to_index[full_path] = index;
165-
166-
if (type.type_id() == TypeId::kList and field.type()->type_id() == TypeId::kStruct) {
167-
short_full_path = short_path;
168-
} else if (type.type_id() == TypeId::kMap and field.name() == "value" and
169-
field.type()->type_id() == TypeId::kStruct) {
202+
Status NameVisitor::Visit(const MapType& type, const std::string& path,
203+
const std::string& short_path) {
204+
std::string full_path, short_full_path;
205+
for (const auto& field : type.fields()) {
206+
full_path = iceberg::GetPath(path, std::string(field.name()), case_sensitive);
207+
if (field.name() == MapType::kValueName &&
208+
field.type()->type_id() == TypeId::kStruct) {
170209
short_full_path = short_path;
171210
} else {
172-
short_full_path =
173-
GetPath(std::string(short_path), std::string(field.name()), case_sensitive);
211+
short_full_path = iceberg::GetPath(path, std::string(field.name()), case_sensitive);
212+
}
213+
name_to_index[full_path] = index++;
214+
if (!has_init) {
215+
full_schemafield.emplace_back(field);
174216
}
175-
shortname_to_index[short_full_path] = index++;
217+
name_to_index.emplace(short_full_path, index - 1);
176218
if (field.type()->is_nested()) {
177-
auto tmp = Visit(*field.type(), name_to_index, full_path, shortname_to_index,
178-
short_full_path, index, case_sensitive);
179-
if (!tmp.has_value()) {
180-
throw IcebergError("Failed to perform visit(name_to_index)");
181-
}
219+
ICEBERG_RETURN_UNEXPECTED(
220+
VisitTypeInline(*field.type(), this, full_path, short_full_path));
182221
}
183222
}
184223
return {};
185224
}
186225

226+
Status NameVisitor::Visit(const StructType& type, const std::string& path,
227+
const std::string& short_path) {
228+
const auto& fields = type.fields();
229+
std::string full_path, short_full_path;
230+
for (const auto& field : fields) {
231+
full_path = iceberg::GetPath(path, std::string(field.name()), case_sensitive);
232+
short_full_path =
233+
iceberg::GetPath(short_path, std::string(field.name()), case_sensitive);
234+
name_to_index[full_path] = index++;
235+
if (!has_init) {
236+
full_schemafield.emplace_back(field);
237+
}
238+
name_to_index.emplace(short_full_path, index - 1);
239+
if (field.type()->is_nested()) {
240+
ICEBERG_RETURN_UNEXPECTED(
241+
VisitTypeInline(*field.type(), this, full_path, short_full_path));
242+
}
243+
}
244+
return {};
245+
}
246+
247+
Status NameVisitor::Visit(const PrimitiveType& type, const std::string& path,
248+
const std::string& short_path) {
249+
return {};
250+
}
187251
} // namespace iceberg

src/iceberg/schema.h

Lines changed: 22 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
#include "iceberg/iceberg_export.h"
3333
#include "iceberg/schema_field.h"
3434
#include "iceberg/type.h"
35+
#include "iceberg/util/macros.h"
3536
#include "iceberg/util/visit_type.h"
3637

3738
namespace iceberg {
@@ -56,42 +57,41 @@ class ICEBERG_EXPORT Schema : public StructType {
5657

5758
[[nodiscard]] std::string ToString() const override;
5859

59-
[[nodiscard]] std::optional<std::reference_wrapper<const SchemaField>> GetFieldByName(
60-
std::string_view name, bool case_sensitive) const override;
60+
/// \brief Find the SchemaField by name.
61+
///
62+
/// Short names for maps and lists are included for any name that does not conflict with
63+
/// a canonical name. For example, a list, 'l', of structs with field 'x' will produce
64+
/// short name 'l.x' in addition to canonical name 'l.element.x'. A map 'm', if its
65+
/// value is a struct with field 'x', will produce short name 'm.x' in addition to
66+
/// canonical name 'm.value.x'
67+
[[nodiscard]] Result<std::optional<std::reference_wrapper<const SchemaField>>>
68+
FindFieldByName(std::string_view name, bool case_sensitive) const;
6169

62-
[[nodiscard]] std::optional<std::reference_wrapper<const SchemaField>> GetFieldByName(
63-
std::string_view name) const;
70+
[[nodiscard]] Result<std::optional<std::reference_wrapper<const SchemaField>>>
71+
FindFieldByName(std::string_view name) const;
6472

65-
[[nodiscard]] std::optional<std::reference_wrapper<const SchemaField>> GetFieldById(
66-
int32_t field_id) const override;
73+
[[nodiscard]] Result<std::optional<std::reference_wrapper<const SchemaField>>>
74+
FindFieldById(int32_t field_id) const;
6775

6876
friend bool operator==(const Schema& lhs, const Schema& rhs) { return lhs.Equals(rhs); }
6977

78+
/// Mapping from field id to index of `full_schemafield_`.
7079
mutable std::unordered_map<int, size_t> id_to_index_;
80+
/// Mapping from field name to index of `full_schemafield_`.
7181
mutable std::unordered_map<std::string, size_t> name_to_index_;
82+
/// Mapping from lowercase field name (supports case-insensitive queries) to index of
83+
/// `full_schemafield_`.
7284
mutable std::unordered_map<std::string, size_t> lowercase_name_to_index_;
73-
mutable std::vector<SchemaField> full_schemafield_;
85+
mutable std::vector<std::reference_wrapper<const SchemaField>> full_schemafield_;
7486

7587
private:
7688
/// \brief Compare two schemas for equality.
7789
[[nodiscard]] bool Equals(const Schema& other) const;
7890

7991
const std::optional<int32_t> schema_id_;
8092

81-
void InitIdToIndexMap() const;
82-
void InitNameToIndexMap() const;
83-
void InitLowerCaseNameToIndexMap() const;
84-
};
85-
86-
class SchemaFieldVisitor {
87-
public:
88-
Status Visit(const Type& type, std::unordered_map<int, size_t>& id_to_index,
89-
std::vector<SchemaField>& full_schemafield);
90-
std::string GetPath(const std::string& last_path, const std::string& field_name,
91-
bool case_sensitive);
92-
Status Visit(const Type& type, std::unordered_map<std::string, size_t>& name_to_index,
93-
std::string_view path,
94-
std::unordered_map<std::string, size_t>& shortname_to_index,
95-
std::string_view short_path, int& index, bool case_sensitive);
93+
Result<Status> InitIdToIndexMap() const;
94+
Result<Status> InitNameToIndexMap() const;
95+
Result<Status> InitLowerCaseNameToIndexMap() const;
9696
};
9797
} // namespace iceberg

0 commit comments

Comments
 (0)