Skip to content

Commit ef4b2de

Browse files
author
nullccxsy
committed
refactor(schema): optimize IdVisitor & merge id/index maps to align Java impl
1. Refactor IdVisitor: Add VisitNested() to handle NestedType, make Visit() generic for all Type hierarchy (avoid anti-pattern) 2. Merge id_to_index_ + full_schemafield_ into id_to_field_ (std::unordered_map<int32_t, const ref<SchemaField>>) to reduce memory footprint 3. Align with Java impl's idToField design (support future alias/identifier fields without refactor)
1 parent 777190b commit ef4b2de

File tree

5 files changed

+106
-164
lines changed

5 files changed

+106
-164
lines changed

src/iceberg/schema.cc

Lines changed: 86 additions & 107 deletions
Original file line numberDiff line numberDiff line change
@@ -19,34 +19,29 @@
1919

2020
#include "iceberg/schema.h"
2121

22+
#include <algorithm>
2223
#include <format>
2324

2425
#include "iceberg/type.h"
2526
#include "iceberg/util/formatter.h" // IWYU pragma: keep
27+
#include "iceberg/util/macros.h"
28+
#include "iceberg/util/visit_type.h"
2629
namespace iceberg {
2730
class IdVisitor {
2831
public:
29-
explicit IdVisitor(bool has_init_ = false);
32+
explicit IdVisitor(
33+
std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>>&
34+
id_to_field);
3035
Status Visit(const Type& type);
36+
Status VisitNestedType(const Type& type);
3137

32-
bool has_init;
33-
int index = 0;
34-
std::unordered_map<int, size_t> id_to_index;
35-
std::vector<std::reference_wrapper<const SchemaField>> full_schemafield;
38+
private:
39+
std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>>& id_to_field_;
3640
};
37-
38-
std::string GetPath(const std::string& last_path, const std::string& field_name,
39-
bool case_sensitive) {
40-
if (case_sensitive) {
41-
return last_path.empty() ? field_name : last_path + "." + field_name;
42-
}
43-
std::string lower_name(field_name);
44-
std::ranges::transform(lower_name, lower_name.begin(), ::tolower);
45-
return last_path.empty() ? lower_name : last_path + "." + lower_name;
46-
}
47-
class NameVisitor {
41+
class NametoIdVisitor {
4842
public:
49-
explicit NameVisitor(bool case_sensitive_ = true, bool has_init_ = false);
43+
explicit NametoIdVisitor(std::unordered_map<std::string, size_t>& name_to_id,
44+
bool case_sensitive_ = true);
5045
Status Visit(const ListType& type, const std::string& path,
5146
const std::string& short_path);
5247
Status Visit(const MapType& type, const std::string& path,
@@ -55,12 +50,12 @@ class NameVisitor {
5550
const std::string& short_path);
5651
Status Visit(const PrimitiveType& type, const std::string& path,
5752
const std::string& short_path);
53+
static std::string GetPath(const std::string& last_path, const std::string& field_name,
54+
bool case_sensitive = true);
5855

59-
int index = 0;
60-
bool case_sensitive;
61-
bool has_init;
62-
std::unordered_map<std::string, size_t> name_to_index;
63-
std::vector<std::reference_wrapper<const SchemaField>> full_schemafield;
56+
private:
57+
bool case_sensitive_;
58+
std::unordered_map<std::string, size_t>& name_to_id_;
6459
};
6560
Schema::Schema(std::vector<SchemaField> fields, std::optional<int32_t> schema_id)
6661
: StructType(std::move(fields)), schema_id_(schema_id) {}
@@ -84,16 +79,16 @@ Result<std::optional<std::reference_wrapper<const SchemaField>>> Schema::FindFie
8479
std::string_view name, bool case_sensitive) const {
8580
if (case_sensitive) {
8681
ICEBERG_RETURN_UNEXPECTED(InitNameToIndexMap());
87-
auto it = name_to_index_.find(std::string(name));
88-
if (it == name_to_index_.end()) return std::nullopt;
89-
return full_schemafield_[it->second];
82+
auto it = name_to_id_.find(std::string(name));
83+
if (it == name_to_id_.end()) return std::nullopt;
84+
return FindFieldById(it->second);
9085
}
9186
ICEBERG_RETURN_UNEXPECTED(InitLowerCaseNameToIndexMap());
9287
std::string lower_name(name);
9388
std::ranges::transform(lower_name, lower_name.begin(), ::tolower);
94-
auto it = lowercase_name_to_index_.find(lower_name);
95-
if (it == lowercase_name_to_index_.end()) return std::nullopt;
96-
return full_schemafield_[it->second];
89+
auto it = lowercase_name_to_id_.find(lower_name);
90+
if (it == lowercase_name_to_id_.end()) return std::nullopt;
91+
return FindFieldById(it->second);
9792
}
9893

9994
Result<std::optional<std::reference_wrapper<const SchemaField>>> Schema::FindFieldByName(
@@ -102,150 +97,134 @@ Result<std::optional<std::reference_wrapper<const SchemaField>>> Schema::FindFie
10297
}
10398

10499
Result<Status> Schema::InitIdToIndexMap() const {
105-
if (!id_to_index_.empty()) {
100+
if (!id_to_field_.empty()) {
106101
return {};
107102
}
108-
bool has_init = !full_schemafield_.empty();
109-
IdVisitor visitor(has_init);
103+
IdVisitor visitor(id_to_field_);
110104
ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*this, &visitor));
111-
id_to_index_ = std::move(visitor.id_to_index);
112-
if (!has_init) {
113-
full_schemafield_ = std::move(visitor.full_schemafield);
114-
}
115105
return {};
116106
}
117107

118108
Result<Status> Schema::InitNameToIndexMap() const {
119-
if (!name_to_index_.empty()) {
109+
if (!name_to_id_.empty()) {
120110
return {};
121111
}
122-
bool has_init = !full_schemafield_.empty();
123112
std::string path, short_path;
124-
NameVisitor visitor(true, has_init);
113+
NametoIdVisitor visitor(name_to_id_, true);
125114
ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*this, &visitor, path, short_path));
126-
name_to_index_ = std::move(visitor.name_to_index);
127-
if (!has_init) {
128-
full_schemafield_ = std::move(visitor.full_schemafield);
129-
}
130115
return {};
131116
}
132117

133118
Result<Status> Schema::InitLowerCaseNameToIndexMap() const {
134-
if (!lowercase_name_to_index_.empty()) {
119+
if (!lowercase_name_to_id_.empty()) {
135120
return {};
136121
}
137-
bool has_init = !full_schemafield_.empty();
138122
std::string path, short_path;
139-
NameVisitor visitor(false, has_init);
123+
NametoIdVisitor visitor(lowercase_name_to_id_, false);
140124
ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*this, &visitor, path, short_path));
141-
lowercase_name_to_index_ = std::move(visitor.name_to_index);
142-
if (!has_init) {
143-
full_schemafield_ = std::move(visitor.full_schemafield);
144-
}
145125
return {};
146126
}
147127

148128
Result<std::optional<std::reference_wrapper<const SchemaField>>> Schema::FindFieldById(
149129
int32_t field_id) const {
150130
ICEBERG_RETURN_UNEXPECTED(InitIdToIndexMap());
151-
auto it = id_to_index_.find(field_id);
152-
if (it == id_to_index_.end()) {
131+
auto it = id_to_field_.find(field_id);
132+
if (it == id_to_field_.end()) {
153133
return std::nullopt;
154134
}
155-
return full_schemafield_[it->second];
135+
return it->second;
156136
}
157137

158-
IdVisitor::IdVisitor(bool has_init_) : has_init(has_init_) {}
138+
IdVisitor::IdVisitor(
139+
std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>>& id_to_field)
140+
: id_to_field_(id_to_field) {}
159141

160142
Status IdVisitor::Visit(const Type& type) {
143+
if (type.is_nested()) {
144+
ICEBERG_RETURN_UNEXPECTED(VisitNestedType(type));
145+
}
146+
return {};
147+
}
148+
149+
Status IdVisitor::VisitNestedType(const Type& type) {
161150
const auto& nested = iceberg::internal::checked_cast<const NestedType&>(type);
162151
const auto& fields = nested.fields();
163152
for (const auto& field : fields) {
164-
id_to_index[field.field_id()] = index++;
165-
if (!has_init) {
166-
full_schemafield.emplace_back(field);
167-
}
168-
if (field.type()->is_nested()) {
169-
ICEBERG_RETURN_UNEXPECTED(Visit(*field.type()));
170-
}
153+
id_to_field_.emplace(field.field_id(), std::cref(field));
154+
ICEBERG_RETURN_UNEXPECTED(Visit(*field.type()));
171155
}
172156
return {};
173157
}
174158

175-
NameVisitor::NameVisitor(bool case_sensitive_, bool has_init_)
176-
: case_sensitive(case_sensitive_), has_init(has_init_) {}
159+
NametoIdVisitor::NametoIdVisitor(std::unordered_map<std::string, size_t>& name_to_id,
160+
bool case_sensitive)
161+
: name_to_id_(name_to_id), case_sensitive_(case_sensitive) {}
177162

178-
Status NameVisitor::Visit(const ListType& type, const std::string& path,
179-
const std::string& short_path) {
163+
Status NametoIdVisitor::Visit(const ListType& type, const std::string& path,
164+
const std::string& short_path) {
180165
const auto& field = type.fields()[0];
181-
std::string full_path =
182-
iceberg::GetPath(path, std::string(field.name()), case_sensitive);
166+
std::string full_path = GetPath(path, std::string(field.name()), case_sensitive_);
183167
std::string short_full_path;
184168
if (field.type()->type_id() == TypeId::kStruct) {
185169
short_full_path = short_path;
186170
} else {
187-
short_full_path =
188-
iceberg::GetPath(short_path, std::string(field.name()), case_sensitive);
189-
}
190-
name_to_index[full_path] = index++;
191-
if (!has_init) {
192-
full_schemafield.emplace_back(field);
193-
}
194-
name_to_index.emplace(short_full_path, index - 1);
195-
if (field.type()->is_nested()) {
196-
ICEBERG_RETURN_UNEXPECTED(
197-
VisitTypeInline(*field.type(), this, full_path, short_full_path));
171+
short_full_path = GetPath(short_path, std::string(field.name()), case_sensitive_);
198172
}
173+
name_to_id_[full_path] = field.field_id();
174+
name_to_id_.emplace(short_full_path, field.field_id());
175+
ICEBERG_RETURN_UNEXPECTED(
176+
VisitTypeInline(*field.type(), this, full_path, short_full_path));
199177
return {};
200178
}
201179

202-
Status NameVisitor::Visit(const MapType& type, const std::string& path,
203-
const std::string& short_path) {
180+
Status NametoIdVisitor::Visit(const MapType& type, const std::string& path,
181+
const std::string& short_path) {
204182
std::string full_path, short_full_path;
205-
for (const auto& field : type.fields()) {
206-
full_path = iceberg::GetPath(path, std::string(field.name()), case_sensitive);
183+
const auto& fields = type.fields();
184+
for (const auto& field : fields) {
185+
full_path = GetPath(path, std::string(field.name()), case_sensitive_);
207186
if (field.name() == MapType::kValueName &&
208187
field.type()->type_id() == TypeId::kStruct) {
209188
short_full_path = short_path;
210189
} else {
211-
short_full_path = iceberg::GetPath(path, std::string(field.name()), case_sensitive);
212-
}
213-
name_to_index[full_path] = index++;
214-
if (!has_init) {
215-
full_schemafield.emplace_back(field);
216-
}
217-
name_to_index.emplace(short_full_path, index - 1);
218-
if (field.type()->is_nested()) {
219-
ICEBERG_RETURN_UNEXPECTED(
220-
VisitTypeInline(*field.type(), this, full_path, short_full_path));
190+
short_full_path = GetPath(path, std::string(field.name()), case_sensitive_);
221191
}
192+
name_to_id_[full_path] = field.field_id();
193+
name_to_id_.emplace(short_full_path, field.field_id());
194+
ICEBERG_RETURN_UNEXPECTED(
195+
VisitTypeInline(*field.type(), this, full_path, short_full_path));
222196
}
223197
return {};
224198
}
225199

226-
Status NameVisitor::Visit(const StructType& type, const std::string& path,
227-
const std::string& short_path) {
200+
Status NametoIdVisitor::Visit(const StructType& type, const std::string& path,
201+
const std::string& short_path) {
228202
const auto& fields = type.fields();
229203
std::string full_path, short_full_path;
230204
for (const auto& field : fields) {
231-
full_path = iceberg::GetPath(path, std::string(field.name()), case_sensitive);
232-
short_full_path =
233-
iceberg::GetPath(short_path, std::string(field.name()), case_sensitive);
234-
name_to_index[full_path] = index++;
235-
if (!has_init) {
236-
full_schemafield.emplace_back(field);
237-
}
238-
name_to_index.emplace(short_full_path, index - 1);
239-
if (field.type()->is_nested()) {
240-
ICEBERG_RETURN_UNEXPECTED(
241-
VisitTypeInline(*field.type(), this, full_path, short_full_path));
242-
}
205+
full_path =
206+
NametoIdVisitor::GetPath(path, std::string(field.name()), case_sensitive_);
207+
short_full_path = GetPath(short_path, std::string(field.name()), case_sensitive_);
208+
name_to_id_[full_path] = field.field_id();
209+
name_to_id_.emplace(short_full_path, field.field_id());
210+
ICEBERG_RETURN_UNEXPECTED(
211+
VisitTypeInline(*field.type(), this, full_path, short_full_path));
243212
}
244213
return {};
245214
}
246215

247-
Status NameVisitor::Visit(const PrimitiveType& type, const std::string& path,
248-
const std::string& short_path) {
216+
Status NametoIdVisitor::Visit(const PrimitiveType& type, const std::string& path,
217+
const std::string& short_path) {
249218
return {};
250219
}
220+
221+
std::string NametoIdVisitor::GetPath(const std::string& last_path,
222+
const std::string& field_name, bool case_sensitive) {
223+
if (case_sensitive) {
224+
return last_path.empty() ? field_name : last_path + "." + field_name;
225+
}
226+
std::string lower_name(field_name);
227+
std::ranges::transform(lower_name, lower_name.begin(), ::tolower);
228+
return last_path.empty() ? lower_name : last_path + "." + lower_name;
229+
}
251230
} // namespace iceberg

src/iceberg/schema.h

Lines changed: 9 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -23,17 +23,15 @@
2323
/// Schemas for Iceberg tables. This header contains the definition of Schema
2424
/// and any utility functions. See iceberg/type.h and iceberg/field.h as well.
2525

26-
#include <algorithm>
2726
#include <cstdint>
2827
#include <optional>
2928
#include <string>
3029
#include <vector>
3130

3231
#include "iceberg/iceberg_export.h"
32+
#include "iceberg/result.h"
3333
#include "iceberg/schema_field.h"
3434
#include "iceberg/type.h"
35-
#include "iceberg/util/macros.h"
36-
#include "iceberg/util/visit_type.h"
3735

3836
namespace iceberg {
3937

@@ -57,7 +55,7 @@ class ICEBERG_EXPORT Schema : public StructType {
5755

5856
[[nodiscard]] std::string ToString() const override;
5957

60-
///\brief Get thd SchemaField By Name
58+
///\brief Find thd SchemaField By field name
6159
///
6260
/// Short names for maps and lists are included for any name that does not conflict with
6361
/// a canonical name. For example, a list, 'l', of structs with field 'x' will produce
@@ -75,14 +73,13 @@ class ICEBERG_EXPORT Schema : public StructType {
7573

7674
friend bool operator==(const Schema& lhs, const Schema& rhs) { return lhs.Equals(rhs); }
7775

78-
/// Mapping from field id to index of `full_schemafield_`.
79-
mutable std::unordered_map<int, size_t> id_to_index_;
80-
/// Mapping from field name to index of `full_schemafield_`.
81-
mutable std::unordered_map<std::string, size_t> name_to_index_;
82-
/// Mapping from field lowercase_name(suppoert case_insensitive query) to index of
83-
/// `full_schemafield_`.
84-
mutable std::unordered_map<std::string, size_t> lowercase_name_to_index_;
85-
mutable std::vector<std::reference_wrapper<const SchemaField>> full_schemafield_;
76+
/// Mapping from field id to field.
77+
mutable std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>>
78+
id_to_field_;
79+
/// Mapping from field name to id of field.
80+
mutable std::unordered_map<std::string, size_t> name_to_id_;
81+
/// Mapping from field lowercase_name(suppoert case_insensitive query) to id of field
82+
mutable std::unordered_map<std::string, size_t> lowercase_name_to_id_;
8683

8784
private:
8885
/// \brief Compare two schemas for equality.

0 commit comments

Comments
 (0)