Skip to content

Commit 0394c01

Browse files
author
nullccxsy
committed
refactor(schema): optimize IdVisitor & merge id/index maps to align Java impl
1. Refactor IdVisitor: Add VisitNested() to handle NestedType, make Visit() generic for all Type hierarchy (avoid anti-pattern) 2. Merge id_to_index_ + full_schemafield_ into id_to_field_ (std::unordered_map<int32_t, const ref<SchemaField>>) to reduce memory footprint 3. Align with Java impl's idToField design (support future alias/identifier fields without refactor)
1 parent 777190b commit 0394c01

File tree

5 files changed

+104
-163
lines changed

5 files changed

+104
-163
lines changed

src/iceberg/schema.cc

Lines changed: 85 additions & 107 deletions
Original file line numberDiff line numberDiff line change
@@ -19,34 +19,28 @@
1919

2020
#include "iceberg/schema.h"
2121

22+
#include <algorithm>
2223
#include <format>
2324

2425
#include "iceberg/type.h"
2526
#include "iceberg/util/formatter.h" // IWYU pragma: keep
27+
#include "iceberg/util/macros.h"
2628
namespace iceberg {
2729
class IdVisitor {
2830
public:
29-
explicit IdVisitor(bool has_init_ = false);
31+
explicit IdVisitor(
32+
std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>>&
33+
id_to_field);
3034
Status Visit(const Type& type);
35+
Status VisitNestedType(const Type& type);
3136

32-
bool has_init;
33-
int index = 0;
34-
std::unordered_map<int, size_t> id_to_index;
35-
std::vector<std::reference_wrapper<const SchemaField>> full_schemafield;
37+
private:
38+
std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>>& id_to_field_;
3639
};
37-
38-
std::string GetPath(const std::string& last_path, const std::string& field_name,
39-
bool case_sensitive) {
40-
if (case_sensitive) {
41-
return last_path.empty() ? field_name : last_path + "." + field_name;
42-
}
43-
std::string lower_name(field_name);
44-
std::ranges::transform(lower_name, lower_name.begin(), ::tolower);
45-
return last_path.empty() ? lower_name : last_path + "." + lower_name;
46-
}
47-
class NameVisitor {
40+
class NametoIdVisitor {
4841
public:
49-
explicit NameVisitor(bool case_sensitive_ = true, bool has_init_ = false);
42+
explicit NametoIdVisitor(std::unordered_map<std::string, size_t>& name_to_id,
43+
bool case_sensitive_ = true);
5044
Status Visit(const ListType& type, const std::string& path,
5145
const std::string& short_path);
5246
Status Visit(const MapType& type, const std::string& path,
@@ -55,12 +49,12 @@ class NameVisitor {
5549
const std::string& short_path);
5650
Status Visit(const PrimitiveType& type, const std::string& path,
5751
const std::string& short_path);
52+
static std::string GetPath(const std::string& last_path, const std::string& field_name,
53+
bool case_sensitive = true);
5854

59-
int index = 0;
60-
bool case_sensitive;
61-
bool has_init;
62-
std::unordered_map<std::string, size_t> name_to_index;
63-
std::vector<std::reference_wrapper<const SchemaField>> full_schemafield;
55+
private:
56+
bool case_sensitive_;
57+
std::unordered_map<std::string, size_t>& name_to_id_;
6458
};
6559
Schema::Schema(std::vector<SchemaField> fields, std::optional<int32_t> schema_id)
6660
: StructType(std::move(fields)), schema_id_(schema_id) {}
@@ -84,16 +78,16 @@ Result<std::optional<std::reference_wrapper<const SchemaField>>> Schema::FindFie
8478
std::string_view name, bool case_sensitive) const {
8579
if (case_sensitive) {
8680
ICEBERG_RETURN_UNEXPECTED(InitNameToIndexMap());
87-
auto it = name_to_index_.find(std::string(name));
88-
if (it == name_to_index_.end()) return std::nullopt;
89-
return full_schemafield_[it->second];
81+
auto it = name_to_id_.find(std::string(name));
82+
if (it == name_to_id_.end()) return std::nullopt;
83+
return FindFieldById(it->second);
9084
}
9185
ICEBERG_RETURN_UNEXPECTED(InitLowerCaseNameToIndexMap());
9286
std::string lower_name(name);
9387
std::ranges::transform(lower_name, lower_name.begin(), ::tolower);
94-
auto it = lowercase_name_to_index_.find(lower_name);
95-
if (it == lowercase_name_to_index_.end()) return std::nullopt;
96-
return full_schemafield_[it->second];
88+
auto it = lowercase_name_to_id_.find(lower_name);
89+
if (it == lowercase_name_to_id_.end()) return std::nullopt;
90+
return FindFieldById(it->second);
9791
}
9892

9993
Result<std::optional<std::reference_wrapper<const SchemaField>>> Schema::FindFieldByName(
@@ -102,150 +96,134 @@ Result<std::optional<std::reference_wrapper<const SchemaField>>> Schema::FindFie
10296
}
10397

10498
Result<Status> Schema::InitIdToIndexMap() const {
105-
if (!id_to_index_.empty()) {
99+
if (!id_to_field_.empty()) {
106100
return {};
107101
}
108-
bool has_init = !full_schemafield_.empty();
109-
IdVisitor visitor(has_init);
102+
IdVisitor visitor(id_to_field_);
110103
ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*this, &visitor));
111-
id_to_index_ = std::move(visitor.id_to_index);
112-
if (!has_init) {
113-
full_schemafield_ = std::move(visitor.full_schemafield);
114-
}
115104
return {};
116105
}
117106

118107
Result<Status> Schema::InitNameToIndexMap() const {
119-
if (!name_to_index_.empty()) {
108+
if (!name_to_id_.empty()) {
120109
return {};
121110
}
122-
bool has_init = !full_schemafield_.empty();
123111
std::string path, short_path;
124-
NameVisitor visitor(true, has_init);
112+
NametoIdVisitor visitor(name_to_id_, true);
125113
ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*this, &visitor, path, short_path));
126-
name_to_index_ = std::move(visitor.name_to_index);
127-
if (!has_init) {
128-
full_schemafield_ = std::move(visitor.full_schemafield);
129-
}
130114
return {};
131115
}
132116

133117
Result<Status> Schema::InitLowerCaseNameToIndexMap() const {
134-
if (!lowercase_name_to_index_.empty()) {
118+
if (!lowercase_name_to_id_.empty()) {
135119
return {};
136120
}
137-
bool has_init = !full_schemafield_.empty();
138121
std::string path, short_path;
139-
NameVisitor visitor(false, has_init);
122+
NametoIdVisitor visitor(lowercase_name_to_id_, false);
140123
ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*this, &visitor, path, short_path));
141-
lowercase_name_to_index_ = std::move(visitor.name_to_index);
142-
if (!has_init) {
143-
full_schemafield_ = std::move(visitor.full_schemafield);
144-
}
145124
return {};
146125
}
147126

148127
Result<std::optional<std::reference_wrapper<const SchemaField>>> Schema::FindFieldById(
149128
int32_t field_id) const {
150129
ICEBERG_RETURN_UNEXPECTED(InitIdToIndexMap());
151-
auto it = id_to_index_.find(field_id);
152-
if (it == id_to_index_.end()) {
130+
auto it = id_to_field_.find(field_id);
131+
if (it == id_to_field_.end()) {
153132
return std::nullopt;
154133
}
155-
return full_schemafield_[it->second];
134+
return it->second;
156135
}
157136

158-
IdVisitor::IdVisitor(bool has_init_) : has_init(has_init_) {}
137+
IdVisitor::IdVisitor(
138+
std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>>& id_to_field)
139+
: id_to_field_(id_to_field) {}
159140

160141
Status IdVisitor::Visit(const Type& type) {
142+
if (type.is_nested()) {
143+
ICEBERG_RETURN_UNEXPECTED(VisitNestedType(type));
144+
}
145+
return {};
146+
}
147+
148+
Status IdVisitor::VisitNestedType(const Type& type) {
161149
const auto& nested = iceberg::internal::checked_cast<const NestedType&>(type);
162150
const auto& fields = nested.fields();
163151
for (const auto& field : fields) {
164-
id_to_index[field.field_id()] = index++;
165-
if (!has_init) {
166-
full_schemafield.emplace_back(field);
167-
}
168-
if (field.type()->is_nested()) {
169-
ICEBERG_RETURN_UNEXPECTED(Visit(*field.type()));
170-
}
152+
id_to_field_.emplace(field.field_id(), std::cref(field));
153+
ICEBERG_RETURN_UNEXPECTED(Visit(*field.type()));
171154
}
172155
return {};
173156
}
174157

175-
NameVisitor::NameVisitor(bool case_sensitive_, bool has_init_)
176-
: case_sensitive(case_sensitive_), has_init(has_init_) {}
158+
NametoIdVisitor::NametoIdVisitor(std::unordered_map<std::string, size_t>& name_to_id,
159+
bool case_sensitive)
160+
: name_to_id_(name_to_id), case_sensitive_(case_sensitive) {}
177161

178-
Status NameVisitor::Visit(const ListType& type, const std::string& path,
179-
const std::string& short_path) {
162+
Status NametoIdVisitor::Visit(const ListType& type, const std::string& path,
163+
const std::string& short_path) {
180164
const auto& field = type.fields()[0];
181-
std::string full_path =
182-
iceberg::GetPath(path, std::string(field.name()), case_sensitive);
165+
std::string full_path = GetPath(path, std::string(field.name()), case_sensitive_);
183166
std::string short_full_path;
184167
if (field.type()->type_id() == TypeId::kStruct) {
185168
short_full_path = short_path;
186169
} else {
187-
short_full_path =
188-
iceberg::GetPath(short_path, std::string(field.name()), case_sensitive);
189-
}
190-
name_to_index[full_path] = index++;
191-
if (!has_init) {
192-
full_schemafield.emplace_back(field);
193-
}
194-
name_to_index.emplace(short_full_path, index - 1);
195-
if (field.type()->is_nested()) {
196-
ICEBERG_RETURN_UNEXPECTED(
197-
VisitTypeInline(*field.type(), this, full_path, short_full_path));
170+
short_full_path = GetPath(short_path, std::string(field.name()), case_sensitive_);
198171
}
172+
name_to_id_[full_path] = field.field_id();
173+
name_to_id_.emplace(short_full_path, field.field_id());
174+
ICEBERG_RETURN_UNEXPECTED(
175+
VisitTypeInline(*field.type(), this, full_path, short_full_path));
199176
return {};
200177
}
201178

202-
Status NameVisitor::Visit(const MapType& type, const std::string& path,
203-
const std::string& short_path) {
179+
Status NametoIdVisitor::Visit(const MapType& type, const std::string& path,
180+
const std::string& short_path) {
204181
std::string full_path, short_full_path;
205-
for (const auto& field : type.fields()) {
206-
full_path = iceberg::GetPath(path, std::string(field.name()), case_sensitive);
182+
const auto& fields = type.fields();
183+
for (const auto& field : fields) {
184+
full_path = GetPath(path, std::string(field.name()), case_sensitive_);
207185
if (field.name() == MapType::kValueName &&
208186
field.type()->type_id() == TypeId::kStruct) {
209187
short_full_path = short_path;
210188
} else {
211-
short_full_path = iceberg::GetPath(path, std::string(field.name()), case_sensitive);
212-
}
213-
name_to_index[full_path] = index++;
214-
if (!has_init) {
215-
full_schemafield.emplace_back(field);
216-
}
217-
name_to_index.emplace(short_full_path, index - 1);
218-
if (field.type()->is_nested()) {
219-
ICEBERG_RETURN_UNEXPECTED(
220-
VisitTypeInline(*field.type(), this, full_path, short_full_path));
189+
short_full_path = GetPath(path, std::string(field.name()), case_sensitive_);
221190
}
191+
name_to_id_[full_path] = field.field_id();
192+
name_to_id_.emplace(short_full_path, field.field_id());
193+
ICEBERG_RETURN_UNEXPECTED(
194+
VisitTypeInline(*field.type(), this, full_path, short_full_path));
222195
}
223196
return {};
224197
}
225198

226-
Status NameVisitor::Visit(const StructType& type, const std::string& path,
227-
const std::string& short_path) {
199+
Status NametoIdVisitor::Visit(const StructType& type, const std::string& path,
200+
const std::string& short_path) {
228201
const auto& fields = type.fields();
229202
std::string full_path, short_full_path;
230203
for (const auto& field : fields) {
231-
full_path = iceberg::GetPath(path, std::string(field.name()), case_sensitive);
232-
short_full_path =
233-
iceberg::GetPath(short_path, std::string(field.name()), case_sensitive);
234-
name_to_index[full_path] = index++;
235-
if (!has_init) {
236-
full_schemafield.emplace_back(field);
237-
}
238-
name_to_index.emplace(short_full_path, index - 1);
239-
if (field.type()->is_nested()) {
240-
ICEBERG_RETURN_UNEXPECTED(
241-
VisitTypeInline(*field.type(), this, full_path, short_full_path));
242-
}
204+
full_path =
205+
NametoIdVisitor::GetPath(path, std::string(field.name()), case_sensitive_);
206+
short_full_path = GetPath(short_path, std::string(field.name()), case_sensitive_);
207+
name_to_id_[full_path] = field.field_id();
208+
name_to_id_.emplace(short_full_path, field.field_id());
209+
ICEBERG_RETURN_UNEXPECTED(
210+
VisitTypeInline(*field.type(), this, full_path, short_full_path));
243211
}
244212
return {};
245213
}
246214

247-
Status NameVisitor::Visit(const PrimitiveType& type, const std::string& path,
248-
const std::string& short_path) {
215+
Status NametoIdVisitor::Visit(const PrimitiveType& type, const std::string& path,
216+
const std::string& short_path) {
249217
return {};
250218
}
219+
220+
std::string NametoIdVisitor::GetPath(const std::string& last_path,
221+
const std::string& field_name, bool case_sensitive) {
222+
if (case_sensitive) {
223+
return last_path.empty() ? field_name : last_path + "." + field_name;
224+
}
225+
std::string lower_name(field_name);
226+
std::ranges::transform(lower_name, lower_name.begin(), ::tolower);
227+
return last_path.empty() ? lower_name : last_path + "." + lower_name;
228+
}
251229
} // namespace iceberg

src/iceberg/schema.h

Lines changed: 8 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@
2323
/// Schemas for Iceberg tables. This header contains the definition of Schema
2424
/// and any utility functions. See iceberg/type.h and iceberg/field.h as well.
2525

26-
#include <algorithm>
2726
#include <cstdint>
2827
#include <optional>
2928
#include <string>
@@ -32,7 +31,6 @@
3231
#include "iceberg/iceberg_export.h"
3332
#include "iceberg/schema_field.h"
3433
#include "iceberg/type.h"
35-
#include "iceberg/util/macros.h"
3634
#include "iceberg/util/visit_type.h"
3735

3836
namespace iceberg {
@@ -57,7 +55,7 @@ class ICEBERG_EXPORT Schema : public StructType {
5755

5856
[[nodiscard]] std::string ToString() const override;
5957

60-
///\brief Get thd SchemaField By Name
58+
///\brief Find thd SchemaField By field name
6159
///
6260
/// Short names for maps and lists are included for any name that does not conflict with
6361
/// a canonical name. For example, a list, 'l', of structs with field 'x' will produce
@@ -75,14 +73,13 @@ class ICEBERG_EXPORT Schema : public StructType {
7573

7674
friend bool operator==(const Schema& lhs, const Schema& rhs) { return lhs.Equals(rhs); }
7775

78-
/// Mapping from field id to index of `full_schemafield_`.
79-
mutable std::unordered_map<int, size_t> id_to_index_;
80-
/// Mapping from field name to index of `full_schemafield_`.
81-
mutable std::unordered_map<std::string, size_t> name_to_index_;
82-
/// Mapping from field lowercase_name(suppoert case_insensitive query) to index of
83-
/// `full_schemafield_`.
84-
mutable std::unordered_map<std::string, size_t> lowercase_name_to_index_;
85-
mutable std::vector<std::reference_wrapper<const SchemaField>> full_schemafield_;
76+
/// Mapping from field id to field.
77+
mutable std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>>
78+
id_to_field_;
79+
/// Mapping from field name to id of field.
80+
mutable std::unordered_map<std::string, size_t> name_to_id_;
81+
/// Mapping from field lowercase_name(suppoert case_insensitive query) to id of field
82+
mutable std::unordered_map<std::string, size_t> lowercase_name_to_id_;
8683

8784
private:
8885
/// \brief Compare two schemas for equality.

0 commit comments

Comments
 (0)