Skip to content

Commit bfac669

Browse files
author
nullccxsy
committed
refactor(schema): optimize IdVisitor & merge id/index maps to align Java impl
1. Refactor IdVisitor: Add VisitNested() to handle NestedType, make Visit() generic for all Type hierarchy (avoid anti-pattern) 2. Merge id_to_index_ + full_schemafield_ into id_to_field_ (std::unordered_map<int32_t, const ref<SchemaField>>) to reduce memory footprint 3. Align with Java impl's idToField design (support future alias/identifier fields without refactor)
1 parent 777190b commit bfac669

File tree

5 files changed

+103
-163
lines changed

5 files changed

+103
-163
lines changed

src/iceberg/schema.cc

Lines changed: 84 additions & 107 deletions
Original file line numberDiff line numberDiff line change
@@ -23,30 +23,23 @@
2323

2424
#include "iceberg/type.h"
2525
#include "iceberg/util/formatter.h" // IWYU pragma: keep
26+
#include "iceberg/util/macros.h"
2627
namespace iceberg {
2728
class IdVisitor {
2829
public:
29-
explicit IdVisitor(bool has_init_ = false);
30+
explicit IdVisitor(
31+
std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>>&
32+
id_to_field);
3033
Status Visit(const Type& type);
34+
Status VisitNestedType(const Type& type);
3135

32-
bool has_init;
33-
int index = 0;
34-
std::unordered_map<int, size_t> id_to_index;
35-
std::vector<std::reference_wrapper<const SchemaField>> full_schemafield;
36+
private:
37+
std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>>& id_to_field_;
3638
};
37-
38-
std::string GetPath(const std::string& last_path, const std::string& field_name,
39-
bool case_sensitive) {
40-
if (case_sensitive) {
41-
return last_path.empty() ? field_name : last_path + "." + field_name;
42-
}
43-
std::string lower_name(field_name);
44-
std::ranges::transform(lower_name, lower_name.begin(), ::tolower);
45-
return last_path.empty() ? lower_name : last_path + "." + lower_name;
46-
}
47-
class NameVisitor {
39+
class NametoIdVisitor {
4840
public:
49-
explicit NameVisitor(bool case_sensitive_ = true, bool has_init_ = false);
41+
explicit NametoIdVisitor(std::unordered_map<std::string, size_t>& name_to_id,
42+
bool case_sensitive_ = true);
5043
Status Visit(const ListType& type, const std::string& path,
5144
const std::string& short_path);
5245
Status Visit(const MapType& type, const std::string& path,
@@ -55,12 +48,12 @@ class NameVisitor {
5548
const std::string& short_path);
5649
Status Visit(const PrimitiveType& type, const std::string& path,
5750
const std::string& short_path);
51+
static std::string GetPath(const std::string& last_path, const std::string& field_name,
52+
bool case_sensitive = true);
5853

59-
int index = 0;
60-
bool case_sensitive;
61-
bool has_init;
62-
std::unordered_map<std::string, size_t> name_to_index;
63-
std::vector<std::reference_wrapper<const SchemaField>> full_schemafield;
54+
private:
55+
bool case_sensitive_;
56+
std::unordered_map<std::string, size_t>& name_to_id_;
6457
};
6558
Schema::Schema(std::vector<SchemaField> fields, std::optional<int32_t> schema_id)
6659
: StructType(std::move(fields)), schema_id_(schema_id) {}
@@ -84,16 +77,16 @@ Result<std::optional<std::reference_wrapper<const SchemaField>>> Schema::FindFie
8477
std::string_view name, bool case_sensitive) const {
8578
if (case_sensitive) {
8679
ICEBERG_RETURN_UNEXPECTED(InitNameToIndexMap());
87-
auto it = name_to_index_.find(std::string(name));
88-
if (it == name_to_index_.end()) return std::nullopt;
89-
return full_schemafield_[it->second];
80+
auto it = name_to_id_.find(std::string(name));
81+
if (it == name_to_id_.end()) return std::nullopt;
82+
return FindFieldById(it->second);
9083
}
9184
ICEBERG_RETURN_UNEXPECTED(InitLowerCaseNameToIndexMap());
9285
std::string lower_name(name);
9386
std::ranges::transform(lower_name, lower_name.begin(), ::tolower);
94-
auto it = lowercase_name_to_index_.find(lower_name);
95-
if (it == lowercase_name_to_index_.end()) return std::nullopt;
96-
return full_schemafield_[it->second];
87+
auto it = lowercase_name_to_id_.find(lower_name);
88+
if (it == lowercase_name_to_id_.end()) return std::nullopt;
89+
return FindFieldById(it->second);
9790
}
9891

9992
Result<std::optional<std::reference_wrapper<const SchemaField>>> Schema::FindFieldByName(
@@ -102,150 +95,134 @@ Result<std::optional<std::reference_wrapper<const SchemaField>>> Schema::FindFie
10295
}
10396

10497
Result<Status> Schema::InitIdToIndexMap() const {
105-
if (!id_to_index_.empty()) {
98+
if (!id_to_field_.empty()) {
10699
return {};
107100
}
108-
bool has_init = !full_schemafield_.empty();
109-
IdVisitor visitor(has_init);
101+
IdVisitor visitor(id_to_field_);
110102
ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*this, &visitor));
111-
id_to_index_ = std::move(visitor.id_to_index);
112-
if (!has_init) {
113-
full_schemafield_ = std::move(visitor.full_schemafield);
114-
}
115103
return {};
116104
}
117105

118106
Result<Status> Schema::InitNameToIndexMap() const {
119-
if (!name_to_index_.empty()) {
107+
if (!name_to_id_.empty()) {
120108
return {};
121109
}
122-
bool has_init = !full_schemafield_.empty();
123110
std::string path, short_path;
124-
NameVisitor visitor(true, has_init);
111+
NametoIdVisitor visitor(name_to_id_, true);
125112
ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*this, &visitor, path, short_path));
126-
name_to_index_ = std::move(visitor.name_to_index);
127-
if (!has_init) {
128-
full_schemafield_ = std::move(visitor.full_schemafield);
129-
}
130113
return {};
131114
}
132115

133116
Result<Status> Schema::InitLowerCaseNameToIndexMap() const {
134-
if (!lowercase_name_to_index_.empty()) {
117+
if (!lowercase_name_to_id_.empty()) {
135118
return {};
136119
}
137-
bool has_init = !full_schemafield_.empty();
138120
std::string path, short_path;
139-
NameVisitor visitor(false, has_init);
121+
NametoIdVisitor visitor(lowercase_name_to_id_, false);
140122
ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*this, &visitor, path, short_path));
141-
lowercase_name_to_index_ = std::move(visitor.name_to_index);
142-
if (!has_init) {
143-
full_schemafield_ = std::move(visitor.full_schemafield);
144-
}
145123
return {};
146124
}
147125

148126
Result<std::optional<std::reference_wrapper<const SchemaField>>> Schema::FindFieldById(
149127
int32_t field_id) const {
150128
ICEBERG_RETURN_UNEXPECTED(InitIdToIndexMap());
151-
auto it = id_to_index_.find(field_id);
152-
if (it == id_to_index_.end()) {
129+
auto it = id_to_field_.find(field_id);
130+
if (it == id_to_field_.end()) {
153131
return std::nullopt;
154132
}
155-
return full_schemafield_[it->second];
133+
return it->second;
156134
}
157135

158-
IdVisitor::IdVisitor(bool has_init_) : has_init(has_init_) {}
136+
IdVisitor::IdVisitor(
137+
std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>>& id_to_field)
138+
: id_to_field_(id_to_field) {}
159139

160140
Status IdVisitor::Visit(const Type& type) {
141+
if (type.is_nested()) {
142+
ICEBERG_RETURN_UNEXPECTED(VisitNestedType(type));
143+
}
144+
return {};
145+
}
146+
147+
Status IdVisitor::VisitNestedType(const Type& type) {
161148
const auto& nested = iceberg::internal::checked_cast<const NestedType&>(type);
162149
const auto& fields = nested.fields();
163150
for (const auto& field : fields) {
164-
id_to_index[field.field_id()] = index++;
165-
if (!has_init) {
166-
full_schemafield.emplace_back(field);
167-
}
168-
if (field.type()->is_nested()) {
169-
ICEBERG_RETURN_UNEXPECTED(Visit(*field.type()));
170-
}
151+
id_to_field_.emplace(field.field_id(), std::cref(field));
152+
ICEBERG_RETURN_UNEXPECTED(Visit(*field.type()));
171153
}
172154
return {};
173155
}
174156

175-
NameVisitor::NameVisitor(bool case_sensitive_, bool has_init_)
176-
: case_sensitive(case_sensitive_), has_init(has_init_) {}
157+
NametoIdVisitor::NametoIdVisitor(std::unordered_map<std::string, size_t>& name_to_id,
158+
bool case_sensitive)
159+
: name_to_id_(name_to_id), case_sensitive_(case_sensitive) {}
177160

178-
Status NameVisitor::Visit(const ListType& type, const std::string& path,
179-
const std::string& short_path) {
161+
Status NametoIdVisitor::Visit(const ListType& type, const std::string& path,
162+
const std::string& short_path) {
180163
const auto& field = type.fields()[0];
181-
std::string full_path =
182-
iceberg::GetPath(path, std::string(field.name()), case_sensitive);
164+
std::string full_path = GetPath(path, std::string(field.name()), case_sensitive_);
183165
std::string short_full_path;
184166
if (field.type()->type_id() == TypeId::kStruct) {
185167
short_full_path = short_path;
186168
} else {
187-
short_full_path =
188-
iceberg::GetPath(short_path, std::string(field.name()), case_sensitive);
189-
}
190-
name_to_index[full_path] = index++;
191-
if (!has_init) {
192-
full_schemafield.emplace_back(field);
193-
}
194-
name_to_index.emplace(short_full_path, index - 1);
195-
if (field.type()->is_nested()) {
196-
ICEBERG_RETURN_UNEXPECTED(
197-
VisitTypeInline(*field.type(), this, full_path, short_full_path));
169+
short_full_path = GetPath(short_path, std::string(field.name()), case_sensitive_);
198170
}
171+
name_to_id_[full_path] = field.field_id();
172+
name_to_id_.emplace(short_full_path, field.field_id());
173+
ICEBERG_RETURN_UNEXPECTED(
174+
VisitTypeInline(*field.type(), this, full_path, short_full_path));
199175
return {};
200176
}
201177

202-
Status NameVisitor::Visit(const MapType& type, const std::string& path,
203-
const std::string& short_path) {
178+
Status NametoIdVisitor::Visit(const MapType& type, const std::string& path,
179+
const std::string& short_path) {
204180
std::string full_path, short_full_path;
205-
for (const auto& field : type.fields()) {
206-
full_path = iceberg::GetPath(path, std::string(field.name()), case_sensitive);
181+
const auto& fields = type.fields();
182+
for (const auto& field : fields) {
183+
full_path = GetPath(path, std::string(field.name()), case_sensitive_);
207184
if (field.name() == MapType::kValueName &&
208185
field.type()->type_id() == TypeId::kStruct) {
209186
short_full_path = short_path;
210187
} else {
211-
short_full_path = iceberg::GetPath(path, std::string(field.name()), case_sensitive);
212-
}
213-
name_to_index[full_path] = index++;
214-
if (!has_init) {
215-
full_schemafield.emplace_back(field);
216-
}
217-
name_to_index.emplace(short_full_path, index - 1);
218-
if (field.type()->is_nested()) {
219-
ICEBERG_RETURN_UNEXPECTED(
220-
VisitTypeInline(*field.type(), this, full_path, short_full_path));
188+
short_full_path = GetPath(path, std::string(field.name()), case_sensitive_);
221189
}
190+
name_to_id_[full_path] = field.field_id();
191+
name_to_id_.emplace(short_full_path, field.field_id());
192+
ICEBERG_RETURN_UNEXPECTED(
193+
VisitTypeInline(*field.type(), this, full_path, short_full_path));
222194
}
223195
return {};
224196
}
225197

226-
Status NameVisitor::Visit(const StructType& type, const std::string& path,
227-
const std::string& short_path) {
198+
Status NametoIdVisitor::Visit(const StructType& type, const std::string& path,
199+
const std::string& short_path) {
228200
const auto& fields = type.fields();
229201
std::string full_path, short_full_path;
230202
for (const auto& field : fields) {
231-
full_path = iceberg::GetPath(path, std::string(field.name()), case_sensitive);
232-
short_full_path =
233-
iceberg::GetPath(short_path, std::string(field.name()), case_sensitive);
234-
name_to_index[full_path] = index++;
235-
if (!has_init) {
236-
full_schemafield.emplace_back(field);
237-
}
238-
name_to_index.emplace(short_full_path, index - 1);
239-
if (field.type()->is_nested()) {
240-
ICEBERG_RETURN_UNEXPECTED(
241-
VisitTypeInline(*field.type(), this, full_path, short_full_path));
242-
}
203+
full_path =
204+
NametoIdVisitor::GetPath(path, std::string(field.name()), case_sensitive_);
205+
short_full_path = GetPath(short_path, std::string(field.name()), case_sensitive_);
206+
name_to_id_[full_path] = field.field_id();
207+
name_to_id_.emplace(short_full_path, field.field_id());
208+
ICEBERG_RETURN_UNEXPECTED(
209+
VisitTypeInline(*field.type(), this, full_path, short_full_path));
243210
}
244211
return {};
245212
}
246213

247-
Status NameVisitor::Visit(const PrimitiveType& type, const std::string& path,
248-
const std::string& short_path) {
214+
Status NametoIdVisitor::Visit(const PrimitiveType& type, const std::string& path,
215+
const std::string& short_path) {
249216
return {};
250217
}
218+
219+
std::string NametoIdVisitor::GetPath(const std::string& last_path,
220+
const std::string& field_name, bool case_sensitive) {
221+
if (case_sensitive) {
222+
return last_path.empty() ? field_name : last_path + "." + field_name;
223+
}
224+
std::string lower_name(field_name);
225+
std::ranges::transform(lower_name, lower_name.begin(), ::tolower);
226+
return last_path.empty() ? lower_name : last_path + "." + lower_name;
227+
}
251228
} // namespace iceberg

src/iceberg/schema.h

Lines changed: 8 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@
2323
/// Schemas for Iceberg tables. This header contains the definition of Schema
2424
/// and any utility functions. See iceberg/type.h and iceberg/field.h as well.
2525

26-
#include <algorithm>
2726
#include <cstdint>
2827
#include <optional>
2928
#include <string>
@@ -32,7 +31,6 @@
3231
#include "iceberg/iceberg_export.h"
3332
#include "iceberg/schema_field.h"
3433
#include "iceberg/type.h"
35-
#include "iceberg/util/macros.h"
3634
#include "iceberg/util/visit_type.h"
3735

3836
namespace iceberg {
@@ -57,7 +55,7 @@ class ICEBERG_EXPORT Schema : public StructType {
5755

5856
[[nodiscard]] std::string ToString() const override;
5957

60-
///\brief Get thd SchemaField By Name
58+
///\brief Find thd SchemaField By field name
6159
///
6260
/// Short names for maps and lists are included for any name that does not conflict with
6361
/// a canonical name. For example, a list, 'l', of structs with field 'x' will produce
@@ -75,14 +73,13 @@ class ICEBERG_EXPORT Schema : public StructType {
7573

7674
friend bool operator==(const Schema& lhs, const Schema& rhs) { return lhs.Equals(rhs); }
7775

78-
/// Mapping from field id to index of `full_schemafield_`.
79-
mutable std::unordered_map<int, size_t> id_to_index_;
80-
/// Mapping from field name to index of `full_schemafield_`.
81-
mutable std::unordered_map<std::string, size_t> name_to_index_;
82-
/// Mapping from field lowercase_name(suppoert case_insensitive query) to index of
83-
/// `full_schemafield_`.
84-
mutable std::unordered_map<std::string, size_t> lowercase_name_to_index_;
85-
mutable std::vector<std::reference_wrapper<const SchemaField>> full_schemafield_;
76+
/// Mapping from field id to field.
77+
mutable std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>>
78+
id_to_field_;
79+
/// Mapping from field name to id of field.
80+
mutable std::unordered_map<std::string, size_t> name_to_id_;
81+
/// Mapping from field lowercase_name(suppoert case_insensitive query) to id of field
82+
mutable std::unordered_map<std::string, size_t> lowercase_name_to_id_;
8683

8784
private:
8885
/// \brief Compare two schemas for equality.

0 commit comments

Comments
 (0)