Skip to content

Commit 88f5520

Browse files
nullccxsynullccxsy
andauthored
feat: add find field (by id and name) support to schema (apache#180)
1. add insensitive way to find schemafield(list, struct, map) 2. change the complexity of find name to O(1) 3. test insensitive way to find schemafield(list, struct, map) --------- Co-authored-by: nullccxsy <[email protected]>
1 parent 7595047 commit 88f5520

File tree

5 files changed

+676
-4
lines changed

5 files changed

+676
-4
lines changed

src/iceberg/schema.cc

Lines changed: 213 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,12 +20,54 @@
2020
#include "iceberg/schema.h"
2121

2222
#include <format>
23+
#include <functional>
2324

2425
#include "iceberg/type.h"
2526
#include "iceberg/util/formatter.h" // IWYU pragma: keep
27+
#include "iceberg/util/macros.h"
28+
#include "iceberg/util/visit_type.h"
2629

2730
namespace iceberg {
2831

32+
class IdToFieldVisitor {
33+
public:
34+
explicit IdToFieldVisitor(
35+
std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>>&
36+
id_to_field);
37+
Status Visit(const PrimitiveType& type);
38+
Status Visit(const NestedType& type);
39+
40+
private:
41+
std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>>& id_to_field_;
42+
};
43+
44+
class NameToIdVisitor {
45+
public:
46+
explicit NameToIdVisitor(
47+
std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>& name_to_id,
48+
bool case_sensitive = true,
49+
std::function<std::string(std::string_view)> quoting_func = {});
50+
Status Visit(const ListType& type, const std::string& path,
51+
const std::string& short_path);
52+
Status Visit(const MapType& type, const std::string& path,
53+
const std::string& short_path);
54+
Status Visit(const StructType& type, const std::string& path,
55+
const std::string& short_path);
56+
Status Visit(const PrimitiveType& type, const std::string& path,
57+
const std::string& short_path);
58+
void Finish();
59+
60+
private:
61+
std::string BuildPath(std::string_view prefix, std::string_view field_name,
62+
bool case_sensitive);
63+
64+
private:
65+
bool case_sensitive_;
66+
std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>& name_to_id_;
67+
std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>> short_name_to_id_;
68+
std::function<std::string(std::string_view)> quoting_func_;
69+
};
70+
2971
/// \brief Build a schema from its top-level fields and an optional schema id.
Schema::Schema(std::vector<SchemaField> fields, std::optional<int32_t> schema_id)
    : StructType(std::move(fields)),
      schema_id_(schema_id) {
}
3173

@@ -44,4 +86,175 @@ bool Schema::Equals(const Schema& other) const {
4486
return schema_id_ == other.schema_id_ && fields_ == other.fields_;
4587
}
4688

89+
/// \brief Resolve a field by name through the lazily built name -> id index.
///
/// \param name Name to look up.
/// \param case_sensitive When false, `name` is lower-cased and matched against the
///        lower-cased index.
/// \return The field, std::nullopt when no such name is indexed, or the error raised
///         while building an index.
Result<std::optional<std::reference_wrapper<const SchemaField>>> Schema::FindFieldByName(
    std::string_view name, bool case_sensitive) const {
  if (!case_sensitive) {
    ICEBERG_RETURN_UNEXPECTED(InitLowerCaseNameToIdMap());
    const auto entry = lowercase_name_to_id_.find(StringUtils::ToLower(name));
    if (entry != lowercase_name_to_id_.end()) {
      return FindFieldById(entry->second);
    }
    return std::nullopt;
  }

  ICEBERG_RETURN_UNEXPECTED(InitNameToIdMap());
  const auto entry = name_to_id_.find(name);
  if (entry != name_to_id_.end()) {
    return FindFieldById(entry->second);
  }
  return std::nullopt;
}
102+
103+
/// \brief Lazily build the field-id -> field index.
///
/// The index is staged in a local map and published to `id_to_field_` only after the
/// whole traversal succeeded. Writing directly into the member would leave a partially
/// filled map behind when the visitor fails (e.g. on a duplicate field id), and the
/// non-empty check below would then make every later call report success on top of an
/// incomplete index.
///
/// \return An empty Status on success, otherwise the error reported by the visitor.
Status Schema::InitIdToFieldMap() const {
  if (!id_to_field_.empty()) {
    return {};
  }
  decltype(id_to_field_) staged;
  IdToFieldVisitor visitor(staged);
  ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*this, &visitor));
  id_to_field_ = std::move(staged);
  return {};
}
111+
112+
/// \brief Lazily build the case-sensitive name -> field-id index.
///
/// The index is staged in a local map and published to `name_to_id_` only after the
/// traversal succeeded and the short names were merged. Writing directly into the
/// member would leave a partially filled map behind when the visitor fails (e.g. on a
/// duplicate path), and the non-empty check below would then make every later call
/// report success on top of an incomplete index.
///
/// \return An empty Status on success, otherwise the error reported by the visitor.
Status Schema::InitNameToIdMap() const {
  if (!name_to_id_.empty()) {
    return {};
  }
  decltype(name_to_id_) staged;
  NameToIdVisitor visitor(staged, /*case_sensitive=*/true);
  ICEBERG_RETURN_UNEXPECTED(
      VisitTypeInline(*this, &visitor, /*path=*/"", /*short_path=*/""));
  visitor.Finish();
  name_to_id_ = std::move(staged);
  return {};
}
122+
123+
/// \brief Lazily build the lower-cased name -> field-id index used for
/// case-insensitive lookups.
///
/// The index is staged in a local map and published to `lowercase_name_to_id_` only
/// after the traversal succeeded and the short names were merged. Writing directly into
/// the member would leave a partially filled map behind when the visitor fails (e.g.
/// two names that collide once lower-cased), and the non-empty check below would then
/// make every later call report success on top of an incomplete index.
///
/// \return An empty Status on success, otherwise the error reported by the visitor.
Status Schema::InitLowerCaseNameToIdMap() const {
  if (!lowercase_name_to_id_.empty()) {
    return {};
  }
  decltype(lowercase_name_to_id_) staged;
  NameToIdVisitor visitor(staged, /*case_sensitive=*/false);
  ICEBERG_RETURN_UNEXPECTED(
      VisitTypeInline(*this, &visitor, /*path=*/"", /*short_path=*/""));
  visitor.Finish();
  lowercase_name_to_id_ = std::move(staged);
  return {};
}
133+
134+
/// \brief Resolve a field by id through the lazily built id -> field index.
///
/// \return The field, std::nullopt when the id is not indexed, or the error raised
///         while building the index.
Result<std::optional<std::reference_wrapper<const SchemaField>>> Schema::FindFieldById(
    int32_t field_id) const {
  ICEBERG_RETURN_UNEXPECTED(InitIdToFieldMap());
  if (const auto entry = id_to_field_.find(field_id); entry != id_to_field_.end()) {
    return entry->second;
  }
  return std::nullopt;
}
143+
144+
IdToFieldVisitor::IdToFieldVisitor(
145+
std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>>& id_to_field)
146+
: id_to_field_(id_to_field) {}
147+
148+
/// Primitive types contribute nothing to the index; always succeeds.
Status IdToFieldVisitor::Visit(const PrimitiveType& /*type*/) {
  return {};
}
149+
150+
/// Registers every field of `type` under its field id, then descends into the field's
/// own type. Fails with InvalidSchema as soon as an id is seen a second time.
Status IdToFieldVisitor::Visit(const NestedType& type) {
  for (const auto& child : type.fields()) {
    const bool inserted =
        id_to_field_.try_emplace(child.field_id(), std::cref(child)).second;
    if (!inserted) {
      return InvalidSchema("Duplicate field id found: {}", child.field_id());
    }
    ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*child.type(), this));
  }
  return {};
}
162+
163+
/// Binds the visitor to the caller-owned destination map and stores the path options.
///
/// The initializer list follows the member declaration order (`case_sensitive_`, then
/// `name_to_id_`, then `quoting_func_`). Members are always initialized in declaration
/// order regardless of how the list is written, so keeping both in sync avoids
/// -Wreorder warnings and misleading code.
///
/// \param name_to_id Destination map populated while visiting.
/// \param case_sensitive When false, path components are lower-cased.
/// \param quoting_func Optional transformation applied to each field name.
NameToIdVisitor::NameToIdVisitor(
    std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>& name_to_id,
    bool case_sensitive, std::function<std::string(std::string_view)> quoting_func)
    : case_sensitive_(case_sensitive),
      name_to_id_(name_to_id),
      quoting_func_(std::move(quoting_func)) {}
169+
170+
/// Indexes the element field of a list and descends into the element type.
///
/// When the element is a struct, the element name is left out of the short path so
/// that the struct's children get short names directly under the list.
Status NameToIdVisitor::Visit(const ListType& type, const std::string& path,
                              const std::string& short_path) {
  const auto& element = type.fields()[0];
  const std::string full_path = BuildPath(path, element.name(), case_sensitive_);
  const bool element_is_struct = element.type()->type_id() == TypeId::kStruct;
  const std::string abbreviated =
      element_is_struct ? short_path
                        : BuildPath(short_path, element.name(), case_sensitive_);

  const auto [entry, inserted] = name_to_id_.try_emplace(full_path, element.field_id());
  if (!inserted) {
    return InvalidSchema("Duplicate path found: {}, prev id: {}, curr id: {}",
                         entry->first, entry->second, element.field_id());
  }
  // Short names never override an earlier entry.
  short_name_to_id_.try_emplace(abbreviated, element.field_id());

  ICEBERG_RETURN_UNEXPECTED(
      VisitTypeInline(*element.type(), this, full_path, abbreviated));
  return {};
}
190+
191+
/// Indexes the fields of a map and descends into each field's type.
///
/// For a struct-typed value field, the value name is left out of the short path so
/// that the struct's children get short names directly under the map.
Status NameToIdVisitor::Visit(const MapType& type, const std::string& path,
                              const std::string& short_path) {
  for (const auto& entry_field : type.fields()) {
    const std::string full_path = BuildPath(path, entry_field.name(), case_sensitive_);
    const bool is_struct_value = entry_field.name() == MapType::kValueName &&
                                 entry_field.type()->type_id() == TypeId::kStruct;
    const std::string abbreviated =
        is_struct_value ? short_path
                        : BuildPath(short_path, entry_field.name(), case_sensitive_);

    const auto [slot, inserted] =
        name_to_id_.try_emplace(full_path, entry_field.field_id());
    if (!inserted) {
      return InvalidSchema("Duplicate path found: {}, prev id: {}, curr id: {}",
                           slot->first, slot->second, entry_field.field_id());
    }
    // Short names never override an earlier entry.
    short_name_to_id_.try_emplace(abbreviated, entry_field.field_id());

    ICEBERG_RETURN_UNEXPECTED(
        VisitTypeInline(*entry_field.type(), this, full_path, abbreviated));
  }
  return {};
}
214+
215+
/// Indexes every field of a struct under both its full and its short path, then
/// descends into the field's type. Fails with InvalidSchema on a repeated full path.
Status NameToIdVisitor::Visit(const StructType& type, const std::string& path,
                              const std::string& short_path) {
  for (const auto& member : type.fields()) {
    const std::string full_path = BuildPath(path, member.name(), case_sensitive_);
    const std::string abbreviated =
        BuildPath(short_path, member.name(), case_sensitive_);

    const auto [slot, inserted] = name_to_id_.try_emplace(full_path, member.field_id());
    if (!inserted) {
      return InvalidSchema("Duplicate path found: {}, prev id: {}, curr id: {}",
                           slot->first, slot->second, member.field_id());
    }
    // Short names never override an earlier entry.
    short_name_to_id_.try_emplace(abbreviated, member.field_id());

    ICEBERG_RETURN_UNEXPECTED(
        VisitTypeInline(*member.type(), this, full_path, abbreviated));
  }
  return {};
}
233+
234+
/// Primitive types have no child fields to index; always succeeds.
Status NameToIdVisitor::Visit(const PrimitiveType& /*type*/,
                              const std::string& /*path*/,
                              const std::string& /*short_path*/) {
  return {};
}
238+
239+
std::string NameToIdVisitor::BuildPath(std::string_view prefix,
240+
std::string_view field_name, bool case_sensitive) {
241+
std::string quoted_name;
242+
if (!quoting_func_) {
243+
quoted_name = std::string(field_name);
244+
} else {
245+
quoted_name = quoting_func_(field_name);
246+
}
247+
if (case_sensitive) {
248+
return prefix.empty() ? quoted_name : std::string(prefix) + "." + quoted_name;
249+
}
250+
return prefix.empty() ? StringUtils::ToLower(quoted_name)
251+
: std::string(prefix) + "." + StringUtils::ToLower(quoted_name);
252+
}
253+
254+
void NameToIdVisitor::Finish() {
255+
for (auto&& it : short_name_to_id_) {
256+
name_to_id_.try_emplace(it.first, it.second);
257+
}
258+
}
259+
47260
} // namespace iceberg

src/iceberg/schema.h

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,10 @@
2929
#include <vector>
3030

3131
#include "iceberg/iceberg_export.h"
32+
#include "iceberg/result.h"
3233
#include "iceberg/schema_field.h"
3334
#include "iceberg/type.h"
35+
#include "iceberg/util/string_util.h"
3436

3537
namespace iceberg {
3638

@@ -54,13 +56,44 @@ class ICEBERG_EXPORT Schema : public StructType {
5456

5557
[[nodiscard]] std::string ToString() const override;
5658

59+
/// \brief Find the SchemaField by field name.
60+
///
61+
/// Short names for maps and lists are included for any name that does not conflict with
62+
/// a canonical name. For example, a list, 'l', of structs with field 'x' will produce
63+
/// short name 'l.x' in addition to canonical name 'l.element.x'. A map 'm', if its
64+
/// value includes a struct with field 'x', will produce short name 'm.x' in addition to
65+
/// canonical name 'm.value.x'
66+
/// FIXME: Currently only handles ASCII lowercase conversion; extend to support
67+
/// non-ASCII characters (e.g., using std::towlower or ICU)
68+
[[nodiscard]] Result<std::optional<std::reference_wrapper<const SchemaField>>>
69+
FindFieldByName(std::string_view name, bool case_sensitive = true) const;
70+
71+
/// \brief Find the SchemaField by field id.
72+
[[nodiscard]] Result<std::optional<std::reference_wrapper<const SchemaField>>>
73+
FindFieldById(int32_t field_id) const;
74+
5775
friend bool operator==(const Schema& lhs, const Schema& rhs) { return lhs.Equals(rhs); }
5876

5977
private:
6078
/// \brief Compare two schemas for equality.
6179
[[nodiscard]] bool Equals(const Schema& other) const;
6280

81+
// TODO(nullccxsy): Address potential concurrency issues in lazy initialization (e.g.,
82+
// use std::call_once)
83+
Status InitIdToFieldMap() const;
84+
Status InitNameToIdMap() const;
85+
Status InitLowerCaseNameToIdMap() const;
86+
6387
const std::optional<int32_t> schema_id_;
88+
/// Mapping from field id to field.
89+
mutable std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>>
90+
id_to_field_;
91+
/// Mapping from field name to field id.
92+
mutable std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>
93+
name_to_id_;
94+
/// Mapping from lowercased field name to field id
95+
mutable std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>
96+
lowercase_name_to_id_;
6497
};
6598

6699
} // namespace iceberg

src/iceberg/util/macros.h

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,13 @@
1919

2020
#pragma once
2121

22-
#define ICEBERG_RETURN_UNEXPECTED(result) \
23-
if (!result) [[unlikely]] { \
24-
return std::unexpected<Error>(result.error()); \
25-
}
22+
// Evaluates `result` exactly once and, when it holds an error, returns that error from
// the enclosing function as std::unexpected<Error>.
//
// The body is wrapped in do { ... } while (false) WITHOUT a trailing semicolon so that
// the invocation `ICEBERG_RETURN_UNEXPECTED(expr);` expands to exactly one statement.
// With a semicolon baked into the macro, `if (c) ICEBERG_RETURN_UNEXPECTED(x); else ...`
// would not compile because the extra empty statement detaches the `else`.
#define ICEBERG_RETURN_UNEXPECTED(result)                 \
  do {                                                    \
    auto&& result_name = (result);                        \
    if (!result_name) [[unlikely]] {                      \
      return std::unexpected<Error>(result_name.error()); \
    }                                                     \
  } while (false)
2629

2730
#define ICEBERG_ASSIGN_OR_RAISE_IMPL(result_name, lhs, rexpr) \
2831
auto&& result_name = (rexpr); \

src/iceberg/util/string_util.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,4 +46,17 @@ class ICEBERG_EXPORT StringUtils {
4646
}
4747
};
4848

49+
/// \brief Transparent hash function that supports std::string_view as lookup key
50+
///
51+
/// Enables std::unordered_map to directly accept std::string_view lookup keys
52+
/// without creating temporary std::string objects, using C++20's transparent lookup.
53+
struct ICEBERG_EXPORT StringHash {
54+
using hash_type = std::hash<std::string_view>;
55+
using is_transparent = void;
56+
57+
std::size_t operator()(std::string_view str) const { return hash_type{}(str); }
58+
std::size_t operator()(const char* str) const { return hash_type{}(str); }
59+
std::size_t operator()(const std::string& str) const { return hash_type{}(str); }
60+
};
61+
4962
} // namespace iceberg

0 commit comments

Comments
 (0)