Skip to content

Commit e0b35ef

Browse files
authored
refactor: move type visitor classes to type_util (#347)
1 parent 19b086e commit e0b35ef

File tree

8 files changed

+403
-327
lines changed

8 files changed

+403
-327
lines changed

src/iceberg/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ set(ICEBERG_SOURCES
7777
util/temporal_util.cc
7878
util/timepoint.cc
7979
util/truncate_util.cc
80+
util/type_util.cc
8081
util/uuid.cc)
8182

8283
set(ICEBERG_STATIC_BUILD_INTERFACE_LIBS)

src/iceberg/meson.build

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,7 @@ iceberg_sources = files(
9999
'util/temporal_util.cc',
100100
'util/timepoint.cc',
101101
'util/truncate_util.cc',
102+
'util/type_util.cc',
102103
'util/uuid.cc',
103104
)
104105

src/iceberg/partition_spec.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ bool PartitionSpec::Equals(const PartitionSpec& other) const {
108108
}
109109

110110
Status PartitionSpec::Validate(const Schema& schema, bool allow_missing_fields) const {
111-
std::unordered_map<int32_t, int32_t> parents = indexParents(schema);
111+
std::unordered_map<int32_t, int32_t> parents = IndexParents(schema);
112112
for (const auto& partition_field : fields_) {
113113
ICEBERG_ASSIGN_OR_RAISE(auto source_field,
114114
schema.FindFieldById(partition_field.source_id()));

src/iceberg/schema.cc

Lines changed: 1 addition & 292 deletions
Original file line numberDiff line numberDiff line change
@@ -27,93 +27,12 @@
2727
#include "iceberg/schema_internal.h"
2828
#include "iceberg/type.h"
2929
#include "iceberg/util/formatter.h" // IWYU pragma: keep
30-
#include "iceberg/util/formatter_internal.h"
3130
#include "iceberg/util/macros.h"
31+
#include "iceberg/util/type_util.h"
3232
#include "iceberg/util/visit_type.h"
3333

3434
namespace iceberg {
3535

36-
class IdToFieldVisitor {
37-
public:
38-
explicit IdToFieldVisitor(
39-
std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>>&
40-
id_to_field);
41-
Status Visit(const PrimitiveType& type);
42-
Status Visit(const NestedType& type);
43-
44-
private:
45-
std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>>& id_to_field_;
46-
};
47-
48-
class NameToIdVisitor {
49-
public:
50-
explicit NameToIdVisitor(
51-
std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>& name_to_id,
52-
bool case_sensitive = true,
53-
std::function<std::string(std::string_view)> quoting_func = {});
54-
Status Visit(const ListType& type, const std::string& path,
55-
const std::string& short_path);
56-
Status Visit(const MapType& type, const std::string& path,
57-
const std::string& short_path);
58-
Status Visit(const StructType& type, const std::string& path,
59-
const std::string& short_path);
60-
Status Visit(const PrimitiveType& type, const std::string& path,
61-
const std::string& short_path);
62-
void Finish();
63-
64-
private:
65-
std::string BuildPath(std::string_view prefix, std::string_view field_name,
66-
bool case_sensitive);
67-
68-
private:
69-
bool case_sensitive_;
70-
std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>& name_to_id_;
71-
std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>> short_name_to_id_;
72-
std::function<std::string(std::string_view)> quoting_func_;
73-
};
74-
75-
class PositionPathVisitor {
76-
public:
77-
Status Visit(const PrimitiveType& type) {
78-
if (current_field_id_ == kUnassignedFieldId) {
79-
return InvalidSchema("Current field id is not assigned, type: {}", type.ToString());
80-
}
81-
82-
if (auto ret = position_path_.try_emplace(current_field_id_, current_path_);
83-
!ret.second) {
84-
return InvalidSchema("Duplicate field id found: {}, prev path: {}, curr path: {}",
85-
current_field_id_, ret.first->second, current_path_);
86-
}
87-
88-
return {};
89-
}
90-
91-
Status Visit(const StructType& type) {
92-
for (size_t i = 0; i < type.fields().size(); ++i) {
93-
const auto& field = type.fields()[i];
94-
current_field_id_ = field.field_id();
95-
current_path_.push_back(i);
96-
ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*field.type(), this));
97-
current_path_.pop_back();
98-
}
99-
return {};
100-
}
101-
102-
// Non-struct types are not supported yet, but it is not an error.
103-
Status Visit(const ListType& type) { return {}; }
104-
Status Visit(const MapType& type) { return {}; }
105-
106-
std::unordered_map<int32_t, std::vector<size_t>> Finish() {
107-
return std::move(position_path_);
108-
}
109-
110-
private:
111-
constexpr static int32_t kUnassignedFieldId = -1;
112-
int32_t current_field_id_ = kUnassignedFieldId;
113-
std::vector<size_t> current_path_;
114-
std::unordered_map<int32_t, std::vector<size_t>> position_path_;
115-
};
116-
11736
Schema::Schema(std::vector<SchemaField> fields, std::optional<int32_t> schema_id)
11837
: StructType(std::move(fields)), schema_id_(schema_id) {}
11938

@@ -210,216 +129,6 @@ Result<std::unique_ptr<StructLikeAccessor>> Schema::GetAccessorById(
210129
return NotFound("Cannot get accessor for field id: {}", field_id);
211130
}
212131

213-
IdToFieldVisitor::IdToFieldVisitor(
214-
std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>>& id_to_field)
215-
: id_to_field_(id_to_field) {}
216-
217-
Status IdToFieldVisitor::Visit(const PrimitiveType& type) { return {}; }
218-
219-
Status IdToFieldVisitor::Visit(const NestedType& type) {
220-
const auto& nested = internal::checked_cast<const NestedType&>(type);
221-
const auto& fields = nested.fields();
222-
for (const auto& field : fields) {
223-
auto it = id_to_field_.try_emplace(field.field_id(), std::cref(field));
224-
if (!it.second) {
225-
return InvalidSchema("Duplicate field id found: {}", field.field_id());
226-
}
227-
ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*field.type(), this));
228-
}
229-
return {};
230-
}
231-
232-
NameToIdVisitor::NameToIdVisitor(
233-
std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>& name_to_id,
234-
bool case_sensitive, std::function<std::string(std::string_view)> quoting_func)
235-
: case_sensitive_(case_sensitive),
236-
name_to_id_(name_to_id),
237-
quoting_func_(std::move(quoting_func)) {}
238-
239-
Status NameToIdVisitor::Visit(const ListType& type, const std::string& path,
240-
const std::string& short_path) {
241-
const auto& field = type.fields()[0];
242-
std::string new_path = BuildPath(path, field.name(), case_sensitive_);
243-
std::string new_short_path;
244-
if (field.type()->type_id() == TypeId::kStruct) {
245-
new_short_path = short_path;
246-
} else {
247-
new_short_path = BuildPath(short_path, field.name(), case_sensitive_);
248-
}
249-
auto it = name_to_id_.try_emplace(new_path, field.field_id());
250-
if (!it.second) {
251-
return InvalidSchema("Duplicate path found: {}, prev id: {}, curr id: {}",
252-
it.first->first, it.first->second, field.field_id());
253-
}
254-
short_name_to_id_.try_emplace(new_short_path, field.field_id());
255-
ICEBERG_RETURN_UNEXPECTED(
256-
VisitTypeInline(*field.type(), this, new_path, new_short_path));
257-
return {};
258-
}
259-
260-
Status NameToIdVisitor::Visit(const MapType& type, const std::string& path,
261-
const std::string& short_path) {
262-
std::string new_path, new_short_path;
263-
const auto& fields = type.fields();
264-
for (const auto& field : fields) {
265-
new_path = BuildPath(path, field.name(), case_sensitive_);
266-
if (field.name() == MapType::kValueName &&
267-
field.type()->type_id() == TypeId::kStruct) {
268-
new_short_path = short_path;
269-
} else {
270-
new_short_path = BuildPath(short_path, field.name(), case_sensitive_);
271-
}
272-
auto it = name_to_id_.try_emplace(new_path, field.field_id());
273-
if (!it.second) {
274-
return InvalidSchema("Duplicate path found: {}, prev id: {}, curr id: {}",
275-
it.first->first, it.first->second, field.field_id());
276-
}
277-
short_name_to_id_.try_emplace(new_short_path, field.field_id());
278-
ICEBERG_RETURN_UNEXPECTED(
279-
VisitTypeInline(*field.type(), this, new_path, new_short_path));
280-
}
281-
return {};
282-
}
283-
284-
Status NameToIdVisitor::Visit(const StructType& type, const std::string& path,
285-
const std::string& short_path) {
286-
const auto& fields = type.fields();
287-
std::string new_path, new_short_path;
288-
for (const auto& field : fields) {
289-
new_path = BuildPath(path, field.name(), case_sensitive_);
290-
new_short_path = BuildPath(short_path, field.name(), case_sensitive_);
291-
auto it = name_to_id_.try_emplace(new_path, field.field_id());
292-
if (!it.second) {
293-
return InvalidSchema("Duplicate path found: {}, prev id: {}, curr id: {}",
294-
it.first->first, it.first->second, field.field_id());
295-
}
296-
short_name_to_id_.try_emplace(new_short_path, field.field_id());
297-
ICEBERG_RETURN_UNEXPECTED(
298-
VisitTypeInline(*field.type(), this, new_path, new_short_path));
299-
}
300-
return {};
301-
}
302-
303-
Status NameToIdVisitor::Visit(const PrimitiveType& type, const std::string& path,
304-
const std::string& short_path) {
305-
return {};
306-
}
307-
308-
std::string NameToIdVisitor::BuildPath(std::string_view prefix,
309-
std::string_view field_name, bool case_sensitive) {
310-
std::string quoted_name;
311-
if (!quoting_func_) {
312-
quoted_name = std::string(field_name);
313-
} else {
314-
quoted_name = quoting_func_(field_name);
315-
}
316-
if (case_sensitive) {
317-
return prefix.empty() ? quoted_name : std::string(prefix) + "." + quoted_name;
318-
}
319-
return prefix.empty() ? StringUtils::ToLower(quoted_name)
320-
: std::string(prefix) + "." + StringUtils::ToLower(quoted_name);
321-
}
322-
323-
void NameToIdVisitor::Finish() {
324-
for (auto&& it : short_name_to_id_) {
325-
name_to_id_.try_emplace(it.first, it.second);
326-
}
327-
}
328-
329-
/// \brief Visitor for pruning columns based on selected field IDs.
330-
///
331-
/// This visitor traverses a schema and creates a projected version containing only
332-
/// the specified fields. When `select_full_types` is true, a field with all its
333-
sub-fields is selected if its field-id has been selected; otherwise, only leaf
334-
/// fields of selected field-ids are selected.
335-
///
336-
/// \note It returns an error when projection is not successful.
337-
class PruneColumnVisitor {
338-
public:
339-
PruneColumnVisitor(const std::unordered_set<int32_t>& selected_ids,
340-
bool select_full_types)
341-
: selected_ids_(selected_ids), select_full_types_(select_full_types) {}
342-
343-
Result<std::shared_ptr<Type>> Visit(const std::shared_ptr<Type>& type) const {
344-
switch (type->type_id()) {
345-
case TypeId::kStruct:
346-
return Visit(internal::checked_pointer_cast<StructType>(type));
347-
case TypeId::kList:
348-
return Visit(internal::checked_pointer_cast<ListType>(type));
349-
case TypeId::kMap:
350-
return Visit(internal::checked_pointer_cast<MapType>(type));
351-
default:
352-
return nullptr;
353-
}
354-
}
355-
356-
Result<std::shared_ptr<Type>> Visit(const SchemaField& field) const {
357-
if (selected_ids_.contains(field.field_id())) {
358-
return (select_full_types_ || field.type()->is_primitive()) ? field.type()
359-
: Visit(field.type());
360-
}
361-
return Visit(field.type());
362-
}
363-
364-
static SchemaField MakeField(const SchemaField& field, std::shared_ptr<Type> type) {
365-
return {field.field_id(), std::string(field.name()), std::move(type),
366-
field.optional(), std::string(field.doc())};
367-
}
368-
369-
Result<std::shared_ptr<Type>> Visit(const std::shared_ptr<StructType>& type) const {
370-
bool same_types = true;
371-
std::vector<SchemaField> selected_fields;
372-
for (const auto& field : type->fields()) {
373-
ICEBERG_ASSIGN_OR_RAISE(auto child_type, Visit(field));
374-
if (child_type) {
375-
same_types = same_types && (child_type == field.type());
376-
selected_fields.emplace_back(MakeField(field, std::move(child_type)));
377-
}
378-
}
379-
380-
if (selected_fields.empty()) {
381-
return nullptr;
382-
} else if (same_types && selected_fields.size() == type->fields().size()) {
383-
return type;
384-
}
385-
return std::make_shared<StructType>(std::move(selected_fields));
386-
}
387-
388-
Result<std::shared_ptr<Type>> Visit(const std::shared_ptr<ListType>& type) const {
389-
const auto& elem_field = type->fields()[0];
390-
ICEBERG_ASSIGN_OR_RAISE(auto elem_type, Visit(elem_field));
391-
if (elem_type == nullptr) {
392-
return nullptr;
393-
} else if (elem_type == elem_field.type()) {
394-
return type;
395-
}
396-
return std::make_shared<ListType>(MakeField(elem_field, std::move(elem_type)));
397-
}
398-
399-
Result<std::shared_ptr<Type>> Visit(const std::shared_ptr<MapType>& type) const {
400-
const auto& key_field = type->fields()[0];
401-
const auto& value_field = type->fields()[1];
402-
ICEBERG_ASSIGN_OR_RAISE(auto key_type, Visit(key_field));
403-
ICEBERG_ASSIGN_OR_RAISE(auto value_type, Visit(value_field));
404-
405-
if (key_type == nullptr && value_type == nullptr) {
406-
return nullptr;
407-
} else if (value_type == value_field.type() &&
408-
(key_type == key_field.type() || key_type == nullptr)) {
409-
return type;
410-
} else if (value_type == nullptr) {
411-
return InvalidArgument("Cannot project Map without value field");
412-
}
413-
return std::make_shared<MapType>(
414-
(key_type == nullptr ? key_field : MakeField(key_field, std::move(key_type))),
415-
MakeField(value_field, std::move(value_type)));
416-
}
417-
418-
private:
419-
const std::unordered_set<int32_t>& selected_ids_;
420-
const bool select_full_types_;
421-
};
422-
423132
Result<std::unique_ptr<Schema>> Schema::Select(std::span<const std::string> names,
424133
bool case_sensitive) const {
425134
const std::string kAllColumns = "*";

src/iceberg/test/type_test.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -659,7 +659,7 @@ TEST(TypeTest, IndexParents) {
659659
points,
660660
});
661661

662-
std::unordered_map<int32_t, int32_t> parent_index = iceberg::indexParents(root_struct);
662+
std::unordered_map<int32_t, int32_t> parent_index = iceberg::IndexParents(root_struct);
663663

664664
// Verify top-level fields have no parent
665665
ASSERT_EQ(parent_index.find(1), parent_index.end());

src/iceberg/util/meson.build

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,19 +17,24 @@
1717

1818
install_headers(
1919
[
20+
'bucket_util.h',
2021
'checked_cast.h',
2122
'config.h',
23+
'conversions.h',
2224
'decimal.h',
2325
'endian.h',
2426
'formattable.h',
2527
'formatter.h',
2628
'int128.h',
29+
'lazy.h',
2730
'macros.h',
2831
'partition_value_util.h',
2932
'string_util.h',
33+
'temporal_util.h',
3034
'timepoint.h',
3135
'truncate_util.h',
3236
'type_util.h',
37+
'uuid.h',
3338
'visitor_generate.h',
3439
'visit_type.h',
3540
],

0 commit comments

Comments
 (0)