|
27 | 27 | #include "iceberg/schema_internal.h" |
28 | 28 | #include "iceberg/type.h" |
29 | 29 | #include "iceberg/util/formatter.h" // IWYU pragma: keep |
30 | | -#include "iceberg/util/formatter_internal.h" |
31 | 30 | #include "iceberg/util/macros.h" |
| 31 | +#include "iceberg/util/type_util.h" |
32 | 32 | #include "iceberg/util/visit_type.h" |
33 | 33 |
|
34 | 34 | namespace iceberg { |
35 | 35 |
|
36 | | -class IdToFieldVisitor { |
37 | | - public: |
38 | | - explicit IdToFieldVisitor( |
39 | | - std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>>& |
40 | | - id_to_field); |
41 | | - Status Visit(const PrimitiveType& type); |
42 | | - Status Visit(const NestedType& type); |
43 | | - |
44 | | - private: |
45 | | - std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>>& id_to_field_; |
46 | | -}; |
47 | | - |
48 | | -class NameToIdVisitor { |
49 | | - public: |
50 | | - explicit NameToIdVisitor( |
51 | | - std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>& name_to_id, |
52 | | - bool case_sensitive = true, |
53 | | - std::function<std::string(std::string_view)> quoting_func = {}); |
54 | | - Status Visit(const ListType& type, const std::string& path, |
55 | | - const std::string& short_path); |
56 | | - Status Visit(const MapType& type, const std::string& path, |
57 | | - const std::string& short_path); |
58 | | - Status Visit(const StructType& type, const std::string& path, |
59 | | - const std::string& short_path); |
60 | | - Status Visit(const PrimitiveType& type, const std::string& path, |
61 | | - const std::string& short_path); |
62 | | - void Finish(); |
63 | | - |
64 | | - private: |
65 | | - std::string BuildPath(std::string_view prefix, std::string_view field_name, |
66 | | - bool case_sensitive); |
67 | | - |
68 | | - private: |
69 | | - bool case_sensitive_; |
70 | | - std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>& name_to_id_; |
71 | | - std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>> short_name_to_id_; |
72 | | - std::function<std::string(std::string_view)> quoting_func_; |
73 | | -}; |
74 | | - |
75 | | -class PositionPathVisitor { |
76 | | - public: |
77 | | - Status Visit(const PrimitiveType& type) { |
78 | | - if (current_field_id_ == kUnassignedFieldId) { |
79 | | - return InvalidSchema("Current field id is not assigned, type: {}", type.ToString()); |
80 | | - } |
81 | | - |
82 | | - if (auto ret = position_path_.try_emplace(current_field_id_, current_path_); |
83 | | - !ret.second) { |
84 | | - return InvalidSchema("Duplicate field id found: {}, prev path: {}, curr path: {}", |
85 | | - current_field_id_, ret.first->second, current_path_); |
86 | | - } |
87 | | - |
88 | | - return {}; |
89 | | - } |
90 | | - |
91 | | - Status Visit(const StructType& type) { |
92 | | - for (size_t i = 0; i < type.fields().size(); ++i) { |
93 | | - const auto& field = type.fields()[i]; |
94 | | - current_field_id_ = field.field_id(); |
95 | | - current_path_.push_back(i); |
96 | | - ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*field.type(), this)); |
97 | | - current_path_.pop_back(); |
98 | | - } |
99 | | - return {}; |
100 | | - } |
101 | | - |
102 | | - // Non-struct types are not supported yet, but it is not an error. |
103 | | - Status Visit(const ListType& type) { return {}; } |
104 | | - Status Visit(const MapType& type) { return {}; } |
105 | | - |
106 | | - std::unordered_map<int32_t, std::vector<size_t>> Finish() { |
107 | | - return std::move(position_path_); |
108 | | - } |
109 | | - |
110 | | - private: |
111 | | - constexpr static int32_t kUnassignedFieldId = -1; |
112 | | - int32_t current_field_id_ = kUnassignedFieldId; |
113 | | - std::vector<size_t> current_path_; |
114 | | - std::unordered_map<int32_t, std::vector<size_t>> position_path_; |
115 | | -}; |
116 | | - |
117 | 36 | Schema::Schema(std::vector<SchemaField> fields, std::optional<int32_t> schema_id) |
118 | 37 | : StructType(std::move(fields)), schema_id_(schema_id) {} |
119 | 38 |
|
@@ -210,216 +129,6 @@ Result<std::unique_ptr<StructLikeAccessor>> Schema::GetAccessorById( |
210 | 129 | return NotFound("Cannot get accessor for field id: {}", field_id); |
211 | 130 | } |
212 | 131 |
|
213 | | -IdToFieldVisitor::IdToFieldVisitor( |
214 | | - std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>>& id_to_field) |
215 | | - : id_to_field_(id_to_field) {} |
216 | | - |
217 | | -Status IdToFieldVisitor::Visit(const PrimitiveType& type) { return {}; } |
218 | | - |
219 | | -Status IdToFieldVisitor::Visit(const NestedType& type) { |
220 | | - const auto& nested = internal::checked_cast<const NestedType&>(type); |
221 | | - const auto& fields = nested.fields(); |
222 | | - for (const auto& field : fields) { |
223 | | - auto it = id_to_field_.try_emplace(field.field_id(), std::cref(field)); |
224 | | - if (!it.second) { |
225 | | - return InvalidSchema("Duplicate field id found: {}", field.field_id()); |
226 | | - } |
227 | | - ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*field.type(), this)); |
228 | | - } |
229 | | - return {}; |
230 | | -} |
231 | | - |
232 | | -NameToIdVisitor::NameToIdVisitor( |
233 | | - std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>& name_to_id, |
234 | | - bool case_sensitive, std::function<std::string(std::string_view)> quoting_func) |
235 | | - : case_sensitive_(case_sensitive), |
236 | | - name_to_id_(name_to_id), |
237 | | - quoting_func_(std::move(quoting_func)) {} |
238 | | - |
239 | | -Status NameToIdVisitor::Visit(const ListType& type, const std::string& path, |
240 | | - const std::string& short_path) { |
241 | | - const auto& field = type.fields()[0]; |
242 | | - std::string new_path = BuildPath(path, field.name(), case_sensitive_); |
243 | | - std::string new_short_path; |
244 | | - if (field.type()->type_id() == TypeId::kStruct) { |
245 | | - new_short_path = short_path; |
246 | | - } else { |
247 | | - new_short_path = BuildPath(short_path, field.name(), case_sensitive_); |
248 | | - } |
249 | | - auto it = name_to_id_.try_emplace(new_path, field.field_id()); |
250 | | - if (!it.second) { |
251 | | - return InvalidSchema("Duplicate path found: {}, prev id: {}, curr id: {}", |
252 | | - it.first->first, it.first->second, field.field_id()); |
253 | | - } |
254 | | - short_name_to_id_.try_emplace(new_short_path, field.field_id()); |
255 | | - ICEBERG_RETURN_UNEXPECTED( |
256 | | - VisitTypeInline(*field.type(), this, new_path, new_short_path)); |
257 | | - return {}; |
258 | | -} |
259 | | - |
260 | | -Status NameToIdVisitor::Visit(const MapType& type, const std::string& path, |
261 | | - const std::string& short_path) { |
262 | | - std::string new_path, new_short_path; |
263 | | - const auto& fields = type.fields(); |
264 | | - for (const auto& field : fields) { |
265 | | - new_path = BuildPath(path, field.name(), case_sensitive_); |
266 | | - if (field.name() == MapType::kValueName && |
267 | | - field.type()->type_id() == TypeId::kStruct) { |
268 | | - new_short_path = short_path; |
269 | | - } else { |
270 | | - new_short_path = BuildPath(short_path, field.name(), case_sensitive_); |
271 | | - } |
272 | | - auto it = name_to_id_.try_emplace(new_path, field.field_id()); |
273 | | - if (!it.second) { |
274 | | - return InvalidSchema("Duplicate path found: {}, prev id: {}, curr id: {}", |
275 | | - it.first->first, it.first->second, field.field_id()); |
276 | | - } |
277 | | - short_name_to_id_.try_emplace(new_short_path, field.field_id()); |
278 | | - ICEBERG_RETURN_UNEXPECTED( |
279 | | - VisitTypeInline(*field.type(), this, new_path, new_short_path)); |
280 | | - } |
281 | | - return {}; |
282 | | -} |
283 | | - |
284 | | -Status NameToIdVisitor::Visit(const StructType& type, const std::string& path, |
285 | | - const std::string& short_path) { |
286 | | - const auto& fields = type.fields(); |
287 | | - std::string new_path, new_short_path; |
288 | | - for (const auto& field : fields) { |
289 | | - new_path = BuildPath(path, field.name(), case_sensitive_); |
290 | | - new_short_path = BuildPath(short_path, field.name(), case_sensitive_); |
291 | | - auto it = name_to_id_.try_emplace(new_path, field.field_id()); |
292 | | - if (!it.second) { |
293 | | - return InvalidSchema("Duplicate path found: {}, prev id: {}, curr id: {}", |
294 | | - it.first->first, it.first->second, field.field_id()); |
295 | | - } |
296 | | - short_name_to_id_.try_emplace(new_short_path, field.field_id()); |
297 | | - ICEBERG_RETURN_UNEXPECTED( |
298 | | - VisitTypeInline(*field.type(), this, new_path, new_short_path)); |
299 | | - } |
300 | | - return {}; |
301 | | -} |
302 | | - |
303 | | -Status NameToIdVisitor::Visit(const PrimitiveType& type, const std::string& path, |
304 | | - const std::string& short_path) { |
305 | | - return {}; |
306 | | -} |
307 | | - |
308 | | -std::string NameToIdVisitor::BuildPath(std::string_view prefix, |
309 | | - std::string_view field_name, bool case_sensitive) { |
310 | | - std::string quoted_name; |
311 | | - if (!quoting_func_) { |
312 | | - quoted_name = std::string(field_name); |
313 | | - } else { |
314 | | - quoted_name = quoting_func_(field_name); |
315 | | - } |
316 | | - if (case_sensitive) { |
317 | | - return prefix.empty() ? quoted_name : std::string(prefix) + "." + quoted_name; |
318 | | - } |
319 | | - return prefix.empty() ? StringUtils::ToLower(quoted_name) |
320 | | - : std::string(prefix) + "." + StringUtils::ToLower(quoted_name); |
321 | | -} |
322 | | - |
323 | | -void NameToIdVisitor::Finish() { |
324 | | - for (auto&& it : short_name_to_id_) { |
325 | | - name_to_id_.try_emplace(it.first, it.second); |
326 | | - } |
327 | | -} |
328 | | - |
329 | | -/// \brief Visitor for pruning columns based on selected field IDs. |
330 | | -/// |
331 | | -/// This visitor traverses a schema and creates a projected version containing only |
332 | | -/// the specified fields. When `select_full_types` is true, a field and all of its |
333 | | -/// sub-fields are selected if its field-id has been selected; otherwise, only leaf |
334 | | -/// fields of selected field-ids are selected. |
335 | | -/// |
336 | | -/// \note It returns an error when projection is not successful. |
337 | | -class PruneColumnVisitor { |
338 | | - public: |
339 | | - PruneColumnVisitor(const std::unordered_set<int32_t>& selected_ids, |
340 | | - bool select_full_types) |
341 | | - : selected_ids_(selected_ids), select_full_types_(select_full_types) {} |
342 | | - |
343 | | - Result<std::shared_ptr<Type>> Visit(const std::shared_ptr<Type>& type) const { |
344 | | - switch (type->type_id()) { |
345 | | - case TypeId::kStruct: |
346 | | - return Visit(internal::checked_pointer_cast<StructType>(type)); |
347 | | - case TypeId::kList: |
348 | | - return Visit(internal::checked_pointer_cast<ListType>(type)); |
349 | | - case TypeId::kMap: |
350 | | - return Visit(internal::checked_pointer_cast<MapType>(type)); |
351 | | - default: |
352 | | - return nullptr; |
353 | | - } |
354 | | - } |
355 | | - |
356 | | - Result<std::shared_ptr<Type>> Visit(const SchemaField& field) const { |
357 | | - if (selected_ids_.contains(field.field_id())) { |
358 | | - return (select_full_types_ || field.type()->is_primitive()) ? field.type() |
359 | | - : Visit(field.type()); |
360 | | - } |
361 | | - return Visit(field.type()); |
362 | | - } |
363 | | - |
364 | | - static SchemaField MakeField(const SchemaField& field, std::shared_ptr<Type> type) { |
365 | | - return {field.field_id(), std::string(field.name()), std::move(type), |
366 | | - field.optional(), std::string(field.doc())}; |
367 | | - } |
368 | | - |
369 | | - Result<std::shared_ptr<Type>> Visit(const std::shared_ptr<StructType>& type) const { |
370 | | - bool same_types = true; |
371 | | - std::vector<SchemaField> selected_fields; |
372 | | - for (const auto& field : type->fields()) { |
373 | | - ICEBERG_ASSIGN_OR_RAISE(auto child_type, Visit(field)); |
374 | | - if (child_type) { |
375 | | - same_types = same_types && (child_type == field.type()); |
376 | | - selected_fields.emplace_back(MakeField(field, std::move(child_type))); |
377 | | - } |
378 | | - } |
379 | | - |
380 | | - if (selected_fields.empty()) { |
381 | | - return nullptr; |
382 | | - } else if (same_types && selected_fields.size() == type->fields().size()) { |
383 | | - return type; |
384 | | - } |
385 | | - return std::make_shared<StructType>(std::move(selected_fields)); |
386 | | - } |
387 | | - |
388 | | - Result<std::shared_ptr<Type>> Visit(const std::shared_ptr<ListType>& type) const { |
389 | | - const auto& elem_field = type->fields()[0]; |
390 | | - ICEBERG_ASSIGN_OR_RAISE(auto elem_type, Visit(elem_field)); |
391 | | - if (elem_type == nullptr) { |
392 | | - return nullptr; |
393 | | - } else if (elem_type == elem_field.type()) { |
394 | | - return type; |
395 | | - } |
396 | | - return std::make_shared<ListType>(MakeField(elem_field, std::move(elem_type))); |
397 | | - } |
398 | | - |
399 | | - Result<std::shared_ptr<Type>> Visit(const std::shared_ptr<MapType>& type) const { |
400 | | - const auto& key_field = type->fields()[0]; |
401 | | - const auto& value_field = type->fields()[1]; |
402 | | - ICEBERG_ASSIGN_OR_RAISE(auto key_type, Visit(key_field)); |
403 | | - ICEBERG_ASSIGN_OR_RAISE(auto value_type, Visit(value_field)); |
404 | | - |
405 | | - if (key_type == nullptr && value_type == nullptr) { |
406 | | - return nullptr; |
407 | | - } else if (value_type == value_field.type() && |
408 | | - (key_type == key_field.type() || key_type == nullptr)) { |
409 | | - return type; |
410 | | - } else if (value_type == nullptr) { |
411 | | - return InvalidArgument("Cannot project Map without value field"); |
412 | | - } |
413 | | - return std::make_shared<MapType>( |
414 | | - (key_type == nullptr ? key_field : MakeField(key_field, std::move(key_type))), |
415 | | - MakeField(value_field, std::move(value_type))); |
416 | | - } |
417 | | - |
418 | | - private: |
419 | | - const std::unordered_set<int32_t>& selected_ids_; |
420 | | - const bool select_full_types_; |
421 | | -}; |
422 | | - |
423 | 132 | Result<std::unique_ptr<Schema>> Schema::Select(std::span<const std::string> names, |
424 | 133 | bool case_sensitive) const { |
425 | 134 | const std::string kAllColumns = "*"; |
|
0 commit comments