Skip to content

Commit a47d73b

Browse files
author
nullccxsy
committed
refactor: enhance PruneColumnVisitor for improved type handling and error reporting
- Updated the PruneColumnVisitor class to utilize shared pointers for type results, enhancing memory management. - Refined Visit methods for StructType, ListType, and MapType to improve clarity and error handling, particularly for cases involving invalid projections.
1 parent c6c1f38 commit a47d73b

File tree

3 files changed

+290
-201
lines changed

3 files changed

+290
-201
lines changed

src/iceberg/schema.cc

Lines changed: 105 additions & 131 deletions
Original file line numberDiff line numberDiff line change
@@ -266,192 +266,167 @@ void NameToIdVisitor::Finish() {
266266
/// - select_full_types=false: Recursively project nested fields within selected structs
267267
///
268268
/// \warning Error conditions that will cause projection to fail:
269-
/// - Attempting to explicitly project List or Map types (returns InvalidArgument)
270-
/// - Projecting a List when element result is null (returns InvalidArgument)
271-
/// - Projecting a Map without a defined map value type (returns InvalidArgument)
272-
/// - Projecting a struct when result is not StructType (returns InvalidArgument)
269+
/// - Project or Select a Map with just key or value (returns InvalidArgument)
273270
class PruneColumnVisitor {
274271
public:
275272
PruneColumnVisitor(const std::unordered_set<int32_t>& selected_ids,
276273
bool select_full_types)
277274
: selected_ids_(selected_ids), select_full_types_(select_full_types) {}
278275

279-
Status Visit(const StructType& type, std::unique_ptr<Type>* out) const {
280-
std::vector<std::shared_ptr<const Type>> selected_types;
281-
for (const auto& field : type.fields()) {
282-
std::unique_ptr<Type> child_result;
276+
Result<std::shared_ptr<Type>> Visit(const std::shared_ptr<Type>& type) const {
277+
switch (type->type_id()) {
278+
case TypeId::kStruct: {
279+
auto struct_type = std::static_pointer_cast<StructType>(type);
280+
return Visit(struct_type);
281+
}
282+
case TypeId::kList: {
283+
auto list_type = std::static_pointer_cast<ListType>(type);
284+
return Visit(list_type);
285+
}
286+
case TypeId::kMap: {
287+
auto map_type = std::static_pointer_cast<MapType>(type);
288+
return Visit(map_type);
289+
}
290+
default: {
291+
auto primitive_type = std::static_pointer_cast<PrimitiveType>(type);
292+
return Visit(primitive_type);
293+
}
294+
}
295+
}
296+
297+
Result<std::shared_ptr<Type>> Visit(const std::shared_ptr<StructType>& type) const {
298+
std::vector<std::shared_ptr<Type>> selected_types;
299+
for (const auto& field : type->fields()) {
283300
if (select_full_types_ and selected_ids_.contains(field.field_id())) {
284301
selected_types.emplace_back(field.type());
285302
continue;
286303
}
287-
ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*field.type(), this, &child_result));
304+
ICEBERG_ASSIGN_OR_RAISE(auto child_result, Visit(field.type()));
288305
if (selected_ids_.contains(field.field_id())) {
289-
// select
290-
if (field.type()->type_id() == TypeId::kStruct) {
291-
// project(kstruct)
292-
ICEBERG_RETURN_UNEXPECTED(ProjectStruct(field, &child_result));
293-
selected_types.emplace_back(std::move(child_result));
294-
} else {
295-
// project(list, map, primitive)
296-
if (!field.type()->is_primitive()) {
297-
return InvalidArgument(
298-
"Cannot explicitly project List or Map types, {}:{} of type {} was "
299-
"selected",
300-
field.field_id(), field.name(), field.type()->ToString());
301-
}
302-
selected_types.emplace_back(field.type());
303-
}
304-
} else if (child_result) {
305-
// project, select
306-
selected_types.emplace_back(std::move(child_result));
306+
selected_types.emplace_back(
307+
field.type()->is_primitive() ? field.type() : std::move(child_result));
307308
} else {
308-
// project, select
309-
selected_types.emplace_back(nullptr);
309+
selected_types.emplace_back(std::move(child_result));
310310
}
311311
}
312312

313313
bool same_types = true;
314314
std::vector<SchemaField> selected_fields;
315-
const auto& fields = type.fields();
315+
const auto& fields = type->fields();
316316
for (size_t i = 0; i < fields.size(); i++) {
317317
if (fields[i].type() == selected_types[i]) {
318-
selected_fields.emplace_back(fields[i]);
318+
selected_fields.emplace_back(std::move(fields[i]));
319319
} else if (selected_types[i]) {
320320
same_types = false;
321321
selected_fields.emplace_back(fields[i].field_id(), std::string(fields[i].name()),
322-
std::const_pointer_cast<Type>(selected_types[i]),
323-
fields[i].optional(), std::string(fields[i].doc()));
322+
std::move(selected_types[i]), fields[i].optional(),
323+
std::string(fields[i].doc()));
324324
}
325325
}
326326

327327
if (!selected_fields.empty()) {
328328
if (same_types && selected_fields.size() == fields.size()) {
329-
*out = std::make_unique<StructType>(type);
329+
return type;
330330
} else {
331-
*out = std::make_unique<StructType>(std::move(selected_fields));
331+
return std::make_shared<StructType>(std::move(selected_fields));
332332
}
333333
}
334334

335-
return {};
335+
return nullptr;
336336
}
337337

338-
Status Visit(const ListType& type, std::unique_ptr<Type>* out) const {
339-
const auto& element_field = type.fields()[0];
340-
338+
Result<std::shared_ptr<Type>> Visit(const std::shared_ptr<ListType>& type) const {
339+
const auto& element_field = type->fields()[0];
341340
if (select_full_types_ and selected_ids_.contains(element_field.field_id())) {
342-
*out = std::make_unique<ListType>(type);
343-
return {};
341+
return type;
344342
}
345343

346-
std::unique_ptr<Type> child_result;
347-
ICEBERG_RETURN_UNEXPECTED(
348-
VisitTypeInline(*element_field.type(), this, &child_result));
344+
ICEBERG_ASSIGN_OR_RAISE(auto child_result, Visit(element_field.type()));
349345

346+
std::shared_ptr<Type> out;
350347
if (selected_ids_.contains(element_field.field_id())) {
351-
if (element_field.type()->type_id() == TypeId::kStruct) {
352-
ICEBERG_RETURN_UNEXPECTED(ProjectStruct(element_field, &child_result));
353-
ICEBERG_RETURN_UNEXPECTED(
354-
ProjectList(element_field, std::move(child_result), out));
348+
if (element_field.type()->is_primitive()) {
349+
out = std::make_shared<ListType>(element_field);
355350
} else {
356-
if (!element_field.type()->is_primitive()) {
357-
return InvalidArgument(
358-
"Cannot explicitly project List or Map types, List element {} of type {} "
359-
"was "
360-
"selected",
361-
element_field.field_id(), element_field.name());
362-
}
363-
*out = std::make_unique<ListType>(element_field);
351+
ICEBERG_ASSIGN_OR_RAISE(out, ProjectList(element_field, std::move(child_result)));
364352
}
365353
} else if (child_result) {
366-
ICEBERG_RETURN_UNEXPECTED(ProjectList(element_field, std::move(child_result), out));
354+
ICEBERG_ASSIGN_OR_RAISE(out, ProjectList(element_field, std::move(child_result)));
367355
}
368-
369-
return {};
356+
return out;
370357
}
371358

372-
Status Visit(const MapType& type, std::unique_ptr<Type>* out) const {
373-
const auto& key_field = type.fields()[0];
374-
const auto& value_field = type.fields()[1];
359+
Result<std::shared_ptr<Type>> Visit(const std::shared_ptr<MapType>& type) const {
360+
const auto& key_field = type->fields()[0];
361+
const auto& value_field = type->fields()[1];
375362

376-
if (select_full_types_ and selected_ids_.contains(value_field.field_id())) {
377-
*out = std::make_unique<MapType>(type);
378-
return {};
363+
if (select_full_types_ and selected_ids_.contains(key_field.field_id()) and
364+
selected_ids_.contains(value_field.field_id())) {
365+
return type;
379366
}
380367

381-
std::unique_ptr<Type> key_result;
382-
ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*key_field.type(), this, &key_result));
368+
ICEBERG_ASSIGN_OR_RAISE(auto key_result, Visit(key_field.type()));
369+
ICEBERG_ASSIGN_OR_RAISE(auto value_result, Visit(value_field.type()));
383370

384-
std::unique_ptr<Type> value_result;
385-
ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*value_field.type(), this, &value_result));
371+
if (selected_ids_.contains(value_field.field_id()) and
372+
value_field.type()->is_primitive()) {
373+
value_result = value_field.type();
374+
}
375+
if (selected_ids_.contains(key_field.field_id()) and
376+
key_field.type()->is_primitive()) {
377+
key_result = key_field.type();
378+
}
386379

387-
if (selected_ids_.contains(value_field.field_id())) {
388-
if (value_field.type()->type_id() == TypeId::kStruct) {
389-
ICEBERG_RETURN_UNEXPECTED(ProjectStruct(value_field, &value_result));
390-
ICEBERG_RETURN_UNEXPECTED(
391-
ProjectMap(key_field, value_field, std::move(value_result), out));
392-
} else {
393-
if (!value_field.type()->is_primitive()) {
394-
return InvalidArgument(
395-
"Cannot explicitly project List or Map types, Map value {} of type {} was "
396-
"selected",
397-
value_field.field_id(), type.ToString());
398-
}
399-
*out = std::make_unique<MapType>(type);
400-
}
401-
} else if (value_result) {
402-
ICEBERG_RETURN_UNEXPECTED(
403-
ProjectMap(key_field, value_field, std::move(value_result), out));
404-
} else if (selected_ids_.contains(key_field.field_id())) {
405-
*out = std::make_unique<MapType>(type);
380+
if (!key_result && !value_result) {
381+
return nullptr;
406382
}
407383

408-
return {};
384+
if (!key_result || !value_result) {
385+
return InvalidArgument(
386+
"Cannot project Map with only key or value: key={}, value={}",
387+
key_result ? "present" : "null", value_result ? "present" : "null");
388+
}
389+
390+
ICEBERG_ASSIGN_OR_RAISE(auto out,
391+
ProjectMap(key_field, value_field, key_result, value_result));
392+
return out;
409393
}
410394

411-
Status Visit(const PrimitiveType& type, std::unique_ptr<Type>* result) const {
412-
return {};
395+
Result<std::shared_ptr<Type>> Visit(const std::shared_ptr<PrimitiveType>& type) const {
396+
return nullptr;
413397
}
414398

415-
Status ProjectList(const SchemaField& element_field, std::shared_ptr<Type> child_result,
416-
std::unique_ptr<Type>* out) const {
399+
Result<std::shared_ptr<Type>> ProjectList(const SchemaField& element_field,
400+
std::shared_ptr<Type> child_result) const {
417401
if (!child_result) {
418-
return InvalidArgument("Cannot project a list when the element result is null");
402+
return nullptr;
419403
}
420404
if (element_field.type() == child_result) {
421-
*out = std::make_unique<ListType>(element_field);
422-
} else {
423-
*out = std::make_unique<ListType>(element_field.field_id(), child_result,
424-
element_field.optional());
405+
return std::make_shared<ListType>(element_field);
425406
}
426-
return {};
407+
return std::make_shared<ListType>(element_field.field_id(), child_result,
408+
element_field.optional());
427409
}
428410

429-
Status ProjectMap(const SchemaField& key_field, const SchemaField& value_field,
430-
std::shared_ptr<Type> value_result,
431-
std::unique_ptr<Type>* out) const {
432-
if (!value_result) {
433-
return InvalidArgument(
434-
"Attempted to project a map without a defined map value type");
435-
}
436-
if (value_field.type() == value_result) {
437-
*out = std::make_unique<MapType>(key_field, value_field);
438-
} else {
439-
*out = std::make_unique<MapType>(
440-
key_field, SchemaField(value_field.field_id(), std::string(value_field.name()),
441-
value_result, value_field.optional()));
411+
Result<std::shared_ptr<Type>> ProjectMap(const SchemaField& key_field,
412+
const SchemaField& value_field,
413+
std::shared_ptr<Type> key_result,
414+
std::shared_ptr<Type> value_result) const {
415+
SchemaField projected_key_field = key_field;
416+
if (key_field.type() != key_result) {
417+
projected_key_field =
418+
SchemaField(key_field.field_id(), std::string(key_field.name()), key_result,
419+
key_field.optional());
442420
}
443-
return {};
444-
}
445421

446-
Status ProjectStruct(const SchemaField& field, std::unique_ptr<Type>* out) const {
447-
if (*out && (*out)->type_id() != TypeId::kStruct) {
448-
return InvalidArgument("Project struct {}:{}, but result is not StructType",
449-
field.name(), field.field_id());
422+
SchemaField projected_value_field = value_field;
423+
if (value_field.type() != value_result) {
424+
projected_value_field =
425+
SchemaField(value_field.field_id(), std::string(value_field.name()),
426+
value_result, value_field.optional());
450427
}
451-
if (*out == nullptr) {
452-
*out = std::make_unique<StructType>(std::vector<SchemaField>{});
453-
}
454-
return {};
428+
429+
return std::make_shared<MapType>(projected_key_field, projected_value_field);
455430
}
456431

457432
private:
@@ -484,9 +459,10 @@ Result<std::unique_ptr<const Schema>> Schema::SelectInternal(
484459
}
485460
}
486461

487-
std::unique_ptr<Type> result;
488462
PruneColumnVisitor visitor(selected_ids, /*select_full_types=*/true);
489-
ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*this, &visitor, &result));
463+
auto self = std::shared_ptr<const StructType>(this, [](const StructType*) {});
464+
ICEBERG_ASSIGN_OR_RAISE(auto result,
465+
visitor.Visit(std::const_pointer_cast<StructType>(self)));
490466

491467
if (!result) {
492468
return std::make_unique<Schema>(std::vector<SchemaField>{}, schema_id_);
@@ -504,9 +480,9 @@ Result<std::unique_ptr<const Schema>> Schema::SelectInternal(
504480
Result<std::unique_ptr<const Schema>> Schema::Project(
505481
std::unordered_set<int32_t>& field_ids) const {
506482
PruneColumnVisitor visitor(field_ids, /*select_full_types=*/false);
507-
508-
std::unique_ptr<Type> result;
509-
ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*this, &visitor, &result));
483+
auto self = std::shared_ptr<const StructType>(this, [](const StructType*) {});
484+
ICEBERG_ASSIGN_OR_RAISE(auto result,
485+
visitor.Visit(std::const_pointer_cast<StructType>(self)));
510486

511487
if (!result) {
512488
return std::make_unique<Schema>(std::vector<SchemaField>{}, schema_id_);
@@ -516,10 +492,8 @@ Result<std::unique_ptr<const Schema>> Schema::Project(
516492
return InvalidSchema("Projected type must be a struct type");
517493
}
518494

519-
const auto& projected_struct = internal::checked_cast<const StructType&>(*result);
520-
std::vector<SchemaField> fields_vec(projected_struct.fields().begin(),
521-
projected_struct.fields().end());
522-
return std::make_unique<Schema>(std::move(fields_vec), schema_id_);
495+
auto& projected_struct = internal::checked_cast<const StructType&>(*result);
496+
return FromStructType(std::move(const_cast<StructType&>(projected_struct)), schema_id_);
523497
}
524498

525499
} // namespace iceberg

src/iceberg/schema.h

Lines changed: 0 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -82,27 +82,6 @@ class ICEBERG_EXPORT Schema : public StructType {
8282
/// \param names Field names to select (supports nested field paths)
8383
/// \param case_sensitive Whether name matching is case-sensitive (default: true)
8484
/// \return Projected schema containing only the specified fields
85-
///
86-
/// \example
87-
/// \code
88-
/// // Original schema: struct {
89-
/// // id: int,
90-
/// // user: struct {
91-
/// // name: string,
92-
/// // address: struct { street: string, city: string }
93-
/// // }
94-
/// // }
95-
///
96-
/// // Select by names - you must specify the exact path
97-
/// auto result1 = schema->Select({"id", "user.name"});
98-
/// // Result: struct { id: int, user: struct { name: string } }
99-
///
100-
/// auto result2 = schema->Select({"user.address.street"});
101-
/// // Result: struct { user: struct { address: struct { street: string } } }
102-
///
103-
/// auto result3 = schema->Select({"user.name"});
104-
/// Result: struct { user: struct { name: string } }
105-
/// \endcode
10685
Result<std::unique_ptr<const Schema>> Select(std::span<const std::string> names,
10786
bool case_sensitive = true) const;
10887

@@ -133,32 +112,6 @@ class ICEBERG_EXPORT Schema : public StructType {
133112
/// - If nested field IDs are also in field_ids, they are recursively projected
134113
/// - If no nested field IDs are in field_ids, an empty struct is included
135114
/// - List/Map types cannot be explicitly projected (returns error)
136-
///
137-
/// \example
138-
/// \code
139-
/// // Original schema with field IDs:
140-
/// // struct {
141-
/// // 1: id: int,
142-
/// // 2: user: struct {
143-
/// // 3: name: string,
144-
/// // 4: address: struct { 5: street: string, 6: city: string }
145-
/// // }
146-
/// // }
147-
///
148-
/// // Project by IDs - recursive behavior for structs
149-
/// std::unordered_set<int32_t> ids1 = {1, 2, 3}; // id, user, user.name
150-
/// auto result1 = schema->Project(ids1);
151-
/// // Result: struct { id: int, user: struct { name: string } }
152-
///
153-
/// std::unordered_set<int32_t> ids2 = {2}; // Only user struct
154-
/// auto result2 = schema->Project(ids2);
155-
/// // Result: struct { user: struct {} } // Empty struct because no nested fields
156-
/// selected
157-
///
158-
/// std::unordered_set<int32_t> ids3 = {2, 5}; // user struct + street field
159-
/// auto result3 = schema->Project(ids3);
160-
/// // Result: struct { user: struct { address: struct { street: string } } }
161-
/// \endcode
162115
Result<std::unique_ptr<const Schema>> Project(
163116
std::unordered_set<int32_t>& field_ids) const;
164117

0 commit comments

Comments
 (0)