Skip to content

Commit ca21204

Browse files
wgtmacpitrou
andauthored
GH-47241: [C++][Parquet] Fix VariantExtensionType conversion (#47242)
### Rationale for this change The test case of VariantExtensionType is incomplete. We need to make sure its conversion is well covered. ### What changes are included in this PR? Fix all issues for VariantExtensionType conversion and add it to test. ### Are these changes tested? Yes. ### Are there any user-facing changes? No. * GitHub Issue: #47241 Lead-authored-by: Gang Wu <[email protected]> Co-authored-by: Antoine Pitrou <[email protected]> Signed-off-by: Antoine Pitrou <[email protected]>
1 parent f16b6ac commit ca21204

File tree

4 files changed

+56
-30
lines changed

4 files changed

+56
-30
lines changed

cpp/src/parquet/arrow/arrow_schema_test.cc

Lines changed: 43 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
#include "parquet/arrow/reader.h"
2626
#include "parquet/arrow/reader_internal.h"
2727
#include "parquet/arrow/schema.h"
28+
#include "parquet/arrow/variant_internal.h"
2829
#include "parquet/file_reader.h"
2930
#include "parquet/schema.h"
3031
#include "parquet/schema_internal.h"
@@ -941,46 +942,61 @@ TEST_F(TestConvertParquetSchema, ParquetVariant) {
941942
PrimitiveNode::Make("metadata", Repetition::REQUIRED, ParquetType::BYTE_ARRAY);
942943
auto value =
943944
PrimitiveNode::Make("value", Repetition::REQUIRED, ParquetType::BYTE_ARRAY);
944-
945-
auto variant =
946-
GroupNode::Make("variant_unshredded", Repetition::OPTIONAL, {metadata, value});
945+
auto variant = GroupNode::Make("variant_unshredded", Repetition::OPTIONAL,
946+
{metadata, value}, LogicalType::Variant());
947947
parquet_fields.push_back(variant);
948948

949-
{
950-
// Test converting from parquet schema to arrow schema.
951-
std::vector<std::shared_ptr<Field>> arrow_fields;
952-
auto arrow_metadata =
953-
::arrow::field("metadata", ::arrow::binary(), /*nullable=*/false);
954-
auto arrow_value = ::arrow::field("value", ::arrow::binary(), /*nullable=*/false);
955-
auto arrow_variant = ::arrow::struct_({arrow_metadata, arrow_value});
956-
arrow_fields.push_back(
957-
::arrow::field("variant_unshredded", arrow_variant, /*nullable=*/true));
958-
auto arrow_schema = ::arrow::schema(arrow_fields);
949+
// Arrow schema for unshredded variant struct.
950+
auto arrow_metadata = ::arrow::field("metadata", ::arrow::binary(), /*nullable=*/false);
951+
auto arrow_value = ::arrow::field("value", ::arrow::binary(), /*nullable=*/false);
952+
auto arrow_variant = ::arrow::struct_({arrow_metadata, arrow_value});
953+
auto variant_extension = std::make_shared<VariantExtensionType>(arrow_variant);
959954

955+
{
956+
// Parquet file does not contain Arrow schema.
957+
// By default, field should be treated as a normal struct in Arrow.
958+
auto arrow_schema =
959+
::arrow::schema({::arrow::field("variant_unshredded", arrow_variant)});
960960
ASSERT_OK(ConvertSchema(parquet_fields));
961-
ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema));
961+
ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema, /*check_metadata=*/true));
962962
}
963963

964-
{
965-
// Test converting from parquet schema to arrow schema even though
966-
// extensions are not enabled.
964+
for (bool register_extension : {true, false}) {
965+
::arrow::ExtensionTypeGuard guard(register_extension
966+
? ::arrow::DataTypeVector{variant_extension}
967+
: ::arrow::DataTypeVector{});
968+
969+
// Parquet file does not contain Arrow schema.
970+
// If Arrow extensions are enabled, field should be interpreted as Parquet Variant
971+
// extension type if registered.
972+
ArrowReaderProperties props;
973+
props.set_arrow_extensions_enabled(true);
974+
975+
auto arrow_schema = ::arrow::schema({::arrow::field(
976+
"variant_unshredded", register_extension ? variant_extension : arrow_variant)});
977+
978+
ASSERT_OK(ConvertSchema(parquet_fields, /*metadata=*/nullptr, props));
979+
ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema, /*check_metadata=*/true));
980+
}
981+
982+
for (bool register_extension : {true, false}) {
983+
::arrow::ExtensionTypeGuard guard(register_extension
984+
? ::arrow::DataTypeVector{variant_extension}
985+
: ::arrow::DataTypeVector{});
986+
987+
// Parquet file does contain Arrow schema.
988+
// Field should be interpreted as Parquet Variant extension, if registered,
989+
// even though extensions are not enabled.
967990
ArrowReaderProperties props;
968991
props.set_arrow_extensions_enabled(false);
969992

970-
// Test converting from parquet schema to arrow schema.
971-
std::vector<std::shared_ptr<Field>> arrow_fields;
972-
auto arrow_metadata =
973-
::arrow::field("metadata", ::arrow::binary(), /*nullable=*/false);
974-
auto arrow_value = ::arrow::field("value", ::arrow::binary(), /*nullable=*/false);
975-
auto arrow_variant = ::arrow::struct_({arrow_metadata, arrow_value});
976-
arrow_fields.push_back(
977-
::arrow::field("variant_unshredded", arrow_variant, /*nullable=*/true));
978-
auto arrow_schema = ::arrow::schema(arrow_fields);
993+
auto arrow_schema = ::arrow::schema({::arrow::field(
994+
"variant_unshredded", register_extension ? variant_extension : arrow_variant)});
979995

980996
std::shared_ptr<KeyValueMetadata> metadata;
981997
ASSERT_OK(ArrowSchemaToParquetMetadata(arrow_schema, metadata));
982998
ASSERT_OK(ConvertSchema(parquet_fields, metadata, props));
983-
CheckFlatSchema(arrow_schema, true /* check_metadata */);
999+
ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema, /*check_metadata=*/true));
9841000
}
9851001
}
9861002

cpp/src/parquet/arrow/schema.cc

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -591,6 +591,15 @@ Status GroupToStruct(const GroupNode& node, LevelInfo current_levels,
591591
arrow_fields.push_back(out->children[i].field);
592592
}
593593
auto struct_type = ::arrow::struct_(arrow_fields);
594+
if (ctx->properties.get_arrow_extensions_enabled() &&
595+
node.logical_type()->is_variant()) {
596+
auto extension_type = ::arrow::GetExtensionType("parquet.variant");
597+
if (extension_type) {
598+
ARROW_ASSIGN_OR_RAISE(
599+
struct_type,
600+
extension_type->Deserialize(std::move(struct_type), /*serialized_data=*/""));
601+
}
602+
}
594603
out->field = ::arrow::field(node.name(), struct_type, node.is_optional(),
595604
FieldIdMetadata(node.field_id()));
596605
out->level_info = current_levels;

cpp/src/parquet/schema_test.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1251,7 +1251,7 @@ TEST(TestLogicalTypeOperation, LogicalTypeProperties) {
12511251
{BSONLogicalType::Make(), false, true, true},
12521252
{UUIDLogicalType::Make(), false, true, true},
12531253
{Float16LogicalType::Make(), false, true, true},
1254-
{VariantLogicalType::Make(), false, true, true},
1254+
{VariantLogicalType::Make(), true, true, true},
12551255
{NoLogicalType::Make(), false, false, true},
12561256
};
12571257

cpp/src/parquet/types.cc

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -835,8 +835,9 @@ bool LogicalType::is_valid() const {
835835
}
836836
bool LogicalType::is_invalid() const { return !is_valid(); }
837837
bool LogicalType::is_nested() const {
838-
return (impl_->type() == LogicalType::Type::LIST) ||
839-
(impl_->type() == LogicalType::Type::MAP);
838+
return impl_->type() == LogicalType::Type::LIST ||
839+
impl_->type() == LogicalType::Type::MAP ||
840+
impl_->type() == LogicalType::Type::VARIANT;
840841
}
841842
bool LogicalType::is_nonnested() const { return !is_nested(); }
842843
bool LogicalType::is_serialized() const { return impl_->is_serialized(); }

0 commit comments

Comments
 (0)