|
25 | 25 | #include "parquet/arrow/reader.h" |
26 | 26 | #include "parquet/arrow/reader_internal.h" |
27 | 27 | #include "parquet/arrow/schema.h" |
| 28 | +#include "parquet/arrow/variant_internal.h" |
28 | 29 | #include "parquet/file_reader.h" |
29 | 30 | #include "parquet/schema.h" |
30 | 31 | #include "parquet/schema_internal.h" |
@@ -941,46 +942,61 @@ TEST_F(TestConvertParquetSchema, ParquetVariant) { |
941 | 942 | PrimitiveNode::Make("metadata", Repetition::REQUIRED, ParquetType::BYTE_ARRAY); |
942 | 943 | auto value = |
943 | 944 | PrimitiveNode::Make("value", Repetition::REQUIRED, ParquetType::BYTE_ARRAY); |
944 | | - |
945 | | - auto variant = |
946 | | - GroupNode::Make("variant_unshredded", Repetition::OPTIONAL, {metadata, value}); |
| 945 | + auto variant = GroupNode::Make("variant_unshredded", Repetition::OPTIONAL, |
| 946 | + {metadata, value}, LogicalType::Variant()); |
947 | 947 | parquet_fields.push_back(variant); |
948 | 948 |
|
949 | | - { |
950 | | - // Test converting from parquet schema to arrow schema. |
951 | | - std::vector<std::shared_ptr<Field>> arrow_fields; |
952 | | - auto arrow_metadata = |
953 | | - ::arrow::field("metadata", ::arrow::binary(), /*nullable=*/false); |
954 | | - auto arrow_value = ::arrow::field("value", ::arrow::binary(), /*nullable=*/false); |
955 | | - auto arrow_variant = ::arrow::struct_({arrow_metadata, arrow_value}); |
956 | | - arrow_fields.push_back( |
957 | | - ::arrow::field("variant_unshredded", arrow_variant, /*nullable=*/true)); |
958 | | - auto arrow_schema = ::arrow::schema(arrow_fields); |
| 949 | + // Arrow schema for unshredded variant struct. |
| 950 | + auto arrow_metadata = ::arrow::field("metadata", ::arrow::binary(), /*nullable=*/false); |
| 951 | + auto arrow_value = ::arrow::field("value", ::arrow::binary(), /*nullable=*/false); |
| 952 | + auto arrow_variant = ::arrow::struct_({arrow_metadata, arrow_value}); |
| 953 | + auto variant_extension = std::make_shared<VariantExtensionType>(arrow_variant); |
959 | 954 |
|
| 955 | + { |
| 956 | + // Parquet file does not contain Arrow schema. |
| 957 | + // By default, field should be treated as a normal struct in Arrow. |
| 958 | + auto arrow_schema = |
| 959 | + ::arrow::schema({::arrow::field("variant_unshredded", arrow_variant)}); |
960 | 960 | ASSERT_OK(ConvertSchema(parquet_fields)); |
961 | | - ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema)); |
| 961 | + ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema, /*check_metadata=*/true)); |
962 | 962 | } |
963 | 963 |
|
964 | | - { |
965 | | - // Test converting from parquet schema to arrow schema even though |
966 | | - // extensions are not enabled. |
| 964 | + for (bool register_extension : {true, false}) { |
| 965 | + ::arrow::ExtensionTypeGuard guard(register_extension |
| 966 | + ? ::arrow::DataTypeVector{variant_extension} |
| 967 | + : ::arrow::DataTypeVector{}); |
| 968 | + |
| 969 | + // Parquet file does not contain Arrow schema. |
| 970 | + // If Arrow extensions are enabled, field should be interpreted as Parquet Variant |
| 971 | + // extension type if registered. |
| 972 | + ArrowReaderProperties props; |
| 973 | + props.set_arrow_extensions_enabled(true); |
| 974 | + |
| 975 | + auto arrow_schema = ::arrow::schema({::arrow::field( |
| 976 | + "variant_unshredded", register_extension ? variant_extension : arrow_variant)}); |
| 977 | + |
| 978 | + ASSERT_OK(ConvertSchema(parquet_fields, /*metadata=*/nullptr, props)); |
| 979 | + ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema, /*check_metadata=*/true)); |
| 980 | + } |
| 981 | + |
| 982 | + for (bool register_extension : {true, false}) { |
| 983 | + ::arrow::ExtensionTypeGuard guard(register_extension |
| 984 | + ? ::arrow::DataTypeVector{variant_extension} |
| 985 | + : ::arrow::DataTypeVector{}); |
| 986 | + |
| 987 | + // Parquet file does contain Arrow schema. |
| 988 | + // Field should be interpreted as Parquet Variant extension, if registered, |
| 989 | + // even though extensions are not enabled. |
967 | 990 | ArrowReaderProperties props; |
968 | 991 | props.set_arrow_extensions_enabled(false); |
969 | 992 |
|
970 | | - // Test converting from parquet schema to arrow schema. |
971 | | - std::vector<std::shared_ptr<Field>> arrow_fields; |
972 | | - auto arrow_metadata = |
973 | | - ::arrow::field("metadata", ::arrow::binary(), /*nullable=*/false); |
974 | | - auto arrow_value = ::arrow::field("value", ::arrow::binary(), /*nullable=*/false); |
975 | | - auto arrow_variant = ::arrow::struct_({arrow_metadata, arrow_value}); |
976 | | - arrow_fields.push_back( |
977 | | - ::arrow::field("variant_unshredded", arrow_variant, /*nullable=*/true)); |
978 | | - auto arrow_schema = ::arrow::schema(arrow_fields); |
| 993 | + auto arrow_schema = ::arrow::schema({::arrow::field( |
| 994 | + "variant_unshredded", register_extension ? variant_extension : arrow_variant)}); |
979 | 995 |
|
980 | 996 | std::shared_ptr<KeyValueMetadata> metadata; |
981 | 997 | ASSERT_OK(ArrowSchemaToParquetMetadata(arrow_schema, metadata)); |
982 | 998 | ASSERT_OK(ConvertSchema(parquet_fields, metadata, props)); |
983 | | - CheckFlatSchema(arrow_schema, true /* check_metadata */); |
| 999 | + ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema, /*check_metadata=*/true)); |
984 | 1000 | } |
985 | 1001 | } |
986 | 1002 |
|
|
0 commit comments