-
Notifications
You must be signed in to change notification settings - Fork 1.5k
GH-3223: Implement Variant parquet writer #3221
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 5 commits
6ba82c7
c9d5692
883d60f
3a89bdc
783acdf
22f90f4
835daed
1ff12ec
4f984b2
0a27339
38570dc
e6f9b7f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -42,9 +42,12 @@ | |
| import org.apache.parquet.io.api.Binary; | ||
| import org.apache.parquet.io.api.RecordConsumer; | ||
| import org.apache.parquet.schema.GroupType; | ||
| import org.apache.parquet.schema.LogicalTypeAnnotation; | ||
| import org.apache.parquet.schema.LogicalTypeAnnotation.UUIDLogicalTypeAnnotation; | ||
| import org.apache.parquet.schema.MessageType; | ||
| import org.apache.parquet.schema.Type; | ||
| import org.apache.parquet.variant.Variant; | ||
| import org.apache.parquet.variant.VariantValueWriter; | ||
| import org.slf4j.Logger; | ||
| import org.slf4j.LoggerFactory; | ||
|
|
||
|
|
@@ -181,9 +184,58 @@ public void write(T record) { | |
| } | ||
|
|
||
| private void writeRecord(GroupType schema, Schema avroSchema, Object record) { | ||
| recordConsumer.startGroup(); | ||
| writeRecordFields(schema, avroSchema, record); | ||
| recordConsumer.endGroup(); | ||
| if (schema.getLogicalTypeAnnotation() instanceof LogicalTypeAnnotation.VariantLogicalTypeAnnotation) { | ||
| writeVariantFields(schema, avroSchema, record); | ||
| } else { | ||
| recordConsumer.startGroup(); | ||
| writeRecordFields(schema, avroSchema, record); | ||
| recordConsumer.endGroup(); | ||
| } | ||
| } | ||
|
|
||
| private void writeVariantFields(GroupType schema, Schema avroSchema, Object record) { | ||
| List<Type> fields = schema.getFields(); | ||
| List<Schema.Field> avroFields = avroSchema.getFields(); | ||
| boolean binarySchema = true; | ||
| ByteBuffer metadata = null; | ||
| ByteBuffer value = null; | ||
| // Extract the value and metadata binary. | ||
| for (int index = 0; index < avroFields.size(); index++) { | ||
| Schema.Field avroField = avroFields.get(index); | ||
| Schema fieldSchema = AvroSchemaConverter.getNonNull(avroField.schema()); | ||
| if (!fieldSchema.getType().equals(Schema.Type.BYTES)) { | ||
| binarySchema = false; | ||
| break; | ||
| } | ||
| Type fieldType = fields.get(index); | ||
| if (fieldType.getName() == "value") { | ||
| Object valueObj = model.getField(record, avroField.name(), index); | ||
| if (valueObj instanceof byte[]) { | ||
| value = ByteBuffer.wrap((byte[]) valueObj); | ||
| } else { | ||
| value = (ByteBuffer) valueObj; | ||
| } | ||
| } else if (fieldType.getName() == "metadata") { | ||
| Object metadataObj = model.getField(record, avroField.name(), index); | ||
| if (metadataObj instanceof byte[]) { | ||
|
||
| metadata = ByteBuffer.wrap((byte[]) metadataObj); | ||
| } else { | ||
| metadata = (ByteBuffer) metadataObj; | ||
| } | ||
| } else { | ||
| binarySchema = false; | ||
| break; | ||
| } | ||
| } | ||
|
|
||
| if (binarySchema) { | ||
| VariantValueWriter.write(recordConsumer, schema, new Variant(value, metadata)); | ||
| } else { | ||
| // If the schema was something other than value and metaadata, treat the value as a non-variant record. | ||
|
||
| recordConsumer.startGroup(); | ||
| writeRecordFields(schema, avroSchema, record); | ||
| recordConsumer.endGroup(); | ||
| } | ||
| } | ||
|
|
||
| private void writeRecordFields(GroupType schema, Schema avroSchema, Object record) { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -33,6 +33,7 @@ | |
| import org.apache.parquet.hadoop.ParquetReader; | ||
| import org.apache.parquet.hadoop.ParquetWriter; | ||
| import org.apache.parquet.hadoop.util.HadoopInputFile; | ||
| import org.apache.parquet.variant.Variant; | ||
| import org.junit.Assert; | ||
| import org.junit.rules.TemporaryFolder; | ||
|
|
||
|
|
@@ -129,4 +130,36 @@ public static Configuration conf(String name, boolean value) { | |
| conf.setBoolean(name, value); | ||
| return conf; | ||
| } | ||
|
|
||
| /** | ||
| * Assert that to Variant values are logically equivalent. | ||
| * E.g. object fields may be ordered differently in the binary. | ||
|
||
| */ | ||
| static void assertEquivalent(Variant expected, Variant actual) { | ||
| Assert.assertEquals(expected.getType(), actual.getType()); | ||
| switch (expected.getType()) { | ||
| case STRING: | ||
| // Short strings may use the compact or extended representation. | ||
| Assert.assertEquals(expected.getString(), actual.getString()); | ||
| break; | ||
| case ARRAY: | ||
| Assert.assertEquals(expected.numArrayElements(), actual.numArrayElements()); | ||
| for (int i = 0; i < expected.numArrayElements(); ++i) { | ||
| assertEquivalent(expected.getElementAtIndex(i), actual.getElementAtIndex(i)); | ||
| } | ||
| break; | ||
| case OBJECT: | ||
| Assert.assertEquals(expected.numObjectElements(), actual.numObjectElements()); | ||
| for (int i = 0; i < expected.numObjectElements(); ++i) { | ||
| Variant.ObjectField expectedField = expected.getFieldAtIndex(i); | ||
| Variant.ObjectField actualField = actual.getFieldAtIndex(i); | ||
| Assert.assertEquals(expectedField.key, actualField.key); | ||
| assertEquivalent(expectedField.value, actualField.value); | ||
| } | ||
| break; | ||
| default: | ||
| // All other types have a single representation, and must be bit-for-bit identical. | ||
| Assert.assertEquals(expected.getValueBuffer(), actual.getValueBuffer()); | ||
| } | ||
| } | ||
| } | ||
Uh oh!
There was an error while loading. Please reload this page.