-
Notifications
You must be signed in to change notification settings - Fork 1.5k
GH-3286: Add support for Parquet-Protobuf in Parquet-cli #3287
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 4 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -85,6 +85,26 @@ | |
| <artifactId>parquet-avro</artifactId> | ||
| <version>${project.version}</version> | ||
| </dependency> | ||
| <dependency> | ||
| <groupId>org.apache.parquet</groupId> | ||
| <artifactId>parquet-protobuf</artifactId> | ||
| <version>${project.version}</version> | ||
| <classifier>tests</classifier> | ||
| <scope>test</scope> | ||
| </dependency> | ||
| <!-- Test-only Protobuf dependencies: CatCommandTest uses them (e.g. TestProtobuf) to write a Protobuf-backed Parquet test file --> | ||
| <dependency> | ||
| <groupId>com.google.protobuf</groupId> | ||
| <artifactId>protobuf-java</artifactId> | ||
| <version>3.25.6</version> | ||
| <scope>test</scope> | ||
| </dependency> | ||
| <dependency> | ||
| <groupId>org.apache.parquet</groupId> | ||
| <artifactId>parquet-protobuf</artifactId> | ||
| <version>${project.version}</version> | ||
| <scope>test</scope> | ||
| </dependency> | ||
|
Comment on lines
+97
to
+108
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. By adding these dependencies in test scope only, wouldn't that cause them to be missing at command-line execution?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. That's a great catch, thanks! Moved to ${deps.scope}, which can be enabled via the profile being used. |
||
| <dependency> | ||
| <groupId>org.apache.parquet</groupId> | ||
| <artifactId>parquet-format-structures</artifactId> | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -18,10 +18,14 @@ | |
| */ | ||
| package org.apache.parquet.cli.commands; | ||
|
|
||
| import com.google.protobuf.Message; | ||
| import java.io.File; | ||
| import java.io.IOException; | ||
| import java.util.Arrays; | ||
| import org.apache.hadoop.conf.Configuration; | ||
| import org.apache.hadoop.fs.Path; | ||
| import org.apache.parquet.proto.ProtoParquetWriter; | ||
| import org.apache.parquet.proto.test.TestProtobuf; | ||
| import org.junit.Assert; | ||
| import org.junit.Test; | ||
|
|
||
|
|
@@ -63,4 +67,42 @@ public void testCatCommandWithInvalidColumn() throws IOException { | |
| command.setConf(new Configuration()); | ||
| command.run(); | ||
| } | ||
|
|
||
| /** | ||
| * Writes a Protobuf-backed Parquet file into the temp folder and checks that | ||
| * {@code CatCommand.run()} returns 0 when given only the file path and a default | ||
| * {@link Configuration}. | ||
| */ | ||
| @Test | ||
| public void testCatCommandProtoParquetAutoDetected() throws Exception { | ||
| File parquetFile = new File(getTempFolder(), "proto_someevent.parquet"); | ||
| writeProtoParquet(parquetFile); | ||
|
|
||
| CatCommand catCommand = new CatCommand(createLogger(), 0); | ||
| catCommand.sourceFiles = Arrays.asList(parquetFile.getAbsolutePath()); | ||
| catCommand.setConf(new Configuration()); | ||
|
|
||
| // No Protobuf-specific option is set on the command; a zero return code is the only expectation. | ||
| Assert.assertEquals(0, catCommand.run()); | ||
| } | ||
|
|
||
| /** | ||
| * Runs {@code CatCommand} against a freshly written Protobuf-backed Parquet file and | ||
| * expects a return code of 0. | ||
| * | ||
| * NOTE(review): the steps here are the same as in | ||
| * {@code testCatCommandProtoParquetAutoDetected}; consider consolidating the two tests. | ||
| */ | ||
| @Test | ||
| public void testCatCommandProtoParquetSucceedsWithAutoDetection() throws Exception { | ||
| File parquetFile = new File(getTempFolder(), "proto_someevent.parquet"); | ||
| writeProtoParquet(parquetFile); | ||
|
|
||
| CatCommand catCommand = new CatCommand(createLogger(), 0); | ||
| catCommand.sourceFiles = Arrays.asList(parquetFile.getAbsolutePath()); | ||
| catCommand.setConf(new Configuration()); | ||
|
|
||
| int exitCode = catCommand.run(); | ||
| Assert.assertEquals(0, exitCode); | ||
| } | ||
|
|
||
| /** | ||
| * Writes one {@code TestProtobuf.RepeatedIntMessage} holding the repeated ints 1, 2 and 3 | ||
| * to {@code file} through a {@link ProtoParquetWriter}. | ||
| * | ||
| * @param file destination of the Parquet file; its absolute path is used | ||
| * @throws Exception if the writer cannot be created, written to, or closed | ||
| */ | ||
| private static void writeProtoParquet(File file) throws Exception { | ||
| TestProtobuf.RepeatedIntMessage message = TestProtobuf.RepeatedIntMessage.newBuilder() | ||
| .addRepeatedInt(1) | ||
| .addRepeatedInt(2) | ||
| .addRepeatedInt(3) | ||
| .build(); | ||
| Path target = new Path(file.getAbsolutePath()); | ||
|
|
||
| // try-with-resources closes the writer after the single record is written. | ||
| try (ProtoParquetWriter<Message> writer = | ||
| new ProtoParquetWriter<>(target, TestProtobuf.RepeatedIntMessage.class)) { | ||
| writer.write(message); | ||
| } | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think I was wrong with my previous comment. We do not actually use protobuf in prod, right? We check only for the related key in the footer and use the example binding to get the values. So `test` scope seems legit.
(If you want to keep this comment, maybe let it apply to all three dependencies, with some separation?)
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated. Yeah, I just noticed protobuf is only pulled in to write the test file.