20 changes: 20 additions & 0 deletions parquet-cli/pom.xml
@@ -85,6 +85,26 @@
<artifactId>parquet-avro</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.parquet</groupId>
<artifactId>parquet-protobuf</artifactId>
<version>${project.version}</version>
<classifier>tests</classifier>
<scope>test</scope>
</dependency>
<!-- CatCommandTest (for TestProtobuf) -->
Contributor:
I think I was wrong with my previous comment. We do not actually use protobuf in prod, right? We check only for the related key in the footer and use the example binding to get the values. So test scope seems legit.
(If you want to keep this comment, maybe apply it to all three dependencies, with some separation?)

@ArnavBalyan (Member Author), Sep 5, 2025:

Updated. Yeah, just noticed protobuf is only pulled in to write the test file.

<dependency>
<groupId>com.google.protobuf</groupId>
<artifactId>protobuf-java</artifactId>
<version>3.25.6</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.parquet</groupId>
<artifactId>parquet-protobuf</artifactId>
<version>${project.version}</version>
<scope>test</scope>
</dependency>
Comment on lines +97 to +108
Contributor:
By adding these dependencies in test scope only, wouldn't they be missing at command-line execution?
There are two ways we can use the CLI: one bundle contains the "normal"-scoped dependencies for the Hadoop env, and the other includes the "provided"-scoped ones as well for standalone use. I don't think these deps will be added to either one.

@ArnavBalyan (Member Author), Sep 4, 2025:
That's a great catch, thanks! Moved to ${deps.scope}, which can be enabled via the profile being used.
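
For context, the ${deps.scope} pattern the reply refers to looks roughly like the sketch below. This is a hedged illustration, not the exact pom in this PR: the profile id and the default scope value are assumptions; only the <scope>${deps.scope}</scope> indirection comes from the discussion above.

    <!-- Sketch: property-driven scope, overridable per profile (names assumed) -->
    <!-- in <properties>: default build relies on the Hadoop env to provide the jars -->
    <properties>
      <deps.scope>provided</deps.scope>
    </properties>

    <!-- in <profiles>: a hypothetical standalone profile bundles the deps instead -->
    <profiles>
      <profile>
        <id>standalone</id>
        <properties>
          <deps.scope>compile</deps.scope>
        </properties>
      </profile>
    </profiles>

    <!-- in <dependencies>: the dependency picks up whichever scope is active -->
    <dependency>
      <groupId>org.apache.parquet</groupId>
      <artifactId>parquet-protobuf</artifactId>
      <version>${project.version}</version>
      <scope>${deps.scope}</scope>
    </dependency>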

<dependency>
<groupId>org.apache.parquet</groupId>
<artifactId>parquet-format-structures</artifactId>
51 changes: 51 additions & 0 deletions parquet-cli/src/main/java/org/apache/parquet/cli/BaseCommand.java
@@ -35,6 +35,7 @@
import java.security.AccessController;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import org.apache.avro.Schema;
import org.apache.avro.file.DataFileReader;
@@ -55,14 +56,56 @@
import org.apache.parquet.cli.util.GetClassLoader;
import org.apache.parquet.cli.util.Schemas;
import org.apache.parquet.cli.util.SeekableFSDataInputStream;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.example.GroupReadSupport;
import org.slf4j.Logger;

public abstract class BaseCommand implements Command, Configurable {

private static final String RESOURCE_URI_SCHEME = "resource";
private static final String STDIN_AS_SOURCE = "stdin";

/**
* Note for dev: for legacy reasons, parquet-cli used the Avro schema reader, which
* breaks for files generated through proto. This logic auto-detects such cases
* and routes the request to the simple (example) reader instead of Avro.
*/
private boolean isProtobufStyleSchema(String source) throws IOException {
try (ParquetFileReader reader = ParquetFileReader.open(getConf(), qualifiedPath(source))) {
Map<String, String> metadata = reader.getFooter().getFileMetaData().getKeyValueMetaData();
return metadata != null && metadata.containsKey("parquet.proto.class");
}
}

// Utility to expose a ParquetReader as a single-use Iterable (iterators share the underlying reader)
private static <T> Iterable<T> asIterable(final ParquetReader<T> reader) {
return () -> new Iterator<T>() {
private T next = advance();

private T advance() {
try {
return reader.read();
} catch (IOException e) {
throw new RuntimeException(e);
}
}

@Override
public boolean hasNext() {
return next != null;
}

@Override
public T next() {
T current = next;
next = advance();
return current;
}
};
}

protected final Logger console;

private Configuration conf = null;
@@ -320,6 +363,14 @@ protected <D> Iterable<D> openDataFile(final String source, Schema projection) t
Formats.Format format = Formats.detectFormat(open(source));
switch (format) {
case PARQUET:
boolean isProtobufStyle = isProtobufStyleSchema(source);
if (isProtobufStyle) {
final ParquetReader<Group> grp = ParquetReader.<Group>builder(
new GroupReadSupport(), qualifiedPath(source))
.withConf(getConf())
.build();
return (Iterable<D>) asIterable(grp);
}
Configuration conf = new Configuration(getConf());
// TODO: add these to the reader builder
AvroReadSupport.setRequestedProjection(conf, projection);
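Taken together, the two hunks above implement the flow discussed in the review thread: peek at the footer for the proto marker key, and fall back to the example (Group) binding when it is present. A minimal standalone sketch, assuming a local file path and using only the API calls that appear in the diff:

    import java.util.Map;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.parquet.example.data.Group;
    import org.apache.parquet.hadoop.ParquetFileReader;
    import org.apache.parquet.hadoop.ParquetReader;
    import org.apache.parquet.hadoop.example.GroupReadSupport;

    public class ProtoDetectSketch {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path(args[0]); // e.g. a file written by ProtoParquetWriter

        // Step 1: peek at the footer; proto-written files carry this key.
        boolean isProto;
        try (ParquetFileReader reader = ParquetFileReader.open(conf, path)) {
          Map<String, String> meta = reader.getFooter().getFileMetaData().getKeyValueMetaData();
          isProto = meta != null && meta.containsKey("parquet.proto.class");
        }

        // Step 2: if proto, read through the example (Group) binding instead of Avro.
        if (isProto) {
          try (ParquetReader<Group> groups =
              ParquetReader.<Group>builder(new GroupReadSupport(), path).withConf(conf).build()) {
            Group g;
            while ((g = groups.read()) != null) {
              System.out.println(g); // Group#toString prints fields line by line
            }
          }
        }
      }
    }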
parquet-cli/src/test/java/org/apache/parquet/cli/commands/CatCommandTest.java
@@ -18,10 +18,14 @@
*/
package org.apache.parquet.cli.commands;

import com.google.protobuf.Message;
import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.proto.ProtoParquetWriter;
import org.apache.parquet.proto.test.TestProtobuf;
import org.junit.Assert;
import org.junit.Test;

@@ -63,4 +67,42 @@ public void testCatCommandWithInvalidColumn() throws IOException {
command.setConf(new Configuration());
command.run();
}

@Test
public void testCatCommandProtoParquetAutoDetected() throws Exception {
File protoFile = new File(getTempFolder(), "proto_someevent.parquet");
writeProtoParquet(protoFile);

CatCommand cmd = new CatCommand(createLogger(), 0);
cmd.sourceFiles = Arrays.asList(protoFile.getAbsolutePath());
cmd.setConf(new Configuration());

int result = cmd.run();
Assert.assertEquals(0, result);
}

@Test
public void testCatCommandProtoParquetSucceedsWithAutoDetection() throws Exception {
Contributor:
Aren't the two tests the same? Maybe we should test the new config as well. (AFAIK the avro reader fails with INT96 values by default.)

Member Author:
Updated: added the test with the config; it seems the behaviour is the same for INT96 values. Added a test that sets the config and verified the simple reader is used on the other side. This can hopefully be better tested with the e2e test, where debug statements can be logged and verified.

File protoFile = new File(getTempFolder(), "proto_someevent.parquet");
writeProtoParquet(protoFile);

CatCommand cmd = new CatCommand(createLogger(), 0);
cmd.sourceFiles = Arrays.asList(protoFile.getAbsolutePath());
cmd.setConf(new Configuration());

int result = cmd.run();
Assert.assertEquals(0, result);
}

private static void writeProtoParquet(File file) throws Exception {
TestProtobuf.RepeatedIntMessage.Builder b = TestProtobuf.RepeatedIntMessage.newBuilder()
.addRepeatedInt(1)
.addRepeatedInt(2)
.addRepeatedInt(3);

try (ProtoParquetWriter<Message> w =
new ProtoParquetWriter<>(new Path(file.getAbsolutePath()), TestProtobuf.RepeatedIntMessage.class)) {
w.write(b.build());
}
}
}
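
The config-toggle test mentioned in the thread above is not shown in this excerpt. As a rough sketch of its shape, assuming the same test harness (getTempFolder, createLogger) and using parquet.cli.simple-reader as a purely hypothetical property name:

    // Sketch only: "parquet.cli.simple-reader" is a hypothetical key, not the
    // actual config added in this PR (the real key isn't visible in this excerpt).
    @Test
    public void testCatCommandProtoParquetWithExplicitConfig() throws Exception {
      File protoFile = new File(getTempFolder(), "proto_someevent.parquet");
      writeProtoParquet(protoFile);

      CatCommand cmd = new CatCommand(createLogger(), 0);
      cmd.sourceFiles = Arrays.asList(protoFile.getAbsolutePath());

      Configuration conf = new Configuration();
      conf.set("parquet.cli.simple-reader", "true"); // hypothetical toggle
      cmd.setConf(conf);

      Assert.assertEquals(0, cmd.run());
    }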