G-Research
diff --git a/‎PARQUET.md‎
Lines changed: 28 additions & 28 deletions b/‎PARQUET.md‎
Lines changed: 28 additions & 28 deletions
diff --git a/‎pom.xml‎
Lines changed: 41 additions & 0 deletions b/‎pom.xml‎
Lines changed: 41 additions & 0 deletions
diff --git a/‎src/main/scala-spark-3.2/uk/co/gresearch/spark/parquet/BlockMetaDataUtil.scala‎
Lines changed: 0 additions & 23 deletions b/‎src/main/scala-spark-3.2/uk/co/gresearch/spark/parquet/BlockMetaDataUtil.scala‎
Lines changed: 0 additions & 23 deletions
diff --git a/‎src/main/scala-spark-3.2/uk/co/gresearch/spark/parquet/FileMetaDataUtil.scala‎
Lines changed: 0 additions & 1 deletion b/‎src/main/scala-spark-3.2/uk/co/gresearch/spark/parquet/FileMetaDataUtil.scala‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎src/main/scala-spark-3.2/uk/co/gresearch/spark/parquet/PrimitiveTypeUtil.scala‎
Lines changed: 0 additions & 24 deletions b/‎src/main/scala-spark-3.2/uk/co/gresearch/spark/parquet/PrimitiveTypeUtil.scala‎
Lines changed: 0 additions & 24 deletions
diff --git a/‎src/main/scala-spark-3.3/uk/co/gresearch/spark/parquet/BlockMetaDataUtil.scala‎
Lines changed: 0 additions & 1 deletion b/‎src/main/scala-spark-3.3/uk/co/gresearch/spark/parquet/BlockMetaDataUtil.scala‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎src/main/scala-spark-3.3/uk/co/gresearch/spark/parquet/FileMetaDataUtil.scala‎
Lines changed: 0 additions & 1 deletion b/‎src/main/scala-spark-3.3/uk/co/gresearch/spark/parquet/FileMetaDataUtil.scala‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎src/main/scala-spark-3.3/uk/co/gresearch/spark/parquet/PrimitiveTypeUtil.scala‎
Lines changed: 0 additions & 1 deletion b/‎src/main/scala-spark-3.3/uk/co/gresearch/spark/parquet/PrimitiveTypeUtil.scala‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎src/main/scala-spark-3.4/uk/co/gresearch/spark/parquet/BlockMetaDataUtil.scala‎
Lines changed: 0 additions & 1 deletion b/‎src/main/scala-spark-3.4/uk/co/gresearch/spark/parquet/BlockMetaDataUtil.scala‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎src/main/scala-spark-3.4/uk/co/gresearch/spark/parquet/FileMetaDataUtil.scala‎
Lines changed: 0 additions & 23 deletions b/‎src/main/scala-spark-3.4/uk/co/gresearch/spark/parquet/FileMetaDataUtil.scala‎
Lines changed: 0 additions & 23 deletions
@@ -55,20 +55,20 @@ spark.read.parquet_metadata("/path/to/parquet").show()
 
 The Dataframe provides the following per-file information:
 
-|column            |type  |description                                                                  |
-|:-----------------|:----:|:----------------------------------------------------------------------------|
-|filename          |string|The Parquet file name                                                        |
-|blocks            |int   |Number of blocks / RowGroups in the Parquet file                             |
-|compressedBytes   |long  |Number of compressed bytes of all blocks                                     |
-|uncompressedBytes |long  |Number of uncompressed bytes of all blocks                                   |
-|rows              |long  |Number of rows in the file                                                   |
-|columns           |int   |Number of columns in the file                                                |
-|values            |long  |Number of values in the file                                                 |
-|nulls             |long  |Number of null values in the file                                            |
-|createdBy         |string|The createdBy string of the Parquet file, e.g. library used to write the file|
-|schema            |string|The schema                                                                   |
-|encryption        |string|The encryption                                                               |
-|keyValues         |string-to-string map|Key-value data of the file                                     |
+|column            |type  | description                                                                   |
+|:-----------------|:----:|:------------------------------------------------------------------------------|
+|filename          |string| The Parquet file name                                                         |
+|blocks            |int   | Number of blocks / RowGroups in the Parquet file                              |
+|compressedBytes   |long  | Number of compressed bytes of all blocks                                      |
+|uncompressedBytes |long  | Number of uncompressed bytes of all blocks                                    |
+|rows              |long  | Number of rows in the file                                                    |
+|columns           |int   | Number of columns in the file                                                 |
+|values            |long  | Number of values in the file                                                  |
+|nulls             |long  | Number of null values in the file                                             |
+|createdBy         |string| The createdBy string of the Parquet file, e.g. library used to write the file |
+|schema            |string| The schema                                                                    |
+|encryption        |string| The encryption (requires org.apache.parquet:parquet-hadoop:1.12.4 and above)  |
+|keyValues         |string-to-string map| Key-value data of the file                                      |
 
 ## Parquet file schema
 
@@ -96,20 +96,20 @@ spark.read.parquet_schema("/path/to/parquet").show()
 
 The Dataframe provides the following per-file information:
 
-|column            |type  |description                           |
-|:-----------------|:----:|:-------------------------------------|
-|filename          |string|The Parquet file name                 |
-|columnName        |string|The column name                       |
-|columnPath        |string array|The column path                 |
-|repetition        |string|The repetition                        |
-|type              |string|The data type                         |
-|length            |int   |The length of the type                |
-|originalType      |string|The original type                     |
-|isPrimitive       |boolean|True if type is primitive            |
-|primitiveType     |string|The primitive type                    |
-|primitiveOrder    |string|The order of the primitive type       |
-|maxDefinitionLevel|int   |The max definition level              |
-|maxRepetitionLevel|int   |The max repetition level              |
+|column            |     type     | description                                                                     |
+|:-----------------|:------------:|:--------------------------------------------------------------------------------|
+|filename          |    string    | The Parquet file name                                                           |
+|columnName        |    string    | The column name                                                                 |
+|columnPath        | string array | The column path                                                                 |
+|repetition        |    string    | The repetition                                                                  |
+|type              |    string    | The data type                                                                   |
+|length            |     int      | The length of the type                                                          |
+|originalType      |    string    | The original type (requires org.apache.parquet:parquet-hadoop:1.11.0 and above) |
+|isPrimitive       |   boolean    | True if type is primitive                                                       |
+|primitiveType     |    string    | The primitive type                                                              |
+|primitiveOrder    |    string    | The order of the primitive type                                                 |
+|maxDefinitionLevel|     int      | The max definition level                                                        |
+|maxRepetitionLevel|     int      | The max repetition level                                                        |
 
 ## Parquet block / RowGroup metadata
 
 
@@ -64,6 +64,25 @@
       <artifactId>spark-sql_${scala.compat.version}</artifactId>
       <version>${spark.version}</version>
       <scope>provided</scope>
+      <exclusions>
+        <!-- we exclude parquet-hadoop and all its dependencies, as we depend on our own version -->
+        <exclusion>
+          <groupId>org.apache.parquet</groupId>
+          <artifactId>*</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>io.airlift</groupId>
+          <artifactId>aircompressor</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.xerial.snappy</groupId>
+          <artifactId>snappy-java</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.slf4j</groupId>
+          <artifactId>slf4j-api</artifactId>
+        </exclusion>
+      </exclusions>
     </dependency>
 
     <dependency>
@@ -80,6 +99,28 @@
       <scope>provided</scope>
     </dependency>
 
+    <!-- our own version of parquet-hadoop, which is more recent than what earlier Spark versions depend on -->
+    <dependency>
+      <groupId>org.apache.parquet</groupId>
+      <artifactId>parquet-hadoop</artifactId>
+      <version>1.16.0</version>
+      <exclusions>
+        <exclusion>
+          <groupId>commons-pool</groupId>
+          <artifactId>commons-pool</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>javax.annotation</groupId>
+          <artifactId>javax.annotation-api</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>com.github.luben</groupId>
+          <artifactId>zstd-jni</artifactId>
+        </exclusion>
+      </exclusions>
+      <scope>provided</scope>
+    </dependency>
+
     <dependency>
       <groupId>com.github.scopt</groupId>
       <artifactId>scopt_${scala.compat.version}</artifactId>