Skip to content

Commit 36083e1

Browse files
committed
Handle runtime Parquet Hadoop API version via reflection
1 parent c4905a8 commit 36083e1

File tree

20 files changed

+136
-138
lines changed

20 files changed

+136
-138
lines changed

PARQUET.md

Lines changed: 28 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -55,20 +55,20 @@ spark.read.parquet_metadata("/path/to/parquet").show()
5555

5656
The Dataframe provides the following per-file information:
5757

58-
|column |type |description |
59-
|:-----------------|:----:|:----------------------------------------------------------------------------|
60-
|filename |string|The Parquet file name |
61-
|blocks |int |Number of blocks / RowGroups in the Parquet file |
62-
|compressedBytes |long |Number of compressed bytes of all blocks |
63-
|uncompressedBytes |long |Number of uncompressed bytes of all blocks |
64-
|rows |long |Number of rows in the file |
65-
|columns |int |Number of columns in the file |
66-
|values |long |Number of values in the file |
67-
|nulls |long |Number of null values in the file |
68-
|createdBy |string|The createdBy string of the Parquet file, e.g. library used to write the file|
69-
|schema |string|The schema |
70-
|encryption |string|The encryption |
71-
|keyValues |string-to-string map|Key-value data of the file |
58+
|column |type | description |
59+
|:-----------------|:----:|:------------------------------------------------------------------------------|
60+
|filename |string| The Parquet file name |
61+
|blocks |int | Number of blocks / RowGroups in the Parquet file |
62+
|compressedBytes |long | Number of compressed bytes of all blocks |
63+
|uncompressedBytes |long | Number of uncompressed bytes of all blocks |
64+
|rows |long | Number of rows in the file |
65+
|columns |int | Number of columns in the file |
66+
|values |long | Number of values in the file |
67+
|nulls |long | Number of null values in the file |
68+
|createdBy |string| The createdBy string of the Parquet file, e.g. library used to write the file |
69+
|schema |string| The schema |
70+
|encryption |string| The encryption (requires org.apache.parquet:parquet-hadoop:1.12.4 or above) |
71+
|keyValues |string-to-string map| Key-value data of the file |
7272

7373
## Parquet file schema
7474

@@ -96,20 +96,20 @@ spark.read.parquet_schema("/path/to/parquet").show()
9696

9797
The Dataframe provides the following per-file information:
9898

99-
|column |type |description |
100-
|:-----------------|:----:|:-------------------------------------|
101-
|filename |string|The Parquet file name |
102-
|columnName |string|The column name |
103-
|columnPath |string array|The column path |
104-
|repetition |string|The repetition |
105-
|type |string|The data type |
106-
|length |int |The length of the type |
107-
|originalType |string|The original type |
108-
|isPrimitive |boolean|True if type is primitive |
109-
|primitiveType |string|The primitive type |
110-
|primitiveOrder |string|The order of the primitive type |
111-
|maxDefinitionLevel|int |The max definition level |
112-
|maxRepetitionLevel|int |The max repetition level |
99+
|column | type | description |
100+
|:-----------------|:------------:|:--------------------------------------------------------------------------------|
101+
|filename | string | The Parquet file name |
102+
|columnName | string | The column name |
103+
|columnPath | string array | The column path |
104+
|repetition | string | The repetition |
105+
|type | string | The data type |
106+
|length | int | The length of the type |
107+
|originalType | string | The original type (requires org.apache.parquet:parquet-hadoop:1.11.0 or above) |
108+
|isPrimitive | boolean | True if type is primitive |
109+
|primitiveType | string | The primitive type |
110+
|primitiveOrder | string | The order of the primitive type |
111+
|maxDefinitionLevel| int | The max definition level |
112+
|maxRepetitionLevel| int | The max repetition level |
113113

114114
## Parquet block / RowGroup metadata
115115

pom.xml

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,25 @@
6464
<artifactId>spark-sql_${scala.compat.version}</artifactId>
6565
<version>${spark.version}</version>
6666
<scope>provided</scope>
67+
<exclusions>
68+
<!-- we exclude parquet-hadoop and all its dependencies, as we depend on our own version -->
69+
<exclusion>
70+
<groupId>org.apache.parquet</groupId>
71+
<artifactId>*</artifactId>
72+
</exclusion>
73+
<exclusion>
74+
<groupId>io.airlift</groupId>
75+
<artifactId>aircompressor</artifactId>
76+
</exclusion>
77+
<exclusion>
78+
<groupId>org.xerial.snappy</groupId>
79+
<artifactId>snappy-java</artifactId>
80+
</exclusion>
81+
<exclusion>
82+
<groupId>org.slf4j</groupId>
83+
<artifactId>slf4j-api</artifactId>
84+
</exclusion>
85+
</exclusions>
6786
</dependency>
6887

6988
<dependency>
@@ -80,6 +99,28 @@
8099
<scope>provided</scope>
81100
</dependency>
82101

102+
<!-- our own version of parquet-hadoop, which is more recent than what earlier Spark versions depend on -->
103+
<dependency>
104+
<groupId>org.apache.parquet</groupId>
105+
<artifactId>parquet-hadoop</artifactId>
106+
<version>1.16.0</version>
107+
<exclusions>
108+
<exclusion>
109+
<groupId>commons-pool</groupId>
110+
<artifactId>commons-pool</artifactId>
111+
</exclusion>
112+
<exclusion>
113+
<groupId>javax.annotation</groupId>
114+
<artifactId>javax.annotation-api</artifactId>
115+
</exclusion>
116+
<exclusion>
117+
<groupId>com.github.luben</groupId>
118+
<artifactId>zstd-jni</artifactId>
119+
</exclusion>
120+
</exclusions>
121+
<scope>provided</scope>
122+
</dependency>
123+
83124
<dependency>
84125
<groupId>com.github.scopt</groupId>
85126
<artifactId>scopt_${scala.compat.version}</artifactId>

src/main/scala-spark-3.2/uk/co/gresearch/spark/parquet/BlockMetaDataUtil.scala

Lines changed: 0 additions & 23 deletions
This file was deleted.

src/main/scala-spark-3.2/uk/co/gresearch/spark/parquet/FileMetaDataUtil.scala

Lines changed: 0 additions & 1 deletion
This file was deleted.

src/main/scala-spark-3.2/uk/co/gresearch/spark/parquet/PrimitiveTypeUtil.scala

Lines changed: 0 additions & 24 deletions
This file was deleted.

src/main/scala-spark-3.3/uk/co/gresearch/spark/parquet/BlockMetaDataUtil.scala

Lines changed: 0 additions & 1 deletion
This file was deleted.

src/main/scala-spark-3.3/uk/co/gresearch/spark/parquet/FileMetaDataUtil.scala

Lines changed: 0 additions & 1 deletion
This file was deleted.

src/main/scala-spark-3.3/uk/co/gresearch/spark/parquet/PrimitiveTypeUtil.scala

Lines changed: 0 additions & 1 deletion
This file was deleted.

src/main/scala-spark-3.4/uk/co/gresearch/spark/parquet/BlockMetaDataUtil.scala

Lines changed: 0 additions & 1 deletion
This file was deleted.

src/main/scala-spark-3.4/uk/co/gresearch/spark/parquet/FileMetaDataUtil.scala

Lines changed: 0 additions & 23 deletions
This file was deleted.

0 commit comments

Comments
 (0)