Skip to content

Commit 913d67c

Browse files
authored
Support parquet encryption (#324)
Supports reading metadata of encrypted parquet with plaintext and encrypted footer.
1 parent 0a446f2 commit 913d67c

File tree

13 files changed

+453
-95
lines changed

13 files changed

+453
-95
lines changed

.github/actions/build-whl/action.yml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,14 @@ runs:
5353
java-version: ${{ inputs.java-compat-version }}
5454
distribution: 'zulu'
5555

56+
- name: Fetch Release Test Dependencies
57+
run: |
58+
# Fetch Release Test Dependencies
59+
echo "::group::mvn dependency:get"
60+
mvn dependency:get -Dtransitive=false -Dartifact=org.apache.parquet:parquet-hadoop:1.16.0:jar:tests
61+
echo "::endgroup::"
62+
shell: bash
63+
5664
- name: Setup Python
5765
uses: actions/setup-python@v5
5866
with:

.github/actions/test-release/action.yml

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -113,13 +113,21 @@ runs:
113113
echo "::endgroup::"
114114
shell: bash
115115

116+
- name: Fetch Release Test Dependencies
117+
run: |
118+
# Fetch Release Test Dependencies
119+
echo "::group::mvn dependency:get"
120+
mvn dependency:get -Dtransitive=false -Dartifact=org.apache.parquet:parquet-hadoop:1.16.0:jar:tests
121+
echo "::endgroup::"
122+
shell: bash
123+
116124
- name: Scala Release Test
117125
env:
118126
SPARK_HOME: ${{ env.SPARK_BIN_HOME }}
119127
run: |
120128
# Scala Release Test
121129
echo "::group::spark-shell"
122-
$SPARK_BIN_HOME/bin/spark-shell --packages uk.co.gresearch.spark:spark-extension_${{ inputs.scala-compat-version }}:$SPARK_EXTENSION_VERSION < test-release.scala
130+
$SPARK_BIN_HOME/bin/spark-shell --packages uk.co.gresearch.spark:spark-extension_${{ inputs.scala-compat-version }}:$SPARK_EXTENSION_VERSION --jars ~/.m2/repository/org/apache/parquet/parquet-hadoop/1.16.0/parquet-hadoop-1.16.0-tests.jar < test-release.scala
123131
echo
124132
echo "::endgroup::"
125133
shell: bash
@@ -142,6 +150,13 @@ runs:
142150
echo "::endgroup::"
143151
shell: bash
144152

153+
- name: Fetch Whl Artifact
154+
if: inputs.python-version != ''
155+
uses: actions/download-artifact@v4
156+
with:
157+
name: Whl (Spark ${{ inputs.spark-compat-version }} Scala ${{ inputs.scala-compat-version }})
158+
path: .
159+
145160
- name: Install Python dependencies
146161
if: inputs.python-version != ''
147162
run: |
@@ -150,7 +165,7 @@ runs:
150165
python -m venv .pytest-venv
151166
.pytest-venv/bin/python -m pip install --upgrade pip
152167
.pytest-venv/bin/pip install pypandoc
153-
.pytest-venv/bin/pip install -e python/[test]
168+
.pytest-venv/bin/pip install $(ls pyspark_extension-*.whl)[test]
154169
echo "::endgroup::"
155170
156171
PYSPARK_HOME=$(.pytest-venv/bin/python -c "import os; import pyspark; print(os.path.dirname(pyspark.__file__))")

PARQUET.md

Lines changed: 51 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -55,20 +55,20 @@ spark.read.parquet_metadata("/path/to/parquet").show()
5555

5656
The Dataframe provides the following per-file information:
5757

58-
|column |type | description |
59-
|:-----------------|:----:|:------------------------------------------------------------------------------|
60-
|filename |string| The Parquet file name |
61-
|blocks |int | Number of blocks / RowGroups in the Parquet file |
62-
|compressedBytes |long | Number of compressed bytes of all blocks |
63-
|uncompressedBytes |long | Number of uncompressed bytes of all blocks |
64-
|rows |long | Number of rows in the file |
65-
|columns |int | Number of columns in the file |
66-
|values |long | Number of values in the file |
67-
|nulls |long | Number of null values in the file |
68-
|createdBy |string| The createdBy string of the Parquet file, e.g. library used to write the file |
69-
|schema |string| The schema |
70-
|encryption |string| The encryption (requires org.apache.parquet:parquet-hadoop:1.12.4 and above) |
71-
|keyValues |string-to-string map| Key-value data of the file |
58+
|column |type | description |
59+
|:-----------------|:----:|:-------------------------------------------------------------------------------|
60+
|filename |string| The Parquet file name |
61+
|blocks |int | Number of blocks / RowGroups in the Parquet file |
62+
|compressedBytes |long | Number of compressed bytes of all blocks |
63+
|uncompressedBytes |long | Number of uncompressed bytes of all blocks |
64+
|rows |long | Number of rows in the file |
65+
|columns |int | Number of columns in the file |
66+
|values |long | Number of values in the file |
67+
|nulls |long | Number of null values in the file |
68+
|createdBy |string| The createdBy string of the Parquet file, e.g. library used to write the file |
69+
|schema |string| The schema |
70+
|encryption |string| The encryption (requires `org.apache.parquet:parquet-hadoop:1.12.4` and above) |
71+
|keyValues |string-to-string map| Key-value data of the file |
7272

7373
## Parquet file schema
7474

@@ -96,20 +96,20 @@ spark.read.parquet_schema("/path/to/parquet").show()
9696

9797
The Dataframe provides the following per-file information:
9898

99-
|column | type | description |
100-
|:-----------------|:------------:|:--------------------------------------------------------------------------------|
101-
|filename | string | The Parquet file name |
102-
|columnName | string | The column name |
103-
|columnPath | string array | The column path |
104-
|repetition | string | The repetition |
105-
|type | string | The data type |
106-
|length | int | The length of the type |
107-
|originalType | string | The original type (requires org.apache.parquet:parquet-hadoop:1.11.0 and above) |
108-
|isPrimitive | boolean | True if type is primitive |
109-
|primitiveType | string | The primitive type |
110-
|primitiveOrder | string | The order of the primitive type |
111-
|maxDefinitionLevel| int | The max definition level |
112-
|maxRepetitionLevel| int | The max repetition level |
99+
|column | type | description |
100+
|:-----------------|:------------:|:----------------------------------------------------------------------------------|
101+
|filename | string | The Parquet file name |
102+
|columnName | string | The column name |
103+
|columnPath | string array | The column path |
104+
|repetition | string | The repetition |
105+
|type | string | The data type |
106+
|length | int | The length of the type |
107+
|originalType | string | The original type (requires `org.apache.parquet:parquet-hadoop:1.11.0` and above) |
108+
|isPrimitive | boolean | True if type is primitive |
109+
|primitiveType | string | The primitive type |
110+
|primitiveOrder | string | The order of the primitive type |
111+
|maxDefinitionLevel| int | The max definition level |
112+
|maxRepetitionLevel| int | The max repetition level |
113113

114114
## Parquet block / RowGroup metadata
115115

@@ -170,21 +170,22 @@ spark.read.parquet_block_columns("/path/to/parquet").show()
170170
+-------------+-----+------+------+-------------------+-------------------+--------------------+------------------+-----------+---------------+-----------------+------+-----+
171171
```
172172

173-
|column |type |description |
174-
|:-----------------|:-----------:|:-----------------------------------------------------|
175-
|filename |string |The Parquet file name |
176-
|block |int |Block / RowGroup number starting at 1 |
177-
|column |array<string>|Block / RowGroup column name |
178-
|codec |string |The coded used to compress the block column values |
179-
|type |string |The data type of the block column |
180-
|encodings |array<string>|Encodings of the block column |
181-
|minValue |string |Minimum value of this column in this block |
182-
|maxValue |string |Maximum value of this column in this block |
183-
|columnStart |long |Start position of the block column in the Parquet file|
184-
|compressedBytes |long |Number of compressed bytes of this block column |
185-
|uncompressedBytes |long |Number of uncompressed bytes of this block column |
186-
|values |long |Number of values in this block column |
187-
|nulls |long |Number of null values in this block column |
173+
| column | type | description |
174+
|:------------------|:-------------:|:--------------------------------------------------------------------------------------------------|
175+
| filename | string | The Parquet file name |
176+
| block | int | Block / RowGroup number starting at 1 |
177+
| column | array<string> | Block / RowGroup column name |
178+
| codec             | string        | The codec used to compress the block column values                                                  |
179+
| type | string | The data type of the block column |
180+
| encodings | array<string> | Encodings of the block column |
181+
| isEncrypted | boolean | Whether block column is encrypted (requires `org.apache.parquet:parquet-hadoop:1.12.3` and above) |
182+
| minValue | string | Minimum value of this column in this block |
183+
| maxValue | string | Maximum value of this column in this block |
184+
| columnStart | long | Start position of the block column in the Parquet file |
185+
| compressedBytes | long | Number of compressed bytes of this block column |
186+
| uncompressedBytes | long | Number of uncompressed bytes of this block column |
187+
| values | long | Number of values in this block column |
188+
| nulls | long | Number of null values in this block column |
188189

189190
## Parquet partition metadata
190191

@@ -255,6 +256,13 @@ spark.read.parquet_block_columns("/path/to/parquet", parallelism=100)
255256
spark.read.parquet_partitions("/path/to/parquet", parallelism=100)
256257
```
257258

259+
## Encryption
260+
261+
Reading [encrypted Parquet is supported](https://spark.apache.org/docs/latest/sql-data-sources-parquet.html#columnar-encryption).
262+
Files encrypted with [plaintext footer](https://github.com/apache/parquet-format/blob/master/Encryption.md#55-plaintext-footer-mode)
263+
can be read without any encryption keys, while encrypted Parquet metadata are then shown as `NULL` values in the result Dataframe.
264+
Encrypted Parquet files with an encrypted footer require the footer encryption key only. No column encryption keys are needed.
265+
258266
## Known Issues
259267

260268
Note that this feature is not supported in Python when connected with a [Spark Connect server](README.md#spark-connect-server).

pom.xml

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,20 @@
157157
<version>1.0.0-M2</version>
158158
<scope>test</scope>
159159
</dependency>
160+
161+
<dependency>
162+
<groupId>org.apache.parquet</groupId>
163+
<artifactId>parquet-hadoop</artifactId>
164+
<version>1.16.0</version>
165+
<classifier>tests</classifier>
166+
<scope>test</scope>
167+
<exclusions>
168+
<exclusion>
169+
<groupId>*</groupId>
170+
<artifactId>*</artifactId>
171+
</exclusion>
172+
</exclusions>
173+
</dependency>
160174
</dependencies>
161175

162176
<repositories>

python/test/test_jvm.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,3 +145,7 @@ def test_parquet(self):
145145
with self.assertRaises(RuntimeError) as e:
146146
func(self.spark.read)
147147
self.assertEqual((EXPECTED_UNSUPPORTED_MESSAGE, ), e.exception.args)
148+
149+
150+
if __name__ == '__main__':
151+
SparkTest.main(__file__)

src/main/scala/uk/co/gresearch/spark/parquet/ParquetMetaDataUtil.scala

Lines changed: 50 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,14 @@
1616

1717
package uk.co.gresearch.spark.parquet
1818

19-
import org.apache.parquet.hadoop.metadata.{BlockMetaData, FileMetaData}
19+
import org.apache.parquet.crypto.ParquetCryptoRuntimeException
20+
import org.apache.parquet.hadoop.Footer
21+
import org.apache.parquet.hadoop.metadata.{BlockMetaData, ColumnChunkMetaData, FileMetaData}
2022
import org.apache.parquet.schema.PrimitiveType
2123

2224
import scala.reflect.{ClassTag, classTag}
2325
import scala.util.Try
26+
import scala.collection.convert.ImplicitConversions.`iterable AsScalaIterable`
2427

2528
private trait MethodGuard {
2629
def isSupported[T: ClassTag](methodName: String): Boolean = {
@@ -38,6 +41,45 @@ private trait MethodGuard {
3841
}
3942
}
4043

44+
/**
45+
* Guard access to possibly encrypted and inaccessible metadata of a footer.
46+
* - If footer is encrypted while we have no decryption keys, metadata values are None.
47+
* - If footer is known not to be encrypted, metadata values are Some.
48+
* - If we don't know whether the footer is encrypted, we access some metadata that we could not read if encrypted to
49+
* determine the encryption state of the footer.
50+
*/
51+
private case class FooterGuard(footer: Footer) {
52+
lazy val isSafe: Boolean = {
53+
// having a decryptor tells us this file is expected to be decryptable
54+
Option(footer.getParquetMetadata.getFileMetaData.getFileDecryptor)
55+
    // otherwise, when we have an unencrypted file, we are also safe to access the footer metadata
56+
.orElse(
57+
ParquetMetaDataUtil
58+
.getEncryptionType(footer.getParquetMetadata.getFileMetaData)
59+
.filter(_ == "UNENCRYPTED")
60+
)
61+
// turn to Some(true) if safe, None if unknown
62+
.map(_ => true)
63+
      // otherwise, we access some metadata that would fail if the footer is encrypted
64+
.orElse(
65+
Some(
66+
Try(footer.getParquetMetadata.getBlocks.headOption.map(_.getTotalByteSize))
67+
// get hold of the possible exception
68+
.toEither.swap.toOption
69+
// no exception means safe, ignore exceptions other than ParquetCryptoRuntimeException
70+
.exists(!_.isInstanceOf[ParquetCryptoRuntimeException])
71+
)
72+
)
73+
// now is Some(true) or Some(false)
74+
.get
75+
}
76+
77+
private[parquet] def apply[T](f: => T): Option[T] = {
78+
if (isSafe) { Some(f) }
79+
else { None }
80+
}
81+
}
82+
4183
private[parquet] object ParquetMetaDataUtil extends MethodGuard {
4284
lazy val getEncryptionTypeIsSupported: Boolean =
4385
isSupported[FileMetaData]("getEncryptionType")
@@ -59,4 +101,11 @@ private[parquet] object ParquetMetaDataUtil extends MethodGuard {
59101
guard(getOrdinalIsSupported) { (block: BlockMetaData) =>
60102
block.getOrdinal
61103
}
104+
105+
lazy val isEncryptedIsSupported: Boolean =
106+
isSupported[ColumnChunkMetaData]("isEncrypted")
107+
lazy val isEncrypted: ColumnChunkMetaData => Option[Boolean] =
108+
guard(isEncryptedIsSupported) { (column: ColumnChunkMetaData) =>
109+
column.isEncrypted
110+
}
62111
}

0 commit comments

Comments
 (0)