Skip to content

Commit 913d67c

Browse files
authored
Support parquet encryption (#324)
Supports reading metadata of encrypted parquet with plaintext and encrypted footer.
1 parent 0a446f2 commit 913d67c

File tree

13 files changed

+453
-95
lines changed

13 files changed

+453
-95
lines changed

.github/actions/build-whl/action.yml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,14 @@ runs:
5353
java-version: ${{ inputs.java-compat-version }}
5454
distribution: 'zulu'
5555

56+
- name: Fetch Release Test Dependencies
57+
run: |
58+
# Fetch Release Test Dependencies
59+
echo "::group::mvn dependency:get"
60+
mvn dependency:get -Dtransitive=false -Dartifact=org.apache.parquet:parquet-hadoop:1.16.0:jar:tests
61+
echo "::endgroup::"
62+
shell: bash
63+
5664
- name: Setup Python
5765
uses: actions/setup-python@v5
5866
with:

.github/actions/test-release/action.yml

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -113,13 +113,21 @@ runs:
113113
echo "::endgroup::"
114114
shell: bash
115115

116+
- name: Fetch Release Test Dependencies
117+
run: |
118+
# Fetch Release Test Dependencies
119+
echo "::group::mvn dependency:get"
120+
mvn dependency:get -Dtransitive=false -Dartifact=org.apache.parquet:parquet-hadoop:1.16.0:jar:tests
121+
echo "::endgroup::"
122+
shell: bash
123+
116124
- name: Scala Release Test
117125
env:
118126
SPARK_HOME: ${{ env.SPARK_BIN_HOME }}
119127
run: |
120128
# Scala Release Test
121129
echo "::group::spark-shell"
122-
$SPARK_BIN_HOME/bin/spark-shell --packages uk.co.gresearch.spark:spark-extension_${{ inputs.scala-compat-version }}:$SPARK_EXTENSION_VERSION < test-release.scala
130+
$SPARK_BIN_HOME/bin/spark-shell --packages uk.co.gresearch.spark:spark-extension_${{ inputs.scala-compat-version }}:$SPARK_EXTENSION_VERSION --jars ~/.m2/repository/org/apache/parquet/parquet-hadoop/1.16.0/parquet-hadoop-1.16.0-tests.jar < test-release.scala
123131
echo
124132
echo "::endgroup::"
125133
shell: bash
@@ -142,6 +150,13 @@ runs:
142150
echo "::endgroup::"
143151
shell: bash
144152

153+
- name: Fetch Whl Artifact
154+
if: inputs.python-version != ''
155+
uses: actions/download-artifact@v4
156+
with:
157+
name: Whl (Spark ${{ inputs.spark-compat-version }} Scala ${{ inputs.scala-compat-version }})
158+
path: .
159+
145160
- name: Install Python dependencies
146161
if: inputs.python-version != ''
147162
run: |
@@ -150,7 +165,7 @@ runs:
150165
python -m venv .pytest-venv
151166
.pytest-venv/bin/python -m pip install --upgrade pip
152167
.pytest-venv/bin/pip install pypandoc
153-
.pytest-venv/bin/pip install -e python/[test]
168+
.pytest-venv/bin/pip install $(ls pyspark_extension-*.whl)[test]
154169
echo "::endgroup::"
155170
156171
PYSPARK_HOME=$(.pytest-venv/bin/python -c "import os; import pyspark; print(os.path.dirname(pyspark.__file__))")

PARQUET.md

Lines changed: 51 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -55,20 +55,20 @@ spark.read.parquet_metadata("/path/to/parquet").show()
5555

5656
The Dataframe provides the following per-file information:
5757

58-
|column |type | description |
59-
|:-----------------|:----:|:------------------------------------------------------------------------------|
60-
|filename |string| The Parquet file name |
61-
|blocks |int | Number of blocks / RowGroups in the Parquet file |
62-
|compressedBytes |long | Number of compressed bytes of all blocks |
63-
|uncompressedBytes |long | Number of uncompressed bytes of all blocks |
64-
|rows |long | Number of rows in the file |
65-
|columns |int | Number of columns in the file |
66-
|values |long | Number of values in the file |
67-
|nulls |long | Number of null values in the file |
68-
|createdBy |string| The createdBy string of the Parquet file, e.g. library used to write the file |
69-
|schema |string| The schema |
70-
|encryption |string| The encryption (requires org.apache.parquet:parquet-hadoop:1.12.4 and above) |
71-
|keyValues |string-to-string map| Key-value data of the file |
58+
|column |type | description |
59+
|:-----------------|:----:|:-------------------------------------------------------------------------------|
60+
|filename |string| The Parquet file name |
61+
|blocks |int | Number of blocks / RowGroups in the Parquet file |
62+
|compressedBytes |long | Number of compressed bytes of all blocks |
63+
|uncompressedBytes |long | Number of uncompressed bytes of all blocks |
64+
|rows |long | Number of rows in the file |
65+
|columns |int | Number of columns in the file |
66+
|values |long | Number of values in the file |
67+
|nulls |long | Number of null values in the file |
68+
|createdBy |string| The createdBy string of the Parquet file, e.g. library used to write the file |
69+
|schema |string| The schema |
70+
|encryption |string| The encryption (requires `org.apache.parquet:parquet-hadoop:1.12.4` and above) |
71+
|keyValues |string-to-string map| Key-value data of the file |
7272

7373
## Parquet file schema
7474

@@ -96,20 +96,20 @@ spark.read.parquet_schema("/path/to/parquet").show()
9696

9797
The Dataframe provides the following per-file information:
9898

99-
|column | type | description |
100-
|:-----------------|:------------:|:--------------------------------------------------------------------------------|
101-
|filename | string | The Parquet file name |
102-
|columnName | string | The column name |
103-
|columnPath | string array | The column path |
104-
|repetition | string | The repetition |
105-
|type | string | The data type |
106-
|length | int | The length of the type |
107-
|originalType | string | The original type (requires org.apache.parquet:parquet-hadoop:1.11.0 and above) |
108-
|isPrimitive | boolean | True if type is primitive |
109-
|primitiveType | string | The primitive type |
110-
|primitiveOrder | string | The order of the primitive type |
111-
|maxDefinitionLevel| int | The max definition level |
112-
|maxRepetitionLevel| int | The max repetition level |
99+
|column | type | description |
100+
|:-----------------|:------------:|:----------------------------------------------------------------------------------|
101+
|filename | string | The Parquet file name |
102+
|columnName | string | The column name |
103+
|columnPath | string array | The column path |
104+
|repetition | string | The repetition |
105+
|type | string | The data type |
106+
|length | int | The length of the type |
107+
|originalType | string | The original type (requires `org.apache.parquet:parquet-hadoop:1.11.0` and above) |
108+
|isPrimitive | boolean | True if type is primitive |
109+
|primitiveType | string | The primitive type |
110+
|primitiveOrder | string | The order of the primitive type |
111+
|maxDefinitionLevel| int | The max definition level |
112+
|maxRepetitionLevel| int | The max repetition level |
113113

114114
## Parquet block / RowGroup metadata
115115

@@ -170,21 +170,22 @@ spark.read.parquet_block_columns("/path/to/parquet").show()
170170
+-------------+-----+------+------+-------------------+-------------------+--------------------+------------------+-----------+---------------+-----------------+------+-----+
171171
```
172172

173-
|column |type |description |
174-
|:-----------------|:-----------:|:-----------------------------------------------------|
175-
|filename |string |The Parquet file name |
176-
|block |int |Block / RowGroup number starting at 1 |
177-
|column |array<string>|Block / RowGroup column name |
178-
|codec |string |The coded used to compress the block column values |
179-
|type |string |The data type of the block column |
180-
|encodings |array<string>|Encodings of the block column |
181-
|minValue |string |Minimum value of this column in this block |
182-
|maxValue |string |Maximum value of this column in this block |
183-
|columnStart |long |Start position of the block column in the Parquet file|
184-
|compressedBytes |long |Number of compressed bytes of this block column |
185-
|uncompressedBytes |long |Number of uncompressed bytes of this block column |
186-
|values |long |Number of values in this block column |
187-
|nulls |long |Number of null values in this block column |
173+
| column | type | description |
174+
|:------------------|:-------------:|:--------------------------------------------------------------------------------------------------|
175+
| filename | string | The Parquet file name |
176+
| block | int | Block / RowGroup number starting at 1 |
177+
| column | array<string> | Block / RowGroup column name |
178+
| codec             | string        | The codec used to compress the block column values                                                  |
179+
| type | string | The data type of the block column |
180+
| encodings | array<string> | Encodings of the block column |
181+
| isEncrypted | boolean | Whether block column is encrypted (requires `org.apache.parquet:parquet-hadoop:1.12.3` and above) |
182+
| minValue | string | Minimum value of this column in this block |
183+
| maxValue | string | Maximum value of this column in this block |
184+
| columnStart | long | Start position of the block column in the Parquet file |
185+
| compressedBytes | long | Number of compressed bytes of this block column |
186+
| uncompressedBytes | long | Number of uncompressed bytes of this block column |
187+
| values | long | Number of values in this block column |
188+
| nulls | long | Number of null values in this block column |
188189

189190
## Parquet partition metadata
190191

@@ -255,6 +256,13 @@ spark.read.parquet_block_columns("/path/to/parquet", parallelism=100)
255256
spark.read.parquet_partitions("/path/to/parquet", parallelism=100)
256257
```
257258

259+
## Encryption
260+
261+
Reading [encrypted Parquet is supported](https://spark.apache.org/docs/latest/sql-data-sources-parquet.html#columnar-encryption).
262+
Files encrypted with [plaintext footer](https://github.com/apache/parquet-format/blob/master/Encryption.md#55-plaintext-footer-mode)
263+
can be read without any encryption keys, while encrypted Parquet metadata are then shown as `NULL` values in the result Dataframe.
264+
Encrypted Parquet files with an encrypted footer require the footer encryption key only. No column encryption keys are needed.
265+
258266
## Known Issues
259267

260268
Note that this feature is not supported in Python when connected with a [Spark Connect server](README.md#spark-connect-server).

pom.xml

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,20 @@
157157
<version>1.0.0-M2</version>
158158
<scope>test</scope>
159159
</dependency>
160+
161+
<dependency>
162+
<groupId>org.apache.parquet</groupId>
163+
<artifactId>parquet-hadoop</artifactId>
164+
<version>1.16.0</version>
165+
<classifier>tests</classifier>
166+
<scope>test</scope>
167+
<exclusions>
168+
<exclusion>
169+
<groupId>*</groupId>
170+
<artifactId>*</artifactId>
171+
</exclusion>
172+
</exclusions>
173+
</dependency>
160174
</dependencies>
161175

162176
<repositories>

python/test/test_jvm.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,3 +145,7 @@ def test_parquet(self):
145145
with self.assertRaises(RuntimeError) as e:
146146
func(self.spark.read)
147147
self.assertEqual((EXPECTED_UNSUPPORTED_MESSAGE, ), e.exception.args)
148+
149+
150+
if __name__ == '__main__':
151+
SparkTest.main(__file__)

src/main/scala/uk/co/gresearch/spark/parquet/ParquetMetaDataUtil.scala

Lines changed: 50 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,14 @@
1616

1717
package uk.co.gresearch.spark.parquet
1818

19-
import org.apache.parquet.hadoop.metadata.{BlockMetaData, FileMetaData}
19+
import org.apache.parquet.crypto.ParquetCryptoRuntimeException
20+
import org.apache.parquet.hadoop.Footer
21+
import org.apache.parquet.hadoop.metadata.{BlockMetaData, ColumnChunkMetaData, FileMetaData}
2022
import org.apache.parquet.schema.PrimitiveType
2123

2224
import scala.reflect.{ClassTag, classTag}
2325
import scala.util.Try
26+
import scala.collection.convert.ImplicitConversions.`iterable AsScalaIterable`
2427

2528
private trait MethodGuard {
2629
def isSupported[T: ClassTag](methodName: String): Boolean = {
@@ -38,6 +41,45 @@ private trait MethodGuard {
3841
}
3942
}
4043

44+
/**
45+
* Guard access to possibly encrypted and inaccessible metadata of a footer.
46+
* - If footer is encrypted while we have no decryption keys, metadata values are None.
47+
* - If footer is known not to be encrypted, metadata values are Some.
48+
* - If we don't know whether the footer is encrypted, we access some metadata that we could not read if encrypted to
49+
* determine the encryption state of the footer.
50+
*/
51+
private case class FooterGuard(footer: Footer) {
52+
lazy val isSafe: Boolean = {
53+
// having a decryptor tells us this file is expected to be decryptable
54+
Option(footer.getParquetMetadata.getFileMetaData.getFileDecryptor)
55+
    // otherwise, when we have an unencrypted file, we are also safe to access the footer metadata
56+
.orElse(
57+
ParquetMetaDataUtil
58+
.getEncryptionType(footer.getParquetMetadata.getFileMetaData)
59+
.filter(_ == "UNENCRYPTED")
60+
)
61+
// turn to Some(true) if safe, None if unknown
62+
.map(_ => true)
63+
      // otherwise, we access some metadata that would fail if the footer is encrypted
64+
.orElse(
65+
Some(
66+
Try(footer.getParquetMetadata.getBlocks.headOption.map(_.getTotalByteSize))
67+
// get hold of the possible exception
68+
.toEither.swap.toOption
69+
// no exception means safe, ignore exceptions other than ParquetCryptoRuntimeException
70+
.exists(!_.isInstanceOf[ParquetCryptoRuntimeException])
71+
)
72+
)
73+
// now is Some(true) or Some(false)
74+
.get
75+
}
76+
77+
private[parquet] def apply[T](f: => T): Option[T] = {
78+
if (isSafe) { Some(f) }
79+
else { None }
80+
}
81+
}
82+
4183
private[parquet] object ParquetMetaDataUtil extends MethodGuard {
4284
lazy val getEncryptionTypeIsSupported: Boolean =
4385
isSupported[FileMetaData]("getEncryptionType")
@@ -59,4 +101,11 @@ private[parquet] object ParquetMetaDataUtil extends MethodGuard {
59101
guard(getOrdinalIsSupported) { (block: BlockMetaData) =>
60102
block.getOrdinal
61103
}
104+
105+
lazy val isEncryptedIsSupported: Boolean =
106+
isSupported[ColumnChunkMetaData]("isEncrypted")
107+
lazy val isEncrypted: ColumnChunkMetaData => Option[Boolean] =
108+
guard(isEncryptedIsSupported) { (column: ColumnChunkMetaData) =>
109+
column.isEncrypted
110+
}
62111
}

0 commit comments

Comments
 (0)