
Commit 3b96385

feat: support parquet client encryption (#2674)
Closes #2642
1 parent d0081d3 commit 3b96385

File tree: 12 files changed, +258 −5 lines

awswrangler/distributed/ray/datasources/arrow_parquet_base_datasource.py

Lines changed: 2 additions & 0 deletions
@@ -40,12 +40,14 @@ def _read_stream(self, f: pa.NativeFile, path: str) -> Iterator[pa.Table]:

         dataset_kwargs = arrow_parquet_args.get("dataset_kwargs", {})
         coerce_int96_timestamp_unit: str | None = dataset_kwargs.get("coerce_int96_timestamp_unit", None)
+        decryption_properties = dataset_kwargs.get("decryption_properties", None)

         table = pq.read_table(
             f,
             use_threads=use_threads,
             columns=columns,
             coerce_int96_timestamp_unit=coerce_int96_timestamp_unit,
+            decryption_properties=decryption_properties,
         )

         table = _add_table_partitions(

awswrangler/distributed/ray/modin/s3/_read_parquet.py

Lines changed: 3 additions & 0 deletions
@@ -39,10 +39,13 @@ def _read_parquet_distributed(
     s3_additional_kwargs: dict[str, Any] | None,
     arrow_kwargs: dict[str, Any],
     bulk_read: bool,
+    decryption_properties: pa.parquet.encryption.DecryptionConfiguration | None = None,
 ) -> pd.DataFrame:
     dataset_kwargs = {}
     if coerce_int96_timestamp_unit:
         dataset_kwargs["coerce_int96_timestamp_unit"] = coerce_int96_timestamp_unit
+    if decryption_properties:
+        dataset_kwargs["decryption_properties"] = decryption_properties

     dataset = read_datasource(
         **_resolve_datasource_parameters(

awswrangler/distributed/ray/modin/s3/_write_orc.py

Lines changed: 2 additions & 0 deletions
@@ -12,6 +12,7 @@
 from awswrangler import exceptions
 from awswrangler.distributed.ray.datasources import ArrowORCDatasink, UserProvidedKeyBlockWritePathProvider
 from awswrangler.distributed.ray.modin._utils import _ray_dataset_from_df
+from awswrangler.typing import ArrowEncryptionConfiguration

 if TYPE_CHECKING:
     from mypy_boto3_s3 import S3Client
@@ -36,6 +37,7 @@ def _to_orc_distributed(
     filename_prefix: str | None = None,
     max_rows_by_file: int | None = 0,
     bucketing: bool = False,
+    encryption_configuration: ArrowEncryptionConfiguration | None = None,
 ) -> list[str]:
     if bucketing:
         # Add bucket id to the prefix

awswrangler/distributed/ray/modin/s3/_write_parquet.py

Lines changed: 2 additions & 0 deletions
@@ -12,6 +12,7 @@
 from awswrangler import exceptions
 from awswrangler.distributed.ray.datasources import ArrowParquetDatasink, UserProvidedKeyBlockWritePathProvider
 from awswrangler.distributed.ray.modin._utils import _ray_dataset_from_df
+from awswrangler.typing import ArrowEncryptionConfiguration

 if TYPE_CHECKING:
     from mypy_boto3_s3 import S3Client
@@ -36,6 +37,7 @@ def _to_parquet_distributed(
     filename_prefix: str | None = "",
     max_rows_by_file: int | None = 0,
     bucketing: bool = False,
+    encryption_configuration: ArrowEncryptionConfiguration | None = None,
 ) -> list[str]:
     if bucketing:
         # Add bucket id to the prefix

awswrangler/s3/_read_parquet.py

Lines changed: 40 additions & 4 deletions
@@ -40,7 +40,7 @@
     _InternalReadTableMetadataReturnValue,
     _TableMetadataReader,
 )
-from awswrangler.typing import RayReadParquetSettings, _ReadTableMetadataReturnValue
+from awswrangler.typing import ArrowDecryptionConfiguration, RayReadParquetSettings, _ReadTableMetadataReturnValue

 if TYPE_CHECKING:
     from mypy_boto3_s3 import S3Client
@@ -56,9 +56,14 @@
 def _pyarrow_parquet_file_wrapper(
     source: Any,
     coerce_int96_timestamp_unit: str | None = None,
+    decryption_properties: pyarrow.parquet.encryption.DecryptionConfiguration | None = None,
 ) -> pyarrow.parquet.ParquetFile:
     try:
-        return pyarrow.parquet.ParquetFile(source=source, coerce_int96_timestamp_unit=coerce_int96_timestamp_unit)
+        return pyarrow.parquet.ParquetFile(
+            source=source,
+            coerce_int96_timestamp_unit=coerce_int96_timestamp_unit,
+            decryption_properties=decryption_properties,
+        )
     except pyarrow.ArrowInvalid as ex:
         if str(ex) == "Parquet file size is 0 bytes":
             _logger.warning("Ignoring empty file...")
@@ -74,6 +79,7 @@ def _read_parquet_metadata_file(
     use_threads: bool | int,
     version_id: str | None = None,
     coerce_int96_timestamp_unit: str | None = None,
+    decryption_properties: pyarrow.parquet.encryption.DecryptionConfiguration | None = None,
 ) -> pa.schema:
     with open_s3_object(
         path=path,
@@ -85,7 +91,9 @@
         s3_additional_kwargs=s3_additional_kwargs,
     ) as f:
         pq_file: pyarrow.parquet.ParquetFile | None = _pyarrow_parquet_file_wrapper(
-            source=f, coerce_int96_timestamp_unit=coerce_int96_timestamp_unit
+            source=f,
+            coerce_int96_timestamp_unit=coerce_int96_timestamp_unit,
+            decryption_properties=decryption_properties,
         )
         if pq_file:
             return pq_file.schema.to_arrow_schema()
@@ -156,6 +164,7 @@ def _read_parquet_file(
     use_threads: bool | int,
     version_id: str | None = None,
     schema: pa.schema | None = None,
+    decryption_properties: pyarrow.parquet.encryption.DecryptionConfiguration | None = None,
 ) -> pa.Table:
     s3_block_size: int = FULL_READ_S3_BLOCK_SIZE if columns else -1  # One shot for a full read or see constant
     with open_s3_object(
@@ -176,6 +185,7 @@
                 use_threads=False,
                 use_pandas_metadata=False,
                 coerce_int96_timestamp_unit=coerce_int96_timestamp_unit,
+                decryption_properties=decryption_properties,
             )
         except pyarrow.ArrowInvalid as ex:
             if "Parquet file size is 0 bytes" in str(ex):
@@ -190,6 +200,7 @@
             pq_file: pyarrow.parquet.ParquetFile | None = _pyarrow_parquet_file_wrapper(
                 source=f,
                 coerce_int96_timestamp_unit=coerce_int96_timestamp_unit,
+                decryption_properties=decryption_properties,
             )
             if pq_file is None:
                 raise exceptions.InvalidFile(f"Invalid Parquet file: {path}")
@@ -212,6 +223,7 @@ def _read_parquet_chunked(
     s3_additional_kwargs: dict[str, str] | None,
     arrow_kwargs: dict[str, Any],
     version_ids: dict[str, str] | None = None,
+    decryption_properties: pyarrow.parquet.encryption.DecryptionConfiguration | None = None,
 ) -> Iterator[pd.DataFrame]:
     next_slice: pd.DataFrame | None = None
     batch_size = BATCH_READ_BLOCK_SIZE if chunked is True else chunked
@@ -229,6 +241,7 @@
             pq_file: pyarrow.parquet.ParquetFile | None = _pyarrow_parquet_file_wrapper(
                 source=f,
                 coerce_int96_timestamp_unit=coerce_int96_timestamp_unit,
+                decryption_properties=decryption_properties,
             )
             if pq_file is None:
                 continue
@@ -278,6 +291,7 @@ def _read_parquet(
     s3_additional_kwargs: dict[str, Any] | None,
     arrow_kwargs: dict[str, Any],
     bulk_read: bool,
+    decryption_properties: pyarrow.parquet.encryption.DecryptionConfiguration | None = None,
 ) -> pd.DataFrame:
     executor: _BaseExecutor = _get_executor(use_threads=use_threads)
     tables = executor.map(
@@ -291,6 +305,7 @@
         itertools.repeat(use_threads),
         [version_ids.get(p) if isinstance(version_ids, dict) else None for p in paths],
         itertools.repeat(schema),
+        itertools.repeat(decryption_properties),
     )
     return _utils.table_refs_to_df(tables, kwargs=arrow_kwargs)

@@ -321,6 +336,7 @@ def read_parquet(
     boto3_session: boto3.Session | None = None,
     s3_additional_kwargs: dict[str, Any] | None = None,
     pyarrow_additional_kwargs: dict[str, Any] | None = None,
+    decryption_configuration: ArrowDecryptionConfiguration | None = None,
 ) -> pd.DataFrame | Iterator[pd.DataFrame]:
     """Read Parquet file(s) from an S3 prefix or list of S3 objects paths.

@@ -425,6 +441,11 @@ def read_parquet(
         Forwarded to `to_pandas` method converting from PyArrow tables to Pandas DataFrame.
         Valid values include "split_blocks", "self_destruct", "ignore_metadata".
         e.g. pyarrow_additional_kwargs={'split_blocks': True}.
+    decryption_configuration: typing.ArrowDecryptionConfiguration, optional
+        ``pyarrow.parquet.encryption.CryptoFactory`` and ``pyarrow.parquet.encryption.KmsConnectionConfig`` objects dict
+        used to create a PyArrow ``CryptoFactory.file_decryption_properties`` object to forward to PyArrow reader.
+        see: https://arrow.apache.org/docs/python/parquet.html#decryption-configuration
+        Client Decryption is not supported in distributed mode.

     Returns
     -------
@@ -508,10 +529,17 @@ def read_parquet(
         coerce_int96_timestamp_unit=coerce_int96_timestamp_unit,
     )

+    decryption_properties = (
+        decryption_configuration["crypto_factory"].file_decryption_properties(
+            decryption_configuration["kms_connection_config"]
+        )
+        if decryption_configuration
+        else None
+    )
+
     arrow_kwargs = _data_types.pyarrow2pandas_defaults(
         use_threads=use_threads, kwargs=pyarrow_additional_kwargs, dtype_backend=dtype_backend
     )
-
     if chunked:
         return _read_parquet_chunked(
             s3_client=s3_client,
@@ -524,6 +552,7 @@
             s3_additional_kwargs=s3_additional_kwargs,
             arrow_kwargs=arrow_kwargs,
             version_ids=version_ids,
+            decryption_properties=decryption_properties,
         )

     return _read_parquet(
@@ -539,6 +568,7 @@
         arrow_kwargs=arrow_kwargs,
         version_ids=version_ids,
         bulk_read=bulk_read,
+        decryption_properties=decryption_properties,
     )


@@ -563,6 +593,7 @@ def read_parquet_table(
     boto3_session: boto3.Session | None = None,
     s3_additional_kwargs: dict[str, Any] | None = None,
     pyarrow_additional_kwargs: dict[str, Any] | None = None,
+    decryption_configuration: ArrowDecryptionConfiguration | None = None,
 ) -> pd.DataFrame | Iterator[pd.DataFrame]:
     """Read Apache Parquet table registered in the AWS Glue Catalog.

@@ -641,6 +672,10 @@ def read_parquet_table(
         Forwarded to `to_pandas` method converting from PyArrow tables to Pandas DataFrame.
         Valid values include "split_blocks", "self_destruct", "ignore_metadata".
         e.g. pyarrow_additional_kwargs={'split_blocks': True}.
+    decryption_configuration: typing.ArrowDecryptionConfiguration, optional
+        ``pyarrow.parquet.encryption.CryptoFactory`` and ``pyarrow.parquet.encryption.KmsConnectionConfig`` objects dict
+        used to create a PyArrow ``CryptoFactory.file_decryption_properties`` object to forward to PyArrow reader.
+        Client Decryption is not supported in distributed mode.

     Returns
     -------
@@ -698,6 +733,7 @@ def read_parquet_table(
         boto3_session=boto3_session,
         s3_additional_kwargs=s3_additional_kwargs,
         pyarrow_additional_kwargs=pyarrow_additional_kwargs,
+        decryption_configuration=decryption_configuration,
     )

     partial_cast_function = functools.partial(
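
The docstring above points to PyArrow's decryption-configuration documentation; the following is a minimal, self-contained sketch of how the new decryption_configuration argument could be used. It relies on a toy in-memory KMS client adapted from the PyArrow documentation pattern; the InMemoryKmsClient class, master-key names, and S3 path are illustrative and not part of this commit.

import base64

import awswrangler as wr
import pyarrow.parquet.encryption as pe


class InMemoryKmsClient(pe.KmsClient):
    """Toy KMS client that keeps master keys in memory (local testing only, not secure)."""

    def __init__(self, kms_connection_config: pe.KmsConnectionConfig) -> None:
        super().__init__()
        self.master_keys_map = kms_connection_config.custom_kms_conf

    def wrap_key(self, key_bytes: bytes, master_key_identifier: str) -> bytes:
        # "Wrap" by prefixing the data key with the master key (not real encryption).
        master_key = self.master_keys_map[master_key_identifier].encode("utf-8")
        return base64.b64encode(master_key + key_bytes)

    def unwrap_key(self, wrapped_key: bytes, master_key_identifier: str) -> bytes:
        master_key = self.master_keys_map[master_key_identifier].encode("utf-8")
        return base64.b64decode(wrapped_key)[len(master_key):]


# Master-key names and values are illustrative.
kms_connection_config = pe.KmsConnectionConfig(
    custom_kms_conf={
        "footer_key": "0123456789012345",
        "column_key": "1234567890123450",
    }
)
crypto_factory = pe.CryptoFactory(InMemoryKmsClient)

# The dict keys match how read_parquet builds file_decryption_properties in the hunk above.
df = wr.s3.read_parquet(
    path="s3://my-bucket/encrypted-dataset/",  # illustrative path
    decryption_configuration={
        "crypto_factory": crypto_factory,
        "kms_connection_config": kms_connection_config,
    },
)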

awswrangler/s3/_read_parquet.pyi

Lines changed: 9 additions & 1 deletion
@@ -5,7 +5,7 @@ import boto3
 import pandas as pd
 import pyarrow as pa

-from awswrangler.typing import RayReadParquetSettings
+from awswrangler.typing import ArrowDecryptionConfiguration, RayReadParquetSettings

 if TYPE_CHECKING:
     from mypy_boto3_s3 import S3Client
@@ -88,6 +88,7 @@ def read_parquet(
     boto3_session: boto3.Session | None = ...,
     s3_additional_kwargs: dict[str, Any] | None = ...,
     pyarrow_additional_kwargs: dict[str, Any] | None = ...,
+    decryption_configuration: ArrowDecryptionConfiguration | None = ...,
 ) -> pd.DataFrame: ...
 @overload
 def read_parquet(
@@ -113,6 +114,7 @@ def read_parquet(
     boto3_session: boto3.Session | None = ...,
     s3_additional_kwargs: dict[str, Any] | None = ...,
     pyarrow_additional_kwargs: dict[str, Any] | None = ...,
+    decryption_configuration: ArrowDecryptionConfiguration | None = ...,
 ) -> Iterator[pd.DataFrame]: ...
 @overload
 def read_parquet(
@@ -138,6 +140,7 @@ def read_parquet(
     boto3_session: boto3.Session | None = ...,
     s3_additional_kwargs: dict[str, Any] | None = ...,
     pyarrow_additional_kwargs: dict[str, Any] | None = ...,
+    decryption_configuration: ArrowDecryptionConfiguration | None = ...,
 ) -> pd.DataFrame | Iterator[pd.DataFrame]: ...
 @overload
 def read_parquet(
@@ -163,6 +166,7 @@ def read_parquet(
     boto3_session: boto3.Session | None = ...,
     s3_additional_kwargs: dict[str, Any] | None = ...,
     pyarrow_additional_kwargs: dict[str, Any] | None = ...,
+    decryption_configuration: ArrowDecryptionConfiguration | None = ...,
 ) -> Iterator[pd.DataFrame]: ...
 @overload
 def read_parquet_table(
@@ -183,6 +187,7 @@ def read_parquet_table(
     boto3_session: boto3.Session | None = ...,
     s3_additional_kwargs: dict[str, Any] | None = ...,
     pyarrow_additional_kwargs: dict[str, Any] | None = ...,
+    decryption_configuration: ArrowDecryptionConfiguration | None = ...,
 ) -> pd.DataFrame: ...
 @overload
 def read_parquet_table(
@@ -203,6 +208,7 @@ def read_parquet_table(
     boto3_session: boto3.Session | None = ...,
     s3_additional_kwargs: dict[str, Any] | None = ...,
     pyarrow_additional_kwargs: dict[str, Any] | None = ...,
+    decryption_configuration: ArrowDecryptionConfiguration | None = ...,
 ) -> Iterator[pd.DataFrame]: ...
 @overload
 def read_parquet_table(
@@ -223,6 +229,7 @@ def read_parquet_table(
     boto3_session: boto3.Session | None = ...,
     s3_additional_kwargs: dict[str, Any] | None = ...,
     pyarrow_additional_kwargs: dict[str, Any] | None = ...,
+    decryption_configuration: ArrowDecryptionConfiguration | None = ...,
 ) -> pd.DataFrame | Iterator[pd.DataFrame]: ...
 @overload
 def read_parquet_table(
@@ -243,4 +250,5 @@ def read_parquet_table(
     boto3_session: boto3.Session | None = ...,
     s3_additional_kwargs: dict[str, Any] | None = ...,
     pyarrow_additional_kwargs: dict[str, Any] | None = ...,
+    decryption_configuration: ArrowDecryptionConfiguration | None = ...,
 ) -> Iterator[pd.DataFrame]: ...
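
The change to awswrangler/typing.py is among the 12 files in this commit but is not shown in this excerpt. Judging from how read_parquet indexes decryption_configuration above, ArrowDecryptionConfiguration is presumably a TypedDict along these lines (a sketch inferred from the diff, not the committed definition):

from typing import TypedDict

import pyarrow.parquet.encryption as pe


class ArrowDecryptionConfiguration(TypedDict):
    # Keys inferred from read_parquet above, which calls
    # decryption_configuration["crypto_factory"].file_decryption_properties(
    #     decryption_configuration["kms_connection_config"]).
    crypto_factory: pe.CryptoFactory
    kms_connection_config: pe.KmsConnectionConfig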

awswrangler/s3/_write.py

Lines changed: 4 additions & 0 deletions
@@ -195,6 +195,7 @@ def _write_to_s3(
         filename_prefix: str | None = None,
         max_rows_by_file: int | None = 0,
         bucketing: bool = False,
+        encryption_configuration: typing.ArrowEncryptionConfiguration | None = None,
     ) -> list[str]:
         pass

@@ -269,6 +270,7 @@ def write(  # noqa: PLR0912,PLR0913,PLR0915
         athena_partition_projection_settings: typing.AthenaPartitionProjectionSettings | None,
         catalog_id: str | None,
         compression_ext: str,
+        encryption_configuration: typing.ArrowEncryptionConfiguration | None,
     ) -> typing._S3WriteDataReturnValue:
         # Initializing defaults
         partition_cols = partition_cols if partition_cols else []
@@ -349,6 +351,7 @@ def write(  # noqa: PLR0912,PLR0913,PLR0915
                 dtype=dtype,
                 max_rows_by_file=max_rows_by_file,
                 use_threads=use_threads,
+                encryption_configuration=encryption_configuration,
             )
         else:
             columns_types: dict[str, str] = {}
@@ -417,6 +420,7 @@ def write(  # noqa: PLR0912,PLR0913,PLR0915
             s3_additional_kwargs=s3_additional_kwargs,
             schema=schema,
             max_rows_by_file=max_rows_by_file,
+            encryption_configuration=encryption_configuration,
         )
         if database and table:
             try:

awswrangler/s3/_write_orc.py

Lines changed: 4 additions & 0 deletions
@@ -156,6 +156,7 @@ def _to_orc(
     filename_prefix: str | None = None,
     max_rows_by_file: int | None = 0,
     bucketing: bool = False,
+    encryption_configuration: typing.ArrowEncryptionConfiguration | None = None,
 ) -> list[str]:
     s3_client = s3_client if s3_client else _utils.client(service_name="s3")
     file_path = _get_file_path(
@@ -216,6 +217,7 @@ def _write_to_s3(
         filename_prefix: str | None = None,
         max_rows_by_file: int | None = 0,
         bucketing: bool = False,
+        encryption_configuration: typing.ArrowEncryptionConfiguration | None = None,
     ) -> list[str]:
         return _to_orc(
             df=df,
@@ -234,6 +236,7 @@ def _write_to_s3(
             filename_prefix=filename_prefix,
             max_rows_by_file=max_rows_by_file,
             bucketing=bucketing,
+            encryption_configuration=encryption_configuration,
         )

     def _create_glue_table(
@@ -712,4 +715,5 @@ def to_orc(
         athena_partition_projection_settings=athena_partition_projection_settings,
         catalog_id=catalog_id,
         compression_ext=compression_ext,
+        encryption_configuration=None,
     )
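
Note that to_orc above forwards encryption_configuration=None, so client-side encryption appears to be wired up only for the Parquet writer in this change. The public to_parquet diff is not part of this excerpt; the sketch below assumes it exposes the same encryption_configuration parameter that _write.py threads through, and that ArrowEncryptionConfiguration also carries a PyArrow EncryptionConfiguration object (the dict keys are assumptions, not shown above). It reuses crypto_factory and kms_connection_config from the read_parquet example earlier.

import awswrangler as wr
import pandas as pd
import pyarrow.parquet.encryption as pe

# Column and key names are illustrative; footer_key/column_key must match the
# master-key identifiers registered in kms_connection_config above.
encryption_config = pe.EncryptionConfiguration(
    footer_key="footer_key",
    column_keys={"column_key": ["sensitive_col"]},
)

wr.s3.to_parquet(
    df=pd.DataFrame({"sensitive_col": [1, 2, 3]}),
    path="s3://my-bucket/encrypted-dataset/data.parquet",  # illustrative path
    encryption_configuration={
        # Assumed ArrowEncryptionConfiguration keys; awswrangler/typing.py is not shown in this excerpt.
        "crypto_factory": crypto_factory,
        "kms_connection_config": kms_connection_config,
        "encryption_config": encryption_config,
    },
)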
