@@ -40,7 +40,7 @@
     _InternalReadTableMetadataReturnValue,
     _TableMetadataReader,
 )
-from awswrangler.typing import RayReadParquetSettings, _ReadTableMetadataReturnValue
+from awswrangler.typing import ArrowDecryptionConfiguration, RayReadParquetSettings, _ReadTableMetadataReturnValue

 if TYPE_CHECKING:
     from mypy_boto3_s3 import S3Client
@@ -56,9 +56,14 @@
 def _pyarrow_parquet_file_wrapper(
     source: Any,
     coerce_int96_timestamp_unit: str | None = None,
+    decryption_properties: pyarrow.parquet.encryption.DecryptionConfiguration | None = None,
 ) -> pyarrow.parquet.ParquetFile:
     try:
-        return pyarrow.parquet.ParquetFile(source=source, coerce_int96_timestamp_unit=coerce_int96_timestamp_unit)
+        return pyarrow.parquet.ParquetFile(
+            source=source,
+            coerce_int96_timestamp_unit=coerce_int96_timestamp_unit,
+            decryption_properties=decryption_properties,
+        )
     except pyarrow.ArrowInvalid as ex:
         if str(ex) == "Parquet file size is 0 bytes":
             _logger.warning("Ignoring empty file...")
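For context, what the wrapper now forwards is PyArrow's file-level decryption hook. A minimal sketch of the underlying call, assuming an already-built ``CryptoFactory`` and ``KmsConnectionConfig`` are passed in (see the fuller example under ``read_parquet`` below):

    import pyarrow.parquet as pq
    import pyarrow.parquet.encryption as pqe

    def open_encrypted(
        path: str,
        crypto_factory: pqe.CryptoFactory,
        kms_config: pqe.KmsConnectionConfig,
    ) -> pq.ParquetFile:
        # file_decryption_properties() yields the object that
        # ParquetFile accepts via decryption_properties=.
        props = crypto_factory.file_decryption_properties(kms_config)
        return pq.ParquetFile(path, decryption_properties=props)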
@@ -74,6 +79,7 @@ def _read_parquet_metadata_file(
     use_threads: bool | int,
     version_id: str | None = None,
     coerce_int96_timestamp_unit: str | None = None,
+    decryption_properties: pyarrow.parquet.encryption.DecryptionConfiguration | None = None,
 ) -> pa.schema:
     with open_s3_object(
         path=path,
@@ -85,7 +91,9 @@ def _read_parquet_metadata_file(
         s3_additional_kwargs=s3_additional_kwargs,
     ) as f:
         pq_file: pyarrow.parquet.ParquetFile | None = _pyarrow_parquet_file_wrapper(
-            source=f, coerce_int96_timestamp_unit=coerce_int96_timestamp_unit
+            source=f,
+            coerce_int96_timestamp_unit=coerce_int96_timestamp_unit,
+            decryption_properties=decryption_properties,
         )
         if pq_file:
             return pq_file.schema.to_arrow_schema()
@@ -156,6 +164,7 @@ def _read_parquet_file(
     use_threads: bool | int,
     version_id: str | None = None,
     schema: pa.schema | None = None,
+    decryption_properties: pyarrow.parquet.encryption.DecryptionConfiguration | None = None,
 ) -> pa.Table:
     s3_block_size: int = FULL_READ_S3_BLOCK_SIZE if columns else -1  # One shot for a full read or see constant
     with open_s3_object(
@@ -176,6 +185,7 @@ def _read_parquet_file(
                     use_threads=False,
                     use_pandas_metadata=False,
                     coerce_int96_timestamp_unit=coerce_int96_timestamp_unit,
+                    decryption_properties=decryption_properties,
                 )
             except pyarrow.ArrowInvalid as ex:
                 if "Parquet file size is 0 bytes" in str(ex):
@@ -190,6 +200,7 @@ def _read_parquet_file(
         pq_file: pyarrow.parquet.ParquetFile | None = _pyarrow_parquet_file_wrapper(
             source=f,
             coerce_int96_timestamp_unit=coerce_int96_timestamp_unit,
+            decryption_properties=decryption_properties,
         )
         if pq_file is None:
             raise exceptions.InvalidFile(f"Invalid Parquet file: {path}")
@@ -212,6 +223,7 @@ def _read_parquet_chunked(
     s3_additional_kwargs: dict[str, str] | None,
     arrow_kwargs: dict[str, Any],
     version_ids: dict[str, str] | None = None,
+    decryption_properties: pyarrow.parquet.encryption.DecryptionConfiguration | None = None,
 ) -> Iterator[pd.DataFrame]:
     next_slice: pd.DataFrame | None = None
     batch_size = BATCH_READ_BLOCK_SIZE if chunked is True else chunked
@@ -229,6 +241,7 @@ def _read_parquet_chunked(
             pq_file: pyarrow.parquet.ParquetFile | None = _pyarrow_parquet_file_wrapper(
                 source=f,
                 coerce_int96_timestamp_unit=coerce_int96_timestamp_unit,
+                decryption_properties=decryption_properties,
             )
             if pq_file is None:
                 continue
@@ -278,6 +291,7 @@ def _read_parquet(
     s3_additional_kwargs: dict[str, Any] | None,
     arrow_kwargs: dict[str, Any],
     bulk_read: bool,
+    decryption_properties: pyarrow.parquet.encryption.DecryptionConfiguration | None = None,
 ) -> pd.DataFrame:
     executor: _BaseExecutor = _get_executor(use_threads=use_threads)
     tables = executor.map(
@@ -291,6 +305,7 @@ def _read_parquet(
         itertools.repeat(use_threads),
         [version_ids.get(p) if isinstance(version_ids, dict) else None for p in paths],
         itertools.repeat(schema),
+        itertools.repeat(decryption_properties),
     )
     return _utils.table_refs_to_df(tables, kwargs=arrow_kwargs)

@@ -321,6 +336,7 @@ def read_parquet(
     boto3_session: boto3.Session | None = None,
     s3_additional_kwargs: dict[str, Any] | None = None,
     pyarrow_additional_kwargs: dict[str, Any] | None = None,
+    decryption_configuration: ArrowDecryptionConfiguration | None = None,
 ) -> pd.DataFrame | Iterator[pd.DataFrame]:
     """Read Parquet file(s) from an S3 prefix or list of S3 objects paths.

@@ -425,6 +441,11 @@ def read_parquet(
         Forwarded to `to_pandas` method converting from PyArrow tables to Pandas DataFrame.
         Valid values include "split_blocks", "self_destruct", "ignore_metadata".
         e.g. pyarrow_additional_kwargs={'split_blocks': True}.
+    decryption_configuration: typing.ArrowDecryptionConfiguration, optional
+        Dict holding the ``pyarrow.parquet.encryption.CryptoFactory`` and ``pyarrow.parquet.encryption.KmsConnectionConfig``
+        objects used to build the PyArrow ``CryptoFactory.file_decryption_properties`` forwarded to the PyArrow reader.
+        See https://arrow.apache.org/docs/python/parquet.html#decryption-configuration
+        Client-side decryption is not supported in distributed mode.

     Returns
     -------
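A runnable sketch of the new parameter in use. The ``InMemoryKmsClient`` below is an illustrative stand-in (not part of this PR or of awswrangler) for a real ``pyarrow.parquet.encryption.KmsClient`` implementation; the footer key and S3 path are placeholders:

    import pyarrow.parquet.encryption as pqe
    import awswrangler as wr

    class InMemoryKmsClient(pqe.KmsClient):
        """Toy KmsClient: passes keys through unchanged, for illustration only."""

        def __init__(self, config: pqe.KmsConnectionConfig) -> None:
            super().__init__()
            self._keys = config.custom_kms_conf

        def wrap_key(self, key_bytes, master_key_identifier):
            return key_bytes  # a real client would encrypt with the master key

        def unwrap_key(self, wrapped_key, master_key_identifier):
            return wrapped_key

    kms_config = pqe.KmsConnectionConfig(
        custom_kms_conf={"footer_key": "0123456789abcdef"}  # placeholder key
    )
    crypto_factory = pqe.CryptoFactory(lambda config: InMemoryKmsClient(config))

    df = wr.s3.read_parquet(
        path="s3://bucket/encrypted/",  # placeholder path
        decryption_configuration={
            "crypto_factory": crypto_factory,
            "kms_connection_config": kms_config,
        },
    )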
@@ -508,10 +529,17 @@ def read_parquet(
         coerce_int96_timestamp_unit=coerce_int96_timestamp_unit,
     )

+    decryption_properties = (
+        decryption_configuration["crypto_factory"].file_decryption_properties(
+            decryption_configuration["kms_connection_config"]
+        )
+        if decryption_configuration
+        else None
+    )
+
     arrow_kwargs = _data_types.pyarrow2pandas_defaults(
         use_threads=use_threads, kwargs=pyarrow_additional_kwargs, dtype_backend=dtype_backend
     )
-
     if chunked:
         return _read_parquet_chunked(
             s3_client=s3_client,
@@ -524,6 +552,7 @@ def read_parquet(
             s3_additional_kwargs=s3_additional_kwargs,
             arrow_kwargs=arrow_kwargs,
             version_ids=version_ids,
+            decryption_properties=decryption_properties,
         )

     return _read_parquet(
@@ -539,6 +568,7 @@ def read_parquet(
         arrow_kwargs=arrow_kwargs,
         version_ids=version_ids,
         bulk_read=bulk_read,
+        decryption_properties=decryption_properties,
     )


@@ -563,6 +593,7 @@ def read_parquet_table(
     boto3_session: boto3.Session | None = None,
     s3_additional_kwargs: dict[str, Any] | None = None,
     pyarrow_additional_kwargs: dict[str, Any] | None = None,
+    decryption_configuration: ArrowDecryptionConfiguration | None = None,
 ) -> pd.DataFrame | Iterator[pd.DataFrame]:
     """Read Apache Parquet table registered in the AWS Glue Catalog.

@@ -641,6 +672,10 @@ def read_parquet_table(
         Forwarded to `to_pandas` method converting from PyArrow tables to Pandas DataFrame.
         Valid values include "split_blocks", "self_destruct", "ignore_metadata".
         e.g. pyarrow_additional_kwargs={'split_blocks': True}.
+    decryption_configuration: typing.ArrowDecryptionConfiguration, optional
+        Dict holding the ``pyarrow.parquet.encryption.CryptoFactory`` and ``pyarrow.parquet.encryption.KmsConnectionConfig``
+        objects used to build the PyArrow ``CryptoFactory.file_decryption_properties`` forwarded to the PyArrow reader.
+        Client-side decryption is not supported in distributed mode.

     Returns
     -------
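``read_parquet_table`` simply forwards the configuration to ``read_parquet``, as the call-site hunk below shows. With the ``crypto_factory`` and ``kms_config`` from the earlier sketch, usage would look like this (database and table names are placeholders):

    df = wr.s3.read_parquet_table(
        database="my_database",
        table="my_encrypted_table",
        decryption_configuration={
            "crypto_factory": crypto_factory,
            "kms_connection_config": kms_config,
        },
    )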
@@ -698,6 +733,7 @@ def read_parquet_table(
         boto3_session=boto3_session,
         s3_additional_kwargs=s3_additional_kwargs,
         pyarrow_additional_kwargs=pyarrow_additional_kwargs,
+        decryption_configuration=decryption_configuration,
     )

     partial_cast_function = functools.partial(