 import boto3
 import pandas as pd
 import pyarrow as pa
-import pyarrow.lib
 import pyarrow.parquet

 from awswrangler import _data_types, _utils, exceptions
 _logger: logging.Logger = logging.getLogger(__name__)


+def _pyarrow_parquet_file_wrapper(
+    source: Any, read_dictionary: Optional[List[str]] = None
+) -> pyarrow.parquet.ParquetFile:
+    try:
+        return pyarrow.parquet.ParquetFile(source=source, read_dictionary=read_dictionary)
+    except pyarrow.ArrowInvalid as ex:
+        if str(ex) == "Parquet file size is 0 bytes":
+            _logger.warning("Ignoring empty file...")
+            return None
+        raise
+
+
 def _read_parquet_metadata_file(
     path: str, boto3_session: boto3.Session, s3_additional_kwargs: Optional[Dict[str, str]], use_threads: bool
-) -> Dict[str, str]:
+) -> Optional[Dict[str, str]]:
     with open_s3_object(
         path=path,
         mode="rb",
@@ -43,7 +54,9 @@ def _read_parquet_metadata_file(
         s3_additional_kwargs=s3_additional_kwargs,
         boto3_session=boto3_session,
     ) as f:
-        pq_file: pyarrow.parquet.ParquetFile = pyarrow.parquet.ParquetFile(source=f)
+        pq_file: Optional[pyarrow.parquet.ParquetFile] = _pyarrow_parquet_file_wrapper(source=f)
+        if pq_file is None:
+            return None
         return _data_types.athena_types_from_pyarrow_schema(schema=pq_file.schema.to_arrow_schema(), partitions=None)[0]

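For context, a minimal sketch (not part of the commit) of the pyarrow behaviour the new wrapper relies on: opening a 0-byte source raises pyarrow.ArrowInvalid with the message the wrapper string-matches, and the wrapper turns that into a None return. The exact message text is an assumption tied to the pyarrow version awswrangler pins.

import io

import pyarrow
import pyarrow.parquet

# A 0-byte object (e.g. an empty S3 key) cannot be parsed as Parquet;
# pyarrow rejects it with ArrowInvalid("Parquet file size is 0 bytes"),
# which _pyarrow_parquet_file_wrapper catches and converts into None.
try:
    pyarrow.parquet.ParquetFile(source=io.BytesIO(b""))
except pyarrow.ArrowInvalid as ex:
    print(ex)  # Parquet file size is 0 bytes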
@@ -55,7 +68,7 @@ def _read_schemas_from_files(
     s3_additional_kwargs: Optional[Dict[str, str]],
 ) -> Tuple[Dict[str, str], ...]:
     paths = _utils.list_sampling(lst=paths, sampling=sampling)
-    schemas: Tuple[Dict[str, str], ...] = tuple()
+    schemas: Tuple[Optional[Dict[str, str]], ...] = tuple()
     n_paths: int = len(paths)
     if use_threads is False or n_paths == 1:
         schemas = tuple(
@@ -76,6 +89,7 @@ def _read_schemas_from_files(
                     itertools.repeat(use_threads),
                 )
             )
+    schemas = cast(Tuple[Dict[str, str], ...], tuple(x for x in schemas if x is not None))
     _logger.debug("schemas: %s", schemas)
     return schemas

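A small illustration (variable names assumed, not from the commit) of what the added cast/filter line does: schemas read from empty files come back as None and are dropped before the remaining schemas are merged or validated.

from typing import Dict, Optional, Tuple, cast

# Schemas as returned per file; the None entry stands for an empty
# (0-byte) file that the wrapper skipped.
raw: Tuple[Optional[Dict[str, str]], ...] = ({"c0": "bigint"}, None, {"c0": "bigint"})
schemas = cast(Tuple[Dict[str, str], ...], tuple(x for x in raw if x is not None))
print(schemas)  # ({'c0': 'bigint'}, {'c0': 'bigint'})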
@@ -125,6 +139,7 @@ def _read_parquet_metadata(
     path: Union[str, List[str]],
     path_suffix: Optional[str],
     path_ignore_suffix: Optional[str],
+    ignore_empty: bool,
     dtype: Optional[Dict[str, str]],
     sampling: float,
     dataset: bool,
@@ -139,6 +154,7 @@ def _read_parquet_metadata(
         boto3_session=boto3_session,
         suffix=path_suffix,
         ignore_suffix=_get_path_ignore_suffix(path_ignore_suffix=path_ignore_suffix),
+        ignore_empty=ignore_empty,
     )

     # Files
@@ -279,7 +295,11 @@ def _read_parquet_chunked(
             s3_additional_kwargs=s3_additional_kwargs,
             boto3_session=boto3_session,
         ) as f:
-            pq_file: pyarrow.parquet.ParquetFile = pyarrow.parquet.ParquetFile(source=f, read_dictionary=categories)
+            pq_file: Optional[pyarrow.parquet.ParquetFile] = _pyarrow_parquet_file_wrapper(
+                source=f, read_dictionary=categories
+            )
+            if pq_file is None:
+                continue
             schema: Dict[str, str] = _data_types.athena_types_from_pyarrow_schema(
                 schema=pq_file.schema.to_arrow_schema(), partitions=None
             )[0]
@@ -342,7 +362,11 @@ def _read_parquet_file(
         s3_additional_kwargs=s3_additional_kwargs,
         boto3_session=boto3_session,
     ) as f:
-        pq_file: pyarrow.parquet.ParquetFile = pyarrow.parquet.ParquetFile(source=f, read_dictionary=categories)
+        pq_file: Optional[pyarrow.parquet.ParquetFile] = _pyarrow_parquet_file_wrapper(
+            source=f, read_dictionary=categories
+        )
+        if pq_file is None:
+            raise exceptions.InvalidFile(f"Invalid Parquet file: {path}")
         return pq_file.read(columns=columns, use_threads=False, use_pandas_metadata=False)

@@ -362,7 +386,11 @@ def _count_row_groups(
         s3_additional_kwargs=s3_additional_kwargs,
         boto3_session=boto3_session,
     ) as f:
-        pq_file: pyarrow.parquet.ParquetFile = pyarrow.parquet.ParquetFile(source=f, read_dictionary=categories)
+        pq_file: Optional[pyarrow.parquet.ParquetFile] = _pyarrow_parquet_file_wrapper(
+            source=f, read_dictionary=categories
+        )
+        if pq_file is None:
+            return 0
         n: int = cast(int, pq_file.num_row_groups)
         _logger.debug("Row groups count: %d", n)
         return n
@@ -401,6 +429,7 @@ def read_parquet(
     path: Union[str, List[str]],
     path_suffix: Union[str, List[str], None] = None,
     path_ignore_suffix: Union[str, List[str], None] = None,
+    ignore_empty: bool = True,
     partition_filter: Optional[Callable[[Dict[str, str]], bool]] = None,
     columns: Optional[List[str]] = None,
     validate_schema: bool = False,
@@ -453,9 +482,13 @@ def read_parquet(
         S3 prefix (accepts Unix shell-style wildcards)
         (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]).
     path_suffix: Union[str, List[str], None]
-        Suffix or List of suffixes for filtering S3 keys.
+        Suffix or List of suffixes to be read (e.g. [".gz.parquet", ".snappy.parquet"]).
+        If None, will try to read all files. (default)
     path_ignore_suffix: Union[str, List[str], None]
-        Suffix or List of suffixes for S3 keys to be ignored.
+        Suffix or List of suffixes for S3 keys to be ignored (e.g. [".csv", "_SUCCESS"]).
+        If None, will try to read all files. (default)
+    ignore_empty: bool
+        Ignore files with 0 bytes.
     partition_filter: Optional[Callable[[Dict[str, str]], bool]]
         Callback Function filters to apply on PARTITION columns (PUSH-DOWN filter).
         This function MUST receive a single argument (Dict[str, str]) where keys are partitions
@@ -543,6 +576,7 @@ def read_parquet(
         ignore_suffix=_get_path_ignore_suffix(path_ignore_suffix=path_ignore_suffix),
         last_modified_begin=last_modified_begin,
         last_modified_end=last_modified_end,
+        ignore_empty=ignore_empty,
     )
     path_root: Optional[str] = _get_path_root(path=path, dataset=dataset)
     if path_root is not None:
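A hedged usage sketch of the new flag on the reader side (bucket and prefix are hypothetical): with ignore_empty=True, 0-byte objects under the prefix are dropped during listing instead of aborting the read.

import awswrangler as wr

# Hypothetical bucket/prefix, for illustration only.
df = wr.s3.read_parquet(
    path="s3://my-bucket/my-prefix/",
    path_suffix=".snappy.parquet",  # only read keys with this suffix
    path_ignore_suffix="_SUCCESS",  # skip marker files
    ignore_empty=True,              # skip 0-byte objects (default)
)
print(df.shape)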
@@ -727,6 +761,7 @@ def read_parquet_metadata(
     path: Union[str, List[str]],
     path_suffix: Optional[str] = None,
     path_ignore_suffix: Optional[str] = None,
+    ignore_empty: bool = True,
     dtype: Optional[Dict[str, str]] = None,
     sampling: float = 1.0,
     dataset: bool = False,
@@ -754,9 +789,13 @@ def read_parquet_metadata(
         S3 prefix (accepts Unix shell-style wildcards)
         (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]).
     path_suffix: Union[str, List[str], None]
-        Suffix or List of suffixes for filtering S3 keys.
+        Suffix or List of suffixes to be read (e.g. [".gz.parquet", ".snappy.parquet"]).
+        If None, will try to read all files. (default)
     path_ignore_suffix: Union[str, List[str], None]
-        Suffix or List of suffixes for S3 keys to be ignored.
+        Suffix or List of suffixes for S3 keys to be ignored (e.g. [".csv", "_SUCCESS"]).
+        If None, will try to read all files. (default)
+    ignore_empty: bool
+        Ignore files with 0 bytes.
     dtype : Dict[str, str], optional
         Dictionary of column names and Athena/Glue types to be cast.
         Useful when you have columns with undetermined data types as partition columns.
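And the same flag on the metadata reader (again with a hypothetical path): empty objects are skipped while the Athena/Glue types are inferred.

import awswrangler as wr

# Hypothetical dataset path, for illustration only.
columns_types, partitions_types = wr.s3.read_parquet_metadata(
    path="s3://my-bucket/my-prefix/",
    dataset=True,
    ignore_empty=True,  # 0-byte files do not break type inference
)
print(columns_types)     # e.g. {"c0": "bigint", "c1": "string"}
print(partitions_types)  # e.g. {"year": "string"}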
@@ -804,6 +843,7 @@ def read_parquet_metadata(
         path=path,
         path_suffix=path_suffix,
         path_ignore_suffix=path_ignore_suffix,
+        ignore_empty=ignore_empty,
         dtype=dtype,
         sampling=sampling,
         dataset=dataset,