 import boto3
 import pandas as pd
 import pyarrow as pa
-import pyarrow.lib
 import pyarrow.parquet

 from awswrangler import _data_types, _utils, exceptions
 _logger: logging.Logger = logging.getLogger(__name__)


+def _pyarrow_parquet_file_wrapper(
+    source: Any, read_dictionary: Optional[List[str]] = None
+) -> pyarrow.parquet.ParquetFile:
+    try:
+        return pyarrow.parquet.ParquetFile(source=source, read_dictionary=read_dictionary)
+    except pyarrow.ArrowInvalid as ex:
+        if str(ex) == "Parquet file size is 0 bytes":
+            _logger.warning("Ignoring empty file...")
+            return None
+        raise
+
+
 def _read_parquet_metadata_file(
     path: str, boto3_session: boto3.Session, s3_additional_kwargs: Optional[Dict[str, str]], use_threads: bool
-) -> Dict[str, str]:
+) -> Optional[Dict[str, str]]:
     with open_s3_object(
         path=path,
         mode="rb",
@@ -43,7 +54,9 @@ def _read_parquet_metadata_file(
         s3_additional_kwargs=s3_additional_kwargs,
         boto3_session=boto3_session,
     ) as f:
-        pq_file: pyarrow.parquet.ParquetFile = pyarrow.parquet.ParquetFile(source=f)
+        pq_file: Optional[pyarrow.parquet.ParquetFile] = _pyarrow_parquet_file_wrapper(source=f)
+        if pq_file is None:
+            return None
         return _data_types.athena_types_from_pyarrow_schema(schema=pq_file.schema.to_arrow_schema(), partitions=None)[0]

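For context, a minimal sketch (not part of the commit) of the pyarrow behaviour the new wrapper relies on: opening a 0-byte source raises pyarrow.ArrowInvalid with the message the wrapper string-matches, and the wrapper turns that into a None return. The exact message text is an assumption tied to the pyarrow version awswrangler pins.

import io

import pyarrow
import pyarrow.parquet

# A 0-byte object (e.g. an empty S3 key) cannot be parsed as Parquet;
# pyarrow rejects it with ArrowInvalid("Parquet file size is 0 bytes"),
# which _pyarrow_parquet_file_wrapper catches and converts into None.
try:
    pyarrow.parquet.ParquetFile(source=io.BytesIO(b""))
except pyarrow.ArrowInvalid as ex:
    print(ex)  # Parquet file size is 0 bytes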
@@ -55,7 +68,7 @@ def _read_schemas_from_files(
     s3_additional_kwargs: Optional[Dict[str, str]],
 ) -> Tuple[Dict[str, str], ...]:
     paths = _utils.list_sampling(lst=paths, sampling=sampling)
-    schemas: Tuple[Dict[str, str], ...] = tuple()
+    schemas: Tuple[Optional[Dict[str, str]], ...] = tuple()
     n_paths: int = len(paths)
     if use_threads is False or n_paths == 1:
         schemas = tuple(
@@ -76,6 +89,7 @@ def _read_schemas_from_files(
                     itertools.repeat(use_threads),
                 )
             )
+    schemas = cast(Tuple[Dict[str, str], ...], tuple(x for x in schemas if x is not None))
     _logger.debug("schemas: %s", schemas)
     return schemas

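A small illustration (variable names assumed, not from the commit) of what the added cast/filter line does: schemas read from empty files come back as None and are dropped before the remaining schemas are merged or validated.

from typing import Dict, Optional, Tuple, cast

# Schemas as returned per file; the None entry stands for an empty
# (0-byte) file that the wrapper skipped.
raw: Tuple[Optional[Dict[str, str]], ...] = ({"c0": "bigint"}, None, {"c0": "bigint"})
schemas = cast(Tuple[Dict[str, str], ...], tuple(x for x in raw if x is not None))
print(schemas)  # ({'c0': 'bigint'}, {'c0': 'bigint'})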
@@ -125,6 +139,7 @@ def _read_parquet_metadata(
     path: Union[str, List[str]],
     path_suffix: Optional[str],
     path_ignore_suffix: Optional[str],
+    ignore_empty: bool,
     dtype: Optional[Dict[str, str]],
     sampling: float,
     dataset: bool,
@@ -139,6 +154,7 @@ def _read_parquet_metadata(
         boto3_session=boto3_session,
         suffix=path_suffix,
         ignore_suffix=_get_path_ignore_suffix(path_ignore_suffix=path_ignore_suffix),
+        ignore_empty=ignore_empty,
     )

     # Files
@@ -279,7 +295,11 @@ def _read_parquet_chunked(
             s3_additional_kwargs=s3_additional_kwargs,
             boto3_session=boto3_session,
         ) as f:
-            pq_file: pyarrow.parquet.ParquetFile = pyarrow.parquet.ParquetFile(source=f, read_dictionary=categories)
+            pq_file: Optional[pyarrow.parquet.ParquetFile] = _pyarrow_parquet_file_wrapper(
+                source=f, read_dictionary=categories
+            )
+            if pq_file is None:
+                continue
             schema: Dict[str, str] = _data_types.athena_types_from_pyarrow_schema(
                 schema=pq_file.schema.to_arrow_schema(), partitions=None
             )[0]
@@ -342,7 +362,11 @@ def _read_parquet_file(
         s3_additional_kwargs=s3_additional_kwargs,
         boto3_session=boto3_session,
     ) as f:
-        pq_file: pyarrow.parquet.ParquetFile = pyarrow.parquet.ParquetFile(source=f, read_dictionary=categories)
+        pq_file: Optional[pyarrow.parquet.ParquetFile] = _pyarrow_parquet_file_wrapper(
+            source=f, read_dictionary=categories
+        )
+        if pq_file is None:
+            raise exceptions.InvalidFile(f"Invalid Parquet file: {path}")
         return pq_file.read(columns=columns, use_threads=False, use_pandas_metadata=False)

@@ -362,7 +386,11 @@ def _count_row_groups(
         s3_additional_kwargs=s3_additional_kwargs,
         boto3_session=boto3_session,
     ) as f:
-        pq_file: pyarrow.parquet.ParquetFile = pyarrow.parquet.ParquetFile(source=f, read_dictionary=categories)
+        pq_file: Optional[pyarrow.parquet.ParquetFile] = _pyarrow_parquet_file_wrapper(
+            source=f, read_dictionary=categories
+        )
+        if pq_file is None:
+            return 0
         n: int = cast(int, pq_file.num_row_groups)
         _logger.debug("Row groups count: %d", n)
         return n
@@ -401,6 +429,7 @@ def read_parquet(
     path: Union[str, List[str]],
     path_suffix: Union[str, List[str], None] = None,
     path_ignore_suffix: Union[str, List[str], None] = None,
+    ignore_empty: bool = True,
     partition_filter: Optional[Callable[[Dict[str, str]], bool]] = None,
     columns: Optional[List[str]] = None,
     validate_schema: bool = False,
@@ -453,9 +482,13 @@ def read_parquet(
         S3 prefix (accepts Unix shell-style wildcards)
         (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]).
     path_suffix: Union[str, List[str], None]
-        Suffix or List of suffixes for filtering S3 keys.
+        Suffix or List of suffixes to be read (e.g. [".gz.parquet", ".snappy.parquet"]).
+        If None, will try to read all files. (default)
     path_ignore_suffix: Union[str, List[str], None]
-        Suffix or List of suffixes for S3 keys to be ignored.
+        Suffix or List of suffixes for S3 keys to be ignored (e.g. [".csv", "_SUCCESS"]).
+        If None, will try to read all files. (default)
+    ignore_empty: bool
+        Ignore files with 0 bytes.
     partition_filter: Optional[Callable[[Dict[str, str]], bool]]
         Callback Function filters to apply on PARTITION columns (PUSH-DOWN filter).
         This function MUST receive a single argument (Dict[str, str]) where keys are partitions
@@ -543,6 +576,7 @@ def read_parquet(
         ignore_suffix=_get_path_ignore_suffix(path_ignore_suffix=path_ignore_suffix),
         last_modified_begin=last_modified_begin,
         last_modified_end=last_modified_end,
+        ignore_empty=ignore_empty,
     )
     path_root: Optional[str] = _get_path_root(path=path, dataset=dataset)
     if path_root is not None:
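A hedged usage sketch of the new flag on the reader side (bucket and prefix are hypothetical): with ignore_empty=True, 0-byte objects under the prefix are dropped during listing instead of aborting the read.

import awswrangler as wr

# Hypothetical bucket/prefix, for illustration only.
df = wr.s3.read_parquet(
    path="s3://my-bucket/my-prefix/",
    path_suffix=".snappy.parquet",  # only read keys with this suffix
    path_ignore_suffix="_SUCCESS",  # skip marker files
    ignore_empty=True,              # skip 0-byte objects (default)
)
print(df.shape)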
@@ -727,6 +761,7 @@ def read_parquet_metadata(
     path: Union[str, List[str]],
     path_suffix: Optional[str] = None,
     path_ignore_suffix: Optional[str] = None,
+    ignore_empty: bool = True,
     dtype: Optional[Dict[str, str]] = None,
     sampling: float = 1.0,
     dataset: bool = False,
@@ -754,9 +789,13 @@ def read_parquet_metadata(
         S3 prefix (accepts Unix shell-style wildcards)
         (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]).
     path_suffix: Union[str, List[str], None]
-        Suffix or List of suffixes for filtering S3 keys.
+        Suffix or List of suffixes to be read (e.g. [".gz.parquet", ".snappy.parquet"]).
+        If None, will try to read all files. (default)
     path_ignore_suffix: Union[str, List[str], None]
-        Suffix or List of suffixes for S3 keys to be ignored.
+        Suffix or List of suffixes for S3 keys to be ignored (e.g. [".csv", "_SUCCESS"]).
+        If None, will try to read all files. (default)
+    ignore_empty: bool
+        Ignore files with 0 bytes.
     dtype : Dict[str, str], optional
         Dictionary of column names and Athena/Glue types to be cast.
         Useful when you have columns with undetermined data types as partition columns.
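And the same flag on the metadata reader (again with a hypothetical path): empty objects are skipped while the Athena/Glue types are inferred.

import awswrangler as wr

# Hypothetical dataset path, for illustration only.
columns_types, partitions_types = wr.s3.read_parquet_metadata(
    path="s3://my-bucket/my-prefix/",
    dataset=True,
    ignore_empty=True,  # 0-byte files do not break type inference
)
print(columns_types)     # e.g. {"c0": "bigint", "c1": "string"}
print(partitions_types)  # e.g. {"year": "string"}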
@@ -804,6 +843,7 @@ def read_parquet_metadata(
         path=path,
         path_suffix=path_suffix,
         path_ignore_suffix=path_ignore_suffix,
+        ignore_empty=ignore_empty,
         dtype=dtype,
         sampling=sampling,
         dataset=dataset,