
Commit 8c9eb63

feat: Add ability to pass schema to s3.read_parquet (#2328)
* feat: Allow passing a pyarrow.Schema to wr.s3.read_parquet()
* Fix file handle for read_table
* Add packaging dependency
* Raise an error if reading an empty file
* Throw an exception when file size is 0
* [skip ci] Add warning
1 parent 6aa22bc commit 8c9eb63
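
For reference, a minimal usage sketch of the new parameter (the bucket path is a placeholder; the column names mirror the test added below). On pyarrow >= 8.0.0 the schema is forwarded to pyarrow.parquet.read_table; on older versions a UserWarning is emitted and the previous code path is used:

import awswrangler as wr
import pyarrow as pa

# Explicit schema for the dataset columns (placeholder names, matching the new test).
schema = pa.schema(
    [
        pa.field("c0", pa.int64()),
        pa.field("c1", pa.int64()),
        pa.field("par", pa.string()),
    ]
)

# "s3://my-bucket/my-dataset/" is a placeholder path.
df = wr.s3.read_parquet("s3://my-bucket/my-dataset/", schema=schema)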

File tree

5 files changed: +52 -14 lines changed

  awswrangler/distributed/ray/datasources/arrow_parquet_datasource.py
  awswrangler/s3/_read_parquet.py
  poetry.lock
  pyproject.toml
  tests/unit/test_s3_parquet.py

awswrangler/distributed/ray/datasources/arrow_parquet_datasource.py

Lines changed: 5 additions & 0 deletions
@@ -29,6 +29,7 @@
     _handle_read_os_error,
 )
 
+from awswrangler import exceptions
 from awswrangler._arrow import _add_table_partitions, _df_to_table
 from awswrangler.distributed.ray import ray_remote
 from awswrangler.distributed.ray.datasources.arrow_parquet_base_datasource import ArrowParquetBaseDatasource
@@ -243,6 +244,10 @@ def __init__(
             self._metadata = meta_provider.prefetch_file_metadata(pq_ds.pieces, **prefetch_remote_args) or []
         except OSError as e:
             _handle_read_os_error(e, paths)
+        except pyarrow.ArrowInvalid as ex:
+            if "Parquet file size is 0 bytes" in str(ex):
+                raise exceptions.InvalidFile(f"Invalid Parquet file. {str(ex)}")
+            raise
         self._pq_ds = pq_ds
         self._meta_provider = meta_provider
         self._inferred_schema = inferred_schema
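
The new except-block above targets the message pyarrow emits for zero-byte objects. A quick local reproduction of that failure mode, independent of S3 and Ray (the file name is a placeholder):

import pyarrow as pa
import pyarrow.parquet

# Create a 0-byte placeholder file.
open("empty.parquet", "wb").close()

try:
    pyarrow.parquet.ParquetFile("empty.parquet")
except pa.ArrowInvalid as ex:
    # pyarrow reports the zero-byte condition in the error message,
    # which is the substring the except-block above matches on.
    assert "Parquet file size is 0 bytes" in str(ex)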

awswrangler/s3/_read_parquet.py

Lines changed: 37 additions & 9 deletions
@@ -4,6 +4,7 @@
 import functools
 import itertools
 import logging
+import warnings
 from typing import (
     TYPE_CHECKING,
     Any,
@@ -20,6 +21,7 @@
 import pyarrow as pa
 import pyarrow.dataset
 import pyarrow.parquet
+from packaging import version
 from typing_extensions import Literal
 
 from awswrangler import _data_types, _utils, exceptions
@@ -54,7 +56,8 @@
 
 
 def _pyarrow_parquet_file_wrapper(
-    source: Any, coerce_int96_timestamp_unit: Optional[str] = None
+    source: Any,
+    coerce_int96_timestamp_unit: Optional[str] = None,
 ) -> pyarrow.parquet.ParquetFile:
     try:
         return pyarrow.parquet.ParquetFile(source=source, coerce_int96_timestamp_unit=coerce_int96_timestamp_unit)
@@ -154,6 +157,7 @@ def _read_parquet_file(
     s3_additional_kwargs: Optional[Dict[str, str]],
     use_threads: Union[bool, int],
     version_id: Optional[str] = None,
+    schema: Optional[pa.schema] = None,
 ) -> pa.Table:
     s3_block_size: int = FULL_READ_S3_BLOCK_SIZE if columns else -1  # One shot for a full read or see constant
     with open_s3_object(
@@ -165,14 +169,35 @@
         s3_additional_kwargs=s3_additional_kwargs,
         s3_client=s3_client,
     ) as f:
-        pq_file: Optional[pyarrow.parquet.ParquetFile] = _pyarrow_parquet_file_wrapper(
-            source=f,
-            coerce_int96_timestamp_unit=coerce_int96_timestamp_unit,
-        )
-        if pq_file is None:
-            raise exceptions.InvalidFile(f"Invalid Parquet file: {path}")
+        if schema and version.parse(pa.__version__) >= version.parse("8.0.0"):
+            try:
+                table = pyarrow.parquet.read_table(
+                    f,
+                    columns=columns,
+                    schema=schema,
+                    use_threads=False,
+                    use_pandas_metadata=False,
+                    coerce_int96_timestamp_unit=coerce_int96_timestamp_unit,
+                )
+            except pyarrow.ArrowInvalid as ex:
+                if "Parquet file size is 0 bytes" in str(ex):
+                    raise exceptions.InvalidFile(f"Invalid Parquet file: {path}")
+                raise
+        else:
+            if schema:
+                warnings.warn(
+                    "Your version of pyarrow does not support reading with schema. Consider an upgrade to pyarrow 8+.",
+                    UserWarning,
+                )
+            pq_file: Optional[pyarrow.parquet.ParquetFile] = _pyarrow_parquet_file_wrapper(
+                source=f,
+                coerce_int96_timestamp_unit=coerce_int96_timestamp_unit,
+            )
+            if pq_file is None:
+                raise exceptions.InvalidFile(f"Invalid Parquet file: {path}")
+            table = pq_file.read(columns=columns, use_threads=False, use_pandas_metadata=False)
         return _add_table_partitions(
-            table=pq_file.read(columns=columns, use_threads=False, use_pandas_metadata=False),
+            table=table,
             path=path,
             path_root=path_root,
         )
@@ -262,6 +287,7 @@ def _read_parquet(  # pylint: disable=W0613
         itertools.repeat(s3_additional_kwargs),
         itertools.repeat(use_threads),
         [version_ids.get(p) if isinstance(version_ids, dict) else None for p in paths],
+        itertools.repeat(schema),
     )
     return _utils.table_refs_to_df(tables, kwargs=arrow_kwargs)
 
@@ -281,6 +307,7 @@ def read_parquet(
     columns: Optional[List[str]] = None,
     validate_schema: bool = False,
     coerce_int96_timestamp_unit: Optional[str] = None,
+    schema: Optional[pa.Schema] = None,
     last_modified_begin: Optional[datetime.datetime] = None,
     last_modified_end: Optional[datetime.datetime] = None,
     version_id: Optional[Union[str, Dict[str, str]]] = None,
@@ -359,6 +386,8 @@ def read_parquet(
     coerce_int96_timestamp_unit : str, optional
         Cast timestamps that are stored in INT96 format to a particular resolution (e.g. "ms").
         Setting to None is equivalent to "ns" and therefore INT96 timestamps are inferred as in nanoseconds.
+    schema : pyarrow.Schema, optional
+        Schema to use when reading the file.
     last_modified_begin : datetime, optional
         Filter S3 objects by Last modified date.
         Filter is only applied after listing all objects.
@@ -462,7 +491,6 @@ def read_parquet(
     version_ids = _check_version_id(paths=paths, version_id=version_id)
 
     # Create PyArrow schema based on file metadata, columns filter, and partitions
-    schema: Optional[pa.schema] = None
     if validate_schema and not bulk_read:
         metadata_reader = _ParquetTableMetadataReader()
         schema = metadata_reader.validate_schemas(
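
Stripped of the S3 plumbing, the new code path above reduces to the following standalone pyarrow call; a minimal sketch assuming pyarrow >= 8.0.0 and a local placeholder file (awswrangler passes an open S3 object instead):

import pyarrow as pa
import pyarrow.parquet
from packaging import version

# The schema argument of read_table requires a recent pyarrow,
# hence the version gate in the diff above.
assert version.parse(pa.__version__) >= version.parse("8.0.0")

schema = pa.schema([pa.field("c0", pa.int64()), pa.field("par", pa.string())])
table = pyarrow.parquet.read_table(
    "part-0.parquet",        # placeholder local file
    columns=["c0"],          # optional column projection
    schema=schema,           # read against this explicit schema
    use_threads=False,
    use_pandas_metadata=False,
)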

poetry.lock

Lines changed: 3 additions & 4 deletions
Some generated files are not rendered by default.

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -34,6 +34,7 @@ pandas = ">=1.2.0,!=1.5.0,<3.0.0" # Exclusion per: https://github.com/aws/aws-sd
 numpy = "^1.18"
 pyarrow = ">=7.0.0"
 typing-extensions = "^4.4.0"
+packaging = "^23.1"
 
 # Databases
 redshift-connector = { version = "^2.0.0", optional = true }

tests/unit/test_s3_parquet.py

Lines changed: 6 additions & 1 deletion
@@ -630,14 +630,19 @@ def test_parquet_compression(path, compression) -> None:
 
 
 @pytest.mark.parametrize("use_threads", [True, False, 2])
-def test_empty_file(path, use_threads):
+@pytest.mark.parametrize(
+    "schema", [None, pa.schema([pa.field("c0", pa.int64()), pa.field("c1", pa.int64()), pa.field("par", pa.string())])]
+)
+def test_empty_file(path, use_threads, schema):
     df = pd.DataFrame({"c0": [1, 2, 3], "c1": [None, None, None], "par": ["a", "b", "c"]})
     df.index = df.index.astype("Int64")
     df["c0"] = df["c0"].astype("Int64")
     df["par"] = df["par"].astype("string")
     wr.s3.to_parquet(df, path, index=True, dataset=True, partition_cols=["par"])
     bucket, key = wr._utils.parse_path(f"{path}test.csv")
     boto3.client("s3").put_object(Body=b"", Bucket=bucket, Key=key)
+    with pytest.raises(wr.exceptions.InvalidFile):
+        wr.s3.read_parquet(path, use_threads=use_threads, ignore_empty=False, schema=schema)
     df2 = wr.s3.read_parquet(path, dataset=True, use_threads=use_threads)
     df2["par"] = df2["par"].astype("string")
     assert_pandas_equals(df, df2)
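
The parametrized test exercises the InvalidFile error path both with schema=None and with an explicit schema. The fallback warning for older pyarrow is not covered here; if needed, it could be asserted along these lines (a hedged sketch, meaningful only in an environment with pyarrow < 8.0.0, reusing the path and schema values from the test above):

import pytest

# Only fires when the installed pyarrow predates read_table's schema support.
with pytest.warns(UserWarning, match="pyarrow 8"):
    wr.s3.read_parquet(path, schema=schema)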

0 commit comments