
Commit 778b386

Author: David Cournapeau (committed)
BUG: fix read_parquet with dataset=True when the first partition is empty.
When reading a set of parquet files with dataset=True, if the first partition is empty, the current logic for dtype inference fails, raising an exception such as:

```
pyarrow.lib.ArrowTypeError: Unable to merge: Field col0 has incompatible types: dictionary<values=null, indices=int32, ordered=0> vs dictionary<values=string, indices=int32, ordered=0>
```

To fix this, we filter out empty tables before merging them into a single DataFrame.
Parent: 635f6d5 · Commit: 778b386
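As a minimal sketch of the idea (illustrative data and a plain `pa.concat_tables` call, not awswrangler's actual code path), dropping zero-row tables before merging avoids unifying a schema inferred from an empty partition with the schema of the real data:

```python
import pyarrow as pa

# Illustrative tables only: an empty first "partition" whose column types were
# inferred from zero rows, followed by two non-empty partitions.
t_empty = pa.table({"id": pa.array([], type=pa.string())})
t_part1 = pa.table({"id": ["1", "1"]})
t_part2 = pa.table({"id": ["1", "1", "1"]})

tables = [t_empty, t_part1, t_part2]
# Same filter as the one-line change below: skip empty tables before merging.
tables = [table for table in tables if len(table) > 0]
merged = pa.concat_tables(tables)
print(merged.num_rows)  # 5
```

The one-line change in awswrangler/s3/_read_parquet.py below applies the same filter to the tables before they are combined into the result DataFrame.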

File tree

2 files changed (+28, -0 lines)


awswrangler/s3/_read_parquet.py

Lines changed: 1 addition & 0 deletions
@@ -308,6 +308,7 @@ def _read_parquet(
         itertools.repeat(schema),
         itertools.repeat(decryption_properties),
     )
+    tables = [table for table in tables if len(table) > 0]
     return _utils.table_refs_to_df(tables, kwargs=arrow_kwargs)
tests/unit/test_moto.py

Lines changed: 27 additions & 0 deletions
@@ -485,6 +485,33 @@ def test_s3_delete_object_success(moto_s3_client: "S3Client") -> None:
     wr.s3.read_parquet(path=path, dataset=True)


+def test_s3_dataset_empty_table(moto_s3_client: "S3Client") -> None:
+    """ Test that a dataset split into multiple parquet files whose first
+    partition is an empty table still loads properly.
+    """
+    partition_col, partition_val = "col0", "1"
+    dataset = f"{partition_col}={partition_val}"
+    s3_key = f's3://bucket/{dataset}'
+
+    dtypes = {"id": "string[python]"}
+    df1 = pd.DataFrame({"id": []}).astype(dtypes)
+    df2 = pd.DataFrame({"id": ['1'] * 2}).astype(dtypes)
+    df3 = pd.DataFrame({"id": ['1'] * 3}).astype(dtypes)
+
+    dataframes = [df1, df2, df3]
+    r_df = pd.concat(dataframes, ignore_index=True)
+    r_df = r_df.assign(col0=pd.Categorical([partition_val] * len(r_df)))
+
+    for i, df in enumerate(dataframes):
+        wr.s3.to_parquet(
+            df=df,
+            path=f"{s3_key}/part{i}.parquet",
+        )
+
+    result_df = wr.s3.read_parquet(path=s3_key, dataset=True)
+    pd.testing.assert_frame_equal(result_df, r_df, check_dtype=True)
+
+
 def test_s3_raise_delete_object_exception_success(moto_s3_client: "S3Client") -> None:
     path = "s3://bucket/test.parquet"
     wr.s3.to_parquet(df=get_df_list(), path=path, index=False, dataset=True, partition_cols=["par0", "par1"])
