From e15aa08a6d5f3d932ea60e5715b8229ab07e7e04 Mon Sep 17 00:00:00 2001
From: Brian Tang
Date: Tue, 4 Feb 2025 19:12:51 +0800
Subject: [PATCH 1/2] Adds unit test for pyarrow large lists and strings

---
 tests/unit/test_s3_parquet.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/tests/unit/test_s3_parquet.py b/tests/unit/test_s3_parquet.py
index b650100bc..3ce1ef84c 100644
--- a/tests/unit/test_s3_parquet.py
+++ b/tests/unit/test_s3_parquet.py
@@ -62,6 +62,27 @@ def test_read_parquet_metadata_nonexistent_file(path):
         wr.s3.read_parquet_metadata(path + "non-existent-file.parquet")
 
 
+def read_parquet_metadata_large_dtype(path):
+    schema = pa.schema(
+        [
+            pa.field("c0", pa.large_list(pa.large_string())),
+            pa.field("c1", pa.large_string()),
+        ]
+    )
+    c0 = pa.array([["a", "b", "c"], ["d", "e", "f"], ["g", "h", "i"]])
+    c1 = pa.array(["a", "b", "c"])
+    df = pa.table([c0, c1], schema=schema)
+
+    # use pyarrow-backed dataframe to simulate the large_list and large_string dtypes
+    pandas_df = df.to_pandas(types_mapper=lambda pa_dtype: pd.ArrowDtype(pa_dtype))
+
+    wr.s3.to_parquet(pandas_df, path)
+    columns_types, _ = wr.s3.read_parquet_metadata(path)
+    assert len(columns_types) == len(df.columns)
+    assert columns_types.get("c0") == "array"
+    assert columns_types.get("c1") == "string"
+
+
 @pytest.mark.parametrize(
     "partition_cols",
     [

From 87097d508357b66e913d9d5e57da02af10665ebc Mon Sep 17 00:00:00 2001
From: Brian Tang
Date: Tue, 4 Feb 2025 19:15:43 +0800
Subject: [PATCH 2/2] Fixes typo in test func name

---
 tests/unit/test_s3_parquet.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/unit/test_s3_parquet.py b/tests/unit/test_s3_parquet.py
index 3ce1ef84c..eaf51ecb1 100644
--- a/tests/unit/test_s3_parquet.py
+++ b/tests/unit/test_s3_parquet.py
@@ -62,7 +62,7 @@ def test_read_parquet_metadata_nonexistent_file(path):
         wr.s3.read_parquet_metadata(path + "non-existent-file.parquet")
 
 
-def read_parquet_metadata_large_dtype(path):
+def test_read_parquet_metadata_large_dtype(path):
     schema = pa.schema(
         [
             pa.field("c0", pa.large_list(pa.large_string())),