From e15aa08a6d5f3d932ea60e5715b8229ab07e7e04 Mon Sep 17 00:00:00 2001
From: Brian Tang
Date: Tue, 4 Feb 2025 19:12:51 +0800
Subject: [PATCH 1/2] Adds unit test for pyarrow large lists and strings

---
 tests/unit/test_s3_parquet.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/tests/unit/test_s3_parquet.py b/tests/unit/test_s3_parquet.py
index b650100bc..3ce1ef84c 100644
--- a/tests/unit/test_s3_parquet.py
+++ b/tests/unit/test_s3_parquet.py
@@ -62,6 +62,27 @@ def test_read_parquet_metadata_nonexistent_file(path):
         wr.s3.read_parquet_metadata(path + "non-existent-file.parquet")
 
 
+def read_parquet_metadata_large_dtype(path):
+    schema = pa.schema(
+        [
+            pa.field("c0", pa.large_list(pa.large_string())),
+            pa.field("c1", pa.large_string()),
+        ]
+    )
+    c0 = pa.array([["a", "b", "c"], ["d", "e", "f"], ["g", "h", "i"]])
+    c1 = pa.array(["a", "b", "c"])
+    df = pa.table([c0, c1], schema=schema)
+
+    # use pyarrow-backed dataframe to simulate the large_list and large_string dtypes
+    pandas_df = df.to_pandas(types_mapper=lambda pa_dtype: pd.ArrowDtype(pa_dtype))
+
+    wr.s3.to_parquet(pandas_df, path)
+    columns_types, _ = wr.s3.read_parquet_metadata(path)
+    assert len(columns_types) == len(df.columns)
+    assert columns_types.get("c0") == "array"
+    assert columns_types.get("c1") == "string"
+
+
 @pytest.mark.parametrize(
     "partition_cols",
     [

From 87097d508357b66e913d9d5e57da02af10665ebc Mon Sep 17 00:00:00 2001
From: Brian Tang
Date: Tue, 4 Feb 2025 19:15:43 +0800
Subject: [PATCH 2/2] Fixes typo in test func name

---
 tests/unit/test_s3_parquet.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/unit/test_s3_parquet.py b/tests/unit/test_s3_parquet.py
index 3ce1ef84c..eaf51ecb1 100644
--- a/tests/unit/test_s3_parquet.py
+++ b/tests/unit/test_s3_parquet.py
@@ -62,7 +62,7 @@ def test_read_parquet_metadata_nonexistent_file(path):
         wr.s3.read_parquet_metadata(path + "non-existent-file.parquet")
 
 
-def read_parquet_metadata_large_dtype(path):
+def test_read_parquet_metadata_large_dtype(path):
     schema = pa.schema(
         [
             pa.field("c0", pa.large_list(pa.large_string())),