
Commit 95e37bf

Robert Schmidtke, kukushking, and Leon Luttenberger authored
fix: Index columns removed on s3.to_parquet (#2655)
fix: Index columns removed on s3.to_parquet (#2655)

* first go at a failing test
* pass missing dataset flag in test
* because we partition, do not specify full parquet paths during write
* use proper path in tests
* use reset_index to allow dropping the entire index
* test partitioning on full and partial index
* need to validate schema on read for issue to surface
* need to sort on index
* cross-test without partitioning
* print assertion error for remote debugging
* simplify test to just assert schema validation
* consistently handle regular and index columns casts
* use equality assertion utility, drop unnecessary sort
* add index partition test
* reformat
* undo categorical-specific dataframe creation in test
* try again to expect the right dtypes
* pull out to_parquet kwargs
* expect test to fail when using modin and partitioning on full index
* manually assert unpartitioned index is still present, then reset full index
* handle change in promotion kwargs for pyarrow 14+
* move packaging import to correct location
* fix types for promotion kwargs
* test and handle unnamed index levels as well

---------

Co-authored-by: Robert Schmidtke <[email protected]>
Co-authored-by: kukushking <[email protected]>
Co-authored-by: Leon Luttenberger <[email protected]>
1 parent: d6caa93 · commit: 95e37bf
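To make the fix concrete, here is a repro sketch of the reported behavior, inferred from the tests added in this commit (the bucket, database, and table names are placeholders): partitioning a dataset on its index levels dropped those columns from the written schema, so a schema-validating read could fail.

```python
# Hypothetical repro sketch (names are placeholders; failure mode inferred
# from the tests added in this commit).
import awswrangler as wr
import pandas as pd

df = pd.DataFrame({"c0": [0, 1], "c1": [2, 3], "c2": [4, 5]}, dtype="Int64")
df = df.set_index(["c0", "c1"])

wr.s3.to_parquet(
    df,
    "s3://my-bucket/my-dataset/",
    index=True,
    dataset=True,
    partition_cols=["c0", "c1"],
    database="my_database",
    table="my_table",
)

# Before this fix, the index levels were dropped on write, so a validated
# read no longer lined up with the expected schema.
df2 = wr.s3.read_parquet("s3://my-bucket/my-dataset/", validate_schema=True)
```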

4 files changed: +83 −6 lines

awswrangler/_data_types.py — 18 additions & 3 deletions

```diff
@@ -563,10 +563,12 @@ def pyarrow_types_from_pandas(  # noqa: PLR0912,PLR0915
         for field in fields:
             name = str(field.name)
             # Check if any of the index columns must be ignored
-            if name not in ignore_cols:
+            if name in ignore_cols:
+                cols_dtypes[name] = None
+            else:
                 _logger.debug("Inferring PyArrow type from index: %s", name)
                 cols_dtypes[name] = field.type
-                indexes.append(name)
+            indexes.append(name)
 
     # Merging Index
     sorted_cols: list[str] = indexes + list(df.columns) if index_left is True else list(df.columns) + indexes
@@ -693,13 +695,26 @@ def pyarrow_schema_from_pandas(
         df=df, index=index, ignore_cols=ignore_plus
     )
     for k, v in casts.items():
-        if (k in df.columns) and (k not in ignore):
+        if (k not in ignore) and (k in df.columns or _is_index_name(k, df.index)):
             columns_types[k] = athena2pyarrow(dtype=v)
     columns_types = {k: v for k, v in columns_types.items() if v is not None}
     _logger.debug("columns_types: %s", columns_types)
     return pa.schema(fields=columns_types)
 
 
+def _is_index_name(name: str, index: pd.Index) -> bool:
+    if name in index.names:
+        # named index level
+        return True
+
+    if (match := re.match(r"__index_level_(?P<level>\d+)__", name)) is not None:
+        # unnamed index level
+        if len(index.names) > (level := int(match.group("level"))):
+            return index.names[level] is None
+
+    return False
+
+
 def athena_types_from_pyarrow_schema(
     schema: pa.Schema,
     ignore_null: bool = False,
```
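A note on the `__index_level_(\d+)__` pattern matched by `_is_index_name`: when PyArrow preserves a pandas index, any unnamed level is stored under a synthetic `__index_level_N__` field name. A minimal sketch, not part of this commit:

```python
# Minimal sketch (not part of this commit): PyArrow stores unnamed pandas
# index levels under synthetic "__index_level_N__" field names.
import pandas as pd
import pyarrow as pa

idx = pd.MultiIndex.from_arrays([[0, 1], [2, 3]], names=["c0", None])
df = pd.DataFrame({"c2": [4, 5]}, index=idx)

schema = pa.Schema.from_pandas(df, preserve_index=True)
# The named level keeps its name; the unnamed one gets a synthetic name.
print(schema.names)  # ['c2', 'c0', '__index_level_1__']
```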

awswrangler/_utils.py — 6 additions & 1 deletion

```diff
@@ -31,6 +31,7 @@
 import numpy as np
 import pyarrow as pa
 from botocore.config import Config
+from packaging import version
 
 import awswrangler.pandas as pd
 from awswrangler import _config, exceptions
@@ -893,7 +894,11 @@ def split_pandas_frame(df: pd.DataFrame, splits: int) -> list[pd.DataFrame]:
 @engine.dispatch_on_engine
 def table_refs_to_df(tables: list[pa.Table], kwargs: dict[str, Any]) -> pd.DataFrame:
     """Build Pandas DataFrame from list of PyArrow tables."""
-    return _table_to_df(pa.concat_tables(tables, promote=True), kwargs=kwargs)
+    promote_kwargs: dict[str, bool | str] = {"promote": True}
+    if version.parse(pa.__version__) >= version.parse("14.0.0"):
+        promote_kwargs = {"promote_options": "default"}
+
+    return _table_to_df(pa.concat_tables(tables, **promote_kwargs), kwargs=kwargs)
 
 
 @engine.dispatch_on_engine
```
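The version gate exists because PyArrow 14 deprecated the boolean `promote` flag on `concat_tables` in favor of `promote_options`. A standalone sketch of the same pattern, not awswrangler code:

```python
# Standalone sketch (not awswrangler code): schema promotion when
# concatenating tables, across the PyArrow 14 API change.
import pyarrow as pa
from packaging import version

t1 = pa.table({"a": [1, 2]})
t2 = pa.table({"a": [3], "b": ["x"]})  # extra column

if version.parse(pa.__version__) >= version.parse("14.0.0"):
    combined = pa.concat_tables([t1, t2], promote_options="default")
else:
    combined = pa.concat_tables([t1, t2], promote=True)

# Missing fields are filled with nulls after promotion.
print(combined.schema)  # a: int64, b: string
```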

awswrangler/s3/_write_dataset.py — 4 additions & 2 deletions

```diff
@@ -159,8 +159,10 @@ def _to_partitions(
             inplace=True,
         )
         # Drop index levels if partitioning by index columns
-        subgroup = subgroup.droplevel(  # noqa: PLW2901
-            level=[col for col in partition_cols if col in subgroup.index.names]
+        subgroup.reset_index(
+            level=[col for col in partition_cols if col in subgroup.index.names],
+            drop=True,
+            inplace=True,
         )
         prefix = _delete_objects(
             keys=keys,
```
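The switch from `droplevel` to `reset_index(..., drop=True, inplace=True)` matters when the partition columns cover every index level: pandas refuses to remove all levels via `droplevel`, whereas `reset_index` replaces an emptied index with a fresh `RangeIndex`. A minimal illustration, not awswrangler code:

```python
# Minimal illustration (not awswrangler code) of why reset_index is used.
import pandas as pd

idx = pd.MultiIndex.from_tuples([(0, 2), (1, 3)], names=["c0", "c1"])
df = pd.DataFrame({"c2": [4, 5]}, index=idx)

try:
    df.droplevel(["c0", "c1"])  # cannot drop every level
except ValueError as err:
    print(err)  # "Cannot remove 2 levels from an index with 2 levels: ..."

# reset_index with drop=True discards the levels and leaves a RangeIndex.
df.reset_index(level=["c0", "c1"], drop=True, inplace=True)
print(df.index)  # RangeIndex(start=0, stop=2, step=1)
```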

tests/unit/test_s3_parquet.py — 55 additions & 0 deletions

```diff
@@ -506,6 +506,61 @@ def test_index_columns(path, use_threads, name, pandas):
     assert df[["c0"]].equals(df2)
 
 
+@pytest.mark.parametrize("index", [None, ["c0"], ["c0", "c1"]])
+def test_index_schema_validation(path, glue_database, glue_table, index):
+    df = pd.DataFrame({"c0": [0, 1], "c1": [2, 3], "c2": [4, 5]}, dtype="Int64")
+
+    if index is not None:
+        df = df.set_index(index)
+    else:
+        df.index = df.index.astype("Int64")
+
+    for _ in range(2):
+        wr.s3.to_parquet(df, path, index=True, dataset=True, database=glue_database, table=glue_table)
+
+    df2 = wr.s3.read_parquet(path, validate_schema=True)
+    assert_pandas_equals(pd.concat([df, df]), df2)
+
+
+@pytest.mark.modin_index
+@pytest.mark.parametrize("index", [["c0"], ["c0", "c1"]])
+@pytest.mark.parametrize("partition_cols", [["c0"], ["c0", "c1"]])
+def test_index_partition(path, glue_database, glue_table, index, partition_cols):
+    df = pd.DataFrame({"c0": [0, 1], "c1": [2, 3], "c2": [4, 5]}, dtype="Int64")
+    df = df.set_index(index)
+
+    for _ in range(2):
+        wr.s3.to_parquet(
+            df,
+            path,
+            index=True,
+            dataset=True,
+            partition_cols=partition_cols,
+            database=glue_database,
+            table=glue_table,
+        )
+
+    df2 = wr.s3.read_parquet(path, dataset=True)
+
+    # partitioned index is not preserved, so reset unpartitioned index for recreation
+    assert all(idx in df2.index.names for idx in [idx for idx in index if idx not in partition_cols])
+    df2 = df2.reset_index()
+
+    # partition columns come back as categorical, so convert back
+    for col in partition_cols:
+        df2[col] = df2[col].astype("Int64")
+
+    # apply full index again
+    df2 = df2.set_index(index)
+
+    assert_pandas_equals(
+        # partitioned on index, so the data comes back sorted on the index
+        pd.concat([df, df]).sort_index(),
+        # need to reorder columns, because partition columns are appended
+        df2[df.columns],
+    )
+
+
 @pytest.mark.parametrize("use_threads", [True, False, 2])
 @pytest.mark.parametrize("name", [None, "foo"])
 @pytest.mark.parametrize("pandas", [True, False])
```