fix: respect order of columns in to_iceberg (#2768)

jaidisido · web-flow · commit c95a5d0c5a18 · 2024-04-09T16:15:18.000+01:00
diff --git a/awswrangler/athena/_write_iceberg.py b/awswrangler/athena/_write_iceberg.py
@@ -499,7 +499,7 @@ def to_iceberg(
             """
         else:
             sql_statement = f"""
-            INSERT INTO "{database}"."{table}"
+            INSERT INTO "{database}"."{table}" ({', '.join([f'"{x}"' for x in df.columns])})
             SELECT {', '.join([f'"{x}"' for x in df.columns])}
               FROM "{database}"."{temp_table}"
             """
diff --git a/tests/unit/test_athena_iceberg.py b/tests/unit/test_athena_iceberg.py
@@ -650,6 +650,58 @@ def test_athena_to_iceberg_merge_into(path: str, path2: str, glue_database: str,
     assert_pandas_equals(df_expected, df_out)
 
 
+def test_athena_to_iceberg_cols_order(path: str, path2: str, glue_database: str, glue_table: str) -> None:
+    kwargs = {
+        "database": glue_database,
+        "table": glue_table,
+        "table_location": path,
+        "temp_path": path2,
+        "partition_cols": ["partition"],
+        "schema_evolution": True,
+        "keep_files": False,
+    }
+
+    df = pd.DataFrame(
+        {
+            "partition": [1, 1, 2, 2],
+            "column1": ["X", "Y", "Z", "Z"],
+            "column2": ["A", "B", "C", "D"],
+        }
+    )
+    wr.athena.to_iceberg(df=df, mode="overwrite_partitions", **kwargs)
+
+    # Adding a column: new_column comes after the columns already written above
+    df_new_col_last = pd.DataFrame(
+        {
+            "partition": [2, 2],
+            "column1": ["Z", "Z"],
+            "column2": ["C", "D"],
+            "new_column": [True, False],
+        }
+    )
+    wr.athena.to_iceberg(df=df_new_col_last, mode="overwrite_partitions", **kwargs)
+
+    # Switching the order of columns: same columns as before, but new_column now precedes column2
+    df_new_col_not_last = pd.DataFrame(
+        {
+            "partition": [2, 2],
+            "column1": ["Z", "Z"],
+            "new_column": [True, False],
+            "column2": ["C", "D"],
+        }
+    )
+    wr.athena.to_iceberg(df=df_new_col_not_last, mode="overwrite_partitions", **kwargs)
+
+    df_out = wr.athena.read_sql_query(
+        sql=f'SELECT * FROM "{glue_table}"',
+        database=glue_database,
+        ctas_approach=False,
+        unload_approach=False,
+    )
+    assert len(df) == len(df_out)
+    assert len(df.columns) + 1 == len(df_out.columns)
+
+
 def test_athena_to_iceberg_empty_df_error(
     path: str,
     path2: str,