SNOW-2126985: Fix to_pandas dropping column names for non-select statements (#3408)

sfc-gh-jrose · web-flow · commit f94595baf1f8 · 2025-05-30T18:04:07.000-07:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -14,6 +14,7 @@
 
 - Fixed a bug in `DataFrameReader.dbapi` (PrPr) where the `create_connection` defined as local function was incompatible with multiprocessing.
 - Fixed a bug in `DataFrameReader.dbapi` (PrPr) where databricks `TIMESTAMP` type was converted to Snowflake `TIMESTAMP_NTZ` type which should be `TIMESTAMP_LTZ` type.
+- Fixed a bug in `DataFrame.to_pandas()` that would drop column names when converting a dataframe that did not originate from a select statement.
 - Fixed a bug that `DataFrame.create_or_replace_dynamic_table` raises error when the dataframe contains a UDTF and `SELECT *` in UDTF not being parsed correctly.
 
 #### Improvements
diff --git a/src/snowflake/snowpark/async_job.py b/src/snowflake/snowpark/async_job.py
@@ -373,8 +373,6 @@ def result(
             result = self._session._conn._to_data_or_iter(
                 self._cursor, to_pandas=True, to_iter=False
             )["data"]
-            if not isinstance(result, pandas.DataFrame):
-                result = pandas.DataFrame(result)
         elif async_result_type == _AsyncResultType.PANDAS_BATCH:
             result = self._session._conn._to_data_or_iter(
                 self._cursor, to_pandas=True, to_iter=True
diff --git a/src/snowflake/snowpark/dataframe.py b/src/snowflake/snowpark/dataframe.py
@@ -1069,7 +1069,9 @@ def to_pandas(
         # e.g., session.sql("create ...").to_pandas()
         if block:
             if not isinstance(result, pandas.DataFrame):
-                return pandas.DataFrame(result)
+                return pandas.DataFrame(
+                    result, columns=[attr.name for attr in self._plan.attributes]
+                )
 
         return result
 
diff --git a/tests/integ/scala/test_async_job_suite.py b/tests/integ/scala/test_async_job_suite.py
@@ -114,6 +114,23 @@ def test_async_to_pandas_common(session):
         session.create_dataframe(res), session.create_dataframe(expected_res)
     )
 
+    # Non-select statements must keep their column names in the async path too
+    non_select = session.sql("show regions")
+    expected = session.create_dataframe(non_select.to_pandas())
+    actual = session.create_dataframe(non_select.to_pandas(block=False).result())
+    assert non_select.columns == actual.columns
+    Utils.check_answer(expected, actual)
+
+    try:
+        table_name = Utils.random_table_name()
+        df = session.sql(f"create temporary table {table_name} (A int)")
+        Utils.check_answer(
+            session.create_dataframe(df.to_pandas(block=False).result()),
+            [Row(f"Table {table_name} successfully created.")],
+        )
+    finally:
+        Utils.drop_table(session, table_name)
+
 
 @pytest.mark.skipif(IS_IN_STORED_PROC_LOCALFS, reason="Requires large result")
 @pytest.mark.skipif(not is_pandas_available, reason="pandas is not available")
diff --git a/tests/integ/test_df_to_pandas.py b/tests/integ/test_df_to_pandas.py
@@ -201,19 +201,24 @@ def test_to_pandas_non_select(session):
         PandasDF,
     )
 
-    # non SELECT statements will fail
     def check_fetch_data_exception(query: str):
-        result = session.sql(query).to_pandas()
-        isinstance(result, PandasDF)
+        df = session.sql(query)
+        result = df.to_pandas()
+        assert df.columns == result.columns.to_list()
+        assert isinstance(result, PandasDF)
         return result
 
     temp_table_name = Utils.random_name_for_temp_object(TempObjectType.TABLE)
     check_fetch_data_exception("show tables")
     res = check_fetch_data_exception(f"create temporary table {temp_table_name}(a int)")
-    expected_res = pd.DataFrame([(f"Table {temp_table_name} successfully created.",)])
+    expected_res = pd.DataFrame(
+        [(f"Table {temp_table_name} successfully created.",)], columns=['"status"']
+    )
     assert expected_res.equals(res)
     res = check_fetch_data_exception(f"drop table if exists {temp_table_name}")
-    expected_res = pd.DataFrame([(f"{temp_table_name} successfully dropped.",)])
+    expected_res = pd.DataFrame(
+        [(f"{temp_table_name} successfully dropped.",)], columns=['"status"']
+    )
     assert expected_res.equals(res)
 
     # to_pandas should work for the large dataframe