Commit b2696ad

SNOW-2478173: Improve single-row transpose helper for 1-column frames (#3975)
While testing #3973, I noticed that aggregations on single-column frames/series were producing queries with JSON serialization and unnecessary UNPIVOT operations. The QC's `transpose_single_row` helper method is used in aggregations to skip the PIVOT operation used in the general transpose case, but for transposing a 1x1 frame we don't even need to UNPIVOT: we need only re-label the index, since we already know that the column's dtype will not change.

This PR adds a fast path for 1x1 `transpose_single_row` operations, which replaces the JSON/UNPIVOT operations with simple projections. It produces modest performance improvements for operations on a 2000x1 frame:

- `DataFrame.count`: 1.48s -> 1.31s (11.2% improvement)
- `DataFrame.describe`: 2.64s -> 2.36s (10.9% improvement)
- `DataFrame.nunique`: 1.25s -> 1.21s (3.4% improvement)

These improvements are likely to be more noticeable on frames produced from more complex queries.

This PR also adds explicit row count caching for the general transpose case. We currently cannot directly use the `transpose_single_row` path for the `transpose` API itself, since the helper function drops the column labels of the result.
1 parent 85997e1 commit b2696ad
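For illustration, here is a minimal sketch of the kind of single-column workload the benchmark numbers above refer to. The session setup is a placeholder (it assumes a default Snowflake connection is already configured for `Session.builder.getOrCreate()`) and is not part of this change:

```python
# Minimal sketch: aggregations on a 1-column frame, the shape targeted by the
# transpose_single_row fast path in this commit.
import modin.pandas as pd
import snowflake.snowpark.modin.plugin  # noqa: F401  # enables the Snowpark pandas backend
from snowflake.snowpark import Session

# Placeholder session setup; assumes a default connection is configured locally.
Session.builder.getOrCreate()

df = pd.DataFrame({"a": list(range(2000))})  # 2000x1 frame, as in the benchmark above

# These reductions produce a single-row intermediate result; with this change the
# 1x1 transpose of that result becomes a simple projection instead of JSON + UNPIVOT.
print(df.count())
print(df.nunique())
print(df.describe())
```

The relative savings should be larger when the input frame is backed by a more complex query, presumably because the UNPIVOT that is skipped would otherwise wrap that larger query.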

File tree: 6 files changed (+77, -28 lines)


CHANGELOG.md

Lines changed: 1 addition & 0 deletions
@@ -42,6 +42,7 @@
 - `groupby_first()` with `min_count>1`
 - `groupby_last()` with `min_count>1`
 - `shift()` with `freq` parameter
+- Slightly improved the performance of `agg`, `nunique`, `describe`, and related methods on 1-column DataFrame and Series objects.

 #### Bug Fixes

src/snowflake/snowpark/modin/plugin/_internal/frame.py

Lines changed: 7 additions & 1 deletion
@@ -1144,6 +1144,8 @@ def update_snowflake_quoted_identifiers_with_expressions(
         self,
         quoted_identifier_to_column_map: dict[str, SnowparkColumn],
         snowpark_pandas_types: Optional[list[Optional[SnowparkPandasType]]] = None,
+        *,
+        new_index_column_pandas_labels: Optional[list[Hashable]] = None,
     ) -> UpdatedInternalFrameResult:
         """
         Points Snowflake quoted identifiers to column expression given by `quoted_identifier_to_column_map`.
@@ -1171,6 +1173,8 @@ def update_snowflake_quoted_identifiers_with_expressions(
                 must be index columns and data columns in the original internal frame.
             data_column_snowpark_pandas_types: The optional Snowpark pandas types for the new
                 expressions, in the order of the keys of quoted_identifier_to_column_map.
+            new_index_column_pandas_labels: The optional list of labels to be used as
+                index_column_pandas_labels for the result.

         Returns:
             UpdatedInternalFrameResult: A tuple containing the new InternalFrame with updated column references, and a mapping
@@ -1252,7 +1256,9 @@ def update_snowflake_quoted_identifiers_with_expressions(
             data_column_pandas_labels=self.data_column_pandas_labels,
             data_column_snowflake_quoted_identifiers=new_data_column_snowflake_quoted_identifiers,
             data_column_pandas_index_names=self.data_column_pandas_index_names,
-            index_column_pandas_labels=self.index_column_pandas_labels,
+            index_column_pandas_labels=self.index_column_pandas_labels
+            if new_index_column_pandas_labels is None
+            else new_index_column_pandas_labels,
             index_column_snowflake_quoted_identifiers=new_index_column_snowflake_quoted_identifiers,
             data_column_types=[
                 new_type_mapping[k]

src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py

Lines changed: 50 additions & 3 deletions
@@ -11875,10 +11875,54 @@ def transpose_single_row(self) -> "SnowflakeQueryCompiler":
         self._raise_not_implemented_error_for_timedelta()

         frame = self._modin_frame
-
+        input_column_count = len(frame.data_columns_index)
         # Handle case where the dataframe has empty columns.
-        if len(frame.data_columns_index) == 0:
+        if input_column_count == 0:
             return transpose_empty_df(frame)
+        if input_column_count == 1:
+            # If the frame is 1x1, then the datatype is already preserved; we need only set the entry
+            # in the index columns to match the original index labels.
+            if len(frame.data_column_index_names) > 1:
+                # If the columns object has a multi-index name, we need to project new columns for
+                # the extra labels.
+                data_odf = frame.ordered_dataframe.select(
+                    frame.data_column_snowflake_quoted_identifiers
+                )
+                new_index_column_identifiers = (
+                    data_odf.generate_snowflake_quoted_identifiers(
+                        pandas_labels=frame.data_column_pandas_index_names
+                    )
+                )
+                new_odf = append_columns(
+                    data_odf,
+                    new_index_column_identifiers,
+                    list(map(pandas_lit, frame.data_column_pandas_labels[0])),
+                )
+                new_odf.row_count = 1
+                return SnowflakeQueryCompiler(
+                    InternalFrame.create(
+                        ordered_dataframe=new_odf,
+                        data_column_pandas_labels=[None],
+                        data_column_pandas_index_names=[None],
+                        data_column_snowflake_quoted_identifiers=frame.data_column_snowflake_quoted_identifiers,
+                        index_column_pandas_labels=frame.data_column_pandas_index_names,
+                        index_column_snowflake_quoted_identifiers=new_index_column_identifiers,
+                        data_column_types=frame.cached_data_column_snowpark_pandas_types,
+                        index_column_types=None,
+                    )
+                )
+            else:
+                return SnowflakeQueryCompiler(
+                    frame.update_snowflake_quoted_identifiers_with_expressions(
+                        {
+                            frame.index_column_snowflake_quoted_identifiers[
+                                0
+                            ]: pandas_lit(frame.data_column_pandas_labels[0]),
+                        },
+                        # Swap the name of the index/columns objects
+                        new_index_column_pandas_labels=frame.data_column_pandas_index_names,
+                    )[0]
+                ).set_columns([None])

         # This follows the same approach used in SnowflakeQueryCompiler.transpose().
         # However, as an optimization, only steps (1), (2), and (4) from the four steps described in
@@ -11909,6 +11953,7 @@ def transpose_single_row(self) -> "SnowflakeQueryCompiler":
             unpivot_result.variable_name_quoted_snowflake_identifier,
             unpivot_result.object_name_quoted_snowflake_identifier,
         )
+        new_internal_frame.ordered_dataframe.row_count = input_column_count

         return SnowflakeQueryCompiler(new_internal_frame)

@@ -11922,8 +11967,9 @@ def transpose(self) -> "SnowflakeQueryCompiler":
         """
         frame = self._modin_frame

+        original_col_count = len(frame.data_columns_index)
         # Handle case where the dataframe has empty columns.
-        if len(frame.data_columns_index) == 0:
+        if original_col_count == 0:
             return transpose_empty_df(frame)

         # The following approach to implementing transpose relies on combining unpivot and pivot operations to flip
@@ -12061,6 +12107,7 @@ def transpose(self) -> "SnowflakeQueryCompiler":
             unpivot_result.variable_name_quoted_snowflake_identifier,
             unpivot_result.object_name_quoted_snowflake_identifier,
         )
+        new_internal_frame.ordered_dataframe.row_count = original_col_count

         return SnowflakeQueryCompiler(new_internal_frame)

tests/integ/modin/frame/test_aggregate.py

Lines changed: 13 additions & 1 deletion
@@ -193,7 +193,9 @@ def test_string_sum_with_nulls():
     with pytest.raises(TypeError):
         pandas_df.sum(numeric_only=False)
     snow_result = snow_df.sum(numeric_only=False)
-    assert_series_equal(snow_result.to_pandas(), native_pd.Series(["ab"]))
+    assert_series_equal(
+        snow_result.to_pandas(), native_pd.Series(["ab"]), check_index_type=False
+    )


 class TestTimedelta:
@@ -628,6 +630,16 @@ def test_agg_with_multiindex(native_df_multiindex, func, expected_union_count):
         eval_snowpark_pandas_result(snow_df, native_df_multiindex, func)


+def test_agg_with_one_column_multiindex(native_df_multiindex):
+    # Triggers the special 1x1 transpose code path
+    native_df_multiindex = native_df_multiindex.iloc[:, 0:1]
+    snow_df = pd.DataFrame(native_df_multiindex)
+    with SqlCounter(query_count=1):
+        eval_snowpark_pandas_result(
+            snow_df, native_df_multiindex, lambda df: df.agg("count")
+        )
+
+
 @pytest.mark.parametrize(
     "func",
     [

tests/integ/modin/frame/test_iloc.py

Lines changed: 5 additions & 18 deletions
@@ -1077,21 +1077,9 @@ def iloc_helper(df):
         else:
             return native_pd.Series([]) if axis == "row" else df.iloc[:, []]

-    def determine_query_count():
-        # Multiple queries because of squeeze() - in range is 2, out-of-bounds is 1.
-        if axis == "col":
-            num_queries = 1
-        else:
-            if not -8 < key < 7:  # key is out of bound
-                num_queries = 2
-            else:
-                num_queries = 1
-        return num_queries
-
-    query_count = determine_query_count()
     # test df with default index
     num_cols = 7
-    with SqlCounter(query_count=query_count):
+    with SqlCounter(query_count=1):
         eval_snowpark_pandas_result(
             default_index_snowpark_pandas_df,
             default_index_native_df,
@@ -1101,21 +1089,20 @@ def determine_query_count():

     # test df with non-default index
     num_cols = 6  # set_index() makes the number of columns 6
-    with SqlCounter(query_count=query_count):
+    with SqlCounter(query_count=1):
         eval_snowpark_pandas_result(
             default_index_snowpark_pandas_df.set_index("D"),
             default_index_native_df.set_index("D"),
             iloc_helper,
             test_attrs=False,
         )

-    query_count = determine_query_count()
     # test df with MultiIndex
     # Index dtype is different between Snowpark and native pandas if key produces empty df.
     num_cols = 7
     native_df = default_index_native_df.set_index(multiindex_native)
     snowpark_df = pd.DataFrame(native_df)
-    with SqlCounter(query_count=query_count):
+    with SqlCounter(query_count=1):
         eval_snowpark_pandas_result(
             snowpark_df,
             native_df,
@@ -1129,7 +1116,7 @@ def determine_query_count():
         native_df_with_multiindex_columns
     )
     in_range = True if (-8 < key < 7) else False
-    with SqlCounter(query_count=query_count):
+    with SqlCounter(query_count=1):
         if axis == "row" or in_range:  # series result
             eval_snowpark_pandas_result(
                 snowpark_df_with_multiindex_columns,
@@ -1151,7 +1138,7 @@ def determine_query_count():
     # test df with MultiIndex on both index and columns
     native_df = native_df_with_multiindex_columns.set_index(multiindex_native)
     snowpark_df = pd.DataFrame(native_df)
-    with SqlCounter(query_count=query_count):
+    with SqlCounter(query_count=1):
         if axis == "row" or in_range:  # series result
             eval_snowpark_pandas_result(
                 snowpark_df,

tests/integ/modin/frame/test_squeeze.py

Lines changed: 1 addition & 5 deletions
@@ -31,11 +31,7 @@ def test_n_by_1(axis, dtype):

 @pytest.mark.parametrize("dtype", ["int", "timedelta64[ns]"])
 def test_1_by_n(axis, dtype):
-    if axis is None:
-        expected_query_count = 2
-    else:
-        expected_query_count = 1
-    with SqlCounter(query_count=expected_query_count):
+    with SqlCounter(query_count=1):
         eval_snowpark_pandas_result(
             *create_test_dfs({"a": [1], "b": [2], "c": [3]}, dtype=dtype),
             lambda df: df.squeeze(axis=axis),
