Commit 4c36548

Merge branch 'main' into feature/aherrera/SNOW-2443512-StringAndBinaryPart2
2 parents: c376c45 + b2696ad

File tree: 11 files changed, +394 -44 lines

CHANGELOG.md

Lines changed: 6 additions & 5 deletions
@@ -49,6 +49,12 @@
 - `pivot_table()` with `sort=True`, non-string `index` list, non-string `columns` list, non-string `values` list, or `aggfunc` dict with non-string values
 - `fillna()` with `downcast` parameter or using `limit` together with `value`
 - `dropna()` with `axis=1`
+- `groupby()` with `axis=1`, `by!=None and level!=None`, or by containing any non-pandas hashable labels.
+- `groupby_fillna()` with `downcast` parameter
+- `groupby_first()` with `min_count>1`
+- `groupby_last()` with `min_count>1`
+- `shift()` with `freq` parameter
+- Slightly improved the performance of `agg`, `nunique`, `describe`, and related methods on 1-column DataFrame and Series objects.

 #### Bug Fixes

@@ -219,11 +225,6 @@
 - `skew()` with `axis=1` or `numeric_only=False` parameters
 - `round()` with `decimals` parameter as a Series
 - `corr()` with `method!=pearson` parameter
-- `df.groupby()` with `axis=1`, `by!=None and level!=None`, or by containing any non-pandas hashable labels.
-- `groupby_fillna()` with `downcast` parameter
-- `groupby_first()` with `min_count>1`
-- `groupby_last()` with `min_count>1`
-- `shift()` with `freq` parameter
 - Set `cte_optimization_enabled` to True for all Snowpark pandas sessions.
 - Add support for the following in faster pandas:
   - `isin`

src/snowflake/snowpark/functions.py

Lines changed: 7 additions & 2 deletions
@@ -7135,10 +7135,15 @@ def array_contains(
         variant: Column containing the VARIANT to find.
         array: Column containing the ARRAY to search.

+        If this is a semi-structured array, you're required to explicitly cast the following SQL types into a VARIANT:
+
+        - `String & Binary <https://docs.snowflake.com/en/sql-reference/data-types-text>`_
+        - `Date & Time <https://docs.snowflake.com/en/sql-reference/data-types-datetime>`_
+
     Example::
         >>> from snowflake.snowpark import Row
-        >>> df = session.create_dataframe([Row([1, 2]), Row([1, 3])], schema=["a"])
-        >>> df.select(array_contains(lit(2), "a").alias("result")).show()
+        >>> df = session.create_dataframe([Row(["apple", "banana"]), Row(["apple", "orange"])], schema=["a"])
+        >>> df.select(array_contains(lit("banana").cast("variant"), "a").alias("result")).show()
         ------------
         |"RESULT"  |
         ------------
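
A hedged usage sketch of the documented cast requirement (assuming an active Snowpark `session`; `to_variant` is the functional spelling of `.cast("variant")`):

    from snowflake.snowpark import Row
    from snowflake.snowpark.functions import array_contains, col, to_variant

    df = session.create_dataframe(
        [Row("banana", ["apple", "banana"]), Row("kiwi", ["apple", "orange"])],
        schema=["needle", "arr"],
    )
    # String values must be cast to VARIANT before searching a semi-structured
    # array; without the cast the SQL comparison fails to type-check.
    df.select(array_contains(to_variant(col("needle")), col("arr")).alias("result")).show()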

src/snowflake/snowpark/modin/plugin/_internal/frame.py

Lines changed: 7 additions & 1 deletion
@@ -1144,6 +1144,8 @@ def update_snowflake_quoted_identifiers_with_expressions(
         self,
         quoted_identifier_to_column_map: dict[str, SnowparkColumn],
         snowpark_pandas_types: Optional[list[Optional[SnowparkPandasType]]] = None,
+        *,
+        new_index_column_pandas_labels: Optional[list[Hashable]] = None,
     ) -> UpdatedInternalFrameResult:
         """
         Points Snowflake quoted identifiers to column expression given by `quoted_identifier_to_column_map`.
@@ -1171,6 +1173,8 @@ def update_snowflake_quoted_identifiers_with_expressions(
                 must be index columns and data columns in the original internal frame.
             data_column_snowpark_pandas_types: The optional Snowpark pandas types for the new
                 expressions, in the order of the keys of quoted_identifier_to_column_map.
+            new_index_column_pandas_labels: The optional list of labels to be used as
+                index_column_pandas_labels for the result.

         Returns:
             UpdatedInternalFrameResult: A tuple containing the new InternalFrame with updated column references, and a mapping
@@ -1252,7 +1256,9 @@ def update_snowflake_quoted_identifiers_with_expressions(
             data_column_pandas_labels=self.data_column_pandas_labels,
             data_column_snowflake_quoted_identifiers=new_data_column_snowflake_quoted_identifiers,
             data_column_pandas_index_names=self.data_column_pandas_index_names,
-            index_column_pandas_labels=self.index_column_pandas_labels,
+            index_column_pandas_labels=self.index_column_pandas_labels
+            if new_index_column_pandas_labels is None
+            else new_index_column_pandas_labels,
             index_column_snowflake_quoted_identifiers=new_index_column_snowflake_quoted_identifiers,
             data_column_types=[
                 new_type_mapping[k]
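
The keyword-only parameter lets a caller rewrite column expressions and rename the index labels in one pass; a minimal hypothetical sketch (the literal and labels here are illustrative — the 1x1 transpose in the query compiler below is the real call site):

    # `frame` is an InternalFrame; pandas_lit is the plugin's internal helper
    # that wraps a Python value as a literal Snowpark column. The result is an
    # UpdatedInternalFrameResult: (new frame, old-to-new identifier mapping).
    new_frame, identifier_map = frame.update_snowflake_quoted_identifiers_with_expressions(
        {frame.index_column_snowflake_quoted_identifiers[0]: pandas_lit("row_label")},
        # Keyword-only: replace index_column_pandas_labels on the result.
        new_index_column_pandas_labels=["new_index_name"],
    )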

src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py

Lines changed: 55 additions & 8 deletions
@@ -5548,7 +5548,7 @@ def _groupby_first_last(
         return result

     @register_query_compiler_method_not_implemented(
-        "DataFrameGroupBy",
+        ["DataFrameGroupBy", "SeriesGroupBy"],
         "first",
         UnsupportedArgsRule(
             unsupported_conditions=[
@@ -5594,7 +5594,7 @@ def groupby_first(
         )

     @register_query_compiler_method_not_implemented(
-        "DataFrameGroupBy",
+        ["DataFrameGroupBy", "SeriesGroupBy"],
         "last",
         UnsupportedArgsRule(
             unsupported_conditions=[
@@ -5640,7 +5640,7 @@ def groupby_last(
         )

     @register_query_compiler_method_not_implemented(
-        "DataFrameGroupBy",
+        ["DataFrameGroupBy", "SeriesGroupBy"],
         "rank",
         UnsupportedArgsRule(
             unsupported_conditions=[
@@ -6102,7 +6102,7 @@ def groupby_rolling(
         return result_qc

     @register_query_compiler_method_not_implemented(
-        "DataFrameGroupBy",
+        ["DataFrameGroupBy", "SeriesGroupBy"],
         "shift",
         UnsupportedArgsRule(
             unsupported_conditions=[
@@ -7107,7 +7107,7 @@ def groupby_value_counts(
         )

     @register_query_compiler_method_not_implemented(
-        "DataFrameGroupBy",
+        ["DataFrameGroupBy", "SeriesGroupBy"],
         "fillna",
         UnsupportedArgsRule(
             unsupported_conditions=[
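
Each decorator now registers its UnsupportedArgsRule for both DataFrameGroupBy and SeriesGroupBy, so the same unsupported-argument conditions cover Series groupbys. A hedged sketch of the user-visible effect (assumes an active session; `min_count>1` is unsupported per the CHANGELOG entry above):

    import modin.pandas as pd
    import snowflake.snowpark.modin.plugin  # noqa: F401  # registers the Snowpark pandas backend

    ser = pd.Series([1.0, None, 3.0], index=["g1", "g1", "g2"])
    # Previously only the DataFrame variant was covered by the rule; the Series
    # variant now hits it too (switching to native pandas in hybrid execution,
    # or raising NotImplementedError otherwise).
    ser.groupby(level=0).first(min_count=2)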
@@ -11875,10 +11875,54 @@ def transpose_single_row(self) -> "SnowflakeQueryCompiler":
         self._raise_not_implemented_error_for_timedelta()

         frame = self._modin_frame
-
+        input_column_count = len(frame.data_columns_index)
         # Handle case where the dataframe has empty columns.
-        if len(frame.data_columns_index) == 0:
+        if input_column_count == 0:
             return transpose_empty_df(frame)
+        if input_column_count == 1:
+            # If the frame is 1x1, then the datatype is already preserved; we need only set the entry
+            # in the index columns to match the original index labels.
+            if len(frame.data_column_index_names) > 1:
+                # If the columns object has a multi-index name, we need to project new columns for
+                # the extra labels.
+                data_odf = frame.ordered_dataframe.select(
+                    frame.data_column_snowflake_quoted_identifiers
+                )
+                new_index_column_identifiers = (
+                    data_odf.generate_snowflake_quoted_identifiers(
+                        pandas_labels=frame.data_column_pandas_index_names
+                    )
+                )
+                new_odf = append_columns(
+                    data_odf,
+                    new_index_column_identifiers,
+                    list(map(pandas_lit, frame.data_column_pandas_labels[0])),
+                )
+                new_odf.row_count = 1
+                return SnowflakeQueryCompiler(
+                    InternalFrame.create(
+                        ordered_dataframe=new_odf,
+                        data_column_pandas_labels=[None],
+                        data_column_pandas_index_names=[None],
+                        data_column_snowflake_quoted_identifiers=frame.data_column_snowflake_quoted_identifiers,
+                        index_column_pandas_labels=frame.data_column_pandas_index_names,
+                        index_column_snowflake_quoted_identifiers=new_index_column_identifiers,
+                        data_column_types=frame.cached_data_column_snowpark_pandas_types,
+                        index_column_types=None,
+                    )
+                )
+            else:
+                return SnowflakeQueryCompiler(
+                    frame.update_snowflake_quoted_identifiers_with_expressions(
+                        {
+                            frame.index_column_snowflake_quoted_identifiers[
+                                0
+                            ]: pandas_lit(frame.data_column_pandas_labels[0]),
+                        },
+                        # Swap the name of the index/columns objects
+                        new_index_column_pandas_labels=frame.data_column_pandas_index_names,
+                    )[0]
+                ).set_columns([None])

         # This follows the same approach used in SnowflakeQueryCompiler.transpose().
         # However, as an optimization, only steps (1), (2), and (4) from the four steps described in
@@ -11909,6 +11953,7 @@ def transpose_single_row(self) -> "SnowflakeQueryCompiler":
             unpivot_result.variable_name_quoted_snowflake_identifier,
             unpivot_result.object_name_quoted_snowflake_identifier,
         )
+        new_internal_frame.ordered_dataframe.row_count = input_column_count

         return SnowflakeQueryCompiler(new_internal_frame)

@@ -11922,8 +11967,9 @@ def transpose(self) -> "SnowflakeQueryCompiler":
         """
         frame = self._modin_frame

+        original_col_count = len(frame.data_columns_index)
         # Handle case where the dataframe has empty columns.
-        if len(frame.data_columns_index) == 0:
+        if original_col_count == 0:
             return transpose_empty_df(frame)

         # The following approach to implementing transpose relies on combining unpivot and pivot operations to flip
@@ -12061,6 +12107,7 @@ def transpose(self) -> "SnowflakeQueryCompiler":
             unpivot_result.variable_name_quoted_snowflake_identifier,
             unpivot_result.object_name_quoted_snowflake_identifier,
         )
+        new_internal_frame.ordered_dataframe.row_count = original_col_count

         return SnowflakeQueryCompiler(new_internal_frame)
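
The 1x1 fast path skips the unpivot/pivot round trip entirely, and both transpose variants now record the result's row count up front (a transposed frame has as many rows as the input had columns), sparing a later COUNT query. A hedged sketch of the user-visible behavior (assumes an active session):

    import modin.pandas as pd
    import snowflake.snowpark.modin.plugin  # noqa: F401  # registers the Snowpark pandas backend

    # A 1x1 frame: transpose now only swaps the index and columns labels and
    # keeps the single cell's datatype, instead of unpivoting and re-pivoting.
    df = pd.DataFrame({"a": [1]}, index=["x"])
    print(df.T)  # index becomes ["a"], columns become ["x"]
    # The same path underlies 1-column aggregations such as df.agg("count"),
    # which transpose a single-row intermediate result (see the new test below).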

src/snowflake/snowpark/modin/plugin/extensions/series_overrides.py

Lines changed: 22 additions & 0 deletions
@@ -59,6 +59,12 @@
 )
 from snowflake.snowpark.modin.plugin.compiler.snowflake_query_compiler import (
     HYBRID_SWITCH_FOR_UNIMPLEMENTED_METHODS,
+    UnsupportedArgsRule,
+    _GROUPBY_UNSUPPORTED_GROUPING_MESSAGE,
+    register_query_compiler_method_not_implemented,
+)
+from snowflake.snowpark.modin.plugin._internal.groupby_utils import (
+    check_is_groupby_supported_by_snowflake,
 )
 from snowflake.snowpark.modin.plugin._typing import DropKeep, ListLike
 from snowflake.snowpark.modin.plugin.extensions.snow_partition_iterator import (
@@ -1549,6 +1555,22 @@ def fillna(

 # Snowpark pandas defines a custom GroupBy object
 @register_series_accessor("groupby")
+@register_query_compiler_method_not_implemented(
+    "Series",
+    "groupby",
+    UnsupportedArgsRule(
+        unsupported_conditions=[
+            (
+                lambda args: not check_is_groupby_supported_by_snowflake(
+                    args.get("by"),
+                    args.get("level"),
+                    args.get("axis", 0),
+                ),
+                f"Groupby {_GROUPBY_UNSUPPORTED_GROUPING_MESSAGE}",
+            )
+        ]
+    ),
+)
 def groupby(
     self,
     by=None,
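
This routes Series.groupby through the same supported-grouping check already used on the DataFrame side; the updated test in test_groupby_default2pandas.py below exercises exactly this path. A hedged sketch (assumes an active session):

    import numpy as np
    import modin.pandas as pd
    import snowflake.snowpark.modin.plugin  # noqa: F401

    ser = pd.Series([10, 20, 30])
    # Grouping by a raw numpy array is one of the non-pandas-hashable-label
    # cases rejected by check_is_groupby_supported_by_snowflake, so this raises
    # NotImplementedError carrying _GROUPBY_UNSUPPORTED_GROUPING_MESSAGE.
    ser.groupby(by=np.array([0, 0, 1])).max()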

tests/integ/modin/frame/test_aggregate.py

Lines changed: 13 additions & 1 deletion
@@ -193,7 +193,9 @@ def test_string_sum_with_nulls():
     with pytest.raises(TypeError):
         pandas_df.sum(numeric_only=False)
     snow_result = snow_df.sum(numeric_only=False)
-    assert_series_equal(snow_result.to_pandas(), native_pd.Series(["ab"]))
+    assert_series_equal(
+        snow_result.to_pandas(), native_pd.Series(["ab"]), check_index_type=False
+    )


 class TestTimedelta:
@@ -628,6 +630,16 @@ def test_agg_with_multiindex(native_df_multiindex, func, expected_union_count):
     eval_snowpark_pandas_result(snow_df, native_df_multiindex, func)


+def test_agg_with_one_column_multiindex(native_df_multiindex):
+    # Triggers the special 1x1 transpose code path
+    native_df_multiindex = native_df_multiindex.iloc[:, 0:1]
+    snow_df = pd.DataFrame(native_df_multiindex)
+    with SqlCounter(query_count=1):
+        eval_snowpark_pandas_result(
+            snow_df, native_df_multiindex, lambda df: df.agg("count")
+        )
+
+
 @pytest.mark.parametrize(
     "func",
     [

tests/integ/modin/frame/test_iloc.py

Lines changed: 5 additions & 18 deletions
@@ -1077,21 +1077,9 @@ def iloc_helper(df):
         else:
             return native_pd.Series([]) if axis == "row" else df.iloc[:, []]

-    def determine_query_count():
-        # Multiple queries because of squeeze() - in range is 2, out-of-bounds is 1.
-        if axis == "col":
-            num_queries = 1
-        else:
-            if not -8 < key < 7:  # key is out of bound
-                num_queries = 2
-            else:
-                num_queries = 1
-        return num_queries
-
-    query_count = determine_query_count()
     # test df with default index
     num_cols = 7
-    with SqlCounter(query_count=query_count):
+    with SqlCounter(query_count=1):
         eval_snowpark_pandas_result(
             default_index_snowpark_pandas_df,
             default_index_native_df,
@@ -1101,21 +1089,20 @@ def determine_query_count():

     # test df with non-default index
     num_cols = 6  # set_index() makes the number of columns 6
-    with SqlCounter(query_count=query_count):
+    with SqlCounter(query_count=1):
         eval_snowpark_pandas_result(
             default_index_snowpark_pandas_df.set_index("D"),
             default_index_native_df.set_index("D"),
             iloc_helper,
             test_attrs=False,
         )

-    query_count = determine_query_count()
     # test df with MultiIndex
     # Index dtype is different between Snowpark and native pandas if key produces empty df.
     num_cols = 7
     native_df = default_index_native_df.set_index(multiindex_native)
     snowpark_df = pd.DataFrame(native_df)
-    with SqlCounter(query_count=query_count):
+    with SqlCounter(query_count=1):
         eval_snowpark_pandas_result(
             snowpark_df,
             native_df,
@@ -1129,7 +1116,7 @@ def determine_query_count():
         native_df_with_multiindex_columns
     )
     in_range = True if (-8 < key < 7) else False
-    with SqlCounter(query_count=query_count):
+    with SqlCounter(query_count=1):
         if axis == "row" or in_range:  # series result
             eval_snowpark_pandas_result(
                 snowpark_df_with_multiindex_columns,
@@ -1151,7 +1138,7 @@ def determine_query_count():
     # test df with MultiIndex on both index and columns
     native_df = native_df_with_multiindex_columns.set_index(multiindex_native)
     snowpark_df = pd.DataFrame(native_df)
-    with SqlCounter(query_count=query_count):
+    with SqlCounter(query_count=1):
         if axis == "row" or in_range:  # series result
             eval_snowpark_pandas_result(
                 snowpark_df,

tests/integ/modin/frame/test_squeeze.py

Lines changed: 1 addition & 5 deletions
@@ -31,11 +31,7 @@ def test_n_by_1(axis, dtype):


 @pytest.mark.parametrize("dtype", ["int", "timedelta64[ns]"])
 def test_1_by_n(axis, dtype):
-    if axis is None:
-        expected_query_count = 2
-    else:
-        expected_query_count = 1
-    with SqlCounter(query_count=expected_query_count):
+    with SqlCounter(query_count=1):
         eval_snowpark_pandas_result(
             *create_test_dfs({"a": [1], "b": [2], "c": [3]}, dtype=dtype),
             lambda df: df.squeeze(axis=axis),

tests/integ/modin/groupby/test_groupby_default2pandas.py

Lines changed: 1 addition & 1 deletion
@@ -130,7 +130,7 @@ def test_groupby_with_numpy_array(basic_snowpark_pandas_df) -> None:
 @sql_count_checker(query_count=0)
 def test_groupby_series_with_numpy_array(native_series_multi_numeric, by_list) -> None:
     with pytest.raises(
-        NotImplementedError, match=GROUPBY_UNSUPPORTED_GROUPING_ERROR_PATTERN
+        NotImplementedError, match=_GROUPBY_UNSUPPORTED_GROUPING_MESSAGE
     ):
         pd.Series(native_series_multi_numeric).groupby(by=by_list).max()

tests/integ/modin/groupby/test_groupby_rolling.py

Lines changed: 3 additions & 3 deletions
@@ -102,18 +102,18 @@ def test_groupby_rolling_dropna_false():
     )


-@sql_count_checker(query_count=1)
+@sql_count_checker(query_count=0)
 def test_groupby_rolling_series_negative():
     date_idx = pd.date_range("1/1/2000", periods=8, freq="min")
     date_idx.names = ["grp_col"]
     snow_ser = pd.Series([1, 1, np.nan, 2])
     with pytest.raises(
         NotImplementedError,
         match=re.escape(
-            "Groupby does not yet support axis == 1, by != None and level != None, or by containing any non-pandas hashable labels"
+            "Snowpark pandas does not yet support the method GroupBy.rolling for Series"
         ),
     ):
-        snow_ser.groupby(snow_ser.index).rolling(2).sum()
+        snow_ser.groupby(level=0).rolling(2).sum()


 @pytest.mark.parametrize(
