
Commit 05abdbe

Merge branch 'main' into helmeleegy-SNOW-1842841
2 parents 19317e2 + 66dc14d commit 05abdbe

File tree

14 files changed: +444 -151 lines

CHANGELOG.md

Lines changed: 4 additions & 1 deletion
@@ -19,11 +19,13 @@
 #### Improvements
 
 - Updated README.md to include instructions on how to verify package signatures using `cosign`.
+- Added an option `keep_column_order` for keeping original column order in `DataFrame.with_column` and `DataFrame.with_columns`.
 
 #### Bug Fixes
 
 - Fixed a bug in local testing mode that caused a column to contain None when it should contain 0
-- Fixed a bug in StructField.from_json that prevented TimestampTypes with tzinfo from being parsed correctly.
+- Fixed a bug in `StructField.from_json` that prevented TimestampTypes with tzinfo from being parsed correctly.
+- Fixed a bug in function `date_format` that caused an error when the input column was date type or timestamp type.
 
 ### Snowpark pandas API Updates
 
@@ -49,6 +51,7 @@
 - %X: Locale’s appropriate time representation.
 - %%: A literal '%' character.
 - Added support for `Series.between`.
+- Added support for `include_groups=False` in `DataFrameGroupBy.apply`.
 - Added support for `DataFrame.pop` and `Series.pop`.
 
 #### Bug Fixes

docs/source/modin/supported/groupby_supported.rst

Lines changed: 2 additions & 2 deletions
@@ -39,8 +39,8 @@ Function application
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
 | ``apply``                   | P                               | ``axis`` other than 0 is not     | ``Y`` if the following are true, otherwise ``N``:  |
 |                             |                                 | implemented.                     | - ``func`` is a callable that always returns       |
-|                             |                                 | ``include_groups = False`` is    |   either a pandas DataFrame, a pandas Series, or   |
-|                             |                                 | not implemented.                 |   objects that are neither DataFrame nor Series.   |
+|                             |                                 |                                  |   either a pandas DataFrame, a pandas Series, or   |
+|                             |                                 |                                  |   objects that are neither DataFrame nor Series.   |
 |                             |                                 |                                  | - grouping on axis=0                               |
 |                             |                                 |                                  | - Not applying transform to a dataframe with a     |
 |                             |                                 |                                  |   non-unique index                                 |

pyproject.toml

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
+[build-system]
+requires = [
+    "setuptools",
+    "protoc-wheel-0==21.1",  # Protocol buffer compiler for Snowpark IR
+    "mypy-protobuf",  # used in generating typed Python code from protobuf for Snowpark IR
+]
+build-backend = "setuptools.build_meta"

setup.py

Lines changed: 1 addition & 2 deletions
@@ -58,9 +58,8 @@
     "graphviz",  # used in plot tests
     "pytest-assume",  # sql counter check
     "decorator",  # sql counter check
-    "protoc-wheel-0==21.1",  # Protocol buffer compiler, for Snowpark IR
-    "mypy-protobuf",  # used in generating typed Python code from protobuf for Snowpark IR
     "lxml",  # used in read_xml tests
+    "tox",  # used for setting up testing environments
 ]
 
 # read the version

src/snowflake/snowpark/dataframe.py

Lines changed: 49 additions & 11 deletions
@@ -3688,6 +3688,8 @@ def with_column(
         self,
         col_name: str,
         col: Union[Column, TableFunctionCall],
+        *,
+        keep_column_order: bool = False,
         ast_stmt: proto.Expr = None,
         _emit_ast: bool = True,
     ) -> "DataFrame":
@@ -3730,6 +3732,7 @@ def with_column(
         Args:
             col_name: The name of the column to add or replace.
             col: The :class:`Column` or :class:`table_function.TableFunctionCall` with single column output to add or replace.
+            keep_column_order: If ``True``, the original order of the columns in the DataFrame is preserved when replacing a column.
         """
         if ast_stmt is None and _emit_ast:
             ast_stmt = self._session._ast_batch.assign()
@@ -3738,7 +3741,13 @@ def with_column(
             build_expr_from_snowpark_column_or_table_fn(expr.col, col)
             self._set_ast_ref(expr.df)
 
-        df = self.with_columns([col_name], [col], _ast_stmt=ast_stmt, _emit_ast=False)
+        df = self.with_columns(
+            [col_name],
+            [col],
+            keep_column_order=keep_column_order,
+            _ast_stmt=ast_stmt,
+            _emit_ast=False,
+        )
 
         if _emit_ast:
             df._ast_id = ast_stmt.var_id.bitfield1
@@ -3751,6 +3760,8 @@ def with_columns(
         self,
         col_names: List[str],
         values: List[Union[Column, TableFunctionCall]],
+        *,
+        keep_column_order: bool = False,
         _ast_stmt: proto.Expr = None,
         _emit_ast: bool = True,
     ) -> "DataFrame":
@@ -3797,6 +3808,7 @@ def with_columns(
             col_names: A list of the names of the columns to add or replace.
             values: A list of the :class:`Column` objects or :class:`table_function.TableFunctionCall` object
                 to add or replace.
+            keep_column_order: If ``True``, the original order of the columns in the DataFrame is preserved when replacing a column.
         """
         # Get a list of the new columns and their dedupped values
         qualified_names = [quote_name(n) for n in col_names]
@@ -3837,14 +3849,7 @@ def with_columns(
                 names = col_names[i : i + offset + 1]
             new_cols.append(col.as_(*names))
 
-        # Get a list of existing column names that are not being replaced
-        old_cols = [
-            Column(field)
-            for field in self._output
-            if field.name not in new_column_names
-        ]
-
-        # AST.
+        # AST
         if _ast_stmt is None and _emit_ast:
             _ast_stmt = self._session._ast_batch.assign()
             expr = with_src_position(
@@ -3856,8 +3861,41 @@ def with_columns(
             build_expr_from_snowpark_column_or_table_fn(expr.values.add(), value)
             self._set_ast_ref(expr.df)
 
-        # Put it all together
-        df = self.select([*old_cols, *new_cols], _ast_stmt=_ast_stmt, _emit_ast=False)
+        # If there's a table function call or keep_column_order=False,
+        # we do the original "remove old columns and append new ones" logic.
+        if num_table_func_calls > 0 or not keep_column_order:
+            old_cols = [
+                Column(field)
+                for field in self._output
+                if field.name not in new_column_names
+            ]
+            final_cols = [*old_cols, *new_cols]
+        else:
+            # keep_column_order=True and no table function calls:
+            # re-insert replaced columns in their original positions if they exist.
+            replaced_map = {
+                name: new_col for name, new_col in zip(qualified_names, new_cols)
+            }
+            final_cols = []
+            used = set()  # track which new cols we've inserted
+
+            for field in self._output:
+                field_quoted = quote_name(field.name)
+                # If this old column name is being replaced, insert the new col at the same position.
+                if field_quoted in replaced_map:
+                    final_cols.append(replaced_map[field_quoted])
+                    used.add(field_quoted)
+                else:
+                    # Keep the original col.
+                    final_cols.append(Column(field))
+
+            # For any new columns that didn't exist in the old schema, append them at the end.
+            for name, c in replaced_map.items():
+                if name not in used:
+                    final_cols.append(c)
+
+        # Construct the final DataFrame.
+        df = self.select(final_cols, _ast_stmt=_ast_stmt, _emit_ast=False)
 
         if _emit_ast:
             df._ast_id = _ast_stmt.var_id.bitfield1
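
For context, a minimal usage sketch of the new flag, assuming an existing `session` (the data and column names are illustrative, not from this commit):

```python
from snowflake.snowpark.functions import col

# df has columns A, B, C. By default, replacing B moves it to the end:
df = session.create_dataframe([[1, 2, 3]], schema=["A", "B", "C"])
df.with_column("B", col("B") + 1).columns  # ['A', 'C', 'B']

# With keep_column_order=True, the replaced column keeps its position:
df.with_column("B", col("B") + 1, keep_column_order=True).columns  # ['A', 'B', 'C']
```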

src/snowflake/snowpark/functions.py

Lines changed: 75 additions & 10 deletions
@@ -224,6 +224,7 @@
     StoredProcedureRegistration,
 )
 from snowflake.snowpark.types import (
+    ArrayType,
     DataType,
     FloatType,
     PandasDataFrameType,
@@ -3561,20 +3562,67 @@ def _concat_ws_ignore_nulls(sep: str, *cols: ColumnOrName) -> Column:
     |Hello      |
     -----------------------------------------------------
     <BLANKLINE>
+
+    >>> df = session.create_dataframe([
+    ...     (['Hello', 'World', None], None, '!'),
+    ...     (['Hi', 'World', "."], "I'm Dad", '.'),
+    ... ], schema=['a', 'b', 'c'])
+    >>> df.select(_concat_ws_ignore_nulls(", ", "a", "b", "c")).show()
+    -----------------------------------------------------
+    |"CONCAT_WS_IGNORE_NULLS(', ', ""A"",""B"",""C"")"  |
+    -----------------------------------------------------
+    |Hello, World, !                                    |
+    |Hi, World, ., I'm Dad, .                           |
+    -----------------------------------------------------
+    <BLANKLINE>
     """
     # TODO: SNOW-1831917 create ast
     columns = [_to_col_if_str(c, "_concat_ws_ignore_nulls") for c in cols]
     names = ",".join([c.get_name() for c in columns])
 
-    input_column_array = array_construct_compact(*columns, _emit_ast=False)
-    reduced_result = builtin("reduce", _emit_ast=False)(
-        input_column_array,
-        lit("", _emit_ast=False),
-        sql_expr(f"(l, r) -> l || '{sep}' || r"),
-    )
-    return substring(reduced_result, len(sep) + 1, _emit_ast=False).alias(
-        f"CONCAT_WS_IGNORE_NULLS('{sep}', {names})", _emit_ast=False
-    )
+    # The implementation of this function is as follows, with example input of
+    # sep = "," and row = [a, NULL], b, NULL, c:
+    # 1. Cast all columns to array.
+    #    [a, NULL], [b], NULL, [c]
+    # 2. Combine all arrays into an array of arrays after removing nulls (array_construct_compact).
+    #    [[a, NULL], [b], [c]]
+    # 3. Flatten the array of arrays into a single array (array_flatten).
+    #    [a, NULL, b, c]
+    # 4. Filter out nulls (array_remove_nulls).
+    #    [a, b, c]
+    # 5. Concatenate the non-null values into a single string (concat_strings_with_sep).
+    #    "a,b,c"
+
+    def array_remove_nulls(col: Column) -> Column:
+        """Expects an array and returns an array with nulls removed."""
+        return builtin("filter", _emit_ast=False)(
+            col, sql_expr("x -> NOT IS_NULL_VALUE(x)", _emit_ast=False)
+        )
+
+    def concat_strings_with_sep(col: Column) -> Column:
+        """
+        Expects an array of strings and returns a single string
+        with the values concatenated with the separator.
+        """
+        return substring(
+            builtin("reduce", _emit_ast=False)(
+                col, lit(""), sql_expr(f"(l, r) -> l || '{sep}' || r", _emit_ast=False)
+            ),
+            len(sep) + 1,
+            _emit_ast=False,
+        )
+
+    return concat_strings_with_sep(
+        array_remove_nulls(
+            array_flatten(
+                array_construct_compact(
+                    *[c.cast(ArrayType(), _emit_ast=False) for c in columns],
+                    _emit_ast=False,
+                ),
+                _emit_ast=False,
+            )
+        )
+    ).alias(f"CONCAT_WS_IGNORE_NULLS('{sep}', {names})", _emit_ast=False)
 
 
 @publicapi
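
As a sanity check on the five steps described in the new comments, here is a minimal pure-Python model of the same pipeline, with plain lists standing in for Snowflake ARRAY values (an illustration, not the library code):

```python
def concat_ws_ignore_nulls_model(sep, *values):
    # Steps 1-2: treat each non-NULL value as an array (scalars become
    # one-element arrays) and drop NULL inputs, like array_construct_compact.
    arrays = [v if isinstance(v, list) else [v] for v in values if v is not None]
    # Step 3: flatten the array of arrays, like array_flatten.
    flat = [x for arr in arrays for x in arr]
    # Step 4: drop NULL elements, like the filter/IS_NULL_VALUE step.
    non_null = [x for x in flat if x is not None]
    # Step 5: join with the separator, like reduce + substring.
    return sep.join(str(x) for x in non_null)

# Matches the first row of the new doctest above.
assert concat_ws_ignore_nulls_model(", ", ["Hello", "World", None], None, "!") == "Hello, World, !"
```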
@@ -3828,6 +3876,19 @@ def date_format(
     |2022/05/15 10:45:00  |
     -----------------------
     <BLANKLINE>
+
+    Example::
+        >>> df = session.sql("select '2023-10-10'::DATE as date_col, '2023-10-10 15:30:00'::TIMESTAMP as timestamp_col")
+        >>> df.select(
+        ...     date_format('date_col', 'YYYY/MM/DD').as_('formatted_dt'),
+        ...     date_format('timestamp_col', 'YYYY/MM/DD HH:mi:ss').as_('formatted_ts')
+        ... ).show()
+        ----------------------------------------
+        |"FORMATTED_DT"  |"FORMATTED_TS"       |
+        ----------------------------------------
+        |2023/10/10      |2023/10/10 15:30:00  |
+        ----------------------------------------
+        <BLANKLINE>
     """
 
     # AST.
@@ -3836,7 +3897,11 @@ def date_format(
         ast = proto.Expr()
         build_builtin_fn_apply(ast, "date_format", c, fmt)
 
-    ans = to_char(try_cast(c, TimestampType(), _emit_ast=False), fmt, _emit_ast=False)
+    ans = to_char(
+        try_cast(to_char(c, _emit_ast=False), TimestampType(), _emit_ast=False),
+        fmt,
+        _emit_ast=False,
+    )
     ans._ast = ast
     return ans
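A note on the `date_format` fix, as I read the change: Snowflake's TRY_CAST accepts only string input, so the old `try_cast(c, TimestampType())` raised an error whenever the column was already a DATE or TIMESTAMP. Converting the column to a string with `to_char` first makes the cast valid for any input type. A minimal sketch of the equivalent expression (assumes a live `session`):

```python
from snowflake.snowpark.functions import to_char, try_cast
from snowflake.snowpark.types import TimestampType

df = session.sql("select '2023-10-10'::DATE as d")

# Old expression: TRY_CAST("D" AS TIMESTAMP) -- fails because TRY_CAST
# requires a string source and "D" is already a DATE.
# New expression: stringify first, then TRY_CAST, then format with TO_CHAR.
df.select(to_char(try_cast(to_char(df["d"]), TimestampType()), "YYYY/MM/DD")).show()
```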
src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py

Lines changed: 38 additions & 20 deletions
@@ -3979,6 +3979,7 @@ def groupby_apply(
         agg_args: Any,
         agg_kwargs: dict[str, Any],
         series_groupby: bool,
+        include_groups: bool,
         force_single_group: bool = False,
         force_list_like_to_series: bool = False,
     ) -> "SnowflakeQueryCompiler":
@@ -4001,6 +4002,9 @@ def groupby_apply(
                 Keyword arguments to pass to agg_func when applying it to each group.
             series_groupby:
                 Whether we are performing a SeriesGroupBy.apply() instead of a DataFrameGroupBy.apply()
+            include_groups:
+                When True, will include grouping keys when calling func in the case that
+                they are columns of the DataFrame.
             force_single_group:
                 Force single group (empty set of group by labels) useful for DataFrame.apply() with axis=0
             force_list_like_to_series:
@@ -4019,14 +4023,6 @@ def groupby_apply(
                 + f"level={level}, and axis={axis}"
             )
 
-        if "include_groups" in agg_kwargs:
-            # exclude "include_groups" from the apply function kwargs
-            include_groups = agg_kwargs.pop("include_groups")
-            if not include_groups:
-                ErrorMessage.not_implemented(
-                    f"No support for groupby.apply with include_groups = {include_groups}"
-                )
-
         sort = groupby_kwargs.get("sort", True)
         as_index = groupby_kwargs.get("as_index", True)
         dropna = groupby_kwargs.get("dropna", True)
@@ -4051,17 +4047,36 @@ def groupby_apply(
         )
 
         snowflake_type_map = self._modin_frame.quoted_identifier_to_snowflake_type()
-
-        # For DataFrameGroupBy, `func` operates on this frame in its entirety.
-        # For SeriesGroupBy, this frame may also include some grouping columns
-        # that `func` should not take as input. In that case, the only column
-        # that `func` takes as input is the last data column, so grab just that
-        # column with a slice starting at index -1 and ending at None.
-        input_data_column_identifiers = (
-            self._modin_frame.data_column_snowflake_quoted_identifiers[
-                slice(-1, None) if series_groupby else slice(None)
-            ]
-        )
+        input_data_column_positions = [
+            i
+            for i, identifier in enumerate(
+                self._modin_frame.data_column_snowflake_quoted_identifiers
+            )
+            if (
+                (
+                    # For SeriesGroupBy, this frame may also include some
+                    # grouping columns that `func` should not take as input. In
+                    # that case, the only column that `func` takes as input is
+                    # the last data column, so take just that column.
+                    # include_groups has no effect.
+                    i
+                    == len(self._modin_frame.data_column_snowflake_quoted_identifiers)
+                    - 1
+                )
+                if series_groupby
+                else (
+                    # For DataFrameGroupBy, if include_groups, we apply the
+                    # function to all data columns. Otherwise, we exclude
+                    # data columns that we are grouping by.
+                    include_groups
+                    or identifier not in by_snowflake_quoted_identifiers_list
+                )
+            )
+        ]
+        input_data_column_identifiers = [
+            self._modin_frame.data_column_snowflake_quoted_identifiers[i]
+            for i in input_data_column_positions
+        ]
 
         # TODO(SNOW-1210489): When type hints show that `agg_func` returns a
         # scalar, we can use a vUDF instead of a vUDTF and we can skip the
@@ -4070,7 +4085,9 @@ def groupby_apply(
             agg_func,
             agg_args,
             agg_kwargs,
-            data_column_index=self._modin_frame.data_columns_index,
+            data_column_index=self._modin_frame.data_columns_index[
+                input_data_column_positions
+            ],
             index_column_names=self._modin_frame.index_column_pandas_labels,
             input_data_column_types=[
                 snowflake_type_map[quoted_identifier]
@@ -8511,6 +8528,7 @@ def wrapped_func(*args, **kwargs):  # type: ignore[no-untyped-def]  # pragma: no cover
             series_groupby=True,
             force_single_group=True,
             force_list_like_to_series=True,
+            include_groups=True,
         )
 
         data_col_result_frame = data_col_qc._modin_frame
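
For reference, a minimal sketch of the pandas-level behavior this enables (assumes an active Snowpark session and the Snowpark pandas plugin; the frame and column names are illustrative):

```python
import modin.pandas as pd
import snowflake.snowpark.modin.plugin  # noqa: F401 -- registers the Snowflake backend

df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1, 2, 3]})

# With include_groups=False, `func` no longer sees the grouping column "key":
# each group passed to the lambda contains only "val", matching pandas 2.2+
# semantics for DataFrameGroupBy.apply.
result = df.groupby("key").apply(lambda g: g.sum(), include_groups=False)
```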

src/snowflake/snowpark/modin/plugin/docstrings/groupby.py

Lines changed: 3 additions & 0 deletions
@@ -1078,6 +1078,9 @@ def apply():
         A callable that takes a dataframe or series as its first argument, and
         returns a dataframe, a series or a scalar. In addition the
         callable may take positional and keyword arguments.
+    include_groups : bool, default True
+        When True, will apply ``func`` to the groups in the case that they
+        are columns of the DataFrame.
     args, kwargs : tuple and dict
         Optional positional and keyword arguments to pass to ``func``.
 