|
     dense_rank,
     first_value,
     floor,
+    get,
     greatest,
     hour,
     iff,
@@ -16813,10 +16814,6 @@ def str_split(
             ErrorMessage.not_implemented(
                 "Snowpark pandas doesn't support non-str 'pat' argument"
             )
-        if expand:
-            ErrorMessage.not_implemented(
-                "Snowpark pandas doesn't support 'expand' argument"
-            )
         if regex:
             ErrorMessage.not_implemented(
                 "Snowpark pandas doesn't support 'regex' argument"
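
For reference (not part of the patch): removing this guard is what enables expand=True. A minimal pandas sketch of the semantics the new code mirrors, assuming an ordinary string Series:

    import pandas as pd

    s = pd.Series(["a,b,c", "a,b", None])
    # expand=True returns one column per split slot; rows with fewer
    # splits are padded with missing values, and a null input row
    # yields nulls in every output column.
    print(s.str.split(",", expand=True))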
@@ -16864,6 +16861,12 @@ def output_col(
             if np.isnan(n):
                 # Follow pandas behavior
                 return pandas_lit(np.nan)
+            elif n < -1 and not pandas.isnull(pat) and len(str(pat)) > 1:
+                # Follow pandas behavior, which, based on our experiments, leaves
+                # the input column as is whenever the above condition is satisfied.
+                new_col = iff(
+                    column.is_null(), pandas_lit(None), array_construct(column)
+                )
             elif n <= 0:
                 # If all possible splits are requested, we just use SQL's split function.
                 new_col = builtin("split")(new_col, pandas_lit(new_pat))
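
An aside on the n < -1 branch (illustrative literals, not from the patch): the behavior the comment describes is consistent with pandas treating a multi-character pat as a regular expression when regex is unset, because Python's re.split performs no splits at all when maxsplit is negative, unlike str.split:

    import re

    # str.split treats any negative maxsplit as "split without limit" ...
    "a::b::c".split("::", -2)      # -> ['a', 'b', 'c']
    # ... while re.split performs no splits when maxsplit is negative,
    # matching the "input column left as is" behavior handled above.
    re.split("::", "a::b::c", -2)  # -> ['a::b::c']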
@@ -16907,9 +16910,93 @@ def output_col(
             )
             return self._replace_non_str(column, new_col)
 
-        new_internal_frame = self._modin_frame.apply_snowpark_function_to_columns(
-            lambda col_name: output_col(col_name, pat, n)
-        )
+        def output_cols(
+            column: SnowparkColumn, pat: Optional[str], n: int, max_splits: int
+        ) -> list[SnowparkColumn]:
+            """
+            Returns the list of columns that the input column will be split into.
+            This is only used when expand=True.
+            Args:
+                column : SnowparkColumn
+                    Input column
+                pat : str
+                    String to split on
+                n : int
+                    Limit on the number of output splits
+                max_splits : int
+                    Maximum number of achievable splits across all values in the input column.
+                    This is needed so that rows with fewer splits than desired can be padded with nulls.
+            """
+            col = output_col(column, pat, n)
+            final_splits = 0
+
+            if np.isnan(n):
+                # Follow pandas behavior
+                final_splits = 1
+            elif n <= 0:
+                final_splits = max_splits
+            else:
+                final_splits = min(n + 1, max_splits)
+
+            if n < -1 and not pandas.isnull(pat) and len(str(pat)) > 1:
+                # Follow pandas behavior, which, based on our experiments, leaves
+                # the input column as is whenever the above condition is satisfied.
+                final_splits = 1
+
+            return [
+                iff(
+                    array_size(col) > pandas_lit(i),
+                    get(col, pandas_lit(i)),
+                    pandas_lit(None),
+                )
+                for i in range(final_splits)
+            ]
+
+        def get_max_splits() -> int:
+            """
+            Returns the maximum number of splits achievable
+            across all values stored in the input column.
+            """
+            splits_as_list_frame = self.str_split(
+                pat=pat,
+                n=-1,
+                expand=False,
+                regex=regex,
+            )._modin_frame
+
+            split_counts_frame = splits_as_list_frame.append_column(
+                "split_counts",
+                array_size(
+                    col(
+                        splits_as_list_frame.data_column_snowflake_quoted_identifiers[0]
+                    )
+                ),
+            )
+
+            max_count_rows = split_counts_frame.ordered_dataframe.agg(
+                max_(
+                    col(split_counts_frame.data_column_snowflake_quoted_identifiers[-1])
+                ).as_("max_count")
+            ).collect()
+
+            return max_count_rows[0][0]
+
+        if expand:
+            cols = output_cols(
+                col(self._modin_frame.data_column_snowflake_quoted_identifiers[0]),
+                pat,
+                n,
+                get_max_splits(),
+            )
+            new_internal_frame = self._modin_frame.project_columns(
+                list(range(len(cols))),
+                cols,
+            )
+        else:
+            new_internal_frame = self._modin_frame.apply_snowpark_function_to_columns(
+                lambda col_name: output_col(col_name, pat, n)
+            )
+
         return SnowflakeQueryCompiler(new_internal_frame)
 
     def str_rsplit(
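
For intuition, a plain-Python sketch (illustrative values and names, not part of the patch) of what get_max_splits and output_cols compute server-side: first find the widest split across all rows, then pad every row out to that width with nulls:

    values = ["a,b,c", "a,b", None]

    # get_max_splits: the widest unlimited split across all non-null rows.
    max_splits = max(len(v.split(",")) for v in values if v is not None)  # 3

    # output_cols: one output column per slot; shorter rows are padded with None.
    rows = [
        (v.split(",") + [None] * max_splits)[:max_splits]
        if v is not None
        else [None] * max_splits
        for v in values
    ]
    # rows == [['a', 'b', 'c'], ['a', 'b', None], [None, None, None]]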
|