Skip to content

Commit 0c077b7

Browse files
SNOW-2401303: Add support for str.contains/startswith/endswith/slice in faster pandas (#3868)
1 parent 88d3f0e commit 0c077b7

File tree

3 files changed

+179
-1
lines changed

3 files changed

+179
-1
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -45,6 +45,7 @@
4545
- Improved performance of `Series.to_snowflake` and `pd.to_snowflake(series)` for large data by uploading data via a parquet file. You can control the dataset size at which Snowpark pandas switches to parquet with the variable `modin.config.PandasToSnowflakeParquetThresholdBytes`.
4646
- Set `cte_optimization_enabled` to True for all Snowpark pandas sessions.
4747
- Add support for `isna`, `isnull`, `notna`, `notnull` in faster pandas.
48+
- Add support for `str.contains`, `str.startswith`, `str.endswith`, and `str.slice` in faster pandas.
4849

4950
## 1.40.0 (2025-10-02)
5051

src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py

Lines changed: 85 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -17116,6 +17116,21 @@ def str_encode(self, encoding: str, errors: str) -> None:
1711617116

1711717117
def str_startswith(
    self, pat: Union[str, tuple], na: object = None
) -> "SnowflakeQueryCompiler":
    """
    Faster-pandas entry point for ``str.startswith``.

    Delegates to ``_str_startswith_internal`` on this query compiler and,
    when one is attached, on its relaxed counterpart, then carries the
    relaxed result over onto the strict result.
    """
    relaxed = self._relaxed_query_compiler
    # Only compute the relaxed variant when a relaxed compiler exists.
    relaxed_result = (
        None if relaxed is None else relaxed._str_startswith_internal(pat=pat, na=na)
    )
    return self._maybe_set_relaxed_qc(
        self._str_startswith_internal(pat=pat, na=na), relaxed_result
    )
17131+
17132+
def _str_startswith_internal(
17133+
self, pat: Union[str, tuple], na: object = None
1711917134
) -> "SnowflakeQueryCompiler":
1712017135
"""
1712117136
Test if the start of each string element matches a pattern.
@@ -17135,6 +17150,21 @@ def str_startswith(
1713517150

1713617151
def str_endswith(
    self, pat: Union[str, tuple], na: object = None
) -> "SnowflakeQueryCompiler":
    """
    Faster-pandas entry point for ``str.endswith``.

    Delegates to ``_str_endswith_internal`` on this query compiler and,
    when one is attached, on its relaxed counterpart, then carries the
    relaxed result over onto the strict result.
    """
    relaxed = self._relaxed_query_compiler
    # Only compute the relaxed variant when a relaxed compiler exists.
    relaxed_result = (
        None if relaxed is None else relaxed._str_endswith_internal(pat=pat, na=na)
    )
    return self._maybe_set_relaxed_qc(
        self._str_endswith_internal(pat=pat, na=na), relaxed_result
    )
17165+
17166+
def _str_endswith_internal(
17167+
self, pat: Union[str, tuple], na: object = None
1713817168
) -> "SnowflakeQueryCompiler":
1713917169
"""
1714017170
Test if the end of each string element matches a pattern.
@@ -17490,6 +17520,38 @@ def str_contains(
1749017520
def str_contains(
    self,
    pat: str,
    case: bool = True,
    flags: int = 0,
    na: object = None,
    regex: bool = True,
) -> "SnowflakeQueryCompiler":
    """
    Faster-pandas entry point for ``str.contains``.

    Delegates to ``_str_contains_internal`` on this query compiler and,
    when one is attached, on its relaxed counterpart, then carries the
    relaxed result over onto the strict result.
    """
    relaxed = self._relaxed_query_compiler
    relaxed_result = None
    if relaxed is not None:
        # Mirror the exact argument set on the relaxed compiler.
        relaxed_result = relaxed._str_contains_internal(
            pat=pat, case=case, flags=flags, na=na, regex=regex
        )
    strict_result = self._str_contains_internal(
        pat=pat, case=case, flags=flags, na=na, regex=regex
    )
    return self._maybe_set_relaxed_qc(strict_result, relaxed_result)
17548+
def _str_contains_internal(
17549+
self,
17550+
pat: str,
17551+
case: bool = True,
17552+
flags: int = 0,
17553+
na: object = None,
17554+
regex: bool = True,
1749317555
) -> "SnowflakeQueryCompiler":
1749417556
"""
1749517557
Test if pattern or regex is contained within a string of a Series or Index.
@@ -17851,6 +17913,29 @@ def str_slice(
1785117913
def str_slice(
    self,
    start: Optional[int] = None,
    stop: Optional[int] = None,
    step: Optional[int] = None,
) -> "SnowflakeQueryCompiler":
    """
    Faster-pandas entry point for ``str.slice``.

    Delegates to ``_str_slice_internal`` on this query compiler and,
    when one is attached, on its relaxed counterpart, then carries the
    relaxed result over onto the strict result.
    """
    relaxed = self._relaxed_query_compiler
    relaxed_result = None
    if relaxed is not None:
        # Mirror the exact argument set on the relaxed compiler.
        relaxed_result = relaxed._str_slice_internal(
            start=start, stop=stop, step=step
        )
    strict_result = self._str_slice_internal(start=start, stop=stop, step=step)
    return self._maybe_set_relaxed_qc(strict_result, relaxed_result)
17934+
def _str_slice_internal(
17935+
self,
17936+
start: Optional[int] = None,
17937+
stop: Optional[int] = None,
17938+
step: Optional[int] = None,
1785417939
) -> "SnowflakeQueryCompiler":
1785517940
"""
1785617941
Slice substrings from each element in the Series or Index.

tests/integ/modin/test_faster_pandas.py

Lines changed: 93 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,11 @@
1414
_SNOWPARK_PANDAS_DUMMY_ROW_POS_OPTIMIZATION_ENABLED,
1515
Session,
1616
)
17-
from tests.integ.modin.utils import assert_frame_equal, assert_index_equal
17+
from tests.integ.modin.utils import (
18+
assert_frame_equal,
19+
assert_index_equal,
20+
assert_series_equal,
21+
)
1822
from tests.integ.utils.sql_counter import sql_count_checker
1923
from tests.utils import Utils
2024

@@ -278,6 +282,94 @@ def test_isin_series(session):
278282
assert_frame_equal(snow_result, native_result, check_dtype=False)
279283

280284

285+
@sql_count_checker(query_count=3)
def test_str_contains(session):
    # Materialize a small temp table with a single string column "A".
    tmp_table = Utils.random_name_for_temp_object(TempObjectType.TABLE)
    session.create_dataframe(
        native_pd.DataFrame([["abc"], ["def"], ["ghi"]], columns=["A"])
    ).write.save_as_table(tmp_table, table_type="temp")

    # Read it back through Snowpark pandas and apply str.contains.
    snow_df = pd.read_snowflake(tmp_table)
    snow_result = snow_df["A"].str.contains("ab")

    # The input frame must carry a relaxed query compiler in dummy-row-pos mode...
    input_relaxed = snow_df._query_compiler._relaxed_query_compiler
    assert input_relaxed is not None
    assert input_relaxed._dummy_row_pos_mode is True
    # ...and str.contains must propagate it onto the output.
    output_relaxed = snow_result._query_compiler._relaxed_query_compiler
    assert output_relaxed is not None
    assert output_relaxed._dummy_row_pos_mode is True

    # Results must match native pandas on the same data.
    expected = snow_df.to_pandas()["A"].str.contains("ab")
    assert_series_equal(snow_result, expected)
312+
313+
314+
@pytest.mark.parametrize("func", ["startswith", "endswith"])
@sql_count_checker(query_count=3)
def test_str_startswith_endswith(session, func):
    # Materialize a small temp table with a single string column "A";
    # "abc" starts with "c" is False / "cba" ends with "c" is False, so both
    # parametrized methods produce a mixed True/False result.
    tmp_table = Utils.random_name_for_temp_object(TempObjectType.TABLE)
    session.create_dataframe(
        native_pd.DataFrame([["abc"], ["def"], ["cba"]], columns=["A"])
    ).write.save_as_table(tmp_table, table_type="temp")

    # Read it back through Snowpark pandas and apply the parametrized method.
    snow_df = pd.read_snowflake(tmp_table)
    snow_result = getattr(snow_df["A"].str, func)("c")

    # The input frame must carry a relaxed query compiler in dummy-row-pos mode...
    input_relaxed = snow_df._query_compiler._relaxed_query_compiler
    assert input_relaxed is not None
    assert input_relaxed._dummy_row_pos_mode is True
    # ...and the string method must propagate it onto the output.
    output_relaxed = snow_result._query_compiler._relaxed_query_compiler
    assert output_relaxed is not None
    assert output_relaxed._dummy_row_pos_mode is True

    # Results must match native pandas on the same data.
    expected = getattr(snow_df.to_pandas()["A"].str, func)("c")
    assert_series_equal(snow_result, expected)
342+
343+
344+
@sql_count_checker(query_count=3)
def test_str_slice(session):
    # Materialize a small temp table with a single string column "A".
    tmp_table = Utils.random_name_for_temp_object(TempObjectType.TABLE)
    session.create_dataframe(
        native_pd.DataFrame([["abc"], ["def"], ["ghi"]], columns=["A"])
    ).write.save_as_table(tmp_table, table_type="temp")

    # Read it back through Snowpark pandas and slice each string.
    snow_df = pd.read_snowflake(tmp_table)
    snow_result = snow_df["A"].str.slice(0, 2, 1)

    # The input frame must carry a relaxed query compiler in dummy-row-pos mode...
    input_relaxed = snow_df._query_compiler._relaxed_query_compiler
    assert input_relaxed is not None
    assert input_relaxed._dummy_row_pos_mode is True
    # ...and str.slice must propagate it onto the output.
    output_relaxed = snow_result._query_compiler._relaxed_query_compiler
    assert output_relaxed is not None
    assert output_relaxed._dummy_row_pos_mode is True

    # Results must match native pandas on the same data.
    expected = snow_df.to_pandas()["A"].str.slice(0, 2, 1)
    assert_series_equal(snow_result, expected)
371+
372+
281373
@sql_count_checker(query_count=0)
282374
def test_dummy_row_pos_optimization_enabled_on_session(db_parameters):
283375
with Session.builder.configs(db_parameters).create() as new_session:

0 commit comments

Comments
 (0)