Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,7 @@
- Improved performance of `Series.to_snowflake` and `pd.to_snowflake(series)` for large data by uploading data via a parquet file. You can control the dataset size at which Snowpark pandas switches to parquet with the variable `modin.config.PandasToSnowflakeParquetThresholdBytes`.
- Set `cte_optimization_enabled` to True for all Snowpark pandas sessions.
- Add support for `isin` in faster pandas.
- Add support for `str.contains`, `str.startswith`, `str.endswith`, and `str.slice` in faster pandas.

## 1.39.1 (2025-09-25)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17116,6 +17116,21 @@ def str_encode(self, encoding: str, errors: str) -> None:

def str_startswith(
    self, pat: Union[str, tuple], na: object = None
) -> "SnowflakeQueryCompiler":
    """
    Faster-pandas entry point for ``Series.str.startswith``.

    Delegates the real work to ``_str_startswith_internal`` and, when this
    compiler carries a relaxed query compiler, runs the same operation on it
    so the result keeps the faster-pandas fast path available.
    """
    relaxed = (
        None
        if self._relaxed_query_compiler is None
        else self._relaxed_query_compiler._str_startswith_internal(pat=pat, na=na)
    )
    return self._maybe_set_relaxed_qc(
        self._str_startswith_internal(pat=pat, na=na), relaxed
    )

def _str_startswith_internal(
self, pat: Union[str, tuple], na: object = None
) -> "SnowflakeQueryCompiler":
"""
Test if the start of each string element matches a pattern.
Expand All @@ -17135,6 +17150,21 @@ def str_startswith(

def str_endswith(
    self, pat: Union[str, tuple], na: object = None
) -> "SnowflakeQueryCompiler":
    """
    Faster-pandas entry point for ``Series.str.endswith``.

    Delegates the real work to ``_str_endswith_internal`` and, when this
    compiler carries a relaxed query compiler, runs the same operation on it
    so the result keeps the faster-pandas fast path available.
    """
    relaxed = (
        None
        if self._relaxed_query_compiler is None
        else self._relaxed_query_compiler._str_endswith_internal(pat=pat, na=na)
    )
    return self._maybe_set_relaxed_qc(
        self._str_endswith_internal(pat=pat, na=na), relaxed
    )

def _str_endswith_internal(
self, pat: Union[str, tuple], na: object = None
) -> "SnowflakeQueryCompiler":
"""
Test if the end of each string element matches a pattern.
def str_contains(
    self,
    pat: str,
    case: bool = True,
    flags: int = 0,
    na: object = None,
    regex: bool = True,
) -> "SnowflakeQueryCompiler":
    """
    Faster-pandas entry point for ``Series.str.contains``.

    Delegates the real work to ``_str_contains_internal`` and, when this
    compiler carries a relaxed query compiler, runs the same operation on it
    so the result keeps the faster-pandas fast path available.
    """
    # Forward every argument unchanged to the internal implementation.
    kwargs = dict(pat=pat, case=case, flags=flags, na=na, regex=regex)
    relaxed = (
        None
        if self._relaxed_query_compiler is None
        else self._relaxed_query_compiler._str_contains_internal(**kwargs)
    )
    return self._maybe_set_relaxed_qc(
        self._str_contains_internal(**kwargs), relaxed
    )

def _str_contains_internal(
self,
pat: str,
case: bool = True,
flags: int = 0,
na: object = None,
regex: bool = True,
) -> "SnowflakeQueryCompiler":
"""
Test if pattern or regex is contained within a string of a Series or Index.
def str_slice(
    self,
    start: Optional[int] = None,
    stop: Optional[int] = None,
    step: Optional[int] = None,
) -> "SnowflakeQueryCompiler":
    """
    Faster-pandas entry point for ``Series.str.slice``.

    Delegates the real work to ``_str_slice_internal`` and, when this
    compiler carries a relaxed query compiler, runs the same operation on it
    so the result keeps the faster-pandas fast path available.
    """
    relaxed = (
        None
        if self._relaxed_query_compiler is None
        else self._relaxed_query_compiler._str_slice_internal(
            start=start, stop=stop, step=step
        )
    )
    return self._maybe_set_relaxed_qc(
        self._str_slice_internal(start=start, stop=stop, step=step), relaxed
    )

def _str_slice_internal(
self,
start: Optional[int] = None,
stop: Optional[int] = None,
step: Optional[int] = None,
) -> "SnowflakeQueryCompiler":
"""
Slice substrings from each element in the Series or Index.
Expand Down
94 changes: 93 additions & 1 deletion tests/integ/modin/test_faster_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,11 @@
_SNOWPARK_PANDAS_DUMMY_ROW_POS_OPTIMIZATION_ENABLED,
Session,
)
from tests.integ.modin.utils import assert_frame_equal, assert_index_equal
from tests.integ.modin.utils import (
assert_frame_equal,
assert_index_equal,
assert_series_equal,
)
from tests.integ.utils.sql_counter import sql_count_checker
from tests.utils import Utils

Expand Down Expand Up @@ -278,6 +282,94 @@ def test_isin_series(session):
assert_frame_equal(snow_result, native_result, check_dtype=False)


@sql_count_checker(query_count=3)
def test_str_contains(session):
    # Materialize a temp table with three single-column string rows.
    table_name = Utils.random_name_for_temp_object(TempObjectType.TABLE)
    source = native_pd.DataFrame([["abc"], ["def"], ["ghi"]], columns=["A"])
    session.create_dataframe(source).write.save_as_table(
        table_name, table_type="temp"
    )

    # Run str.contains through Snowpark pandas.
    df = pd.read_snowflake(table_name)
    snow_result = df["A"].str.contains("ab")

    # The input frame must carry a relaxed query compiler in dummy-row-pos mode...
    input_relaxed = df._query_compiler._relaxed_query_compiler
    assert input_relaxed is not None
    assert input_relaxed._dummy_row_pos_mode is True
    # ...and the str-accessor output must preserve it.
    output_relaxed = snow_result._query_compiler._relaxed_query_compiler
    assert output_relaxed is not None
    assert output_relaxed._dummy_row_pos_mode is True

    # Compare against native pandas on the same data.
    native_df = df.to_pandas()
    native_result = native_df["A"].str.contains("ab")
    assert_series_equal(snow_result, native_result, check_dtype=False)


@pytest.mark.parametrize("func", ["startswith", "endswith"])
@sql_count_checker(query_count=3)
def test_str_startswith_endswith(session, func):
    # Materialize a temp table; "abc"/"cba" exercise both ends of the match.
    table_name = Utils.random_name_for_temp_object(TempObjectType.TABLE)
    source = native_pd.DataFrame([["abc"], ["def"], ["cba"]], columns=["A"])
    session.create_dataframe(source).write.save_as_table(
        table_name, table_type="temp"
    )

    # Dispatch to str.startswith or str.endswith by name.
    df = pd.read_snowflake(table_name)
    snow_result = getattr(df["A"].str, func)("c")

    # The input frame must carry a relaxed query compiler in dummy-row-pos mode...
    input_relaxed = df._query_compiler._relaxed_query_compiler
    assert input_relaxed is not None
    assert input_relaxed._dummy_row_pos_mode is True
    # ...and the str-accessor output must preserve it.
    output_relaxed = snow_result._query_compiler._relaxed_query_compiler
    assert output_relaxed is not None
    assert output_relaxed._dummy_row_pos_mode is True

    # Compare against native pandas on the same data.
    native_df = df.to_pandas()
    native_result = getattr(native_df["A"].str, func)("c")
    assert_series_equal(snow_result, native_result, check_dtype=False)


@sql_count_checker(query_count=3)
def test_str_slice(session):
    # Materialize a temp table with three single-column string rows.
    table_name = Utils.random_name_for_temp_object(TempObjectType.TABLE)
    source = native_pd.DataFrame([["abc"], ["def"], ["ghi"]], columns=["A"])
    session.create_dataframe(source).write.save_as_table(
        table_name, table_type="temp"
    )

    # Slice the first two characters of each value.
    df = pd.read_snowflake(table_name)
    snow_result = df["A"].str.slice(0, 2, 1)

    # The input frame must carry a relaxed query compiler in dummy-row-pos mode...
    input_relaxed = df._query_compiler._relaxed_query_compiler
    assert input_relaxed is not None
    assert input_relaxed._dummy_row_pos_mode is True
    # ...and the str-accessor output must preserve it.
    output_relaxed = snow_result._query_compiler._relaxed_query_compiler
    assert output_relaxed is not None
    assert output_relaxed._dummy_row_pos_mode is True

    # Compare against native pandas on the same data.
    native_df = df.to_pandas()
    native_result = native_df["A"].str.slice(0, 2, 1)
    assert_series_equal(snow_result, native_result, check_dtype=False)


@sql_count_checker(query_count=0)
def test_dummy_row_pos_optimization_enabled_on_session(db_parameters):
with Session.builder.configs(db_parameters).create() as new_session:
Expand Down