Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
- Improved performance of `Series.to_snowflake` and `pd.to_snowflake(series)` for large data by uploading data via a parquet file. You can control the dataset size at which Snowpark pandas switches to parquet with the variable `modin.config.PandasToSnowflakeParquetThresholdBytes`.
- Set `cte_optimization_enabled` to True for all Snowpark pandas sessions.
- Add support for `isna`, `isnull`, `notna`, `notnull` in faster pandas.
- Add support for `str.contains`, `str.startswith`, `str.endswith`, and `str.slice` in faster pandas.

## 1.40.0 (2025-10-02)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17116,6 +17116,21 @@ def str_encode(self, encoding: str, errors: str) -> None:

def str_startswith(
    self, pat: Union[str, tuple], na: object = None
) -> "SnowflakeQueryCompiler":
    """
    Dispatch ``str.startswith`` to the internal implementation, propagating
    the relaxed query compiler so the result stays usable in faster pandas.
    """
    relaxed = self._relaxed_query_compiler
    relaxed_result = (
        None if relaxed is None else relaxed._str_startswith_internal(pat=pat, na=na)
    )
    return self._maybe_set_relaxed_qc(
        self._str_startswith_internal(pat=pat, na=na), relaxed_result
    )

def _str_startswith_internal(
self, pat: Union[str, tuple], na: object = None
) -> "SnowflakeQueryCompiler":
"""
Test if the start of each string element matches a pattern.
Expand All @@ -17135,6 +17150,21 @@ def str_startswith(

def str_endswith(
    self, pat: Union[str, tuple], na: object = None
) -> "SnowflakeQueryCompiler":
    """
    Dispatch ``str.endswith`` to the internal implementation, propagating
    the relaxed query compiler so the result stays usable in faster pandas.
    """
    relaxed = self._relaxed_query_compiler
    relaxed_result = (
        None if relaxed is None else relaxed._str_endswith_internal(pat=pat, na=na)
    )
    return self._maybe_set_relaxed_qc(
        self._str_endswith_internal(pat=pat, na=na), relaxed_result
    )

def _str_endswith_internal(
self, pat: Union[str, tuple], na: object = None
) -> "SnowflakeQueryCompiler":
"""
Test if the end of each string element matches a pattern.
Expand Down Expand Up @@ -17490,6 +17520,38 @@ def str_contains(
flags: int = 0,
na: object = None,
regex: bool = True,
) -> "SnowflakeQueryCompiler":
"""
Wrapper around _str_contains_internal to be supported in faster pandas.
"""
relaxed_query_compiler = None
if self._relaxed_query_compiler is not None:
relaxed_query_compiler = (
self._relaxed_query_compiler._str_contains_internal(
pat=pat,
case=case,
flags=flags,
na=na,
regex=regex,
)
)

qc = self._str_contains_internal(
pat=pat,
case=case,
flags=flags,
na=na,
regex=regex,
)
return self._maybe_set_relaxed_qc(qc, relaxed_query_compiler)

def _str_contains_internal(
self,
pat: str,
case: bool = True,
flags: int = 0,
na: object = None,
regex: bool = True,
) -> "SnowflakeQueryCompiler":
"""
Test if pattern or regex is contained within a string of a Series or Index.
Expand Down Expand Up @@ -17851,6 +17913,29 @@ def str_slice(
start: Optional[int] = None,
stop: Optional[int] = None,
step: Optional[int] = None,
) -> "SnowflakeQueryCompiler":
"""
Wrapper around _str_slice_internal to be supported in faster pandas.
"""
relaxed_query_compiler = None
if self._relaxed_query_compiler is not None:
relaxed_query_compiler = self._relaxed_query_compiler._str_slice_internal(
start=start,
stop=stop,
step=step,
)
qc = self._str_slice_internal(
start=start,
stop=stop,
step=step,
)
return self._maybe_set_relaxed_qc(qc, relaxed_query_compiler)

def _str_slice_internal(
self,
start: Optional[int] = None,
stop: Optional[int] = None,
step: Optional[int] = None,
) -> "SnowflakeQueryCompiler":
"""
Slice substrings from each element in the Series or Index.
Expand Down
94 changes: 93 additions & 1 deletion tests/integ/modin/test_faster_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,11 @@
_SNOWPARK_PANDAS_DUMMY_ROW_POS_OPTIMIZATION_ENABLED,
Session,
)
from tests.integ.modin.utils import assert_frame_equal, assert_index_equal
from tests.integ.modin.utils import (
assert_frame_equal,
assert_index_equal,
assert_series_equal,
)
from tests.integ.utils.sql_counter import sql_count_checker
from tests.utils import Utils

Expand Down Expand Up @@ -278,6 +282,94 @@ def test_isin_series(session):
assert_frame_equal(snow_result, native_result, check_dtype=False)


@sql_count_checker(query_count=3)
def test_str_contains(session):
    """``str.contains`` should propagate the relaxed query compiler in faster pandas."""
    # Materialize a small temp table to read back through Snowpark pandas.
    table_name = Utils.random_name_for_temp_object(TempObjectType.TABLE)
    source = native_pd.DataFrame([["abc"], ["def"], ["ghi"]], columns=["A"])
    session.create_dataframe(source).write.save_as_table(
        table_name, table_type="temp"
    )

    snow_df = pd.read_snowflake(table_name)
    snow_result = snow_df["A"].str.contains("ab")

    # The input frame carries a relaxed query compiler in dummy-row-pos mode...
    input_relaxed = snow_df._query_compiler._relaxed_query_compiler
    assert input_relaxed is not None
    assert input_relaxed._dummy_row_pos_mode is True
    # ...and the str.contains result must carry one too.
    output_relaxed = snow_result._query_compiler._relaxed_query_compiler
    assert output_relaxed is not None
    assert output_relaxed._dummy_row_pos_mode is True

    # Compare against vanilla pandas applied to the same data.
    native_df = snow_df.to_pandas()
    native_result = native_df["A"].str.contains("ab")
    assert_series_equal(snow_result, native_result)


@pytest.mark.parametrize("func", ["startswith", "endswith"])
@sql_count_checker(query_count=3)
def test_str_startswith_endswith(session, func):
    """``str.startswith``/``str.endswith`` should propagate the relaxed query compiler."""
    # Materialize a small temp table to read back through Snowpark pandas.
    table_name = Utils.random_name_for_temp_object(TempObjectType.TABLE)
    source = native_pd.DataFrame([["abc"], ["def"], ["cba"]], columns=["A"])
    session.create_dataframe(source).write.save_as_table(
        table_name, table_type="temp"
    )

    snow_df = pd.read_snowflake(table_name)
    snow_result = getattr(snow_df["A"].str, func)("c")

    # The input frame carries a relaxed query compiler in dummy-row-pos mode...
    input_relaxed = snow_df._query_compiler._relaxed_query_compiler
    assert input_relaxed is not None
    assert input_relaxed._dummy_row_pos_mode is True
    # ...and the string-method result must carry one too.
    output_relaxed = snow_result._query_compiler._relaxed_query_compiler
    assert output_relaxed is not None
    assert output_relaxed._dummy_row_pos_mode is True

    # Compare against vanilla pandas applied to the same data.
    native_df = snow_df.to_pandas()
    native_result = getattr(native_df["A"].str, func)("c")
    assert_series_equal(snow_result, native_result)


@sql_count_checker(query_count=3)
def test_str_slice(session):
    """``str.slice`` should propagate the relaxed query compiler in faster pandas."""
    # Materialize a small temp table to read back through Snowpark pandas.
    table_name = Utils.random_name_for_temp_object(TempObjectType.TABLE)
    source = native_pd.DataFrame([["abc"], ["def"], ["ghi"]], columns=["A"])
    session.create_dataframe(source).write.save_as_table(
        table_name, table_type="temp"
    )

    snow_df = pd.read_snowflake(table_name)
    snow_result = snow_df["A"].str.slice(0, 2, 1)

    # The input frame carries a relaxed query compiler in dummy-row-pos mode...
    input_relaxed = snow_df._query_compiler._relaxed_query_compiler
    assert input_relaxed is not None
    assert input_relaxed._dummy_row_pos_mode is True
    # ...and the str.slice result must carry one too.
    output_relaxed = snow_result._query_compiler._relaxed_query_compiler
    assert output_relaxed is not None
    assert output_relaxed._dummy_row_pos_mode is True

    # Compare against vanilla pandas applied to the same data.
    native_df = snow_df.to_pandas()
    native_result = native_df["A"].str.slice(0, 2, 1)
    assert_series_equal(snow_result, native_result)


@sql_count_checker(query_count=0)
def test_dummy_row_pos_optimization_enabled_on_session(db_parameters):
with Session.builder.configs(db_parameters).create() as new_session:
Expand Down