From 5296ddf176ec4432bdc2a96d909350c849bb96ed Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Thu, 12 Dec 2024 19:44:07 -0800 Subject: [PATCH 1/9] [SNOW-1705797]: Use cached metadata to make `repr` faster on simple DataFrames --- .../plugin/_internal/ordered_dataframe.py | 39 +++++++++++++++++-- 1 file changed, 35 insertions(+), 4 deletions(-) diff --git a/src/snowflake/snowpark/modin/plugin/_internal/ordered_dataframe.py b/src/snowflake/snowpark/modin/plugin/_internal/ordered_dataframe.py index aa111b8a96..b62bad3710 100644 --- a/src/snowflake/snowpark/modin/plugin/_internal/ordered_dataframe.py +++ b/src/snowflake/snowpark/modin/plugin/_internal/ordered_dataframe.py @@ -421,10 +421,21 @@ def ensure_row_count_column(self) -> "OrderedDataFrame": wrap_double_underscore=True, )[0] ) - ordered_dataframe = self.select( - *self.projected_column_snowflake_quoted_identifiers, - count("*").over().as_(row_count_snowflake_quoted_identifier), - ) + if not self.is_projection_of_table(): + ordered_dataframe = self.select( + *self.projected_column_snowflake_quoted_identifiers, + count("*").over().as_(row_count_snowflake_quoted_identifier), + ) + else: + from snowflake.snowpark.modin.plugin._internal.utils import pandas_lit + + row_count = self.select(count("*").as_("__count_of_rows__"),).collect()[ + 0 + ][0] + ordered_dataframe = self.select( + *self.projected_column_snowflake_quoted_identifiers, + pandas_lit(row_count).as_(row_count_snowflake_quoted_identifier), + ) # inplace update so dataframe_ref can be shared. Note that we keep # the original ordering columns. ordered_dataframe.row_count_snowflake_quoted_identifier = ( @@ -2020,3 +2031,23 @@ def sample(self, n: Optional[int], frac: Optional[float]) -> "OrderedDataFrame": ordering_columns=self.ordering_columns, ) ) + + def is_projection_of_table(self) -> bool: + """ + Return whether or not the current OrderedDataFrame is simply a projection of a table. 
+ + Returns: + bool + True if the current OrderedDataFrame is simply a projection of a table. False if it represents + a more complex operation. + """ + # If we have only performed projections since creating this DataFrame, it will only contain + # 1 API call in the plan - either `Session.sql` for DataFrames based off of I/O operations + # e.g. `read_snowflake` or `read_csv`, or `Session.create_dataframe` for DataFrames created + # out of Python objects. + snowpark_df = self._dataframe_ref.snowpark_dataframe + snowpark_plan = snowpark_df._plan + return len(snowpark_plan.api_calls) == 1 and any( + accepted_api in snowpark_plan.api_calls[0]["name"] + for accepted_api in ["Session.sql", "Session.create_dataframe"] + ) From b0c271e5eed9c20bbb038553b63de0f75976d255 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Thu, 12 Dec 2024 20:19:19 -0800 Subject: [PATCH 2/9] Use count correctly --- .../snowpark/modin/plugin/_internal/ordered_dataframe.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/snowflake/snowpark/modin/plugin/_internal/ordered_dataframe.py b/src/snowflake/snowpark/modin/plugin/_internal/ordered_dataframe.py index b62bad3710..650ceff90a 100644 --- a/src/snowflake/snowpark/modin/plugin/_internal/ordered_dataframe.py +++ b/src/snowflake/snowpark/modin/plugin/_internal/ordered_dataframe.py @@ -429,9 +429,7 @@ def ensure_row_count_column(self) -> "OrderedDataFrame": else: from snowflake.snowpark.modin.plugin._internal.utils import pandas_lit - row_count = self.select(count("*").as_("__count_of_rows__"),).collect()[ - 0 - ][0] + row_count = self._dataframe_ref.snowpark_dataframe.count() ordered_dataframe = self.select( *self.projected_column_snowflake_quoted_identifiers, pandas_lit(row_count).as_(row_count_snowflake_quoted_identifier), From 64359f6254b6ba4092bb7b4be4aa5441bdf27123 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Tue, 17 Dec 2024 15:30:45 -0800 Subject: [PATCH 3/9] Fix first 10 files' query counts --- 
tests/integ/modin/frame/test_add_suffix.py | 2 +- tests/integ/modin/frame/test_aggregate.py | 2 +- tests/integ/modin/frame/test_attrs.py | 2 +- tests/integ/modin/frame/test_empty.py | 2 +- tests/integ/modin/frame/test_getitem.py | 27 ++-- tests/integ/modin/frame/test_head_tail.py | 60 ++++----- tests/integ/modin/frame/test_iat.py | 119 ++++++++++-------- tests/integ/modin/frame/test_insert.py | 54 ++++---- tests/integ/modin/test_chained_operations.py | 2 +- .../integ/modin/test_from_pandas_to_pandas.py | 2 +- tests/integ/modin/test_numpy.py | 6 +- 11 files changed, 148 insertions(+), 130 deletions(-) diff --git a/tests/integ/modin/frame/test_add_suffix.py b/tests/integ/modin/frame/test_add_suffix.py index 3cb338ca6d..5b6c2a91f7 100644 --- a/tests/integ/modin/frame/test_add_suffix.py +++ b/tests/integ/modin/frame/test_add_suffix.py @@ -71,7 +71,7 @@ def test_df_add_suffix_snowpark_pandas_series( ) -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=3) def test_df_add_prefix_snowpark_pandas_df( default_index_snowpark_pandas_df, default_index_native_df ): diff --git a/tests/integ/modin/frame/test_aggregate.py b/tests/integ/modin/frame/test_aggregate.py index 7003c802d5..4a14241237 100644 --- a/tests/integ/modin/frame/test_aggregate.py +++ b/tests/integ/modin/frame/test_aggregate.py @@ -170,7 +170,7 @@ def test_string_sum(data, numeric_only_kwargs): ) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=2) def test_string_sum_of_reversed_df(): # check that we get the string concatenation right even when the dataframe # is not in its original order. diff --git a/tests/integ/modin/frame/test_attrs.py b/tests/integ/modin/frame/test_attrs.py index bfa972d16a..b89bd7d8d4 100644 --- a/tests/integ/modin/frame/test_attrs.py +++ b/tests/integ/modin/frame/test_attrs.py @@ -68,7 +68,7 @@ def func(df): # Tests that attrs is preserved across `take`, a unary operation that returns a Snowpark pandas object. 
# Other unary operators are checked by other tests in the `eval_snowpark_pandas_result` method. -@sql_count_checker(query_count=0) +@sql_count_checker(query_count=1) def test_df_attrs_take(): def func(df): df.attrs = {"A": [1], "B": "check me"} diff --git a/tests/integ/modin/frame/test_empty.py b/tests/integ/modin/frame/test_empty.py index 9eaecc36dd..842d072064 100644 --- a/tests/integ/modin/frame/test_empty.py +++ b/tests/integ/modin/frame/test_empty.py @@ -27,7 +27,7 @@ ({"A": [np.nan]}, "np nan column"), ], ) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=2) def test_dataframe_empty_param(dataframe_input, test_case_name): eval_snowpark_pandas_result( pd.DataFrame(dataframe_input), diff --git a/tests/integ/modin/frame/test_getitem.py b/tests/integ/modin/frame/test_getitem.py index e1f676b4ff..abf9e9834f 100644 --- a/tests/integ/modin/frame/test_getitem.py +++ b/tests/integ/modin/frame/test_getitem.py @@ -85,7 +85,7 @@ def get_helper(df): return df[key] # 5 extra queries for iter - with SqlCounter(query_count=6 if isinstance(key, native_pd.Index) else 1): + with SqlCounter(query_count=7 if isinstance(key, native_pd.Index) else 1): eval_snowpark_pandas_result( default_index_snowpark_pandas_df, default_index_native_df, @@ -120,7 +120,7 @@ def get_helper(df): snowpark_df = pd.DataFrame(native_df) # 5 extra queries for iter - with SqlCounter(query_count=6 if isinstance(key, native_pd.Index) else 1): + with SqlCounter(query_count=7 if isinstance(key, native_pd.Index) else 1): eval_snowpark_pandas_result( snowpark_df, native_df, @@ -321,15 +321,19 @@ def test_df_getitem_calls_getitem(): slice(-100, None, -2), ], ) -@sql_count_checker(query_count=1) def test_df_getitem_with_slice( key, default_index_snowpark_pandas_df, default_index_native_df ): - eval_snowpark_pandas_result( - default_index_snowpark_pandas_df, - default_index_native_df, - lambda df: df[key], - ) + if key.start is None: + expected_query_count = 1 + else: + expected_query_count = 2 + 
with SqlCounter(query_count=expected_query_count): + eval_snowpark_pandas_result( + default_index_snowpark_pandas_df, + default_index_native_df, + lambda df: df[key], + ) @pytest.mark.parametrize( @@ -364,6 +368,7 @@ def test_df_getitem_with_non_int_slice(key): def test_df_getitem_with_multiindex( key, default_index_native_df, multiindex_native, native_df_with_multiindex_columns ): + expected_query_count = 2 if isinstance(key, slice) else 1 # Test __getitem__ with df with MultiIndex index. native_df = default_index_native_df.set_index(multiindex_native) snowpark_df = pd.DataFrame(native_df) @@ -377,13 +382,13 @@ def test_df_getitem_with_multiindex( ) else _key ) - with SqlCounter(query_count=1): + with SqlCounter(query_count=expected_query_count): eval_snowpark_pandas_result(snowpark_df, native_df, lambda df: df[_key]) # Test __getitem__ with df with MultiIndex columns. native_df = native_df_with_multiindex_columns snowpark_df = pd.DataFrame(native_df) - with SqlCounter(query_count=1): + with SqlCounter(query_count=expected_query_count): eval_snowpark_pandas_result( snowpark_df, native_df, lambda df: df[key], check_column_type=False ) @@ -391,7 +396,7 @@ def test_df_getitem_with_multiindex( # Test __getitem__ with df with MultiIndex index. 
native_df = native_df_with_multiindex_columns.set_index(multiindex_native) snowpark_df = pd.DataFrame(native_df) - with SqlCounter(query_count=1): + with SqlCounter(query_count=expected_query_count): eval_snowpark_pandas_result( snowpark_df, native_df, lambda df: df[key], check_column_type=False ) diff --git a/tests/integ/modin/frame/test_head_tail.py b/tests/integ/modin/frame/test_head_tail.py index 231bec17ca..2f38bb1d76 100644 --- a/tests/integ/modin/frame/test_head_tail.py +++ b/tests/integ/modin/frame/test_head_tail.py @@ -13,7 +13,7 @@ assert_snowpark_pandas_equals_to_pandas_without_dtypecheck, eval_snowpark_pandas_result, ) -from tests.integ.utils.sql_counter import sql_count_checker +from tests.integ.utils.sql_counter import SqlCounter def eval_result_and_query_with_no_join( @@ -34,41 +34,43 @@ def eval_result_and_query_with_no_join( "n", [1, None, 0, -1, -10, 5, 10], ) -@sql_count_checker(query_count=2) def test_head_tail(n, default_index_snowpark_pandas_df, default_index_native_df): - eval_snowpark_pandas_result( - default_index_snowpark_pandas_df, - default_index_native_df, - lambda df: (df.head() if n is None else df.head(n)), - comparator=eval_result_and_query_with_no_join, - ) + expected_query_count = 2 if n == 0 else 3 + with SqlCounter(query_count=expected_query_count): + eval_snowpark_pandas_result( + default_index_snowpark_pandas_df, + default_index_native_df, + lambda df: (df.head() if n is None else df.head(n)), + comparator=eval_result_and_query_with_no_join, + ) - eval_snowpark_pandas_result( - default_index_snowpark_pandas_df, - default_index_native_df, - lambda df: (df.tail() if n is None else df.tail(n)), - comparator=eval_result_and_query_with_no_join, - ) + eval_snowpark_pandas_result( + default_index_snowpark_pandas_df, + default_index_native_df, + lambda df: (df.tail() if n is None else df.tail(n)), + comparator=eval_result_and_query_with_no_join, + ) @pytest.mark.parametrize( "n", [1, None, 0, -1, -10, 5, 10], ) 
-@sql_count_checker(query_count=2) def test_empty_dataframe(n, empty_snowpark_pandas_df): - eval_snowpark_pandas_result( - empty_snowpark_pandas_df, - native_pd.DataFrame(), - lambda df: (df.head() if n is None else df.head(n)), - comparator=eval_result_and_query_with_no_join, - check_column_type=False, - ) + expected_query_count = 2 if n == 0 else 3 + with SqlCounter(query_count=expected_query_count): + eval_snowpark_pandas_result( + empty_snowpark_pandas_df, + native_pd.DataFrame(), + lambda df: (df.head() if n is None else df.head(n)), + comparator=eval_result_and_query_with_no_join, + check_column_type=False, + ) - eval_snowpark_pandas_result( - empty_snowpark_pandas_df, - native_pd.DataFrame(), - lambda df: (df.tail() if n is None else df.tail(n)), - comparator=eval_result_and_query_with_no_join, - check_column_type=False, - ) + eval_snowpark_pandas_result( + empty_snowpark_pandas_df, + native_pd.DataFrame(), + lambda df: (df.tail() if n is None else df.tail(n)), + comparator=eval_result_and_query_with_no_join, + check_column_type=False, + ) diff --git a/tests/integ/modin/frame/test_iat.py b/tests/integ/modin/frame/test_iat.py index 4f2c685e89..a1c42540dc 100644 --- a/tests/integ/modin/frame/test_iat.py +++ b/tests/integ/modin/frame/test_iat.py @@ -7,25 +7,29 @@ from pandas.errors import IndexingError from tests.integ.modin.utils import eval_snowpark_pandas_result # noqa: F401 -from tests.integ.utils.sql_counter import sql_count_checker +from tests.integ.utils.sql_counter import sql_count_checker, SqlCounter @pytest.mark.parametrize( - "key", + "key, expected_query_count", [ - (0, 0), - (0, -7), - (-7, 0), - (-7, -7), + ((0, 0), 1), + ((0, -7), 1), + ((-7, 0), 2), + ((-7, -7), 2), ], ) -@sql_count_checker(query_count=1) def test_iat_get_default_index_str_columns( key, + expected_query_count, default_index_snowpark_pandas_df, default_index_native_df, ): - assert default_index_snowpark_pandas_df.iat[key] == default_index_native_df.iat[key] + with 
SqlCounter(query_count=expected_query_count): + assert ( + default_index_snowpark_pandas_df.iat[key] + == default_index_native_df.iat[key] + ) @pytest.mark.parametrize( @@ -37,7 +41,7 @@ def test_iat_get_default_index_str_columns( (-7, -7), ], ) -@sql_count_checker(query_count=1, join_count=2) +@sql_count_checker(query_count=2, join_count=2) def test_iat_set_default_index_str_columns( key, default_index_snowpark_pandas_df, @@ -55,21 +59,22 @@ def iat_set_helper(df): @pytest.mark.parametrize( - "key", + "key, expected_query_count", [ - (0, 0), - (0, -7), - (-7, 0), - (-7, -7), + ((0, 0), 1), + ((0, -7), 1), + ((-7, 0), 2), + ((-7, -7), 2), ], ) -@sql_count_checker(query_count=1) def test_iat_get_str_index_str_columns( key, + expected_query_count, str_index_snowpark_pandas_df, str_index_native_df, ): - assert str_index_snowpark_pandas_df.iat[key] == str_index_native_df.iat[key] + with SqlCounter(query_count=expected_query_count): + assert str_index_snowpark_pandas_df.iat[key] == str_index_native_df.iat[key] @pytest.mark.parametrize( @@ -81,7 +86,7 @@ def test_iat_get_str_index_str_columns( (-7, -7), ], ) -@sql_count_checker(query_count=1, join_count=2) +@sql_count_checker(query_count=2, join_count=2) def test_iat_set_str_index_str_columns( key, str_index_snowpark_pandas_df, @@ -96,21 +101,22 @@ def iat_set_helper(df): @pytest.mark.parametrize( - "key", + "key, expected_query_count", [ - (0, 0), - (0, -7), - (-7, 0), - (-7, -7), + ((0, 0), 1), + ((0, -7), 1), + ((-7, 0), 2), + ((-7, -7), 2), ], ) -@sql_count_checker(query_count=1) def test_iat_get_time_index_time_columns( key, + expected_query_count, time_index_snowpark_pandas_df, time_index_native_df, ): - assert time_index_snowpark_pandas_df.iat[key] == time_index_native_df.iat[key] + with SqlCounter(query_count=expected_query_count): + assert time_index_snowpark_pandas_df.iat[key] == time_index_native_df.iat[key] @pytest.mark.parametrize( @@ -122,7 +128,7 @@ def test_iat_get_time_index_time_columns( (-7, -7), ], ) 
-@sql_count_checker(query_count=1, join_count=2) +@sql_count_checker(query_count=2, join_count=2) def test_iat_set_time_index_time_columns( key, time_index_snowpark_pandas_df, @@ -140,23 +146,24 @@ def iat_set_helper(df): @pytest.mark.parametrize( - "key", + "key, expected_query_count", [ - (0, 0), - (0, -7), - (-7, 0), - (-7, -7), + ((0, 0), 1), + ((0, -7), 1), + ((-7, 0), 2), + ((-7, -7), 2), ], ) -@sql_count_checker(query_count=1) def test_iat_get_multiindex_index_str_columns( key, + expected_query_count, default_index_native_df, multiindex_native, ): - native_df = default_index_native_df.set_index(multiindex_native) - snowpark_df = pd.DataFrame(native_df) - assert snowpark_df.iat[key] == native_df.iat[key] + with SqlCounter(query_count=expected_query_count): + native_df = default_index_native_df.set_index(multiindex_native) + snowpark_df = pd.DataFrame(native_df) + assert snowpark_df.iat[key] == native_df.iat[key] @pytest.mark.parametrize( @@ -168,7 +175,7 @@ def test_iat_get_multiindex_index_str_columns( (-7, -7), ], ) -@sql_count_checker(query_count=1, join_count=2) +@sql_count_checker(query_count=2, join_count=2) def test_iat_set_multiindex_index_str_columns( key, default_index_native_df, @@ -183,22 +190,23 @@ def at_set_helper(df): @pytest.mark.parametrize( - "key", + "key, expected_query_count", [ - (0, 0), - (0, -7), - (-7, 0), - (-7, -7), + ((0, 0), 1), + ((0, -7), 1), + ((-7, 0), 2), + ((-7, -7), 2), ], ) -@sql_count_checker(query_count=1) def test_iat_get_default_index_multiindex_columns( key, + expected_query_count, native_df_with_multiindex_columns, ): - native_df = native_df_with_multiindex_columns - snowpark_df = pd.DataFrame(native_df) - assert snowpark_df.iat[key] == native_df.iat[key] + with SqlCounter(query_count=expected_query_count): + native_df = native_df_with_multiindex_columns + snowpark_df = pd.DataFrame(native_df) + assert snowpark_df.iat[key] == native_df.iat[key] @pytest.mark.parametrize( @@ -210,7 +218,7 @@ def 
test_iat_get_default_index_multiindex_columns( (-7, -7), ], ) -@sql_count_checker(query_count=1, join_count=2) +@sql_count_checker(query_count=2, join_count=2) def test_iat_set_default_index_multiindex_columns( key, native_df_with_multiindex_columns, @@ -224,23 +232,24 @@ def at_set_helper(df): @pytest.mark.parametrize( - "key", + "key, expected_query_count", [ - (0, 0), - (0, -7), - (-7, 0), - (-7, -7), + ((0, 0), 1), + ((0, -7), 1), + ((-7, 0), 2), + ((-7, -7), 2), ], ) -@sql_count_checker(query_count=1) def test_iat_get_multiindex_index_multiindex_columns( key, + expected_query_count, native_df_with_multiindex_columns, multiindex_native, ): - native_df = native_df_with_multiindex_columns.set_index(multiindex_native) - snowpark_df = pd.DataFrame(native_df) - assert snowpark_df.iat[key] == native_df.iat[key] + with SqlCounter(query_count=expected_query_count): + native_df = native_df_with_multiindex_columns.set_index(multiindex_native) + snowpark_df = pd.DataFrame(native_df) + assert snowpark_df.iat[key] == native_df.iat[key] @pytest.mark.parametrize( @@ -252,7 +261,7 @@ def test_iat_get_multiindex_index_multiindex_columns( (-7, -7), ], ) -@sql_count_checker(query_count=1, join_count=2) +@sql_count_checker(query_count=2, join_count=2) def test_iat_set_multiindex_index_multiindex_columns( key, native_df_with_multiindex_columns, diff --git a/tests/integ/modin/frame/test_insert.py b/tests/integ/modin/frame/test_insert.py index 974823d02c..f9a421323e 100644 --- a/tests/integ/modin/frame/test_insert.py +++ b/tests/integ/modin/frame/test_insert.py @@ -54,7 +54,7 @@ def native_df(): ), ], ) -@sql_count_checker(query_count=5, join_count=3) +@sql_count_checker(query_count=6, join_count=3) def test_insert_snowpark_pandas_objects(native_df, native_value): snow_df = pd.DataFrame(native_df) value = pd.DataFrame(native_value) @@ -99,7 +99,7 @@ def test_insert_snowpark_pandas_objects(native_df, native_value): ), ], ) -@sql_count_checker(query_count=2, join_count=1) 
+@sql_count_checker(query_count=3, join_count=1) def test_insert_one_to_many(native_df, native_value): snow_df = pd.DataFrame(native_df) value = pd.DataFrame(native_value) @@ -128,11 +128,11 @@ def test_insert_one_to_many(native_df, native_value): @pytest.mark.parametrize( "value, expected_query_count, expected_join_count", [ - (np.array(["a", "b", "c"]), 2, 1), # numpy array of shape (N,) - (np.array([["a"], ["b"], ["c"]]), 2, 1), # numpy array of shape (N, 1) - (["a", "b", "c"], 2, 1), # python list + (np.array(["a", "b", "c"]), 3, 1), # numpy array of shape (N,) + (np.array([["a"], ["b"], ["c"]]), 3, 1), # numpy array of shape (N, 1) + (["a", "b", "c"], 3, 1), # python list ({0: 1, 1: 2, 4: 3}, 1, 1), # python dict - (("a", "b", "c"), 2, 1), # python tuple + (("a", "b", "c"), 3, 1), # python tuple ], ) def test_insert_array_like(native_df, value, expected_query_count, expected_join_count): @@ -187,7 +187,7 @@ def test_insert_pandas_types_negative(snow_df): snow_df.insert(0, "col3", value) -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=3) def test_insert_dataframe_shape_negative(native_df): # DataFrame with more than one column snow_df = pd.DataFrame(native_df) @@ -209,10 +209,10 @@ def test_insert_dataframe_shape_negative(native_df): [ # NOTE: Accepted numpy array shapes are (N,) or (N, 1) where N = number of rows = 3 (np.ones((3, 2)), 0), - (np.ones((6, 1)), 1), - (np.ones((1, 1)), 1), - ([1, 2], 1), # len < number of rows - ((6, 7, 8, 9), 1), # len > number of rows + (np.ones((6, 1)), 2), + (np.ones((1, 1)), 2), + ([1, 2], 2), # len < number of rows + ((6, 7, 8, 9), 2), # len > number of rows ({"a", "b", "c"}, 0), # python set ], ) @@ -227,7 +227,7 @@ def test_insert_value_negative(native_df, value, expected_query_count): ) -@sql_count_checker(query_count=2, join_count=1) +@sql_count_checker(query_count=3, join_count=1) def test_insert_duplicate_label(native_df): snow_df = pd.DataFrame(native_df) eval_snowpark_pandas_result( @@ -250,7 
+250,7 @@ def test_insert_duplicate_label_negative(native_df): @pytest.mark.parametrize("loc", [0, 1, 2]) -@sql_count_checker(query_count=2, join_count=1) +@sql_count_checker(query_count=3, join_count=1) def test_insert_loc(native_df, loc): snow_df = pd.DataFrame(native_df) eval_snowpark_pandas_result( @@ -262,7 +262,7 @@ def test_insert_loc(native_df, loc): @pytest.mark.parametrize( - "loc, expected_query_count", [(-99, 1), (-1, 1), (99, 1), ("1", 0)] + "loc, expected_query_count", [(-99, 2), (-1, 2), (99, 2), ("1", 0)] ) def test_insert_loc_negative(native_df, loc, expected_query_count): with SqlCounter(query_count=expected_query_count): @@ -278,10 +278,10 @@ def test_insert_loc_negative(native_df, loc, expected_query_count): @pytest.mark.parametrize( "value, expected_query_count, expected_join_count", [ - (np.array(["a", "b", "c", "d"]), 2, 1), # numpy array of shape (N,) - (np.array([["a"], ["b"], ["c"], ["d"]]), 2, 1), # numpy array of shape (N, 1) - (["a", "b", "c", "d"], 2, 1), # python list - (("a", "b", "c", "d"), 2, 1), # python tuple + (np.array(["a", "b", "c", "d"]), 3, 1), # numpy array of shape (N,) + (np.array([["a"], ["b"], ["c"], ["d"]]), 3, 1), # numpy array of shape (N, 1) + (["a", "b", "c", "d"], 3, 1), # python list + (("a", "b", "c", "d"), 3, 1), # python tuple ({(3, 1): 1}, 1, 1), # python dict ("abc", 1, 0), # sting scalar (1, 1, 0), # int scalar @@ -311,7 +311,7 @@ def test_insert_multiindex_array_like_and_scalar( ("a", "b", "c", "d"), # python tuple ], ) -@sql_count_checker(query_count=2, join_count=1) +@sql_count_checker(query_count=3, join_count=1) def test_insert_empty_multiindex_frame(value): mi = pd.MultiIndex.from_arrays([np.array([], dtype=int), np.array([], dtype=int)]) snow_df = pd.DataFrame([], index=mi) @@ -352,7 +352,7 @@ def test_insert_multiindex_dict_negative(): ([1.0, 2.5, 3.0], [1, 2, 3]), # Long and Double can be joined ], ) -@sql_count_checker(query_count=4, join_count=1) +@sql_count_checker(query_count=5, join_count=1) 
def test_insert_compatible_index(df_index, value_index): snow_df = pd.DataFrame({"col1": ["p", "q", "r"]}, index=native_pd.Index(df_index)) value = pd.DataFrame({"col2": ["x", "y", "z"]}, index=native_pd.Index(value_index)) @@ -383,7 +383,7 @@ def test_insert_compatible_index(df_index, value_index): ), # length and type mismatch ], ) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=2) def test_insert_index_num_levels_mismatch_negative(df_index, value_index): snow_df = pd.DataFrame({"col1": ["p", "q", "r"]}, index=native_pd.Index(df_index)) value = pd.DataFrame({"col2": ["w", "x", "y"]}, index=native_pd.Index(value_index)) @@ -408,7 +408,7 @@ def test_insert_index_num_levels_mismatch_negative(df_index, value_index): ), # type mismatch boolean != long ], ) -@sql_count_checker(query_count=2, join_count=1) +@sql_count_checker(query_count=3, join_count=1) def test_insert_index_type_mismatch(df_index, value_index, expected_index): # Note: This is different behavior than native pandas. In native pandas when # index datatype mismatch new columns in inserted will all NULL values. 
@@ -467,7 +467,7 @@ def test_insert_multiple_null(): "index, value, expected_query_count, expected_join_count", [ ([1, 2], native_pd.Series([1, 2], index=[2, 3]), 1, 1), - ([1, 2], [3, 4], 2, 1), + ([1, 2], [3, 4], 3, 1), ], ) def test_insert_into_empty_dataframe_with_index( @@ -528,6 +528,8 @@ def test_insert_into_empty_dataframe( expected_join_count = 0 if isinstance(value, list) or isinstance(value, np.ndarray): expected_query_count = 2 + if isinstance(value, (list, np.ndarray)): + expected_query_count += 1 snow_df = pd.DataFrame(data=data, columns=columns) native_df = native_pd.DataFrame(data=data, columns=columns) @@ -574,7 +576,7 @@ def test_insert_into_empty_dataframe_index_dtype_mismatch(): snow_df.to_pandas() -@sql_count_checker(query_count=2, join_count=1) +@sql_count_checker(query_count=3, join_count=1) def test_insert_empty_list_into_empty_dataframe(): snow_df = pd.DataFrame() native_df = native_pd.DataFrame() @@ -596,7 +598,7 @@ def test_insert_empty_list_into_empty_dataframe(): ([], ["A", "B", "C"]), ], ) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=2) def test_insert_into_empty_dataframe_negative(loc, data, columns): snow_df = pd.DataFrame(data=data, columns=columns) native_df = native_pd.DataFrame(data=data, columns=columns) @@ -771,7 +773,7 @@ def insert_op(df): assert_frame_equal(snow_res, expected_res, check_dtype=False) -@sql_count_checker(query_count=4, join_count=6) +@sql_count_checker(query_count=5, join_count=6) def test_insert_timedelta(): native_df = native_pd.DataFrame({"col1": [1, 2], "col2": [3, 4]}) snow_df = pd.DataFrame(native_df) diff --git a/tests/integ/modin/test_chained_operations.py b/tests/integ/modin/test_chained_operations.py index 9e496a2608..072b5fe455 100644 --- a/tests/integ/modin/test_chained_operations.py +++ b/tests/integ/modin/test_chained_operations.py @@ -11,7 +11,7 @@ from tests.utils import multithreaded_run -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=3) def 
test_chained_op1(): # bug fix SNOW-1348886 data = {"X": [1, 2, 3], "Y": [4, 5, 6]} diff --git a/tests/integ/modin/test_from_pandas_to_pandas.py b/tests/integ/modin/test_from_pandas_to_pandas.py index 469ea5ee52..14d2eb942e 100644 --- a/tests/integ/modin/test_from_pandas_to_pandas.py +++ b/tests/integ/modin/test_from_pandas_to_pandas.py @@ -536,7 +536,7 @@ def test_series_to_pandas(): assert_series_equal(snow_series.to_pandas(), pandas_series) -@sql_count_checker(query_count=2, union_count=1) +@sql_count_checker(query_count=3, union_count=1) def test_single_row_frame_to_series_to_pandas(): # create a Snowpark pandas with single row native_df = native_pd.DataFrame( diff --git a/tests/integ/modin/test_numpy.py b/tests/integ/modin/test_numpy.py index 7a322d1671..f907e2979e 100644 --- a/tests/integ/modin/test_numpy.py +++ b/tests/integ/modin/test_numpy.py @@ -66,7 +66,7 @@ def test_full_like(): snow_df = pd.DataFrame(data) pandas_df = native_pd.DataFrame(data) - with SqlCounter(query_count=2): + with SqlCounter(query_count=3): snow_result = np.full_like(snow_df, 1234) pandas_result = np.full_like(pandas_df, 1234) assert_array_equal(np.array(snow_result), np.array(pandas_result)) @@ -232,7 +232,7 @@ def test_np_ufunc_unary_operators(np_ufunc): # The query count here is from the argument logging performed by numpy on error -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=3) def test_np_ufunc_notimplemented(): data = { "A": [1], @@ -244,7 +244,7 @@ def test_np_ufunc_notimplemented(): # The query count here is from the argument logging performed by numpy on error -@sql_count_checker(query_count=3) +@sql_count_checker(query_count=4) def test_np_ufunc_with_out_notimpl(): data = { "A": [1], From e09978be6c6f2e31ccc7f33ec66c0bb1b9d228f2 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Tue, 17 Dec 2024 15:33:20 -0800 Subject: [PATCH 4/9] Add 3 more query counts --- tests/integ/modin/frame/test_iterrows.py | 6 +++--- 1 file changed, 3 insertions(+), 3 
deletions(-) diff --git a/tests/integ/modin/frame/test_iterrows.py b/tests/integ/modin/frame/test_iterrows.py index dc7a0e6d3d..e1dee762ff 100644 --- a/tests/integ/modin/frame/test_iterrows.py +++ b/tests/integ/modin/frame/test_iterrows.py @@ -61,7 +61,7 @@ def test_df_iterrows(native_df): snowpark_df = pd.DataFrame(native_df) # One query is used to get the number of rows. One query is used to retrieve each row - each query has 4 JOIN # operations performed due to iloc. - with SqlCounter(query_count=len(native_df) + 1): + with SqlCounter(query_count=len(native_df) + 2): eval_snowpark_pandas_result( snowpark_df, native_df, @@ -70,7 +70,7 @@ def test_df_iterrows(native_df): ) -@sql_count_checker(query_count=8, union_count=7) +@sql_count_checker(query_count=9, union_count=7) def test_df_iterrows_mixed_types(default_index_native_df): # Same test as above on bigger df with mixed types. # One query is used to get the number of rows. One query is used to retrieve each row - each query has 4 JOIN @@ -85,7 +85,7 @@ def test_df_iterrows_mixed_types(default_index_native_df): ) -@sql_count_checker(query_count=7, union_count=6) +@sql_count_checker(query_count=8, union_count=6) def test_df_iterrows_multindex_df(): # Create df with a MultiIndex index. # One query is used to get the number of rows. 
One query is used to retrieve each row - each query has 4 JOIN From df51d38d5c5e067cdafa8e4760c668dd3effba73 Mon Sep 17 00:00:00 2001 From: Jonathan Shi Date: Tue, 11 Feb 2025 15:30:10 -0800 Subject: [PATCH 5/9] bump sql query counts --- .../modin/interoperability/scikit-learn/test_scikit_learn.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integ/modin/interoperability/scikit-learn/test_scikit_learn.py b/tests/integ/modin/interoperability/scikit-learn/test_scikit_learn.py index 7cff7c7803..a8e0138a42 100644 --- a/tests/integ/modin/interoperability/scikit-learn/test_scikit_learn.py +++ b/tests/integ/modin/interoperability/scikit-learn/test_scikit_learn.py @@ -137,7 +137,7 @@ def get_cluster_centers(df) -> np.ndarray: class TestDimensionalityReduction: - @sql_count_checker(query_count=3) + @sql_count_checker(query_count=4) def test_principal_component_analysis(self, test_dfs): def get_principal_components(df) -> np.ndarray: return PCA(n_components=2).fit(df).components_ @@ -192,7 +192,7 @@ def validate_search_results(snow_estimator, pandas_estimator): class TestPreprocessing: - @sql_count_checker(query_count=5) + @sql_count_checker(query_count=6) def test_maxabs(self, test_dfs): eval_snowpark_pandas_result( *test_dfs, From 3337f4032e3ca066e8965911142bcb18e158d11e Mon Sep 17 00:00:00 2001 From: Jonathan Shi Date: Thu, 27 Mar 2025 13:20:35 -0700 Subject: [PATCH 6/9] materialize row count for simple projection repr --- .../plugin/_internal/ordered_dataframe.py | 11 +++ .../compiler/snowflake_query_compiler.py | 86 +++++++++++-------- 2 files changed, 62 insertions(+), 35 deletions(-) diff --git a/src/snowflake/snowpark/modin/plugin/_internal/ordered_dataframe.py b/src/snowflake/snowpark/modin/plugin/_internal/ordered_dataframe.py index 650ceff90a..b8afc843eb 100644 --- a/src/snowflake/snowpark/modin/plugin/_internal/ordered_dataframe.py +++ b/src/snowflake/snowpark/modin/plugin/_internal/ordered_dataframe.py @@ -441,6 +441,17 @@ def 
ensure_row_count_column(self) -> "OrderedDataFrame": ) return ordered_dataframe + def materialize_row_count(self) -> int: + """ + Perform a query to retrieve the row count of this OrderedDataFrame. + + Use this function in place of ensure_row_count_column() in scenarios where the extra + query is acceptable, and the embedded `COUNT(*) OVER()` window operation would be too expensive. + Performing a naked `COUNT(*)` and avoiding a potential window function or cross join is + more performant in these scenarios. + """ + return self._dataframe_ref.snowpark_dataframe.count() + def generate_snowflake_quoted_identifiers( self, *, diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py index c48cebefec..1f04b48355 100644 --- a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py +++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py @@ -13058,42 +13058,54 @@ def build_repr_df( `row_count` holds the number of rows the DataFrame has, `col_count` the number of columns the DataFrame has, and the pandas dataset with `num_rows` or fewer rows and `num_cols` or fewer columns. """ - # In order to issue less queries, use following trick: - # 1. add the row count column holding COUNT(*) OVER () over the snowpark dataframe - # 2. retrieve all columns - # 3. filter on rows with recursive count + # build_repr_df needs to know the row count of the underlying data, as the displayed representation will + # include the last few rows of the frame. + # + # To maximize performance, we use two distinct code paths. + # If the underlying OrderedDataFrame is a simple projection of a table: + # 1. Perform a query to retrieve the row count. This query will be cheap because the SQL engine can + # retrieve the value from table metadata. + # 2. Directly embed the row count into the filter query as a literal. 
+ # If the underlying data is NOT a simple projection, we opt to perform fewer queries: + # 1. add the row count column holding COUNT(*) OVER () over the snowpark dataframe + # 2. retrieve all columns + # 3. filter on rows with recursive count # Previously, 2 queries were issued, and a first version replaced them with a single query and a join # the solution here uses a window function. This may lead to perf regressions, track these here SNOW-984177. # Ensure that our reference to self._modin_frame is updated with cached row count and position. - self._modin_frame = ( - self._modin_frame.ensure_row_position_column().ensure_row_count_column() - ) - row_count_pandas_label = ( - ROW_COUNT_COLUMN_LABEL - if len(self._modin_frame.data_column_pandas_index_names) == 1 - else (ROW_COUNT_COLUMN_LABEL,) - * len(self._modin_frame.data_column_pandas_index_names) - ) - frame_with_row_count_and_position = InternalFrame.create( - ordered_dataframe=self._modin_frame.ordered_dataframe, - data_column_pandas_labels=self._modin_frame.data_column_pandas_labels - + [row_count_pandas_label], - data_column_snowflake_quoted_identifiers=self._modin_frame.data_column_snowflake_quoted_identifiers - + [self._modin_frame.row_count_snowflake_quoted_identifier], - data_column_pandas_index_names=self._modin_frame.data_column_pandas_index_names, - index_column_pandas_labels=self._modin_frame.index_column_pandas_labels, - index_column_snowflake_quoted_identifiers=self._modin_frame.index_column_snowflake_quoted_identifiers, - data_column_types=self._modin_frame.cached_data_column_snowpark_pandas_types - + [None], - index_column_types=self._modin_frame.cached_index_column_snowpark_pandas_types, - ) + row_count_value = None + if self._modin_frame.ordered_dataframe.is_projection_of_table(): + frame = self._modin_frame.ensure_row_position_column() + row_count_value = frame.ordered_dataframe.materialize_row_count() + row_count_expr = pandas_lit(row_count_value) + else: + frame = ( + 
self._modin_frame.ensure_row_position_column().ensure_row_count_column() + ) + row_count_pandas_label = ( + ROW_COUNT_COLUMN_LABEL + if len(frame.data_column_pandas_index_names) == 1 + else (ROW_COUNT_COLUMN_LABEL,) + * len(frame.data_column_pandas_index_names) + ) + frame = InternalFrame.create( + ordered_dataframe=frame.ordered_dataframe, + data_column_pandas_labels=frame.data_column_pandas_labels + + [row_count_pandas_label], + data_column_snowflake_quoted_identifiers=frame.data_column_snowflake_quoted_identifiers + + [frame.row_count_snowflake_quoted_identifier], + data_column_pandas_index_names=frame.data_column_pandas_index_names, + index_column_pandas_labels=frame.index_column_pandas_labels, + index_column_snowflake_quoted_identifiers=frame.index_column_snowflake_quoted_identifiers, + data_column_types=frame.cached_data_column_snowpark_pandas_types + + [None], + index_column_types=frame.cached_index_column_snowpark_pandas_types, + ) - row_count_identifier = ( - frame_with_row_count_and_position.row_count_snowflake_quoted_identifier - ) + row_count_expr = col(frame.row_count_snowflake_quoted_identifier) row_position_snowflake_quoted_identifier = ( - frame_with_row_count_and_position.row_position_snowflake_quoted_identifier + frame.row_position_snowflake_quoted_identifier ) # filter frame based on num_rows. 
@@ -13101,14 +13113,14 @@ def build_repr_df( # in the future could analyze plan to see whether retrieving column count would trigger a query, if not # simply filter out based on static schema num_rows_for_head_and_tail = num_rows_to_display // 2 + 1 - new_frame = frame_with_row_count_and_position.filter( + new_frame = frame.filter( ( col(row_position_snowflake_quoted_identifier) <= num_rows_for_head_and_tail ) | ( col(row_position_snowflake_quoted_identifier) - >= col(row_count_identifier) - num_rows_for_head_and_tail + >= row_count_expr - num_rows_for_head_and_tail ) ) @@ -13116,9 +13128,13 @@ def build_repr_df( new_qc = SnowflakeQueryCompiler(new_frame) pandas_frame = new_qc.to_pandas() - # remove last column after first retrieving row count - row_count = 0 if 0 == len(pandas_frame) else pandas_frame.iat[0, -1] - pandas_frame = pandas_frame.iloc[:, :-1] + if row_count_value is None: + # if we appended the row count column instead of directly doing a COUNT(*), splice it off + # remove last column after first retrieving row count + row_count = 0 if 0 == len(pandas_frame) else pandas_frame.iat[0, -1] + pandas_frame = pandas_frame.iloc[:, :-1] + else: + row_count = row_count_value col_count = len(pandas_frame.columns) return row_count, col_count, pandas_frame From cbf394f51e37d5864be02acf714def741a88c7d1 Mon Sep 17 00:00:00 2001 From: Jonathan Shi Date: Thu, 27 Mar 2025 14:40:51 -0700 Subject: [PATCH 7/9] revert query counts (need to fix test_repr) --- .../plugin/_internal/ordered_dataframe.py | 16 +-- tests/integ/modin/frame/test_add_suffix.py | 2 +- tests/integ/modin/frame/test_aggregate.py | 2 +- tests/integ/modin/frame/test_attrs.py | 2 +- tests/integ/modin/frame/test_empty.py | 2 +- tests/integ/modin/frame/test_getitem.py | 27 ++-- tests/integ/modin/frame/test_head_tail.py | 60 +++++---- tests/integ/modin/frame/test_iat.py | 119 ++++++++---------- tests/integ/modin/frame/test_insert.py | 54 ++++---- tests/integ/modin/frame/test_iterrows.py | 6 +- 
tests/integ/modin/frame/test_repr.py | 19 +-- .../scikit-learn/test_scikit_learn.py | 4 +- tests/integ/modin/test_chained_operations.py | 2 +- .../integ/modin/test_from_pandas_to_pandas.py | 2 +- tests/integ/modin/test_numpy.py | 6 +- 15 files changed, 149 insertions(+), 174 deletions(-) diff --git a/src/snowflake/snowpark/modin/plugin/_internal/ordered_dataframe.py b/src/snowflake/snowpark/modin/plugin/_internal/ordered_dataframe.py index b8afc843eb..1987e807a2 100644 --- a/src/snowflake/snowpark/modin/plugin/_internal/ordered_dataframe.py +++ b/src/snowflake/snowpark/modin/plugin/_internal/ordered_dataframe.py @@ -421,19 +421,11 @@ def ensure_row_count_column(self) -> "OrderedDataFrame": wrap_double_underscore=True, )[0] ) - if not self.is_projection_of_table(): - ordered_dataframe = self.select( - *self.projected_column_snowflake_quoted_identifiers, - count("*").over().as_(row_count_snowflake_quoted_identifier), - ) - else: - from snowflake.snowpark.modin.plugin._internal.utils import pandas_lit + ordered_dataframe = self.select( + *self.projected_column_snowflake_quoted_identifiers, + count("*").over().as_(row_count_snowflake_quoted_identifier), + ) - row_count = self._dataframe_ref.snowpark_dataframe.count() - ordered_dataframe = self.select( - *self.projected_column_snowflake_quoted_identifiers, - pandas_lit(row_count).as_(row_count_snowflake_quoted_identifier), - ) # inplace update so dataframe_ref can be shared. Note that we keep # the original ordering columns. 
ordered_dataframe.row_count_snowflake_quoted_identifier = ( diff --git a/tests/integ/modin/frame/test_add_suffix.py b/tests/integ/modin/frame/test_add_suffix.py index 5b6c2a91f7..3cb338ca6d 100644 --- a/tests/integ/modin/frame/test_add_suffix.py +++ b/tests/integ/modin/frame/test_add_suffix.py @@ -71,7 +71,7 @@ def test_df_add_suffix_snowpark_pandas_series( ) -@sql_count_checker(query_count=3) +@sql_count_checker(query_count=2) def test_df_add_prefix_snowpark_pandas_df( default_index_snowpark_pandas_df, default_index_native_df ): diff --git a/tests/integ/modin/frame/test_aggregate.py b/tests/integ/modin/frame/test_aggregate.py index 4a14241237..7003c802d5 100644 --- a/tests/integ/modin/frame/test_aggregate.py +++ b/tests/integ/modin/frame/test_aggregate.py @@ -170,7 +170,7 @@ def test_string_sum(data, numeric_only_kwargs): ) -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=1) def test_string_sum_of_reversed_df(): # check that we get the string concatenation right even when the dataframe # is not in its original order. diff --git a/tests/integ/modin/frame/test_attrs.py b/tests/integ/modin/frame/test_attrs.py index b89bd7d8d4..bfa972d16a 100644 --- a/tests/integ/modin/frame/test_attrs.py +++ b/tests/integ/modin/frame/test_attrs.py @@ -68,7 +68,7 @@ def func(df): # Tests that attrs is preserved across `take`, a unary operation that returns a Snowpark pandas object. # Other unary operators are checked by other tests in the `eval_snowpark_pandas_result` method. 
-@sql_count_checker(query_count=1) +@sql_count_checker(query_count=0) def test_df_attrs_take(): def func(df): df.attrs = {"A": [1], "B": "check me"} diff --git a/tests/integ/modin/frame/test_empty.py b/tests/integ/modin/frame/test_empty.py index 842d072064..9eaecc36dd 100644 --- a/tests/integ/modin/frame/test_empty.py +++ b/tests/integ/modin/frame/test_empty.py @@ -27,7 +27,7 @@ ({"A": [np.nan]}, "np nan column"), ], ) -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=1) def test_dataframe_empty_param(dataframe_input, test_case_name): eval_snowpark_pandas_result( pd.DataFrame(dataframe_input), diff --git a/tests/integ/modin/frame/test_getitem.py b/tests/integ/modin/frame/test_getitem.py index abf9e9834f..e1f676b4ff 100644 --- a/tests/integ/modin/frame/test_getitem.py +++ b/tests/integ/modin/frame/test_getitem.py @@ -85,7 +85,7 @@ def get_helper(df): return df[key] # 5 extra queries for iter - with SqlCounter(query_count=7 if isinstance(key, native_pd.Index) else 1): + with SqlCounter(query_count=6 if isinstance(key, native_pd.Index) else 1): eval_snowpark_pandas_result( default_index_snowpark_pandas_df, default_index_native_df, @@ -120,7 +120,7 @@ def get_helper(df): snowpark_df = pd.DataFrame(native_df) # 5 extra queries for iter - with SqlCounter(query_count=7 if isinstance(key, native_pd.Index) else 1): + with SqlCounter(query_count=6 if isinstance(key, native_pd.Index) else 1): eval_snowpark_pandas_result( snowpark_df, native_df, @@ -321,19 +321,15 @@ def test_df_getitem_calls_getitem(): slice(-100, None, -2), ], ) +@sql_count_checker(query_count=1) def test_df_getitem_with_slice( key, default_index_snowpark_pandas_df, default_index_native_df ): - if key.start is None: - expected_query_count = 1 - else: - expected_query_count = 2 - with SqlCounter(query_count=expected_query_count): - eval_snowpark_pandas_result( - default_index_snowpark_pandas_df, - default_index_native_df, - lambda df: df[key], - ) + eval_snowpark_pandas_result( + 
default_index_snowpark_pandas_df, + default_index_native_df, + lambda df: df[key], + ) @pytest.mark.parametrize( @@ -368,7 +364,6 @@ def test_df_getitem_with_non_int_slice(key): def test_df_getitem_with_multiindex( key, default_index_native_df, multiindex_native, native_df_with_multiindex_columns ): - expected_query_count = 2 if isinstance(key, slice) else 1 # Test __getitem__ with df with MultiIndex index. native_df = default_index_native_df.set_index(multiindex_native) snowpark_df = pd.DataFrame(native_df) @@ -382,13 +377,13 @@ def test_df_getitem_with_multiindex( ) else _key ) - with SqlCounter(query_count=expected_query_count): + with SqlCounter(query_count=1): eval_snowpark_pandas_result(snowpark_df, native_df, lambda df: df[_key]) # Test __getitem__ with df with MultiIndex columns. native_df = native_df_with_multiindex_columns snowpark_df = pd.DataFrame(native_df) - with SqlCounter(query_count=expected_query_count): + with SqlCounter(query_count=1): eval_snowpark_pandas_result( snowpark_df, native_df, lambda df: df[key], check_column_type=False ) @@ -396,7 +391,7 @@ def test_df_getitem_with_multiindex( # Test __getitem__ with df with MultiIndex index. 
native_df = native_df_with_multiindex_columns.set_index(multiindex_native) snowpark_df = pd.DataFrame(native_df) - with SqlCounter(query_count=expected_query_count): + with SqlCounter(query_count=1): eval_snowpark_pandas_result( snowpark_df, native_df, lambda df: df[key], check_column_type=False ) diff --git a/tests/integ/modin/frame/test_head_tail.py b/tests/integ/modin/frame/test_head_tail.py index 2f38bb1d76..231bec17ca 100644 --- a/tests/integ/modin/frame/test_head_tail.py +++ b/tests/integ/modin/frame/test_head_tail.py @@ -13,7 +13,7 @@ assert_snowpark_pandas_equals_to_pandas_without_dtypecheck, eval_snowpark_pandas_result, ) -from tests.integ.utils.sql_counter import SqlCounter +from tests.integ.utils.sql_counter import sql_count_checker def eval_result_and_query_with_no_join( @@ -34,43 +34,41 @@ def eval_result_and_query_with_no_join( "n", [1, None, 0, -1, -10, 5, 10], ) +@sql_count_checker(query_count=2) def test_head_tail(n, default_index_snowpark_pandas_df, default_index_native_df): - expected_query_count = 2 if n == 0 else 3 - with SqlCounter(query_count=expected_query_count): - eval_snowpark_pandas_result( - default_index_snowpark_pandas_df, - default_index_native_df, - lambda df: (df.head() if n is None else df.head(n)), - comparator=eval_result_and_query_with_no_join, - ) + eval_snowpark_pandas_result( + default_index_snowpark_pandas_df, + default_index_native_df, + lambda df: (df.head() if n is None else df.head(n)), + comparator=eval_result_and_query_with_no_join, + ) - eval_snowpark_pandas_result( - default_index_snowpark_pandas_df, - default_index_native_df, - lambda df: (df.tail() if n is None else df.tail(n)), - comparator=eval_result_and_query_with_no_join, - ) + eval_snowpark_pandas_result( + default_index_snowpark_pandas_df, + default_index_native_df, + lambda df: (df.tail() if n is None else df.tail(n)), + comparator=eval_result_and_query_with_no_join, + ) @pytest.mark.parametrize( "n", [1, None, 0, -1, -10, 5, 10], ) 
+@sql_count_checker(query_count=2) def test_empty_dataframe(n, empty_snowpark_pandas_df): - expected_query_count = 2 if n == 0 else 3 - with SqlCounter(query_count=expected_query_count): - eval_snowpark_pandas_result( - empty_snowpark_pandas_df, - native_pd.DataFrame(), - lambda df: (df.head() if n is None else df.head(n)), - comparator=eval_result_and_query_with_no_join, - check_column_type=False, - ) + eval_snowpark_pandas_result( + empty_snowpark_pandas_df, + native_pd.DataFrame(), + lambda df: (df.head() if n is None else df.head(n)), + comparator=eval_result_and_query_with_no_join, + check_column_type=False, + ) - eval_snowpark_pandas_result( - empty_snowpark_pandas_df, - native_pd.DataFrame(), - lambda df: (df.tail() if n is None else df.tail(n)), - comparator=eval_result_and_query_with_no_join, - check_column_type=False, - ) + eval_snowpark_pandas_result( + empty_snowpark_pandas_df, + native_pd.DataFrame(), + lambda df: (df.tail() if n is None else df.tail(n)), + comparator=eval_result_and_query_with_no_join, + check_column_type=False, + ) diff --git a/tests/integ/modin/frame/test_iat.py b/tests/integ/modin/frame/test_iat.py index a1c42540dc..4f2c685e89 100644 --- a/tests/integ/modin/frame/test_iat.py +++ b/tests/integ/modin/frame/test_iat.py @@ -7,29 +7,25 @@ from pandas.errors import IndexingError from tests.integ.modin.utils import eval_snowpark_pandas_result # noqa: F401 -from tests.integ.utils.sql_counter import sql_count_checker, SqlCounter +from tests.integ.utils.sql_counter import sql_count_checker @pytest.mark.parametrize( - "key, expected_query_count", + "key", [ - ((0, 0), 1), - ((0, -7), 1), - ((-7, 0), 2), - ((-7, -7), 2), + (0, 0), + (0, -7), + (-7, 0), + (-7, -7), ], ) +@sql_count_checker(query_count=1) def test_iat_get_default_index_str_columns( key, - expected_query_count, default_index_snowpark_pandas_df, default_index_native_df, ): - with SqlCounter(query_count=expected_query_count): - assert ( - default_index_snowpark_pandas_df.iat[key] - 
== default_index_native_df.iat[key] - ) + assert default_index_snowpark_pandas_df.iat[key] == default_index_native_df.iat[key] @pytest.mark.parametrize( @@ -41,7 +37,7 @@ def test_iat_get_default_index_str_columns( (-7, -7), ], ) -@sql_count_checker(query_count=2, join_count=2) +@sql_count_checker(query_count=1, join_count=2) def test_iat_set_default_index_str_columns( key, default_index_snowpark_pandas_df, @@ -59,22 +55,21 @@ def iat_set_helper(df): @pytest.mark.parametrize( - "key, expected_query_count", + "key", [ - ((0, 0), 1), - ((0, -7), 1), - ((-7, 0), 2), - ((-7, -7), 2), + (0, 0), + (0, -7), + (-7, 0), + (-7, -7), ], ) +@sql_count_checker(query_count=1) def test_iat_get_str_index_str_columns( key, - expected_query_count, str_index_snowpark_pandas_df, str_index_native_df, ): - with SqlCounter(query_count=expected_query_count): - assert str_index_snowpark_pandas_df.iat[key] == str_index_native_df.iat[key] + assert str_index_snowpark_pandas_df.iat[key] == str_index_native_df.iat[key] @pytest.mark.parametrize( @@ -86,7 +81,7 @@ def test_iat_get_str_index_str_columns( (-7, -7), ], ) -@sql_count_checker(query_count=2, join_count=2) +@sql_count_checker(query_count=1, join_count=2) def test_iat_set_str_index_str_columns( key, str_index_snowpark_pandas_df, @@ -101,22 +96,21 @@ def iat_set_helper(df): @pytest.mark.parametrize( - "key, expected_query_count", + "key", [ - ((0, 0), 1), - ((0, -7), 1), - ((-7, 0), 2), - ((-7, -7), 2), + (0, 0), + (0, -7), + (-7, 0), + (-7, -7), ], ) +@sql_count_checker(query_count=1) def test_iat_get_time_index_time_columns( key, - expected_query_count, time_index_snowpark_pandas_df, time_index_native_df, ): - with SqlCounter(query_count=expected_query_count): - assert time_index_snowpark_pandas_df.iat[key] == time_index_native_df.iat[key] + assert time_index_snowpark_pandas_df.iat[key] == time_index_native_df.iat[key] @pytest.mark.parametrize( @@ -128,7 +122,7 @@ def test_iat_get_time_index_time_columns( (-7, -7), ], ) 
-@sql_count_checker(query_count=2, join_count=2) +@sql_count_checker(query_count=1, join_count=2) def test_iat_set_time_index_time_columns( key, time_index_snowpark_pandas_df, @@ -146,24 +140,23 @@ def iat_set_helper(df): @pytest.mark.parametrize( - "key, expected_query_count", + "key", [ - ((0, 0), 1), - ((0, -7), 1), - ((-7, 0), 2), - ((-7, -7), 2), + (0, 0), + (0, -7), + (-7, 0), + (-7, -7), ], ) +@sql_count_checker(query_count=1) def test_iat_get_multiindex_index_str_columns( key, - expected_query_count, default_index_native_df, multiindex_native, ): - with SqlCounter(query_count=expected_query_count): - native_df = default_index_native_df.set_index(multiindex_native) - snowpark_df = pd.DataFrame(native_df) - assert snowpark_df.iat[key] == native_df.iat[key] + native_df = default_index_native_df.set_index(multiindex_native) + snowpark_df = pd.DataFrame(native_df) + assert snowpark_df.iat[key] == native_df.iat[key] @pytest.mark.parametrize( @@ -175,7 +168,7 @@ def test_iat_get_multiindex_index_str_columns( (-7, -7), ], ) -@sql_count_checker(query_count=2, join_count=2) +@sql_count_checker(query_count=1, join_count=2) def test_iat_set_multiindex_index_str_columns( key, default_index_native_df, @@ -190,23 +183,22 @@ def at_set_helper(df): @pytest.mark.parametrize( - "key, expected_query_count", + "key", [ - ((0, 0), 1), - ((0, -7), 1), - ((-7, 0), 2), - ((-7, -7), 2), + (0, 0), + (0, -7), + (-7, 0), + (-7, -7), ], ) +@sql_count_checker(query_count=1) def test_iat_get_default_index_multiindex_columns( key, - expected_query_count, native_df_with_multiindex_columns, ): - with SqlCounter(query_count=expected_query_count): - native_df = native_df_with_multiindex_columns - snowpark_df = pd.DataFrame(native_df) - assert snowpark_df.iat[key] == native_df.iat[key] + native_df = native_df_with_multiindex_columns + snowpark_df = pd.DataFrame(native_df) + assert snowpark_df.iat[key] == native_df.iat[key] @pytest.mark.parametrize( @@ -218,7 +210,7 @@ def 
test_iat_get_default_index_multiindex_columns( (-7, -7), ], ) -@sql_count_checker(query_count=2, join_count=2) +@sql_count_checker(query_count=1, join_count=2) def test_iat_set_default_index_multiindex_columns( key, native_df_with_multiindex_columns, @@ -232,24 +224,23 @@ def at_set_helper(df): @pytest.mark.parametrize( - "key, expected_query_count", + "key", [ - ((0, 0), 1), - ((0, -7), 1), - ((-7, 0), 2), - ((-7, -7), 2), + (0, 0), + (0, -7), + (-7, 0), + (-7, -7), ], ) +@sql_count_checker(query_count=1) def test_iat_get_multiindex_index_multiindex_columns( key, - expected_query_count, native_df_with_multiindex_columns, multiindex_native, ): - with SqlCounter(query_count=expected_query_count): - native_df = native_df_with_multiindex_columns.set_index(multiindex_native) - snowpark_df = pd.DataFrame(native_df) - assert snowpark_df.iat[key] == native_df.iat[key] + native_df = native_df_with_multiindex_columns.set_index(multiindex_native) + snowpark_df = pd.DataFrame(native_df) + assert snowpark_df.iat[key] == native_df.iat[key] @pytest.mark.parametrize( @@ -261,7 +252,7 @@ def test_iat_get_multiindex_index_multiindex_columns( (-7, -7), ], ) -@sql_count_checker(query_count=2, join_count=2) +@sql_count_checker(query_count=1, join_count=2) def test_iat_set_multiindex_index_multiindex_columns( key, native_df_with_multiindex_columns, diff --git a/tests/integ/modin/frame/test_insert.py b/tests/integ/modin/frame/test_insert.py index f9a421323e..974823d02c 100644 --- a/tests/integ/modin/frame/test_insert.py +++ b/tests/integ/modin/frame/test_insert.py @@ -54,7 +54,7 @@ def native_df(): ), ], ) -@sql_count_checker(query_count=6, join_count=3) +@sql_count_checker(query_count=5, join_count=3) def test_insert_snowpark_pandas_objects(native_df, native_value): snow_df = pd.DataFrame(native_df) value = pd.DataFrame(native_value) @@ -99,7 +99,7 @@ def test_insert_snowpark_pandas_objects(native_df, native_value): ), ], ) -@sql_count_checker(query_count=3, join_count=1) 
+@sql_count_checker(query_count=2, join_count=1) def test_insert_one_to_many(native_df, native_value): snow_df = pd.DataFrame(native_df) value = pd.DataFrame(native_value) @@ -128,11 +128,11 @@ def test_insert_one_to_many(native_df, native_value): @pytest.mark.parametrize( "value, expected_query_count, expected_join_count", [ - (np.array(["a", "b", "c"]), 3, 1), # numpy array of shape (N,) - (np.array([["a"], ["b"], ["c"]]), 3, 1), # numpy array of shape (N, 1) - (["a", "b", "c"], 3, 1), # python list + (np.array(["a", "b", "c"]), 2, 1), # numpy array of shape (N,) + (np.array([["a"], ["b"], ["c"]]), 2, 1), # numpy array of shape (N, 1) + (["a", "b", "c"], 2, 1), # python list ({0: 1, 1: 2, 4: 3}, 1, 1), # python dict - (("a", "b", "c"), 3, 1), # python tuple + (("a", "b", "c"), 2, 1), # python tuple ], ) def test_insert_array_like(native_df, value, expected_query_count, expected_join_count): @@ -187,7 +187,7 @@ def test_insert_pandas_types_negative(snow_df): snow_df.insert(0, "col3", value) -@sql_count_checker(query_count=3) +@sql_count_checker(query_count=2) def test_insert_dataframe_shape_negative(native_df): # DataFrame with more than one column snow_df = pd.DataFrame(native_df) @@ -209,10 +209,10 @@ def test_insert_dataframe_shape_negative(native_df): [ # NOTE: Accepted numpy array shapes are (N,) or (N, 1) where N = number of rows = 3 (np.ones((3, 2)), 0), - (np.ones((6, 1)), 2), - (np.ones((1, 1)), 2), - ([1, 2], 2), # len < number of rows - ((6, 7, 8, 9), 2), # len > number of rows + (np.ones((6, 1)), 1), + (np.ones((1, 1)), 1), + ([1, 2], 1), # len < number of rows + ((6, 7, 8, 9), 1), # len > number of rows ({"a", "b", "c"}, 0), # python set ], ) @@ -227,7 +227,7 @@ def test_insert_value_negative(native_df, value, expected_query_count): ) -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=2, join_count=1) def test_insert_duplicate_label(native_df): snow_df = pd.DataFrame(native_df) eval_snowpark_pandas_result( @@ -250,7 
+250,7 @@ def test_insert_duplicate_label_negative(native_df): @pytest.mark.parametrize("loc", [0, 1, 2]) -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=2, join_count=1) def test_insert_loc(native_df, loc): snow_df = pd.DataFrame(native_df) eval_snowpark_pandas_result( @@ -262,7 +262,7 @@ def test_insert_loc(native_df, loc): @pytest.mark.parametrize( - "loc, expected_query_count", [(-99, 2), (-1, 2), (99, 2), ("1", 0)] + "loc, expected_query_count", [(-99, 1), (-1, 1), (99, 1), ("1", 0)] ) def test_insert_loc_negative(native_df, loc, expected_query_count): with SqlCounter(query_count=expected_query_count): @@ -278,10 +278,10 @@ def test_insert_loc_negative(native_df, loc, expected_query_count): @pytest.mark.parametrize( "value, expected_query_count, expected_join_count", [ - (np.array(["a", "b", "c", "d"]), 3, 1), # numpy array of shape (N,) - (np.array([["a"], ["b"], ["c"], ["d"]]), 3, 1), # numpy array of shape (N, 1) - (["a", "b", "c", "d"], 3, 1), # python list - (("a", "b", "c", "d"), 3, 1), # python tuple + (np.array(["a", "b", "c", "d"]), 2, 1), # numpy array of shape (N,) + (np.array([["a"], ["b"], ["c"], ["d"]]), 2, 1), # numpy array of shape (N, 1) + (["a", "b", "c", "d"], 2, 1), # python list + (("a", "b", "c", "d"), 2, 1), # python tuple ({(3, 1): 1}, 1, 1), # python dict ("abc", 1, 0), # sting scalar (1, 1, 0), # int scalar @@ -311,7 +311,7 @@ def test_insert_multiindex_array_like_and_scalar( ("a", "b", "c", "d"), # python tuple ], ) -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=2, join_count=1) def test_insert_empty_multiindex_frame(value): mi = pd.MultiIndex.from_arrays([np.array([], dtype=int), np.array([], dtype=int)]) snow_df = pd.DataFrame([], index=mi) @@ -352,7 +352,7 @@ def test_insert_multiindex_dict_negative(): ([1.0, 2.5, 3.0], [1, 2, 3]), # Long and Double can be joined ], ) -@sql_count_checker(query_count=5, join_count=1) +@sql_count_checker(query_count=4, join_count=1) 
def test_insert_compatible_index(df_index, value_index): snow_df = pd.DataFrame({"col1": ["p", "q", "r"]}, index=native_pd.Index(df_index)) value = pd.DataFrame({"col2": ["x", "y", "z"]}, index=native_pd.Index(value_index)) @@ -383,7 +383,7 @@ def test_insert_compatible_index(df_index, value_index): ), # length and type mismatch ], ) -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=1) def test_insert_index_num_levels_mismatch_negative(df_index, value_index): snow_df = pd.DataFrame({"col1": ["p", "q", "r"]}, index=native_pd.Index(df_index)) value = pd.DataFrame({"col2": ["w", "x", "y"]}, index=native_pd.Index(value_index)) @@ -408,7 +408,7 @@ def test_insert_index_num_levels_mismatch_negative(df_index, value_index): ), # type mismatch boolean != long ], ) -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=2, join_count=1) def test_insert_index_type_mismatch(df_index, value_index, expected_index): # Note: This is different behavior than native pandas. In native pandas when # index datatype mismatch new columns in inserted will all NULL values. 
@@ -467,7 +467,7 @@ def test_insert_multiple_null(): "index, value, expected_query_count, expected_join_count", [ ([1, 2], native_pd.Series([1, 2], index=[2, 3]), 1, 1), - ([1, 2], [3, 4], 3, 1), + ([1, 2], [3, 4], 2, 1), ], ) def test_insert_into_empty_dataframe_with_index( @@ -528,8 +528,6 @@ def test_insert_into_empty_dataframe( expected_join_count = 0 if isinstance(value, list) or isinstance(value, np.ndarray): expected_query_count = 2 - if isinstance(value, (list, np.ndarray)): - expected_query_count += 1 snow_df = pd.DataFrame(data=data, columns=columns) native_df = native_pd.DataFrame(data=data, columns=columns) @@ -576,7 +574,7 @@ def test_insert_into_empty_dataframe_index_dtype_mismatch(): snow_df.to_pandas() -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=2, join_count=1) def test_insert_empty_list_into_empty_dataframe(): snow_df = pd.DataFrame() native_df = native_pd.DataFrame() @@ -598,7 +596,7 @@ def test_insert_empty_list_into_empty_dataframe(): ([], ["A", "B", "C"]), ], ) -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=1) def test_insert_into_empty_dataframe_negative(loc, data, columns): snow_df = pd.DataFrame(data=data, columns=columns) native_df = native_pd.DataFrame(data=data, columns=columns) @@ -773,7 +771,7 @@ def insert_op(df): assert_frame_equal(snow_res, expected_res, check_dtype=False) -@sql_count_checker(query_count=5, join_count=6) +@sql_count_checker(query_count=4, join_count=6) def test_insert_timedelta(): native_df = native_pd.DataFrame({"col1": [1, 2], "col2": [3, 4]}) snow_df = pd.DataFrame(native_df) diff --git a/tests/integ/modin/frame/test_iterrows.py b/tests/integ/modin/frame/test_iterrows.py index e1dee762ff..dc7a0e6d3d 100644 --- a/tests/integ/modin/frame/test_iterrows.py +++ b/tests/integ/modin/frame/test_iterrows.py @@ -61,7 +61,7 @@ def test_df_iterrows(native_df): snowpark_df = pd.DataFrame(native_df) # One query is used to get the number of rows. 
One query is used to retrieve each row - each query has 4 JOIN # operations performed due to iloc. - with SqlCounter(query_count=len(native_df) + 2): + with SqlCounter(query_count=len(native_df) + 1): eval_snowpark_pandas_result( snowpark_df, native_df, @@ -70,7 +70,7 @@ def test_df_iterrows(native_df): ) -@sql_count_checker(query_count=9, union_count=7) +@sql_count_checker(query_count=8, union_count=7) def test_df_iterrows_mixed_types(default_index_native_df): # Same test as above on bigger df with mixed types. # One query is used to get the number of rows. One query is used to retrieve each row - each query has 4 JOIN @@ -85,7 +85,7 @@ def test_df_iterrows_mixed_types(default_index_native_df): ) -@sql_count_checker(query_count=8, union_count=6) +@sql_count_checker(query_count=7, union_count=6) def test_df_iterrows_multindex_df(): # Create df with a MultiIndex index. # One query is used to get the number of rows. One query is used to retrieve each row - each query has 4 JOIN diff --git a/tests/integ/modin/frame/test_repr.py b/tests/integ/modin/frame/test_repr.py index 80abf4fd3e..f4ada1b7e9 100644 --- a/tests/integ/modin/frame/test_repr.py +++ b/tests/integ/modin/frame/test_repr.py @@ -15,6 +15,7 @@ from tests.integ.utils.sql_counter import SqlCounter, sql_count_checker # expected_query_count is for test_repr_html paramterized SqlCounter test +# an additional query + select may be incurred to eagerly retrieve the row count _DATAFRAMES_TO_TEST = [ ( native_pd.DataFrame( @@ -29,41 +30,41 @@ ], } ), - 1, + 2, ), ( native_pd.DataFrame([1, 2], index=[pd.Timedelta(1), pd.Timedelta(-1)]), - 1, + 2, ), ( IRIS_DF, - 4, + 5, ), ( native_pd.DataFrame(), - 1, + 2, ), ( native_pd.DataFrame( {"A": list(range(10000)), "B": np.random.normal(size=10000)} ), - 4, + 5, ), ( native_pd.DataFrame(columns=["A", "B", "C", "D", "C", "B", "A"]), - 1, + 2, ), # one large dataframe to test many columns ( native_pd.DataFrame(columns=[f"x{i}" for i in range(300)]), - 1, + 2, ), # one large 
dataframe to test both columns/rows ( native_pd.DataFrame( data=np.zeros(shape=(300, 300)), columns=[f"x{i}" for i in range(300)] ), - 4, + 5, ), ] @@ -75,7 +76,7 @@ def test_repr(native_df, expected_query_count): native_str = repr(native_df) # only measure select statements here, creation of dfs may yield a couple # CREATE TEMPORARY TABLE/INSERT INTO queries - with SqlCounter(query_count=expected_query_count, select_count=1): + with SqlCounter(query_count=expected_query_count, select_count=2): snow_str = repr(snow_df) assert native_str == snow_str diff --git a/tests/integ/modin/interoperability/scikit-learn/test_scikit_learn.py b/tests/integ/modin/interoperability/scikit-learn/test_scikit_learn.py index a8e0138a42..7cff7c7803 100644 --- a/tests/integ/modin/interoperability/scikit-learn/test_scikit_learn.py +++ b/tests/integ/modin/interoperability/scikit-learn/test_scikit_learn.py @@ -137,7 +137,7 @@ def get_cluster_centers(df) -> np.ndarray: class TestDimensionalityReduction: - @sql_count_checker(query_count=4) + @sql_count_checker(query_count=3) def test_principal_component_analysis(self, test_dfs): def get_principal_components(df) -> np.ndarray: return PCA(n_components=2).fit(df).components_ @@ -192,7 +192,7 @@ def validate_search_results(snow_estimator, pandas_estimator): class TestPreprocessing: - @sql_count_checker(query_count=6) + @sql_count_checker(query_count=5) def test_maxabs(self, test_dfs): eval_snowpark_pandas_result( *test_dfs, diff --git a/tests/integ/modin/test_chained_operations.py b/tests/integ/modin/test_chained_operations.py index 072b5fe455..9e496a2608 100644 --- a/tests/integ/modin/test_chained_operations.py +++ b/tests/integ/modin/test_chained_operations.py @@ -11,7 +11,7 @@ from tests.utils import multithreaded_run -@sql_count_checker(query_count=3) +@sql_count_checker(query_count=2) def test_chained_op1(): # bug fix SNOW-1348886 data = {"X": [1, 2, 3], "Y": [4, 5, 6]} diff --git a/tests/integ/modin/test_from_pandas_to_pandas.py 
b/tests/integ/modin/test_from_pandas_to_pandas.py index 14d2eb942e..469ea5ee52 100644 --- a/tests/integ/modin/test_from_pandas_to_pandas.py +++ b/tests/integ/modin/test_from_pandas_to_pandas.py @@ -536,7 +536,7 @@ def test_series_to_pandas(): assert_series_equal(snow_series.to_pandas(), pandas_series) -@sql_count_checker(query_count=3, union_count=1) +@sql_count_checker(query_count=2, union_count=1) def test_single_row_frame_to_series_to_pandas(): # create a Snowpark pandas with single row native_df = native_pd.DataFrame( diff --git a/tests/integ/modin/test_numpy.py b/tests/integ/modin/test_numpy.py index f907e2979e..7a322d1671 100644 --- a/tests/integ/modin/test_numpy.py +++ b/tests/integ/modin/test_numpy.py @@ -66,7 +66,7 @@ def test_full_like(): snow_df = pd.DataFrame(data) pandas_df = native_pd.DataFrame(data) - with SqlCounter(query_count=3): + with SqlCounter(query_count=2): snow_result = np.full_like(snow_df, 1234) pandas_result = np.full_like(pandas_df, 1234) assert_array_equal(np.array(snow_result), np.array(pandas_result)) @@ -232,7 +232,7 @@ def test_np_ufunc_unary_operators(np_ufunc): # The query count here is from the argument logging performed by numpy on error -@sql_count_checker(query_count=3) +@sql_count_checker(query_count=2) def test_np_ufunc_notimplemented(): data = { "A": [1], @@ -244,7 +244,7 @@ def test_np_ufunc_notimplemented(): # The query count here is from the argument logging performed by numpy on error -@sql_count_checker(query_count=4) +@sql_count_checker(query_count=3) def test_np_ufunc_with_out_notimpl(): data = { "A": [1], From eb2d8f17fe946701d02c53a020e253d28ab1f3d5 Mon Sep 17 00:00:00 2001 From: Jonathan Shi Date: Thu, 27 Mar 2025 16:15:06 -0700 Subject: [PATCH 8/9] fix repr tests --- .../plugin/_internal/ordered_dataframe.py | 13 ++++- tests/integ/modin/frame/test_repr.py | 55 +++++++++++++------ 2 files changed, 49 insertions(+), 19 deletions(-) diff --git a/src/snowflake/snowpark/modin/plugin/_internal/ordered_dataframe.py 
b/src/snowflake/snowpark/modin/plugin/_internal/ordered_dataframe.py index 1987e807a2..d752013fb6 100644 --- a/src/snowflake/snowpark/modin/plugin/_internal/ordered_dataframe.py +++ b/src/snowflake/snowpark/modin/plugin/_internal/ordered_dataframe.py @@ -2046,9 +2046,16 @@ def is_projection_of_table(self) -> bool: # 1 API call in the plan - either `Session.sql` for DataFrames based off of I/O operations # e.g. `read_snowflake` or `read_csv`, or `Session.create_dataframe` for DataFrames created # out of Python objects. + # We must also ensure that the underlying compiled query plan is only a single query -- + # for example, a simple select on pd.DataFrame([1] * 2000) would result in CREATE TEMP TABLE + # + batch INSERT + DROP TABLE queries, which introduce non-trivial overhead. snowpark_df = self._dataframe_ref.snowpark_dataframe snowpark_plan = snowpark_df._plan - return len(snowpark_plan.api_calls) == 1 and any( - accepted_api in snowpark_plan.api_calls[0]["name"] - for accepted_api in ["Session.sql", "Session.create_dataframe"] + return ( + len(snowpark_plan.api_calls) == 1 + and len(snowpark_plan.queries) == 1 + and any( + accepted_api in snowpark_plan.api_calls[0]["name"] + for accepted_api in ["Session.sql", "Session.create_dataframe"] + ) ) diff --git a/tests/integ/modin/frame/test_repr.py b/tests/integ/modin/frame/test_repr.py index f4ada1b7e9..9a8a617aca 100644 --- a/tests/integ/modin/frame/test_repr.py +++ b/tests/integ/modin/frame/test_repr.py @@ -15,7 +15,10 @@ from tests.integ.utils.sql_counter import SqlCounter, sql_count_checker # expected_query_count is for test_repr_html paramterized SqlCounter test -# an additional query + select may be incurred to eagerly retrieve the row count +# if the input data would be retrieved by a simple select, an additional query + select is +# incurred to eagerly retrieve the row count (larger data may avoid this because the CREATE TEMP +# TABLE + batch INSERT + DROP TABLE queries are likely more expensive than the 
COUNT(*) OVER() window +# function) _DATAFRAMES_TO_TEST = [ ( native_pd.DataFrame( @@ -31,59 +34,73 @@ } ), 2, + 2, ), ( native_pd.DataFrame([1, 2], index=[pd.Timedelta(1), pd.Timedelta(-1)]), 2, + 2, ), ( IRIS_DF, - 5, + 4, + 1, ), ( native_pd.DataFrame(), 2, + 2, ), ( native_pd.DataFrame( {"A": list(range(10000)), "B": np.random.normal(size=10000)} ), - 5, + 4, + 1, ), ( native_pd.DataFrame(columns=["A", "B", "C", "D", "C", "B", "A"]), 2, + 2, ), # one large dataframe to test many columns ( native_pd.DataFrame(columns=[f"x{i}" for i in range(300)]), 2, + 2, ), # one large dataframe to test both columns/rows ( native_pd.DataFrame( data=np.zeros(shape=(300, 300)), columns=[f"x{i}" for i in range(300)] ), - 5, + 4, + 1, ), ] -@pytest.mark.parametrize("native_df, expected_query_count", _DATAFRAMES_TO_TEST) -def test_repr(native_df, expected_query_count): +@pytest.mark.parametrize( + "native_df, expected_query_count, expected_select_count", _DATAFRAMES_TO_TEST +) +def test_repr(native_df, expected_query_count, expected_select_count): snow_df = pd.DataFrame(native_df) native_str = repr(native_df) # only measure select statements here, creation of dfs may yield a couple # CREATE TEMPORARY TABLE/INSERT INTO queries - with SqlCounter(query_count=expected_query_count, select_count=2): + with SqlCounter( + query_count=expected_query_count, select_count=expected_select_count + ): snow_str = repr(snow_df) assert native_str == snow_str -@pytest.mark.parametrize("native_df, expected_query_count", _DATAFRAMES_TO_TEST) -def test_repr_html(native_df, expected_query_count): +@pytest.mark.parametrize( + "native_df, expected_query_count, expected_select_count", _DATAFRAMES_TO_TEST +) +def test_repr_html(native_df, expected_query_count, expected_select_count): # TODO: SNOW-916596 Test this with Jupyter notebooks. 
# joins due to temp table creation @@ -98,7 +115,9 @@ def test_repr_html(native_df, expected_query_count): native_html = native_df._repr_html_() # 10 of these are related to stored procs, inserts, alter session query tag. - with SqlCounter(query_count=expected_query_count, select_count=1): + with SqlCounter( + query_count=expected_query_count, select_count=expected_select_count + ): snow_html = snow_df._repr_html_() assert native_html == snow_html @@ -134,19 +153,23 @@ def queries(self) -> list[QueryRecord]: return [query.sql_text for query in self._queries] -@pytest.mark.parametrize("native_df, expected_query_count", _DATAFRAMES_TO_TEST) -def test_repr_and_repr_html_issue_same_query(native_df, expected_query_count): - """This test ensures that the same query is issued for both `repr` and `repr_html` +def test_repr_and_repr_html_issue_same_query(): + """ + This test ensures that the same query is issued for both `repr` and `repr_html` in order to take advantage of Snowflake server side caching when both are called back - to back (as in the case with displaying a DataFrame in a Jupyter notebook).""" + to back (as in the case with displaying a DataFrame in a Jupyter notebook). + + If the input frame was a simple projection that results in an additional COUNT() query, + it is not captured by the ReprQueryListener. 
+ """ native_df = native_pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) snow_df = pd.DataFrame(native_df) with ReprQueryListener(pd.session) as listener: - with SqlCounter(query_count=1, select_count=1): + with SqlCounter(query_count=2, select_count=2): repr_str = repr(snow_df) - with SqlCounter(query_count=1, select_count=1): + with SqlCounter(query_count=2, select_count=2): repr_html = snow_df._repr_html_() assert repr_str == repr(native_df) From 5d2f8f4db0501b69adda909a51904648410a0e6d Mon Sep 17 00:00:00 2001 From: Jonathan Shi Date: Tue, 8 Apr 2025 11:31:19 -0700 Subject: [PATCH 9/9] lift ensure_row_position call --- .../modin/plugin/compiler/snowflake_query_compiler.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py index e28f4ac940..c87ae34619 100644 --- a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py +++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py @@ -13067,14 +13067,12 @@ def build_repr_df( # the solution here uses a window function. This may lead to perf regressions, track these here SNOW-984177. # Ensure that our reference to self._modin_frame is updated with cached row count and position. row_count_value = None + frame = self._modin_frame.ensure_row_position_column() if self._modin_frame.ordered_dataframe.is_projection_of_table(): - frame = self._modin_frame.ensure_row_position_column() row_count_value = frame.ordered_dataframe.materialize_row_count() row_count_expr = pandas_lit(row_count_value) else: - frame = ( - self._modin_frame.ensure_row_position_column().ensure_row_count_column() - ) + frame = frame.ensure_row_count_column() row_count_pandas_label = ( ROW_COUNT_COLUMN_LABEL if len(frame.data_column_pandas_index_names) == 1