diff --git a/CHANGELOG.md b/CHANGELOG.md index bb1e2c1ba7..0e4bd327ae 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ #### Improvements - Added support for reading XML files with namespaces using `rowTag` and `stripNamespaces` options. +- Added a new argument to `DataFrame.describe` called `strings_include_math_stats` that triggers `stddev` and `mean` to be calculated for String columns. ### Snowpark Local Testing Updates @@ -16,7 +17,7 @@ - Set the default value of the `index` parameter to `False` for `DataFrame.to_view`, `Series.to_view`, `DataFrame.to_dynamic_table`, and `Series.to_dynamic_table`. - Added `iceberg_version` option to table creation functions. -- Added a new argument to `Dataframe.describe` called `strings_include_math_stats` that triggers `stddev` and `mean` to be calculated for String columns. +- Reduced query count for many operations, including `insert`, `repr`, and `groupby`, that previously issued a query to retrieve the input data's size. ## 1.32.0 (2025-05-15) diff --git a/src/snowflake/snowpark/modin/plugin/_internal/frame.py b/src/snowflake/snowpark/modin/plugin/_internal/frame.py index 1d40f42e8d..67bdbd6cdb 100644 --- a/src/snowflake/snowpark/modin/plugin/_internal/frame.py +++ b/src/snowflake/snowpark/modin/plugin/_internal/frame.py @@ -738,10 +738,7 @@ def num_rows(self) -> int: Returns: Number of rows in this frame. 
""" - num_rows = count_rows(self.ordered_dataframe) - self.ordered_dataframe.row_count = num_rows - self.ordered_dataframe.row_count_upper_bound = num_rows - return num_rows + return count_rows(self.ordered_dataframe) def has_unique_index(self, axis: Optional[int] = 0) -> bool: """ diff --git a/src/snowflake/snowpark/modin/plugin/_internal/ordered_dataframe.py b/src/snowflake/snowpark/modin/plugin/_internal/ordered_dataframe.py index 2159abd688..88484ac546 100644 --- a/src/snowflake/snowpark/modin/plugin/_internal/ordered_dataframe.py +++ b/src/snowflake/snowpark/modin/plugin/_internal/ordered_dataframe.py @@ -680,6 +680,7 @@ def select( row_count_snowflake_quoted_identifier=self.row_count_snowflake_quoted_identifier, ) + new_df.row_count = self.row_count # Update the row count upper bound new_df.row_count_upper_bound = RowCountEstimator.upper_bound( self, DataFrameOperation.SELECT, args={} @@ -746,6 +747,8 @@ def union_all(self, other: "OrderedDataFrame") -> "OrderedDataFrame": DataFrameReference(snowpark_dataframe, result_column_quoted_identifiers), projected_column_snowflake_quoted_identifiers=result_column_quoted_identifiers, ) + if self.row_count is not None and other.row_count is not None: + new_df.row_count = self.row_count + other.row_count # Update the row count upper bound new_df.row_count_upper_bound = RowCountEstimator.upper_bound( self, DataFrameOperation.UNION_ALL, args={"other": other} @@ -849,6 +852,7 @@ def sort( # No need to reset row count, since sorting should not add/drop rows. 
row_count_snowflake_quoted_identifier=self.row_count_snowflake_quoted_identifier, ) + new_df.row_count = self.row_count # Update the row count upper bound new_df.row_count_upper_bound = RowCountEstimator.upper_bound( self, DataFrameOperation.SORT, args={} diff --git a/src/snowflake/snowpark/modin/plugin/_internal/utils.py b/src/snowflake/snowpark/modin/plugin/_internal/utils.py index e2b1a0ea45..f6ba004baa 100644 --- a/src/snowflake/snowpark/modin/plugin/_internal/utils.py +++ b/src/snowflake/snowpark/modin/plugin/_internal/utils.py @@ -1889,9 +1889,14 @@ def count_rows(df: OrderedDataFrame) -> int: """ Returns the number of rows of a Snowpark DataFrame. """ + if df.row_count is not None: + return df.row_count df = df.ensure_row_count_column() rowset = df.select(df.row_count_snowflake_quoted_identifier).limit(1).collect() - return 0 if len(rowset) == 0 else rowset[0][0] + row_count = 0 if len(rowset) == 0 else rowset[0][0] + df.row_count = row_count + df.row_count_upper_bound = row_count + return row_count def append_columns( diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py index c9b8af5ccb..0a659f3d08 100644 --- a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py +++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py @@ -13216,37 +13216,40 @@ def build_repr_df( # 2. retrieve all columns # 3. filter on rows with recursive count - # Previously, 2 queries were issued, and a first version replaced them with a single query and a join - # the solution here uses a window function. This may lead to perf regressions, track these here SNOW-984177. - # Ensure that our reference to self._modin_frame is updated with cached row count and position. 
- self._modin_frame = ( - self._modin_frame.ensure_row_position_column().ensure_row_count_column() - ) - row_count_pandas_label = ( - ROW_COUNT_COLUMN_LABEL - if len(self._modin_frame.data_column_pandas_index_names) == 1 - else (ROW_COUNT_COLUMN_LABEL,) - * len(self._modin_frame.data_column_pandas_index_names) - ) - frame_with_row_count_and_position = InternalFrame.create( - ordered_dataframe=self._modin_frame.ordered_dataframe, - data_column_pandas_labels=self._modin_frame.data_column_pandas_labels - + [row_count_pandas_label], - data_column_snowflake_quoted_identifiers=self._modin_frame.data_column_snowflake_quoted_identifiers - + [self._modin_frame.row_count_snowflake_quoted_identifier], - data_column_pandas_index_names=self._modin_frame.data_column_pandas_index_names, - index_column_pandas_labels=self._modin_frame.index_column_pandas_labels, - index_column_snowflake_quoted_identifiers=self._modin_frame.index_column_snowflake_quoted_identifiers, - data_column_types=self._modin_frame.cached_data_column_snowpark_pandas_types - + [None], - index_column_types=self._modin_frame.cached_index_column_snowpark_pandas_types, - ) + frame = self._modin_frame.ensure_row_position_column() + use_cached_row_count = frame.ordered_dataframe.row_count is not None - row_count_identifier = ( - frame_with_row_count_and_position.row_count_snowflake_quoted_identifier - ) + # If the row count is already cached, there's no need to include it in the query. + if use_cached_row_count: + row_count_expr = pandas_lit(frame.ordered_dataframe.row_count) + else: + # Previously, 2 queries were issued, and a first version replaced them with a single query and a join + # the solution here uses a window function. This may lead to perf regressions, track these here SNOW-984177. + # Add the row count column to the local frame; unlike the previous code, self._modin_frame is not reassigned here. 
+ frame = frame.ensure_row_count_column() + row_count_pandas_label = ( + ROW_COUNT_COLUMN_LABEL + if len(frame.data_column_pandas_index_names) == 1 + else (ROW_COUNT_COLUMN_LABEL,) + * len(frame.data_column_pandas_index_names) + ) + frame = InternalFrame.create( + ordered_dataframe=frame.ordered_dataframe, + data_column_pandas_labels=frame.data_column_pandas_labels + + [row_count_pandas_label], + data_column_snowflake_quoted_identifiers=frame.data_column_snowflake_quoted_identifiers + + [frame.row_count_snowflake_quoted_identifier], + data_column_pandas_index_names=frame.data_column_pandas_index_names, + index_column_pandas_labels=frame.index_column_pandas_labels, + index_column_snowflake_quoted_identifiers=frame.index_column_snowflake_quoted_identifiers, + data_column_types=frame.cached_data_column_snowpark_pandas_types + + [None], + index_column_types=frame.cached_index_column_snowpark_pandas_types, + ) + + row_count_expr = col(frame.row_count_snowflake_quoted_identifier) row_position_snowflake_quoted_identifier = ( - frame_with_row_count_and_position.row_position_snowflake_quoted_identifier + frame.row_position_snowflake_quoted_identifier ) # filter frame based on num_rows. 
@@ -13254,14 +13257,14 @@ def build_repr_df( # in the future could analyze plan to see whether retrieving column count would trigger a query, if not # simply filter out based on static schema num_rows_for_head_and_tail = num_rows_to_display // 2 + 1 - new_frame = frame_with_row_count_and_position.filter( + new_frame = frame.filter( ( col(row_position_snowflake_quoted_identifier) <= num_rows_for_head_and_tail ) | ( col(row_position_snowflake_quoted_identifier) - >= col(row_count_identifier) - num_rows_for_head_and_tail + >= row_count_expr - num_rows_for_head_and_tail ) ) @@ -13269,9 +13272,12 @@ def build_repr_df( new_qc = SnowflakeQueryCompiler(new_frame) pandas_frame = new_qc.to_pandas() - # remove last column after first retrieving row count - row_count = 0 if 0 == len(pandas_frame) else pandas_frame.iat[0, -1] - pandas_frame = pandas_frame.iloc[:, :-1] + if use_cached_row_count: + row_count = frame.ordered_dataframe.row_count + else: + # remove last column after first retrieving row count + row_count = 0 if len(pandas_frame) == 0 else pandas_frame.iat[0, -1] + pandas_frame = pandas_frame.iloc[:, :-1] col_count = len(pandas_frame.columns) return row_count, col_count, pandas_frame diff --git a/tests/integ/modin/crosstab/test_crosstab.py b/tests/integ/modin/crosstab/test_crosstab.py index 59df70001d..99c2d2ef71 100644 --- a/tests/integ/modin/crosstab/test_crosstab.py +++ b/tests/integ/modin/crosstab/test_crosstab.py @@ -223,7 +223,7 @@ def eval_func(args_list): def test_basic_crosstab_with_df_and_series_objs_pandas_errors_columns( self, dropna, a, b, c ): - query_count = 4 + query_count = 2 join_count = 1 if dropna else 2 a = native_pd.Series( a, @@ -269,7 +269,7 @@ def eval_func(args_list): def test_basic_crosstab_with_df_and_series_objs_pandas_errors_index( self, dropna, a, b, c ): - query_count = 6 + query_count = 4 join_count = 5 if dropna else 11 a = native_pd.Series( a, @@ -556,7 +556,7 @@ def test_values(self, dropna, aggfunc, basic_crosstab_dfs): 
@pytest.mark.parametrize("aggfunc", AGGFUNCS_THAT_CANNOT_PRODUCE_NAN) def test_values_series_like(self, dropna, aggfunc, basic_crosstab_dfs): - query_count = 5 + query_count = 3 join_count = 2 if dropna else 3 native_df, snow_df = basic_crosstab_dfs @@ -646,7 +646,7 @@ def test_values_unsupported_aggfunc(basic_crosstab_dfs): ) -@sql_count_checker(query_count=4) +@sql_count_checker(query_count=2) def test_values_series_like_unsupported_aggfunc(basic_crosstab_dfs): # The query count above comes from building the DataFrame # that we pass in to pivot table. diff --git a/tests/integ/modin/frame/test_empty.py b/tests/integ/modin/frame/test_empty.py index 9eaecc36dd..40cf760bb5 100644 --- a/tests/integ/modin/frame/test_empty.py +++ b/tests/integ/modin/frame/test_empty.py @@ -27,7 +27,7 @@ ({"A": [np.nan]}, "np nan column"), ], ) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=0) def test_dataframe_empty_param(dataframe_input, test_case_name): eval_snowpark_pandas_result( pd.DataFrame(dataframe_input), diff --git a/tests/integ/modin/frame/test_from_dict.py b/tests/integ/modin/frame/test_from_dict.py index 894da0f95b..2035dc3dcf 100644 --- a/tests/integ/modin/frame/test_from_dict.py +++ b/tests/integ/modin/frame/test_from_dict.py @@ -72,7 +72,7 @@ def test_from_dict_orient_tight(): ) -@sql_count_checker(query_count=7) +@sql_count_checker(query_count=5) def test_from_dict_series_values(): # TODO(SNOW-1857349): Proved a lazy implementation for this case data = {i: pd.Series(range(1)) for i in range(2)} diff --git a/tests/integ/modin/frame/test_getitem.py b/tests/integ/modin/frame/test_getitem.py index e1f676b4ff..25c067e4e1 100644 --- a/tests/integ/modin/frame/test_getitem.py +++ b/tests/integ/modin/frame/test_getitem.py @@ -84,8 +84,8 @@ def get_helper(df): else: return df[key] - # 5 extra queries for iter - with SqlCounter(query_count=6 if isinstance(key, native_pd.Index) else 1): + # 4 extra queries for iter + with SqlCounter(query_count=5 if 
isinstance(key, native_pd.Index) else 1): eval_snowpark_pandas_result( default_index_snowpark_pandas_df, default_index_native_df, @@ -119,8 +119,8 @@ def get_helper(df): native_df = native_pd.DataFrame(data) snowpark_df = pd.DataFrame(native_df) - # 5 extra queries for iter - with SqlCounter(query_count=6 if isinstance(key, native_pd.Index) else 1): + # 4 extra queries for iter + with SqlCounter(query_count=5 if isinstance(key, native_pd.Index) else 1): eval_snowpark_pandas_result( snowpark_df, native_df, diff --git a/tests/integ/modin/frame/test_iloc.py b/tests/integ/modin/frame/test_iloc.py index 6662ade701..ad2e90515e 100644 --- a/tests/integ/modin/frame/test_iloc.py +++ b/tests/integ/modin/frame/test_iloc.py @@ -425,7 +425,7 @@ def test_df_iloc_get_empty_key( ) -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=1) def test_df_iloc_get_empty(empty_snowpark_pandas_df): _ = empty_snowpark_pandas_df.iloc[0] @@ -1811,8 +1811,8 @@ def test_df_iloc_set_with_row_key_list( else: snow_row_pos = row_pos - # 2 extra queries for iter - expected_query_count = 3 if isinstance(snow_row_pos, pd.Index) else 1 + # 1 extra query for iter + expected_query_count = 2 if isinstance(snow_row_pos, pd.Index) else 1 expected_join_count = 2 if isinstance(item_values, int) else 3 with SqlCounter(query_count=expected_query_count, join_count=expected_join_count): diff --git a/tests/integ/modin/frame/test_insert.py b/tests/integ/modin/frame/test_insert.py index 974823d02c..d99e60b8a3 100644 --- a/tests/integ/modin/frame/test_insert.py +++ b/tests/integ/modin/frame/test_insert.py @@ -54,7 +54,7 @@ def native_df(): ), ], ) -@sql_count_checker(query_count=5, join_count=3) +@sql_count_checker(query_count=4, join_count=3) def test_insert_snowpark_pandas_objects(native_df, native_value): snow_df = pd.DataFrame(native_df) value = pd.DataFrame(native_value) @@ -99,7 +99,7 @@ def test_insert_snowpark_pandas_objects(native_df, native_value): ), ], ) -@sql_count_checker(query_count=2, 
join_count=1) +@sql_count_checker(query_count=1, join_count=1) def test_insert_one_to_many(native_df, native_value): snow_df = pd.DataFrame(native_df) value = pd.DataFrame(native_value) @@ -126,18 +126,18 @@ def test_insert_one_to_many(native_df, native_value): @pytest.mark.parametrize( - "value, expected_query_count, expected_join_count", + "value", [ - (np.array(["a", "b", "c"]), 2, 1), # numpy array of shape (N,) - (np.array([["a"], ["b"], ["c"]]), 2, 1), # numpy array of shape (N, 1) - (["a", "b", "c"], 2, 1), # python list - ({0: 1, 1: 2, 4: 3}, 1, 1), # python dict - (("a", "b", "c"), 2, 1), # python tuple + np.array(["a", "b", "c"]), # numpy array of shape (N,) + np.array([["a"], ["b"], ["c"]]), # numpy array of shape (N, 1) + ["a", "b", "c"], # python list + {0: 1, 1: 2, 4: 3}, # python dict + ("a", "b", "c"), # python tuple ], ) -def test_insert_array_like(native_df, value, expected_query_count, expected_join_count): +def test_insert_array_like(native_df, value): snow_df = pd.DataFrame(native_df) - with SqlCounter(query_count=expected_query_count, join_count=expected_join_count): + with SqlCounter(query_count=1, join_count=1): eval_snowpark_pandas_result( snow_df, native_df, @@ -187,7 +187,7 @@ def test_insert_pandas_types_negative(snow_df): snow_df.insert(0, "col3", value) -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=1) def test_insert_dataframe_shape_negative(native_df): # DataFrame with more than one column snow_df = pd.DataFrame(native_df) @@ -205,20 +205,20 @@ def test_insert_dataframe_shape_negative(native_df): @pytest.mark.parametrize( - "value, expected_query_count", + "value", [ # NOTE: Accepted numpy array shapes are (N,) or (N, 1) where N = number of rows = 3 - (np.ones((3, 2)), 0), - (np.ones((6, 1)), 1), - (np.ones((1, 1)), 1), - ([1, 2], 1), # len < number of rows - ((6, 7, 8, 9), 1), # len > number of rows - ({"a", "b", "c"}, 0), # python set + np.ones((3, 2)), + np.ones((6, 1)), + np.ones((1, 1)), + [1, 2], # len < 
number of rows + (6, 7, 8, 9), # len > number of rows + {"a", "b", "c"}, # python set ], ) -def test_insert_value_negative(native_df, value, expected_query_count): +def test_insert_value_negative(native_df, value): snow_df = pd.DataFrame(native_df) - with SqlCounter(query_count=expected_query_count): + with SqlCounter(query_count=0): eval_snowpark_pandas_result( snow_df, native_df, @@ -227,7 +227,7 @@ def test_insert_value_negative(native_df, value, expected_query_count): ) -@sql_count_checker(query_count=2, join_count=1) +@sql_count_checker(query_count=1, join_count=1) def test_insert_duplicate_label(native_df): snow_df = pd.DataFrame(native_df) eval_snowpark_pandas_result( @@ -250,7 +250,7 @@ def test_insert_duplicate_label_negative(native_df): @pytest.mark.parametrize("loc", [0, 1, 2]) -@sql_count_checker(query_count=2, join_count=1) +@sql_count_checker(query_count=1, join_count=1) def test_insert_loc(native_df, loc): snow_df = pd.DataFrame(native_df) eval_snowpark_pandas_result( @@ -261,11 +261,9 @@ def test_insert_loc(native_df, loc): ) -@pytest.mark.parametrize( - "loc, expected_query_count", [(-99, 1), (-1, 1), (99, 1), ("1", 0)] -) -def test_insert_loc_negative(native_df, loc, expected_query_count): - with SqlCounter(query_count=expected_query_count): +@pytest.mark.parametrize("loc", [-99, -1, 99, "1"]) +def test_insert_loc_negative(native_df, loc): + with SqlCounter(query_count=0): snow_df = pd.DataFrame(native_df) eval_snowpark_pandas_result( snow_df, @@ -276,25 +274,23 @@ def test_insert_loc_negative(native_df, loc, expected_query_count): @pytest.mark.parametrize( - "value, expected_query_count, expected_join_count", + "value, expected_join_count", [ - (np.array(["a", "b", "c", "d"]), 2, 1), # numpy array of shape (N,) - (np.array([["a"], ["b"], ["c"], ["d"]]), 2, 1), # numpy array of shape (N, 1) - (["a", "b", "c", "d"], 2, 1), # python list - (("a", "b", "c", "d"), 2, 1), # python tuple - ({(3, 1): 1}, 1, 1), # python dict - ("abc", 1, 0), # sting 
scalar - (1, 1, 0), # int scalar + (np.array(["a", "b", "c", "d"]), 1), # numpy array of shape (N,) + (np.array([["a"], ["b"], ["c"], ["d"]]), 1), # numpy array of shape (N, 1) + (["a", "b", "c", "d"], 1), # python list + (("a", "b", "c", "d"), 1), # python tuple + ({(3, 1): 1}, 1), # python dict + ("abc", 0), # string scalar + (1, 0), # int scalar ], ) -def test_insert_multiindex_array_like_and_scalar( - value, expected_query_count, expected_join_count -): +def test_insert_multiindex_array_like_and_scalar(value, expected_join_count): arrays = [[3, 4, 5, 6], [1, 2, 1, 2]] index = pd.MultiIndex.from_arrays(arrays, names=["first", "second"]) snow_df = pd.DataFrame({"col1": ["p", "q", "r", "s"]}, index=index) native_df = snow_df.to_pandas() - with SqlCounter(query_count=expected_query_count, join_count=expected_join_count): + with SqlCounter(query_count=1, join_count=expected_join_count): eval_snowpark_pandas_result( snow_df, native_df, @@ -311,7 +307,7 @@ def test_insert_multiindex_array_like_and_scalar( ("a", "b", "c", "d"), # python tuple ], ) -@sql_count_checker(query_count=2, join_count=1) +@sql_count_checker(query_count=1, join_count=1) def test_insert_empty_multiindex_frame(value): mi = pd.MultiIndex.from_arrays([np.array([], dtype=int), np.array([], dtype=int)]) snow_df = pd.DataFrame([], index=mi) @@ -352,7 +348,7 @@ def test_insert_multiindex_dict_negative(): ([1.0, 2.5, 3.0], [1, 2, 3]), # Long and Double can be joined ], ) -@sql_count_checker(query_count=4, join_count=1) +@sql_count_checker(query_count=3, join_count=1) def test_insert_compatible_index(df_index, value_index): snow_df = pd.DataFrame({"col1": ["p", "q", "r"]}, index=native_pd.Index(df_index)) value = pd.DataFrame({"col2": ["x", "y", "z"]}, index=native_pd.Index(value_index)) @@ -383,7 +379,7 @@ def test_insert_compatible_index(df_index, value_index): ), # length and type mismatch ], ) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=0) def 
test_insert_index_num_levels_mismatch_negative(df_index, value_index): snow_df = pd.DataFrame({"col1": ["p", "q", "r"]}, index=native_pd.Index(df_index)) value = pd.DataFrame({"col2": ["w", "x", "y"]}, index=native_pd.Index(value_index)) @@ -408,7 +404,7 @@ def test_insert_index_num_levels_mismatch_negative(df_index, value_index): ), # type mismatch boolean != long ], ) -@sql_count_checker(query_count=2, join_count=1) +@sql_count_checker(query_count=1, join_count=1) def test_insert_index_type_mismatch(df_index, value_index, expected_index): # Note: This is different behavior than native pandas. In native pandas when # index datatype mismatch new columns in inserted will all NULL values. @@ -464,16 +460,14 @@ def test_insert_multiple_null(): @pytest.mark.parametrize( - "index, value, expected_query_count, expected_join_count", + "index, value", [ - ([1, 2], native_pd.Series([1, 2], index=[2, 3]), 1, 1), - ([1, 2], [3, 4], 2, 1), + ([1, 2], native_pd.Series([1, 2], index=[2, 3])), + ([1, 2], [3, 4]), ], ) -def test_insert_into_empty_dataframe_with_index( - index, value, expected_query_count, expected_join_count -): - with SqlCounter(query_count=expected_query_count, join_count=expected_join_count): +def test_insert_into_empty_dataframe_with_index(index, value): + with SqlCounter(query_count=1, join_count=1): snow_df = pd.DataFrame(index=index) native_df = native_pd.DataFrame(index=index) @@ -527,7 +521,7 @@ def test_insert_into_empty_dataframe( if isinstance(value, int): expected_join_count = 0 if isinstance(value, list) or isinstance(value, np.ndarray): - expected_query_count = 2 + expected_query_count = 1 snow_df = pd.DataFrame(data=data, columns=columns) native_df = native_pd.DataFrame(data=data, columns=columns) @@ -574,7 +568,7 @@ def test_insert_into_empty_dataframe_index_dtype_mismatch(): snow_df.to_pandas() -@sql_count_checker(query_count=2, join_count=1) +@sql_count_checker(query_count=1, join_count=1) def test_insert_empty_list_into_empty_dataframe(): 
snow_df = pd.DataFrame() native_df = native_pd.DataFrame() @@ -596,7 +590,7 @@ def test_insert_empty_list_into_empty_dataframe(): ([], ["A", "B", "C"]), ], ) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=0) def test_insert_into_empty_dataframe_negative(loc, data, columns): snow_df = pd.DataFrame(data=data, columns=columns) native_df = native_pd.DataFrame(data=data, columns=columns) @@ -771,7 +765,7 @@ def insert_op(df): assert_frame_equal(snow_res, expected_res, check_dtype=False) -@sql_count_checker(query_count=4, join_count=6) +@sql_count_checker(query_count=3, join_count=6) def test_insert_timedelta(): native_df = native_pd.DataFrame({"col1": [1, 2], "col2": [3, 4]}) snow_df = pd.DataFrame(native_df) diff --git a/tests/integ/modin/frame/test_isin.py b/tests/integ/modin/frame/test_isin.py index 955e2cf334..b80d8ed504 100644 --- a/tests/integ/modin/frame/test_isin.py +++ b/tests/integ/modin/frame/test_isin.py @@ -143,7 +143,7 @@ def test_isin_with_Series(values, data, columns, index): snow_df, native_df, # 2 queries: 1 for the isin, 1 extra query to handle empty dataframe special case - lambda df: _test_isin_with_snowflake_logic(df, values, query_count=2), + lambda df: _test_isin_with_snowflake_logic(df, values, query_count=1), ) @@ -198,7 +198,7 @@ def eval_dataframe_isin(df): else: values = other # 3 queries: 2 for the isin of which one is caused by set, 1 extra query to handle empty dataframe special case - return _test_isin_with_snowflake_logic(df, values, query_count=2) + return _test_isin_with_snowflake_logic(df, values, query_count=1) eval_snowpark_pandas_result( snow_df, @@ -229,7 +229,7 @@ def test_isin_with_dict(df, values): ) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=0) def test_isin_duplicate_columns_negative(): with pytest.raises(ValueError, match="cannot compute isin with a duplicate axis."): df = pd.DataFrame({"A": [1, 2, 3]}) diff --git a/tests/integ/modin/frame/test_iterrows.py 
b/tests/integ/modin/frame/test_iterrows.py index dc7a0e6d3d..3a9b77c79b 100644 --- a/tests/integ/modin/frame/test_iterrows.py +++ b/tests/integ/modin/frame/test_iterrows.py @@ -59,9 +59,8 @@ def assert_iterators_equal(snowpark_iterator, native_iterator): def test_df_iterrows(native_df): # Test that the tuple returned is correct: (index, Series). snowpark_df = pd.DataFrame(native_df) - # One query is used to get the number of rows. One query is used to retrieve each row - each query has 4 JOIN - # operations performed due to iloc. - with SqlCounter(query_count=len(native_df) + 1): + # One query is used to retrieve each row - each query has 4 JOIN operations performed due to iloc. + with SqlCounter(query_count=len(native_df)): eval_snowpark_pandas_result( snowpark_df, native_df, @@ -70,11 +69,10 @@ def test_df_iterrows(native_df): ) -@sql_count_checker(query_count=8, union_count=7) +@sql_count_checker(query_count=7, union_count=7) def test_df_iterrows_mixed_types(default_index_native_df): # Same test as above on bigger df with mixed types. - # One query is used to get the number of rows. One query is used to retrieve each row - each query has 4 JOIN - # operations performed due to iloc. + # One query is used to retrieve each row - each query has 4 JOIN operations performed due to iloc. native_df = default_index_native_df snowpark_df = pd.DataFrame(native_df) eval_snowpark_pandas_result( @@ -85,11 +83,10 @@ def test_df_iterrows_mixed_types(default_index_native_df): ) -@sql_count_checker(query_count=7, union_count=6) +@sql_count_checker(query_count=6, union_count=6) def test_df_iterrows_multindex_df(): # Create df with a MultiIndex index. - # One query is used to get the number of rows. One query is used to retrieve each row - each query has 4 JOIN - # operations performed due to iloc. + # One query is used to retrieve each row - each query has 4 JOIN operations performed due to iloc. 
arrays = [ np.array(["bar", "bar", "baz", "baz", "foo", "foo"]), np.array(["one", "two", "one", "two", "one", "two"]), diff --git a/tests/integ/modin/frame/test_len.py b/tests/integ/modin/frame/test_len.py index 15956573b8..659c8ac75b 100644 --- a/tests/integ/modin/frame/test_len.py +++ b/tests/integ/modin/frame/test_len.py @@ -19,7 +19,8 @@ ({"td": native_pd.timedelta_range(1, periods=20)}, 20), ], ) -@sql_count_checker(query_count=1) +# Frames constructed from literal objects cache their sizes, so no queries are necessary. +@sql_count_checker(query_count=0) def test_len(sample, expected_len): snow = pd.DataFrame(sample) native = native_pd.DataFrame(sample) diff --git a/tests/integ/modin/frame/test_loc.py b/tests/integ/modin/frame/test_loc.py index 7ca5453a89..37b63de295 100644 --- a/tests/integ/modin/frame/test_loc.py +++ b/tests/integ/modin/frame/test_loc.py @@ -2573,7 +2573,7 @@ def test_empty_df_loc_set_scalar(): # Check `loc` with column scalar on empty DataFrame. native_df = native_pd.DataFrame() snow_df = pd.DataFrame(native_df) - with SqlCounter(query_count=1): + with SqlCounter(query_count=0): with pytest.raises( ValueError, match="cannot set a frame with no defined index and a scalar" ): @@ -2996,9 +2996,9 @@ def loc_set_helper(df): ) expected_join_count = 3 if len(item) > 1 else 2 - # 4 extra queries for index, 1 for converting to native pandas in loc_set_helper, 2 for iter and 1 for tolist + # 3 extra queries for index, 1 for converting to native pandas in loc_set_helper, 1 for iter and 1 for tolist with SqlCounter( - query_count=5 if item_type_name == "index" else 1, + query_count=4 if item_type_name == "index" else 1, join_count=expected_join_count, ): eval_snowpark_pandas_result( @@ -3056,9 +3056,9 @@ def loc_set_helper(df): if len(item) > 1: # When col_key is list and item's length > 1 or new label exists, both native pandas and Snowpark pandas # raises error if the length of item and col_key do not match when col_key length > 1 - # 4 extra queries 
for index, 1 for converting to native pandas in loc_set_helper, 2 for iter and 1 for tolist + # 3 extra queries for index, 1 for converting to native pandas in loc_set_helper, 1 for iter and 1 for tolist with SqlCounter( - query_count=4 if item_type_name == "index" else 0, join_count=0 + query_count=3 if item_type_name == "index" else 0, join_count=0 ): eval_snowpark_pandas_result( snow_df, @@ -3081,9 +3081,9 @@ def loc_set_helper(df): if len(col_key) <= len(item) else item + ([item[-1]] * (len(col_key) - len(item))) ) - # 4 extra queries for index, 1 for converting to native pandas in loc_set_helper, 2 for iter and 1 for tolist + # 3 extra queries for index, 1 for converting to native pandas in loc_set_helper, 1 for iter and 1 for tolist with SqlCounter( - query_count=5 if item_type_name == "index" else 1, join_count=2 + query_count=4 if item_type_name == "index" else 1, join_count=2 ): eval_snowpark_pandas_result( snow_df, native_df, loc_set_helper, inplace=True @@ -3100,9 +3100,9 @@ def loc_set_helper(df): native_df.loc[row_key, col_key] = try_convert_index_to_native( item_to_type(item) ) - # 3 extra queries for index, 2 for iter and 1 for tolist + # 2 extra queries for index, 1 for iter and 1 for tolist with SqlCounter( - query_count=3 if item_type_name == "index" else 0, join_count=0 + query_count=2 if item_type_name == "index" else 0, join_count=0 ): snowpark_err_msg = ( "Must have equal len keys and value when setting with an iterable" @@ -3113,9 +3113,9 @@ def loc_set_helper(df): else: # Both Snowpark pandas and Native pandas should have same non-error behavior. 
- # 4 extra queries for index, 1 for converting to native pandas in loc_set_helper, 2 for iter and 1 for tolist + # 3 extra queries for index, 1 for converting to native pandas in loc_set_helper, 1 for iter and 1 for tolist with SqlCounter( - query_count=5 if item_type_name == "index" else 1, join_count=2 + query_count=4 if item_type_name == "index" else 1, join_count=2 ): eval_snowpark_pandas_result( snow_df, native_df, loc_set_helper, inplace=True @@ -3711,7 +3711,7 @@ def loc_set_helper(df): df.loc[_key] = try_convert_index_to_native(item) - # 4 extra queries, 1 for converting to native pandas in loc_set_helper, 2 for iter and 1 for tolist - with SqlCounter(query_count=5, join_count=2): + # 3 extra queries, 1 for converting to native pandas in loc_set_helper, 1 for iter and 1 for tolist + with SqlCounter(query_count=4, join_count=2): eval_snowpark_pandas_result( simple_snowpark_pandas_df, simple_native_pandas_df, @@ -4094,7 +4094,7 @@ def locset(df): df.loc[:] = obj return df - query_count = 1 if isinstance(row_obj, list) else 4 + query_count = 1 if isinstance(row_obj, list) else 3 with SqlCounter(query_count=query_count): eval_snowpark_pandas_result( snow_df, @@ -4293,7 +4293,7 @@ def test_df_loc_set_series_value_slice_key(key, row_loc): assert_snowpark_pandas_equals_to_pandas_without_dtypecheck(snow_df, native_df) -@sql_count_checker(query_count=3) +@sql_count_checker(query_count=2) def test_fix_1829928(): vars = [ -0.974507, diff --git a/tests/integ/modin/frame/test_mask.py b/tests/integ/modin/frame/test_mask.py index 40c19804ee..1e8f792aeb 100644 --- a/tests/integ/modin/frame/test_mask.py +++ b/tests/integ/modin/frame/test_mask.py @@ -437,7 +437,7 @@ def test_dataframe_mask_not_implemented(test_data, test_cond, test_others): snow_dfs[0].mask(snow_dfs[1], snow_dfs[2], axis=1) -@sql_count_checker(query_count=2, join_count=2) +@sql_count_checker(query_count=1, join_count=2) def test_dataframe_mask_cond_is_array(caplog): data = [[1, 2], [3, 4]] cond = np.array([[True, False], [False, True]]) @@ -448,7 +448,7 @@ def test_dataframe_mask_cond_is_array(caplog): 
eval_snowpark_pandas_result(snow_df, native_df, lambda df: df.mask(cond)) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=0) def test_dataframe_mask_cond_is_array_wrong_size_negative(): data = [[1, 2, 3], [3, 4, 5], [5, 6, 7]] cond = np.array([[True, False], [False, True]]) @@ -500,7 +500,7 @@ def __call__(self, df): ) -@sql_count_checker(query_count=2, join_count=2) +@sql_count_checker(query_count=1, join_count=2) def test_dataframe_mask_other_is_array(): data = [[1, 3], [2, 4]] other = np.array([[99, -99], [101, -101]]) @@ -513,7 +513,7 @@ def test_dataframe_mask_other_is_array(): ) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=0) def test_dataframe_mask_other_is_array_wrong_size_negative(): data = [[1, 2, 3], [3, 4, 5], [5, 6, 7]] other = np.array([[99, -99], [101, -101]]) @@ -543,7 +543,7 @@ def test_dataframe_mask_sizes_do_not_match_negative_test(test_data, test_cond): snow_df.mask(snow_cond_df) -@sql_count_checker(query_count=2, join_count=3) +@sql_count_checker(query_count=1, join_count=3) def test_dataframe_mask_with_np_array_cond(): data = [1, 2, 3] cond = np.array([[False, True, False]]).T @@ -570,7 +570,7 @@ def test_dataframe_mask_with_np_array_cond(): ) -@sql_count_checker(query_count=2, join_count=4) +@sql_count_checker(query_count=1, join_count=4) def test_dataframe_mask_with_np_array_cond_mismatched_labels(): data = [1, 2, 3] cond = np.array([[False, True, False]]).T @@ -827,9 +827,7 @@ def test_mask_with_scalar_cond(cond): native_ser = native_pd.DataFrame([[1, 2, 3]]) snow_ser = pd.DataFrame(native_ser) - sql_count = 1 if isinstance(cond, list) else 0 - - with SqlCounter(query_count=sql_count): + with SqlCounter(query_count=0): eval_snowpark_pandas_result( snow_ser, native_ser, @@ -863,7 +861,7 @@ def perform_mask(df): ) -@sql_count_checker(query_count=2, join_count=3) +@sql_count_checker(query_count=1, join_count=3) @pytest.mark.parametrize( "data", [[10], [10, 11, 12], [10, 11, 12, 13]], @@ -908,7 +906,7 @@ 
def perform_mask(df): ) -@sql_count_checker(query_count=2, join_count=3, union_count=1) +@sql_count_checker(query_count=1, join_count=3, union_count=1) @pytest.mark.parametrize( "data", [[10], [10, 11, 12], [10, 11, 12, 13]], diff --git a/tests/integ/modin/frame/test_merge.py b/tests/integ/modin/frame/test_merge.py index 7b6b091b29..5a788830d1 100644 --- a/tests/integ/modin/frame/test_merge.py +++ b/tests/integ/modin/frame/test_merge.py @@ -1003,25 +1003,24 @@ def test_merge_no_join_keys_common_index_with_data_negative(left_df, right_df): @pytest.mark.parametrize( - "left_on, right_on, expected_query_count, expected_join_count", + "left_on, right_on, expected_join_count", [ - (np.array(["a", "b", "c", "x", "y"]), "right_d", 5, 2), - ([np.array(["a", "b", "c", "x", "y"]), "A"], ["right_d", "A"], 5, 2), - ("left_d", np.array(["a", "b", "c", "x", "y"]), 5, 2), - (["left_d", "A"], [np.array(["a", "b", "c", "x", "y"]), "A"], 5, 2), - (["left_d", "A"], (np.array(["a", "b", "c", "x", "y"]), "A"), 5, 2), # tuple + (np.array(["a", "b", "c", "x", "y"]), "right_d", 2), + ([np.array(["a", "b", "c", "x", "y"]), "A"], ["right_d", "A"], 2), + ("left_d", np.array(["a", "b", "c", "x", "y"]), 2), + (["left_d", "A"], [np.array(["a", "b", "c", "x", "y"]), "A"], 2), + (["left_d", "A"], (np.array(["a", "b", "c", "x", "y"]), "A"), 2), # tuple ( np.array(["a", "b", "c", "x", "y"]), np.array(["x", "y", "c", "a", "b"]), - 7, 3, ), ], ) def test_merge_on_array_like_keys( - left_df, right_df, left_on, right_on, how, expected_query_count, expected_join_count + left_df, right_df, left_on, right_on, how, expected_join_count ): - with SqlCounter(query_count=expected_query_count, join_count=expected_join_count): + with SqlCounter(query_count=3, join_count=expected_join_count): _verify_merge(left_df, right_df, how=how, left_on=left_on, right_on=right_on) @@ -1052,7 +1051,7 @@ def test_merge_on_array_like_keys_conflict_negative(left_df, right_df): np.array(["a", "b", "c", "a", "b", "c"]), # too long 
], ) -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=0) def test_merge_on_array_like_keys_length_mismatch_negative(left_df, right_df, left_on): # Native pandas raises # ValueError: The truth value of an array with more than one element is ambiguous diff --git a/tests/integ/modin/frame/test_repr.py b/tests/integ/modin/frame/test_repr.py index 80abf4fd3e..ff88bb9798 100644 --- a/tests/integ/modin/frame/test_repr.py +++ b/tests/integ/modin/frame/test_repr.py @@ -194,7 +194,7 @@ def test_with_max_rows_none(self): native_pd.set_option("display.max_rows", None) pd.set_option("display.max_rows", None) - with SqlCounter(select_count=2): + with SqlCounter(select_count=1): snow_str = repr(self.snow_df) native_str = repr(self.native_df) diff --git a/tests/integ/modin/frame/test_sample.py b/tests/integ/modin/frame/test_sample.py index 7153d1d7d0..ab9b23070e 100644 --- a/tests/integ/modin/frame/test_sample.py +++ b/tests/integ/modin/frame/test_sample.py @@ -18,7 +18,7 @@ def ignore_index(request): @pytest.mark.modin_sp_precommit -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=0) def test_df_sample_cols(): data = np.random.randint(100, size=(20, 20)) @@ -47,7 +47,7 @@ def test_df_sample_rows_n(data, n, ignore_index): @pytest.mark.parametrize("n", [0, 1, 10, 20, 30]) -@sql_count_checker(query_count=5, join_count=1) +@sql_count_checker(query_count=4, join_count=1) def test_df_sample_rows_n_replace(n, ignore_index): sample_df = pd.DataFrame(np.random.randint(100, size=(20, 20))).sample( n=n, replace=True, ignore_index=ignore_index @@ -67,7 +67,7 @@ def test_df_sample_rows_frac(frac, ignore_index): @pytest.mark.parametrize("frac", [0, 0.1, 0.9, 1, 1.1, 1.9, 2]) -@sql_count_checker(query_count=4, join_count=1) +@sql_count_checker(query_count=3, join_count=1) def test_df_sample_rows_frac_replace(frac, ignore_index): sample_df = pd.DataFrame(np.random.randint(100, size=(20, 20))).sample( frac=frac, replace=True, ignore_index=ignore_index diff --git 
a/tests/integ/modin/frame/test_setitem.py b/tests/integ/modin/frame/test_setitem.py index 42f9809523..b027b7c8db 100644 --- a/tests/integ/modin/frame/test_setitem.py +++ b/tests/integ/modin/frame/test_setitem.py @@ -394,9 +394,9 @@ def func_insert_new_column(df, column): inplace=True, ) else: - # 3 extra queries, 2 for iter and 1 for tolist + # 2 extra queries, 1 for iter and 1 for tolist with SqlCounter( - query_count=4 + query_count=3 if isinstance(column, native_pd.Index) and not isinstance(column, native_pd.DatetimeIndex) else 1, diff --git a/tests/integ/modin/frame/test_shape.py b/tests/integ/modin/frame/test_shape.py index 4f900988cc..ae27d1ad8e 100644 --- a/tests/integ/modin/frame/test_shape.py +++ b/tests/integ/modin/frame/test_shape.py @@ -29,7 +29,7 @@ "timedelta", ], ) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=0) def test_dataframe_shape_param(dataframe_input): eval_snowpark_pandas_result( pd.DataFrame(dataframe_input), @@ -39,7 +39,7 @@ def test_dataframe_shape_param(dataframe_input): ) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=0) def test_dataframe_shape_index_empty(empty_index_native_pandas_dataframe): eval_snowpark_pandas_result( pd.DataFrame(empty_index_native_pandas_dataframe), diff --git a/tests/integ/modin/frame/test_size.py b/tests/integ/modin/frame/test_size.py index 21d5096921..7897fd2d64 100644 --- a/tests/integ/modin/frame/test_size.py +++ b/tests/integ/modin/frame/test_size.py @@ -15,9 +15,9 @@ @pytest.mark.parametrize( "args, kwargs, expected_query_count", [ - ([{"A": [1, 2], "B": [3, 4], "C": [5, 6]}], {}, 1), - ([{"A": [], "B": []}], {}, 1), - ([np.random.rand(100, 10)], {}, 4), + ([{"A": [1, 2], "B": [3, 4], "C": [5, 6]}], {}, 0), + ([{"A": [], "B": []}], {}, 0), + ([np.random.rand(100, 10)], {}, 0), ( [{"Value": [10, 20, 30, 40]}], { @@ -27,7 +27,7 @@ }, 1, ), - ([[pd.Timedelta(1), 1]], {}, 1), + ([[pd.Timedelta(1), 1]], {}, 0), ], ids=[ "non-empty 2x3", @@ -47,7 +47,7 @@ def 
test_dataframe_size_param(args, kwargs, expected_query_count): ) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=0) def test_dataframe_size_index_empty(empty_index_native_pandas_dataframe): eval_snowpark_pandas_result( pd.DataFrame(empty_index_native_pandas_dataframe), diff --git a/tests/integ/modin/frame/test_squeeze.py b/tests/integ/modin/frame/test_squeeze.py index a5db2e7de9..6fa9a70dc6 100644 --- a/tests/integ/modin/frame/test_squeeze.py +++ b/tests/integ/modin/frame/test_squeeze.py @@ -22,12 +22,7 @@ def axis(request): @pytest.mark.parametrize("dtype", ["int", "timedelta64[ns]"]) def test_n_by_1(axis, dtype): - if axis == 1 or axis == "columns": - expected_query_count = 1 - else: - expected_query_count = 2 - - with SqlCounter(query_count=expected_query_count): + with SqlCounter(query_count=1): eval_snowpark_pandas_result( *create_test_dfs([1, 2, 3], dtype=dtype), lambda df: df.squeeze(axis=axis), @@ -37,8 +32,6 @@ def test_n_by_1(axis, dtype): @pytest.mark.parametrize("dtype", ["int", "timedelta64[ns]"]) def test_1_by_n(axis, dtype): if axis is None: - expected_query_count = 3 - elif axis in [0, "index"]: expected_query_count = 2 else: expected_query_count = 1 @@ -50,7 +43,7 @@ def test_1_by_n(axis, dtype): def test_2d(axis): - with SqlCounter(query_count=1 if axis in [1, "columns"] else 2): + with SqlCounter(query_count=1): eval_snowpark_pandas_result( *create_test_dfs( { @@ -67,12 +60,8 @@ def test_2d(axis): "scalar", [param(pd.Timedelta(1), id="timedelta"), param(1, id="int")] ) def test_scalar(axis, scalar): - if axis == 1 or axis == "columns": - expected_query_count = 1 - else: - expected_query_count = 2 snow_df, native_df = create_test_dfs([scalar]) - with SqlCounter(query_count=expected_query_count): + with SqlCounter(query_count=1): if axis is None: assert scalar == snow_df.squeeze() else: diff --git a/tests/integ/modin/frame/test_to_html.py b/tests/integ/modin/frame/test_to_html.py index 1cf78ac186..c62cf8dadd 100644 --- 
a/tests/integ/modin/frame/test_to_html.py +++ b/tests/integ/modin/frame/test_to_html.py @@ -30,7 +30,7 @@ """ -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=1) def test_to_html(): df = pd.DataFrame(data={"col1": [1, 2], "col2": [4, 3]}) assert html_string == df.to_html() diff --git a/tests/integ/modin/frame/test_to_string.py b/tests/integ/modin/frame/test_to_string.py index b4e76cd99f..a56a8a8961 100644 --- a/tests/integ/modin/frame/test_to_string.py +++ b/tests/integ/modin/frame/test_to_string.py @@ -9,7 +9,7 @@ from tests.integ.utils.sql_counter import sql_count_checker -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=1) def test_to_string(): native_df = native_pd.DataFrame(data={"col1": [1, 2], "col2": [4, 3]}) snow_df = pd.DataFrame(native_df) diff --git a/tests/integ/modin/frame/test_value_counts.py b/tests/integ/modin/frame/test_value_counts.py index 8a06fca35a..d597d55dd6 100644 --- a/tests/integ/modin/frame/test_value_counts.py +++ b/tests/integ/modin/frame/test_value_counts.py @@ -74,7 +74,7 @@ def test_value_counts_subset_negative(test_data, subset): snow_df = pd.DataFrame(test_data) native_df = native_pd.DataFrame(test_data) - with SqlCounter(query_count=1 if len(subset) > 0 else 0): + with SqlCounter(query_count=0): eval_snowpark_pandas_result( snow_df, native_df, @@ -180,7 +180,7 @@ def test_value_counts_dropna(test_data, dropna): ) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=0) def test_non_existing_labels(): # when subset contains non-existing labels, it is unimplemented # because of function `get_frame_with_groupby_columns_as_index` diff --git a/tests/integ/modin/frame/test_where.py b/tests/integ/modin/frame/test_where.py index 02f9de9ef0..c015ca4fdf 100644 --- a/tests/integ/modin/frame/test_where.py +++ b/tests/integ/modin/frame/test_where.py @@ -443,7 +443,7 @@ def test_dataframe_where_not_implemented(test_data, test_cond, test_others): snow_dfs[0].where(snow_dfs[1], snow_dfs[2], axis=1) 
-@sql_count_checker(query_count=2, join_count=2) +@sql_count_checker(query_count=1, join_count=2) def test_dataframe_where_cond_is_array(caplog): data = [[1, 2], [3, 4]] cond = np.array([[True, False], [False, True]]) @@ -454,7 +454,7 @@ def test_dataframe_where_cond_is_array(caplog): eval_snowpark_pandas_result(snow_df, native_df, lambda df: df.where(cond)) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=0) def test_dataframe_where_cond_is_array_wrong_size_negative(): data = [[1, 2, 3], [3, 4, 5], [5, 6, 7]] cond = np.array([[True, False], [False, True]]) @@ -506,7 +506,7 @@ def __call__(self, df): ) -@sql_count_checker(query_count=2, join_count=2) +@sql_count_checker(query_count=1, join_count=2) def test_dataframe_where_other_is_array(): data = [[1, 3], [2, 4]] other = np.array([[99, -99], [101, -101]]) @@ -519,7 +519,7 @@ def test_dataframe_where_other_is_array(): ) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=0) def test_dataframe_where_other_is_array_wrong_size_negative(): data = [[1, 2, 3], [3, 4, 5], [5, 6, 7]] other = np.array([[99, -99], [101, -101]]) @@ -549,7 +549,7 @@ def test_dataframe_where_sizes_do_not_match_negative_test(test_data, test_cond): snow_df.where(snow_cond_df) -@sql_count_checker(query_count=2, join_count=3) +@sql_count_checker(query_count=1, join_count=3) def test_dataframe_where_with_np_array_cond(): data = [1, 2, 3] cond = np.array([[False, True, False]]).T @@ -864,9 +864,7 @@ def test_where_with_scalar_cond(cond): native_ser = native_pd.DataFrame([[1, 2, 3]]) snow_ser = pd.DataFrame(native_ser) - sql_count = 1 if isinstance(cond, list) else 0 - - with SqlCounter(query_count=sql_count): + with SqlCounter(query_count=0): eval_snowpark_pandas_result( snow_ser, native_ser, @@ -900,7 +898,7 @@ def perform_where(df): ) -@sql_count_checker(query_count=2, join_count=3) +@sql_count_checker(query_count=1, join_count=3) @pytest.mark.parametrize( "data", [[10], [10, 11, 12], [10, 11, 12, 13]], @@ -945,7 
+943,7 @@ def perform_where(df): ) -@sql_count_checker(query_count=2, join_count=3, union_count=1) +@sql_count_checker(query_count=1, join_count=3, union_count=1) @pytest.mark.parametrize( "data", [[10], [10, 11, 12], [10, 11, 12, 13]], diff --git a/tests/integ/modin/groupby/test_groupby_bfill_ffill.py b/tests/integ/modin/groupby/test_groupby_bfill_ffill.py index 49378ac2b0..52703970c9 100644 --- a/tests/integ/modin/groupby/test_groupby_bfill_ffill.py +++ b/tests/integ/modin/groupby/test_groupby_bfill_ffill.py @@ -14,7 +14,7 @@ from tests.integ.modin.utils import ( eval_snowpark_pandas_result as _eval_snowpark_pandas_result, ) -from tests.integ.utils.sql_counter import SqlCounter, sql_count_checker +from tests.integ.utils.sql_counter import sql_count_checker def eval_snowpark_pandas_result(*args, **kwargs): @@ -184,11 +184,9 @@ def test_groupby_bfill_ffill_multiindex_with_level(method, level, limit): @pytest.mark.parametrize("method", ["bfill", "ffill"]) -@pytest.mark.parametrize( - "by_info", [(["I", "A"], 1), (["A"], 0), (["A", "B"], 1), (10, 0)] -) -def test_groupby_bfill_ffill_multiindex_negative(method, by_info): - by_list, expected_query_count = by_info +@pytest.mark.parametrize("by_list", [["I", "A"], ["A"], ["A", "B"], 10]) +@sql_count_checker(query_count=0) +def test_groupby_bfill_ffill_multiindex_negative(method, by_list): native_df = native_pd.DataFrame( TEST_DF_DATA_2, index=TEST_DF_INDEX_2, columns=TEST_DF_COLUMNS_2 ) @@ -199,17 +197,15 @@ def test_groupby_bfill_ffill_multiindex_negative(method, by_info): by_list = None else: level = None - - with SqlCounter(query_count=expected_query_count): - eval_snowpark_pandas_result( - snow_df, - native_df, - lambda df: getattr(df.groupby(by_list, level=level, axis=0), method)( - limit=None - ), - expect_exception=True, - expect_exception_type=IndexError if level is not None else KeyError, - ) + eval_snowpark_pandas_result( + snow_df, + native_df, + lambda df: getattr(df.groupby(by_list, level=level, axis=0), 
method)( + limit=None + ), + expect_exception=True, + expect_exception_type=IndexError if level is not None else KeyError, + ) @pytest.mark.parametrize("method", ["bfill", "ffill"]) diff --git a/tests/integ/modin/groupby/test_groupby_default2pandas.py b/tests/integ/modin/groupby/test_groupby_default2pandas.py index a7e70b2c42..40dfa4ed0b 100644 --- a/tests/integ/modin/groupby/test_groupby_default2pandas.py +++ b/tests/integ/modin/groupby/test_groupby_default2pandas.py @@ -93,15 +93,12 @@ def test_groupby_axis_1_mi(group_name): ["col1", lambda x: x + 1, lambda x: x % 3, "col2"], ], ) +@sql_count_checker(query_count=0) def test_groupby_with_callable_and_array(basic_snowpark_pandas_df, by) -> None: - expected_query_count = 0 - if isinstance(by, list): - expected_query_count = 1 - with SqlCounter(query_count=expected_query_count): - with pytest.raises( - NotImplementedError, match=AGGREGATE_UNSUPPORTED_GROUPING_ERROR_PATTERN - ): - basic_snowpark_pandas_df.groupby(by).min() + with pytest.raises( + NotImplementedError, match=AGGREGATE_UNSUPPORTED_GROUPING_ERROR_PATTERN + ): + basic_snowpark_pandas_df.groupby(by).min() @sql_count_checker(query_count=0) @@ -113,7 +110,7 @@ def test_timeseries_groupby_with_callable(tsframe): snow_ts_df.groupby(lambda x: x.month).agg(np.percentile, 80, axis=0) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=0) def test_groupby_with_numpy_array(basic_snowpark_pandas_df) -> None: by = [1, 1, 4, 2, 2, 4] with pytest.raises( @@ -126,7 +123,7 @@ def test_groupby_with_numpy_array(basic_snowpark_pandas_df) -> None: "by_list", [[2, 1, 1, 2, 3, 3], [[2, 1, 1, 2, 3, 3], "a"]], ) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=0) def test_groupby_series_with_numpy_array(native_series_multi_numeric, by_list) -> None: with pytest.raises( NotImplementedError, match=AGGREGATE_UNSUPPORTED_GROUPING_ERROR_PATTERN @@ -145,7 +142,7 @@ def test_groupby_with_external_series(basic_snowpark_pandas_df) -> None: ): 
basic_snowpark_pandas_df.groupby(by=snowpark_pandas_series).sum() - with SqlCounter(query_count=1): + with SqlCounter(query_count=0): by_list = ["col1", "col2", snowpark_pandas_series] with pytest.raises( NotImplementedError, match=AGGREGATE_UNSUPPORTED_GROUPING_ERROR_PATTERN diff --git a/tests/integ/modin/groupby/test_groupby_fillna.py b/tests/integ/modin/groupby/test_groupby_fillna.py index e7df92b8d4..1e26721a74 100644 --- a/tests/integ/modin/groupby/test_groupby_fillna.py +++ b/tests/integ/modin/groupby/test_groupby_fillna.py @@ -219,14 +219,11 @@ def test_groupby_fillna_multiindex_ffill_bfill_with_level( @pytest.mark.parametrize("method_or_value", METHOD_OR_VALUES) @pytest.mark.parametrize("fillna_axis", [None, 1]) -@pytest.mark.parametrize( - "by_info", [(["I", "A"], 1), (["A"], 0), (["A", "B"], 1), (10, 0)] -) +@pytest.mark.parametrize("by_list", [["I", "A"], ["A"], ["A", "B"], 10]) def test_groupby_fillna_multiindex_ffill_bfill_negative( - method_or_value, fillna_axis, by_info + method_or_value, fillna_axis, by_list ): method, value = method_or_value - by_list, expected_query_count = by_info native_df = native_pd.DataFrame( TEST_DF_DATA_2, index=TEST_DF_INDEX_2, columns=TEST_DF_COLUMNS_2 ) @@ -238,7 +235,7 @@ def test_groupby_fillna_multiindex_ffill_bfill_negative( else: level = None - with SqlCounter(query_count=expected_query_count): + with SqlCounter(query_count=0): eval_snowpark_pandas_result( snow_df, native_df, diff --git a/tests/integ/modin/groupby/test_groupby_negative.py b/tests/integ/modin/groupby/test_groupby_negative.py index 6c34670261..33745ad0e4 100644 --- a/tests/integ/modin/groupby/test_groupby_negative.py +++ b/tests/integ/modin/groupby/test_groupby_negative.py @@ -25,21 +25,18 @@ @pytest.mark.parametrize( - "invalid_by, expected_query_count", + "invalid_by", [ - (["col1"], 0), - (None, 0), - ([], 0), - ("non_exist_by", 0), - ( - ["col2", "non_exist_by"], - 1, - ), # non existing label in list leads to count query - (("col2", "col3"), 0), 
+ ["col1"], + None, + [], + "non_exist_by", + ["col2", "non_exist_by"], + ("col2", "col3"), ], ) -def test_invalid_by(invalid_by, expected_query_count) -> None: - snowpark_pandas_df = pd.DataFrame( +def test_invalid_by(invalid_by) -> None: + pandas_df = native_pd.DataFrame( { "col1": [0, 1, 1, 0], "col2": [4, 5, 36, 7], @@ -48,10 +45,10 @@ def test_invalid_by(invalid_by, expected_query_count) -> None: } ) # rename the columns to have duplicated column names - snowpark_pandas_df.columns = ["col1", "col2", "col3", "col1"] - pandas_df = snowpark_pandas_df.to_pandas() + pandas_df.columns = ["col1", "col2", "col3", "col1"] + snowpark_pandas_df = pd.DataFrame(pandas_df) - with SqlCounter(query_count=expected_query_count): + with SqlCounter(query_count=0): eval_snowpark_pandas_result( snowpark_pandas_df, pandas_df, @@ -60,7 +57,7 @@ def test_invalid_by(invalid_by, expected_query_count) -> None: ) -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=1) def test_invalid_none_label(): snowpark_pandas_df = pd.DataFrame( { diff --git a/tests/integ/modin/groupby/test_quantile.py b/tests/integ/modin/groupby/test_quantile.py index bce7a6a05f..f312d5b09c 100644 --- a/tests/integ/modin/groupby/test_quantile.py +++ b/tests/integ/modin/groupby/test_quantile.py @@ -251,7 +251,7 @@ def test_quantile_raises(): df.groupby("key").quantile().to_pandas() -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=0) def test_quantile_out_of_bounds_q_raises(): # https://github.com/pandas-dev/pandas/issues/27470 df = pd.DataFrame({"a": [0, 0, 0, 1, 1, 1], "b": range(6)}) diff --git a/tests/integ/modin/groupby/test_value_counts.py b/tests/integ/modin/groupby/test_value_counts.py index f80df364cd..db70e13507 100644 --- a/tests/integ/modin/groupby/test_value_counts.py +++ b/tests/integ/modin/groupby/test_value_counts.py @@ -154,21 +154,19 @@ def test_value_counts_as_index(test_data, by, groupby_sort, sort, as_index): (["by", "bad_key"], ValueError), # subset cannot overlap 
with grouping columns ], ) +@sql_count_checker(query_count=0) def test_value_counts_bad_subset(subset, exception_cls): - # for KeyError, 1 query always runs to validate the length of the by list - with SqlCounter(query_count=1 if exception_cls is KeyError else 0): - eval_snowpark_pandas_result( - *create_test_dfs(TEST_DATA[0]), - lambda x: x.groupby(by=["by"]).value_counts(subset=subset), - expect_exception=True, - expect_exception_type=exception_cls, - assert_exception_equal=False, - ) + eval_snowpark_pandas_result( + *create_test_dfs(TEST_DATA[0]), + lambda x: x.groupby(by=["by"]).value_counts(subset=subset), + expect_exception=True, + expect_exception_type=exception_cls, + assert_exception_equal=False, + ) -# An additional query is needed to validate the length of the by list # A JOIN is needed to set the index to the by list -@sql_count_checker(query_count=2, join_count=1) +@sql_count_checker(query_count=1, join_count=1) def test_value_counts_series(): by = ["a", "a", "b", "b", "a", "c"] native_ser = native_pd.Series( @@ -180,8 +178,7 @@ def test_value_counts_series(): ) -# 1 query always runs to validate the length of the by list -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=0) def test_value_counts_bins_unimplemented(): by = ["a", "a", "b", "b", "a", "c"] native_ser = native_pd.Series( diff --git a/tests/integ/modin/index/test_index_methods.py b/tests/integ/modin/index/test_index_methods.py index c20696b2ab..d7525b9cf0 100644 --- a/tests/integ/modin/index/test_index_methods.py +++ b/tests/integ/modin/index/test_index_methods.py @@ -155,7 +155,7 @@ def test_index_summary(native_index): assert snow_index._summary() == native_index._summary() -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=0) @pytest.mark.parametrize("native_index", NATIVE_INDEX_TEST_DATA) def test_index_size(native_index): snow_index = pd.Index(native_index) @@ -163,14 +163,14 @@ def test_index_size(native_index): @pytest.mark.parametrize("native_df", 
TEST_DFS) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=0) def test_df_index_size(native_df): snow_df = pd.DataFrame(native_df) assert snow_df.index.size == native_df.index.size assert snow_df.columns.size == native_df.columns.size -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=0) @pytest.mark.parametrize("native_index", NATIVE_INDEX_TEST_DATA) def test_index_empty(native_index): snow_index = pd.Index(native_index) @@ -178,14 +178,14 @@ def test_index_empty(native_index): @pytest.mark.parametrize("native_df", TEST_DFS) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=0) def test_df_index_empty(native_df): snow_df = pd.DataFrame(native_df) assert snow_df.index.empty == native_df.index.empty assert snow_df.columns.empty == native_df.columns.empty -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=0) @pytest.mark.parametrize("native_index", NATIVE_INDEX_TEST_DATA) def test_index_shape(native_index): snow_index = pd.Index(native_index) @@ -193,7 +193,7 @@ def test_index_shape(native_index): @pytest.mark.parametrize("native_df", TEST_DFS) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=0) def test_df_index_shape(native_df): snow_df = pd.DataFrame(native_df) assert snow_df.index.shape == native_df.index.shape diff --git a/tests/integ/modin/interoperability/plotly/test_plotly.py b/tests/integ/modin/interoperability/plotly/test_plotly.py index 63de8969a5..181e7cedad 100644 --- a/tests/integ/modin/interoperability/plotly/test_plotly.py +++ b/tests/integ/modin/interoperability/plotly/test_plotly.py @@ -202,7 +202,7 @@ def test_box(test_dfs): ) -@sql_count_checker(query_count=4) +@sql_count_checker(query_count=3) def test_imshow(test_dfs): eval_snowpark_pandas_result( *test_dfs, diff --git a/tests/integ/modin/interoperability/scikit-learn/test_scikit_learn.py b/tests/integ/modin/interoperability/scikit-learn/test_scikit_learn.py index 7cff7c7803..48b2e51662 100644 --- 
a/tests/integ/modin/interoperability/scikit-learn/test_scikit_learn.py +++ b/tests/integ/modin/interoperability/scikit-learn/test_scikit_learn.py @@ -126,7 +126,7 @@ def get_predictions(df) -> np.ndarray: class TestClustering: - @sql_count_checker(query_count=3) + @sql_count_checker(query_count=2) def test_clustering(self, test_dfs): def get_cluster_centers(df) -> np.ndarray: return KMeans(n_clusters=3).fit(df).cluster_centers_ @@ -137,7 +137,7 @@ def get_cluster_centers(df) -> np.ndarray: class TestDimensionalityReduction: - @sql_count_checker(query_count=3) + @sql_count_checker(query_count=2) def test_principal_component_analysis(self, test_dfs): def get_principal_components(df) -> np.ndarray: return PCA(n_components=2).fit(df).components_ @@ -192,7 +192,7 @@ def validate_search_results(snow_estimator, pandas_estimator): class TestPreprocessing: - @sql_count_checker(query_count=5) + @sql_count_checker(query_count=4) def test_maxabs(self, test_dfs): eval_snowpark_pandas_result( *test_dfs, diff --git a/tests/integ/modin/io/test_read_snowflake.py b/tests/integ/modin/io/test_read_snowflake.py index b6253aa273..e680a07000 100644 --- a/tests/integ/modin/io/test_read_snowflake.py +++ b/tests/integ/modin/io/test_read_snowflake.py @@ -547,7 +547,7 @@ def test_read_snowflake_row_access_policy_table( f"alter table {test_table_name} add row access policy no_access_policy on (col1)" ).collect() - expected_query_count = 4 if enforce_ordering else 2 + expected_query_count = 3 if enforce_ordering else 1 with SqlCounter(query_count=expected_query_count): df = read_snowflake_and_verify_snapshot_creation_if_any( session, test_table_name, as_query, True, enforce_ordering diff --git a/tests/integ/modin/io/test_read_snowflake_query_order_by.py b/tests/integ/modin/io/test_read_snowflake_query_order_by.py index 6fb34078e4..a179b84a36 100644 --- a/tests/integ/modin/io/test_read_snowflake_query_order_by.py +++ b/tests/integ/modin/io/test_read_snowflake_query_order_by.py @@ -191,7 +191,7 
@@ def test_order_by_with_no_limit_but_colname_shadows(session, caplog, enforce_ord @pytest.mark.parametrize("enforce_ordering", [True, False]) def test_order_by_with_limit_and_name_shadows(session, caplog, enforce_ordering): - expected_query_count = 6 if enforce_ordering else 3 + expected_query_count = 5 if enforce_ordering else 2 with SqlCounter(query_count=expected_query_count): table_name = Utils.random_name_for_temp_object(TempObjectType.TABLE) native_df = native_pd.DataFrame( diff --git a/tests/integ/modin/series/test_empty.py b/tests/integ/modin/series/test_empty.py index 2fd78ad464..b8e14c789c 100644 --- a/tests/integ/modin/series/test_empty.py +++ b/tests/integ/modin/series/test_empty.py @@ -33,7 +33,7 @@ "empty series with only index", ], ) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=0) def test_series_empty(args, kwargs): eval_snowpark_pandas_result( pd.Series(*args, **kwargs), diff --git a/tests/integ/modin/series/test_isin.py b/tests/integ/modin/series/test_isin.py index 4c8c933786..9759bc5eea 100644 --- a/tests/integ/modin/series/test_isin.py +++ b/tests/integ/modin/series/test_isin.py @@ -61,11 +61,11 @@ def _test_isin_with_snowflake_logic(s, values): (np.array([]), 3), (np.array([1, 2, 1]), 3), (np.array([None, 1, 2]), 3), - (native_pd.Series(), 5), + (native_pd.Series(), 4), # (native_pd.Series([2, 3], index=["A", "B"]), 1), # not supported anymore because of index type mismatch # (native_pd.Series(index=["A", "B"]), 1), # not supported anymore because of index type mismatch - (native_pd.Series([None, -10]), 5), - (native_pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}), 4), + (native_pd.Series([None, -10]), 4), + (native_pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}), 3), (native_pd.Index([4, 5, 6]), 5), ], ) @@ -115,10 +115,10 @@ def test_isin_with_incompatible_index(values, expected_query_count): @pytest.mark.parametrize( "data,values,expected_query_count", [ - ([], native_pd.Series([]), 5), - ([1, 2, 3], 
native_pd.Series([]), 5), - ([], native_pd.Series([2, 3, 4]), 5), - ([1, 2, 3, 8], native_pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}), 4), + ([], native_pd.Series([]), 4), + ([1, 2, 3], native_pd.Series([]), 4), + ([], native_pd.Series([2, 3, 4]), 4), + ([1, 2, 3, 8], native_pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}), 3), (["A", "B", ""], [], 3), (["A", "B", ""], ["A"], 3), (["A", "B", ""], ["A", "B", "C", "D"], 3), @@ -162,7 +162,7 @@ def test_isin_various_combos(data, values, expected_query_count): ) -@sql_count_checker(query_count=2, join_count=1) +@sql_count_checker(query_count=1, join_count=1) def test_isin_lazy(): s_data = [1, 2, 3, 4, 5] df_data = {"a": [1, 2, "test"], "b": [4, 5, 6]} diff --git a/tests/integ/modin/series/test_len.py b/tests/integ/modin/series/test_len.py index df76cd937b..e50a0287fe 100644 --- a/tests/integ/modin/series/test_len.py +++ b/tests/integ/modin/series/test_len.py @@ -18,7 +18,7 @@ ([1, 2, None], 3), ], ) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=0) def test_len(sample, expected_len): snow = pd.Series(sample) native = native_pd.Series(sample) diff --git a/tests/integ/modin/series/test_loc.py b/tests/integ/modin/series/test_loc.py index bdc10b4463..9b5dfabed3 100644 --- a/tests/integ/modin/series/test_loc.py +++ b/tests/integ/modin/series/test_loc.py @@ -773,12 +773,12 @@ def loc_set_helper(s): s.loc[_row_key] = _item query_count = 1 - # 5 extra queries: sum of two cases below + # 4 extra queries: sum of two cases below if item_type.startswith("index") and key_type.startswith("index"): - query_count = 6 - # 4 extra queries: 1 query to convert item index to pandas in loc_set_helper, 2 for iter, and 1 for to_list - elif item_type.startswith("index"): query_count = 5 + # 3 extra queries: 1 query to convert item index to pandas in loc_set_helper, 1 for iter, and 1 for to_list + elif item_type.startswith("index"): + query_count = 4 # 1 extra query to convert to series to setitem elif 
key_type.startswith("index"): query_count = 2 diff --git a/tests/integ/modin/series/test_mask.py b/tests/integ/modin/series/test_mask.py index 3960ac752f..c0ab10f8ab 100644 --- a/tests/integ/modin/series/test_mask.py +++ b/tests/integ/modin/series/test_mask.py @@ -103,7 +103,7 @@ def test_series_mask_index_no_names(): ) -@sql_count_checker(query_count=3, join_count=2) +@sql_count_checker(query_count=2, join_count=2) def test_series_mask_with_np_array_cond(): data = [1, 2] cond = np.array([True, False]) @@ -219,9 +219,7 @@ def test_series_mask_with_scalar_cond(cond): native_ser = native_pd.Series([1, 2, 3]) snow_ser = pd.Series(native_ser) - sql_count = 1 if isinstance(cond, list) else 0 - - with SqlCounter(query_count=sql_count): + with SqlCounter(query_count=0): eval_snowpark_pandas_result( snow_ser, native_ser, diff --git a/tests/integ/modin/series/test_ndim.py b/tests/integ/modin/series/test_ndim.py index ba12f1a072..1dc5619710 100644 --- a/tests/integ/modin/series/test_ndim.py +++ b/tests/integ/modin/series/test_ndim.py @@ -22,7 +22,7 @@ "empty column", ], ) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=0) def test_series_ndim(series_input): eval_snowpark_pandas_result( pd.Series(series_input), diff --git a/tests/integ/modin/series/test_sample.py b/tests/integ/modin/series/test_sample.py index c9b638fb25..e61028a3d6 100644 --- a/tests/integ/modin/series/test_sample.py +++ b/tests/integ/modin/series/test_sample.py @@ -39,7 +39,7 @@ def test_series_sample_n(data, n, ignore_index): @pytest.mark.parametrize("n", [None, 0, 1, 8, 10, 20]) -@sql_count_checker(query_count=5, join_count=1) +@sql_count_checker(query_count=4, join_count=1) def test_series_sample_n_replace(n, ignore_index): s = pd.Series(range(100, 110)).sample(n=n, replace=True, ignore_index=ignore_index) assert len(s) == (n if n is not None else 1) @@ -55,7 +55,7 @@ def test_series_sample_frac(frac, ignore_index): @pytest.mark.parametrize("frac", [None, 0, 0.1, 0.5, 0.8, 1, 1.1, 1.5, 
1.8, 2]) -@sql_count_checker(query_count=4, join_count=1) +@sql_count_checker(query_count=3, join_count=1) def test_series_sample_frac_reply(frac, ignore_index): s = pd.Series(range(100, 110)).sample( frac=frac, replace=True, ignore_index=ignore_index diff --git a/tests/integ/modin/series/test_shape.py b/tests/integ/modin/series/test_shape.py index 654323fd14..d920ec770d 100644 --- a/tests/integ/modin/series/test_shape.py +++ b/tests/integ/modin/series/test_shape.py @@ -33,7 +33,7 @@ "empty series with only index", ], ) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=0) def test_series_shape(args, kwargs): eval_snowpark_pandas_result( pd.Series(*args, **kwargs), diff --git a/tests/integ/modin/series/test_size.py b/tests/integ/modin/series/test_size.py index eefabf00c2..d9b67406a2 100644 --- a/tests/integ/modin/series/test_size.py +++ b/tests/integ/modin/series/test_size.py @@ -8,16 +8,16 @@ import snowflake.snowpark.modin.plugin # noqa: F401 from tests.integ.modin.utils import eval_snowpark_pandas_result -from tests.integ.utils.sql_counter import sql_count_checker +from tests.integ.utils.sql_counter import SqlCounter @pytest.mark.parametrize( - "args, kwargs", + "args, kwargs, expected_query_count", [ - ([{"A": [1, 2, 3]}], {}), - ([{"A": []}], {}), - ([[]], {}), - ([None], {}), + ([{"A": [1, 2, 3]}], {}, 0), + ([{"A": []}], {}, 0), + ([[]], {}, 0), + ([None], {}, 0), ( [[1, 2, 3, 4]], { @@ -25,6 +25,7 @@ [["A", "B"], ["C", "D"]], names=["Index1", "Index2"] ) }, + 1, ), ], ids=[ @@ -35,11 +36,11 @@ "multi index", ], ) -@sql_count_checker(query_count=1) -def test_series_size(args, kwargs): - eval_snowpark_pandas_result( - pd.Series(*args, **kwargs), - native_pd.Series(*args, **kwargs), - lambda df: df.size, - comparator=lambda x, y: x == y, - ) +def test_series_size(args, kwargs, expected_query_count): + with SqlCounter(query_count=expected_query_count): + eval_snowpark_pandas_result( + pd.Series(*args, **kwargs), + native_pd.Series(*args, 
**kwargs), + lambda df: df.size, + comparator=lambda x, y: x == y, + ) diff --git a/tests/integ/modin/series/test_squeeze.py b/tests/integ/modin/series/test_squeeze.py index b911d97a83..6c02ec9b93 100644 --- a/tests/integ/modin/series/test_squeeze.py +++ b/tests/integ/modin/series/test_squeeze.py @@ -27,13 +27,13 @@ def axis_negative(request): return request.param -@sql_count_checker(query_count=3) +@sql_count_checker(query_count=2) def test_noop(axis): s = pd.Series([1, 2, 3]) assert_series_equal(s, s.squeeze(axis=axis)) -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=1) def test_squeeze_to_scalar(axis): s = pd.Series([1]) assert 1 == s.squeeze(axis=axis) diff --git a/tests/integ/modin/series/test_to_string.py b/tests/integ/modin/series/test_to_string.py index 7e2074e9c5..d9e1dc8466 100644 --- a/tests/integ/modin/series/test_to_string.py +++ b/tests/integ/modin/series/test_to_string.py @@ -9,7 +9,7 @@ from tests.integ.utils.sql_counter import sql_count_checker -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=1) def test_to_string(): native_ser = native_pd.Series([-1, 5, 6, 2, 4]) snow_ser = pd.Series(native_ser) diff --git a/tests/integ/modin/series/test_where.py b/tests/integ/modin/series/test_where.py index 4630dd9985..46036d6ab7 100644 --- a/tests/integ/modin/series/test_where.py +++ b/tests/integ/modin/series/test_where.py @@ -103,7 +103,7 @@ def test_series_where_index_no_names(): ) -@sql_count_checker(query_count=3, join_count=2) +@sql_count_checker(query_count=2, join_count=2) def test_series_where_with_np_array_cond(): data = [1, 2] cond = np.array([True, False]) @@ -220,9 +220,7 @@ def test_series_where_with_scalar_cond(cond): native_ser = native_pd.Series([1, 2, 3]) snow_ser = pd.Series(native_ser) - sql_count = 1 if isinstance(cond, list) else 0 - - with SqlCounter(query_count=sql_count): + with SqlCounter(query_count=0): eval_snowpark_pandas_result( snow_ser, native_ser, diff --git 
a/tests/integ/modin/test_from_pandas_to_pandas.py b/tests/integ/modin/test_from_pandas_to_pandas.py index 575a70d1fa..be8c36a02d 100644 --- a/tests/integ/modin/test_from_pandas_to_pandas.py +++ b/tests/integ/modin/test_from_pandas_to_pandas.py @@ -538,7 +538,7 @@ def test_series_to_pandas(): assert_series_equal(snow_series.to_pandas(), pandas_series) -@sql_count_checker(query_count=2, union_count=1) +@sql_count_checker(query_count=1, union_count=1) def test_single_row_frame_to_series_to_pandas(): # create a Snowpark pandas with single row native_df = native_pd.DataFrame( diff --git a/tests/integ/modin/test_numpy.py b/tests/integ/modin/test_numpy.py index 7a322d1671..fbcee35450 100644 --- a/tests/integ/modin/test_numpy.py +++ b/tests/integ/modin/test_numpy.py @@ -66,7 +66,7 @@ def test_full_like(): snow_df = pd.DataFrame(data) pandas_df = native_pd.DataFrame(data) - with SqlCounter(query_count=2): + with SqlCounter(query_count=1): snow_result = np.full_like(snow_df, 1234) pandas_result = np.full_like(pandas_df, 1234) assert_array_equal(np.array(snow_result), np.array(pandas_result)) @@ -76,7 +76,7 @@ def test_full_like(): pandas_result = np.full_like(pandas_df, 1234, shape=(5, 3)) assert_array_equal(np.array(snow_result), np.array(pandas_result)) - with SqlCounter(query_count=2): + with SqlCounter(query_count=1): snow_result = np.full_like(snow_df["A"], 1234) pandas_result = np.full_like(pandas_df["A"], 1234) assert_array_equal(np.array(snow_result), np.array(pandas_result)) diff --git a/tests/integ/modin/test_qcut.py b/tests/integ/modin/test_qcut.py index eb67bc0929..5cba7b5a04 100644 --- a/tests/integ/modin/test_qcut.py +++ b/tests/integ/modin/test_qcut.py @@ -44,12 +44,12 @@ def test_qcut_non_series(x, q): (5, 1, 2), (100, 1, 2), (1000, 1, 8), - (5, 10, 3), - (100, 10, 3), - (1000, 10, 12), - (5, 47, 3), - (100, 47, 3), - (1000, 47, 12), + (5, 10, 2), + (100, 10, 2), + (1000, 10, 8), + (5, 47, 2), + (100, 47, 2), + (1000, 47, 8), # TODO: SNOW-1229442 # qcut was 
significantly optimized with SNOW-1368640 and SNOW-1370365, but still # cannot compute 10k q values in a reasonable amount of time. @@ -80,7 +80,7 @@ def test_qcut_series_non_range_data(data, q): native_ans = native_pd.qcut(native_pd.Series(data), q, labels=False) # Large n can not inline everything into a single query and will instead create a temp table. - with SqlCounter(query_count=3): + with SqlCounter(query_count=2): ans = pd.qcut(pd.Series(data), q, labels=False) assert_snowpark_pandas_equals_to_pandas_without_dtypecheck(ans, native_ans) @@ -137,7 +137,7 @@ def test_qcut_series_single_element_negative(q, s): re_match = "Bin edges must be unique: .*" with pytest.raises(ValueError, match=re_match): native_pd.qcut(s, q, labels=False) - with SqlCounter(query_count=2): + with SqlCounter(query_count=1): with pytest.raises(ValueError, match=re_match): pd.qcut(pd.Series(s), q, labels=False) else: @@ -160,12 +160,11 @@ def test_qcut_series_single_element_negative(q, s): ], ) @pytest.mark.parametrize("s", [native_pd.Series([0]), native_pd.Series([1])]) +@sql_count_checker(query_count=2) def test_qcut_series_single_element(q, s): native_ans = native_pd.qcut(s, q, duplicates="drop", labels=False) - - with SqlCounter(query_count=2 if q == 1 else 3): - ans = pd.qcut(pd.Series(s), q, duplicates="drop", labels=False) - assert_snowpark_pandas_equals_to_pandas_without_dtypecheck(ans, native_ans) + ans = pd.qcut(pd.Series(s), q, duplicates="drop", labels=False) + assert_snowpark_pandas_equals_to_pandas_without_dtypecheck(ans, native_ans) @pytest.mark.xfail(reason="TODO: SNOW-1225562 support retbins") diff --git a/tests/integ/modin/test_sql_counter.py b/tests/integ/modin/test_sql_counter.py index 04034d6bb7..fb0480926c 100644 --- a/tests/integ/modin/test_sql_counter.py +++ b/tests/integ/modin/test_sql_counter.py @@ -15,11 +15,14 @@ class CustomException(BaseException): pass +# These tests previously used len(df) to force a query, but newer versions of Snowpark pandas +# cache 
input dimensions when a frame is built from a native object. We directly call to_pandas() +# to force materialization instead. @sql_count_checker(query_count=3) def test_sql_counter_with_decorator(): for _ in range(3): df = pd.DataFrame({"a": [1, 2, 3]}) - assert len(df) == 3 + df.to_pandas() @pytest.mark.parametrize("test_arg", [1, 2]) @@ -27,7 +30,7 @@ def test_sql_counter_with_decorator(): def test_sql_counter_with_decorator_with_parametrize(test_arg): for _ in range(3): df = pd.DataFrame({"a": [1, 2, 3]}) - assert len(df) == 3 + df.to_pandas() @pytest.mark.parametrize( @@ -47,7 +50,7 @@ def test_sql_counter_with_fixture(num_queries, check_sql_counter, sql_counter): df = pd.DataFrame({"a": [1, 2, 3]}) if i % 2 == 0: df = df.merge(df) - assert len(df) == 3 + df.to_pandas() if check_sql_counter: sql_counter.expects(query_count=num_queries, join_count=(num_queries + 1) / 2) @@ -60,7 +63,7 @@ def test_sql_counter_with_fixture_with_repeat_checks_inside_loop( for _ in range(i): df = pd.DataFrame({"a": [1, 2, 3]}) df = df.merge(df) - assert len(df) == 3 + df.to_pandas() sql_counter.expects(query_count=i, join_count=i) @@ -69,7 +72,7 @@ def test_sql_counter_with_context_manager_inside_loop(): for _ in range(3): with SqlCounter(query_count=1) as sc: df = pd.DataFrame({"a": [1, 2, 3]}) - assert len(df) == 3 + df.to_pandas() with pytest.raises( AssertionError, match="SqlCounter is dead and can no longer be used." 
@@ -79,32 +82,32 @@ def test_sql_counter_with_context_manager_inside_loop(): @sql_count_checker(no_check=True) def test_sql_counter_with_multiple_checks(session): - expected_describe_count = 0 + expected_describe_count = 1 if not session.reduce_describe_query_enabled and session.sql_simplifier_enabled: expected_describe_count = 3 with SqlCounter(query_count=1, describe_count=expected_describe_count): df = pd.DataFrame({"a": [1, 2, 3]}) - assert len(df) == 3 + df.to_pandas() with SqlCounter(query_count=1, describe_count=expected_describe_count): df = pd.DataFrame({"b": [4, 5, 6]}) - assert len(df) == 3 + df.to_pandas() with SqlCounter(query_count=1, describe_count=expected_describe_count): df = pd.DataFrame({"c": [7, 8, 9]}) - assert len(df) == 3 + df.to_pandas() @sql_count_checker(no_check=True) def test_sql_counter_with_context_manager_outside_loop(session): - expected_describe_count = 0 + expected_describe_count = 3 if not session.reduce_describe_query_enabled and session.sql_simplifier_enabled: expected_describe_count = 9 sc = SqlCounter(query_count=3, describe_count=expected_describe_count) sc.__enter__() for _ in range(3): df = pd.DataFrame({"a": [1, 2, 3]}) - assert len(df) == 3 + df.to_pandas() sc.__exit__(None, None, None) @@ -128,7 +131,7 @@ def test_sql_counter_with_series_udf_count(): def test_high_sql_count_pass(): for i in range(11): df = pd.DataFrame({"a": list(range(i))}) - assert len(df) == i + df.to_pandas() def test_sql_count_with_joins(): @@ -224,7 +227,7 @@ def test_sql_count_instances_by_query_substr(): def test_high_sql_count_fail(): for i in range(11): df = pd.DataFrame({"a": list(range(i))}) - assert len(df) == i + df.to_pandas() @pytest.mark.xfail( @@ -235,7 +238,7 @@ def test_high_sql_count_fail(): def test_high_sql_count_expect_high_count_no_reason(): for i in range(11): df = pd.DataFrame({"a": list(range(i))}) - assert len(df) == i + df.to_pandas() class TestSqlCounterNotRequiredOrCheckedForStrictXfailedTest: diff --git 
a/tests/integ/modin/test_telemetry.py b/tests/integ/modin/test_telemetry.py index b734d2e6d6..4d89b950fd 100644 --- a/tests/integ/modin/test_telemetry.py +++ b/tests/integ/modin/test_telemetry.py @@ -331,7 +331,7 @@ def test_property_methods_telemetry(): assert api_call["name"] == "Series." -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=0) def test_telemetry_with_update_inplace(): # verify api_calls have been collected correctly for APIs using _update_inplace() in base.py df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) diff --git a/tests/integ/modin/tools/test_to_datetime.py b/tests/integ/modin/tools/test_to_datetime.py index 6b3200e4d6..ab07c626dc 100644 --- a/tests/integ/modin/tools/test_to_datetime.py +++ b/tests/integ/modin/tools/test_to_datetime.py @@ -589,7 +589,7 @@ def test_to_datetime_dtarr(self, tz): check_dtype=False, ) - @sql_count_checker(query_count=1) + @sql_count_checker(query_count=0) def test_to_datetime_pydatetime(self): actual = to_datetime(pd.Index([datetime(2008, 1, 15)])) assert actual == np.datetime64(datetime(2008, 1, 15)) diff --git a/tests/integ/modin/tools/test_to_numeric.py b/tests/integ/modin/tools/test_to_numeric.py index a0093b8daf..50d5ba9b2e 100644 --- a/tests/integ/modin/tools/test_to_numeric.py +++ b/tests/integ/modin/tools/test_to_numeric.py @@ -108,7 +108,7 @@ def test_series_to_numeric(input, dtype, expected_dtype): (True, "bool"), ], ) -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=1) def test_scalar_to_numeric(input, dtype): snow = pd.to_numeric(input) assert snow.dtype == dtype @@ -119,7 +119,7 @@ def test_scalar_to_numeric(input, dtype): assert snow == native -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=1) def test_scalar_timedelta_to_numeric(): # Test this case separately because of a bug in pandas: https://github.com/pandas-dev/pandas/issues/59944 input = native_pd.Timedelta(1) @@ -128,7 +128,7 @@ def test_scalar_timedelta_to_numeric(): assert 
pd.to_numeric(input) == 1 -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=1) def test_downcast_ignored(downcast, caplog): caplog.clear() with caplog.at_level(logging.DEBUG): @@ -139,7 +139,7 @@ def test_downcast_ignored(downcast, caplog): assert "downcast is ignored in Snowflake backend" not in caplog.text -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=1) def test_nan_to_numeric(): # snowpark pandas can handle "nan" correctly but native pandas does not input = "nan" @@ -155,7 +155,7 @@ def large_val(request): return request.param -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=1) def test_really_large_scalar(large_val): snow = pd.to_numeric(large_val) native = native_pd.to_numeric(large_val)