FIX-#7638: Suppress default to pandas warnings on native pandas backend (#7639)

sfc-gh-joshi · web-flow · commit fd2fb4414a42 · 2025-08-07T11:46:38.000-07:00
IO and general module functions now all share a code path that checks
whether the active backend's query compiler should warn on default to
pandas. Methods that default to pandas directly in the base.py frontend
layer (rather than at the query compiler level) also now use this code
path.

Signed-off-by: Jonathan Shi <jonathan.shi@snowflake.com>
diff --git a/modin/core/storage_formats/base/query_compiler.py b/modin/core/storage_formats/base/query_compiler.py
@@ -206,7 +206,8 @@ class BaseQueryCompiler(
     _shape_hint: Optional[str]
     _should_warn_on_default_to_pandas: bool = True
 
-    def _maybe_warn_on_default(self, *, message: str = "", reason: str = "") -> None:
+    @classmethod
+    def _maybe_warn_on_default(cls, *, message: str = "", reason: str = "") -> None:
         """
         If this class is configured to warn on default to pandas, warn.
 
@@ -217,7 +218,7 @@ def _maybe_warn_on_default(self, *, message: str = "", reason: str = "") -> None
         reason : str, default: ""
             Reason for default.
         """
-        if self._should_warn_on_default_to_pandas:
+        if cls._should_warn_on_default_to_pandas:
             ErrorMessage.default_to_pandas(message=message, reason=reason)
 
     @disable_logging
diff --git a/modin/pandas/base.py b/modin/pandas/base.py
@@ -561,8 +561,8 @@ def _default_to_pandas(self, op, *args, reason: str = None, **kwargs):
             Result of operation.
         """
         empty_self_str = "" if not self.empty else " for empty DataFrame"
-        ErrorMessage.default_to_pandas(
-            "`{}.{}`{}".format(
+        self._query_compiler._maybe_warn_on_default(
+            message="`{}.{}`{}".format(
                 type(self).__name__,
                 op if isinstance(op, str) else op.__name__,
                 empty_self_str,
diff --git a/modin/pandas/general.py b/modin/pandas/general.py
@@ -28,10 +28,9 @@
 from modin.core.storage_formats.pandas.query_compiler_caster import (
     wrap_free_function_in_argument_caster,
 )
-from modin.error_message import ErrorMessage
 from modin.logging import enable_logging
 from modin.pandas.io import to_pandas
-from modin.utils import _inherit_docstrings
+from modin.utils import _inherit_docstrings, _maybe_warn_on_default
 
 from .base import BasePandasDataset
 from .dataframe import DataFrame
@@ -193,7 +192,7 @@ def merge_asof(
         raise ValueError(
             "can not merge DataFrame with instance of type {}".format(type(right))
         )
-    ErrorMessage.default_to_pandas("`merge_asof`")
+    left._query_compiler._maybe_warn_on_default(message="`merge_asof`")
 
     # As of Pandas 1.2 these should raise an error; before that it did
     # something likely random:
@@ -345,7 +344,7 @@ def cut(
     if isinstance(x, DataFrame):
         raise ValueError("Input array must be 1 dimensional")
     if not isinstance(x, Series):
-        ErrorMessage.default_to_pandas(
+        _maybe_warn_on_default(
             reason=f"pd.cut is not supported on objects of type {type(x)}"
         )
         import pandas
@@ -656,7 +655,7 @@ def get_dummies(
             + "github.com/modin-project/modin."
         )
     if not isinstance(data, DataFrame):
-        ErrorMessage.default_to_pandas("`get_dummies` on non-DataFrame")
+        _maybe_warn_on_default("`get_dummies` on non-DataFrame")
         if isinstance(data, Series):
             data = data._to_pandas()
         return DataFrame(
@@ -726,7 +725,7 @@ def crosstab(
     """
     Compute a simple cross tabulation of two (or more) factors.
     """
-    ErrorMessage.default_to_pandas("`crosstab`")
+    _maybe_warn_on_default("`crosstab`")
     pandas_crosstab = pandas.crosstab(
         index,
         columns,
@@ -769,7 +768,7 @@ def lreshape(data: DataFrame, groups, dropna=True) -> DataFrame:
     """
     if not isinstance(data, DataFrame):
         raise ValueError("can not lreshape with instance of type {}".format(type(data)))
-    ErrorMessage.default_to_pandas("`lreshape`")
+    data._query_compiler._maybe_warn_on_default(message="`lreshape`")
     return DataFrame(pandas.lreshape(to_pandas(data), groups, dropna=dropna))
 
 
diff --git a/modin/pandas/io.py b/modin/pandas/io.py
@@ -71,13 +71,13 @@
 from modin.core.storage_formats.pandas.query_compiler_caster import (
     wrap_free_function_in_argument_caster,
 )
-from modin.error_message import ErrorMessage
 from modin.logging import ClassLogger, enable_logging
 from modin.utils import (
     SupportsPrivateToNumPy,
     SupportsPublicToNumPy,
     SupportsPublicToPandas,
     _inherit_docstrings,
+    _maybe_warn_on_default,
     classproperty,
     expanduser_path_arg,
 )
@@ -156,7 +156,7 @@ def read_xml(
     storage_options: StorageOptions = None,
     dtype_backend: Union[DtypeBackend, NoDefault] = no_default,
 ) -> DataFrame:
-    ErrorMessage.default_to_pandas("read_xml")
+    _maybe_warn_on_default("read_xml")
     _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())
     return ModinObjects.DataFrame(pandas.read_xml(**kwargs))
 
@@ -658,7 +658,7 @@ def read_sql(
     from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher
 
     if kwargs.get("chunksize") is not None:
-        ErrorMessage.default_to_pandas("Parameters provided [chunksize]")
+        _maybe_warn_on_default("Parameters provided [chunksize]")
         df_gen = pandas.read_sql(**kwargs)
         return (
             ModinObjects.DataFrame(query_compiler=FactoryDispatcher.from_pandas(df))
@@ -818,7 +818,7 @@ def json_normalize(
     """
     Normalize semi-structured JSON data into a flat table.
     """
-    ErrorMessage.default_to_pandas("json_normalize")
+    _maybe_warn_on_default("json_normalize")
     return ModinObjects.DataFrame(
         pandas.json_normalize(
             data, record_path, meta, meta_prefix, record_prefix, errors, sep, max_level
@@ -840,7 +840,7 @@ def read_orc(
     """
     Load an ORC object from the file path, returning a DataFrame.
     """
-    ErrorMessage.default_to_pandas("read_orc")
+    _maybe_warn_on_default("read_orc")
     return ModinObjects.DataFrame(
         pandas.read_orc(
             path,
@@ -886,7 +886,7 @@ def return_handler(*args, **kwargs):
                     # We don't want to constantly be giving this error message for
                     # internal methods.
                     if item[0] != "_":
-                        ErrorMessage.default_to_pandas("`{}`".format(item))
+                        _maybe_warn_on_default("`{}`".format(item))
                     args = [
                         (
                             to_pandas(arg)
@@ -952,7 +952,7 @@ def return_handler(*args, **kwargs):
                     # We don't want to constantly be giving this error message for
                     # internal methods.
                     if item[0] != "_":
-                        ErrorMessage.default_to_pandas("`{}`".format(item))
+                        _maybe_warn_on_default("`{}`".format(item))
                     args = [
                         (
                             to_pandas(arg)
diff --git a/modin/tests/experimental/test_io_exp.py b/modin/tests/experimental/test_io_exp.py
@@ -29,7 +29,7 @@
     time_parsing_csv_path,
 )
 from modin.tests.test_utils import (
-    warns_that_defaulting_to_pandas,
+    current_execution_is_native,
     warns_that_defaulting_to_pandas_if,
 )
 from modin.utils import try_cast_to_pandas
@@ -129,7 +129,7 @@ def test_read_csv_empty_frame(self):
 
     def test_read_csv_without_glob(self):
         with pytest.raises(FileNotFoundError):
-            with warns_that_defaulting_to_pandas():
+            with warns_that_defaulting_to_pandas_if(not current_execution_is_native()):
                 pd.read_csv_glob(
                     "s3://dask-data/nyc-taxi/2015/yellow_tripdata_2015-",
                     storage_options={"anon": True},
diff --git a/modin/tests/interchange/dataframe_protocol/pandas/test_protocol.py b/modin/tests/interchange/dataframe_protocol/pandas/test_protocol.py
@@ -20,7 +20,6 @@
 from modin.tests.pandas.utils import df_equals, test_data
 from modin.tests.test_utils import (
     df_or_series_using_native_execution,
-    warns_that_defaulting_to_pandas,
     warns_that_defaulting_to_pandas_if,
 )
 
@@ -66,7 +65,9 @@ def test_categorical_from_dataframe():
 
 def test_from_dataframe_with_empty_dataframe():
     modin_df = pd.DataFrame({"foo_col": pd.Series([], dtype="int64")})
-    with warns_that_defaulting_to_pandas():
+    with warns_that_defaulting_to_pandas_if(
+        not df_or_series_using_native_execution(modin_df)
+    ):
         eval_df_protocol(modin_df)
 
 
diff --git a/modin/tests/pandas/dataframe/test_iter.py b/modin/tests/pandas/dataframe/test_iter.py
@@ -35,7 +35,6 @@
 )
 from modin.tests.test_utils import (
     current_execution_is_native,
-    warns_that_defaulting_to_pandas,
     warns_that_defaulting_to_pandas_if,
 )
 
@@ -147,7 +146,7 @@ def test_display_options_for___repr__(max_rows_columns, expand_frame_repr, frame
 def test___finalize__():
     data = test_data_values[0]
     # NOTE: __finalize__() defaults to pandas at the API layer.
-    with warns_that_defaulting_to_pandas():
+    with warns_that_defaulting_to_pandas_if(not current_execution_is_native()):
         pd.DataFrame(data).__finalize__(None)
 
 
diff --git a/modin/tests/pandas/dataframe/test_udf.py b/modin/tests/pandas/dataframe/test_udf.py
@@ -43,7 +43,10 @@
     udf_func_keys,
     udf_func_values,
 )
-from modin.tests.test_utils import warns_that_defaulting_to_pandas
+from modin.tests.test_utils import (
+    current_execution_is_native,
+    warns_that_defaulting_to_pandas_if,
+)
 from modin.utils import get_current_execution
 
 NPartitions.put(4)
@@ -126,10 +129,10 @@ def test_aggregate_alias():
 def test_aggregate_error_checking():
     modin_df = pd.DataFrame(test_data["float_nan_data"])
 
-    with warns_that_defaulting_to_pandas():
+    with warns_that_defaulting_to_pandas_if(not current_execution_is_native()):
         modin_df.aggregate({modin_df.columns[0]: "sum", modin_df.columns[1]: "mean"})
 
-    with warns_that_defaulting_to_pandas():
+    with warns_that_defaulting_to_pandas_if(not current_execution_is_native()):
         modin_df.aggregate("arcsin")
 
 
diff --git a/modin/tests/pandas/extensions/test_groupby_extensions.py b/modin/tests/pandas/extensions/test_groupby_extensions.py
@@ -24,7 +24,10 @@
 )
 from modin.pandas.groupby import DataFrameGroupBy, SeriesGroupBy
 from modin.tests.pandas.utils import default_to_pandas_ignore_string, df_equals
-from modin.tests.test_utils import warns_that_defaulting_to_pandas
+from modin.tests.test_utils import (
+    current_execution_is_native,
+    warns_that_defaulting_to_pandas_if,
+)
 
 
 @pytest.mark.parametrize(
@@ -150,10 +153,7 @@ def ngroups(self):
         # Check that the accessor doesn't work on the Python_Test backend.
         python_test_df = pandas_df.move_to("Python_Test")
         groupby = get_groupby(python_test_df)
-        # groupby.ngroups defaults to pandas at the API layer,
-        # where it warns that it's doing so, even for dataframes using the
-        # Pandas backend.
-        with warns_that_defaulting_to_pandas():
+        with warns_that_defaulting_to_pandas_if(not current_execution_is_native()):
             assert groupby.ngroups == 3
 
     def test_add_ngroups_setter_and_deleter_for_one_backend(
@@ -179,7 +179,7 @@ def _set_ngroups(self, value):
 
         python_test_groupby = get_groupby(python_test_df)
 
-        with warns_that_defaulting_to_pandas():
+        with warns_that_defaulting_to_pandas_if(not current_execution_is_native()):
             assert python_test_groupby.ngroups == 3
 
         with pytest.raises(AttributeError):
diff --git a/modin/tests/pandas/native_df_interoperability/test_default_to_pandas_without_warnings.py b/modin/tests/pandas/native_df_interoperability/test_default_to_pandas_without_warnings.py
@@ -0,0 +1,78 @@
+# Licensed to Modin Development Team under one or more contributor license agreements.
+# See the NOTICE file distributed with this work for additional information regarding
+# copyright ownership.  The Modin Development Team licenses this file to you under the
+# Apache License, Version 2.0 (the "License"); you may not use this file except in
+# compliance with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under
+# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+
+# While other modin backends raise a warning when defaulting to pandas, it does not make sense to
+# do so when we're running on the native pandas backend already. These tests ensure such warnings
+# are not raised with the pandas backend.
+
+import numpy as np
+import pandas
+import pytest
+
+import modin.pandas as pd
+from modin.config import Backend
+from modin.tests.pandas.utils import df_equals
+
+pytestmark = [
+    pytest.mark.skipif(
+        Backend.get() != "Pandas",
+        reason="warnings only suppressed on native pandas backend",
+        allow_module_level=True,
+    ),
+    # Error if a default to pandas warning is detected.
+    pytest.mark.filterwarnings("error:is not supported by NativeOnNative:UserWarning"),
+]
+
+
+def test_crosstab_no_warning():
+    # Example from pandas docs
+    # https://pandas.pydata.org/docs/reference/api/pandas.crosstab.html
+    a = np.array(
+        ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "foo", "foo", "foo"],
+        dtype=object,
+    )
+    b = np.array(
+        ["one", "one", "one", "two", "one", "one", "one", "two", "two", "two", "one"],
+        dtype=object,
+    )
+    c = np.array(
+        [
+            "dull",
+            "dull",
+            "shiny",
+            "dull",
+            "dull",
+            "shiny",
+            "shiny",
+            "dull",
+            "shiny",
+            "shiny",
+            "shiny",
+        ],
+        dtype=object,
+    )
+    df_equals(
+        pd.crosstab(a, [b, c], rownames=["a"], colnames=["b", "c"]),
+        pandas.crosstab(a, [b, c], rownames=["a"], colnames=["b", "c"]),
+    )
+
+
+def test_json_normalize_no_warning():
+    # Example from pandas docs
+    # https://pandas.pydata.org/docs/reference/api/pandas.json_normalize.html
+    data = [
+        {"id": 1, "name": {"first": "Coleen", "last": "Volk"}},
+        {"name": {"given": "Mark", "family": "Regner"}},
+        {"id": 2, "name": "Faye Raker"},
+    ]
+    df_equals(pd.json_normalize(data), pandas.json_normalize(data))
diff --git a/modin/tests/pandas/test_general.py b/modin/tests/pandas/test_general.py
diff --git a/modin/tests/pandas/test_reshape.py b/modin/tests/pandas/test_reshape.py
diff --git a/modin/tests/test_utils.py b/modin/tests/test_utils.py
diff --git a/modin/utils.py b/modin/utils.py

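Rendered table view of the modin/tests/pandas/dataframe/test_iter.py hunks shown earlier (not the modin/utils.py change):

Original file line number	Diff line number	Diff line change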
`@@ -35,7 +35,6 @@`
`35`	`35`	`)`
`36`	`36`	`from modin.tests.test_utils import (`
`37`	`37`	`current_execution_is_native,`
`38`		`- warns_that_defaulting_to_pandas,`
`39`	`38`	`warns_that_defaulting_to_pandas_if,`
`40`	`39`	`)`
`41`	`40`
`@@ -147,7 +146,7 @@ def test_display_options_for___repr__(max_rows_columns, expand_frame_repr, frame`
`147`	`146`	`def test___finalize__():`
`148`	`147`	`data = test_data_values[0]`
`149`	`148`	`# NOTE: __finalize__() defaults to pandas at the API layer.`
`150`		`- with warns_that_defaulting_to_pandas():`
	`149`	`+ with warns_that_defaulting_to_pandas_if(not current_execution_is_native()):`
`151`	`150`	`pd.DataFrame(data).__finalize__(None)`
`152`	`151`
`153`	`152`