FIX-#7595: Log backend switching information with the modin logger. (#7597)

sfc-gh-mvashishtha · web-flow · commit cc38a16aab45 · 2025-05-30T14:03:15.000-07:00
Resolves #7595 Signed-off-by: sfc-gh-mvashishtha <mahesh.vashishtha@snowflake.com>
diff --git a/modin/conftest.py b/modin/conftest.py
@@ -15,6 +15,7 @@
 # type: ignore
 
 import copy
+import logging
 import os
 import platform
 import shutil
@@ -792,3 +793,17 @@ def clean_up_auto_backend_switching():
         )
     ):
         yield
+
+
+@pytest.fixture(autouse=True)
+def assert_no_root_logging(caplog):
+    root_logger = logging.getLogger()
+    # Capture logs at any level, i.e. at level >= logging.NOTSET.
+    with caplog.at_level(logging.NOTSET):
+        yield
+    # Note that because this code is in a fixture, we have to use
+    # caplog.get_records(when="call") instead of caplog.records:
+    # https://github.com/pytest-dev/pytest/issues/4033
+    assert not any(
+        record.name == root_logger.name for record in caplog.get_records(when="call")
+    )
diff --git a/modin/core/storage_formats/base/query_compiler_calculator.py b/modin/core/storage_formats/base/query_compiler_calculator.py
@@ -19,7 +19,6 @@
 all query compilers to determine the best query compiler to use.
 """
 
-import logging
 import random
 from types import MappingProxyType
 from typing import Any, Optional
@@ -28,6 +27,7 @@
     BaseQueryCompiler,
     QCCoercionCost,
 )
+from modin.logging import get_logger
 from modin.logging.metrics import emit_metric
 
 
@@ -147,7 +147,7 @@ def calculate(self) -> str:
                 self._result_backend = k
 
         if len(self._backend_data) > 1:
-            logging.info(
+            get_logger().info(
                 f"BackendCostCalculator Results: {self._calc_result_log(self._result_backend)}"
             )
             # Does not need to be secure, should not use system entropy
diff --git a/modin/core/storage_formats/pandas/query_compiler_caster.py b/modin/core/storage_formats/pandas/query_compiler_caster.py
@@ -21,7 +21,6 @@
 
 import functools
 import inspect
-import logging
 import random
 from abc import ABC, abstractmethod
 from collections import defaultdict, namedtuple
@@ -42,7 +41,7 @@
     BackendCostCalculator,
 )
 from modin.error_message import ErrorMessage
-from modin.logging import disable_logging
+from modin.logging import disable_logging, get_logger
 from modin.logging.metrics import emit_metric
 from modin.utils import sentinel
 
@@ -812,7 +811,7 @@ def _get_backend_for_auto_switch(
                 move_stay_delta,
             )
 
-            logging.info(
+            get_logger().info(
                 f"After {class_of_wrapped_fn} function {function_name}, "
                 + f"considered moving to backend {backend} with "
                 + f"(transfer_cost {move_to_cost} + other_execution_cost {other_execute_cost}) "
@@ -822,10 +821,12 @@ def _get_backend_for_auto_switch(
 
     if best_backend == starting_backend:
         emit_metric(f"hybrid.auto.decision.{best_backend}.group.{metrics_group}", 0)
-        logging.info(f"Chose not to switch backends after operation {function_name}")
+        get_logger().info(
+            f"Chose not to switch backends after operation {function_name}"
+        )
     else:
         emit_metric(f"hybrid.auto.decision.{best_backend}.group.{metrics_group}", 1)
-        logging.info(f"Chose to move to backend {best_backend}")
+        get_logger().info(f"Chose to move to backend {best_backend}")
     return best_backend
 
 
diff --git a/modin/logging/__init__.py b/modin/logging/__init__.py
@@ -12,7 +12,7 @@
 # governing permissions and limitations under the License.
 
 from .class_logger import ClassLogger  # noqa: F401
-from .config import get_logger  # noqa: F401
+from .config import DEFAULT_LOGGER_NAME, get_logger  # noqa: F401
 from .logger_decorator import disable_logging, enable_logging  # noqa: F401
 from .metrics import add_metric_handler, clear_metric_handler, emit_metric
 
@@ -24,4 +24,5 @@
     "emit_metric",
     "add_metric_handler",
     "clear_metric_handler",
+    "DEFAULT_LOGGER_NAME",
 ]
diff --git a/modin/logging/config.py b/modin/logging/config.py
@@ -34,6 +34,8 @@
 import modin
 from modin.config import LogFileSize, LogMemoryInterval, LogMode
 
+DEFAULT_LOGGER_NAME = "modin.logger.default"
+
 __LOGGER_CONFIGURED__: bool = False
 
 
@@ -166,7 +168,7 @@ def configure_logging() -> None:
     job_id = f"{current_timestamp}_{uuid.uuid4().hex}"
 
     logger = _create_logger(
-        "modin.logger.default",
+        DEFAULT_LOGGER_NAME,
         job_id,
         "trace",
         LogLevel.INFO,
diff --git a/modin/tests/pandas/native_df_interoperability/test_compiler_caster.py b/modin/tests/pandas/native_df_interoperability/test_compiler_caster.py
@@ -13,6 +13,7 @@
 
 import contextlib
 import json
+import logging
 from io import StringIO
 from types import MappingProxyType
 from typing import Iterator
@@ -44,6 +45,7 @@
     register_function_for_post_op_switch,
     register_function_for_pre_op_switch,
 )
+from modin.logging import DEFAULT_LOGGER_NAME
 from modin.logging.metrics import add_metric_handler, clear_metric_handler
 from modin.pandas.api.extensions import register_pd_accessor
 from modin.tests.pandas.utils import create_test_dfs, df_equals, eval_general
@@ -369,12 +371,19 @@ def test_two_same_backend(pico_df):
     assert df3.get_backend() == "Pico"
 
 
-def test_cast_to_second_backend_with_concat(pico_df, cluster_df):
-    df3 = pd.concat([pico_df, cluster_df], axis=1)
+def test_cast_to_second_backend_with_concat(pico_df, cluster_df, caplog):
+    with caplog.at_level(level=logging.INFO, logger=DEFAULT_LOGGER_NAME):
+        df3 = pd.concat([pico_df, cluster_df], axis=1)
     assert pico_df.get_backend() == "Pico"
     assert cluster_df.get_backend() == "Cluster"
     assert df3.get_backend() == "Cluster"  # result should be on cluster
 
+    log_records = caplog.records
+    assert len(log_records) == 1
+    assert log_records[0].name == DEFAULT_LOGGER_NAME
+    assert log_records[0].levelno == logging.INFO
+    assert log_records[0].message.startswith("BackendCostCalculator Results: ")
+
 
 def test_cast_to_second_backend_with_concat_uses_second_backend_api_override(
     pico_df, cluster_df
@@ -693,6 +702,72 @@ def test_read_json(self):
                 pd.read_json(StringIO(small_json)).get_backend() == "Small_Data_Local"
             )
 
+    @backend_test_context(
+        test_backend="Big_Data_Cloud",
+        choices=("Big_Data_Cloud", "Small_Data_Local"),
+    )
+    def test_read_json_logging_for_post_op_switch(self, caplog):
+        register_function_for_post_op_switch(
+            class_name=None, backend="Big_Data_Cloud", method="read_json"
+        )
+        with caplog.at_level(level=logging.INFO, logger=DEFAULT_LOGGER_NAME):
+            assert (
+                pd.read_json(
+                    StringIO(
+                        json.dumps(
+                            {"col0": list(range(BIG_DATA_CLOUD_MIN_NUM_ROWS - 1))}
+                        )
+                    )
+                ).get_backend()
+                == "Small_Data_Local"
+            )
+        log_records = caplog.records
+        assert len(log_records) == 2
+
+        assert log_records[0].name == DEFAULT_LOGGER_NAME
+        assert log_records[0].levelno == logging.INFO
+        assert log_records[0].message.startswith(
+            "After None function read_json, considered moving to backend Small_Data_Local with"
+        )
+
+        assert log_records[1].name == DEFAULT_LOGGER_NAME
+        assert log_records[1].levelno == logging.INFO
+        assert log_records[1].message.startswith(
+            "Chose to move to backend Small_Data_Local"
+        )
+
+    @backend_test_context(
+        test_backend="Big_Data_Cloud",
+        choices=("Big_Data_Cloud", "Small_Data_Local"),
+    )
+    def test_read_json_logging_for_post_op_not_switch(self, caplog):
+        register_function_for_post_op_switch(
+            class_name=None, backend="Big_Data_Cloud", method="read_json"
+        )
+        with caplog.at_level(level=logging.INFO, logger=DEFAULT_LOGGER_NAME):
+            assert (
+                pd.read_json(
+                    StringIO(
+                        json.dumps({"col0": list(range(BIG_DATA_CLOUD_MIN_NUM_ROWS))})
+                    )
+                ).get_backend()
+                == "Big_Data_Cloud"
+            )
+        log_records = caplog.records
+        assert len(log_records) == 2
+
+        assert log_records[0].name == DEFAULT_LOGGER_NAME
+        assert log_records[0].levelno == logging.INFO
+        assert log_records[0].message.startswith(
+            "After None function read_json, considered moving to backend Small_Data_Local with"
+        )
+
+        assert log_records[1].name == DEFAULT_LOGGER_NAME
+        assert log_records[1].levelno == logging.INFO
+        assert log_records[1].message.startswith(
+            "Chose not to switch backends after operation read_json"
+        )
+
     def test_agg(self):
         with backend_test_context(
             test_backend="Big_Data_Cloud",