Refactor metrics to be easier to process. Switch to using a random hex string for the metrics group instead of a global counter variable.

sfc-gh-jkew · sfc-gh-jkew · commit 692ff4d8ccc4 · 2025-05-07T12:39:29.000-07:00
diff --git a/modin/core/storage_formats/base/query_compiler_calculator.py b/modin/core/storage_formats/base/query_compiler_calculator.py
@@ -20,6 +20,7 @@
 """
 
 import logging
+import random
 from types import MappingProxyType
 from typing import Any, Optional
 
@@ -48,10 +49,6 @@ def __init__(self, backend: str, query_compiler: BaseQueryCompiler):
         self.max_cost = query_compiler.max_cost()
 
 
-# Global Variable Used to track groups of metrics
-hybrid_metrics_calc_group = 0
-
-
 class BackendCostCalculator:
     """
     Calculate which Backend should be used for an operation.
@@ -143,7 +140,6 @@ def calculate(self) -> str:
 
         min_value = None
         for k, v in self._backend_data.items():
-            emit_metric(f"hybrid.cast.to.{k}.cost.{hybrid_metrics_calc_group}", v.cost)
             if v.cost > v.max_cost:
                 continue
             if min_value is None or min_value > v.cost:
@@ -154,10 +150,26 @@ def calculate(self) -> str:
             logging.info(
                 f"BackendCostCalculator Results: {self._calc_result_log(self._result_backend)}"
             )
-            DECIDED_TO_SWITCH = 1
+            # Does not need to be secure, should not use system entropy
+            metrics_group = "%04x" % random.randrange(16**4)
+            for qc in self._qc_list:
+                max_shape = qc._max_shape()
+                backend = qc.get_backend()
+                emit_metric(
+                    f"hybrid.merge.candidate.{backend}.group.{metrics_group}.rows",
+                    max_shape[0],
+                )
+                emit_metric(
+                    f"hybrid.merge.candidate.{backend}.group.{metrics_group}.cols",
+                    max_shape[1],
+                )
+            for k, v in self._backend_data.items():
+                emit_metric(
+                    f"hybrid.merge.candidate.{k}.group.{metrics_group}.cost", v.cost
+                )
             emit_metric(
-                f"hybrid.cast.decision.{self._result_backend}.{hybrid_metrics_calc_group}",
-                DECIDED_TO_SWITCH,
+                f"hybrid.merge.decision.{self._result_backend}.group.{metrics_group}",
+                1,
             )
 
         if self._result_backend is None:
diff --git a/modin/core/storage_formats/pandas/query_compiler_caster.py b/modin/core/storage_formats/pandas/query_compiler_caster.py
@@ -24,6 +24,7 @@
 import logging
 from abc import ABC, abstractmethod
 from collections import defaultdict, namedtuple
+import random
 from types import FunctionType, MappingProxyType, MethodType
 from typing import Any, Callable, Dict, Optional, Tuple, TypeVar, Union, ValuesView
 
@@ -542,10 +543,6 @@ def _maybe_switch_backend_post_op(
     return result
 
 
-# Global Variable Used to track groups of metrics
-hybrid_metrics_group = 0
-
-
 def _get_backend_for_auto_switch(
     input_qc: BaseQueryCompiler,
     class_of_wrapped_fn: str,
@@ -583,6 +580,8 @@ def _get_backend_for_auto_switch(
     # backend.
     from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher
 
+    # Does not need to be secure, should not use system entropy
+    metrics_group = "%04x" % random.randrange(16**4)
     starting_backend = input_qc.get_backend()
 
     min_move_stay_delta = None
@@ -593,6 +592,23 @@ def _get_backend_for_auto_switch(
         operation=function_name,
         arguments=arguments,
     )
+    data_max_shape = input_qc._max_shape()
+    emit_metric(
+        f"hybrid.auto.api.{class_of_wrapped_fn}.{function_name}.group.{metrics_group}",
+        1,
+    )
+    emit_metric(
+        f"hybrid.auto.current.{starting_backend}.group.{metrics_group}.stay_cost",
+        stay_cost,
+    )
+    emit_metric(
+        f"hybrid.auto.current.{starting_backend}.group.{metrics_group}.rows",
+        data_max_shape[0],
+    )
+    emit_metric(
+        f"hybrid.auto.current.{starting_backend}.group.{metrics_group}.cols",
+        data_max_shape[1],
+    )
     for backend in Backend.get_active_backends():
         if backend in ("Ray", "Unidist", "Dask"):
             # Disable automatically switching to these engines for now, because
@@ -637,53 +653,32 @@ def _get_backend_for_auto_switch(
             ):
                 min_move_stay_delta = move_stay_delta
                 best_backend = backend
-            global hybrid_metrics_group
             emit_metric(
-                f"hybrid.auto.from.{starting_backend}.to.{backend}.move_to_cost.{hybrid_metrics_group}",
+                f"hybrid.auto.candidate.{backend}.group.{metrics_group}.move_to_cost",
                 move_to_cost,
             )
             emit_metric(
-                f"hybrid.auto.from.{starting_backend}.to.{backend}.stay_cost.{hybrid_metrics_group}",
-                stay_cost,
-            )
-            emit_metric(
-                f"hybrid.auto.from.{starting_backend}.to.{backend}.other_execute_cost.{hybrid_metrics_group}",
+                f"hybrid.auto.candidate.{backend}.group.{metrics_group}.other_execute_cost",
                 other_execute_cost,
             )
             emit_metric(
-                f"hybrid.auto.from.{starting_backend}.to.{backend}.delta.{hybrid_metrics_group}",
+                f"hybrid.auto.candidate.{backend}.group.{metrics_group}.delta",
                 move_stay_delta,
             )
-            SINGLE_EVENT = 1
-            DECIDED_TO_SWITCH = 1
-            DECIDED_NOT_TO_SWITCH = 0
-            emit_metric(
-                f"hybrid.auto.from.{starting_backend}.to.{backend}.decision.{best_backend}.{hybrid_metrics_group}",
-                (
-                    DECIDED_TO_SWITCH
-                    if starting_backend != backend
-                    else DECIDED_NOT_TO_SWITCH
-                ),
-            )
-            emit_metric(
-                f"hybrid.auto.from.{starting_backend}.to.{backend}.api_cls_name.{class_of_wrapped_fn}.{hybrid_metrics_group}",
-                SINGLE_EVENT,
-            )
-            emit_metric(
-                f"hybrid.auto.from.{starting_backend}.to.{backend}.function_name.{function_name}.{hybrid_metrics_group}",
-                SINGLE_EVENT,
-            )
-            hybrid_metrics_group += 1
+
             logging.info(
                 f"After {class_of_wrapped_fn} function {function_name}, "
                 + f"considered moving to backend {backend} with "
                 + f"(transfer_cost {move_to_cost} + other_execution_cost {other_execute_cost}) "
                 + f", stay_cost {stay_cost}, and move-stay delta "
                 + f"{move_stay_delta}"
             )
+
     if best_backend == starting_backend:
+        emit_metric(f"hybrid.auto.decision.{best_backend}.group.{metrics_group}", 0)
         logging.info(f"Chose not to switch backends after operation {function_name}")
     else:
+        emit_metric(f"hybrid.auto.decision.{best_backend}.group.{metrics_group}", 1)
         logging.info(f"Chose to move to backend {best_backend}")
     return best_backend
 
diff --git a/modin/tests/pandas/native_df_interoperability/test_compiler_caster.py b/modin/tests/pandas/native_df_interoperability/test_compiler_caster.py
@@ -1262,24 +1262,17 @@ def test_concat_with_pin(pin_backends, expected_backend):
 
 def test_cast_metrics(pico_df, cluster_df):
     try:
-        errors = 0
+        count = 0
 
         def test_handler(metric: str, value) -> None:
-            nonlocal errors
-            if metric.startswith("modin.hybrid.cast"):
-                tokens = metric.split(".")
-                if tokens[4] == "Pico" and value == 750:
-                    return
-                if tokens[4] == "Cluster" and value == 250:
-                    return
-                if tokens[3] == "decision" and tokens[4] == "Cluster" and value == 1:
-                    return
-                errors += 1
+            nonlocal count
+            if metric.startswith("modin.hybrid.merge"):
+                count += 1
 
         add_metric_handler(test_handler)
         df3 = pd.concat([pico_df, cluster_df], axis=1)
         assert df3.get_backend() == "Cluster"  # result should be on cluster
-        assert errors == 0
+        assert count == 7
     finally:
         clear_metric_handler(test_handler)
 
@@ -1290,43 +1283,13 @@ def test_switch_metrics(pico_df, cluster_df):
         choices=("Big_Data_Cloud", "Small_Data_Local"),
     ):
         try:
-            errors = 0
+            count = 0
 
             def test_handler(metric: str, value) -> None:
-                nonlocal errors
+                nonlocal count
                 if metric.startswith("modin.hybrid.auto"):
-                    tokens = metric.split(".")
                     assert "from.Big_Data_Cloud.to.Small_Data_Local" in metric
-                    if (
-                        tokens[7] == "stay_cost"
-                        and value == QCCoercionCost.COST_IMPOSSIBLE
-                    ):
-                        return
-                    if tokens[7] == "other_execute_cost" and value == 1000:
-                        return
-                    if tokens[7] == "move_to_cost" and value == 0:
-                        return
-                    if tokens[7] == "delta" and value == 0:
-                        return
-                    if (
-                        tokens[7] == "decision"
-                        and tokens[8] == "Big_Data_Cloud"
-                        and value == 1
-                    ):
-                        return
-                    if (
-                        tokens[7] == "api_cls_name"
-                        and tokens[8] == "DataFrame"
-                        and value == 1
-                    ):
-                        return
-                    if (
-                        tokens[7] == "function_name"
-                        and tokens[8] == "describe"
-                        and value == 1
-                    ):
-                        return
-                    errors += 1
+                    count += 1
 
             add_metric_handler(test_handler)
 
@@ -1338,7 +1301,7 @@ def test_handler(metric: str, value) -> None:
             df = pd.DataFrame([1] * 10)
             assert df.get_backend() == "Big_Data_Cloud"
             df.describe()
-            assert errors == 0
+            assert count == 9
         finally:
             clear_metric_handler(test_handler)