Revert D72410003: Multisect successfully blamed "D72410003: [TorchRec] Update critical path definition" for one test failure

generatedunixname89002005232357 · facebook-github-bot · commit 9855733c0572 · 2025-04-11T12:08:03.000-07:00
Summary: This diff reverts D72410003. D72410003 ("[TorchRec] Update critical path definition", by micrain) causes the following test failure: Tests affected: - [cogwheel:cogwheel_nex_task_and_notebook_creation_test#test_task_creation](https://www.internalfb.com/intern/test/562950141228121/) Here's the Multisect link: https://www.internalfb.com/multisect/25924686 Here are the tasks that are relevant to this breakage: T191381105: 100+ tests, one build rule, one sandcastle job, one CI signal unhealthy for model_understanding_iroc. The backout may land if someone accepts it. If this diff has been generated in error, you can Commandeer and Abandon it. Reviewed By: micrain Differential Revision: D72864493 fbshipit-source-id: 67b7a69641a36c803531296301fa1a66718fca68
diff --git a/torchrec/distributed/planner/stats.py b/torchrec/distributed/planner/stats.py
@@ -25,7 +25,6 @@
     Union,
 )
 
-import pandas as pd
 from torch import nn
 
 from torchrec.distributed.embedding_types import EmbeddingComputeKernel
@@ -37,7 +36,6 @@
     InferenceStorageReservation,
 )
 from torchrec.distributed.planner.types import (
-    CriticalPathEstimate,
     ParameterConstraints,
     Perf,
     ShardingOption,
@@ -321,7 +319,7 @@ def log(
                 )
 
                 # Max perf and HBM to help root cause imbalance
-                self._log_max_perf_and_max_hbm(perf, used_hbm, best_plan)
+                self._log_max_perf_and_max_hbm(perf, used_hbm)
             self._log_storage_reservation_stats(
                 storage_reservation,
                 topology,
@@ -447,9 +445,7 @@ def _log_plan_imbalance_stats(
                 f"# {'Imbalance stats range 0-1, higher means more imbalanced' : <{self._width-3}}#"
             )
 
-    def _log_max_perf_and_max_hbm(
-        self, perfs: List[Perf], used_hbm: List[int], best_plan: List[ShardingOption]
-    ) -> None:
+    def _log_max_perf_and_max_hbm(self, perfs: List[Perf], used_hbm: List[int]) -> None:
         total_perfs = [perf.total for perf in perfs]
 
         max_total_perf_text = f"Longest Critical Path (Maximum of Total Perf): {_generate_max_text(total_perfs)}"
@@ -484,8 +480,6 @@ def _log_max_perf_and_max_hbm(
         )
         sum_of_maxima_text = f"Sum of Maxima: {round(sum_of_maxima, 3)} ms"
 
-        critical_path_estimate = _calculate_critical_path(best_plan)
-
         self._stats_table.append(f"#{'' : ^{self._width-2}}#")
         self._stats_table.append(f"# {max_total_perf_text : <{self._width-3}}#")
         self._stats_table.append(f"# {mean_total_perf_text : <{self._width-3}}#")
@@ -518,15 +512,6 @@ def _log_max_perf_and_max_hbm(
         self._stats_table.append(
             f"# {'High Median HBM: '+_generate_rank_hbm_stats(used_hbm, statistics.median_high) : <{self._width-3}}#"
         )
-        self._stats_table.append(
-            f"# {'Critical Path (comms): '+str(round(critical_path_estimate.comms_estimate, 3)) : <{self._width-3}}#"
-        )
-        self._stats_table.append(
-            f"# {'Critical Path (compute): '+str(round(critical_path_estimate.comp_estimate, 3)) : <{self._width-3}}#"
-        )
-        self._stats_table.append(
-            f"# {'Critical Path (comms + compute): '+str(round(critical_path_estimate.comp_estimate, 3)) : <{self._width-3}}#"
-        )
 
         max_used_hbm = max(used_hbm)
         mean_used_hbm = statistics.mean(used_hbm)
@@ -1067,76 +1052,6 @@ def _reduce_int_list(input_list: List[int]) -> str:
     return ", ".join(reduced)
 
 
-def _calculate_critical_path(best_plan: List[ShardingOption]) -> CriticalPathEstimate:
-    """
-    Calculates the critical path of the sharding plan. Makes the following assumptions:
-
-        1. There is a synchronization point across the ranks after each of the 4 events: Fwd/Bwd x Comms/Comp.
-        2. There are additional synchronization points during communication (both fwd & bwd) for each module <> sharding type combination.
-            i. Communication operations for each shard from the same module <> sharding type group are executed sequentially.
-            ii. Ranks need to synchronize before they can begin the communication operation for the next module <> sharding type group.
-        3. There are additional synchronization points during computation (both fwd & bwd) at the rank level.
-            i. Computation operations for each shard from the same module are executed sequentially.
-            ii. Ranks need to synchronize before they can begin the next set of events.
-    """
-
-    perf_data = defaultdict(float)
-    for so in best_plan:
-        module = so.module
-        sharding_type = so.sharding_type
-        ranks = sorted([cast(int, shard.rank) for shard in so.shards])
-        shard_perfs = [cast(Perf, shard.perf) for shard in so.shards]
-        perf_breakdowns = [
-            {
-                "fwd_compute": perf.fwd_compute,
-                "fwd_comms": perf.fwd_comms,
-                "bwd_compute": perf.bwd_compute,
-                "bwd_comms": perf.bwd_comms,
-                "prefetch_compute": perf.prefetch_compute,
-            }
-            for perf in shard_perfs
-        ]
-
-        for rank, perf_breakdown in zip(ranks, perf_breakdowns):
-            for perf_type in perf_breakdown:
-                perf_data[
-                    (
-                        rank,
-                        module,
-                        sharding_type,
-                        perf_type.split("_")[0],  # fwd or bwd
-                        perf_type.split("_")[1],  # compute or comms
-                    )
-                ] += perf_breakdown[perf_type]
-    perf_df = pd.DataFrame.from_dict(perf_data, orient="index", columns=["perf"])
-    perf_df.index = pd.MultiIndex.from_tuples(
-        perf_df.index,
-        names=["rank", "module", "sharding_type", "direction", "perf_type"],
-    )
-
-    comms_estimate = (
-        perf_df.xs("comms", level="perf_type")
-        .groupby(["rank", "module", "sharding_type", "direction"])
-        .sum()
-        .groupby(["module", "sharding_type", "direction"])
-        .max()
-        .sum()
-        .item()
-    )
-
-    comp_estimate = (
-        perf_df.xs("compute", level="perf_type")
-        .groupby(["rank", "direction"])
-        .sum()
-        .groupby(["direction"])
-        .max()
-        .sum()
-        .item()
-    )
-
-    return CriticalPathEstimate(comms_estimate, comp_estimate)
-
-
 class NoopEmbeddingStats(Stats):
     """
     Noop Stats for a sharding planner execution.
diff --git a/torchrec/distributed/planner/types.py b/torchrec/distributed/planner/types.py
@@ -843,12 +843,3 @@ def log(
         See class description
         """
         ...
-
-
-@dataclass
-class CriticalPathEstimate:
-    comms_estimate: float
-    comp_estimate: float
-
-    def total(self) -> float:
-        return self.comms_estimate + self.comp_estimate