
Commit cb31137

JKSenthil authored and facebook-github-bot committed
add global_mesh support in AutoUnit (#1003)
Summary: Pull Request resolved: #1003

Adds a `global_mesh` arg in AutoUnit and forwards it into `prepare_module` for model sharding.

Reviewed By: vdogaru

Differential Revision: D74410711

fbshipit-source-id: fb7caedef706c9d8f7876f14d6d31e1d4aaa7151
1 parent 70abcd1 commit cb31137
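
For context, the sketch below shows how the new argument surfaces to users of AutoUnit. It is illustrative only: the AutoUnit abstract methods (`compute_loss`, `configure_optimizers_and_lr_scheduler`), the `DDPStrategy` import path, and the pre-built `GlobalMeshCoordinator` instance are assumptions not shown in this commit.

# Hedged sketch: passing the new `global_mesh` kwarg from a user-defined AutoUnit.
# The GlobalMeshCoordinator construction is elided because its constructor is not
# part of this diff; treat `global_mesh` as an already-built coordinator.
from typing import Any, Optional, Tuple

import torch
from torchtnt.framework.auto_unit import AutoUnit
from torchtnt.framework.state import State
from torchtnt.utils.device_mesh import GlobalMeshCoordinator
from torchtnt.utils.lr_scheduler import TLRScheduler
from torchtnt.utils.prepare_module import DDPStrategy  # assumed import path

Batch = Tuple[torch.Tensor, torch.Tensor]


class MyAutoUnit(AutoUnit[Batch]):
    def compute_loss(self, state: State, data: Batch) -> Tuple[torch.Tensor, Any]:
        inputs, targets = data
        outputs = self.module(inputs)
        return torch.nn.functional.mse_loss(outputs, targets), outputs

    def configure_optimizers_and_lr_scheduler(
        self, module: torch.nn.Module
    ) -> Tuple[torch.optim.Optimizer, Optional[TLRScheduler]]:
        return torch.optim.SGD(module.parameters(), lr=0.01), None


global_mesh: GlobalMeshCoordinator = ...  # assumed: built elsewhere for your topology

unit = MyAutoUnit(
    module=torch.nn.Linear(1, 1),
    strategy=DDPStrategy(),
    global_mesh=global_mesh,  # new kwarg; forwarded to prepare_module
)

Per the docstring added in this commit, the coordinator is intended for TP or 2D parallelism strategies; DDPStrategy is used above only to keep the sketch minimal (it is also what the new unit test uses).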

File tree

tests/framework/test_auto_unit.py
torchtnt/framework/auto_unit.py

2 files changed: +37 -1 lines changed

tests/framework/test_auto_unit.py

Lines changed: 30 additions & 1 deletion
@@ -9,7 +9,7 @@
 
 import unittest
 from typing import Any, Literal, Optional, Tuple, TypeVar
-from unittest.mock import MagicMock, patch
+from unittest.mock import MagicMock, Mock, patch
 
 import torch
 
@@ -37,6 +37,7 @@
 from torchtnt.framework.train import train
 from torchtnt.framework.unit import TPredictData
 from torchtnt.utils.device import copy_data_to_device
+from torchtnt.utils.device_mesh import GlobalMeshCoordinator
 from torchtnt.utils.distributed import spawn_multi_process
 from torchtnt.utils.env import init_from_env
 from torchtnt.utils.lr_scheduler import TLRScheduler
@@ -780,6 +781,34 @@ def test_gradient_accumulation_fsdp2(self, _) -> None:
 
         auto_unit.train_progress.increment_step()
 
+    @patch("torchtnt.framework.auto_unit.prepare_module")
+    def test_global_mesh(self, mock_prepare_module: Mock) -> None:
+        """
+        Test that the global mesh is forwarded correctly in the AutoUnit.
+        """
+        module = torch.nn.Linear(1, 1)
+        device = torch.device("cpu")
+        strategy = DDPStrategy()
+        mock_global_mesh = MagicMock(spec=GlobalMeshCoordinator)
+        mock_prepare_module.return_value = module
+
+        DummyAutoUnit(
+            module=module,
+            device=device,
+            strategy=strategy,
+            global_mesh=mock_global_mesh,
+        )
+
+        mock_prepare_module.assert_called_once_with(
+            module,
+            device,
+            strategy=strategy,
+            torch_compile_params=None,
+            activation_checkpoint_params=None,
+            enable_compiled_autograd=False,
+            global_mesh=mock_global_mesh,
+        )
+
 
 Batch = Tuple[torch.Tensor, torch.Tensor]
 
torchtnt/framework/auto_unit.py

Lines changed: 7 additions & 0 deletions
@@ -35,6 +35,7 @@
 from torchtnt.framework.unit import EvalUnit, PredictUnit, TPredictData, TrainUnit
 from torchtnt.framework.utils import get_timing_context
 from torchtnt.utils.device import copy_data_to_device
+from torchtnt.utils.device_mesh import GlobalMeshCoordinator
 from torchtnt.utils.env import init_from_env
 from torchtnt.utils.lr_scheduler import TLRScheduler
 from torchtnt.utils.precision import (
@@ -326,6 +327,7 @@ def __init__(
         torch_compile_params: Optional[TorchCompileParams] = None,
         detect_anomaly: Optional[bool] = None,
         enable_prefetch: bool = False,
+        global_mesh: Optional[GlobalMeshCoordinator] = None,
     ) -> None:
        """
        AutoPredictUnit is a convenience for users who are running inference and would like to have certain features handled for them, such as:
@@ -348,6 +350,7 @@
            strategy: the data parallelization strategy to be used. if a string, must be one of ``ddp`` or ``fsdp``.
            torch_compile_params: params for Torch compile https://pytorch.org/docs/stable/generated/torch.compile.html
            detect_anomaly: whether to enable anomaly detection for the autograd engine https://pytorch.org/docs/stable/autograd.html#anomaly-detection
+           global_mesh: an instance of :class:`~torchtnt.utils.device_mesh.GlobalMeshCoordinator` which defines the global mesh topology. Needed to configure TP or 2D parallelism strategies.
 
        Note:
            Torch compile support is only available in PyTorch 2.0 or higher.
@@ -365,6 +368,7 @@ def __init__(
            self.device,
            strategy=strategy,
            torch_compile_params=torch_compile_params,
+           global_mesh=global_mesh,
        )
 
    # pyre-fixme[3]: Return annotation cannot be `Any`.
@@ -474,6 +478,7 @@ class AutoUnit(
            in a much more efficient way.
        enable_prefetch: if True, the data will be prefetched to the device before the next batch is loaded
        zero_grad_at_train_step_start: if True, the optimizer's gradients will be zeroed at the start of each train step, rather than at the end. Useful if you want to inspect/log the gradients via custom callback.
+       global_mesh: an instance of :class:`~torchtnt.utils.device_mesh.GlobalMeshCoordinator` which defines the global mesh topology. Needed to configure TP or 2D parallelism strategies.
 
    Note:
        Certain strategies, like :class:`~torchtnt.utils.prepare_module.FSDPStrategy` also support mixed precision as an argument, so can be configured through that class as well.
@@ -510,6 +515,7 @@ def __init__(
        loss_backward_retain_graph: Optional[bool] = None,
        enable_prefetch: bool = True,
        zero_grad_at_train_step_start: bool = False,
+       global_mesh: Optional[GlobalMeshCoordinator] = None,
    ) -> None:
        super().__init__(
            module=module,
@@ -554,6 +560,7 @@ def __init__(
            torch_compile_params=torch_compile_params,
            activation_checkpoint_params=activation_checkpoint_params,
            enable_compiled_autograd=enable_compiled_autograd,
+           global_mesh=global_mesh,
        )
 
        self.grad_scaler: Optional[GradScaler] = None
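
The kwargs asserted in `test_global_mesh` mirror the call the AutoUnit constructor now makes. A minimal sketch of invoking `prepare_module` directly with the same arguments follows; the `torchtnt.utils.prepare_module` import path and the pre-built coordinator are assumptions, not part of this diff.

# Hedged sketch: calling prepare_module directly with the new kwarg, mirroring the
# arguments asserted in test_global_mesh. Coordinator construction is elided.
import torch
from torchtnt.utils.device_mesh import GlobalMeshCoordinator
from torchtnt.utils.prepare_module import DDPStrategy, prepare_module

global_mesh: GlobalMeshCoordinator = ...  # assumed: built elsewhere for your topology

prepared = prepare_module(
    torch.nn.Linear(1, 1),
    torch.device("cpu"),
    strategy=DDPStrategy(),
    torch_compile_params=None,
    activation_checkpoint_params=None,
    enable_compiled_autograd=False,
    global_mesh=global_mesh,  # forwarded from the AutoUnit constructor in this commit
)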
