|
41 | 41 | )
|
42 | 42 | from torch.distributed.device_mesh import init_device_mesh
|
43 | 43 | from torch.distributed.fsdp.fully_sharded_data_parallel import FullStateDictConfig
|
| 44 | +from torch.distributed.tensor.parallel import parallelize_module |
| 45 | +from torch.distributed.tensor.parallel.style import ParallelStyle |
44 | 46 | from torchtnt.utils.device_mesh import GlobalMeshCoordinator
|
45 | 47 | from torchtnt.utils.precision import convert_precision_str_to_dtype
|
46 | 48 |
|
@@ -229,6 +231,20 @@ class FSDP2Strategy(Strategy):
|
229 | 231 | cpu_offload: bool = False
|
230 | 232 |
|
231 | 233 |
|
| 234 | +@dataclass |
| 235 | +class TPStrategy(Strategy): |
| 236 | + """ |
| 237 | + Dataclass representing the Tensor Parallelism strategy. Specify ``fsdp2_strategy`` to additionally apply FSDP2 sharding for a 2D parallelism setup. |
| 238 | +
|
| 239 | + Args: |
| 240 | + tp_plan: The plan used to parallelize the module. See https://pytorch.org/docs/stable/distributed.tensor.parallel.html#torch.distributed.tensor.parallel.parallelize_module for details. |
| 241 | + fsdp2_strategy (optional): FSDP2 strategy used to configure the data parallel dimension of the 2D parallelism setup |
| 242 | + """ |
| 243 | + |
| 244 | + tp_plan: Union[ParallelStyle, Dict[str, ParallelStyle]] |
| 245 | + fsdp2_strategy: Optional[FSDP2Strategy] = None |
| 246 | + |
| 247 | + |
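As a point of reference, here is a minimal, hedged sketch of how a `TPStrategy` could be constructed. The submodule names (`w1`, `w2`) and the use of a default `FSDP2Strategy()` are illustrative assumptions; `ColwiseParallel` and `RowwiseParallel` come from `torch.distributed.tensor.parallel`.

```python
from torch.distributed.tensor.parallel import ColwiseParallel, RowwiseParallel

# Map submodule names (FQNs) to parallel styles: shard the first linear
# column-wise and the second row-wise so the intermediate activation stays
# sharded across the TP group. "w1" / "w2" are hypothetical module names.
tp_plan = {
    "w1": ColwiseParallel(),
    "w2": RowwiseParallel(),
}

# TP only:
tp_strategy = TPStrategy(tp_plan=tp_plan)

# 2D parallelism: TP composed with FSDP2 sharding (assumes FSDP2Strategy's
# defaults are acceptable for the job).
tp_2d_strategy = TPStrategy(tp_plan=tp_plan, fsdp2_strategy=FSDP2Strategy())
```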
232 | 248 | @dataclass
|
233 | 249 | class TorchCompileParams:
|
234 | 250 | """
|
@@ -609,7 +625,55 @@ def prepare_module(
|
609 | 625 | global_mesh: Optional[GlobalMeshCoordinator] = None,
|
610 | 626 | ) -> torch.nn.Module:
|
611 | 627 | """
|
612 |
| - Utility to move a module to device, set up parallelism, activation checkpointing and compile. |
| 628 | + Utility to move a module to device, set up parallelism (None, DDP, FSDP, HSDP, TP), activation checkpointing, and compile. |
| 629 | + This function acts as a dispatcher, choosing between the 1D and 2D parallelism setups depending on the strategy used. |
| 630 | +
|
| 631 | + Args: |
| 632 | + module: module to be used. |
| 633 | + device: device to which module will be moved. |
| 634 | + strategy: the data parallelization strategy to be used. If a string, must be one of ``ddp``, ``fsdp``, or ``noop``. |
| 635 | + torch_compile_params: params for Torch compile https://pytorch.org/docs/stable/generated/torch.compile.html. |
| 636 | + activation_checkpoint_params: params for enabling activation checkpointing. |
| 637 | + enable_compiled_autograd: if True, `compiled_autograd` will be used to compile the backward, this is an experimental flag. |
| 638 | + global_mesh: an instance of :class:`~torchtnt.utils.device_mesh.GlobalMeshCoordinator` which defines the global mesh topology. |
| 639 | + """ |
| 640 | + if isinstance(strategy, TPStrategy): |
| 641 | + if global_mesh is None: |
| 642 | + raise ValueError( |
| 643 | + "TPStrategy expects global_mesh (GlobalMeshCoordinator) to be defined. Got None." |
| 644 | + ) |
| 645 | + return _prepare_module_2d( |
| 646 | + module, |
| 647 | + device, |
| 648 | + strategy=strategy, |
| 649 | + global_mesh=global_mesh, |
| 650 | + torch_compile_params=torch_compile_params, |
| 651 | + activation_checkpoint_params=activation_checkpoint_params, |
| 652 | + ) |
| 653 | + |
| 654 | + return _prepare_module_1d( |
| 655 | + module, |
| 656 | + device, |
| 657 | + strategy=strategy, |
| 658 | + torch_compile_params=torch_compile_params, |
| 659 | + activation_checkpoint_params=activation_checkpoint_params, |
| 660 | + enable_compiled_autograd=enable_compiled_autograd, |
| 661 | + global_mesh=global_mesh, |
| 662 | + ) |
| 663 | + |
| 664 | + |
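For illustration, a hedged sketch of invoking the dispatcher with a `TPStrategy`. `MyModel` and `build_global_mesh()` are hypothetical placeholders (the `GlobalMeshCoordinator` construction is not shown in this diff); only the `prepare_module` call shape is taken from the code above.

```python
import torch
from torch.distributed.tensor.parallel import ColwiseParallel, RowwiseParallel

device = torch.device("cuda", torch.cuda.current_device())
module = MyModel()                 # placeholder: a user-defined nn.Module
global_mesh = build_global_mesh()  # placeholder: returns a GlobalMeshCoordinator

module = prepare_module(
    module,
    device,
    strategy=TPStrategy(
        tp_plan={"w1": ColwiseParallel(), "w2": RowwiseParallel()},
        fsdp2_strategy=FSDP2Strategy(),  # omit for TP-only sharding
    ),
    # required whenever a TPStrategy is passed, otherwise a ValueError is raised
    global_mesh=global_mesh,
)
```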
| 665 | +def _prepare_module_1d( |
| 666 | + module: torch.nn.Module, |
| 667 | + device: torch.device, |
| 668 | + *, |
| 669 | + strategy: Optional[Union[Strategy, str]] = None, |
| 670 | + torch_compile_params: Optional[TorchCompileParams] = None, |
| 671 | + activation_checkpoint_params: Optional[ActivationCheckpointParams] = None, |
| 672 | + enable_compiled_autograd: bool = False, |
| 673 | + global_mesh: Optional[GlobalMeshCoordinator] = None, |
| 674 | +) -> torch.nn.Module: |
| 675 | + """ |
| 676 | + Utility to move a module to device, set up 1D parallelism (None, DDP, FSDP), activation checkpointing and compile. |
613 | 677 |
|
614 | 678 | Args:
|
615 | 679 | module: module to be used.
|
@@ -675,6 +739,51 @@ def prepare_module(
|
675 | 739 | return module
|
676 | 740 |
|
677 | 741 |
|
| 742 | +def _prepare_module_2d( |
| 743 | + module: torch.nn.Module, |
| 744 | + device: torch.device, |
| 745 | + *, |
| 746 | + strategy: TPStrategy, |
| 747 | + global_mesh: GlobalMeshCoordinator, |
| 748 | + torch_compile_params: Optional[TorchCompileParams] = None, |
| 749 | + activation_checkpoint_params: Optional[ActivationCheckpointParams] = None, |
| 750 | +) -> torch.nn.Module: |
| 751 | + """ |
| 752 | + Utility to move a module to device, set up 2D parallelism (FSDP / TP / HSDP), activation checkpointing and compile. |
| 753 | +
|
| 754 | + The order of composition is TP -> AC -> compile -> FSDP2. |
| 755 | +
|
| 756 | + Args: |
| 757 | + module: module to be used. |
| 758 | + device: device to which module will be moved. |
| 759 | + strategy: the TP parallelization strategy to be used. |
| 760 | + global_mesh: an instance of :class:`~torchtnt.utils.device_mesh.GlobalMeshCoordinator` which defines the global mesh topology. |
| 761 | + torch_compile_params: params for Torch compile https://pytorch.org/docs/stable/generated/torch.compile.html. |
| 762 | + activation_checkpoint_params: params for enabling activation checkpointing. |
| 763 | + """ |
| 764 | + |
| 765 | + # 1) apply TP |
| 766 | + parallelize_module(module, global_mesh.tp_mesh, parallelize_plan=strategy.tp_plan) |
| 767 | + |
| 768 | + # 2) apply AC if specified |
| 769 | + if activation_checkpoint_params: |
| 770 | + apply_ac(module, activation_checkpoint_params) |
| 771 | + |
| 772 | + # 3) apply torch.compile if specified |
| 773 | + if torch_compile_params: |
| 774 | + apply_torch_compile(module, torch_compile_params) |
| 775 | + |
| 776 | + # 4) apply data parallel / HSDP sharding (via FSDP2 APIs) if specified in TPStrategy |
| 777 | + if (fsdp2_strategy := strategy.fsdp2_strategy) is not None: |
| 778 | + prepare_fsdp2(module, device, fsdp2_strategy, global_mesh) |
| 779 | + else: |
| 780 | + # prepare_fsdp2 will handle materializing meta weights |
| 781 | + # so if fsdp2_strategy isn't used, we do it manually here |
| 782 | + materialize_meta_params(module, device) |
| 783 | + |
| 784 | + return module |
| 785 | + |
| 786 | + |
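For readers less familiar with the underlying PyTorch calls, below is a minimal sketch of the same TP -> AC -> compile -> FSDP2 ordering expressed with raw torch APIs instead of the torchtnt helpers. The mesh shape and model are illustrative assumptions; run under torchrun with `world_size == dp * tp`.

```python
import torch.nn as nn
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.fsdp import fully_shard  # torch.distributed._composable.fsdp on older releases
from torch.distributed.tensor.parallel import (
    ColwiseParallel,
    RowwiseParallel,
    parallelize_module,
)

# 2D mesh: outer "dp" dim for FSDP2 sharding, inner "tp" dim for tensor parallel.
mesh = init_device_mesh("cuda", (2, 4), mesh_dim_names=("dp", "tp"))

model = nn.Sequential(nn.Linear(1024, 4096), nn.ReLU(), nn.Linear(4096, 1024)).cuda()

# 1) TP first, on the "tp" sub-mesh (mirrors global_mesh.tp_mesh above).
parallelize_module(
    model,
    mesh["tp"],
    parallelize_plan={"0": ColwiseParallel(), "2": RowwiseParallel()},
)

# 2) / 3) activation checkpointing and torch.compile would be applied here,
# between TP and FSDP2, matching the order in _prepare_module_2d.

# 4) FSDP2 sharding last, on the "dp" sub-mesh.
fully_shard(model, mesh=mesh["dp"])
```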
678 | 787 | def convert_str_to_strategy(
|
679 | 788 | strategy: str,
|
680 | 789 | ) -> Union[DDPStrategy, FSDPStrategy, FSDP2Strategy, NOOPStrategy]:
|
|