     set_optimizer_state_dict,
 )
 from torch.distributed.device_mesh import init_device_mesh
+from torchtnt.utils.precision import convert_precision_str_to_dtype
 
 try:
     from torch.distributed.fsdp import (
@@ -218,7 +219,7 @@ class FSDP2Strategy(Strategy):
         Iterable[Union[str, Type[torch.nn.Module]]],
     ] = "all"
     reshard_after_forward: Union[bool, int] = True
-    mp_policy: Optional[Union[torch.dtype, MixedPrecisionPolicy]] = None
+    mp_policy: Optional[Union[str, torch.dtype, MixedPrecisionPolicy]] = None
     cpu_offload: bool = False
 
 
@@ -375,13 +376,20 @@ def prepare_fsdp2(
         fsdp_kwargs["offload_policy"] = CPUOffloadPolicy()
     if (mp_policy := strategy.mp_policy) is not None:
         if isinstance(mp_policy, MixedPrecisionPolicy):
+            mp_policy = _check_and_convert_mp_policy_dtypes(mp_policy)
             fsdp_kwargs["mp_policy"] = mp_policy
-        else:
+        elif isinstance(mp_policy, str):
+            dtype = convert_precision_str_to_dtype(mp_policy)
+            fsdp_kwargs["mp_policy"] = MixedPrecisionPolicy(
+                param_dtype=dtype,
+                reduce_dtype=dtype,
+                output_dtype=dtype,
+            )
+        elif isinstance(mp_policy, torch.dtype):
             fsdp_kwargs["mp_policy"] = MixedPrecisionPolicy(
                 param_dtype=mp_policy,
                 reduce_dtype=mp_policy,
                 output_dtype=mp_policy,
-                cast_forward_inputs=True,
             )
 
     # parse out the modules_to_shard argument
@@ -636,3 +644,39 @@ def materialize_meta_params(module: torch.nn.Module, device: torch.device) -> No
         if on_meta_device(submodule):
             rank_zero_info(f"{name} is on meta device, intializing on device {device}")
             submodule.to_empty(device=device, recurse=False)
+
+
+def _check_and_convert_mp_policy_dtypes(
+    mp_policy: MixedPrecisionPolicy,
+) -> MixedPrecisionPolicy:
+    """
+    Converts precision strings to torch.dtype and validates that all dtypes are of type torch.dtype.
+    Returns a new MixedPrecisionPolicy, since its attributes are frozen (new values cannot be assigned to its fields).
+    """
+
+    dtypes = (mp_policy.param_dtype, mp_policy.reduce_dtype, mp_policy.output_dtype)
+    dtypes = filter(None, dtypes)
+    for dtype in dtypes:
+        if not isinstance(dtype, (str, torch.dtype)):
+            raise ValueError(
+                f"MixedPrecisionPolicy requires all dtypes to be torch.dtype or string. Got dtype={dtype} with type {type(dtype)}"
+            )
+
+    param_dtype = mp_policy.param_dtype
+    reduce_dtype = mp_policy.reduce_dtype
+    output_dtype = mp_policy.output_dtype
+    if isinstance(mp_policy.param_dtype, str):
+        param_dtype = convert_precision_str_to_dtype(mp_policy.param_dtype)
+    if isinstance(mp_policy.reduce_dtype, str):
+        reduce_dtype = convert_precision_str_to_dtype(mp_policy.reduce_dtype)
+    if isinstance(mp_policy.output_dtype, str):
+        output_dtype = convert_precision_str_to_dtype(mp_policy.output_dtype)
+
+    new_mp_policy = MixedPrecisionPolicy(
+        param_dtype=param_dtype,
+        reduce_dtype=reduce_dtype,
+        output_dtype=output_dtype,
+        cast_forward_inputs=mp_policy.cast_forward_inputs,
+    )
+
+    return new_mp_policy
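
Not part of the diff: a minimal usage sketch of the widened mp_policy field. It assumes the file being modified is torchtnt/utils/prepare_module.py (so FSDP2Strategy is importable from torchtnt.utils.prepare_module) and that "bf16" is one of the precision strings convert_precision_str_to_dtype accepts; both are assumptions for illustration only.

# Illustrative sketch, not part of the change set. Import paths and the "bf16"
# precision string are assumptions, as noted above.
import torch
from torch.distributed.fsdp import MixedPrecisionPolicy

from torchtnt.utils.prepare_module import FSDP2Strategy

# Plain string: prepare_fsdp2 converts it once and uses the resulting dtype for
# param_dtype, reduce_dtype, and output_dtype.
strategy = FSDP2Strategy(mp_policy="bf16")

# torch.dtype: same expansion, without the string-to-dtype conversion step.
strategy = FSDP2Strategy(mp_policy=torch.bfloat16)

# MixedPrecisionPolicy with string fields: normalized into a new policy holding
# torch.dtypes by _check_and_convert_mp_policy_dtypes before it lands in fsdp_kwargs.
strategy = FSDP2Strategy(
    mp_policy=MixedPrecisionPolicy(param_dtype="bf16", reduce_dtype=torch.float32)
)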