# Copyright The Lightning AI team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from contextlib import AbstractContextManager
from typing import Any

import torch
from lightning_utilities import apply_to_collection
from torch import Tensor
from torch.nn import Module
from typing_extensions import get_args, override

from lightning.fabric.plugins.precision.fsdp import _PRECISION_INPUT
from lightning.fabric.plugins.precision.utils import _convert_fp_tensor, _DtypeContextManager
from lightning.pytorch.plugins.precision.precision import Precision
from lightning.pytorch.utilities.exceptions import MisconfigurationException


class FSDP2Precision(Precision):
    """Precision plugin for training with FSDP2 (Fully Sharded Data Parallel v2).

    .. warning:: This is an :ref:`experimental <versioning:Experimental API>` feature.

    Args:
        precision: Full precision (32-true) or true half precision (16-true, bf16-true). Mixed precision
            (16-mixed, bf16-mixed) is not supported here and must be configured through FSDP2's
            mixed-precision policy (``mp_policy``) instead.
        scaler: Must be ``None``. A :class:`torch.distributed.fsdp.sharded_grad_scaler.ShardedGradScaler`
            is not supported by this plugin; configure gradient scaling through the mixed-precision policy.

    Raises:
        ValueError:
            If an unsupported ``precision`` is provided, a ``scaler`` is passed, or a mixed-precision mode
            is requested.

    """
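
    # A minimal usage sketch. The `strategy="fsdp2"` name below is an assumption used for illustration only
    # (it is not defined in this file); any FSDP2-based strategy that accepts a precision plugin plays the
    # same role:
    #
    #     from lightning.pytorch import Trainer
    #
    #     precision = FSDP2Precision("bf16-true")  # parameters, inputs and compute in bfloat16
    #     trainer = Trainer(accelerator="gpu", devices=2, strategy="fsdp2", plugins=[precision])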

    def __init__(self, precision: _PRECISION_INPUT, scaler: Any = None) -> None:
        supported_precision = get_args(_PRECISION_INPUT)
        if precision not in supported_precision:
            raise ValueError(
                f"`precision={precision!r}` is not supported in FSDP."
                f" `precision` must be one of: {supported_precision}."
            )

        if scaler is not None:
            raise ValueError(
                f"`scaler` is not supported in `{self.__class__.__name__}`, found {scaler}."
                " Use the `mixed-precision policy` instead to configure the scaler."
            )

        if "mixed" in precision:
            raise ValueError(
                f"`precision={precision!r}` is not supported in `{self.__class__.__name__}`."
                " Only `true` precision is supported."
                " Use the `mixed-precision policy (mp_policy)` instead to configure mixed precision."
            )

        self.precision = precision

        precision_to_type = {
            "bf16-true": torch.bfloat16,
            "16-true": torch.float16,
            "32-true": torch.float32,
        }
        self._desired_input_dtype = precision_to_type[self.precision]
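
        # Illustration of the validation above (the exact set of allowed strings comes from `_PRECISION_INPUT`;
        # `SomeScaler` is a hypothetical placeholder):
        #
        #     FSDP2Precision("bf16-true")                     -> parameters/inputs handled in torch.bfloat16
        #     FSDP2Precision("16-mixed")                      -> ValueError, use FSDP2's `mp_policy` instead
        #     FSDP2Precision("32-true", scaler=SomeScaler())  -> ValueError, a scaler is not supported here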

    @override
    def convert_module(self, module: Module) -> Module:
        # For `*-true` settings this is a one-time full cast of the module's parameters and buffers to the
        # target dtype (unlike autocast-style mixed precision, the model then lives entirely in that dtype).
        if "true" in self.precision:
            return module.to(dtype=self._desired_input_dtype)
        return module

    @override
    def clip_grad_by_norm(self, *_: Any, **__: Any) -> None:
        # See https://pytorch.org/docs/stable/fsdp.html#torch.distributed.fsdp.FullyShardedDataParallel.clip_grad_norm_,
        # section `Gradient Clipping`: using `torch.nn.utils.clip_grad_norm_` is incorrect with FSDP.
        # To overcome this we would need to call `root_sharded_module.clip_grad_norm(clip_val)`, but we don't
        # have a reference to the root module here.
        raise MisconfigurationException(
            f"`gradient_clip_algorithm='norm'` is currently not supported for `{self.__class__.__name__}`"
        )
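        # Note: only norm-based clipping is rejected here. `gradient_clip_algorithm="value"` is expected to
        # fall through to the base `Precision.clip_grad_by_value` implementation, which clips gradients
        # element-wise.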

    @override
    def tensor_init_context(self) -> AbstractContextManager:
        return _DtypeContextManager(self._desired_input_dtype)
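    # `_DtypeContextManager` (used by `tensor_init_context`, `module_init_context` and `forward_context`) is a
    # thin wrapper that swaps `torch.get_default_dtype()` for the duration of the block and restores it on exit.
    # A rough usage sketch (variable names are illustrative only):
    #
    #     with precision_plugin.tensor_init_context():
    #         weight = torch.empty(4096, 4096)  # allocated directly in the configured dtype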

    @override
    def module_init_context(self) -> AbstractContextManager:
        # Initialize module parameters directly in the configured precision (same dtype context as above).
        return _DtypeContextManager(self._desired_input_dtype)

    @override
    def forward_context(self) -> AbstractContextManager:
        return _DtypeContextManager(self._desired_input_dtype)

    @override
    def convert_input(self, data: Any) -> Any:
        return apply_to_collection(data, function=_convert_fp_tensor, dtype=Tensor, dst_type=self._desired_input_dtype)
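    # `apply_to_collection` walks arbitrarily nested containers and applies `_convert_fp_tensor` to every
    # tensor it finds; only floating-point tensors are cast. For example (illustrative batch):
    #
    #     {"pixels": float32 tensor, "labels": int64 tensor} -> {"pixels": <desired dtype>, "labels": int64}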

    @override
    def convert_output(self, data: Any) -> Any:
        return apply_to_collection(data, function=_convert_fp_tensor, dtype=Tensor, dst_type=torch.get_default_dtype())
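        # The inverse of `convert_input`: floating-point outputs are cast back to the process-wide default
        # dtype (typically `torch.float32`), so downstream consumers do not need to know which precision the
        # forward pass ran in.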