
Commit d6c986c

Bunch of FSDP improvements (#3671)
* Feat: split tests
* Feat: finito
* Fix
* Final, tests pass
1 parent 1ac8643 commit d6c986c

File tree

5 files changed: +61 -52 lines changed

* src/accelerate/accelerator.py
* src/accelerate/state.py
* src/accelerate/test_utils/testing.py
* src/accelerate/utils/fsdp_utils.py
* tests/fsdp/test_fsdp.py


src/accelerate/accelerator.py

Lines changed: 17 additions & 1 deletion
@@ -125,6 +125,7 @@
     FSDP2_PYTORCH_VERSION,
     FSDP_PYTORCH_VERSION,
     PROFILE_PATTERN_NAME,
+    SCALER_NAME,
 )
 from .utils.modeling import get_state_dict_offloaded_model
 from .utils.other import compile_regions, compile_regions_deepspeed, is_compiled_module
@@ -3521,6 +3522,21 @@ def _inner(folder):
             else:
                 models.append(model)

+        # We need to load the scaler state before the optimizer for FSDP2:
+        # `torch.distributed.checkpoint.set_optimizer_state_dict`, which we use to set the state of the optimizer, calls `optimizer.step` on
+        # a dummy tensor, but since the scaler is not initialized, it will raise an error (the scaler exists but its `_scale` is None)
+        scaler = None
+        if self.scaler is not None and self.is_fsdp2:
+            input_scaler_file = os.path.join(input_dir, SCALER_NAME)
+            scaler_state = torch.load(input_scaler_file)
+            self.scaler.load_state_dict(scaler_state)
+            # We also need to call `_lazy_init_scale_growth_tracker` to initialize the scaler, as it would otherwise
+            # only be called on the first call to `scale`
+            self.scaler._lazy_init_scale_growth_tracker(self.scaler._device)
+            logger.info("GradScaler state loaded successfully")
+        else:
+            scaler = self.scaler
+
         # Load the optimizers taking care of FSDP and DeepSpeed nuances
         optimizers = []
         if self.distributed_type == DistributedType.FSDP:
@@ -3569,7 +3585,7 @@ def _inner(folder):
             schedulers,
             dataloaders,
             self.state.process_index,
-            self.scaler,
+            scaler,
             map_location,
             load_kwargs,
             **load_model_func_kwargs,

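The ordering matters because `set_optimizer_state_dict` initializes optimizer state by running a dummy `optimizer.step()`; when that step is routed through a GradScaler (as it is with Accelerate's wrapped optimizer), the scaler must already be restored and lazily initialized. A rough sketch of that restore order, assuming an initialized process group; the function name and checkpoint filenames are placeholders, not Accelerate's API:

```python
import os

import torch
from torch.distributed.checkpoint.state_dict import set_optimizer_state_dict


def restore_scaler_then_optimizer(scaler, model, optimizer, input_dir):
    # Restore the GradScaler first so its internal `_scale` tensor exists before
    # any optimizer step is triggered.
    scaler_state = torch.load(os.path.join(input_dir, "scaler.pt"))  # placeholder filename
    scaler.load_state_dict(scaler_state)
    # `load_state_dict` alone does not materialize `_scale`; that normally happens
    # lazily on the first `scaler.scale(...)` call, which would be too late here.
    scaler._lazy_init_scale_growth_tracker(scaler._device)

    # Only now hand the optimizer state to the distributed checkpointing API,
    # which internally performs a dummy step to allocate optimizer state.
    optim_state = torch.load(os.path.join(input_dir, "optimizer.bin"))  # placeholder filename
    set_optimizer_state_dict(model, optimizer, optim_state_dict=optim_state)
```
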
src/accelerate/state.py

Lines changed: 9 additions & 6 deletions
@@ -213,12 +213,6 @@ def __init__(self, cpu: bool = False, **kwargs):
             if self.backend == "tccl":
                 local_rank = os.environ.get("LOCAL_RANK", -1)
                 torch.sdaa.set_device(f"sdaa:{local_rank}")
-            if (
-                self.backend == "nccl"
-                and os.environ.get("ACCELERATE_USE_FSDP", "false") == "true"
-                and os.environ.get("FSDP_OFFLOAD_PARAMS", "false") == "true"
-            ):
-                self.backend = "cuda:nccl,cpu:gloo"
             dist.init_distributed(dist_backend=self.backend, auto_mpi_discovery=False, **kwargs)
             # We need to flag to `use_deepspeed` to be True to override `distributed_type` later
             use_deepspeed = True
@@ -230,6 +224,15 @@ def __init__(self, cpu: bool = False, **kwargs):
             if self.backend == "tccl":
                 local_rank = os.environ.get("LOCAL_RANK", -1)
                 torch.sdaa.set_device(f"sdaa:{local_rank}")
+            if (
+                self.backend == "nccl"
+                and os.environ.get("ACCELERATE_USE_FSDP", "false") == "true"
+                and (
+                    os.environ.get("FSDP_OFFLOAD_PARAMS", "false") == "true"
+                    or os.environ.get("FSDP_STATE_DICT_TYPE", "SHARDED_STATE_DICT") == "FULL_STATE_DICT"
+                )
+            ):
+                self.backend = "cuda:nccl,cpu:gloo"
             torch.distributed.init_process_group(backend=self.backend, **kwargs)

         # XPU and CPU require special env configs to be set

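The override is removed from the DeepSpeed branch and re-added on the plain `torch.distributed.init_process_group` path, and it now also fires for FULL_STATE_DICT. The env-var logic can be read as a small standalone helper; this is a sketch, not Accelerate's internal code, and the rationale (parameter offloading and FULL_STATE_DICT gathering both produce CPU tensors, which NCCL alone cannot handle, so gloo is registered for CPU alongside NCCL for CUDA) is an assumption drawn from the condition itself:

```python
import os


def pick_process_group_backend(requested: str = "nccl") -> str:
    """Sketch of the backend override: use a per-device backend map when FSDP needs CPU collectives."""
    fsdp_enabled = os.environ.get("ACCELERATE_USE_FSDP", "false") == "true"
    offload_params = os.environ.get("FSDP_OFFLOAD_PARAMS", "false") == "true"
    full_state_dict = os.environ.get("FSDP_STATE_DICT_TYPE", "SHARDED_STATE_DICT") == "FULL_STATE_DICT"
    if requested == "nccl" and fsdp_enabled and (offload_params or full_state_dict):
        # gloo handles tensors living on CPU, nccl keeps handling CUDA tensors
        return "cuda:nccl,cpu:gloo"
    return requested


# e.g. torch.distributed.init_process_group(backend=pick_process_group_backend(), ...)
```
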
src/accelerate/test_utils/testing.py

Lines changed: 4 additions & 0 deletions
@@ -250,6 +250,10 @@ def require_fp8(test_case):
     return unittest.skipUnless(fp8_is_available, "test requires FP8 support")(test_case)


+def require_fsdp2(test_case):
+    return unittest.skipUnless(is_torch_version(">=", "2.5.0"), "test requires FSDP2 (torch >= 2.5.0)")(test_case)
+
+
 def require_mlu(test_case):
     """
     Decorator marking a test that requires MLU. These tests are skipped when there are no MLU available.

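The new decorator follows the same pattern as the other `require_*` helpers in this file. A hypothetical test using it could look like this (the test class and method names are invented for illustration):

```python
import unittest

from accelerate.test_utils.testing import require_fsdp2


class ExampleFSDP2Tests(unittest.TestCase):
    @require_fsdp2
    def test_fsdp2_only_behavior(self):
        # Runs only when torch >= 2.5.0 is installed; otherwise the test is skipped.
        self.assertTrue(True)
```
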
src/accelerate/utils/fsdp_utils.py

Lines changed: 7 additions & 3 deletions
@@ -179,10 +179,9 @@ def load_fsdp_model(fsdp_plugin, accelerator, model, input_dir, model_index=0, a
         else nullcontext()
     )
     sd_options = _prepare_sd_options(fsdp_plugin)
-
     with ctx:
         if fsdp_plugin.state_dict_type == StateDictType.FULL_STATE_DICT:
-            if type(model) is not FSDP and accelerator.process_index != 0:
+            if type(model) is not FSDP and accelerator.process_index != 0 and not accelerator.is_fsdp2:
                 if not fsdp_plugin.sync_module_states and fsdp_plugin.fsdp_version == 1:
                     raise ValueError(
                         "Set the `sync_module_states` flag to `True` so that model states are synced across processes when "
@@ -192,7 +191,12 @@ def load_fsdp_model(fsdp_plugin, accelerator, model, input_dir, model_index=0, a
             weights_name = f"{FSDP_MODEL_NAME}.bin" if model_index == 0 else f"{FSDP_MODEL_NAME}_{model_index}.bin"
             input_model_file = os.path.join(input_dir, weights_name)
             logger.info(f"Loading model from {input_model_file}")
-            state_dict = torch.load(input_model_file, weights_only=True)
+            # we want an empty state dict for FSDP2 as we use `broadcast_from_rank0`
+            load_model = not accelerator.is_fsdp2 or accelerator.is_main_process
+            if load_model:
+                state_dict = torch.load(input_model_file, weights_only=True)
+            else:
+                state_dict = {}
             logger.info(f"Model loaded from {input_model_file}")
         elif fsdp_plugin.state_dict_type == StateDictType.LOCAL_STATE_DICT:
             weights_name = (

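The `load_model` gate means only the main process reads the full checkpoint from disk, while the other ranks pass an empty dict and receive their shards via a broadcast from rank 0. A rough, self-contained sketch of that pattern using torch's distributed checkpoint API, assuming a torch version that supports `broadcast_from_rank0`; this is not the exact call Accelerate makes, and the function name is a placeholder:

```python
import torch
import torch.distributed as dist
from torch.distributed.checkpoint.state_dict import StateDictOptions, set_model_state_dict


def load_full_state_dict_rank0_broadcast(model, input_model_file):
    # Only rank 0 materializes the full state dict in memory.
    if dist.get_rank() == 0:
        state_dict = torch.load(input_model_file, weights_only=True)
    else:
        state_dict = {}
    # Every rank calls this collectively; rank 0's tensors are broadcast and
    # distributed onto the other ranks' (sharded) parameters.
    set_model_state_dict(
        model,
        state_dict,
        options=StateDictOptions(full_state_dict=True, broadcast_from_rank0=True),
    )
```
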
tests/fsdp/test_fsdp.py

Lines changed: 24 additions & 42 deletions
@@ -29,6 +29,7 @@
     get_launch_command,
     path_in_accelerate_package,
     require_fp16,
+    require_fsdp2,
     require_multi_device,
     require_non_cpu,
     require_non_torch_xla,
@@ -37,7 +38,6 @@
 )
 from accelerate.utils import is_bf16_available, is_fp16_available, is_hpu_available, patch_environment, set_seed
 from accelerate.utils.constants import (
-    FSDP2_PYTORCH_VERSION,
     FSDP2_STATE_DICT_TYPE,
     FSDP_AUTO_WRAP_POLICY,
     FSDP_BACKWARD_PREFETCH,
@@ -46,7 +46,6 @@
 )
 from accelerate.utils.dataclasses import FullyShardedDataParallelPlugin
 from accelerate.utils.fsdp_utils import disable_fsdp_ram_efficient_loading, enable_fsdp_ram_efficient_loading
-from accelerate.utils.versions import is_torch_version


 set_seed(42)
@@ -63,10 +62,6 @@
 if is_bf16_available():
     dtypes.append(BF16)

-FSDP_VERSIONS = [1]
-if is_torch_version(">=", FSDP2_PYTORCH_VERSION):
-    FSDP_VERSIONS.append(2)
-

 @require_non_cpu
 @require_non_torch_xla
@@ -90,24 +85,7 @@ def setUp(self):
             2: self.fsdp2_env,
         }

-    def run(self, result=None):
-        """Override run to get the current test name and format failures to include FSDP version."""
-        test_method = getattr(self, self._testMethodName)
-        orig_test_method = test_method
-
-        def test_wrapper(*args, **kwargs):
-            for fsdp_version in FSDP_VERSIONS:
-                try:
-                    self.current_fsdp_version = fsdp_version
-                    return orig_test_method(*args, **kwargs)
-                except Exception as e:
-                    raise type(e)(f"FSDP version {fsdp_version}: {str(e)}") from e
-
-        setattr(self, self._testMethodName, test_wrapper)
-        try:
-            return super().run(result)
-        finally:
-            setattr(self, self._testMethodName, orig_test_method)
+        self.current_fsdp_version = 1

     def test_sharding_strategy(self):
         from torch.distributed.fsdp.fully_sharded_data_parallel import ShardingStrategy
@@ -421,6 +399,15 @@ def test_cpu_ram_efficient_loading(self):
         assert os.environ.get("FSDP_CPU_RAM_EFFICIENT_LOADING") == "False"


+@require_fsdp2
+@require_non_cpu
+@require_non_torch_xla
+class FSDP2PluginIntegration(FSDPPluginIntegration):
+    def setUp(self):
+        super().setUp()
+        self.current_fsdp_version = 2
+
+
 @run_first
 # Skip this test when TorchXLA is available because accelerate.launch does not support TorchXLA FSDP.
 @require_non_torch_xla
@@ -462,24 +449,7 @@ def setUp(self):
         self.n_train = 160
         self.n_val = 160

-    def run(self, result=None):
-        """Override run to get the current test name and format failures to include FSDP version."""
-        test_method = getattr(self, self._testMethodName)
-        orig_test_method = test_method
-
-        def test_wrapper(*args, **kwargs):
-            for fsdp_version in FSDP_VERSIONS:
-                try:
-                    self.current_fsdp_version = fsdp_version
-                    return orig_test_method(*args, **kwargs)
-                except Exception as e:
-                    raise type(e)(f"FSDP version {fsdp_version}: {str(e)}") from e
-
-        setattr(self, self._testMethodName, test_wrapper)
-        try:
-            return super().run(result)
-        finally:
-            setattr(self, self._testMethodName, orig_test_method)
+        self.current_fsdp_version = 1

     @require_fp16
     def test_performance(self):
@@ -633,3 +603,15 @@ def test_peak_memory_usage(self):
         )
         with patch_environment(omp_num_threads=1):
             execute_subprocess_async(cmd_config)
+
+
+@require_fsdp2
+@run_first
+# Skip this test when TorchXLA is available because accelerate.launch does not support TorchXLA FSDP.
+@require_non_torch_xla
+@require_multi_device
+@slow
+class FSDP2IntegrationTest(FSDPIntegrationTest):
+    def setUp(self):
+        super().setUp()
+        self.current_fsdp_version = 2

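The test refactor replaces the `run()` override that looped over FSDP versions with plain subclassing: the FSDP2 variants inherit every test and only change `current_fsdp_version` in `setUp`, so failures are attributed to a distinct class name instead of a rewritten exception message. A stripped-down illustration of the pattern (class and method names are invented for the example, not taken from the real suite):

```python
import unittest


class BaseFSDPTests(unittest.TestCase):
    def setUp(self):
        self.current_fsdp_version = 1

    def test_version_is_configured(self):
        self.assertIn(self.current_fsdp_version, (1, 2))


class FSDP2Tests(BaseFSDPTests):
    def setUp(self):
        super().setUp()
        # Every inherited test now runs against FSDP2 and reports under this class name.
        self.current_fsdp_version = 2
```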