Commit b3ce371

could it be
1 parent 9389669 commit b3ce371

File tree

3 files changed: +33 -9 lines changed


src/lightning/fabric/utilities/imports.py

Lines changed: 1 addition & 0 deletions
@@ -35,6 +35,7 @@
 _TORCH_GREATER_EQUAL_2_4 = compare_version("torch", operator.ge, "2.4.0")
 _TORCH_GREATER_EQUAL_2_4_1 = compare_version("torch", operator.ge, "2.4.1")
 _TORCH_GREATER_EQUAL_2_5 = compare_version("torch", operator.ge, "2.5.0")
+_TORCH_GREATER_EQUAL_2_6 = compare_version("torch", operator.ge, "2.6.0")
 _TORCH_LESS_EQUAL_2_6 = compare_version("torch", operator.le, "2.6.0")
 _TORCHMETRICS_GREATER_EQUAL_1_0_0 = compare_version("torchmetrics", operator.ge, "1.0.0")
 _PYTHON_GREATER_EQUAL_3_10_0 = (sys.version_info.major, sys.version_info.minor) >= (3, 10)
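The new _TORCH_GREATER_EQUAL_2_6 flag follows the existing pattern in this module: a module-level boolean computed once at import time with compare_version from lightning_utilities. A minimal sketch of how such a flag is typically defined and consumed (the require_torch_2_6 helper below is hypothetical, not part of this commit):

import operator

from lightning_utilities.core.imports import compare_version

# True when the installed torch satisfies the minimum version for FSDP2.
_TORCH_GREATER_EQUAL_2_6 = compare_version("torch", operator.ge, "2.6.0")


def require_torch_2_6(feature: str) -> None:
    # Hypothetical helper: fail fast with a clear message on older torch.
    if not _TORCH_GREATER_EQUAL_2_6:
        import torch

        raise ModuleNotFoundError(f"{feature} requires torch>=2.6.0, found torch {torch.__version__}.")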

src/lightning/pytorch/strategies/fsdp2.py

Lines changed: 14 additions & 4 deletions
@@ -48,7 +48,7 @@
     _sync_ddp_if_available,
 )
 from lightning.fabric.utilities.distributed import group as _group
-from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_3
+from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_6
 from lightning.fabric.utilities.init import _has_all_dtensor_params_or_buffers, _has_meta_device_parameters_or_buffers
 from lightning.fabric.utilities.optimizer import _optimizers_to_device
 from lightning.fabric.utilities.seed import reset_seed
@@ -66,9 +66,9 @@
     from torch.distributed.device_mesh import DeviceMesh
     from torch.distributed.fsdp import CPUOffloadPolicy, MixedPrecisionPolicy, OffloadPolicy

-try:
+if _TORCH_GREATER_EQUAL_2_6:
     from torch.distributed.checkpoint.stateful import Stateful as _TorchStateful
-except ImportError:
+else:

     class _TorchStateful:  # type: ignore[no-redef]
         pass
@@ -131,6 +131,11 @@ def __init__(
         mp_policy: Optional["MixedPrecisionPolicy"] = None,
         **kwargs: Any,
     ) -> None:
+        if not _TORCH_GREATER_EQUAL_2_6:
+            raise ModuleNotFoundError(
+                "FSDP2Strategy requires torch>=2.6.0. "
+                f"Found torch {torch.__version__}. Please upgrade torch to use FSDP2Strategy."
+            )
         super().__init__(
             accelerator=accelerator,
             parallel_devices=parallel_devices,
@@ -206,7 +211,7 @@ def setup_environment(self) -> None:
         self._process_group_backend = self._get_process_group_backend()
         assert self.cluster_environment is not None
         kwargs: dict[str, Any] = {"timeout": self._timeout}
-        if _TORCH_GREATER_EQUAL_2_3:
+        if _TORCH_GREATER_EQUAL_2_6:
             kwargs["device_id"] = self.root_device if self.root_device.type != "cpu" else None
         _init_dist_connection(self.cluster_environment, self._process_group_backend, **kwargs)

@@ -551,6 +556,11 @@ class AppState(_TorchStateful):
     """

     def __init__(self, model: Module, optimizers: list[Optimizer]) -> None:
+        if not _TORCH_GREATER_EQUAL_2_6:
+            raise ModuleNotFoundError(
+                "AppState requires torch>=2.6.0. "
+                f"Found torch {torch.__version__}. Please upgrade torch to use AppState."
+            )
         self.model = model
         self.optimizers = optimizers
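The try/except ImportError around Stateful is replaced with an explicit check on the new version flag, and both FSDP2Strategy.__init__ and AppState.__init__ now fail fast on older torch. A standalone sketch of the same gating pattern, assuming only the flag and the public torch API (the MyStateful class is illustrative, not part of this commit):

import torch

from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_6

if _TORCH_GREATER_EQUAL_2_6:
    # Optional import guarded by the version flag instead of try/except ImportError.
    from torch.distributed.checkpoint.stateful import Stateful as _TorchStateful
else:

    class _TorchStateful:  # fallback base so the module still imports on older torch
        pass


class MyStateful(_TorchStateful):  # illustrative subclass
    def __init__(self) -> None:
        if not _TORCH_GREATER_EQUAL_2_6:
            # Fail fast at construction time with an actionable message.
            raise ModuleNotFoundError(f"MyStateful requires torch>=2.6.0, found torch {torch.__version__}.")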

tests/tests_pytorch/strategies/test_fsdp2.py

Lines changed: 18 additions & 5 deletions
@@ -132,6 +132,7 @@ def _assert_save_equality(trainer, ckpt_path, cls=TestFSDP2Model):
         assert torch.equal(ddp_param, shard_param)


+@RunIf(min_torch="2.6.0")
 @pytest.mark.parametrize("strategy", ["fsdp2", "fsdp2_cpu_offload"])
 def test_invalid_on_cpu(tmp_path, cuda_count_0, strategy):
     """Test to ensure that we raise Misconfiguration for FSDP on CPU."""
@@ -141,6 +142,7 @@ def test_invalid_on_cpu(tmp_path, cuda_count_0, strategy):
         trainer.strategy.setup_environment()


+@RunIf(min_torch="2.6.0")
 def test_custom_mixed_precision():
     """Test to ensure that passing a custom mixed precision config works."""
     from torch.distributed.fsdp import MixedPrecisionPolicy
@@ -168,6 +170,7 @@ class InvalidMPPolicy:
         FSDP2Strategy(mp_policy=InvalidMPPolicy())


+@RunIf(min_torch="2.6.0")
 @pytest.mark.filterwarnings("ignore::FutureWarning")
 @RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True)
 def test_strategy_sync_batchnorm(tmp_path):
@@ -185,6 +188,7 @@ def test_strategy_sync_batchnorm(tmp_path):
     _run_multiple_stages(trainer, model, os.path.join(tmp_path, "last.ckpt"))


+@RunIf(min_torch="2.6.0")
 @pytest.mark.filterwarnings("ignore::FutureWarning")
 @RunIf(min_cuda_gpus=1, skip_windows=True)
 def test_modules_without_parameters(tmp_path):
@@ -217,7 +221,7 @@ def training_step(self, batch, batch_idx):


 @pytest.mark.filterwarnings("ignore::FutureWarning")
-@RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True)
+@RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True, min_torch="2.6.0")
 @pytest.mark.parametrize("precision", ["16-mixed", pytest.param("bf16-mixed", marks=RunIf(bf16_cuda=True))])
 def test_strategy_checkpoint(state_dict_type, precision, tmp_path):
     """Test to ensure that checkpoint is saved correctly when using a single GPU, and all stages can be run."""
@@ -237,7 +241,7 @@ def custom_auto_wrap_policy(
     return nonwrapped_numel >= 2


-@RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True)
+@RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True, min_torch="2.6.0")
 @pytest.mark.parametrize(
     ("precision", "expected_dtype"),
     [
@@ -279,6 +283,7 @@ def on_fit_start(self):
     trainer.fit(model)


+@RunIf(min_torch="2.6.0")
 def test_save_checkpoint_storage_options(tmp_path):
     """Test that the FSDP strategy does not accept storage options for saving checkpoints."""
     strategy = FSDP2Strategy()
@@ -304,7 +309,7 @@ def on_train_start(self):


 @pytest.mark.filterwarnings("ignore::FutureWarning")
-@RunIf(min_cuda_gpus=2, standalone=True)
+@RunIf(min_cuda_gpus=2, standalone=True, min_torch="2.6.0")
 def test_save_load_sharded_state_dict(tmp_path):
     """Test FSDP saving and loading with the sharded state dict format."""
     strategy = FSDP2Strategy()
@@ -341,7 +346,7 @@ def test_save_load_sharded_state_dict(tmp_path):
     trainer.fit(model, ckpt_path=checkpoint_path)


-@RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True)
+@RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True, min_torch="2.6.0")
 @pytest.mark.parametrize(
     ("precision", "expected_dtype"),
     [
@@ -391,7 +396,7 @@ def _run_setup_assertions(empty_init, expected_device):


 @pytest.mark.filterwarnings("ignore::FutureWarning")
-@RunIf(min_cuda_gpus=2, standalone=True, min_torch="2.3.0")
+@RunIf(min_cuda_gpus=2, standalone=True, min_torch="2.6.0")
 def test_save_sharded_and_consolidate_and_load(tmp_path):
     """Test the consolidation of a FSDP2-sharded checkpoint into a single file."""

@@ -433,3 +438,11 @@ def configure_optimizers(self):
         max_steps=4,
     )
     trainer.fit(model, ckpt_path=checkpoint_path_full)
+
+
+@RunIf(max_torch="2.5")
+@pytest.mark.parametrize("strategy", ["fsdp2", "fsdp2_cpu_offload"])
+def test_fsdp2_requires_torch_2_6_or_newer(tmp_path, strategy):
+    """FSDP2 strategies should error on torch < 2.6."""
+    with pytest.raises(ModuleNotFoundError, match="FSDP2Strategy requires torch>=2.6.0."):
+        Trainer(accelerator="cpu", default_root_dir=tmp_path, fast_dev_run=True, strategy=strategy)
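All touched tests are now gated on torch 2.6: existing tests gain RunIf(min_torch="2.6.0") (or min_torch="2.6.0" added to an existing RunIf), while the new negative test uses RunIf(max_torch="2.5") so it only runs where the strategy is expected to raise. A rough sketch of how a min_torch/max_torch gate of this kind can map onto pytest.mark.skipif (simplified; the real RunIf helper in the Lightning test suite supports many more conditions):

from typing import Optional

import pytest
import torch
from packaging.version import Version


def run_if(min_torch: Optional[str] = None, max_torch: Optional[str] = None):
    # Simplified stand-in for Lightning's RunIf marker: skip the test unless the
    # installed torch version falls inside the requested range.
    torch_version = Version(Version(torch.__version__).base_version)
    conditions = []
    if min_torch is not None:
        conditions.append(torch_version < Version(min_torch))
    if max_torch is not None:
        conditions.append(torch_version >= Version(max_torch))
    reason = f"torch {torch.__version__} is outside the required range [{min_torch}, {max_torch})"
    return pytest.mark.skipif(any(conditions), reason=reason)


@run_if(min_torch="2.6.0")
def test_runs_only_on_new_torch():
    assert True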
