From 5cc263c42cb4f2c4dd80ec597ce4fa80639ed47f Mon Sep 17 00:00:00 2001
From: Nicki Skafte <skaftenicki@gmail.com>
Date: Mon, 11 Aug 2025 07:11:03 +0200
Subject: [PATCH 1/6] add exclude_frozen_parameters to deepspeed strategies

---
 src/lightning/fabric/strategies/deepspeed.py  |  8 +++++++-
 src/lightning/pytorch/strategies/deepspeed.py | 11 ++++++++++-
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/src/lightning/fabric/strategies/deepspeed.py b/src/lightning/fabric/strategies/deepspeed.py
index 48333455240cf..c6a3f0c22a5b9 100644
--- a/src/lightning/fabric/strategies/deepspeed.py
+++ b/src/lightning/fabric/strategies/deepspeed.py
@@ -101,6 +101,7 @@ def __init__(
         precision: Optional[Precision] = None,
         process_group_backend: Optional[str] = None,
         timeout: Optional[timedelta] = default_pg_timeout,
+        exclude_frozen_parameters: bool = False,
     ) -> None:
         """Provides capabilities to run training using the DeepSpeed library, with training optimizations for large
         billion parameter models. `For more information: https://pytorch-
@@ -230,6 +231,8 @@ def __init__(
                 when using ZeRO Stage 3. This differs from the DeepSpeed checkpoint which contains shards
                 per worker.
 
+            exclude_frozen_parameters: Exclude frozen parameters (``requires_grad=False``) when saving checkpoints.
+
         """
         if not _DEEPSPEED_AVAILABLE:
             raise ImportError(
@@ -290,6 +293,7 @@ def __init__(
 
         self.remote_device = remote_device
         self.load_full_weights = load_full_weights
+        self.exclude_frozen_parameters = exclude_frozen_parameters
 
         # default FP16 parameters.
         self.loss_scale = loss_scale
@@ -445,7 +449,9 @@ def save_checkpoint(
         # there might be other stateful objects unrelated to the deepspeed engine - convert them to a state_dict
         state = self._convert_stateful_objects_in_state(state, filter={})
         # use deepspeed's internal checkpointing function to handle partitioned weights across processes
-        engine.save_checkpoint(path, client_state=state, tag="checkpoint")
+        engine.save_checkpoint(
+            path, client_state=state, tag="checkpoint", exclude_frozen_parameters=self.exclude_frozen_parameters
+        )
 
     @override
     def load_checkpoint(
diff --git a/src/lightning/pytorch/strategies/deepspeed.py b/src/lightning/pytorch/strategies/deepspeed.py
index dabfde70242b9..261b24545314f 100644
--- a/src/lightning/pytorch/strategies/deepspeed.py
+++ b/src/lightning/pytorch/strategies/deepspeed.py
@@ -122,6 +122,7 @@ def __init__(
         precision_plugin: Optional[Precision] = None,
         process_group_backend: Optional[str] = None,
         timeout: Optional[timedelta] = default_pg_timeout,
+        exclude_frozen_parameters: bool = False,
     ) -> None:
         """Provides capabilities to run training using the DeepSpeed library, with training optimizations for large
         billion parameter models. `For more information: https://pytorch-
@@ -253,6 +254,8 @@ def __init__(
                 when using ZeRO Stage 3. This differs from the DeepSpeed checkpoint which contains shards
                 per worker.
 
+            exclude_frozen_parameters: Exclude frozen parameters (``requires_grad=False``) when saving checkpoints.
+
         """
         if not _DEEPSPEED_AVAILABLE:
             raise MisconfigurationException(
@@ -311,6 +314,7 @@ def __init__(
 
         self.remote_device = remote_device
         self.load_full_weights = load_full_weights
+        self.exclude_frozen_parameters = exclude_frozen_parameters
 
         # default FP16 parameters.
         self.loss_scale = loss_scale
@@ -648,7 +652,12 @@ def save_checkpoint(self, checkpoint: dict, filepath: _PATH, storage_options: Op
         # dump states as a checkpoint dictionary object
         _exclude_keys = ["state_dict", "optimizer_states"]
         checkpoint = {k: v for k, v in checkpoint.items() if k not in _exclude_keys}
-        self.deepspeed_engine.save_checkpoint(filepath, client_state=checkpoint, tag="checkpoint")
+        self.deepspeed_engine.save_checkpoint(
+            filepath,
+            client_state=checkpoint,
+            tag="checkpoint",
+            exclude_frozen_parameters=self.exclude_frozen_parameters,
+        )
 
     @override
     def load_checkpoint(self, checkpoint_path: _PATH) -> dict[str, Any]:

From 9160f7e5cdd7bfd425beebbc16778b57e07ef70e Mon Sep 17 00:00:00 2001
From: Nicki Skafte <skaftenicki@gmail.com>
Date: Mon, 11 Aug 2025 07:17:07 +0200
Subject: [PATCH 2/6] add testing

---
 .../tests_fabric/strategies/test_deepspeed.py | 21 ++++++++++
 .../strategies/test_deepspeed.py              | 40 +++++++++++++++++++
 2 files changed, 61 insertions(+)

diff --git a/tests/tests_fabric/strategies/test_deepspeed.py b/tests/tests_fabric/strategies/test_deepspeed.py
index d24021fb27b31..8db1605293377 100644
--- a/tests/tests_fabric/strategies/test_deepspeed.py
+++ b/tests/tests_fabric/strategies/test_deepspeed.py
@@ -219,6 +219,27 @@ def test_deepspeed_save_checkpoint_warn_colliding_keys(tmp_path):
         strategy.save_checkpoint(path=tmp_path, state={"model": model, "optimizer": optimizer, "mp_world_size": 2})
 
 
+@RunIf(deepspeed=True)
+@pytest.mark.parametrize("exclude_frozen_parameters", [True, False])
+def test_deepspeed_save_checkpoint_exclude_frozen_parameters(exclude_frozen_parameters):
+    """Test that the DeepSpeed strategy can save checkpoints with the `exclude_frozen_parameters` argument."""
+    from deepspeed import DeepSpeedEngine
+
+    strategy = DeepSpeedStrategy(exclude_frozen_parameters=exclude_frozen_parameters)
+    assert strategy.exclude_frozen_parameters is exclude_frozen_parameters
+
+    model = Mock(spec=DeepSpeedEngine, optimizer=None)
+    model.modules.return_value = [model]
+    strategy.save_checkpoint(path="test_path", state={"model": model, "extra": "data"})
+
+    model.save_checkpoint.assert_called_with(
+        "test_path",
+        client_state={"extra": "data"},
+        tag="checkpoint",
+        exclude_frozen_parameters=exclude_frozen_parameters,
+    )
+
+
 @RunIf(deepspeed=True)
 def test_deepspeed_load_checkpoint_validate_path(tmp_path):
     """Test that we validate the checkpoint path for a DeepSpeed checkpoint and give suggestions for user error."""
diff --git a/tests/tests_pytorch/strategies/test_deepspeed.py b/tests/tests_pytorch/strategies/test_deepspeed.py
index 7e7d2eacd0617..503d1ea0e630b 100644
--- a/tests/tests_pytorch/strategies/test_deepspeed.py
+++ b/tests/tests_pytorch/strategies/test_deepspeed.py
@@ -562,6 +562,46 @@ def test_deepspeed_multigpu_single_file(tmp_path):
     trainer.test(model, ckpt_path=checkpoint_path)
 
 
+@RunIf(min_cuda_gpus=1, standalone=True, deepspeed=True)
+def test_deepspeed_strategy_exclude_frozen_parameters_integration(tmp_path):
+    """Test end-to-end integration of exclude_frozen_parameters with actual model training and checkpointing."""
+
+    class TestModelWithFrozenParams(BoringModel):
+        def __init__(self):
+            super().__init__()
+            self.frozen_layer = torch.nn.Linear(32, 32)
+
+        def configure_model(self) -> None:
+            super().configure_model()
+            # Freeze the additional layer parameters
+            for param in self.frozen_layer.parameters():
+                param.requires_grad = False
+
+        def forward(self, x):
+            x = self.frozen_layer(x)
+            return super().forward(x)
+
+    model = TestModelWithFrozenParams()
+
+    trainer = Trainer(
+        default_root_dir=tmp_path,
+        strategy=DeepSpeedStrategy(exclude_frozen_parameters=True),
+        accelerator="gpu",
+        devices=1,
+        fast_dev_run=True,
+        precision="16-mixed",
+        enable_progress_bar=False,
+        enable_model_summary=False,
+    )
+
+    trainer.fit(model)
+    checkpoint_path = os.path.join(tmp_path, "checkpoint_exclude_frozen.ckpt")
+    trainer.save_checkpoint(checkpoint_path)
+
+    # Verify checkpoint was created
+    assert os.path.exists(checkpoint_path)
+
+
 class ModelParallelClassificationModel(LightningModule):
     def __init__(self, lr: float = 0.01, num_blocks: int = 5):
         super().__init__()

From 457b962d24607608b5d73bed18196a97432e1feb Mon Sep 17 00:00:00 2001
From: Nicki Skafte <skaftenicki@gmail.com>
Date: Thu, 14 Aug 2025 13:15:52 +0200
Subject: [PATCH 3/6] fix tests

---
 tests/tests_fabric/strategies/test_deepspeed.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/tests/tests_fabric/strategies/test_deepspeed.py b/tests/tests_fabric/strategies/test_deepspeed.py
index 8db1605293377..0194c7b87820a 100644
--- a/tests/tests_fabric/strategies/test_deepspeed.py
+++ b/tests/tests_fabric/strategies/test_deepspeed.py
@@ -194,7 +194,9 @@ def test_deepspeed_save_checkpoint_client_state_separation(tmp_path):
     model.modules.return_value = [model]
     strategy.save_checkpoint(path=tmp_path, state={"model": model, "test": "data"})
     # the client_state should not contain any deepspeed engine or deepspeed optimizer
-    model.save_checkpoint.assert_called_with(tmp_path, client_state={"test": "data"}, tag="checkpoint")
+    model.save_checkpoint.assert_called_with(
+        tmp_path, client_state={"test": "data"}, tag="checkpoint", exclude_frozen_parameters=False
+    )
 
     # Model and optimizer
     optimizer = Mock()
@@ -202,7 +204,9 @@ def test_deepspeed_save_checkpoint_client_state_separation(tmp_path):
     model.modules.return_value = [model]
     strategy.save_checkpoint(path=tmp_path, state={"model": model, "optimizer": optimizer, "test": "data"})
     # the client_state should not contain any deepspeed engine or deepspeed optimizer
-    model.save_checkpoint.assert_called_with(tmp_path, client_state={"test": "data"}, tag="checkpoint")
+    model.save_checkpoint.assert_called_with(
+        tmp_path, client_state={"test": "data"}, tag="checkpoint", exclude_frozen_parameters=False
+    )
 
 
 @RunIf(deepspeed=True)

From 9e920066ce6f470bbbcecc089f89123af3c85efd Mon Sep 17 00:00:00 2001
From: Nicki Skafte <skaftenicki@gmail.com>
Date: Thu, 14 Aug 2025 13:17:59 +0200
Subject: [PATCH 4/6] changelog

---
 src/lightning/fabric/CHANGELOG.md  | 2 +-
 src/lightning/pytorch/CHANGELOG.md | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/lightning/fabric/CHANGELOG.md b/src/lightning/fabric/CHANGELOG.md
index 4ff228627182c..832240a406582 100644
--- a/src/lightning/fabric/CHANGELOG.md
+++ b/src/lightning/fabric/CHANGELOG.md
@@ -9,7 +9,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Added
 
--
+- Added `exclude_frozen_parameters` to `DeepSpeedStrategy` ([#21060](https://github.com/Lightning-AI/pytorch-lightning/pull/21060))
 
 
 ### Removed
diff --git a/src/lightning/pytorch/CHANGELOG.md b/src/lightning/pytorch/CHANGELOG.md
index 67d97f027b017..a3b47396f1af2 100644
--- a/src/lightning/pytorch/CHANGELOG.md
+++ b/src/lightning/pytorch/CHANGELOG.md
@@ -16,6 +16,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Added support for general mappings being returned from `training_step` when using manual optimization ([#21011](https://github.com/Lightning-AI/pytorch-lightning/pull/21011))
 
 
+- Added `exclude_frozen_parameters` to `DeepSpeedStrategy` ([#21060](https://github.com/Lightning-AI/pytorch-lightning/pull/21060))
+
+
 ### Changed
 
 - Allow returning `ONNXProgram` when calling `to_onnx(dynamo=True)` ([#20811](https://github.com/Lightning-AI/pytorch-lightning/pull/20811))

From aa886d33e7d6eb1565edfd6d846185ed8d6f168b Mon Sep 17 00:00:00 2001
From: Jirka B <j.borovec+github@gmail.com>
Date: Tue, 2 Sep 2025 15:14:24 +0200
Subject: [PATCH 5/6] set GLOO_SOCKET_IFNAME for pytorch CI tests

---
 .github/workflows/ci-tests-pytorch.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/ci-tests-pytorch.yml b/.github/workflows/ci-tests-pytorch.yml
index f8df4cee8d10c..ad0d1b48e52e9 100644
--- a/.github/workflows/ci-tests-pytorch.yml
+++ b/.github/workflows/ci-tests-pytorch.yml
@@ -72,6 +72,7 @@ jobs:
       PYPI_CACHE_DIR: "_pip-wheels"
       # TODO: Remove this - Enable running MPS tests on this platform
       DISABLE_MPS: ${{ matrix.os == 'macOS-14' && '1' || '0' }}
+      GLOO_SOCKET_IFNAME: "eth0"
     steps:
       - uses: actions/checkout@v5
 

From 69dae7c4f4982b66f78f39d5bc73f02b9e1c0831 Mon Sep 17 00:00:00 2001
From: Jirka B <j.borovec+github@gmail.com>
Date: Tue, 2 Sep 2025 15:42:44 +0200
Subject: [PATCH 6/6] set GLOO_SOCKET_IFNAME only on Linux runners

---
 .github/workflows/ci-tests-pytorch.yml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/ci-tests-pytorch.yml b/.github/workflows/ci-tests-pytorch.yml
index ad0d1b48e52e9..f9cfc9a843b6c 100644
--- a/.github/workflows/ci-tests-pytorch.yml
+++ b/.github/workflows/ci-tests-pytorch.yml
@@ -72,7 +72,6 @@ jobs:
       PYPI_CACHE_DIR: "_pip-wheels"
       # TODO: Remove this - Enable running MPS tests on this platform
       DISABLE_MPS: ${{ matrix.os == 'macOS-14' && '1' || '0' }}
-      GLOO_SOCKET_IFNAME: "eth0"
     steps:
       - uses: actions/checkout@v5
 
@@ -84,6 +83,10 @@ jobs:
       - name: basic setup
         run: pip install -q -r .actions/requirements.txt
 
+      - name: Append Env. vars for Linux
+        if: ${{ runner.os == 'Linux' }}
+        run: echo "GLOO_SOCKET_IFNAME=eth0" >> $GITHUB_ENV
+
       - name: Set min. dependencies
         if: ${{ matrix.config.requires == 'oldest' }}
         run: |