diff --git a/.azure/gpu-tests-fabric.yml b/.azure/gpu-tests-fabric.yml
index b2f8ab0447a20..12bd49fdc31d3 100644
--- a/.azure/gpu-tests-fabric.yml
+++ b/.azure/gpu-tests-fabric.yml
@@ -85,6 +85,7 @@ jobs:
         displayName: "extend env. vars 4 future"
 
       - bash: |
+          set -ex
           echo $(DEVICES)
           echo $CUDA_VISIBLE_DEVICES
           echo $CUDA_VERSION_MM
@@ -96,6 +97,10 @@ jobs:
           python --version
           pip --version
           pip list
+          # todo: rather use devel base image
+          apt-get update -qq --fix-missing
+          apt-get install -y cuda-toolkit
+          nvcc --version
         displayName: "Image info & NVIDIA"
 
       - bash: |
@@ -156,7 +161,7 @@ jobs:
       - bash: python -m coverage run --source ${COVERAGE_SOURCE} -m pytest tests_fabric/ -v --durations=50
         workingDirectory: tests/
         displayName: "Testing: fabric standard"
-        timeoutInMinutes: "10"
+        timeoutInMinutes: "15"
 
       - bash: |
           wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/run_standalone_tests.sh
@@ -165,7 +170,7 @@
         env:
           PL_RUN_STANDALONE_TESTS: "1"
         displayName: "Testing: fabric standalone"
-        timeoutInMinutes: "10"
+        timeoutInMinutes: "15"
 
       - bash: |
           python -m coverage report
diff --git a/.azure/gpu-tests-pytorch.yml b/.azure/gpu-tests-pytorch.yml
index 888272d80012a..3f7efdd6490fb 100644
--- a/.azure/gpu-tests-pytorch.yml
+++ b/.azure/gpu-tests-pytorch.yml
@@ -84,6 +84,7 @@ jobs:
         displayName: "extend env. vars 4 future"
 
       - bash: |
+          set -ex
           echo $(DEVICES)
           echo $CUDA_VISIBLE_DEVICES
           echo $CUDA_VERSION_MM
@@ -95,6 +96,10 @@ jobs:
           python --version
           pip --version
           pip list
+          # todo: rather use devel base image
+          apt-get update -qq --fix-missing
+          apt-get install -y cuda-toolkit
+          nvcc --version
         displayName: "Image info & NVIDIA"
 
      - bash: |
@@ -189,7 +194,7 @@
         env:
           PL_USE_MOCKED_MNIST: "1"
         displayName: "Testing: PyTorch standalone tasks"
-        timeoutInMinutes: "10"
+        timeoutInMinutes: "15"
 
       - bash: |
           python -m coverage report
diff --git a/.lightning/workflows/fabric.yml b/.lightning/workflows/fabric.yml
index a62776aa9ffab..5b77347603bfc 100644
--- a/.lightning/workflows/fabric.yml
+++ b/.lightning/workflows/fabric.yml
@@ -6,18 +6,18 @@ trigger:
 
 timeout: "60" # minutes
 machine: "L4_X_2"
-image: "nvidia/cuda:12.6.3-runtime-ubuntu22.04"
+image: "nvidia/cuda:12.6.3-devel-ubuntu22.04"
 parametrize:
   matrix: {}
   include:
     # note that this is setting also all oldest requirements which is linked to python == 3.10
-    - image: "nvidia/cuda:12.1.1-runtime-ubuntu22.04"
+    - image: "nvidia/cuda:12.1.1-devel-ubuntu22.04"
       PACKAGE_NAME: "fabric"
       python_version: "3.10"
     - PACKAGE_NAME: "fabric"
       python_version: "3.12"
-    # - image: "nvidia/cuda:12.6-runtime-ubuntu22.04"
-    #   PACKAGE_NAME: "fabric"
+    #- image: "nvidia/cuda:12.6-runtime-ubuntu22.04"
+    #  PACKAGE_NAME: "fabric"
     - PACKAGE_NAME: "lightning"
       python_version: "3.12"
   exclude: []
diff --git a/.lightning/workflows/pytorch.yml b/.lightning/workflows/pytorch.yml
index a1177e6521f06..39ddaf708d0c7 100644
--- a/.lightning/workflows/pytorch.yml
+++ b/.lightning/workflows/pytorch.yml
@@ -6,18 +6,18 @@ trigger:
 
 timeout: "60" # minutes
 machine: "L4_X_2"
-image: "nvidia/cuda:12.6.3-runtime-ubuntu22.04"
+image: "nvidia/cuda:12.6.3-devel-ubuntu22.04"
 parametrize:
   matrix: {}
   include:
     # note that this also sets oldest requirements which are linked to Python == 3.10
-    - image: "nvidia/cuda:12.1.1-runtime-ubuntu22.04"
+    - image: "nvidia/cuda:12.1.1-devel-ubuntu22.04"
      PACKAGE_NAME: "pytorch"
       python_version: "3.10"
     - PACKAGE_NAME: "pytorch"
       python_version: "3.12"
-    # - image: "nvidia/cuda:12.6.3-runtime-ubuntu22.04"
-    #   PACKAGE_NAME: "pytorch"
+    #- image: "nvidia/cuda:12.6.3-devel-ubuntu22.04"
+    #  PACKAGE_NAME: "pytorch"
     - PACKAGE_NAME: "lightning"
       python_version: "3.12"
   exclude: []
diff --git a/src/lightning/fabric/utilities/testing/_runif.py b/src/lightning/fabric/utilities/testing/_runif.py
index ec980693b75f3..d085e4138d742 100644
--- a/src/lightning/fabric/utilities/testing/_runif.py
+++ b/src/lightning/fabric/utilities/testing/_runif.py
@@ -113,7 +113,7 @@ def _runif_reasons(
         reasons.append("Standalone execution")
         kwargs["standalone"] = True
 
-    if deepspeed and not (_DEEPSPEED_AVAILABLE and not _TORCH_GREATER_EQUAL_2_4):
+    if deepspeed and not (_DEEPSPEED_AVAILABLE and _TORCH_GREATER_EQUAL_2_4):
         reasons.append("Deepspeed")
 
     if dynamo:
diff --git a/src/lightning/pytorch/utilities/deepspeed.py b/src/lightning/pytorch/utilities/deepspeed.py
index 619e22cac9401..20b418437c681 100644
--- a/src/lightning/pytorch/utilities/deepspeed.py
+++ b/src/lightning/pytorch/utilities/deepspeed.py
@@ -93,10 +93,10 @@ def convert_zero_checkpoint_to_fp32_state_dict(
     ]
     checkpoint_dir = ds_checkpoint_dir(checkpoint_dir)
     optim_files = get_optim_files(checkpoint_dir)
-    optim_state = torch.load(optim_files[0], map_location=CPU_DEVICE)
+    optim_state = torch.load(optim_files[0], map_location=CPU_DEVICE, weights_only=False)
     zero_stage = optim_state["optimizer_state_dict"]["zero_stage"]
     model_file = get_model_state_file(checkpoint_dir, zero_stage)
-    client_state = torch.load(model_file, map_location=CPU_DEVICE)
+    client_state = torch.load(model_file, map_location=CPU_DEVICE, weights_only=False)
     client_state = {key: value for key, value in client_state.items() if key not in deepspeed_states}
     # State dict keys will include reference to wrapper _LightningModuleWrapperBase in old checkpoints created in
     # Lightning version < 2.1. Delete the `_forward_module` prefix before saving.
diff --git a/tests/tests_pytorch/utilities/test_compile.py b/tests/tests_pytorch/utilities/test_compile.py
index f90cd5e3ef3fa..cd4106c3f150a 100644
--- a/tests/tests_pytorch/utilities/test_compile.py
+++ b/tests/tests_pytorch/utilities/test_compile.py
@@ -13,14 +13,12 @@
 # limitations under the License.
 import os
 import sys
-from contextlib import nullcontext
 from unittest import mock
 
 import pytest
 import torch
-from lightning_utilities.core.imports import RequirementCache
 
-from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_2, _TORCH_GREATER_EQUAL_2_4
+from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_2
 from lightning.pytorch import LightningModule, Trainer
 from lightning.pytorch.demos.boring_classes import BoringModel
 from lightning.pytorch.utilities.compile import from_compiled, to_uncompiled
@@ -34,7 +32,7 @@
 @pytest.mark.skipif(sys.platform == "darwin", reason="fatal error: 'omp.h' file not found")
 @RunIf(dynamo=True, deepspeed=True)
 @mock.patch("lightning.pytorch.trainer.call._call_and_handle_interrupt")
-def test_trainer_compiled_model(_, tmp_path, monkeypatch, mps_count_0):
+def test_trainer_compiled_model_deepspeed(_, tmp_path, monkeypatch, mps_count_0):
     trainer_kwargs = {
         "default_root_dir": tmp_path,
         "fast_dev_run": True,
@@ -69,22 +67,52 @@ def test_trainer_compiled_model(_, tmp_path, monkeypatch, mps_count_0):
     assert trainer.model._compiler_ctx is None
 
     # some strategies do not support it
-    if RequirementCache("deepspeed"):
-        compiled_model = torch.compile(model)
-        mock_cuda_count(monkeypatch, 2)
-
-        # TODO: Update deepspeed to avoid deprecation warning for `torch.cuda.amp.custom_fwd` on import
-        warn_context = (
-            pytest.warns(FutureWarning, match="torch.cuda.amp.*is deprecated")
-            if _TORCH_GREATER_EQUAL_2_4
-            else nullcontext()
-        )
-
-        with warn_context:
-            trainer = Trainer(strategy="deepspeed", accelerator="cuda", **trainer_kwargs)
-
-        with pytest.raises(RuntimeError, match="Using a compiled model is incompatible with the current strategy.*"):
-            trainer.fit(compiled_model)
+    compiled_model = torch.compile(model)
+    mock_cuda_count(monkeypatch, 2)
+
+    trainer = Trainer(strategy="deepspeed", accelerator="cuda", **trainer_kwargs)
+
+    with pytest.raises(RuntimeError, match="Using a compiled model is incompatible with the current strategy.*"):
+        trainer.fit(compiled_model)
+
+
+# https://github.com/pytorch/pytorch/issues/95708
+@pytest.mark.skipif(sys.platform == "darwin", reason="fatal error: 'omp.h' file not found")
+@RunIf(dynamo=True)
+@mock.patch("lightning.pytorch.trainer.call._call_and_handle_interrupt")
+def test_trainer_compiled_model_ddp(_, tmp_path, monkeypatch, mps_count_0):
+    trainer_kwargs = {
+        "default_root_dir": tmp_path,
+        "fast_dev_run": True,
+        "logger": False,
+        "enable_checkpointing": False,
+        "enable_model_summary": False,
+        "enable_progress_bar": False,
+    }
+
+    model = BoringModel()
+    compiled_model = torch.compile(model)
+    assert model._compiler_ctx is compiled_model._compiler_ctx  # shared reference
+
+    # can train with compiled model
+    trainer = Trainer(**trainer_kwargs)
+    trainer.fit(compiled_model)
+    assert trainer.model._compiler_ctx["compiler"] == "dynamo"
+
+    # the compiled model can be uncompiled
+    to_uncompiled_model = to_uncompiled(compiled_model)
+    assert model._compiler_ctx is None
+    assert compiled_model._compiler_ctx is None
+    assert to_uncompiled_model._compiler_ctx is None
+
+    # the compiled model needs to be passed
+    with pytest.raises(ValueError, match="required to be a compiled LightningModule"):
+        to_uncompiled(to_uncompiled_model)
+
+    # the uncompiled model can be fitted
+    trainer = Trainer(**trainer_kwargs)
+    trainer.fit(model)
+    assert trainer.model._compiler_ctx is None
+
+    # ddp does
+    trainer = Trainer(strategy="ddp", **trainer_kwargs)
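
A note on the `torch.load` changes in `src/lightning/pytorch/utilities/deepspeed.py`: PyTorch 2.6 flipped the default of `torch.load(..., weights_only=...)` from `False` to `True`, and DeepSpeed checkpoint shards are full pickles (nested dicts of tensors plus ints, strings, and config objects), so loading them now fails unless the caller opts out explicitly. A minimal sketch of the failure mode and the fix; the shard path below is illustrative only:

```python
import torch

# Illustrative shard name only; real DeepSpeed ZeRO shards follow a similar pattern.
shard = "checkpoint/zero_pp_rank_0_mp_rank_00_optim_states.pt"

# On torch >= 2.6 the default weights_only=True restricts deserialization to
# tensors and a safe allowlist, so loading a DeepSpeed shard raises an
# UnpicklingError:
# torch.load(shard, map_location="cpu")

# Opting out restores the old behavior for trusted checkpoints, which is what
# convert_zero_checkpoint_to_fp32_state_dict needs:
state = torch.load(shard, map_location="cpu", weights_only=False)
```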
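On the one-token fix in `_runif.py`: the old predicate skipped `deepspeed=True` tests unless DeepSpeed was available and torch was *older* than 2.4, i.e. it skipped them on every torch >= 2.4 environment; the corrected predicate skips them unless DeepSpeed is available and torch is *at least* 2.4. A small sketch of the two gates, as standalone functions mirroring the flags in the diff (not Lightning API):

```python
def skip_reason_old(deepspeed: bool, available: bool, torch_ge_2_4: bool) -> bool:
    # before: skip unless (DeepSpeed available AND torch < 2.4)
    return deepspeed and not (available and not torch_ge_2_4)

def skip_reason_new(deepspeed: bool, available: bool, torch_ge_2_4: bool) -> bool:
    # after: skip unless (DeepSpeed available AND torch >= 2.4)
    return deepspeed and not (available and torch_ge_2_4)

# With DeepSpeed installed on torch 2.4+, the old gate skipped the test,
# while the new gate runs it:
assert skip_reason_old(True, True, True) is True
assert skip_reason_new(True, True, True) is False
```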