From c18a2baaa5e00b0958131b6107193a12f8434f2c Mon Sep 17 00:00:00 2001 From: Jirka B Date: Wed, 10 Sep 2025 18:40:14 +0200 Subject: [PATCH 01/14] tests: fix skipif condition for `deepspeed` --- src/lightning/fabric/utilities/testing/_runif.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lightning/fabric/utilities/testing/_runif.py b/src/lightning/fabric/utilities/testing/_runif.py index ec980693b75f3..d085e4138d742 100644 --- a/src/lightning/fabric/utilities/testing/_runif.py +++ b/src/lightning/fabric/utilities/testing/_runif.py @@ -113,7 +113,7 @@ def _runif_reasons( reasons.append("Standalone execution") kwargs["standalone"] = True - if deepspeed and not (_DEEPSPEED_AVAILABLE and not _TORCH_GREATER_EQUAL_2_4): + if deepspeed and not (_DEEPSPEED_AVAILABLE and _TORCH_GREATER_EQUAL_2_4): reasons.append("Deepspeed") if dynamo: From 4c4ec2589c5da16b95944e31dc605071d109afab Mon Sep 17 00:00:00 2001 From: Jirka B Date: Wed, 10 Sep 2025 20:02:58 +0200 Subject: [PATCH 02/14] split test_trainer_compiled_model --- tests/tests_pytorch/utilities/test_compile.py | 71 ++++++++++++++----- 1 file changed, 54 insertions(+), 17 deletions(-) diff --git a/tests/tests_pytorch/utilities/test_compile.py b/tests/tests_pytorch/utilities/test_compile.py index f90cd5e3ef3fa..961e2689cc952 100644 --- a/tests/tests_pytorch/utilities/test_compile.py +++ b/tests/tests_pytorch/utilities/test_compile.py @@ -34,7 +34,7 @@ @pytest.mark.skipif(sys.platform == "darwin", reason="fatal error: 'omp.h' file not found") @RunIf(dynamo=True, deepspeed=True) @mock.patch("lightning.pytorch.trainer.call._call_and_handle_interrupt") -def test_trainer_compiled_model(_, tmp_path, monkeypatch, mps_count_0): +def test_trainer_compiled_model_deepspeed(_, tmp_path, monkeypatch, mps_count_0): trainer_kwargs = { "default_root_dir": tmp_path, "fast_dev_run": True, @@ -69,22 +69,59 @@ def test_trainer_compiled_model(_, tmp_path, monkeypatch, mps_count_0): assert trainer.model._compiler_ctx is None # some strategies do not support it - if RequirementCache("deepspeed"): - compiled_model = torch.compile(model) - mock_cuda_count(monkeypatch, 2) - - # TODO: Update deepspeed to avoid deprecation warning for `torch.cuda.amp.custom_fwd` on import - warn_context = ( - pytest.warns(FutureWarning, match="torch.cuda.amp.*is deprecated") - if _TORCH_GREATER_EQUAL_2_4 - else nullcontext() - ) - - with warn_context: - trainer = Trainer(strategy="deepspeed", accelerator="cuda", **trainer_kwargs) - - with pytest.raises(RuntimeError, match="Using a compiled model is incompatible with the current strategy.*"): - trainer.fit(compiled_model) + compiled_model = torch.compile(model) + mock_cuda_count(monkeypatch, 2) + + # TODO: Update deepspeed to avoid deprecation warning for `torch.cuda.amp.custom_fwd` on import + warn_context = ( + pytest.warns(FutureWarning, match="torch.cuda.amp.*is deprecated") + if _TORCH_GREATER_EQUAL_2_4 + else nullcontext() + ) + + with warn_context: + trainer = Trainer(strategy="deepspeed", accelerator="cuda", **trainer_kwargs) + + with pytest.raises(RuntimeError, match="Using a compiled model is incompatible with the current strategy.*"): + trainer.fit(compiled_model) + +# https://github.com/pytorch/pytorch/issues/95708 +@pytest.mark.skipif(sys.platform == "darwin", reason="fatal error: 'omp.h' file not found") +@RunIf(dynamo=True) +@mock.patch("lightning.pytorch.trainer.call._call_and_handle_interrupt") +def test_trainer_compiled_model_ddp(_, tmp_path, monkeypatch, mps_count_0): + trainer_kwargs = { + "default_root_dir": tmp_path, + "fast_dev_run": True, + "logger": False, + "enable_checkpointing": False, + "enable_model_summary": False, + "enable_progress_bar": False, + } + + model = BoringModel() + compiled_model = torch.compile(model) + assert model._compiler_ctx is compiled_model._compiler_ctx # shared reference + + # can train with compiled model + trainer = Trainer(**trainer_kwargs) + trainer.fit(compiled_model) + assert trainer.model._compiler_ctx["compiler"] == "dynamo" + + # the compiled model can be uncompiled + to_uncompiled_model = to_uncompiled(compiled_model) + assert model._compiler_ctx is None + assert compiled_model._compiler_ctx is None + assert to_uncompiled_model._compiler_ctx is None + + # the compiled model needs to be passed + with pytest.raises(ValueError, match="required to be a compiled LightningModule"): + to_uncompiled(to_uncompiled_model) + + # the uncompiled model can be fitted + trainer = Trainer(**trainer_kwargs) + trainer.fit(model) + assert trainer.model._compiler_ctx is None # ddp does trainer = Trainer(strategy="ddp", **trainer_kwargs) From 7268bbca54d8eec2a5512e753a45534786442a90 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 10 Sep 2025 18:03:22 +0000 Subject: [PATCH 03/14] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/tests_pytorch/utilities/test_compile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/tests_pytorch/utilities/test_compile.py b/tests/tests_pytorch/utilities/test_compile.py index 961e2689cc952..fd6e405075ee2 100644 --- a/tests/tests_pytorch/utilities/test_compile.py +++ b/tests/tests_pytorch/utilities/test_compile.py @@ -18,7 +18,6 @@ import pytest import torch -from lightning_utilities.core.imports import RequirementCache from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_2, _TORCH_GREATER_EQUAL_2_4 from lightning.pytorch import LightningModule, Trainer @@ -85,6 +84,7 @@ def test_trainer_compiled_model_deepspeed(_, tmp_path, monkeypatch, mps_count_0) with pytest.raises(RuntimeError, match="Using a compiled model is incompatible with the current strategy.*"): trainer.fit(compiled_model) + # https://github.com/pytorch/pytorch/issues/95708 @pytest.mark.skipif(sys.platform == "darwin", reason="fatal error: 'omp.h' file not found") @RunIf(dynamo=True) From 8c9d7f5663229c2688024a40a8e51ddf02fc8a49 Mon Sep 17 00:00:00 2001 From: Jirka B Date: Wed, 10 Sep 2025 20:29:33 +0200 Subject: [PATCH 04/14] test_trainer_compiled_model_deepspeed --- tests/tests_pytorch/utilities/test_compile.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/tests/tests_pytorch/utilities/test_compile.py b/tests/tests_pytorch/utilities/test_compile.py index 961e2689cc952..04d372a4e9577 100644 --- a/tests/tests_pytorch/utilities/test_compile.py +++ b/tests/tests_pytorch/utilities/test_compile.py @@ -72,15 +72,7 @@ def test_trainer_compiled_model_deepspeed(_, tmp_path, monkeypatch, mps_count_0) compiled_model = torch.compile(model) mock_cuda_count(monkeypatch, 2) - # TODO: Update deepspeed to avoid deprecation warning for `torch.cuda.amp.custom_fwd` on import - warn_context = ( - pytest.warns(FutureWarning, match="torch.cuda.amp.*is deprecated") - if _TORCH_GREATER_EQUAL_2_4 - else nullcontext() - ) - - with warn_context: - trainer = Trainer(strategy="deepspeed", accelerator="cuda", **trainer_kwargs) + trainer = Trainer(strategy="deepspeed", accelerator="cuda", **trainer_kwargs) with pytest.raises(RuntimeError, match="Using a compiled model is incompatible with the current strategy.*"): trainer.fit(compiled_model) From 13383e07b72f816b805f8c624f1251b6e4de394c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 10 Sep 2025 18:30:56 +0000 Subject: [PATCH 05/14] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/tests_pytorch/utilities/test_compile.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/tests_pytorch/utilities/test_compile.py b/tests/tests_pytorch/utilities/test_compile.py index 6b7fa6ee5fce1..cd4106c3f150a 100644 --- a/tests/tests_pytorch/utilities/test_compile.py +++ b/tests/tests_pytorch/utilities/test_compile.py @@ -13,13 +13,12 @@ # limitations under the License. import os import sys -from contextlib import nullcontext from unittest import mock import pytest import torch -from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_2, _TORCH_GREATER_EQUAL_2_4 +from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_2 from lightning.pytorch import LightningModule, Trainer from lightning.pytorch.demos.boring_classes import BoringModel from lightning.pytorch.utilities.compile import from_compiled, to_uncompiled From 898195cd0a1da507230d2d73bc4c7b2571fd49c3 Mon Sep 17 00:00:00 2001 From: Jirka B Date: Wed, 10 Sep 2025 20:41:51 +0200 Subject: [PATCH 06/14] cuda-toolkit --- .azure/gpu-tests-fabric.yml | 2 ++ .azure/gpu-tests-pytorch.yml | 2 ++ 2 files changed, 4 insertions(+) diff --git a/.azure/gpu-tests-fabric.yml b/.azure/gpu-tests-fabric.yml index b2f8ab0447a20..01866ac8a1644 100644 --- a/.azure/gpu-tests-fabric.yml +++ b/.azure/gpu-tests-fabric.yml @@ -96,6 +96,8 @@ jobs: python --version pip --version pip list + apt-get install -y cuda-toolkit + nvcc --version displayName: "Image info & NVIDIA" - bash: | diff --git a/.azure/gpu-tests-pytorch.yml b/.azure/gpu-tests-pytorch.yml index d3c4951a22336..7855ff5c8850a 100644 --- a/.azure/gpu-tests-pytorch.yml +++ b/.azure/gpu-tests-pytorch.yml @@ -100,6 +100,8 @@ jobs: python --version pip --version pip list + apt-get install -y cuda-toolkit + nvcc --version displayName: "Image info & NVIDIA" - bash: | From 597985050472e37e11e5282b96af8d1e95b9876c Mon Sep 17 00:00:00 2001 From: Jirka B Date: Wed, 10 Sep 2025 20:46:07 +0200 Subject: [PATCH 07/14] update --- .azure/gpu-tests-fabric.yml | 2 ++ .azure/gpu-tests-pytorch.yml | 2 ++ 2 files changed, 4 insertions(+) diff --git a/.azure/gpu-tests-fabric.yml b/.azure/gpu-tests-fabric.yml index 01866ac8a1644..ce851d922a58d 100644 --- a/.azure/gpu-tests-fabric.yml +++ b/.azure/gpu-tests-fabric.yml @@ -85,6 +85,7 @@ jobs: displayName: "extend env. vars 4 future" - bash: | + set -ex echo $(DEVICES) echo $CUDA_VISIBLE_DEVICES echo $CUDA_VERSION_MM @@ -96,6 +97,7 @@ jobs: python --version pip --version pip list + apt-get update --qq apt-get install -y cuda-toolkit nvcc --version displayName: "Image info & NVIDIA" diff --git a/.azure/gpu-tests-pytorch.yml b/.azure/gpu-tests-pytorch.yml index 7855ff5c8850a..2caaee18b6860 100644 --- a/.azure/gpu-tests-pytorch.yml +++ b/.azure/gpu-tests-pytorch.yml @@ -89,6 +89,7 @@ jobs: displayName: "extend env. vars 4 future" - bash: | + set -ex echo $(DEVICES) echo $CUDA_VISIBLE_DEVICES echo $CUDA_VERSION_MM @@ -100,6 +101,7 @@ jobs: python --version pip --version pip list + apt-get update --qq apt-get install -y cuda-toolkit nvcc --version displayName: "Image info & NVIDIA" From c32023cad1a0536c494217eafcc1da6456d442c9 Mon Sep 17 00:00:00 2001 From: Jirka B Date: Wed, 10 Sep 2025 20:48:57 +0200 Subject: [PATCH 08/14] --fix-missing --- .azure/gpu-tests-fabric.yml | 2 +- .azure/gpu-tests-pytorch.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.azure/gpu-tests-fabric.yml b/.azure/gpu-tests-fabric.yml index ce851d922a58d..89f77c3d46237 100644 --- a/.azure/gpu-tests-fabric.yml +++ b/.azure/gpu-tests-fabric.yml @@ -97,7 +97,7 @@ jobs: python --version pip --version pip list - apt-get update --qq + apt-get update -qq --fix-missing apt-get install -y cuda-toolkit nvcc --version displayName: "Image info & NVIDIA" diff --git a/.azure/gpu-tests-pytorch.yml b/.azure/gpu-tests-pytorch.yml index 2caaee18b6860..b8619d95832c5 100644 --- a/.azure/gpu-tests-pytorch.yml +++ b/.azure/gpu-tests-pytorch.yml @@ -101,7 +101,7 @@ jobs: python --version pip --version pip list - apt-get update --qq + apt-get update -qq --fix-missing apt-get install -y cuda-toolkit nvcc --version displayName: "Image info & NVIDIA" From d8ce97d60caf3c522b35c81d45b6c13d4de4a553 Mon Sep 17 00:00:00 2001 From: Jirka B Date: Wed, 10 Sep 2025 20:54:33 +0200 Subject: [PATCH 09/14] devel --- .azure/gpu-tests-fabric.yml | 1 + .azure/gpu-tests-pytorch.yml | 1 + .lightning/workflows/fabric.yml | 6 +++--- .lightning/workflows/pytorch.yml | 6 +++--- 4 files changed, 8 insertions(+), 6 deletions(-) diff --git a/.azure/gpu-tests-fabric.yml b/.azure/gpu-tests-fabric.yml index 89f77c3d46237..154f5fadec7d9 100644 --- a/.azure/gpu-tests-fabric.yml +++ b/.azure/gpu-tests-fabric.yml @@ -97,6 +97,7 @@ jobs: python --version pip --version pip list + # toto: rather use devel base image apt-get update -qq --fix-missing apt-get install -y cuda-toolkit nvcc --version diff --git a/.azure/gpu-tests-pytorch.yml b/.azure/gpu-tests-pytorch.yml index b8619d95832c5..175b3d65eb292 100644 --- a/.azure/gpu-tests-pytorch.yml +++ b/.azure/gpu-tests-pytorch.yml @@ -101,6 +101,7 @@ jobs: python --version pip --version pip list + # toto: rather use devel base image apt-get update -qq --fix-missing apt-get install -y cuda-toolkit nvcc --version diff --git a/.lightning/workflows/fabric.yml b/.lightning/workflows/fabric.yml index 767b5588fcbb5..802968968944f 100644 --- a/.lightning/workflows/fabric.yml +++ b/.lightning/workflows/fabric.yml @@ -9,17 +9,17 @@ parametrize: matrix: {} include: # note that this is setting also all oldest requirements which is linked to python == 3.10 - - image: "nvidia/cuda:12.1.1-runtime-ubuntu22.04" + - image: "nvidia/cuda:12.1.1-devel-ubuntu22.04" PACKAGE_NAME: "fabric" python_version: "3.10" machine: "A100_X_2" - - image: "nvidia/cuda:12.6.3-runtime-ubuntu22.04" + - image: "nvidia/cuda:12.6.3-devel-ubuntu22.04" PACKAGE_NAME: "fabric" python_version: "3.12" machine: "L4_X_2" # - image: "nvidia/cuda:12.6-runtime-ubuntu22.04" # PACKAGE_NAME: "fabric" - - image: "nvidia/cuda:12.6.3-runtime-ubuntu22.04" + - image: "nvidia/cuda:12.6.3-devel-ubuntu22.04" PACKAGE_NAME: "lightning" python_version: "3.12" machine: "L4_X_2" diff --git a/.lightning/workflows/pytorch.yml b/.lightning/workflows/pytorch.yml index bbd47a8431fef..dc964ffd0cabf 100644 --- a/.lightning/workflows/pytorch.yml +++ b/.lightning/workflows/pytorch.yml @@ -9,17 +9,17 @@ parametrize: matrix: {} include: # note that this also sets oldest requirements which are linked to Python == 3.10 - - image: "nvidia/cuda:12.1.1-runtime-ubuntu22.04" + - image: "nvidia/cuda:12.1.1-devel-ubuntu22.04" PACKAGE_NAME: "pytorch" python_version: "3.10" machine: "A100_X_2" - - image: "nvidia/cuda:12.6.3-runtime-ubuntu22.04" + - image: "nvidia/cuda:12.6.3-devel-ubuntu22.04" PACKAGE_NAME: "pytorch" python_version: "3.12" machine: "L4_X_2" # - image: "nvidia/cuda:12.6.3-runtime-ubuntu22.04" # PACKAGE_NAME: "pytorch" - - image: "nvidia/cuda:12.6.3-runtime-ubuntu22.04" + - image: "nvidia/cuda:12.6.3-devel-ubuntu22.04" PACKAGE_NAME: "lightning" python_version: "3.12" machine: "L4_X_2" From 52bb67d197c86c6c6793786b51289ed23e6f14b5 Mon Sep 17 00:00:00 2001 From: Jirka B Date: Wed, 10 Sep 2025 22:07:01 +0200 Subject: [PATCH 10/14] 15 --- .azure/gpu-tests-fabric.yml | 4 ++-- .azure/gpu-tests-pytorch.yml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.azure/gpu-tests-fabric.yml b/.azure/gpu-tests-fabric.yml index 154f5fadec7d9..d179370906abe 100644 --- a/.azure/gpu-tests-fabric.yml +++ b/.azure/gpu-tests-fabric.yml @@ -161,7 +161,7 @@ jobs: - bash: python -m coverage run --source ${COVERAGE_SOURCE} -m pytest tests_fabric/ -v --durations=50 workingDirectory: tests/ displayName: "Testing: fabric standard" - timeoutInMinutes: "10" + timeoutInMinutes: "15" - bash: | wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/run_standalone_tests.sh @@ -170,7 +170,7 @@ jobs: env: PL_RUN_STANDALONE_TESTS: "1" displayName: "Testing: fabric standalone" - timeoutInMinutes: "10" + timeoutInMinutes: "15" - bash: | python -m coverage report diff --git a/.azure/gpu-tests-pytorch.yml b/.azure/gpu-tests-pytorch.yml index 175b3d65eb292..de72121e94dc5 100644 --- a/.azure/gpu-tests-pytorch.yml +++ b/.azure/gpu-tests-pytorch.yml @@ -199,7 +199,7 @@ jobs: env: PL_USE_MOCKED_MNIST: "1" displayName: "Testing: PyTorch standalone tasks" - timeoutInMinutes: "10" + timeoutInMinutes: "15" - bash: | python -m coverage report From fa3e058b8ecd6109958e635bc9a22fad0211a97e Mon Sep 17 00:00:00 2001 From: Nicki Skafte Detlefsen Date: Thu, 11 Sep 2025 08:53:48 +0200 Subject: [PATCH 11/14] Apply suggestion from @bhimrazy Co-authored-by: Bhimraj Yadav --- .azure/gpu-tests-fabric.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.azure/gpu-tests-fabric.yml b/.azure/gpu-tests-fabric.yml index d179370906abe..12bd49fdc31d3 100644 --- a/.azure/gpu-tests-fabric.yml +++ b/.azure/gpu-tests-fabric.yml @@ -97,7 +97,7 @@ jobs: python --version pip --version pip list - # toto: rather use devel base image + # todo: rather use devel base image apt-get update -qq --fix-missing apt-get install -y cuda-toolkit nvcc --version From 35de4884fc291f5ef85b0458da1ed76b123d8852 Mon Sep 17 00:00:00 2001 From: Jirka Borovec <6035284+Borda@users.noreply.github.com> Date: Thu, 11 Sep 2025 09:22:59 +0200 Subject: [PATCH 12/14] typo Co-authored-by: Nicki Skafte Detlefsen --- .azure/gpu-tests-pytorch.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.azure/gpu-tests-pytorch.yml b/.azure/gpu-tests-pytorch.yml index de72121e94dc5..e8dcfec6a6a63 100644 --- a/.azure/gpu-tests-pytorch.yml +++ b/.azure/gpu-tests-pytorch.yml @@ -101,7 +101,7 @@ jobs: python --version pip --version pip list - # toto: rather use devel base image + # todo: rather use devel base image apt-get update -qq --fix-missing apt-get install -y cuda-toolkit nvcc --version From 902b04faa8afbe1214835c257e319e318de40c75 Mon Sep 17 00:00:00 2001 From: jirka Date: Thu, 18 Sep 2025 00:37:46 +0200 Subject: [PATCH 13/14] Empty-Commit From e29fb7afeb391fab1b991600221eb8784e08f82f Mon Sep 17 00:00:00 2001 From: Deependu Date: Mon, 22 Sep 2025 07:07:55 +0000 Subject: [PATCH 14/14] update torch.load to include weights_only parameter in deepspeed utility --- src/lightning/pytorch/utilities/deepspeed.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lightning/pytorch/utilities/deepspeed.py b/src/lightning/pytorch/utilities/deepspeed.py index 619e22cac9401..20b418437c681 100644 --- a/src/lightning/pytorch/utilities/deepspeed.py +++ b/src/lightning/pytorch/utilities/deepspeed.py @@ -93,10 +93,10 @@ def convert_zero_checkpoint_to_fp32_state_dict( ] checkpoint_dir = ds_checkpoint_dir(checkpoint_dir) optim_files = get_optim_files(checkpoint_dir) - optim_state = torch.load(optim_files[0], map_location=CPU_DEVICE) + optim_state = torch.load(optim_files[0], map_location=CPU_DEVICE, weights_only=False) zero_stage = optim_state["optimizer_state_dict"]["zero_stage"] model_file = get_model_state_file(checkpoint_dir, zero_stage) - client_state = torch.load(model_file, map_location=CPU_DEVICE) + client_state = torch.load(model_file, map_location=CPU_DEVICE, weights_only=False) client_state = {key: value for key, value in client_state.items() if key not in deepspeed_states} # State dict keys will include reference to wrapper _LightningModuleWrapperBase in old checkpoints created in # Lightning version < 2.1. Delete the `_forward_module` prefix before saving.