diff --git a/.actions/assistant.py b/.actions/assistant.py index 0109a988d5692..e54e69e4860e7 100644 --- a/.actions/assistant.py +++ b/.actions/assistant.py @@ -341,33 +341,6 @@ def create_mirror_package(source_dir: str, package_mapping: dict[str, str]) -> N class AssistantCLI: - @staticmethod - def requirements_prune_pkgs(packages: Sequence[str], req_files: Sequence[str] = REQUIREMENT_FILES_ALL) -> None: - """Remove some packages from given requirement files.""" - if isinstance(req_files, str): - req_files = [req_files] - for req in req_files: - AssistantCLI._prune_packages(req, packages) - - @staticmethod - def _prune_packages(req_file: str, packages: Sequence[str]) -> None: - """Remove some packages from given requirement files.""" - path = Path(req_file) - assert path.exists() - text = path.read_text() - lines = text.splitlines() - final = [] - for line in lines: - ln_ = line.strip() - if not ln_ or ln_.startswith("#"): - final.append(line) - continue - req = list(_parse_requirements([ln_]))[0] - if req.name not in packages: - final.append(line) - print(final) - path.write_text("\n".join(final) + "\n") - @staticmethod def copy_replace_imports( source_dir: str, diff --git a/.azure/gpu-tests-fabric.yml b/.azure/gpu-tests-fabric.yml index f506aa2008df9..b2f8ab0447a20 100644 --- a/.azure/gpu-tests-fabric.yml +++ b/.azure/gpu-tests-fabric.yml @@ -99,10 +99,16 @@ jobs: displayName: "Image info & NVIDIA" - bash: | - cd requirements/fabric + set -ex + pip install "cython<3.0" wheel # for compatibility pip install -U "lightning-utilities[cli]" + cd requirements/fabric + # replace range by pin minimal requirements python -m lightning_utilities.cli requirements set-oldest --req_files "['base.txt', 'strategies.txt']" - pip install "cython<3.0" wheel # for compatibility + # drop deepspeed since it is not supported by our minimal Torch requirements + python -m lightning_utilities.cli requirements prune-pkgs --packages deepspeed --req_files strategies.txt + # uninstall deepspeed since some older docker images have it pre-installed + pip uninstall -y deepspeed condition: contains(variables['Agent.JobName'], 'oldest') displayName: "setting oldest dependencies" diff --git a/.azure/gpu-tests-pytorch.yml b/.azure/gpu-tests-pytorch.yml index 68e99f2f6285a..d3c4951a22336 100644 --- a/.azure/gpu-tests-pytorch.yml +++ b/.azure/gpu-tests-pytorch.yml @@ -103,10 +103,16 @@ jobs: displayName: "Image info & NVIDIA" - bash: | - cd requirements/pytorch + set -ex + pip install "cython<3.0" wheel # for compatibility pip install -U "lightning-utilities[cli]" + cd requirements/pytorch + # replace range by pin minimal requirements python -m lightning_utilities.cli requirements set-oldest --req_files "['base.txt', 'extra.txt', 'strategies.txt', 'examples.txt']" - pip install "cython<3.0" wheel # for compatibility + # drop deepspeed since it is not supported by our minimal Torch requirements + python -m lightning_utilities.cli requirements prune-pkgs --packages deepspeed --req_files strategies.txt + # uninstall deepspeed since some older docker images have it pre-installed + pip uninstall -y deepspeed condition: contains(variables['Agent.JobName'], 'oldest') displayName: "setting oldest dependencies" diff --git a/dockers/release/Dockerfile b/dockers/release/Dockerfile index c7a7a1e0c8470..b80c23dfc73f3 100644 --- a/dockers/release/Dockerfile +++ b/dockers/release/Dockerfile @@ -21,6 +21,7 @@ FROM pytorchlightning/pytorch_lightning:base-cuda${CUDA_VERSION}-py${PYTHON_VERS LABEL maintainer="Lightning-AI " ARG LIGHTNING_VERSION="" +ARG PYTORCH_VERSION COPY ./ /home/pytorch-lightning/ @@ -39,7 +40,14 @@ RUN \ fi && \ # otherwise there is collision with folder name and pkg name on Pypi cd pytorch-lightning && \ - pip install setuptools==75.6.0 && \ + # pip install setuptools==75.6.0 && \ + pip install -U "lightning-utilities[cli]" && \ + # drop deepspeed since it is not supported by our minimal Torch requirements \ + echo "PYTORCH_VERSION is: '$PYTORCH_VERSION'" && \ + if [[ "$PYTORCH_VERSION" =~ ^(2\.1|2\.2|2\.3|2\.4)$ ]]; then \ + python -m lightning_utilities.cli requirements prune-pkgs --packages deepspeed --req_files requirements/fabric/strategies.txt ; \ + python -m lightning_utilities.cli requirements prune-pkgs --packages deepspeed --req_files requirements/pytorch/strategies.txt ; \ + fi && \ PACKAGE_NAME=lightning pip install '.[extra,loggers,strategies]' --no-cache-dir && \ PACKAGE_NAME=pytorch pip install '.[extra,loggers,strategies]' --no-cache-dir && \ cd .. && \ diff --git a/pyproject.toml b/pyproject.toml index a63da5f246392..b4d5d0b1638f5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -179,6 +179,7 @@ markers = [ "cloud: Run the cloud tests for example", ] filterwarnings = [ + # "error::DeprecationWarning", "error::FutureWarning", "ignore::FutureWarning:onnxscript", # Temporary ignore until onnxscript is updated ] diff --git a/requirements/fabric/strategies.txt b/requirements/fabric/strategies.txt index bea30b37fa5f8..7856db1df2eec 100644 --- a/requirements/fabric/strategies.txt +++ b/requirements/fabric/strategies.txt @@ -5,5 +5,5 @@ # note: is a bug around 0.10 with `MPS_Accelerator must implement all abstract methods` # shall be resolved by https://github.com/microsoft/DeepSpeed/issues/4372 -deepspeed >=0.9.3, <=0.9.3; platform_system != "Windows" and platform_system != "Darwin" # strict +deepspeed >=0.14.1,<=0.15.0; platform_system != "Windows" and platform_system != "Darwin" # strict bitsandbytes >=0.45.2,<0.47.0; platform_system != "Darwin" diff --git a/requirements/pytorch/strategies.txt b/requirements/pytorch/strategies.txt index 1f7296798b551..89392d6006d38 100644 --- a/requirements/pytorch/strategies.txt +++ b/requirements/pytorch/strategies.txt @@ -3,4 +3,4 @@ # note: is a bug around 0.10 with `MPS_Accelerator must implement all abstract methods` # shall be resolved by https://github.com/microsoft/DeepSpeed/issues/4372 -deepspeed >=0.9.3, <=0.9.3; platform_system != "Windows" and platform_system != "Darwin" # strict +deepspeed >=0.14.1,<=0.15.0; platform_system != "Windows" and platform_system != "Darwin" # strict diff --git a/src/lightning/fabric/strategies/deepspeed.py b/src/lightning/fabric/strategies/deepspeed.py index 48333455240cf..c11ae8589d1ff 100644 --- a/src/lightning/fabric/strategies/deepspeed.py +++ b/src/lightning/fabric/strategies/deepspeed.py @@ -47,7 +47,6 @@ from torch.optim.lr_scheduler import _LRScheduler _DEEPSPEED_AVAILABLE = RequirementCache("deepspeed") -_DEEPSPEED_GREATER_EQUAL_0_14_1 = RequirementCache("deepspeed>=0.14.1") # TODO(fabric): Links in the docstrings to PL-specific deepspeed user docs need to be replaced. @@ -503,10 +502,7 @@ def load_checkpoint( ) engine = engines[0] - if _DEEPSPEED_GREATER_EQUAL_0_14_1: - from deepspeed.runtime.base_optimizer import DeepSpeedOptimizer - else: - from deepspeed.runtime import DeepSpeedOptimizer + from deepspeed.runtime.base_optimizer import DeepSpeedOptimizer optimzer_state_requested = any(isinstance(item, (Optimizer, DeepSpeedOptimizer)) for item in state.values()) diff --git a/tests/tests_fabric/strategies/launchers/test_multiprocessing_integration.py b/tests/tests_fabric/strategies/launchers/test_multiprocessing_integration.py index 2eaf1d23572c8..85688ef8fb489 100644 --- a/tests/tests_fabric/strategies/launchers/test_multiprocessing_integration.py +++ b/tests/tests_fabric/strategies/launchers/test_multiprocessing_integration.py @@ -30,6 +30,7 @@ def __init__(self): @RunIf(skip_windows=True) +@pytest.mark.flaky(reruns=3) @pytest.mark.parametrize("strategy", ["ddp_spawn", "ddp_fork"]) def test_memory_sharing_disabled(strategy): """Test that the multiprocessing launcher disables memory sharing on model parameters and buffers to avoid race diff --git a/tests/tests_pytorch/utilities/test_compile.py b/tests/tests_pytorch/utilities/test_compile.py index a053c847dfd6c..f90cd5e3ef3fa 100644 --- a/tests/tests_pytorch/utilities/test_compile.py +++ b/tests/tests_pytorch/utilities/test_compile.py @@ -32,7 +32,7 @@ # https://github.com/pytorch/pytorch/issues/95708 @pytest.mark.skipif(sys.platform == "darwin", reason="fatal error: 'omp.h' file not found") -@RunIf(dynamo=True) +@RunIf(dynamo=True, deepspeed=True) @mock.patch("lightning.pytorch.trainer.call._call_and_handle_interrupt") def test_trainer_compiled_model(_, tmp_path, monkeypatch, mps_count_0): trainer_kwargs = {