diff --git a/.actions/assistant.py b/.actions/assistant.py index 7b2d49423d622..e54e69e4860e7 100644 --- a/.actions/assistant.py +++ b/.actions/assistant.py @@ -341,47 +341,6 @@ def create_mirror_package(source_dir: str, package_mapping: dict[str, str]) -> N class AssistantCLI: - @staticmethod - def requirements_prune_pkgs(packages: Sequence[str], req_files: Sequence[str] = REQUIREMENT_FILES_ALL) -> None: - """Remove some packages from given requirement files.""" - if isinstance(req_files, str): - req_files = [req_files] - for req in req_files: - AssistantCLI._prune_packages(req, packages) - - @staticmethod - def _prune_packages(req_file: str, packages: Sequence[str]) -> None: - """Remove some packages from given requirement files.""" - path = Path(req_file) - assert path.exists() - text = path.read_text() - lines = text.splitlines() - final = [] - for line in lines: - ln_ = line.strip() - if not ln_ or ln_.startswith("#"): - final.append(line) - continue - req = list(_parse_requirements([ln_]))[0] - if req.name not in packages: - final.append(line) - print(final) - path.write_text("\n".join(final) + "\n") - - @staticmethod - def _replace_min(fname: str) -> None: - with open(fname, encoding="utf-8") as fopen: - req = fopen.read().replace(">=", "==") - with open(fname, "w", encoding="utf-8") as fwrite: - fwrite.write(req) - - @staticmethod - def replace_oldest_ver(requirement_fnames: Sequence[str] = REQUIREMENT_FILES_ALL) -> None: - """Replace the min package version by fixed one.""" - for fname in requirement_fnames: - print(fname) - AssistantCLI._replace_min(fname) - @staticmethod def copy_replace_imports( source_dir: str, diff --git a/.azure/gpu-benchmarks.yml b/.azure/gpu-benchmarks.yml index 045c0cd45ccb9..e4d4da32d9e5d 100644 --- a/.azure/gpu-benchmarks.yml +++ b/.azure/gpu-benchmarks.yml @@ -46,7 +46,7 @@ jobs: variables: DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' ) container: - image: 
"pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.5-cuda12.1.0" + image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.8" options: "--gpus=all --shm-size=32g" strategy: matrix: diff --git a/.azure/gpu-tests-fabric.yml b/.azure/gpu-tests-fabric.yml index c584f5bcbd3a2..b2f8ab0447a20 100644 --- a/.azure/gpu-tests-fabric.yml +++ b/.azure/gpu-tests-fabric.yml @@ -57,16 +57,16 @@ jobs: strategy: matrix: "Fabric | oldest": - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.1-cuda12.1.1" + image: "pytorchlightning/pytorch_lightning:base-cuda12.1.1-py3.10-torch2.1" PACKAGE_NAME: "fabric" "Fabric | latest": - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.8-cuda12.6.3" + image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.8" PACKAGE_NAME: "fabric" #"Fabric | future": - # image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.7-cuda12.6.3" + # image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.7" # PACKAGE_NAME: "fabric" "Lightning | latest": - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.8-cuda12.6.3" + image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.8" PACKAGE_NAME: "lightning" workspace: clean: all @@ -99,8 +99,16 @@ jobs: displayName: "Image info & NVIDIA" - bash: | - python .actions/assistant.py replace_oldest_ver + set -ex pip install "cython<3.0" wheel # for compatibility + pip install -U "lightning-utilities[cli]" + cd requirements/fabric + # replace range by pin minimal requirements + python -m lightning_utilities.cli requirements set-oldest --req_files "['base.txt', 'strategies.txt']" + # drop deepspeed since it is not supported by our minimal Torch requirements + python -m lightning_utilities.cli requirements prune-pkgs --packages deepspeed --req_files strategies.txt + # uninstall deepspeed since some older docker images have it pre-installed + pip uninstall -y deepspeed condition: 
contains(variables['Agent.JobName'], 'oldest') displayName: "setting oldest dependencies" diff --git a/.azure/gpu-tests-pytorch.yml b/.azure/gpu-tests-pytorch.yml index 16ac6beb34841..d3c4951a22336 100644 --- a/.azure/gpu-tests-pytorch.yml +++ b/.azure/gpu-tests-pytorch.yml @@ -50,16 +50,16 @@ jobs: strategy: matrix: "PyTorch | oldest": - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.1-cuda12.1.1" + image: "pytorchlightning/pytorch_lightning:base-cuda12.1.1-py3.10-torch2.1" PACKAGE_NAME: "pytorch" "PyTorch | latest": - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.8-cuda12.6.3" + image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.8" PACKAGE_NAME: "pytorch" #"PyTorch | future": - # image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.7-cuda12.6.3" + # image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.7" # PACKAGE_NAME: "pytorch" "Lightning | latest": - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.8-cuda12.6.3" + image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.8" PACKAGE_NAME: "lightning" pool: lit-rtx-3090 variables: @@ -103,8 +103,16 @@ jobs: displayName: "Image info & NVIDIA" - bash: | - python .actions/assistant.py replace_oldest_ver + set -ex pip install "cython<3.0" wheel # for compatibility + pip install -U "lightning-utilities[cli]" + cd requirements/pytorch + # replace range by pin minimal requirements + python -m lightning_utilities.cli requirements set-oldest --req_files "['base.txt', 'extra.txt', 'strategies.txt', 'examples.txt']" + # drop deepspeed since it is not supported by our minimal Torch requirements + python -m lightning_utilities.cli requirements prune-pkgs --packages deepspeed --req_files strategies.txt + # uninstall deepspeed since some older docker images have it pre-installed + pip uninstall -y deepspeed condition: contains(variables['Agent.JobName'], 'oldest') displayName: "setting oldest 
dependencies" diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml index 78695257e2884..99375a2c48bce 100644 --- a/.github/checkgroup.yml +++ b/.github/checkgroup.yml @@ -48,6 +48,7 @@ subprojects: - "!**/*.md" checks: - "pytorch-lightning (GPUs) (testing Lightning | latest)" + - "pytorch-lightning (GPUs) (testing PyTorch | oldest)" - "pytorch-lightning (GPUs) (testing PyTorch | latest)" - id: "pytorch_lightning: Benchmarks" @@ -174,6 +175,7 @@ subprojects: - "!*.md" - "!**/*.md" checks: + - "lightning-fabric (GPUs) (testing Fabric | oldest)" - "lightning-fabric (GPUs) (testing Fabric | latest)" - "lightning-fabric (GPUs) (testing Lightning | latest)" diff --git a/.github/workflows/_build-packages.yml b/.github/workflows/_build-packages.yml index 78035470059d1..9a4a7813a7f56 100644 --- a/.github/workflows/_build-packages.yml +++ b/.github/workflows/_build-packages.yml @@ -25,7 +25,7 @@ jobs: matrix: pkg-name: ${{ fromJSON(inputs.pkg-names) }} steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 - uses: actions/setup-python@v5 with: python-version: "3.x" diff --git a/.github/workflows/_legacy-checkpoints.yml b/.github/workflows/_legacy-checkpoints.yml index 03fadc1247f16..8cceb5870af8d 100644 --- a/.github/workflows/_legacy-checkpoints.yml +++ b/.github/workflows/_legacy-checkpoints.yml @@ -55,7 +55,7 @@ jobs: outputs: pl-version: ${{ steps.decide-version.outputs.pl-version }} steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 - uses: actions/setup-python@v5 with: @@ -135,7 +135,7 @@ jobs: env: PL_VERSION: ${{ needs.create-legacy-ckpts.outputs.pl-version }} steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 with: ref: master diff --git a/.github/workflows/ci-pkg-install.yml b/.github/workflows/ci-pkg-install.yml index 7a7f4bfdcf955..accec3a2339a1 100644 --- a/.github/workflows/ci-pkg-install.yml +++ b/.github/workflows/ci-pkg-install.yml @@ -46,7 +46,7 @@ jobs: pkg-name: ["fabric", "pytorch", "lightning", 
"notset"] python-version: ["3.9", "3.11"] steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 - uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} diff --git a/.github/workflows/ci-tests-fabric.yml b/.github/workflows/ci-tests-fabric.yml index c8b6d1e71a910..f4c66f425cc71 100644 --- a/.github/workflows/ci-tests-fabric.yml +++ b/.github/workflows/ci-tests-fabric.yml @@ -81,7 +81,7 @@ jobs: # TODO: Remove this - Enable running MPS tests on this platform DISABLE_MPS: ${{ matrix.os == 'macOS-14' && '1' || '0' }} steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v5 @@ -94,7 +94,9 @@ jobs: - name: Set min. dependencies if: ${{ matrix.requires == 'oldest' }} run: | - python .actions/assistant.py replace_oldest_ver + cd requirements/fabric + pip install -U "lightning-utilities[cli]" + python -m lightning_utilities.cli requirements set-oldest --req_files "['base.txt', 'strategies.txt', 'test.txt']" pip install "cython<3.0" wheel pip install "pyyaml==5.4" --no-build-isolation @@ -140,7 +142,8 @@ jobs: run: | pip install -e ".[${EXTRA_PREFIX}test,${EXTRA_PREFIX}strategies]" \ -U --upgrade-strategy=eager --prefer-binary \ - --extra-index-url="${TORCH_URL}" --find-links="${PYPI_CACHE_DIR}" + --extra-index-url="${TORCH_URL}" \ + --find-links="${PYPI_CACHE_DIR}" pip list - name: Dump handy wheels if: github.event_name == 'push' && github.ref == 'refs/heads/master' diff --git a/.github/workflows/ci-tests-pytorch.yml b/.github/workflows/ci-tests-pytorch.yml index 72a966812397e..e5e9ddd23c06e 100644 --- a/.github/workflows/ci-tests-pytorch.yml +++ b/.github/workflows/ci-tests-pytorch.yml @@ -86,7 +86,7 @@ jobs: # TODO: Remove this - Enable running MPS tests on this platform DISABLE_MPS: ${{ matrix.os == 'macOS-14' && '1' || '0' }} steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 - name: Set up Python ${{ matrix.python-version 
}} uses: actions/setup-python@v5 @@ -99,7 +99,9 @@ jobs: - name: Set min. dependencies if: ${{ matrix.requires == 'oldest' }} run: | - python .actions/assistant.py replace_oldest_ver + cd requirements/pytorch + pip install -U "lightning-utilities[cli]" + python -m lightning_utilities.cli requirements set-oldest --req_files "['base.txt', 'extra.txt', 'strategies.txt', 'examples.txt', 'test.txt']" pip install "cython<3.0" wheel pip install "pyyaml==5.4" --no-build-isolation @@ -139,7 +141,8 @@ jobs: pip install ".[${EXTRA_PREFIX}extra,${EXTRA_PREFIX}test,${EXTRA_PREFIX}strategies]" \ -U --upgrade-strategy=eager --prefer-binary \ -r requirements/_integrations/accelerators.txt \ - --extra-index-url="${TORCH_URL}" --find-links="${PYPI_CACHE_DIR}" + --extra-index-url="${TORCH_URL}" \ + --find-links="${PYPI_CACHE_DIR}" pip list - name: Drop LAI from extensions if: ${{ matrix.pkg-name != 'lightning' }} diff --git a/.github/workflows/cleanup-caches.yml b/.github/workflows/cleanup-caches.yml index 93bce48fcc43f..d9ddd90ad1cce 100644 --- a/.github/workflows/cleanup-caches.yml +++ b/.github/workflows/cleanup-caches.yml @@ -9,7 +9,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Check out code - uses: actions/checkout@v4 + uses: actions/checkout@v5 - name: Cleanup run: | diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index 527b605e12366..79707977ffc3f 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -28,7 +28,7 @@ jobs: mypy: runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 - uses: actions/setup-python@v5 with: python-version: "3.11" diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml index 93ff401f60f5f..34c6d40898213 100644 --- a/.github/workflows/docker-build.yml +++ b/.github/workflows/docker-build.yml @@ -52,7 +52,7 @@ jobs: - { python_version: "3.12", pytorch_version: "2.7", cuda_version: "12.6.3" } - { python_version: 
"3.12", pytorch_version: "2.8", cuda_version: "12.6.3", latest: "true" } steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 with: submodules: true - uses: docker/setup-buildx-action@v3 @@ -112,7 +112,7 @@ jobs: - { python_version: "3.12", pytorch_version: "2.7.1", cuda_version: "12.6.3" } - { python_version: "3.12", pytorch_version: "2.8.0", cuda_version: "12.6.3" } steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 - uses: docker/setup-buildx-action@v3 - uses: docker/login-action@v3 if: env.PUSH_NIGHTLY == 'true' && github.repository_owner == 'Lightning-AI' @@ -154,7 +154,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout - uses: actions/checkout@v4 + uses: actions/checkout@v5 - name: Build Conda Docker # publish master/release continue-on-error: true diff --git a/.github/workflows/docs-build.yml b/.github/workflows/docs-build.yml index 8f6deeb189773..69d35e605db5b 100644 --- a/.github/workflows/docs-build.yml +++ b/.github/workflows/docs-build.yml @@ -64,7 +64,7 @@ jobs: PIN_RELEASE_VERSIONS: 1 ARTIFACT_DAYS: 0 steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 with: ref: ${{ inputs.checkout }} # only Pytorch has/uses notebooks diff --git a/.github/workflows/docs-tutorials.yml b/.github/workflows/docs-tutorials.yml index e6e1f755484fd..8f2fded0b99b9 100644 --- a/.github/workflows/docs-tutorials.yml +++ b/.github/workflows/docs-tutorials.yml @@ -18,7 +18,7 @@ jobs: docs-update: runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 with: submodules: true fetch-depth: 0 diff --git a/.github/workflows/labeler-issue.yml b/.github/workflows/labeler-issue.yml index 00905bdadd656..afcd2737512a2 100644 --- a/.github/workflows/labeler-issue.yml +++ b/.github/workflows/labeler-issue.yml @@ -19,7 +19,7 @@ jobs: contents: read steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 - name: Parse issue form uses: stefanbuck/github-issue-parser@v3 diff --git 
a/.github/workflows/release-nightly.yml b/.github/workflows/release-nightly.yml index 24d1a07f9abbc..3dfc4872fcf15 100644 --- a/.github/workflows/release-nightly.yml +++ b/.github/workflows/release-nightly.yml @@ -23,7 +23,7 @@ jobs: env: PKG_NAME: "lightning" steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 - uses: actions/setup-python@v5 with: python-version: 3.9 @@ -53,7 +53,7 @@ jobs: env: PKG_NAME: "lightning" steps: - - uses: actions/checkout@v4 # needed to use local composite action + - uses: actions/checkout@v5 # needed to use local composite action - uses: actions/download-artifact@v5 with: name: nightly-packages-${{ github.sha }} diff --git a/.github/workflows/release-pkg.yml b/.github/workflows/release-pkg.yml index fa2a499f4abe2..f50adbc8fbac8 100644 --- a/.github/workflows/release-pkg.yml +++ b/.github/workflows/release-pkg.yml @@ -37,7 +37,7 @@ jobs: needs: build-packages if: github.event_name == 'release' steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 - uses: actions/download-artifact@v5 with: name: dist-packages-${{ github.sha }} @@ -54,7 +54,7 @@ jobs: outputs: tag: ${{ steps.lai-package.outputs.version }} steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 - uses: actions/setup-python@v5 with: python-version: ${{ env.PYTHON_VER }} @@ -74,7 +74,7 @@ jobs: TAG: ${{ needs.release-version.outputs.tag }} BRANCH_NAME: "trigger/lightning-${{ needs.release-version.outputs.tag }}" steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 with: repository: gridai/base-images token: ${{ secrets.PAT_GHOST }} @@ -139,7 +139,7 @@ jobs: matrix: name: ["FABRIC", "PYTORCH", "LIGHTNING"] steps: - - uses: actions/checkout@v4 # needed for local action below + - uses: actions/checkout@v5 # needed for local action below - uses: actions/download-artifact@v5 with: name: dist-packages-${{ github.sha }} @@ -164,7 +164,7 @@ jobs: matrix: name: ["FABRIC", "PYTORCH", "LIGHTNING"] steps: - - uses: 
actions/checkout@v4 # needed for local action below + - uses: actions/checkout@v5 # needed for local action below - uses: actions/download-artifact@v5 with: name: dist-packages-${{ github.sha }} diff --git a/.lightning/workflows/fabric.yml b/.lightning/workflows/fabric.yml index edaf0837fe79e..438f56ef7fe94 100644 --- a/.lightning/workflows/fabric.yml +++ b/.lightning/workflows/fabric.yml @@ -10,13 +10,13 @@ parametrize: matrix: {} include: # note that this is setting also all oldest requirements which is linked to Torch == 2.0 - - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.1-cuda12.1.1" + - image: "pytorchlightning/pytorch_lightning:base-cuda12.1.1-py3.10-torch2.1" PACKAGE_NAME: "fabric" - - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.7-cuda12.6.3" + - image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.7" PACKAGE_NAME: "fabric" - # - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.7-cuda12.6.3" + # - image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.7" # PACKAGE_NAME: "fabric" - - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.7-cuda12.6.3" + - image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.7" PACKAGE_NAME: "lightning" exclude: [] @@ -43,7 +43,10 @@ run: | if [ "${TORCH_VER}" == "2.1" ]; then echo "Set oldest versions" - python .actions/assistant.py replace_oldest_ver + cd requirements/fabric + pip install -U "lightning-utilities[cli]" + python -m lightning_utilities.cli requirements set-oldest --req_files "['base.txt', 'strategies.txt']" + cd ../.. 
pip install "cython<3.0" wheel # for compatibility fi diff --git a/.lightning/workflows/pytorch.yml b/.lightning/workflows/pytorch.yml index 81063c3699769..5c92bf881d969 100644 --- a/.lightning/workflows/pytorch.yml +++ b/.lightning/workflows/pytorch.yml @@ -10,13 +10,13 @@ parametrize: matrix: {} include: # note that this is setting also all oldest requirements which is linked to Torch == 2.0 - - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.1-cuda12.1.1" + - image: "pytorchlightning/pytorch_lightning:base-cuda12.1.1-py3.10-torch2.1" PACKAGE_NAME: "pytorch" - - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.7-cuda12.6.3" + - image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.7" PACKAGE_NAME: "pytorch" - # - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.7-cuda12.6.3" + # - image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.7" # PACKAGE_NAME: "pytorch" - - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.7-cuda12.6.3" + - image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.7" PACKAGE_NAME: "lightning" exclude: [] @@ -43,7 +43,10 @@ run: | if [ "${TORCH_VER}" == "2.1" ]; then recho "Set oldest versions" - python .actions/assistant.py replace_oldest_ver + cd requirements/pytorch + pip install -U "lightning-utilities[cli]" + python -m lightning_utilities.cli requirements set-oldest --req_files "['base.txt', 'extra.txt', 'strategies.txt', 'examples.txt']" + cd ../.. pip install "cython<3.0" wheel # for compatibility fi diff --git a/dockers/README.md b/dockers/README.md index ad69199a6483e..cff8bf542f95b 100644 --- a/dockers/README.md +++ b/dockers/README.md @@ -11,7 +11,13 @@ git clone https://github.com/Lightning-AI/lightning.git docker image build -t pytorch-lightning:latest -f dockers/base-cuda/Dockerfile . 
# build with specific arguments -docker image build -t pytorch-lightning:base-cuda-py3.9-torch1.13-cuda11.7.1 -f dockers/base-cuda/Dockerfile --build-arg PYTHON_VERSION=3.9 --build-arg PYTORCH_VERSION=1.13 --build-arg CUDA_VERSION=11.7.1 . +docker image build \ + -t pytorch-lightning:base-cuda12.6.3-py3.10-torch2.8 \ + -f dockers/base-cuda/Dockerfile \ + --build-arg PYTHON_VERSION=3.10 \ + --build-arg PYTORCH_VERSION=2.8 \ + --build-arg CUDA_VERSION=12.6.3 \ + . ``` To run your docker use @@ -45,18 +51,18 @@ sudo systemctl restart docker and later run the docker image with `--gpus all`. For example, ``` -docker run --rm -it --gpus all pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.13-cuda11.7.1 +docker run --rm -it --gpus all pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.10-torch2.8 ``` ## Run Jupyter server 1. Build the docker image: ```bash - docker image build -t pytorch-lightning:v1.6.5 -f dockers/nvidia/Dockerfile --build-arg LIGHTNING_VERSION=1.6.5 . + docker image build -t pytorch-lightning:v2.5.1 -f dockers/nvidia/Dockerfile --build-arg LIGHTNING_VERSION=2.5.1 . ``` 1. start the server and map ports: ```bash - docker run --rm -it --gpus=all -p 8888:8888 pytorch-lightning:v1.6.5 + docker run --rm -it --gpus=all -p 8888:8888 pytorch-lightning:v2.5.1 ``` 1. Connect in local browser: - copy the generated path e.g. 
`http://hostname:8888/?token=0719fa7e1729778b0cec363541a608d5003e26d4910983c6` diff --git a/dockers/release/Dockerfile b/dockers/release/Dockerfile index 34ff8099afdb1..b80c23dfc73f3 100644 --- a/dockers/release/Dockerfile +++ b/dockers/release/Dockerfile @@ -16,11 +16,12 @@ ARG PYTHON_VERSION=3.10 ARG PYTORCH_VERSION=2.8 ARG CUDA_VERSION=12.6.3 -FROM pytorchlightning/pytorch_lightning:base-cuda-py${PYTHON_VERSION}-torch${PYTORCH_VERSION}-cuda${CUDA_VERSION} +FROM pytorchlightning/pytorch_lightning:base-cuda${CUDA_VERSION}-py${PYTHON_VERSION}-torch${PYTORCH_VERSION} LABEL maintainer="Lightning-AI " ARG LIGHTNING_VERSION="" +ARG PYTORCH_VERSION COPY ./ /home/pytorch-lightning/ @@ -39,7 +40,14 @@ RUN \ fi && \ # otherwise there is collision with folder name and pkg name on Pypi cd pytorch-lightning && \ - pip install setuptools==75.6.0 && \ + # pip install setuptools==75.6.0 && \ + pip install -U "lightning-utilities[cli]" && \ + # drop deepspeed since it is not supported by our minimal Torch requirements \ + echo "PYTORCH_VERSION is: '$PYTORCH_VERSION'" && \ + if [[ "$PYTORCH_VERSION" =~ ^(2\.1|2\.2|2\.3|2\.4)$ ]]; then \ + python -m lightning_utilities.cli requirements prune-pkgs --packages deepspeed --req_files requirements/fabric/strategies.txt ; \ + python -m lightning_utilities.cli requirements prune-pkgs --packages deepspeed --req_files requirements/pytorch/strategies.txt ; \ + fi && \ PACKAGE_NAME=lightning pip install '.[extra,loggers,strategies]' --no-cache-dir && \ PACKAGE_NAME=pytorch pip install '.[extra,loggers,strategies]' --no-cache-dir && \ cd .. 
&& \ diff --git a/docs/source-fabric/advanced/compile.rst b/docs/source-fabric/advanced/compile.rst index cc8fe39c683d4..bbcfc5240c159 100644 --- a/docs/source-fabric/advanced/compile.rst +++ b/docs/source-fabric/advanced/compile.rst @@ -417,7 +417,7 @@ Additional Resources Here are a few resources for further reading after you complete this tutorial: -- `PyTorch 2.0 Paper `_ +- `PyTorch 2.0 Paper `_ - `GenAI with PyTorch 2.0 blog post series `_ - `Training Production AI Models with PyTorch 2.0 `_ - `Empowering Models with Performance: The Art of Generalized Model Transformation Approach `_ diff --git a/docs/source-pytorch/accelerators/gpu_intermediate.rst b/docs/source-pytorch/accelerators/gpu_intermediate.rst index e5dcd151375b2..d70ebbc0823aa 100644 --- a/docs/source-pytorch/accelerators/gpu_intermediate.rst +++ b/docs/source-pytorch/accelerators/gpu_intermediate.rst @@ -59,7 +59,7 @@ variables: MASTER_ADDR=localhost MASTER_PORT=random() WORLD_SIZE=3 NODE_RANK=0 LOCAL_RANK=1 python my_file.py --accelerator 'gpu' --devices 3 --etc MASTER_ADDR=localhost MASTER_PORT=random() WORLD_SIZE=3 NODE_RANK=0 LOCAL_RANK=2 python my_file.py --accelerator 'gpu' --devices 3 --etc -Using DDP this way has a few disadvantages over ``torch.multiprocessing.spawn()``: +Using DDP this way has a few advantages over ``torch.multiprocessing.spawn()``: 1. All processes (including the main process) participate in training and have the updated state of the model and Trainer state. 2. 
No multiprocessing pickle errors diff --git a/docs/source-pytorch/advanced/compile.rst b/docs/source-pytorch/advanced/compile.rst index dba611c79a475..ac557362cf611 100644 --- a/docs/source-pytorch/advanced/compile.rst +++ b/docs/source-pytorch/advanced/compile.rst @@ -396,7 +396,7 @@ Additional Resources Here are a few resources for further reading after you complete this tutorial: -- `PyTorch 2.0 Paper `_ +- `PyTorch 2.0 Paper `_ - `GenAI with PyTorch 2.0 blog post series `_ - `Training Production AI Models with PyTorch 2.0 `_ - `Empowering Models with Performance: The Art of Generalized Model Transformation Approach `_ diff --git a/docs/source-pytorch/conf.py b/docs/source-pytorch/conf.py index 62cd21fc127f4..35f7aa6ee646c 100644 --- a/docs/source-pytorch/conf.py +++ b/docs/source-pytorch/conf.py @@ -127,7 +127,7 @@ def _load_py_module(name: str, location: str) -> ModuleType: "https://pytorch.org/docs/stable/", "https://pytorch.org/docs/{torch.__version__}/", _PATH_ROOT ) adjust_linked_external_docs( - "https://lightning.ai/docs/torchmetrics", "https://lightning.ai/docs/torchmetrics/v{torchmetrics.__version__}/", _PATH_ROOT, version_digits=3 + "https://lightning.ai/docs/torchmetrics/stable/", "https://lightning.ai/docs/torchmetrics/v{torchmetrics.__version__}/", _PATH_ROOT, version_digits=3 ) adjust_linked_external_docs( "https://lightning.ai/docs/fabric/stable/", "https://lightning.ai/docs/fabric/{lightning_fabric.__version__}/", _PATH_ROOT, version_digits=3 diff --git a/docs/source-pytorch/extensions/logging.rst b/docs/source-pytorch/extensions/logging.rst index f0c12464e6db2..6514716d1b826 100644 --- a/docs/source-pytorch/extensions/logging.rst +++ b/docs/source-pytorch/extensions/logging.rst @@ -120,6 +120,10 @@ methods to log from anywhere in a :doc:`LightningModule <../common/lightning_mod .. 
note:: Everything explained below applies to both :meth:`~lightning.pytorch.core.LightningModule.log` or :meth:`~lightning.pytorch.core.LightningModule.log_dict` methods. +.. note:: + + When using TorchMetrics with Lightning, we recommend referring to the `TorchMetrics Lightning integration documentation `_ for logging best practices, common pitfalls, and proper usage patterns. + Depending on where the :meth:`~lightning.pytorch.core.LightningModule.log` method is called, Lightning auto-determines the correct logging mode for you. Of course you can override the default behavior by manually setting the :meth:`~lightning.pytorch.core.LightningModule.log` parameters. diff --git a/docs/source-pytorch/model/manual_optimization.rst b/docs/source-pytorch/model/manual_optimization.rst index 4c7400c0457ca..8cf68d1397633 100644 --- a/docs/source-pytorch/model/manual_optimization.rst +++ b/docs/source-pytorch/model/manual_optimization.rst @@ -204,7 +204,6 @@ Here is an example training a simple GAN with multiple optimizers using manual o d_opt = torch.optim.Adam(self.D.parameters(), lr=1e-5) return g_opt, d_opt - Learning Rate Scheduling ======================== @@ -230,6 +229,10 @@ Here is an example calling ``lr_scheduler.step()`` every step. 
super().__init__() self.automatic_optimization = False + def configure_optimizers(self): + optimizer = torch.optim.Adam(self.parameters(), lr=1e-3) + scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1) + return [optimizer], [scheduler] def training_step(self, batch, batch_idx): # do forward, backward, and optimization @@ -252,6 +255,11 @@ If you want to call ``lr_scheduler.step()`` every ``N`` steps/epochs, do the fol super().__init__() self.automatic_optimization = False + def configure_optimizers(self): + optimizer = torch.optim.Adam(self.parameters(), lr=1e-3) + scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1) + return [optimizer], [scheduler] + def training_step(self, batch, batch_idx): # do forward, backward, and optimization @@ -275,13 +283,22 @@ If you want to call schedulers that require a metric value after each epoch, con super().__init__() self.automatic_optimization = False + def configure_optimizers(self): + optimizer = torch.optim.Adam(self.parameters(), lr=1e-3) + scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=10) + return [optimizer], [scheduler] def on_train_epoch_end(self): sch = self.lr_schedulers() - # If the selected scheduler is a ReduceLROnPlateau scheduler. - if isinstance(sch, torch.optim.lr_scheduler.ReduceLROnPlateau): - sch.step(self.trainer.callback_metrics["loss"]) + sch.step(self.trainer.callback_metrics["loss"]) + +.. note:: + :meth:`~lightning.pytorch.core.LightningModule.configure_optimizers` supports 6 different ways to define and return + optimizers and learning rate schedulers. Regardless of the way you define them, `self.optimizers()` will always return + either a single optimizer if you defined a single optimizer, or a list of optimizers if you defined multiple + optimizers. 
The same applies to the `self.lr_schedulers()` method, which will return a single scheduler + if you defined a single scheduler, or a list of schedulers if you defined multiple schedulers. Optimizer Steps at Different Frequencies diff --git a/docs/source-pytorch/versioning.rst b/docs/source-pytorch/versioning.rst index 948986d5699ed..4a04bd1534de9 100644 --- a/docs/source-pytorch/versioning.rst +++ b/docs/source-pytorch/versioning.rst @@ -53,12 +53,8 @@ API Evolution Lightning's development is driven by research and best practices in a rapidly developing field of AI and machine learning. Change is inevitable and when it happens, the Lightning team is committed to minimizing user friction and maximizing ease of transition from one version to the next. We take backwards compatibility and reproducibility very seriously. -For API removal, renaming or other forms of backwards-incompatible changes, the procedure is: - -#. A deprecation process is initiated at a minor version ``MAJOR.MINOR.PATCH`` (e.g. ``1.5.0``), producing a deprecation warning at runtime and removing it from the documentation. -#. The deprecated API remains unchanged during the deprecation phase for two minor versions or the next major update, whichever comes first. -#. The breaking change is done in version ``MAJOR.(MINOR+2).0`` (e.g. ``1.7.0``), or ``(MAJOR+1).0.0`` (e.g. ``2.0.0``), whichever comes first. -#. From that version onward, the deprecation warning gets converted into a helpful error, which will remain until next major release. +Excepting extenuating circumstances (e.g. a critical bug), API removal, renaming or other forms of backwards-incompatible changes are limited to major version upgrades — that is ``(MAJOR+1).0.0``. +Concretely, a breaking change for an API introduced in ``2.x.x`` can be introduced with Lightning ``3.0.0``. This policy is not strict. Shorter or longer deprecation cycles may apply to some cases.
For example, in the past DDP2 was removed without a deprecation process because the feature was broken and unusable beyond fixing as discussed in `#12584 `_. @@ -69,6 +65,7 @@ Compatibility matrix PyTorch Lightning follows `NEP 29 `_ which PyTorch also follows (`#74203 `_). The table below indicates the coverage of tested versions in our CI. Versions outside the ranges may unofficially work in some cases. +Since the release of PyTorch `2.0`, Lightning strives to officially support the latest 5 PyTorch minor releases with no breaking changes within major versions [1]_. .. list-table:: :header-rows: 1 @@ -82,102 +79,104 @@ The table below indicates the coverage of tested versions in our CI. Versions ou * - 2.5 - 2.5 - 2.5 - - ≥2.1, ≤2.7 + - ≥2.1, (last tested 2.8) - ≥0.7.0 - - ≥3.9, ≤3.12 + - ≥3.9, (last tested 3.12) * - 2.4 - 2.4 - 2.4 - - ≥2.1, ≤2.6 + - ≥2.1, (last tested 2.6) - ≥0.7.0 - - ≥3.9, ≤3.12 + - ≥3.9, (last tested 3.12) * - 2.3 - 2.3 - 2.3 - - ≥2.0, ≤2.3 + - ≥2.0, (last tested 2.3) - ≥0.7.0 - - ≥3.8, ≤3.11 + - ≥3.8, (last tested 3.11) * - 2.2 - 2.2 - 2.2 - - ≥1.13, ≤2.2 + - ≥1.13, (last tested 2.2) - ≥0.7.0 - - ≥3.8, ≤3.11 + - ≥3.8, (last tested 3.11) * - 2.1 - 2.1 - 2.1 - - ≥1.12, ≤2.1 + - ≥1.12, (last tested 2.1) - ≥0.7.0 - - ≥3.8, ≤3.11 + - ≥3.8, (last tested 3.11) * - 2.0 - 2.0 - 2.0 (GA) - - ≥1.11, ≤2.0 + - ≥1.11, (last tested 2.0) - ≥0.7.0 - - ≥3.8, ≤3.10 + - ≥3.8, (last tested 3.10) * - 1.9 - 1.9 - 1.9 (experimental) - - ≥1.10, ≤1.13 + - ≥1.10, (last tested 1.13) - ≥0.7.0 - - ≥3.7, ≤3.10 + - ≥3.7, (last tested 3.10) * - 1.8** - 1.8 - n/a*** - - ≥1.10, ≤1.13 + - ≥1.10, (last tested 1.13) - ≥0.7.0 - - ≥3.7, ≤3.10 + - ≥3.7, (last tested 3.10) * - n/a - 1.7 - n/a*** - - ≥1.9, ≤1.12 + - ≥1.9, (last tested 1.12) - ≥0.7.0 - - ≥3.7, ≤3.10 + - ≥3.7, (last tested 3.10) * - n/a - 1.6 - n/a*** - - ≥1.8, ≤1.11 + - ≥1.8, (last tested 1.11) - ≥0.4.1 - - ≥3.7, ≤3.9 + - ≥3.7, (last tested 3.9) * - n/a - 1.5 - n/a*** - - ≥1.7, ≤1.10 + - ≥1.7, (last tested 1.10) - 
≥0.4.1 - - ≥3.6, ≤3.9 + - ≥3.6, (last tested 3.9) * - n/a - 1.4 - n/a - - ≥1.6, ≤1.9 + - ≥1.6, (last tested 1.9) - ≥0.4.0 - - ≥3.6, ≤3.9 + - ≥3.6, (last tested 3.9) * - n/a - 1.3 - n/a - - ≥1.4, ≤1.8 + - ≥1.4, (last tested 1.8) - ≥0.2.0 - - ≥3.6, ≤3.9 + - ≥3.6, (last tested 3.9) * - n/a - 1.2 - n/a - - ≥1.4, ≤1.8 + - ≥1.4, (last tested 1.8) - n/a* - - ≥3.6, ≤3.8 + - ≥3.6, (last tested 3.8) * - n/a - 1.1 - n/a - - ≥1.3, ≤1.8 + - ≥1.3, (last tested 1.8) - n/a* - - ≥3.6, ≤3.8 + - ≥3.6, (last tested 3.8) * - n/a - 1.0 - n/a - - ≥1.3, ≤1.7 + - ≥1.3, (last tested 1.7) - n/a* - - ≥3.6, ≤3.8 + - ≥3.6, (last tested 3.8) \* ``torchmetrics`` was part of ``pytorch_lightning`` at the time and was decoupled to a separate package in v1.3. \*\* The joint ``lightning`` package was first published in version 1.8 \*\*\* Fabric is the evolution of ``LightningLite`` which was released inside ``pytorch_lightning`` 1.5 and was decoupled to a separate package in v1.9 + +.. [1] See `this community discussion `_. 
diff --git a/pyproject.toml b/pyproject.toml index a63da5f246392..b4d5d0b1638f5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -179,6 +179,7 @@ markers = [ "cloud: Run the cloud tests for example", ] filterwarnings = [ + # "error::DeprecationWarning", "error::FutureWarning", "ignore::FutureWarning:onnxscript", # Temporary ignore until onnxscript is updated ] diff --git a/requirements/docs.txt b/requirements/docs.txt index 9fa72085df6b5..29d8934b41bd1 100644 --- a/requirements/docs.txt +++ b/requirements/docs.txt @@ -1,5 +1,5 @@ sphinx >5.0, <6.0 -myst-parser >=0.18.1, <4.0.0 +myst-parser >=0.18.1, <5.0.0 nbsphinx >=0.8.5, <=0.9.7 nbconvert >7.14, <7.17 pandoc >=1.0, <=2.4 diff --git a/requirements/fabric/strategies.txt b/requirements/fabric/strategies.txt index bea30b37fa5f8..7856db1df2eec 100644 --- a/requirements/fabric/strategies.txt +++ b/requirements/fabric/strategies.txt @@ -5,5 +5,5 @@ # note: is a bug around 0.10 with `MPS_Accelerator must implement all abstract methods` # shall be resolved by https://github.com/microsoft/DeepSpeed/issues/4372 -deepspeed >=0.9.3, <=0.9.3; platform_system != "Windows" and platform_system != "Darwin" # strict +deepspeed >=0.14.1,<=0.15.0; platform_system != "Windows" and platform_system != "Darwin" # strict bitsandbytes >=0.45.2,<0.47.0; platform_system != "Darwin" diff --git a/requirements/fabric/test.txt b/requirements/fabric/test.txt index d8884253eab80..a9b4271cac2c3 100644 --- a/requirements/fabric/test.txt +++ b/requirements/fabric/test.txt @@ -1,9 +1,10 @@ -coverage ==7.10.3 -numpy >=1.17.2, <1.27.0 +coverage ==7.10.5 +numpy >=1.21.0, <1.27.0 pytest ==8.4.1 pytest-cov ==6.2.1 pytest-timeout ==2.4.0 pytest-rerunfailures ==15.1 pytest-random-order ==1.2.0 -click ==8.1.8 -tensorboardX >=2.2, <2.7.0 # min version is set by torch.onnx missing attribute +click ==8.1.8; python_version < "3.11" +click ==8.2.1; python_version > "3.10" +tensorboardX >=2.6, <2.7.0 # todo: relax it back to `>=2.2` after fixing tests diff 
--git a/requirements/pytorch/docs.txt b/requirements/pytorch/docs.txt index 35cc6234ae5d2..a3e2e88967f75 100644 --- a/requirements/pytorch/docs.txt +++ b/requirements/pytorch/docs.txt @@ -1,9 +1,9 @@ -r ../docs.txt nbformat # used for generate empty notebook -ipython[notebook] <8.19.0 +ipython[notebook] <9.5.0 setuptools<81.0 # workaround for `error in ipython setup command: use_2to3 is invalid.` -onnxscript >= 0.2.2, <0.4.0 +onnxscript >= 0.2.2, < 0.5.0 #-r ../../_notebooks/.actions/requires.txt diff --git a/requirements/pytorch/extra.txt b/requirements/pytorch/extra.txt index ab3a36f7dad3b..9bb3dee0080a0 100644 --- a/requirements/pytorch/extra.txt +++ b/requirements/pytorch/extra.txt @@ -2,7 +2,7 @@ # in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment # extended list of package dependencies to reach full functionality -matplotlib>3.1, <3.10.0 +matplotlib>3.1, <3.11.0 omegaconf >=2.2.3, <2.4.0 hydra-core >=1.2.0, <1.4.0 jsonargparse[signatures,jsonnet] >=4.39.0, <4.41.0 diff --git a/requirements/pytorch/loggers.info b/requirements/pytorch/loggers.info index 94ff89ff6c62f..59d7768cc7aad 100644 --- a/requirements/pytorch/loggers.info +++ b/requirements/pytorch/loggers.info @@ -4,4 +4,4 @@ neptune >=1.0.0 comet-ml >=3.31.0 mlflow >=1.0.0 wandb >=0.12.10 -tensorboard >=2.9.1 +tensorboard >=2.11 diff --git a/requirements/pytorch/strategies.txt b/requirements/pytorch/strategies.txt index 1f7296798b551..89392d6006d38 100644 --- a/requirements/pytorch/strategies.txt +++ b/requirements/pytorch/strategies.txt @@ -3,4 +3,4 @@ # note: is a bug around 0.10 with `MPS_Accelerator must implement all abstract methods` # shall be resolved by https://github.com/microsoft/DeepSpeed/issues/4372 -deepspeed >=0.9.3, <=0.9.3; platform_system != "Windows" and platform_system != "Darwin" # strict +deepspeed >=0.14.1,<=0.15.0; platform_system != "Windows" and platform_system != "Darwin" # strict diff --git 
a/requirements/pytorch/test.txt b/requirements/pytorch/test.txt index a1fbdec222c7f..1fd3ec790055f 100644 --- a/requirements/pytorch/test.txt +++ b/requirements/pytorch/test.txt @@ -1,4 +1,4 @@ -coverage ==7.10.3 +coverage ==7.10.5 pytest ==8.4.1 pytest-cov ==6.2.1 pytest-timeout ==2.4.0 @@ -7,14 +7,14 @@ pytest-random-order ==1.2.0 # needed in tests cloudpickle >=1.3, <3.2.0 -scikit-learn >0.22.1, <1.7.0 -numpy >=1.17.2, <1.27.0 -onnx >=1.12.0, <1.19.0 -onnxruntime >=1.12.0, <1.21.0 -onnxscript >= 0.2.2, <0.4.0 +scikit-learn >0.22.1, <1.8.0 +numpy >1.20.0, <1.27.0 +onnx >1.12.0, <1.19.0 +onnxruntime >=1.12.0, <1.23.0 +onnxscript >= 0.1.0, < 0.5.0 psutil <7.0.1 # for `DeviceStatsMonitor` pandas >2.0, <2.4.0 # needed in benchmarks fastapi # for `ServableModuleValidator` # not setting version as re-defined in App uvicorn # for `ServableModuleValidator` # not setting version as re-defined in App -tensorboard >=2.9.1, <2.21.0 # for `TensorBoardLogger` +tensorboard >=2.11, <2.21.0 # for `TensorBoardLogger` diff --git a/requirements/typing.txt b/requirements/typing.txt index 7e0c34e2ac3fa..e24382a1f06c6 100644 --- a/requirements/typing.txt +++ b/requirements/typing.txt @@ -1,5 +1,5 @@ mypy==1.17.1 -torch==2.7.1 # todo: update typing in separate PR +torch==2.8.0 types-Markdown types-PyYAML diff --git a/src/lightning/fabric/CHANGELOG.md b/src/lightning/fabric/CHANGELOG.md index 0659105560bba..b1102cdce06b7 100644 --- a/src/lightning/fabric/CHANGELOG.md +++ b/src/lightning/fabric/CHANGELOG.md @@ -6,7 +6,14 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
--- -## [2.5.3] - 2025-08-DD +## [2.5.4] - 2025-08-29 + +### Changed + +- Added support for NVIDIA H200 GPUs in `get_available_flops` ([#20913](https://github.com/Lightning-AI/pytorch-lightning/pull/21119)) + + +## [2.5.3] - 2025-08-13 ### Changed diff --git a/src/lightning/fabric/plugins/collectives/torch_collective.py b/src/lightning/fabric/plugins/collectives/torch_collective.py index 883380bb881aa..182e75f4583ef 100644 --- a/src/lightning/fabric/plugins/collectives/torch_collective.py +++ b/src/lightning/fabric/plugins/collectives/torch_collective.py @@ -24,6 +24,8 @@ class TorchCollective(Collective): """ manages_default_group = False + addr_key = "MASTER_ADDR" + port_key = "MASTER_PORT" def __init__(self) -> None: if not dist.is_available(): @@ -136,26 +138,21 @@ def setup(self, main_address: Optional[str] = None, main_port: Optional[str] = N if self.is_initialized(): return self # maybe set addr - set_addr = False - addr_key = "MASTER_ADDR" - if main_address is not None and addr_key not in os.environ: - os.environ[addr_key] = main_address - set_addr = True + setting_env = [] + if main_address is not None and self.addr_key not in os.environ: + os.environ[self.addr_key] = main_address + setting_env.append(self.addr_key) # maybe set port - set_port = False - port_key = "MASTER_PORT" - if main_port is not None and port_key not in os.environ: - os.environ[port_key] = str(main_port) - set_port = True + if main_port is not None and self.port_key not in os.environ: + os.environ[self.port_key] = str(main_port) + setting_env.append(self.port_key) # this will `init_group` super().setup(**kwargs) # set as a class attribute so any instance can know whether we initialized the default process group TorchCollective.manages_default_group = True # cleanup - if set_addr: - os.environ.pop("MASTER_ADDR", None) - if set_port: - os.environ.pop("MASTER_PORT", None) + for kenv in setting_env: + os.environ.pop(kenv, None) return self @override diff --git 
a/src/lightning/fabric/strategies/deepspeed.py b/src/lightning/fabric/strategies/deepspeed.py index 41820c1cc433f..322fa1899b0ee 100644 --- a/src/lightning/fabric/strategies/deepspeed.py +++ b/src/lightning/fabric/strategies/deepspeed.py @@ -46,7 +46,6 @@ from deepspeed import DeepSpeedEngine _DEEPSPEED_AVAILABLE = RequirementCache("deepspeed") -_DEEPSPEED_GREATER_EQUAL_0_14_1 = RequirementCache("deepspeed>=0.14.1") # TODO(fabric): Links in the docstrings to PL-specific deepspeed user docs need to be replaced. @@ -503,10 +502,7 @@ def load_checkpoint( ) engine = engines[0] - if _DEEPSPEED_GREATER_EQUAL_0_14_1: - from deepspeed.runtime.base_optimizer import DeepSpeedOptimizer - else: - from deepspeed.runtime import DeepSpeedOptimizer + from deepspeed.runtime.base_optimizer import DeepSpeedOptimizer optimzer_state_requested = any(isinstance(item, (Optimizer, DeepSpeedOptimizer)) for item in state.values()) diff --git a/src/lightning/fabric/utilities/throughput.py b/src/lightning/fabric/utilities/throughput.py index 72b33a41f168c..6bc329fa1c3be 100644 --- a/src/lightning/fabric/utilities/throughput.py +++ b/src/lightning/fabric/utilities/throughput.py @@ -304,6 +304,23 @@ def measure_flops( _CUDA_FLOPS: dict[str, dict[Union[str, torch.dtype], float]] = { # Hopper + # source: https://nvdam.widen.net/s/nb5zzzsjdf/hpc-datasheet-sc23-h200-datasheet-3002446 + "h200 sxm1": { + torch.float64: 3.4e13, + torch.float32: 6.7e13, + "tfloat32": 9.9e14, + torch.bfloat16: 2.0e15, + torch.float16: 2.0e15, + torch.int8: 4.0e15, + }, + "h200 nvl1": { + torch.float64: 3.0e13, + torch.float32: 6.0e13, + "tfloat32": 8.4e14, + torch.bfloat16: 1.7e15, + torch.float16: 1.7e15, + torch.int8: 3.3e15, + }, # source: https://resources.nvidia.com/en-us-tensor-core "h100 nvl": { torch.float64: 67e12, @@ -536,7 +553,12 @@ def get_available_flops(device: torch.device, dtype: Union[torch.dtype, str]) -> if device.type == "cuda": device_name = torch.cuda.get_device_name(device) chip = 
device_name.lower() - if "h100" in chip: + if "h200" in chip: + if "sxm1" in chip: + chip = "h200 sxm1" + elif "nvl1" in chip: + chip = "h200 nvl1" + elif "h100" in chip: if "hbm3" in chip: chip = "h100 sxm" elif "nvl" in chip: diff --git a/src/lightning/pytorch/CHANGELOG.md b/src/lightning/pytorch/CHANGELOG.md index 6f5d7acbaed43..01b64c38051b3 100644 --- a/src/lightning/pytorch/CHANGELOG.md +++ b/src/lightning/pytorch/CHANGELOG.md @@ -6,7 +6,18 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). --- -## [2.5.3] - 2025-08-DD +## [2.5.4] - 2025-08-29 + +### Fixed + +- Fixed `AsyncCheckpointIO` to snapshot tensors, avoiding a race with parameter mutation ([#21079](https://github.com/Lightning-AI/pytorch-lightning/pull/21079)) +- Fixed `AsyncCheckpointIO` threadpool exception if calling fit or validate more than once ([#20952](https://github.com/Lightning-AI/pytorch-lightning/pull/20952)) +- Fixed learning rate not being correctly set after using `LearningRateFinder` callback ([#21068](https://github.com/Lightning-AI/pytorch-lightning/pull/21068)) +- Fixed column misalignment while using rich model summary in `DeepSpeedStrategy` ([#21100](https://github.com/Lightning-AI/pytorch-lightning/pull/21100)) +- Fixed `RichProgressBar` crashing when sanity checking using val dataloader with 0 len ([#21108](https://github.com/Lightning-AI/pytorch-lightning/pull/21108)) + + +## [2.5.3] - 2025-08-13 ### Changed @@ -57,7 +68,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Allow LightningCLI to use a customized argument parser class ([#20596](https://github.com/Lightning-AI/pytorch-lightning/pull/20596)) - Change `wandb` default x-axis to `tensorboard`'s `global_step` when `sync_tensorboard=True` ([#20611](https://github.com/Lightning-AI/pytorch-lightning/pull/20611)) -- Added a new `checkpoint_path_prefix` parameter to the MLflow logger which can control the path to where the MLflow artifacts for the model checkpoints are stored ([#20538](https://github.com/Lightning-AI/pytorch-lightning/pull/20538)) - CometML logger was updated to support the recent Comet SDK ([#20275](https://github.com/Lightning-AI/pytorch-lightning/pull/20275)) - bump: testing with latest `torch` 2.6 ([#20509](https://github.com/Lightning-AI/pytorch-lightning/pull/20509)) diff --git a/src/lightning/pytorch/callbacks/progress/rich_progress.py b/src/lightning/pytorch/callbacks/progress/rich_progress.py index 644497cbb632f..603dc2364ef51 100644 --- a/src/lightning/pytorch/callbacks/progress/rich_progress.py +++ b/src/lightning/pytorch/callbacks/progress/rich_progress.py @@ -387,8 +387,7 @@ def on_sanity_check_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningM @override def on_sanity_check_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: - if self.progress is not None: - assert self.val_sanity_progress_bar_id is not None + if self.progress is not None and self.val_sanity_progress_bar_id is not None: self.progress.update(self.val_sanity_progress_bar_id, advance=0, visible=False) self.refresh() diff --git a/src/lightning/pytorch/callbacks/rich_model_summary.py b/src/lightning/pytorch/callbacks/rich_model_summary.py index e4027f0dedcb1..f1edb185705a4 100644 --- a/src/lightning/pytorch/callbacks/rich_model_summary.py +++ b/src/lightning/pytorch/callbacks/rich_model_summary.py @@ -78,6 +78,7 @@ def summarize( from rich.table import Table console = get_console() + column_names = list(zip(*summary_data))[0] header_style: str = 
summarize_kwargs.get("header_style", "bold magenta") table = Table(header_style=header_style) @@ -85,9 +86,11 @@ def summarize( table.add_column("Name", justify="left", no_wrap=True) table.add_column("Type") table.add_column("Params", justify="right") - table.add_column("Mode") - column_names = list(zip(*summary_data))[0] + if "Params per Device" in column_names: + table.add_column("Params per Device", justify="right") + + table.add_column("Mode") for column_name in ["In sizes", "Out sizes"]: if column_name in column_names: diff --git a/src/lightning/pytorch/plugins/io/async_plugin.py b/src/lightning/pytorch/plugins/io/async_plugin.py index 67c02189c541e..5cff35074992e 100644 --- a/src/lightning/pytorch/plugins/io/async_plugin.py +++ b/src/lightning/pytorch/plugins/io/async_plugin.py @@ -13,13 +13,17 @@ # limitations under the License. from concurrent.futures import ThreadPoolExecutor -from typing import Any, Optional +from typing import TYPE_CHECKING, Any, Optional +import torch +from lightning_utilities.core.apply_func import apply_to_collection from typing_extensions import override -from lightning.fabric.plugins import CheckpointIO from lightning.pytorch.plugins.io.wrapper import _WrappingCheckpointIO +if TYPE_CHECKING: + from lightning.fabric.plugins import CheckpointIO + class AsyncCheckpointIO(_WrappingCheckpointIO): """``AsyncCheckpointIO`` enables saving the checkpoints asynchronously in a thread. @@ -31,16 +35,37 @@ class AsyncCheckpointIO(_WrappingCheckpointIO): """ + _executor: Optional[ThreadPoolExecutor] + _error: Optional[BaseException] + def __init__(self, checkpoint_io: Optional["CheckpointIO"] = None) -> None: super().__init__(checkpoint_io) + self._executor = None + self._error = None + + # CheckpointIO doesn't have a setup method, so we create the executor lazily instead. + def _ensure_setup(self) -> None: + """Ensures that the executor is set up. 
+ + We can't do setup in __init__ because if train or validate is called more than once, the teardown method deletes + the executor. - self._executor = ThreadPoolExecutor(max_workers=1) - self._error: Optional[BaseException] = None + """ + if self._executor is None: + self._executor = ThreadPoolExecutor(max_workers=1) @override def save_checkpoint(self, *args: Any, **kwargs: Any) -> None: """Uses the ``ThreadPoolExecutor`` to save the checkpoints using the base ``checkpoint_io``.""" + self._ensure_setup() + + # rebuild args/kwargs with a cloned checkpoint (supports positional or kw form) + if "checkpoint" in kwargs: + kwargs = {**kwargs, "checkpoint": apply_to_collection(kwargs["checkpoint"], torch.Tensor, _clone_tensor)} + elif len(args) >= 1: + args = (apply_to_collection(args[0], torch.Tensor, _clone_tensor), *args[1:]) + def _save_checkpoint(*args: Any, **kwargs: Any) -> None: try: assert self.checkpoint_io is not None @@ -48,6 +73,7 @@ def _save_checkpoint(*args: Any, **kwargs: Any) -> None: except BaseException as ex: self._error = ex + assert self._executor is not None self._executor.submit(_save_checkpoint, *args, **kwargs) # if an error was raised between the previous time `save_checkpoint`` was called and now, @@ -58,8 +84,17 @@ def _save_checkpoint(*args: Any, **kwargs: Any) -> None: @override def teardown(self) -> None: """This method is called to close the threads.""" - self._executor.shutdown(wait=True) + if self._executor is not None: + self._executor.shutdown(wait=True) + self._executor = None # if an error was raised anytime in any of the `executor.submit` calls if self._error: raise self._error + + +# snapshot the checkpoint payload on the caller thread to avoid races with parameter mutation +def _clone_tensor(t: torch.Tensor) -> torch.Tensor: + """Clones a tensor on the caller thread.""" + # detach to avoid autograd history and clone to take a point-in-time copy + return t.detach().clone() diff --git 
a/src/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py b/src/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py index 09addf5a5a58c..f6e8885ee050a 100644 --- a/src/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py +++ b/src/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py @@ -93,8 +93,9 @@ def log_metrics(self, metrics: _OUT_DICT, step: Optional[int] = None) -> None: Args: metrics: Metric values - step: Step for which metrics should be logged. Default value is `self.global_step` during training or - the total validation / test log step count during validation and testing. + step: Step for which metrics should be logged. If a `step` metric is logged, its value will + be used; otherwise it defaults to `self.global_step` during training or the total log step count + during validation and testing. """ if not self.trainer.loggers or not metrics: diff --git a/src/lightning/pytorch/trainer/trainer.py b/src/lightning/pytorch/trainer/trainer.py index 8f6d3245a5faf..b78843990af30 100644 --- a/src/lightning/pytorch/trainer/trainer.py +++ b/src/lightning/pytorch/trainer/trainer.py @@ -526,7 +526,7 @@ def fit( datamodule: A :class:`~lightning.pytorch.core.datamodule.LightningDataModule` that defines the :class:`~lightning.pytorch.core.hooks.DataHooks.train_dataloader` hook. - ckpt_path: Path/URL of the checkpoint from which training is resumed. Could also be one of two special + ckpt_path: Path/URL of the checkpoint from which training is resumed. Could also be one of three special keywords ``"last"``, ``"hpc"`` and ``"registry"``. Otherwise, if there is no checkpoint file at the path, an exception is raised. 
@@ -535,12 +535,11 @@ def fit( - registry: the model will be downloaded from the Lightning Model Registry with following notations: - ``'registry'``: uses the latest/default version of default model set - with ``Tainer(..., model_registry="my-model")`` + with ``Trainer(..., model_registry="my-model")`` - ``'registry:model-name'``: uses the latest/default version of this model `model-name` - ``'registry:model-name:version:v2'``: uses the specific version 'v2' of the model `model-name` - ``'registry:version:v2'``: uses the default model set - with ``Tainer(..., model_registry="my-model")`` and version 'v2' - + with ``Trainer(..., model_registry="my-model")`` and version 'v2' Raises: TypeError: diff --git a/src/lightning/pytorch/tuner/lr_finder.py b/src/lightning/pytorch/tuner/lr_finder.py index a5d758f7fff19..b4b61d5cf0f93 100644 --- a/src/lightning/pytorch/tuner/lr_finder.py +++ b/src/lightning/pytorch/tuner/lr_finder.py @@ -276,17 +276,10 @@ def _lr_find( if trainer.progress_bar_callback: trainer.progress_bar_callback.enable() - # Update lr attr if required + # Update results across ranks lr_finder.results = trainer.strategy.broadcast(lr_finder.results) - if update_attr: - lr = lr_finder.suggestion() - - # TODO: log lr.results to self.logger - if lr is not None: - lightning_setattr(model, attr_name, lr) - log.info(f"Learning rate set to {lr}") - # Restore initial state of model + # Restore initial state of model (this will also restore the original optimizer state) trainer._checkpoint_connector.restore(ckpt_path) trainer.strategy.remove_checkpoint(ckpt_path) trainer.fit_loop.restarting = False # reset restarting flag as checkpoint restoring sets it to True @@ -294,6 +287,19 @@ def _lr_find( trainer.fit_loop.epoch_loop.val_loop._combined_loader = None trainer.fit_loop._combined_loader = None # reset data fetcher to avoid issues with the next fit trainer.fit_loop.setup_data() + + # Apply LR suggestion after restoring so it persists for the real training run + # When 
used as a callback, the suggestion would otherwise be lost due to checkpoint restore + if update_attr: + lr = lr_finder.suggestion() + if lr is not None: + # update the attribute on the LightningModule (e.g., lr or learning_rate) + lightning_setattr(model, attr_name, lr) + # also update the currently active optimizer(s) so training continues with the suggested LR + for opt in trainer.optimizers or []: + for pg in opt.param_groups: + pg["lr"] = lr + log.info(f"Learning rate set to {lr}") return lr_finder diff --git a/src/lightning/pytorch/utilities/model_summary/model_summary_deepspeed.py b/src/lightning/pytorch/utilities/model_summary/model_summary_deepspeed.py index 5038aebf0db79..3b6e022cd86b3 100644 --- a/src/lightning/pytorch/utilities/model_summary/model_summary_deepspeed.py +++ b/src/lightning/pytorch/utilities/model_summary/model_summary_deepspeed.py @@ -99,6 +99,7 @@ def _get_summary_data(self) -> list[tuple[str, list[str]]]: ("Params", list(map(get_human_readable_count, self.param_nums))), ("Params per Device", list(map(get_human_readable_count, self.parameters_per_layer))), ("Mode", ["train" if mode else "eval" for mode in self.training_modes]), + ("FLOPs", list(map(get_human_readable_count, (sum(x.values()) for x in self.flop_counts.values())))), ] if self._model.example_input_array is not None: arrays.append(("In sizes", [str(x) for x in self.in_sizes])) diff --git a/src/version.info b/src/version.info index aedc15bb0c6e2..fe16b348d97f7 100644 --- a/src/version.info +++ b/src/version.info @@ -1 +1 @@ -2.5.3 +2.5.4 diff --git a/tests/legacy/back-compatible-versions.txt b/tests/legacy/back-compatible-versions.txt index 091d993cdd725..9032e5c13cc54 100644 --- a/tests/legacy/back-compatible-versions.txt +++ b/tests/legacy/back-compatible-versions.txt @@ -106,3 +106,4 @@ 2.3.3 2.5.1 2.5.2 +2.5.3 diff --git a/tests/tests_fabric/strategies/launchers/test_multiprocessing_integration.py 
b/tests/tests_fabric/strategies/launchers/test_multiprocessing_integration.py index 2abfe73c92dec..85688ef8fb489 100644 --- a/tests/tests_fabric/strategies/launchers/test_multiprocessing_integration.py +++ b/tests/tests_fabric/strategies/launchers/test_multiprocessing_integration.py @@ -30,6 +30,7 @@ def __init__(self): @RunIf(skip_windows=True) +@pytest.mark.flaky(reruns=3) @pytest.mark.parametrize("strategy", ["ddp_spawn", "ddp_fork"]) def test_memory_sharing_disabled(strategy): """Test that the multiprocessing launcher disables memory sharing on model parameters and buffers to avoid race @@ -46,7 +47,8 @@ def test_memory_sharing_disabled(strategy): def _test_memory_sharing_disabled(fabric, tensor, model): is_spawn = fabric.strategy.launcher._start_method == "spawn" - assert not is_spawn or tensor.is_shared() + if is_spawn: + assert tensor.is_shared() assert not model.layer.weight.is_shared() assert not model.tied_layer.weight.is_shared() assert not model.buffer.is_shared() diff --git a/tests/tests_fabric/utilities/test_throughput.py b/tests/tests_fabric/utilities/test_throughput.py index 00dafbb72cb8f..a175fa97fd444 100644 --- a/tests/tests_fabric/utilities/test_throughput.py +++ b/tests/tests_fabric/utilities/test_throughput.py @@ -68,6 +68,8 @@ def test_get_available_flops(xla_available): "device_name", [ # Hopper + "NVIDIA H200 SXM1", + "NVIDIA H200 NVL1", "h100-nvl", # TODO: switch with `torch.cuda.get_device_name()` result "h100-hbm3", # TODO: switch with `torch.cuda.get_device_name()` result "NVIDIA H100 PCIe", diff --git a/tests/tests_pytorch/callbacks/progress/test_rich_progress_bar.py b/tests/tests_pytorch/callbacks/progress/test_rich_progress_bar.py index 639414a797aa0..9d74871ce84e4 100644 --- a/tests/tests_pytorch/callbacks/progress/test_rich_progress_bar.py +++ b/tests/tests_pytorch/callbacks/progress/test_rich_progress_bar.py @@ -577,3 +577,31 @@ def test_rich_progress_bar_metrics_theme_update(*_): theme = 
RichProgressBar(theme=RichProgressBarTheme(metrics_format=".3e", metrics_text_delimiter="\n")).theme assert theme.metrics_format == ".3e" assert theme.metrics_text_delimiter == "\n" + + +@RunIf(rich=True) +def test_rich_progress_bar_empty_val_dataloader_model(tmp_path): + """Test that RichProgressBar doesn't crash with empty val_dataloader list from model.""" + + class EmptyListModel(BoringModel): + def train_dataloader(self): + return DataLoader(RandomDataset(32, 64), batch_size=2) + + def val_dataloader(self): + return [] + + model = EmptyListModel() + progress_bar = RichProgressBar() + + trainer = Trainer( + default_root_dir=tmp_path, + max_epochs=1, + num_sanity_val_steps=1, + callbacks=[progress_bar], + limit_train_batches=2, + enable_checkpointing=False, + logger=False, + ) + + # This should not raise an AssertionError + trainer.fit(model) diff --git a/tests/tests_pytorch/plugins/test_async_checkpoint.py b/tests/tests_pytorch/plugins/test_async_checkpoint.py new file mode 100644 index 0000000000000..0718dab78d75f --- /dev/null +++ b/tests/tests_pytorch/plugins/test_async_checkpoint.py @@ -0,0 +1,53 @@ +import time +from typing import Any, Optional + +import pytest +import torch + +from lightning.fabric.plugins.io.checkpoint_io import CheckpointIO +from lightning.pytorch.plugins.io.async_plugin import AsyncCheckpointIO + + +class _CaptureCheckpointIO(CheckpointIO): + def __init__(self) -> None: + self.saved: Optional[dict[str, Any]] = None + + def save_checkpoint(self, checkpoint: dict[str, Any], path: str, storage_options: Optional[Any] = None) -> None: + # Simulate some delay to increase race window + time.sleep(0.05) + # Store the received checkpoint object (not a deep copy) to inspect tensor values + self.saved = checkpoint + + def load_checkpoint(self, path: str, map_location: Optional[Any] = None) -> dict[str, Any]: + raise NotImplementedError + + def remove_checkpoint(self, path: str) -> None: + pass + + 
+@pytest.mark.filterwarnings("ignore::DeprecationWarning") +def test_async_checkpoint_should_snapshot_values_before_mutation(): + base = _CaptureCheckpointIO() + async_io = AsyncCheckpointIO(checkpoint_io=base) + + # a tensor that we will mutate after scheduling the save + t = torch.tensor([0.0]) + ckpt = {"w": t} + + # schedule async save + async_io.save_checkpoint(ckpt, path="unused") + + # mutate immediately afterward to mimic training thread stepping params + t.add_(1.0) + + # ensure background thread finished + async_io.teardown() + + assert base.saved is not None, "Async save did not run" + + # EXPECTATION: AsyncCheckpointIO should have captured value 0.0 (pre-mutation) + # Without the snapshot it would capture 1.0, because the dict holds a reference to the mutated tensor + assert torch.allclose(base.saved["w"], torch.tensor([0.0])), ( + "AsyncCheckpointIO must snapshot the checkpoint (clone tensors) on the main thread " + "to avoid races with parameter mutation; got mutated value instead" + ) diff --git a/tests/tests_pytorch/plugins/test_checkpoint_io_plugin.py b/tests/tests_pytorch/plugins/test_checkpoint_io_plugin.py index 0f62eeae69ef8..f7a76079cfca2 100644 --- a/tests/tests_pytorch/plugins/test_checkpoint_io_plugin.py +++ b/tests/tests_pytorch/plugins/test_checkpoint_io_plugin.py @@ -127,6 +127,10 @@ def on_fit_start(self): enable_progress_bar=False, enable_model_summary=False, ) + + # We add a validate step to test that async works when fit or validate is called multiple times. 
+ trainer.validate(model) + trainer.fit(model) assert checkpoint_plugin.save_checkpoint.call_count == 3 diff --git a/tests/tests_pytorch/tuner/test_lr_finder.py b/tests/tests_pytorch/tuner/test_lr_finder.py index e2d1b6bd4ee84..69575a351b0a5 100644 --- a/tests/tests_pytorch/tuner/test_lr_finder.py +++ b/tests/tests_pytorch/tuner/test_lr_finder.py @@ -619,6 +619,78 @@ def test_gradient_correctness(): assert abs(suggestion - math.pi) < 1e-2, "Suggestion should be close to pi for this synthetic example" +def test_lr_finder_callback_applies_lr_after_restore(tmp_path): + """LearningRateFinder used as a callback should apply its suggested LR to the optimizer used after state + restoration.""" + + import torch.nn as nn + import torch.nn.functional as F + from torch.utils.data import DataLoader, Dataset + + from lightning.pytorch.callbacks import LearningRateMonitor + + class RandomDataset(Dataset): + def __init__(self, n: int = 256, in_dim: int = 28 * 28): + self.x = torch.randn(n, in_dim) + self.y = torch.randn(n, in_dim) + + def __len__(self) -> int: + return len(self.x) + + def __getitem__(self, idx): + return self.x[idx], self.y[idx] + + class TinyAE(BoringModel): + def __init__(self, lr: float = 1e-5): + super().__init__() + self.save_hyperparameters() + self.encoder = nn.Sequential(nn.Linear(28 * 28, 128), nn.ReLU(), nn.Linear(128, 3)) + self.decoder = nn.Sequential(nn.Linear(3, 128), nn.ReLU(), nn.Linear(128, 28 * 28)) + + def training_step(self, batch: Any, batch_idx: int) -> STEP_OUTPUT: + x, y = batch + z = self.encoder(x) + x_hat = self.decoder(z) + loss = F.mse_loss(x_hat, y) + return loss + + def configure_optimizers(self): + return torch.optim.Adam(self.parameters(), lr=self.hparams.lr) + + seed_everything(123) + + ds = RandomDataset(n=512) + train_loader = DataLoader(ds, batch_size=64, shuffle=False) + + model = TinyAE(lr=1e-5) + + lr_finder_cb = LearningRateFinder() # default update_attr=True should apply suggestion + lr_monitor = 
LearningRateMonitor(logging_interval="step") + + trainer = Trainer( + default_root_dir=tmp_path, + max_epochs=2, + callbacks=[lr_finder_cb, lr_monitor], + enable_model_summary=False, + enable_progress_bar=False, + log_every_n_steps=1, + ) + + trainer.fit(model, train_loader) + assert model.hparams.lr is not None + # Ensure LR Finder produced a suggestion for this setup; if not, the test can't assert application + assert lr_finder_cb.optimal_lr is not None, "LR Finder should have computed results" + suggestion = lr_finder_cb.optimal_lr.suggestion() + assert suggestion is not None, "LR Finder should produce a suggestion for this setup" + + # Verify that the optimizer used for subsequent training has the suggested LR applied + assert trainer.optimizers, "Trainer should have an optimizer after fit" + current_lr = trainer.optimizers[0].param_groups[0]["lr"] + assert current_lr == pytest.approx(suggestion), ( + f"LR Finder suggestion {suggestion} should be applied to optimizer, but got {current_lr}" + ) + + def test_exponential_vs_linear_mode_gradient_difference(tmp_path): """Test that exponential and linear modes produce different but valid suggestions. 
diff --git a/tests/tests_pytorch/utilities/test_compile.py b/tests/tests_pytorch/utilities/test_compile.py index a053c847dfd6c..f90cd5e3ef3fa 100644 --- a/tests/tests_pytorch/utilities/test_compile.py +++ b/tests/tests_pytorch/utilities/test_compile.py @@ -32,7 +32,7 @@ # https://github.com/pytorch/pytorch/issues/95708 @pytest.mark.skipif(sys.platform == "darwin", reason="fatal error: 'omp.h' file not found") -@RunIf(dynamo=True) +@RunIf(dynamo=True, deepspeed=True) @mock.patch("lightning.pytorch.trainer.call._call_and_handle_interrupt") def test_trainer_compiled_model(_, tmp_path, monkeypatch, mps_count_0): trainer_kwargs = { diff --git a/tests/tests_pytorch/utilities/test_deepspeed_model_summary.py b/tests/tests_pytorch/utilities/test_deepspeed_model_summary.py index 256233e01fa98..3679c0d52f8fe 100644 --- a/tests/tests_pytorch/utilities/test_deepspeed_model_summary.py +++ b/tests/tests_pytorch/utilities/test_deepspeed_model_summary.py @@ -12,6 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
@RunIf(min_cuda_gpus=1, deepspeed=True, rich=True)
@mock.patch("rich.table.Table.add_row", autospec=True)
def test_deepspeed_summary_with_rich_model_summary(mock_table_add_row, tmp_path):
    """Fit a ``BoringModel`` with DeepSpeed stage 3 and ``RichModelSummary``, then check the first table row.

    ``rich.table.Table.add_row`` is patched, so the cells the summary callback passes to it can be inspected
    without rendering anything.

    """
    from lightning.pytorch.callbacks import RichModelSummary

    model = BoringModel()
    model.example_input_array = torch.randn(4, 32)

    trainer_kwargs = {
        "strategy": DeepSpeedStrategy(stage=3),
        "default_root_dir": tmp_path,
        "accelerator": "gpu",
        "fast_dev_run": True,
        "devices": 1,
        "enable_model_summary": True,
        "callbacks": [RichModelSummary()],
    }
    trainer = Trainer(**trainer_kwargs)
    trainer.fit(model)

    # assert that the input summary data was converted correctly
    # NOTE(review): the leading positional argument is skipped; with `autospec=True` on the class attribute it is
    # presumably the `Table` instance - confirm.
    first_call_args = mock_table_add_row.call_args_list[0].args
    # The trailing spaces inside the count strings are part of the expected cell text.
    expected_cells = (
        "0",
        "layer",
        "Linear",
        "66 ",
        "66 ",
        "train",
        "512 ",
        "[4, 32]",
        "[4, 2]",
    )
    assert first_call_args[1:] == expected_cells