diff --git a/.github/workflows/integration_test_8gpu_models.yaml b/.github/workflows/integration_test_8gpu_models.yaml index b673da5adf..acdbe9cb06 100644 --- a/.github/workflows/integration_test_8gpu_models.yaml +++ b/.github/workflows/integration_test_8gpu_models.yaml @@ -3,6 +3,8 @@ name: 8 GPU Model Tests on: push: branches: [ main ] + tags: + - ciflow/8gpu/* paths-ignore: - 'torchtitan/experiments/**' pull_request: @@ -21,18 +23,30 @@ defaults: run: shell: bash -l -eo pipefail {0} +permissions: + id-token: write + contents: read + jobs: + # Step 1: Dynamically compute the matrix based on conditions + set-matrix: + uses: ./.github/workflows/set-matrix.yaml + + # Step 2: Use the dynamic matrix in the build-test job build-test: + needs: set-matrix uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + strategy: + fail-fast: false + matrix: ${{ fromJSON(needs.set-matrix.outputs.matrix) }} with: - runner: linux.g5.48xlarge.nvidia.gpu - gpu-arch-type: cuda - gpu-arch-version: "12.6" - # This image is faster to clone than the default, but it lacks CC needed by triton - # (1m25s vs 2m37s). - docker-image: torchtitan-ubuntu-20.04-clang12 + runner: ${{ matrix.runner }} + gpu-arch-type: ${{ matrix.gpu-arch-type }} + gpu-arch-version: ${{ matrix.gpu-arch-version }} + docker-image: ${{ matrix.docker-image }} repository: pytorch/torchtitan upload-artifact: outputs + timeout: 45 script: | set -eux @@ -46,12 +60,14 @@ jobs: pip config --user set global.progress_bar off - python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126 + python -m pip install --force-reinstall --pre torch --index-url ${{ matrix.index-url }} + + USE_CPP=0 python -m pip install --pre torchao --index-url ${{ matrix.index-url }} - USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126 + sudo mkdir -p "$RUNNER_TEMP/artifacts-to-be-uploaded" + sudo chown -R $(id -u):$(id -g) "$RUNNER_TEMP/artifacts-to-be-uploaded" - mkdir artifacts-to-be-uploaded - python -m tests.integration_tests.run_tests --test_suite models artifacts-to-be-uploaded --ngpu 8 - python -m tests.integration_tests.flux artifacts-to-be-uploaded/flux --ngpu 8 - rm -rf artifacts-to-be-uploaded/*/checkpoint - rm -rf artifacts-to-be-uploaded/flux/*/inference_results/ + python -m tests.integration_tests.run_tests --gpu_arch_type ${{ matrix.gpu-arch-type }} --test_suite models $RUNNER_TEMP/artifacts-to-be-uploaded --ngpu 8 + python -m tests.integration_tests.flux $RUNNER_TEMP/artifacts-to-be-uploaded/flux --ngpu 8 + rm -rf $RUNNER_TEMP/artifacts-to-be-uploaded/*/checkpoint + rm -rf $RUNNER_TEMP/artifacts-to-be-uploaded/flux/*/inference_results/ diff --git a/.github/workflows/integration_test_8gpu_torchft.yaml b/.github/workflows/integration_test_8gpu_torchft.yaml index 23f59d8bba..0931fa75b7 100644 --- a/.github/workflows/integration_test_8gpu_torchft.yaml +++ b/.github/workflows/integration_test_8gpu_torchft.yaml @@ -3,6 +3,8 @@ name: TorchFT 8 GPU Integration Test on: push: branches: [ main ] + tags: + - ciflow/8gpu/* paths: - 'torchtitan/components/ft.py' - '.github/workflows/integration_test_8gpu_torchft.yaml' @@ -21,18 +23,30 @@ defaults: run: shell: bash -l -eo pipefail {0} +permissions: + id-token: write + contents: read + jobs: + # Step 1: Dynamically compute the matrix based on conditions + set-matrix: + uses: ./.github/workflows/set-matrix.yaml + + # Step 2: Use the dynamic matrix in the build-test job build-test: + needs: set-matrix uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + strategy: + fail-fast: false + matrix: ${{ fromJSON(needs.set-matrix.outputs.matrix) }} with: - runner: linux.g5.48xlarge.nvidia.gpu - gpu-arch-type: cuda - gpu-arch-version: "12.6" - # This image is faster to clone than the default, but it lacks CC needed by triton - # (1m25s vs 2m37s). - docker-image: torchtitan-ubuntu-20.04-clang12 + runner: ${{ matrix.runner }} + gpu-arch-type: ${{ matrix.gpu-arch-type }} + gpu-arch-version: ${{ matrix.gpu-arch-version }} + docker-image: ${{ matrix.docker-image }} repository: pytorch/torchtitan upload-artifact: outputs + timeout: 45 script: | set -eux @@ -47,14 +61,16 @@ jobs: pip config --user set global.progress_bar off python -m pip install torchft-nightly - python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126 - USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126 + python -m pip install --force-reinstall --pre torch --index-url ${{ matrix.index-url }} + USE_CPP=0 python -m pip install --pre torchao --index-url ${{ matrix.index-url }} + + sudo mkdir -p "$RUNNER_TEMP/artifacts-to-be-uploaded" + sudo chown -R $(id -u):$(id -g) "$RUNNER_TEMP/artifacts-to-be-uploaded" - mkdir artifacts-to-be-uploaded echo "torchft_lighthouse" RUST_BACKTRACE=1 torchft_lighthouse --min_replicas 1 --quorum_tick_ms 100 --join_timeout_ms 10000 > /dev/null 2>&1 & echo "ft_integration_test" # Getting error - Cuda failure 217 'peer access is not supported between these two devices' - python -m tests.integration_tests.ft artifacts-to-be-uploaded --ngpu 8 + python -m tests.integration_tests.ft $RUNNER_TEMP/artifacts-to-be-uploaded --ngpu 8 # pkill -9 torchft_lighthouse - rm -rf artifacts-to-be-uploaded/*/checkpoint + rm -rf $RUNNER_TEMP/artifacts-to-be-uploaded/*/checkpoint diff --git a/.github/workflows/set-matrix.yaml b/.github/workflows/set-matrix.yaml index 5564d8d70b..3d6704641c 100644 --- a/.github/workflows/set-matrix.yaml +++ b/.github/workflows/set-matrix.yaml @@ -27,9 +27,9 @@ jobs: "name": "rocm", "runner": "linux.rocm.gpu.gfx942.8", "gpu-arch-type": "rocm", - "gpu-arch-version": "7.0", + "gpu-arch-version": "7.1", "docker-image": "torchtitan-rocm-ubuntu-22.04-clang12", - "index-url": "https://download.pytorch.org/whl/nightly/rocm7.0" + "index-url": "https://download.pytorch.org/whl/nightly/rocm7.1" }' # Define CUDA matrix diff --git a/tests/integration_tests/features.py b/tests/integration_tests/features.py index 8e16ecb4fb..3662aa6bf6 100755 --- a/tests/integration_tests/features.py +++ b/tests/integration_tests/features.py @@ -121,7 +121,6 @@ def build_features_test_list() -> list[OverrideDefinitions]: ], "Checkpoint Integration Test - save load model only checkpoint in HF definition and format", "model_only_hf_checkpoint", - skip_rocm_test=True, ), OverrideDefinitions( [