From b2718a9859586ae8c5010833e41d9f934412fb8f Mon Sep 17 00:00:00 2001
From: lowdy1
Date: Mon, 28 Apr 2025 03:22:59 +0000
Subject: [PATCH 1/9] NPU add titan test

---
Review notes (text between the "---" marker and the diffstat is ignored by git am):
 * The job output `torch_version` used to read `steps.get_torch_version`, but
   the step that writes `torch-version` to GITHUB_OUTPUT has
   `id: list-torch-version`. The reference below now uses that id.
 * The job output `npu_info` reads `steps.check_npu.outputs.npu_info`; no step
   in this workflow declares `id: check_npu`, so the value is presumably always
   empty. TODO: add the step or drop the output.
 * The input `device` and the secret `pr-token` are declared `required: true`
   but are not referenced by any step of this workflow.
 * "Install project dependencies" runs `pip install -r requirements.txt` from
   the repository root, not from the `torchtitan` checkout. Confirm that this
   is intended.

 .github/workflows/_ascend_npu_torchtitan.yml | 140 +++++++++++++++++++
 .github/workflows/ascend_npu_test.yml        |  22 ++-
 2 files changed, 161 insertions(+), 1 deletion(-)
 create mode 100644 .github/workflows/_ascend_npu_torchtitan.yml

diff --git a/.github/workflows/_ascend_npu_torchtitan.yml b/.github/workflows/_ascend_npu_torchtitan.yml
new file mode 100644
index 0000000..52a04ff
--- /dev/null
+++ b/.github/workflows/_ascend_npu_torchtitan.yml
@@ -0,0 +1,140 @@
+name: "_ascend_npu_torchtitan"
+
+on:
+  workflow_call:
+    inputs:
+      runner:
+        required: true
+        type: string
+        description: "The runner selected to run on"
+      image:
+        required: true
+        type: string
+        description: "The docker image which will be loaded"
+      device:
+        required: true
+        type: string
+        description: "The device selected to run on"
+      torch-artifact:
+        required: false
+        type: string
+        description: "The distribution artifact name of torch"
+      torch-npu-artifact:
+        required: true
+        type: string
+        description: "The distribution artifact name of torch_npu"
+    secrets:
+      pr-token:
+        description: "A token used to create a pull request"
+        required: true
+
+# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
+# declared as "shell: bash -el {0}" on steps that need to be properly activated.
+# It's used to activate ascend-toolkit environment variables.
+
+defaults:
+  run:
+    shell: bash -el {0}
+
+jobs:
+  setup_environment:
+    name: Run Torchtitan Environment
+    runs-on: ${{ inputs.runner }}
+    container:
+      image: ${{ inputs.image }}
+      env:
+        HF_ENDPOINT: https://hf-mirror.com
+    outputs:
+      torch_version: ${{ steps.list-torch-version.outputs.torch-version }}
+      npu_info: ${{ steps.check_npu.outputs.npu_info }}
+    steps:
+      - name: Show NPU info
+        run: |
+          npu-smi info
+
+      - name: Config mirrors
+        run: |
+          sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
+          pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
+
+      - name: Install system dependencies
+        run: |
+          apt-get update
+          apt-get install -y \
+            git gcc g++ make cmake ninja-build curl \
+            libgl1 libglib2.0-0 libsndfile1
+
+      - name: Config git
+        run: |
+          git config --global --add safe.directory "$GITHUB_WORKSPACE"
+          git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/
+
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Checkout benchmark
+        uses: actions/checkout@v4
+        with:
+          repository: pytorch/torchtitan
+          path: torchtitan
+
+      - name: Download torch artifact
+        if: ${{ inputs.torch-artifact }}
+        uses: actions/download-artifact@v4
+        with:
+          name: ${{ inputs.torch-artifact }}
+
+      - name: Install torch
+        if: ${{ inputs.torch-artifact }}
+        run: |
+          pip install ${{ inputs.torch-artifact }}
+
+      - name: Install torch_npu dependencies
+        if: ${{ !inputs.torch-artifact }}
+        run: |
+          pip install -r https://raw.githubusercontent.com/Ascend/pytorch/refs/heads/master/requirements.txt
+
+      - name: List torch version
+        id: list-torch-version
+        shell: bash
+        run: |
+          torch_version=$(python -c "import torch; print(torch.__version__)")
+          echo "torch-version=${torch_version}" >> $GITHUB_OUTPUT
+
+      - name: Download torch_npu artifact
+        uses: actions/download-artifact@v4
+        with:
+          name: ${{ inputs.torch-npu-artifact }}
+          path: ascend_npu
+
+      - name: Install torch_npu
+        working-directory: ascend_npu
+        run: |
+          pip install ${{ inputs.torch-npu-artifact }}
+
+      - name: Install nightly torchvision and torchaudio
+        run: |
+          pip install --pre torchvision torchaudio --no-deps --index-url https://download.pytorch.org/whl/nightly/cpu
+
+      - name: Install project dependencies
+        run: |
+          pip install -r requirements.txt
+          pip install pytest pytest-cov
+
+      - name: Show environment info
+        run: |
+          npu_is_available=$(python -c "import torch; print(torch.npu.is_available())")
+          npu_count=$(python -c "import torch; print(torch.npu.device_count())")
+          echo "NPU is available: ${npu_is_available}"
+          echo "NPU count: ${npu_count}"
+          pip list | grep -E 'torch|numpy'
+
+      - name: Run torchtitan integration_test
+        run: |
+          mkdir artifacts-to-be-uploaded
+          npu_count=$(python -c "import torch; print(torch.npu.device_count())")
+          python ./tests/integration_tests.py artifacts-to-be-uploaded --ngpu ${npu_count}
+
+      - name: Run torchtitan unittest
+        run: |
+          pytest ./tests/unit_tests --cov=. --cov-report=xml --durations=20 -vv
diff --git a/.github/workflows/ascend_npu_test.yml b/.github/workflows/ascend_npu_test.yml
index 2abf646..15c4787 100644
--- a/.github/workflows/ascend_npu_test.yml
+++ b/.github/workflows/ascend_npu_test.yml
@@ -10,6 +10,7 @@ on:
       - ".github/workflows/_ascend_npu_build_torch_npu.yml"
       - ".github/workflows/_ascend_npu_ut.yml"
       - ".github/workflows/_ascend_npu_benchmark.yml"
+      - ".github/workflows/_ascend_npu_torchtitan.yml"
       - ".ci/**"
       - "ascend_npu/**"
       - "src/**"
@@ -23,6 +24,7 @@ on:
       - ".github/workflows/_ascend_npu_build_torch_npu.yml"
       - ".github/workflows/_ascend_npu_ut.yml"
       - ".github/workflows/_ascend_npu_benchmark.yml"
+      - ".github/workflows/_ascend_npu_torchtitan.yml"
       - ".ci/**"
       - "ascend_npu/**"
       - "src/**"
@@ -157,7 +159,6 @@ jobs:
       - prepare
       - build-torch
       - build
-      - test
     if: |
       !cancelled() && github.event_name != 'repository_dispatch' &&
       (success() || (needs.build-torch.result == 'skipped' && needs.build.result == 'success'))
@@ -170,3 +171,22 @@ jobs:
       torch-npu-artifact: ${{ needs.build.outputs.torch-npu-artifact }}
     secrets:
       pr-token: ${{ secrets.COSDT_BOT_TOKEN }}
+
+  torchtitan:
+    name: Run torchtitan
+    needs:
+      - prepare
+      - build-torch
+      - build
+    if: |
+      !cancelled() && github.event_name != 'repository_dispatch' &&
+      (success() || (needs.build-torch.result == 'skipped' && needs.build.result == 'success'))
+    uses: ./.github/workflows/_ascend_npu_torchtitan.yml
+    with:
+      runner: "linux-arm64-npu-2"
+      image: ${{ needs.prepare.outputs.image }}
+      device: ${{ needs.prepare.outputs.device }}
+      torch-artifact: ${{ needs.build-torch.outputs.torch-artifact }}
+      torch-npu-artifact: ${{ needs.build.outputs.torch-npu-artifact }}
+    secrets:
+      pr-token: ${{ secrets.COSDT_BOT_TOKEN }}

From 1289a93866e8eaa62199e7903f3f2490f7f4610e Mon Sep 17 00:00:00 2001
From: lowdy1
Date: Mon, 28 Apr 2025 03:48:48 +0000
Subject: [PATCH 2/9] NPU add torchtitan test

---
 .github/workflows/ascend_npu_test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ascend_npu_test.yml b/.github/workflows/ascend_npu_test.yml
index 15c4787..de865f6 100644
--- a/.github/workflows/ascend_npu_test.yml
+++ b/.github/workflows/ascend_npu_test.yml
@@ -56,7 +56,7 @@ on:
           - ascendai/cann:7.0.1-910b-ubuntu22.04-py3.8
           - ascendai/cann:8.0.0-910b-ubuntu22.04-py3.10
           - ascendai/cann:latest
-        default: "ascendai/cann:latest"
+        default: "ascendai/cann:8.0.0-910b-ubuntu22.04-py3.10"
         description: "The docker image which will be loaded"
       device:
         required: true

From 52e285a2730223a798a1f0a08e90500918cee21e Mon Sep 17 00:00:00 2001
From: lowdy1
Date: Mon, 28 Apr 2025 04:46:47 +0000
Subject: [PATCH 3/9] NPU add titan test

---
 .github/workflows/ascend_npu_test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ascend_npu_test.yml b/.github/workflows/ascend_npu_test.yml
index de865f6..4c3f311 100644
--- a/.github/workflows/ascend_npu_test.yml
+++ b/.github/workflows/ascend_npu_test.yml
@@ -56,7 +56,7 @@ on:
           - ascendai/cann:7.0.1-910b-ubuntu22.04-py3.8
           - ascendai/cann:8.0.0-910b-ubuntu22.04-py3.10
           - ascendai/cann:latest
-        default: "ascendai/cann:8.0.0-910b-ubuntu22.04-py3.10"
+        default: "ascendai/cann:7.0.1-910b-ubuntu22.04-py3.8"
         description: "The docker image which will be loaded"
       device:
         required: true

From 2fcbff83647a2c6e27fd74a0801ece56d0550f9a Mon Sep 17 00:00:00 2001
From: lowdy1
Date: Thu, 29 May 2025 01:29:33 +0000
Subject: [PATCH 4/9] add torchtitan

---
 .github/workflows/_ascend_npu_torchtitan.yml | 2 ++
 .github/workflows/ascend_npu_test.yml        | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/_ascend_npu_torchtitan.yml b/.github/workflows/_ascend_npu_torchtitan.yml
index 52a04ff..8c43d18 100644
--- a/.github/workflows/_ascend_npu_torchtitan.yml
+++ b/.github/workflows/_ascend_npu_torchtitan.yml
@@ -130,11 +130,13 @@ jobs:
           pip list | grep -E 'torch|numpy'
 
       - name: Run torchtitan integration_test
+        working-directory: torchtitan
         run: |
           mkdir artifacts-to-be-uploaded
           npu_count=$(python -c "import torch; print(torch.npu.device_count())")
           python ./tests/integration_tests.py artifacts-to-be-uploaded --ngpu ${npu_count}
 
       - name: Run torchtitan unittest
+        working-directory: torchtitan
         run: |
           pytest ./tests/unit_tests --cov=. --cov-report=xml --durations=20 -vv
diff --git a/.github/workflows/ascend_npu_test.yml b/.github/workflows/ascend_npu_test.yml
index 4c3f311..15c4787 100644
--- a/.github/workflows/ascend_npu_test.yml
+++ b/.github/workflows/ascend_npu_test.yml
@@ -56,7 +56,7 @@ on:
           - ascendai/cann:7.0.1-910b-ubuntu22.04-py3.8
           - ascendai/cann:8.0.0-910b-ubuntu22.04-py3.10
           - ascendai/cann:latest
-        default: "ascendai/cann:7.0.1-910b-ubuntu22.04-py3.8"
+        default: "ascendai/cann:latest"
         description: "The docker image which will be loaded"
       device:
         required: true

From 895ffd2a89bc4a5bcb823647e72c73ccd03bdb6a Mon Sep 17 00:00:00 2001
From: lowdy1
Date: Thu, 29 May 2025 01:46:45 +0000
Subject: [PATCH 5/9] add titan

---
Review notes:
 * This pins the torchtitan job to a literal image tag, so the job no longer
   follows the image selected by the `prepare` job. Confirm that the pin is
   meant to be permanent.

 .github/workflows/ascend_npu_test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ascend_npu_test.yml b/.github/workflows/ascend_npu_test.yml
index 15c4787..ca72e91 100644
--- a/.github/workflows/ascend_npu_test.yml
+++ b/.github/workflows/ascend_npu_test.yml
@@ -184,7 +184,7 @@ jobs:
     uses: ./.github/workflows/_ascend_npu_torchtitan.yml
     with:
       runner: "linux-arm64-npu-2"
-      image: ${{ needs.prepare.outputs.image }}
+      image: "ascendai/cann:8.0.0.beta1-910b-ubuntu22.04-py3.10"
       device: ${{ needs.prepare.outputs.device }}
       torch-artifact: ${{ needs.build-torch.outputs.torch-artifact }}
       torch-npu-artifact: ${{ needs.build.outputs.torch-npu-artifact }}

From 5a6f32ee31b28c676165271d1aedf7f228f029f7 Mon Sep 17 00:00:00 2001
From: lowdy1
Date: Thu, 29 May 2025 06:19:56 +0000
Subject: [PATCH 6/9] add torchtitan

---
Review notes:
 * `tyro` is installed by hand next to the test tooling. Presumably it is a
   torchtitan dependency that the root-level `requirements.txt` does not
   provide; verify whether the torchtitan requirements file should be
   installed instead.
 * The runner label stays hard-coded (now "linux-arm64-npu-4") rather than
   coming from the `prepare` job outputs.

 .github/workflows/_ascend_npu_torchtitan.yml | 2 +-
 .github/workflows/ascend_npu_test.yml        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/_ascend_npu_torchtitan.yml b/.github/workflows/_ascend_npu_torchtitan.yml
index 8c43d18..8a9e932 100644
--- a/.github/workflows/_ascend_npu_torchtitan.yml
+++ b/.github/workflows/_ascend_npu_torchtitan.yml
@@ -119,7 +119,7 @@ jobs:
       - name: Install project dependencies
         run: |
           pip install -r requirements.txt
-          pip install pytest pytest-cov
+          pip install pytest pytest-cov tyro
 
       - name: Show environment info
         run: |
diff --git a/.github/workflows/ascend_npu_test.yml b/.github/workflows/ascend_npu_test.yml
index ca72e91..3b55223 100644
--- a/.github/workflows/ascend_npu_test.yml
+++ b/.github/workflows/ascend_npu_test.yml
@@ -183,7 +183,7 @@ jobs:
       (success() || (needs.build-torch.result == 'skipped' && needs.build.result == 'success'))
     uses: ./.github/workflows/_ascend_npu_torchtitan.yml
     with:
-      runner: "linux-arm64-npu-2"
+      runner: "linux-arm64-npu-4"
       image: "ascendai/cann:8.0.0.beta1-910b-ubuntu22.04-py3.10"
       device: ${{ needs.prepare.outputs.device }}
       torch-artifact: ${{ needs.build-torch.outputs.torch-artifact }}

From 37d89397e73fe250925105e87ad7024ab1deaf5b Mon Sep 17 00:00:00 2001
From: lowdy1
Date: Fri, 30 May 2025 05:57:41 +0000
Subject: [PATCH 7/9] del benchmark temporarily

---
Review notes:
 * This removes the whole `benchmark` job from the caller workflow. The subject
   says "temporarily", and no later patch in this series restores the job.

 .github/workflows/ascend_npu_test.yml | 19 -------------------
 1 file changed, 19 deletions(-)

diff --git a/.github/workflows/ascend_npu_test.yml b/.github/workflows/ascend_npu_test.yml
index 3b55223..f1ce779 100644
--- a/.github/workflows/ascend_npu_test.yml
+++ b/.github/workflows/ascend_npu_test.yml
@@ -153,25 +153,6 @@ jobs:
       torch-artifact: ${{ needs.build-torch.outputs.torch-artifact }}
       torch-npu-artifact: ${{ needs.build.outputs.torch-npu-artifact }}
 
-  benchmark:
-    name: Run benchmarks
-    needs:
-      - prepare
-      - build-torch
-      - build
-    if: |
-      !cancelled() && github.event_name != 'repository_dispatch' &&
-      (success() || (needs.build-torch.result == 'skipped' && needs.build.result == 'success'))
-    uses: ./.github/workflows/_ascend_npu_benchmark.yml
-    with:
-      runner: ${{ needs.prepare.outputs.runner }}
-      image: ${{ needs.prepare.outputs.image }}
-      device: ${{ needs.prepare.outputs.device }}
-      torch-artifact: ${{ needs.build-torch.outputs.torch-artifact }}
-      torch-npu-artifact: ${{ needs.build.outputs.torch-npu-artifact }}
-    secrets:
-      pr-token: ${{ secrets.COSDT_BOT_TOKEN }}
-
   torchtitan:
     name: Run torchtitan
     needs:

From e1c26ff73e700634481d89812357ced777cfe94c Mon Sep 17 00:00:00 2001
From: lowdy1
Date: Tue, 3 Jun 2025 01:59:30 +0000
Subject: [PATCH 8/9] run as fast as possible

---
Review notes:
 * This removes the `test` job (torch_npu unit tests) from the caller workflow,
   in addition to the `benchmark` job removed by PATCH 7/9. No later patch in
   this series restores either job.
 * The `mkdir artifacts-to-be-uploaded` line is dropped while the directory is
   still passed to integration_tests.py. Presumably the script creates it;
   verify.

 .github/workflows/_ascend_npu_torchtitan.yml |  7 +------
 .github/workflows/ascend_npu_test.yml        | 17 -----------------
 2 files changed, 1 insertion(+), 23 deletions(-)

diff --git a/.github/workflows/_ascend_npu_torchtitan.yml b/.github/workflows/_ascend_npu_torchtitan.yml
index 8a9e932..07f554a 100644
--- a/.github/workflows/_ascend_npu_torchtitan.yml
+++ b/.github/workflows/_ascend_npu_torchtitan.yml
@@ -38,7 +38,7 @@ defaults:
 
 jobs:
   setup_environment:
-    name: Run Torchtitan Environment
+    name: run torchtitan tests
     runs-on: ${{ inputs.runner }}
     container:
       image: ${{ inputs.image }}
@@ -112,10 +112,6 @@ jobs:
         run: |
           pip install ${{ inputs.torch-npu-artifact }}
 
-      - name: Install nightly torchvision and torchaudio
-        run: |
-          pip install --pre torchvision torchaudio --no-deps --index-url https://download.pytorch.org/whl/nightly/cpu
-
       - name: Install project dependencies
         run: |
           pip install -r requirements.txt
@@ -132,7 +128,6 @@ jobs:
       - name: Run torchtitan integration_test
         working-directory: torchtitan
         run: |
-          mkdir artifacts-to-be-uploaded
           npu_count=$(python -c "import torch; print(torch.npu.device_count())")
           python ./tests/integration_tests.py artifacts-to-be-uploaded --ngpu ${npu_count}
 
diff --git a/.github/workflows/ascend_npu_test.yml b/.github/workflows/ascend_npu_test.yml
index f1ce779..458a017 100644
--- a/.github/workflows/ascend_npu_test.yml
+++ b/.github/workflows/ascend_npu_test.yml
@@ -136,23 +136,6 @@ jobs:
       device: ${{ needs.prepare.outputs.device }}
       torch-artifact: ${{ needs.build-torch.outputs.torch-artifact }}
 
-  test:
-    name: Test torch_npu
-    needs:
-      - prepare
-      - build-torch
-      - build
-    if: |
-      !cancelled() && github.event_name != 'repository_dispatch' &&
-      (success() || (needs.build-torch.result == 'skipped' && needs.build.result == 'success'))
-    uses: ./.github/workflows/_ascend_npu_ut.yml
-    with:
-      runner: ${{ needs.prepare.outputs.runner }}
-      image: ${{ needs.prepare.outputs.image }}
-      device: ${{ needs.prepare.outputs.device }}
-      torch-artifact: ${{ needs.build-torch.outputs.torch-artifact }}
-      torch-npu-artifact: ${{ needs.build.outputs.torch-npu-artifact }}
-
   torchtitan:
     name: Run torchtitan
     needs:

From 17771c9c7545cc91f1c2ad4654626a14b24cf264 Mon Sep 17 00:00:00 2001
From: lowdy1
Date: Tue, 3 Jun 2025 02:22:25 +0000
Subject: [PATCH 9/9] run as fast as possible

---
Review notes:
 * The pre-image blob here is fce2322, not the 458a017 produced by PATCH 8/9,
   and the `benchmark` job removed below was already removed by PATCH 7/9.
   Presumably the branch picked up upstream changes between the two commits.
   This patch is not expected to apply directly on top of PATCH 8/9; verify
   against the real branch history.

 .github/workflows/ascend_npu_test.yml | 17 -----------------
 1 file changed, 17 deletions(-)

diff --git a/.github/workflows/ascend_npu_test.yml b/.github/workflows/ascend_npu_test.yml
index fce2322..e6eda7c 100644
--- a/.github/workflows/ascend_npu_test.yml
+++ b/.github/workflows/ascend_npu_test.yml
@@ -136,20 +136,3 @@ jobs:
       torch-artifact: ${{ needs.build-torch.outputs.torch-artifact }}
       torch-npu-artifact: ${{ needs.build.outputs.torch-npu-artifact }}
 
-  benchmark:
-    name: Run benchmarks
-    needs:
-      - prepare
-      - build-torch
-      - build
-    if: |
-      !cancelled() && github.event_name != 'repository_dispatch' &&
-      (success() || (needs.build-torch.result == 'skipped' && needs.build.result == 'success'))
-    uses: ./.github/workflows/_ascend_npu_benchmark.yml
-    with:
-      runner: ${{ needs.prepare.outputs.runner }}
-      image: ${{ needs.prepare.outputs.image }}
-      torch-artifact: ${{ needs.build-torch.outputs.torch-artifact }}
-      torch-npu-artifact: ${{ needs.build.outputs.torch-npu-artifact }}
-    secrets:
-      pr-token: ${{ secrets.COSDT_BOT_TOKEN }}