From 64965e2e840e0b15c16a73cb1b655e41199fd866 Mon Sep 17 00:00:00 2001 From: Jiahao Su Date: Mon, 7 Jul 2025 14:47:00 +0800 Subject: [PATCH 01/10] Restore execution benchmark --- .github/workflows/ascend_npu_test.yml | 34 +++++++++++++-------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/.github/workflows/ascend_npu_test.yml b/.github/workflows/ascend_npu_test.yml index b38a62f..c9d5af2 100644 --- a/.github/workflows/ascend_npu_test.yml +++ b/.github/workflows/ascend_npu_test.yml @@ -133,23 +133,23 @@ jobs: torch-artifact: ${{ needs.build-torch.outputs.torch-artifact }} torch-npu-artifact: ${{ needs.build.outputs.torch-npu-artifact }} - # benchmark: - # name: Run benchmarks - # needs: - # - prepare - # - build-torch - # - build - # if: | - # !cancelled() && github.event_name != 'repository_dispatch' && - # (success() || (needs.build-torch.result == 'skipped' && needs.build.result == 'success')) - # uses: ./.github/workflows/_ascend_npu_benchmark.yml - # with: - # runner: ${{ needs.prepare.outputs.runner }} - # image: ${{ needs.prepare.outputs.image }} - # torch-artifact: ${{ needs.build-torch.outputs.torch-artifact }} - # torch-npu-artifact: ${{ needs.build.outputs.torch-npu-artifact }} - # secrets: - # pr-token: ${{ secrets.ASCEND_RUNNER_TOKEN }} + benchmark: + name: Run benchmarks + needs: + - prepare + - build-torch + - build + if: | + !cancelled() && github.event_name != 'repository_dispatch' && + (success() || (needs.build-torch.result == 'skipped' && needs.build.result == 'success')) + uses: ./.github/workflows/_ascend_npu_benchmark.yml + with: + runner: ${{ needs.prepare.outputs.runner }} + image: ${{ needs.prepare.outputs.image }} + torch-artifact: ${{ needs.build-torch.outputs.torch-artifact }} + torch-npu-artifact: ${{ needs.build.outputs.torch-npu-artifact }} + secrets: + pr-token: ${{ secrets.ASCEND_RUNNER_TOKEN }} torchtune: name: Run torchtune From ba272c195aeac1357ece18904e9289b2dff9f761 Mon Sep 17 00:00:00 2001 From: Jiahao Su Date: Mon, 11 Aug 2025 14:44:00 +0800 Subject: [PATCH 02/10] update torch version --- .github/workflows/_ascend_npu_benchmark.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/_ascend_npu_benchmark.yml b/.github/workflows/_ascend_npu_benchmark.yml index 6e78034..e4a0e31 100644 --- a/.github/workflows/_ascend_npu_benchmark.yml +++ b/.github/workflows/_ascend_npu_benchmark.yml @@ -90,13 +90,6 @@ jobs: run: | pip install -r https://raw.githubusercontent.com/Ascend/pytorch/refs/heads/master/requirements.txt - - name: List torch version - id: list-torch-version - shell: bash - run: | - torch_version=$(python -c "import torch; print(torch.__version__)") - echo "torch-version=${torch_version}" >> $GITHUB_OUTPUT - - name: Download torch_npu artifact uses: actions/download-artifact@v4 with: @@ -108,6 +101,13 @@ jobs: run: | pip install ${{ inputs.torch-npu-artifact }} + - name: List torch version + id: list-torch-version + shell: bash + run: | + torch_version=$(python -c "import torch; print(torch.__version__)") + echo "torch-version=${torch_version}" >> $GITHUB_OUTPUT + - name: Install benchmark dependencies run: | pip install -r benchmark/requirements.txt \ From 2c6bec98fb7996e4ad91d8da5d39e8c986a21a17 Mon Sep 17 00:00:00 2001 From: Jiahao Su Date: Mon, 11 Aug 2025 17:01:53 +0800 Subject: [PATCH 03/10] update --- .github/workflows/_ascend_npu_benchmark.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_ascend_npu_benchmark.yml b/.github/workflows/_ascend_npu_benchmark.yml index e4a0e31..4f83cca 100644 --- a/.github/workflows/_ascend_npu_benchmark.yml +++ b/.github/workflows/_ascend_npu_benchmark.yml @@ -105,7 +105,7 @@ jobs: id: list-torch-version shell: bash run: | - torch_version=$(python -c "import torch; print(torch.__version__)") + torch_version=$(python -c "from importlib.metadata import version;print(version('torch'))") echo "torch-version=${torch_version}" >> $GITHUB_OUTPUT - name: Install benchmark dependencies From 130ae30b1e34689800765047f339c4ff0fa70acc Mon Sep 17 00:00:00 2001 From: Jiahao Su Date: Mon, 8 Sep 2025 02:48:23 +0000 Subject: [PATCH 04/10] Add the torchtitan workflow --- .github/workflows/_ascend_npu_torchtitan.yml | 126 +++++++++++++++++++ .github/workflows/ascend_npu_test.yml | 18 +++ 2 files changed, 144 insertions(+) create mode 100644 .github/workflows/_ascend_npu_torchtitan.yml diff --git a/.github/workflows/_ascend_npu_torchtitan.yml b/.github/workflows/_ascend_npu_torchtitan.yml new file mode 100644 index 0000000..c0523d5 --- /dev/null +++ b/.github/workflows/_ascend_npu_torchtitan.yml @@ -0,0 +1,126 @@ +name: "_ascend_npu_torchtitan" + +on: + workflow_call: + inputs: + runner: + required: true + type: string + description: "The runner selected to run on" + image: + required: true + type: string + description: "The docker image which will be loaded" + torch-artifact: + required: false + type: string + description: "The distribution artifact name of torch" + torch-npu-artifact: + required: true + type: string + description: "The distribution artifact name of torch_npu" + +# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly +# declared as "shell: bash -el {0}" on steps that need to be properly activated. +# It's used to activate ascend-toolkit environment variables. + +jobs: + setup_environment: + name: run torchtitan tests + runs-on: ${{ inputs.runner }} + container: + image: ${{ inputs.image }} + env: + HF_ENDPOINT: https://hf-mirror.com + outputs: + torch_version: ${{ steps.get_torch_version.outputs.torch-version }} + npu_info: ${{ steps.check_npu.outputs.npu_info }} + steps: + - name: Show NPU info + run: | + npu-smi info + + - name: Config mirrors + run: | + sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list + pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple + + - name: Install system dependencies + run: | + apt-get update + apt-get install -y \ + git gcc g++ make cmake ninja-build curl \ + libgl1 libglib2.0-0 libsndfile1 + + - name: Config git + run: | + git config --global --add safe.directory "$GITHUB_WORKSPACE" + git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/ + + - name: Checkout + uses: actions/checkout@v4 + + - name: Checkout benchmark + uses: actions/checkout@v4 + with: + repository: pytorch/torchtitan + path: torchtitan + + - name: Download torch artifact + if: ${{ inputs.torch-artifact }} + uses: actions/download-artifact@v4 + with: + name: ${{ inputs.torch-artifact }} + + - name: Install torch + if: ${{ inputs.torch-artifact }} + run: | + pip install ${{ inputs.torch-artifact }} + + - name: Install torch_npu dependencies + if: ${{ !inputs.torch-artifact }} + run: | + pip install -r https://raw.githubusercontent.com/Ascend/pytorch/refs/heads/master/requirements.txt + + - name: List torch version + id: list-torch-version + shell: bash + run: | + torch_version=$(python -c "import torch; print(torch.__version__)") + echo "torch-version=${torch_version}" >> $GITHUB_OUTPUT + + - name: Download torch_npu artifact + uses: actions/download-artifact@v4 + with: + name: ${{ inputs.torch-npu-artifact }} + path: ascend_npu + + - name: Install torch_npu + working-directory: ascend_npu + run: | + pip install ${{ inputs.torch-npu-artifact }} + + - name: Install project dependencies + run: | + pip install pytest pytest-cov tyro tabulate + + - name: Show environment info + id: check_npu + run: | + npu_is_available=$(python -c "import torch; print(torch.npu.is_available())") + npu_count=$(python -c "import torch; print(torch.npu.device_count())") + echo "npu_count=${npu_count}" >> $GITHUB_OUTPUT + echo "NPU is available: ${npu_is_available}" + echo "NPU count: ${npu_count}" + pip list | grep -E 'torch|numpy' + + - name: Run torchtitan integration_test + working-directory: torchtitan + run: | + mkdir artifacts-to-be-uploaded + python -m tests.integration_tests.run_tests --test_name artifacts-to-be-uploaded --ngpu ${{ steps.check_npu.outputs.npu_count }} || true + + - name: Run torchtitan unittest + working-directory: torchtitan· + run: | + pytest tests/unit_tests --cov=. --cov-report=xml --durations=20 -vv \ No newline at end of file diff --git a/.github/workflows/ascend_npu_test.yml b/.github/workflows/ascend_npu_test.yml index c9d5af2..12dc33e 100644 --- a/.github/workflows/ascend_npu_test.yml +++ b/.github/workflows/ascend_npu_test.yml @@ -168,3 +168,21 @@ jobs: torch-npu-artifact: ${{ needs.build.outputs.torch-npu-artifact }} secrets: hf-token: ${{ secrets.HF_TOKEN }} + + torchtitan: + name: Run torchtitan tests + needs: + - prepare + - build-torch + - build + if: | + !cancelled() && github.event_name != 'repository_dispatch' && + (success() || (needs.build-torch.result == 'skipped' && needs.build.result == 'success')) + uses: ./.github/workflows/_ascend_npu_torchtitan.yml + with: + runner: ${{ needs.prepare.outputs.runner }} + image: ${{ needs.prepare.outputs.image }} + torch-artifact: ${{ needs.build-torch.outputs.torch-artifact }} + torch-npu-artifact: ${{ needs.build.outputs.torch-npu-artifact }} + secrets: + pr-token: ${{ secrets.ASCEND_RUNNER_TOKEN }} From d7f2ccf9984d9ca960e5eb0098b1cb0a9926f447 Mon Sep 17 00:00:00 2001 From: Jiahao Su Date: Mon, 8 Sep 2025 02:49:52 +0000 Subject: [PATCH 05/10] update --- .github/workflows/_ascend_npu_torchtitan.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_ascend_npu_torchtitan.yml b/.github/workflows/_ascend_npu_torchtitan.yml index c0523d5..5ae9e21 100644 --- a/.github/workflows/_ascend_npu_torchtitan.yml +++ b/.github/workflows/_ascend_npu_torchtitan.yml @@ -123,4 +123,4 @@ jobs: - name: Run torchtitan unittest working-directory: torchtitan· run: | - pytest tests/unit_tests --cov=. --cov-report=xml --durations=20 -vv \ No newline at end of file + pytest tests/unit_tests --cov=. --cov-report=xml --durations=20 -vv From e66ff6a3567dc97bd461ffb3a6181480e6814eef Mon Sep 17 00:00:00 2001 From: jiahao su Date: Mon, 8 Sep 2025 11:55:14 +0800 Subject: [PATCH 06/10] remove secrets --- .github/workflows/ascend_npu_test.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/ascend_npu_test.yml b/.github/workflows/ascend_npu_test.yml index 12dc33e..732214a 100644 --- a/.github/workflows/ascend_npu_test.yml +++ b/.github/workflows/ascend_npu_test.yml @@ -184,5 +184,3 @@ jobs: image: ${{ needs.prepare.outputs.image }} torch-artifact: ${{ needs.build-torch.outputs.torch-artifact }} torch-npu-artifact: ${{ needs.build.outputs.torch-npu-artifact }} - secrets: - pr-token: ${{ secrets.ASCEND_RUNNER_TOKEN }} From 7f07e41c43529eef87f358cc651b96d669ca4207 Mon Sep 17 00:00:00 2001 From: Jiahao Su Date: Tue, 25 Nov 2025 20:17:05 +0800 Subject: [PATCH 07/10] Update CANN image address --- .github/workflows/ascend_npu_test.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ascend_npu_test.yml b/.github/workflows/ascend_npu_test.yml index b38a62f..b0988d3 100644 --- a/.github/workflows/ascend_npu_test.yml +++ b/.github/workflows/ascend_npu_test.yml @@ -51,10 +51,10 @@ on: required: true type: choice options: - - ascendai/cann:7.0.1-910b-ubuntu22.04-py3.8 - - ascendai/cann:8.0.0-910b-ubuntu22.04-py3.10 - - ascendai/cann:latest - default: "ascendai/cann:latest" + - swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.1.rc1-910b-ubuntu22.04-py3.11 + - swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11 + - swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:latest + default: "swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:latest" description: "The docker image which will be loaded" # Only cancel the previous runs when triggered by a pull_request event From fc2e5117fc25a0c076fc12686a9957ab58181cc1 Mon Sep 17 00:00:00 2001 From: Jiahao Su Date: Fri, 28 Nov 2025 10:56:59 +0800 Subject: [PATCH 08/10] update --- .github/workflows/ascend_npu_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ascend_npu_test.yml b/.github/workflows/ascend_npu_test.yml index b0988d3..1fbe756 100644 --- a/.github/workflows/ascend_npu_test.yml +++ b/.github/workflows/ascend_npu_test.yml @@ -83,7 +83,7 @@ jobs: id: set-env run: | echo "runner=${{ github.event.inputs.runner || 'linux-arm64-npu-1' }}" >> $GITHUB_OUTPUT - echo "image=${{ github.event.inputs.image || 'ascendai/cann:latest' }}" >> $GITHUB_OUTPUT + echo "image=${{ github.event.inputs.image || 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:latest' }}" >> $GITHUB_OUTPUT # TODO(shink): List ghstack PR's ref - name: List ref to the PyTorch branch From 7e3a8302b8d134e83acf6cee85a4c7a4a16433af Mon Sep 17 00:00:00 2001 From: Jiahao Su Date: Tue, 2 Dec 2025 19:39:47 +0800 Subject: [PATCH 09/10] Fix missing module --- .github/workflows/_ascend_npu_torchtitan.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/_ascend_npu_torchtitan.yml b/.github/workflows/_ascend_npu_torchtitan.yml index 5ae9e21..13775d5 100644 --- a/.github/workflows/_ascend_npu_torchtitan.yml +++ b/.github/workflows/_ascend_npu_torchtitan.yml @@ -118,6 +118,7 @@ jobs: working-directory: torchtitan run: | mkdir artifacts-to-be-uploaded + pip install tokenizers python -m tests.integration_tests.run_tests --test_name artifacts-to-be-uploaded --ngpu ${{ steps.check_npu.outputs.npu_count }} || true - name: Run torchtitan unittest From 265cb67e9802a271749c1293f5bc6836b0058da7 Mon Sep 17 00:00:00 2001 From: Jiahao Su Date: Wed, 3 Dec 2025 11:25:05 +0800 Subject: [PATCH 10/10] update --- .github/workflows/_ascend_npu_torchtitan.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_ascend_npu_torchtitan.yml b/.github/workflows/_ascend_npu_torchtitan.yml index 13775d5..5070423 100644 --- a/.github/workflows/_ascend_npu_torchtitan.yml +++ b/.github/workflows/_ascend_npu_torchtitan.yml @@ -122,6 +122,6 @@ jobs: python -m tests.integration_tests.run_tests --test_name artifacts-to-be-uploaded --ngpu ${{ steps.check_npu.outputs.npu_count }} || true - name: Run torchtitan unittest - working-directory: torchtitan· + working-directory: torchtitan run: | pytest tests/unit_tests --cov=. --cov-report=xml --durations=20 -vv