Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
137 changes: 137 additions & 0 deletions .github/workflows/_ascend_npu_torchtitan.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
name: "_ascend_npu_torchtitan"

# Reusable workflow: it has no trigger of its own and only runs when another
# workflow references it through `uses:` (workflow_call).
on:
  workflow_call:
    inputs:
      runner:
        required: true
        type: string
        description: "The runner selected to run on"
      image:
        required: true
        type: string
        description: "The docker image which will be loaded"
      device:
        # No step in this workflow reads this input, so it is optional:
        # a caller that omits it must not fail workflow validation.
        required: false
        type: string
        description: "The device selected to run on"
      torch-artifact:
        # When empty, the torch wheel download/install steps are skipped.
        required: false
        type: string
        description: "The distribution artifact name of torch"
      torch-npu-artifact:
        required: true
        type: string
        description: "The distribution artifact name of torch_npu"
    secrets:
      pr-token:
        description: "A token used to create a pull request"
        # No step in this workflow consumes the token, so callers are not
        # forced to provide it.
        required: false

# `run` steps do not source ~/.profile or ~/.bashrc unless bash is started as
# a login shell, so every step defaults to "bash -el {0}":
#   -l  login shell, so the profile scripts are loaded; this is what
#       activates the ascend-toolkit environment variables
#   -e  stop the step at the first failing command
defaults:
  run:
    shell: bash -el {0}

jobs:
  # Single job: prepares the container, installs torch / torch_npu from the
  # provided artifacts, then runs the torchtitan integration and unit tests.
  setup_environment:
    name: run torchtitan tests
    runs-on: ${{ inputs.runner }}
    container:
      image: ${{ inputs.image }}
      env:
        # Send Hugging Face requests to the hf-mirror.com endpoint.
        HF_ENDPOINT: https://hf-mirror.com
    outputs:
      # Each entry must reference the `id` of a step defined below; a
      # reference to an unknown step id silently evaluates to an empty string.
      torch_version: ${{ steps.list-torch-version.outputs.torch-version }}
      npu_info: ${{ steps.check_npu.outputs.npu_info }}
    steps:
      - name: Show NPU info
        id: check_npu
        run: |
          # Capture once so the report is both printed to the log and
          # published as the multi-line `npu_info` step output.
          npu_info=$(npu-smi info)
          echo "${npu_info}"
          {
            echo "npu_info<<NPU_INFO_EOF"
            echo "${npu_info}"
            echo "NPU_INFO_EOF"
          } >> "$GITHUB_OUTPUT"

      - name: Config mirrors
        run: |
          sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
          pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple

      - name: Install system dependencies
        run: |
          apt-get update
          apt-get install -y \
            git gcc g++ make cmake ninja-build curl \
            libgl1 libglib2.0-0 libsndfile1

      - name: Config git
        run: |
          git config --global --add safe.directory "$GITHUB_WORKSPACE"
          git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/

      - name: Checkout
        uses: actions/checkout@v4

      # torchtitan is checked out into ./torchtitan, next to this repository.
      - name: Checkout benchmark
        uses: actions/checkout@v4
        with:
          repository: pytorch/torchtitan
          path: torchtitan

      - name: Download torch artifact
        if: ${{ inputs.torch-artifact }}
        uses: actions/download-artifact@v4
        with:
          name: ${{ inputs.torch-artifact }}

      - name: Install torch
        if: ${{ inputs.torch-artifact }}
        env:
          # Pass the input through the environment instead of expanding it
          # inside the script text, so its value cannot be parsed as shell.
          TORCH_ARTIFACT: ${{ inputs.torch-artifact }}
        run: |
          pip install "${TORCH_ARTIFACT}"

      # Only runs when no torch artifact was supplied by the caller.
      - name: Install torch_npu dependencies
        if: ${{ !inputs.torch-artifact }}
        run: |
          pip install -r https://raw.githubusercontent.com/Ascend/pytorch/refs/heads/master/requirements.txt

      - name: List torch version
        id: list-torch-version
        shell: bash
        run: |
          torch_version=$(python -c "import torch; print(torch.__version__)")
          echo "torch-version=${torch_version}" >> "$GITHUB_OUTPUT"

      - name: Download torch_npu artifact
        uses: actions/download-artifact@v4
        with:
          name: ${{ inputs.torch-npu-artifact }}
          path: ascend_npu

      - name: Install torch_npu
        working-directory: ascend_npu
        env:
          TORCH_NPU_ARTIFACT: ${{ inputs.torch-npu-artifact }}
        run: |
          pip install "${TORCH_NPU_ARTIFACT}"

      # NOTE(review): this installs the requirements.txt at the workspace
      # root, not the one inside ./torchtitan - confirm that is intended.
      - name: Install project dependencies
        run: |
          pip install -r requirements.txt
          pip install pytest pytest-cov tyro

      - name: Show environment info
        run: |
          npu_is_available=$(python -c "import torch; print(torch.npu.is_available())")
          npu_count=$(python -c "import torch; print(torch.npu.device_count())")
          echo "NPU is available: ${npu_is_available}"
          echo "NPU count: ${npu_count}"
          pip list | grep -E 'torch|numpy'

      - name: Run torchtitan integration_test
        working-directory: torchtitan
        run: |
          # Use every NPU that torch reports on this runner.
          npu_count=$(python -c "import torch; print(torch.npu.device_count())")
          python ./tests/integration_tests.py artifacts-to-be-uploaded --ngpu "${npu_count}"

      - name: Run torchtitan unittest
        working-directory: torchtitan
        run: |
          pytest ./tests/unit_tests --cov=. --cov-report=xml --durations=20 -vv
25 changes: 5 additions & 20 deletions .github/workflows/ascend_npu_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ on:
- ".github/workflows/_ascend_npu_build_torch_npu.yml"
- ".github/workflows/_ascend_npu_ut.yml"
- ".github/workflows/_ascend_npu_benchmark.yml"
- ".github/workflows/_ascend_npu_torchtitan.yml"
- ".ci/**"
- "ascend_npu/**"
- "src/**"
Expand All @@ -23,6 +24,7 @@ on:
- ".github/workflows/_ascend_npu_build_torch_npu.yml"
- ".github/workflows/_ascend_npu_ut.yml"
- ".github/workflows/_ascend_npu_benchmark.yml"
- ".github/workflows/_ascend_npu_torchtitan.yml"
- ".ci/**"
- "ascend_npu/**"
- "src/**"
Expand Down Expand Up @@ -118,36 +120,19 @@ jobs:
image: ${{ needs.prepare.outputs.image }}
torch-artifact: ${{ needs.build-torch.outputs.torch-artifact }}

test:
name: Test torch_npu
torchtitan:
name: Run torchtitan
needs:
- prepare
- build-torch
- build
if: |
!cancelled() && github.event_name != 'repository_dispatch' &&
(success() || (needs.build-torch.result == 'skipped' && needs.build.result == 'success'))
uses: ./.github/workflows/_ascend_npu_ut.yml
uses: ./.github/workflows/_ascend_npu_torchtitan.yml
with:
runner: ${{ needs.prepare.outputs.runner }}
image: ${{ needs.prepare.outputs.image }}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Use ascendai/cann:8.0.0.beta1-910b-ubuntu22.04-py3.10 temporarily

torch-artifact: ${{ needs.build-torch.outputs.torch-artifact }}
torch-npu-artifact: ${{ needs.build.outputs.torch-npu-artifact }}

benchmark:
name: Run benchmarks
Comment on lines -137 to -138
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why remove this?

needs:
- prepare
- build-torch
- build
if: |
!cancelled() && github.event_name != 'repository_dispatch' &&
(success() || (needs.build-torch.result == 'skipped' && needs.build.result == 'success'))
uses: ./.github/workflows/_ascend_npu_benchmark.yml
with:
runner: ${{ needs.prepare.outputs.runner }}
image: ${{ needs.prepare.outputs.image }}
torch-artifact: ${{ needs.build-torch.outputs.torch-artifact }}
torch-npu-artifact: ${{ needs.build.outputs.torch-npu-artifact }}
secrets:
pr-token: ${{ secrets.COSDT_BOT_TOKEN }}