Skip to content

Commit 0e97e53

Browse files
authored
Merge branch 'main' into main
2 parents d6ac69f + cfcc43b commit 0e97e53

7 files changed

+223
-60
lines changed

.github/workflows/_ascend_npu_benchmark.yml

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,6 @@ on:
1111
required: true
1212
type: string
1313
description: "The docker image which will be loaded"
14-
device:
15-
required: true
16-
type: string
17-
description: "The device selected to run on"
1814
torch-artifact:
1915
required: false
2016
type: string
@@ -28,13 +24,6 @@ on:
2824
description: "A token used to create a pull request"
2925
required: true
3026

31-
# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
32-
# declared as "shell: bash -el {0}" on steps that need to be properly activated.
33-
# It's used to activate ascend-toolkit environment variables.
34-
defaults:
35-
run:
36-
shell: bash -el {0}
37-
3827
jobs:
3928
benchmark:
4029
name: run benchmarks for torch_npu

.github/workflows/_ascend_npu_build_torch.yml

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -26,13 +26,6 @@ on:
2626
description: "The distribution artifact name of torch"
2727
value: ${{ jobs.build.outputs.dist-name }}
2828

29-
# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
30-
# declared as "shell: bash -el {0}" on steps that need to be properly activated.
31-
# It's used to activate ascend-toolkit environment variables.
32-
defaults:
33-
run:
34-
shell: bash -el {0}
35-
3629
jobs:
3730
build:
3831
name: build torch for ${{ inputs.pr-number && format('#{0}', inputs.pr-number) || inputs.ref }}

.github/workflows/_ascend_npu_build_torch_npu.yml

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,6 @@ on:
1111
required: true
1212
type: string
1313
description: "The docker image which will be used to build"
14-
device:
15-
required: true
16-
type: string
17-
description: "The device selected to run on"
1814
torch-artifact:
1915
required: false
2016
type: string
@@ -24,13 +20,6 @@ on:
2420
description: "The distribution artifact name of torch_npu"
2521
value: ${{ jobs.build.outputs.dist-name }}
2622

27-
# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
28-
# declared as "shell: bash -el {0}" on steps that need to be properly activated.
29-
# It's used to activate ascend-toolkit environment variables.
30-
defaults:
31-
run:
32-
shell: bash -el {0}
33-
3423
jobs:
3524
build:
3625
name: build torch_npu
Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
name: "_ascend_npu_torchtitan"
2+
3+
on:
4+
workflow_call:
5+
inputs:
6+
runner:
7+
required: true
8+
type: string
9+
description: "The runner selected to run on"
10+
image:
11+
required: true
12+
type: string
13+
description: "The docker image which will be loaded"
14+
device:
15+
required: true
16+
type: string
17+
description: "The device selected to run on"
18+
torch-artifact:
19+
required: false
20+
type: string
21+
description: "The distribution artifact name of torch"
22+
torch-npu-artifact:
23+
required: true
24+
type: string
25+
description: "The distribution artifact name of torch_npu"
26+
secrets:
27+
pr-token:
28+
description: "A token used to create a pull request"
29+
required: true
30+
31+
# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
32+
# declared as "shell: bash -el {0}" on steps that need to be properly activated.
33+
# It's used to activate ascend-toolkit environment variables.
34+
35+
defaults:
36+
run:
37+
shell: bash -el {0}
38+
39+
jobs:
40+
setup_environment:
41+
name: run torchtitan tests
42+
runs-on: ${{ inputs.runner }}
43+
container:
44+
image: ${{ inputs.image }}
45+
env:
46+
HF_ENDPOINT: https://hf-mirror.com
47+
outputs:
48+
# Must reference the id of the step that writes `torch-version` to $GITHUB_OUTPUT ("list-torch-version").
torch_version: ${{ steps.list-torch-version.outputs.torch-version }}
49+
npu_info: ${{ steps.check_npu.outputs.npu_info }}
50+
steps:
51+
- name: Show NPU info
52+
run: |
53+
npu-smi info
54+
55+
- name: Config mirrors
56+
run: |
57+
sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
58+
pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
59+
60+
- name: Install system dependencies
61+
run: |
62+
apt-get update
63+
apt-get install -y \
64+
git gcc g++ make cmake ninja-build curl \
65+
libgl1 libglib2.0-0 libsndfile1
66+
67+
- name: Config git
68+
run: |
69+
git config --global --add safe.directory "$GITHUB_WORKSPACE"
70+
git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/
71+
72+
- name: Checkout
73+
uses: actions/checkout@v4
74+
75+
- name: Checkout benchmark
76+
uses: actions/checkout@v4
77+
with:
78+
repository: pytorch/torchtitan
79+
path: torchtitan
80+
81+
- name: Download torch artifact
82+
if: ${{ inputs.torch-artifact }}
83+
uses: actions/download-artifact@v4
84+
with:
85+
name: ${{ inputs.torch-artifact }}
86+
87+
- name: Install torch
88+
if: ${{ inputs.torch-artifact }}
89+
run: |
90+
pip install ${{ inputs.torch-artifact }}
91+
92+
- name: Install torch_npu dependencies
93+
if: ${{ !inputs.torch-artifact }}
94+
run: |
95+
pip install -r https://raw.githubusercontent.com/Ascend/pytorch/refs/heads/master/requirements.txt
96+
97+
- name: List torch version
98+
id: list-torch-version
99+
shell: bash
100+
run: |
101+
torch_version=$(python -c "import torch; print(torch.__version__)")
102+
echo "torch-version=${torch_version}" >> $GITHUB_OUTPUT
103+
104+
- name: Download torch_npu artifact
105+
uses: actions/download-artifact@v4
106+
with:
107+
name: ${{ inputs.torch-npu-artifact }}
108+
path: ascend_npu
109+
110+
- name: Install torch_npu
111+
working-directory: ascend_npu
112+
run: |
113+
pip install ${{ inputs.torch-npu-artifact }}
114+
115+
- name: Install project dependencies
116+
run: |
117+
pip install -r requirements.txt
118+
pip install pytest pytest-cov tyro
119+
120+
- name: Show environment info
121+
run: |
122+
npu_is_available=$(python -c "import torch; print(torch.npu.is_available())")
123+
npu_count=$(python -c "import torch; print(torch.npu.device_count())")
124+
echo "NPU is available: ${npu_is_available}"
125+
echo "NPU count: ${npu_count}"
126+
pip list | grep -E 'torch|numpy'
127+
128+
- name: Run torchtitan integration_test
129+
working-directory: torchtitan
130+
run: |
131+
npu_count=$(python -c "import torch; print(torch.npu.device_count())")
132+
python ./tests/integration_tests.py artifacts-to-be-uploaded --ngpu ${npu_count}
133+
134+
- name: Run torchtitan unittest
135+
working-directory: torchtitan
136+
run: |
137+
pytest ./tests/unit_tests --cov=. --cov-report=xml --durations=20 -vv

.github/workflows/_ascend_npu_ut.yml

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,6 @@ on:
1111
required: true
1212
type: string
1313
description: "The docker image which will be loaded"
14-
device:
15-
required: true
16-
type: string
17-
description: "The device selected to run on"
1814
torch-artifact:
1915
required: false
2016
type: string
@@ -24,13 +20,6 @@ on:
2420
type: string
2521
description: "The distribution artifact name of torch_npu"
2622

27-
# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
28-
# declared as "shell: bash -el {0}" on steps that need to be properly activated.
29-
# It's used to activate ascend-toolkit environment variables.
30-
defaults:
31-
run:
32-
shell: bash -el {0}
33-
3423
jobs:
3524
test:
3625
name: test torch_npu

.github/workflows/ascend_npu_test.yml

Lines changed: 6 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ on:
1010
- ".github/workflows/_ascend_npu_build_torch_npu.yml"
1111
- ".github/workflows/_ascend_npu_ut.yml"
1212
- ".github/workflows/_ascend_npu_benchmark.yml"
13+
- ".github/workflows/_ascend_npu_torchtitan.yml"
1314
- ".ci/**"
1415
- "ascend_npu/**"
1516
- "src/**"
@@ -23,6 +24,7 @@ on:
2324
- ".github/workflows/_ascend_npu_build_torch_npu.yml"
2425
- ".github/workflows/_ascend_npu_ut.yml"
2526
- ".github/workflows/_ascend_npu_benchmark.yml"
27+
- ".github/workflows/_ascend_npu_torchtitan.yml"
2628
- ".ci/**"
2729
- "ascend_npu/**"
2830
- "src/**"
@@ -56,20 +58,6 @@ on:
5658
- ascendai/cann:latest
5759
default: "ascendai/cann:latest"
5860
description: "The docker image which will be loaded"
59-
device:
60-
required: true
61-
type: choice
62-
options:
63-
- /dev/davinci1
64-
- /dev/davinci2
65-
- /dev/davinci3
66-
- /dev/davinci4
67-
- /dev/davinci5
68-
- /dev/davinci6
69-
- /dev/davinci7
70-
- /dev/davinci8
71-
default: "/dev/davinci5"
72-
description: "The device selected to run on"
7361

7462
# Only cancel the previous runs when triggered by a pull_request event
7563
#
@@ -98,7 +86,6 @@ jobs:
9886
id: set-env
9987
run: |
10088
echo "runner=${{ github.event.inputs.runner || 'linux-arm64-npu-1' }}" >> $GITHUB_OUTPUT
101-
echo "device=${{ github.event.inputs.device || '/dev/davinci5' }}" >> $GITHUB_OUTPUT
10289
echo "image=${{ github.event.inputs.image || 'ascendai/cann:latest' }}" >> $GITHUB_OUTPUT
10390
10491
# TODO(shink): List ghstack PR's ref
@@ -131,23 +118,21 @@ jobs:
131118
with:
132119
runner: ${{ needs.prepare.outputs.runner }}
133120
image: ${{ needs.prepare.outputs.image }}
134-
device: ${{ needs.prepare.outputs.device }}
135121
torch-artifact: ${{ needs.build-torch.outputs.torch-artifact }}
136122

137-
test:
138-
name: Test torch_npu
123+
torchtitan:
124+
name: Run torchtitan
139125
needs:
140126
- prepare
141127
- build-torch
142128
- build
143129
if: |
144130
!cancelled() && github.event_name != 'repository_dispatch' &&
145131
(success() || (needs.build-torch.result == 'skipped' && needs.build.result == 'success'))
146-
uses: ./.github/workflows/_ascend_npu_ut.yml
132+
uses: ./.github/workflows/_ascend_npu_torchtitan.yml
147133
with:
148134
runner: ${{ needs.prepare.outputs.runner }}
149135
image: ${{ needs.prepare.outputs.image }}
150-
device: ${{ needs.prepare.outputs.device }}
151136
torch-artifact: ${{ needs.build-torch.outputs.torch-artifact }}
152137
torch-npu-artifact: ${{ needs.build.outputs.torch-npu-artifact }}
153138

@@ -190,3 +175,4 @@ jobs:
190175
torch-npu-artifact: ${{ needs.build.outputs.torch-npu-artifact }}
191176
secrets:
192177
hf_token: ${{ secrets.HF_TOKEN }}
178+

CONTRIBUTING_zh.md

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
# 贡献指南
2+
3+
## 贡献流程
4+
5+
1. 在本仓库提交 Pull Request 并得到充分验证
6+
2. Pull Request 合入并稳定运行后再同步提交到 [pytorch-fdn/oota][0]
7+
8+
[0]: https://github.com/pytorch-fdn/oota
9+
10+
## 整体流程
11+
12+
触发任务后,会依次执行下列子工作流:
13+
14+
1. 编译 torch
15+
16+
- 只有监听到上游 PyTorch 仓库 PR 事件时,才会基于上游 PR 的代码从源码编译 torch
17+
- 否则会安装 torch_npu [requirements.txt][1] 中指定的 torch 版本
18+
19+
2. 编译 torch_npu
20+
21+
- 核心是执行 `bash torch_npu/ci/build.sh` 进行编译
22+
23+
3. 执行 torch_npu 单元测试
24+
25+
- 核心是执行 `python torch_npu/ci/access_control_test.py` 进行测试
26+
27+
4. 执行 torch_npu [torchbenchmark][2]
28+
29+
- 核心是执行 `python benchmark/run_benchmark.py test_bench` 进行测试
30+
- 周期触发时会自动将 TorchBenchmark 的测试结果提交 PR,例如:[#46][3]
31+
32+
5. 执行其他 PyTorch 生态项目测试(如 [torchtitan][4], [torchtune][5], [torchchat][6])
33+
34+
[1]: https://github.com/Ascend/pytorch/blob/master/requirements.txt
35+
[2]: https://github.com/pytorch/benchmark
36+
[3]: https://github.com/cosdt/pytorch-integration-tests/pull/46
37+
[4]: https://github.com/pytorch/torchtitan
38+
[5]: https://github.com/pytorch/torchtune
39+
[6]: https://github.com/pytorch/torchchat
40+
41+
## 工作流触发条件
42+
43+
torch_npu 测试总入口工作流文件在 [.github/workflows/ascend_npu_test.yml](.github/workflows/ascend_npu_test.yml)
44+
其触发条件为:
45+
46+
1. `pull_request` 触发
47+
2. `workflow_dispatch` 手动触发
48+
3. `schedule` 周期触发
49+
4. `pytorch-pr-event-redispatch` 事件触发
50+
51+
其区别如下:
52+
53+
| 事件类型 | 工作流所在分支 | 触发时机 | 是否源码编译 torch |
54+
| :-------------------------: | :------------: | :------------------------------------------: | :----------------: |
55+
| pull_request | pr-branch | 提交 PR 时 | 否 |
56+
| workflow_dispatch | main | 手动触发 | 否 |
57+
| schedule | main | 每晚定期触发 1 次 | 否 |
58+
| pytorch-pr-event-redispatch | main | 每晚定期扫描 PyTorch 仓库的 PR,会触发此事件 | 是 |
59+
60+
## 代码结构
61+
62+
```
63+
.
64+
├── ascend_npu // Ascend NPU 配置/文档等
65+
├── .ci // CI 配置/文档等
66+
├── .github
67+
│ ├── actions // 自定义 action
68+
│ └── workflows // 工作流文件
69+
│ ├── ascend_npu_test.yml // torch_npu 测试总入口
70+
│ ├── _ascend_npu_build_torch.yml // torch 编译子工作流
71+
│ ├── _ascend_npu_build_torch_npu.yml // torch_npu 编译子工作流
72+
│ ├── _ascend_npu_ut.yml // torch_npu ut 子工作流
73+
│ ├── _ascend_npu_benchmark.yml // torch_npu benchmark 子工作流
74+
│ ├── dispatch-event.yml // 监听上游 PR 事件并分发
75+
│ └── redispatch-event.yml // 重新分发 PR 事件至其他仓库
76+
├── requirements.txt // 本项目依赖的 python 包
77+
├── src
78+
│ └── benchmark // TorchBenchmark 流程代码
79+
└── test // 测试代码
80+
```

0 commit comments

Comments
 (0)