Skip to content

Commit 72bac53

Browse files
authored
【CI】add npu cases (#1475)
* add npu cases
* debug npu
* debug run
* more cases
* more cases
* set gpu_per_node
* debug ep8
* fix path
* add gpu vs npu case
* update threshold
* update threshold
* ready to PR
* remove tags when resume
* add ignore in ut
1 parent 402a6c6 commit 72bac53

15 files changed

+743
-9
lines changed
Lines changed: 57 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,57 @@
1+
name: ete_test_npu
2+
3+
permissions:
4+
contents: write
5+
pages: write
6+
7+
on:
8+
workflow_dispatch:
9+
inputs:
10+
repo_org:
11+
required: false
12+
description: 'Tested repository organization name. Default is InternLM'
13+
type: string
14+
default: 'InternLM/xtuner'
15+
repo_ref:
16+
required: false
17+
description: 'Set branch or tag or commit id. Default is "main"'
18+
type: string
19+
default: 'main'
20+
schedule:
21+
- cron: '00 21 * * 0-4'
22+
23+
jobs:
24+
ete_test:
25+
if: ${{!cancelled() }}
26+
runs-on: [d_cluster]
27+
steps:
28+
- name: Clean workdir
29+
run: sudo git clean -ffdx
30+
- name: Clone repository
31+
uses: actions/checkout@v2
32+
with:
33+
repository: ${{ github.event.inputs.repo_org || 'InternLM/xtuner' }}
34+
ref: ${{github.event.inputs.repo_ref || 'main'}}
35+
- name: run-test
36+
run: |
37+
source activate npuci
38+
unset HTTP_PROXY;unset HTTPS_PROXY;unset http_proxy;unset https_proxy;
39+
export DEVICE=npu && pytest autotest/test_all.py -m all -n 1 -vv --run_id ${{ github.run_id }}
40+
41+
- name: Upload Artifacts
42+
if: ${{ !cancelled() }}
43+
uses: actions/upload-artifact@v4
44+
with:
45+
path: ${{ github.workspace }}/${{ github.run_id }}
46+
if-no-files-found: ignore
47+
retention-days: 7
48+
name: npu-e2e-${{ github.run_id }}
49+
50+
- name: Deploy to GitHub Pages
51+
if: ${{ !cancelled() }}
52+
uses: JamesIves/github-pages-deploy-action@v4
53+
with:
54+
token: ${{ github.token }}
55+
branch: gh-pages
56+
folder: ./${{ github.run_id }}
57+
target-folder: ${{ github.run_id }}

.github/workflows/unit_test.yaml

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,8 @@ on:
88
- "docs/**"
99
- "**.md"
1010
- "autotest/**"
11-
- ".github/workflows/e2e_test.yaml "
11+
- ".github/workflows/e2e_test.yaml"
12+
- ".github/workflows/e2e_test_npu.yaml"
1213
- ".github/workflows/lint.yml"
1314
env:
1415
WORKSPACE_PREFIX: $(echo $GITHUB_WORKSPACE |cut -d '/' -f 1-5)

autotest/cluster/clusterx.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -35,7 +35,7 @@ def execute_task(self, task_config: Dict[str, Any]):
3535
all_command.append(f"export {env}")
3636

3737
all_command.append(command)
38-
run_command = "\n".join(all_command)
38+
run_command = "; ".join(all_command)
3939

4040
try:
4141
job_name = "-".join([task_config["type"], task_config["case_name"], task_config["run_id"]])

autotest/config-npu.yaml

Lines changed: 205 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,205 @@
1+
base_path:
2+
base_output_path: /mnt/hwfile/vc-intern-delivery/qa-llm-cicd/test_output
3+
base_baseline_path: /mnt/hwfile/vc-intern-delivery/qa-llm-cicd/xtuner_baseline
4+
5+
default_config:
6+
train:
7+
resource:
8+
gpus_per_task: 16
9+
cpus_per_task: 256
10+
memory_per_task: 1920
11+
image: registry2.d.pjlab.org.cn/ccr-yehaochen/910c:xtuner_rc2-20251011-2
12+
envs:
13+
- HF_HUB_CACHE=/mnt/shared-storage-user/auto-eval-pipeline/opencompass/models/hf_hub
14+
eval:
15+
resource:
16+
gpus_per_task: 0
17+
cpus_per_task: 16
18+
memory_per_task: 128
19+
image: registry.h.pjlab.org.cn/ailab-puyu/auto-eval:ld_0101_oc_8ee07ac_v3
20+
envs:
21+
- HF_HUB_CACHE=/mnt/shared-storage-user/auto-eval-pipeline/opencompass/models/hf_hub
22+
- HF_DATASETS_CACHE=/mnt/shared-storage-user/auto-eval-pipeline/opencompass/llmeval/hf_cache
23+
- COMPASS_DATA_CACHE=/mnt/shared-storage-user/auto-eval-pipeline/opencompass/llmeval/compass_data_cache
24+
- HF_DATASETS_OFFLINE=1
25+
- HF_HUB_OFFLINE=1
26+
27+
case:
28+
npu-qwen3-sft:
29+
-
30+
type: sft
31+
parameters:
32+
config: autotest/config/npu_qwen3.py
33+
output_path: /mnt/hwfile/vc-intern-delivery/qa-llm-cicd/test_output
34+
resource:
35+
num_nodes: 1
36+
envs:
37+
- QWEN3_MOE_PATH=/mnt/hwfile/vc-intern-delivery/qa-llm-cicd/qa_test_models/Qwen3-30B-A3B
38+
- ALPACA_PATH=/mnt/hwfile/vc-intern-delivery/qa-llm-cicd/xtuner_resource/datasets/alpaca
39+
- XTUNER_DETERMINISTIC=true
40+
- TORCH_NPU_USE_HCCL=1
41+
- PIP_INDEX_URL=http://pkg.pjlab.org.cn/repository/pypi-tsinghua/simple
42+
- PIP_TRUSTED_HOST=pkg.pjlab.org.cn
43+
- RANK=0
44+
assert_info:
45+
base_metric: npu-qwen3-sft/812c1021/tracker.jsonl
46+
check_metrics:
47+
grad_norm: 0.000001
48+
loss/reduced_llm_loss: 0.000001
49+
lr: 0
50+
memory/max_memory_GB: 0.2
51+
runtime_info/tgs: 0.05
52+
runtime_info/text_tokens: 0
53+
timeout: 10800
54+
55+
npu-qwen3-sft-ep8:
56+
-
57+
type: sft
58+
parameters:
59+
config: autotest/config/npu_qwen3_moe_30BA3_ep8.py
60+
output_path: /mnt/hwfile/vc-intern-delivery/qa-llm-cicd/test_output
61+
resource:
62+
num_nodes: 1
63+
envs:
64+
- QWEN3_MOE_PATH=/mnt/hwfile/vc-intern-delivery/qa-llm-cicd/qa_test_models/Qwen3-30B-A3B
65+
- ALPACA_PATH=/mnt/hwfile/vc-intern-delivery/qa-llm-cicd/xtuner_resource/datasets/alpaca
66+
- XTUNER_DETERMINISTIC=true
67+
- TORCH_NPU_USE_HCCL=1
68+
- PIP_INDEX_URL=http://pkg.pjlab.org.cn/repository/pypi-tsinghua/simple
69+
- PIP_TRUSTED_HOST=pkg.pjlab.org.cn
70+
assert_info:
71+
base_metric: npu-qwen3-sft-ep8/812c1021/tracker.jsonl
72+
check_metrics:
73+
grad_norm: 0.000001
74+
loss/reduced_llm_loss: 0.000001
75+
lr: 0
76+
memory/max_memory_GB: 0.2
77+
runtime_info/tgs: 0.5
78+
runtime_info/text_tokens: 0
79+
timeout: 10800
80+
-
81+
type: sft
82+
pre_action:
83+
command: 'python ./autotest/utils/update_meta.py'
84+
parameters:
85+
config: autotest/config/npu_qwen3_moe_30BA3_ep8_resume.py
86+
output_path: /mnt/hwfile/vc-intern-delivery/qa-llm-cicd/test_output
87+
resource:
88+
num_nodes: 1
89+
cpus_per_task: 80
90+
envs:
91+
- QWEN3_MOE_PATH=/mnt/hwfile/vc-intern-delivery/qa-llm-cicd/qa_test_models/Qwen3-30B-A3B
92+
- ALPACA_PATH=/mnt/hwfile/vc-intern-delivery/qa-llm-cicd/xtuner_resource/datasets/alpaca
93+
- XTUNER_DETERMINISTIC=true
94+
- TORCH_NPU_USE_HCCL=1
95+
- PIP_INDEX_URL=http://pkg.pjlab.org.cn/repository/pypi-tsinghua/simple
96+
- PIP_TRUSTED_HOST=pkg.pjlab.org.cn
97+
assert_info:
98+
base_metric: npu-qwen3-sft-ep8/812c1021_resume/tracker.jsonl
99+
check_metrics:
100+
grad_norm: 0.000001
101+
loss/reduced_llm_loss: 0.000001
102+
lr: 0
103+
memory/max_memory_GB: 0.2
104+
runtime_info/text_tokens: 0
105+
timeout: 10800
106+
107+
npu-qwen3-sft-tp2:
108+
-
109+
type: sft
110+
parameters:
111+
config: autotest/config/npu_qwen3_moe_30BA3_tp2.py
112+
output_path: /mnt/hwfile/vc-intern-delivery/qa-llm-cicd/test_output
113+
resource:
114+
envs:
115+
- QWEN3_MOE_PATH=/mnt/hwfile/vc-intern-delivery/qa-llm-cicd/qa_test_models/Qwen3-30B-A3B
116+
- ALPACA_PATH=/mnt/hwfile/vc-intern-delivery/qa-llm-cicd/xtuner_resource/datasets/alpaca
117+
- XTUNER_DETERMINISTIC=true
118+
- XTUNER_USE_FA3=1
119+
- TORCH_NPU_USE_HCCL=1
120+
- PIP_INDEX_URL=http://pkg.pjlab.org.cn/repository/pypi-tsinghua/simple
121+
- PIP_TRUSTED_HOST=pkg.pjlab.org.cn
122+
assert_info:
123+
base_metric: npu-qwen3-sft-tp2/812c1021/tracker.jsonl
124+
check_metrics:
125+
grad_norm: 0.000001
126+
loss/reduced_llm_loss: 0.000001
127+
lr: 0
128+
memory/max_memory_GB: 0.2
129+
runtime_info/tgs: 0.05
130+
runtime_info/text_tokens: 0
131+
timeout: 10800
132+
133+
npu-qwen3-sft-recompute:
134+
-
135+
type: sft
136+
parameters:
137+
config: autotest/config/npu_qwen3_recompute.py
138+
output_path: /mnt/hwfile/vc-intern-delivery/qa-llm-cicd/test_output
139+
resource:
140+
num_nodes: 2
141+
cpus_per_task: 256
142+
envs:
143+
- QWEN3_MOE_PATH=/mnt/hwfile/vc-intern-delivery/qa-llm-cicd/qa_test_models/Qwen3-30B-A3B
144+
- ALPACA_PATH=/mnt/hwfile/vc-intern-delivery/qa-llm-cicd/xtuner_resource/datasets/alpaca
145+
- XTUNER_DETERMINISTIC=true
146+
- XTUNER_ACTIVATION_OFFLOAD=1
147+
- TORCH_NPU_USE_HCCL=1
148+
- PIP_INDEX_URL=http://pkg.pjlab.org.cn/repository/pypi-tsinghua/simple
149+
- PIP_TRUSTED_HOST=pkg.pjlab.org.cn
150+
assert_info:
151+
base_metric: npu-qwen3-sft-recompute/812c1021/tracker.jsonl
152+
check_metrics:
153+
grad_norm: 0.000001
154+
loss/reduced_llm_loss: 0.000001
155+
lr: 0
156+
memory/max_memory_GB: 0.2
157+
runtime_info/tgs: 0.05
158+
runtime_info/text_tokens: 0
159+
timeout: 10800
160+
161+
npu-qwen3-sft-16nums:
162+
-
163+
type: sft
164+
parameters:
165+
config: autotest/config/npu_qwen3_16nums.py
166+
output_path: /mnt/hwfile/vc-intern-delivery/qa-llm-cicd/test_output
167+
resource:
168+
num_nodes: 2
169+
envs:
170+
- QWEN3_MOE_PATH=/mnt/hwfile/vc-intern-delivery/qa-llm-cicd/qa_test_models/Qwen3-30B-A3B
171+
- ALPACA_PATH=/mnt/hwfile/vc-intern-delivery/qa-llm-cicd/xtuner_resource/datasets/alpaca
172+
- XTUNER_DETERMINISTIC=true
173+
- TORCH_NPU_USE_HCCL=1
174+
- PIP_INDEX_URL=http://pkg.pjlab.org.cn/repository/pypi-tsinghua/simple
175+
- PIP_TRUSTED_HOST=pkg.pjlab.org.cn
176+
assert_info:
177+
base_metric: npu-qwen3-sft/812c1021/tracker.jsonl
178+
check_metrics:
179+
grad_norm: 0.000001
180+
loss/reduced_llm_loss: 0.000001
181+
lr: 0
182+
timeout: 10800
183+
184+
npu-qwen3-sft-celoss-vs-gpu:
185+
-
186+
type: sft
187+
parameters:
188+
config: autotest/config/npu_qwen3_sft_celoss.py
189+
output_path: /mnt/hwfile/vc-intern-delivery/qa-llm-cicd/test_output
190+
resource:
191+
num_nodes: 1
192+
envs:
193+
- QWEN3_MOE_PATH=/mnt/hwfile/vc-intern-delivery/qa-llm-cicd/qa_test_models/Qwen3-30B-A3B
194+
- ALPACA_PATH=/mnt/hwfile/vc-intern-delivery/qa-llm-cicd/xtuner_resource/datasets/alpaca
195+
- XTUNER_DETERMINISTIC=true
196+
- TORCH_NPU_USE_HCCL=1
197+
- PIP_INDEX_URL=http://pkg.pjlab.org.cn/repository/pypi-tsinghua/simple
198+
- PIP_TRUSTED_HOST=pkg.pjlab.org.cn
199+
assert_info:
200+
base_metric: npu-qwen3-sft-celoss-vs-gpu/812c1021/tracker.jsonl
201+
check_metrics:
202+
grad_norm: 0.02
203+
loss/reduced_llm_loss: 0.01
204+
lr: 0.01
205+
timeout: 10800

autotest/config.yaml

Lines changed: 21 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -91,7 +91,6 @@ case:
9191
loss/reduced_llm_loss: 0.000001
9292
lr: 0
9393
memory/max_memory_GB: 0.2
94-
runtime_info/tgs: 0.05
9594
runtime_info/text_tokens: 0
9695
timeout: 10800
9796

@@ -186,5 +185,26 @@ case:
186185
grad_norm: 1
187186
loss/reduced_llm_loss: 0.02
188187
lr: 0
188+
timeout: 10800
189+
190+
qwen3-sft-celoss:
191+
-
192+
type: sft
193+
parameters:
194+
config: autotest/config/qwen3_sft_celoss.py
195+
output_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output
196+
resource:
197+
envs:
198+
- QWEN3_MOE_PATH=/mnt/shared-storage-user/llmrazor-share/model/Qwen3-30B-A3B
199+
- ALPACA_PATH=/mnt/shared-storage-user/llmrazor-share/data/alpaca
200+
- XTUNER_DETERMINISTIC=true
201+
assert_info:
202+
base_metric: qwen3-sft-celoss/812c1021/tracker.jsonl
203+
check_metrics:
204+
grad_norm: 0.000001
205+
loss/reduced_llm_loss: 0.000001
206+
lr: 0
207+
memory/max_memory_GB: 0.2
189208
runtime_info/tgs: 0.05
209+
runtime_info/text_tokens: 0
190210
timeout: 10800

autotest/config/npu_qwen3.py

Lines changed: 55 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,55 @@
1+
import os
2+
3+
from xtuner.v1.config import (
4+
AdamWConfig,
5+
FSDPConfig,
6+
LRConfig,
7+
)
8+
from xtuner.v1.datasets import FTDPTokenizeFnConfig
9+
from xtuner.v1.datasets.config import DataloaderConfig, DatasetConfig
10+
from xtuner.v1.loss.ce_loss import CELossConfig
11+
from xtuner.v1.model.moe.qwen3 import Qwen3MoE30BA3Config
12+
from xtuner.v1.train import TrainerConfig
13+
14+
15+
QWEN3_MOE_PATH = os.environ["QWEN3_MOE_PATH"]
16+
ALPACA_PATH = os.environ["ALPACA_PATH"]
17+
18+
19+
moe_cfg = Qwen3MoE30BA3Config()
20+
optim_cfg = AdamWConfig(lr=6e-05)
21+
lr_cfg = LRConfig(lr_type="cosine", lr_min=1e-6)
22+
fsdp_cfg = FSDPConfig(
23+
torch_compile=False,
24+
cpu_offload=False,
25+
ep_size=moe_cfg.ep_size,
26+
)
27+
28+
dataset_config = [
29+
{
30+
"dataset": DatasetConfig(name="alpaca", anno_path=ALPACA_PATH, sample_ratio=1.0),
31+
"tokenize_fn": FTDPTokenizeFnConfig(max_length=16384),
32+
},
33+
]
34+
35+
dataloader_config = DataloaderConfig(pack_max_length=16384)
36+
37+
loss_cfg = CELossConfig(mode="chunk", chunk_size=1024) # CELossConfig()
38+
39+
40+
trainer = TrainerConfig(
41+
load_from=QWEN3_MOE_PATH,
42+
model_cfg=moe_cfg,
43+
optim_cfg=optim_cfg,
44+
fsdp_cfg=fsdp_cfg,
45+
dataset_cfg=dataset_config,
46+
dataloader_cfg=dataloader_config,
47+
lr_cfg=lr_cfg,
48+
loss_cfg=loss_cfg,
49+
tokenizer_path=QWEN3_MOE_PATH,
50+
global_batch_size=64,
51+
total_epoch=1,
52+
work_dir=f"/mnt/hwfile/vc-intern-delivery/qa-llm-cicd/test_output/{os.environ['GITHUB_RUN_ID']}/npu-qwen3-sft/sft",
53+
seed=0,
54+
dist_backend="npu:hccl",
55+
)

0 commit comments

Comments (0)