Commit 5b3ee6a

run gptoss

1 parent bc35383

File tree: 4 files changed (+83, -5 lines)

4 files changed

+83
-5
lines changed

.github/workflows/e2e_test.yaml

Lines changed: 5 additions & 4 deletions
@@ -5,6 +5,7 @@ permissions:
   pages: write
 
 on:
+  push:
   workflow_dispatch:
     inputs:
       repo_org:
@@ -29,16 +30,16 @@ jobs:
         run: sudo git clean -ffdx
       - name: Clone repository
         uses: actions/checkout@v2
-        with:
-          repository: ${{ github.event.inputs.repo_org || 'InternLM/xtuner' }}
-          ref: ${{github.event.inputs.repo_ref || 'main'}}
+        #with:
+        #  repository: ${{ github.event.inputs.repo_org || 'InternLM/xtuner' }}
+        #  ref: ${{github.event.inputs.repo_ref || 'main'}}
       - name: run-test
         run: |
          source /mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/miniconda3/bin/activate
          conda activate clusterx
          conda env list
          unset HTTP_PROXY;unset HTTPS_PROXY;unset http_proxy;unset https_proxy;
-          pytest autotest/test_all.py -m all -n 1 -vv --run_id ${{ github.run_id }}
+          pytest autotest/test_all.py::test_all[gptoss-sft] -m all -n 1 -vv --run_id ${{ github.run_id }}
 
       - name: Upload Artifacts
         if: ${{ !cancelled() }}

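The key workflow change narrows the run to a single parametrized case via its pytest node ID. A minimal sketch of how such a node ID arises, assuming test_all is parametrized over the case names in autotest/config.yaml (the run_case helper below is a hypothetical stand-in for the harness's real runner, not code from this repo):

import pytest
import yaml

with open("autotest/config.yaml") as f:
    CASES = list(yaml.safe_load(f)["case"].keys())  # includes the new gptoss-sft entry

def run_case(case_name: str) -> bool:
    """Hypothetical stand-in for the harness's real case runner."""
    return case_name in CASES

@pytest.mark.all
@pytest.mark.parametrize("case_name", CASES)  # each name becomes a node ID suffix
def test_all(case_name):
    # `pytest autotest/test_all.py::test_all[gptoss-sft]` selects exactly this case
    assert run_case(case_name)

Combined with the new push: trigger, every push now exercises only the gptoss-sft case rather than the full set of cases matched by -m all.
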
autotest/cluster/clusterx.py

Lines changed: 1 addition & 1 deletion
@@ -67,7 +67,7 @@ def execute_task(self, task_config: Dict[str, Any]):
         if status in [JobStatus.SUCCEEDED]:
             run_time = time.time() - run_start_time
             if run_time >= timeout:
-                return False, 'Task succeeded, but run time is {run_time}, exceeding then {timeout}'
+                return False, f'Task succeeded, but run time is {run_time}, exceeding then {timeout}'
             else:
                 return True, "Task succeeded"
         elif status in [JobStatus.FAILED, JobStatus.STOPPED]:

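The fix here is a single character: without the f prefix, Python treats the braces as literal text rather than interpolating run_time and timeout. A quick illustration:

run_time, timeout = 3700.0, 3600

# Without the f prefix the braces are left verbatim:
print('run time is {run_time}, exceeding then {timeout}')
# -> run time is {run_time}, exceeding then {timeout}

# With the f prefix the local values are interpolated:
print(f'run time is {run_time}, exceeding then {timeout}')
# -> run time is 3700.0, exceeding then 3600
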
autotest/config.yaml

Lines changed: 22 additions & 0 deletions
@@ -209,6 +209,28 @@ case:
           runtime_info/text_tokens: 0
       timeout: 10800
 
+  gptoss-sft:
+    -
+      type: sft
+      parameters:
+        config: autotest/config/gptoss.py
+        output_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output
+        resource:
+          envs:
+            - GPTOSS_21B_PATH=/mnt/shared-storage-user/llmrazor-share/model/gpt-oss-20b-bf16
+            - ALPACA_PATH=/mnt/shared-storage-user/llmrazor-share/data/alpaca2
+            - XTUNER_DETERMINISTIC=true
+      assert_info:
+        base_metric: gptoss-sft/7b774a0e2/tracker.jsonl
+        check_metrics:
+          grad_norm: 0.000001
+          loss/reduced_llm_loss: 0.000001
+          lr: 0
+          memory/max_memory_GB: 0.2
+          runtime_info/tgs: 0.05
+          runtime_info/text_tokens: 0
+      timeout: 10800
+
   qwen3-rl-lmdeploy:
     -
       type: rl

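The new assert_info block pins the run to a stored baseline tracker.jsonl, with one tolerance per metric: 0 for values that must match exactly (plausible given XTUNER_DETERMINISTIC=true), small non-zero slack for noisier ones like runtime_info/tgs and memory/max_memory_GB. The comparison code itself is not part of this diff; a minimal sketch of the idea, assuming relative tolerances applied to the last logged value of each metric:

import json

def last_value(path: str, key: str):
    """Return the last logged value of `key` in a tracker.jsonl file."""
    value = None
    with open(path) as f:
        for line in f:
            record = json.loads(line)
            if key in record:
                value = record[key]
    return value

def check_metric(base_path: str, run_path: str, key: str, rel_tol: float) -> bool:
    base = last_value(base_path, key)
    current = last_value(run_path, key)
    # rel_tol == 0 demands an exact match against the baseline
    return abs(current - base) <= rel_tol * abs(base)

# e.g. check_metric("gptoss-sft/7b774a0e2/tracker.jsonl",
#                   "test_output/tracker.jsonl", "grad_norm", 0.000001)
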
autotest/config/gptoss.py

Lines changed: 55 additions & 0 deletions
@@ -0,0 +1,55 @@
+import os
+
+from xtuner.v1.config import (
+    AdamWConfig,
+    FSDPConfig,
+    LRConfig,
+)
+from xtuner.v1.datasets import FTDPTokenizeFnConfig
+from xtuner.v1.datasets.config import DataloaderConfig, DatasetConfig
+from xtuner.v1.loss.ce_loss import CELossConfig
+from xtuner.v1.module.rope import RopeScalingConfig
+from xtuner.v1.model.moe.gpt_oss import GptOss21BA3P6Config
+from xtuner.v1.train import TrainerConfig
+
+
+GPTOSS_21B_PATH = os.environ["GPTOSS_21B_PATH"]
+ALPACA_PATH = os.environ["ALPACA_PATH"]
+
+
+gptoss_cfg = GptOss21BA3P6Config(rope_scaling_cfg=RopeScalingConfig(type="yarn", beta_fast=16.0, beta_slow=1.05, factor=16.0, original_max_position_embeddings=4096, truncate=True))
+optim_cfg = AdamWConfig(lr=6e-05)
+lr_cfg = LRConfig(lr_type="cosine", lr_min=1e-6)
+fsdp_cfg = FSDPConfig(
+    torch_compile=False,
+    cpu_offload=False,
+    ep_size=gptoss_cfg.ep_size,  # expert-parallel size taken from the model config
+)
+
+dataset_config = [
+    {
+        "dataset": DatasetConfig(name="alpaca", anno_path=ALPACA_PATH, sample_ratio=1.0),
+        "tokenize_fn": FTDPTokenizeFnConfig(max_length=16384),
+    },
+]
+
+dataloader_config = DataloaderConfig(pack_max_length=16384)
+
+loss_cfg = CELossConfig()
+
+
+trainer = TrainerConfig(
+    load_from=GPTOSS_21B_PATH,
+    model_cfg=gptoss_cfg,
+    optim_cfg=optim_cfg,
+    fsdp_cfg=fsdp_cfg,
+    dataset_cfg=dataset_config,
+    dataloader_cfg=dataloader_config,
+    lr_cfg=lr_cfg,
+    loss_cfg=loss_cfg,
+    tokenizer_path=GPTOSS_21B_PATH,
+    global_batch_size=16,
+    total_epoch=1,
+    work_dir=f"/mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output/{os.environ['GITHUB_RUN_ID']}/gptoss-sft/sft",
+    seed=0,
+)

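Since the config resolves everything from environment variables, it can be sanity-checked outside CI by exporting the same variables the resource.envs block sets. A minimal sketch with placeholder local paths (how the harness ultimately consumes the trainer object is not shown in this diff):

import importlib
import os

# Placeholder paths; in CI these come from config.yaml's resource.envs
os.environ.setdefault("GPTOSS_21B_PATH", "/models/gpt-oss-20b-bf16")
os.environ.setdefault("ALPACA_PATH", "/data/alpaca2")
os.environ.setdefault("GITHUB_RUN_ID", "local-debug")

cfg = importlib.import_module("autotest.config.gptoss")  # assumes the module is importable
print(cfg.trainer)  # the fully assembled TrainerConfig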