Commit e7e0cd9

Update dependencies and show slow tests in CI (axolotl-ai-cloud#2492)

* use latest torchao, gradio, schedule-free
* get info on slow tests
* speed up tests by avoiding gradient checkpointing and reducing eval size

1 parent 9494710

File tree

4 files changed: +24 −24 lines changed

cicd/multigpu.sh

Lines changed: 2 additions & 2 deletions
```diff
@@ -2,5 +2,5 @@
 set -e
 
 # only run one test at a time so as not to OOM the GPU
-pytest -v -n2 /workspace/axolotl/tests/e2e/multigpu/ --ignore=/workspace/axolotl/tests/e2e/multigpu/solo/
-pytest -v -n1 /workspace/axolotl/tests/e2e/multigpu/solo/
+pytest -v --durations=10 -n2 /workspace/axolotl/tests/e2e/multigpu/ --ignore=/workspace/axolotl/tests/e2e/multigpu/solo/
+pytest -v --durations=10 -n1 /workspace/axolotl/tests/e2e/multigpu/solo/
```
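The new `--durations=10` flag is stock pytest: after the run it appends a "slowest 10 durations" section to the report, covering setup/call/teardown phases. That is the "get info on slow tests" part of the commit. A minimal sketch of what it surfaces; the file and test names here are hypothetical, not from this repo:

```python
# test_durations_demo.py -- hypothetical module to illustrate --durations output
import time


def test_fast():
    assert 1 + 1 == 2


def test_slow():
    time.sleep(2)  # simulate an expensive e2e step
    assert True

# `pytest --durations=10 test_durations_demo.py` prints a "slowest 10
# durations" summary after the run, with test_slow's call phase at the top.
```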

requirements.txt

Lines changed: 3 additions & 3 deletions
```diff
@@ -22,7 +22,7 @@ trl==0.16.0
 optimum==1.16.2
 hf_transfer
 sentencepiece
-gradio==3.50.2
+gradio==5.23.3
 
 modal==0.70.5
 pydantic==2.10.6
@@ -59,8 +59,8 @@ langdetect==1.0.9
 immutabledict==4.2.0
 antlr4-python3-runtime==4.13.2
 
-torchao==0.7.0
-schedulefree==1.3.0
+torchao==0.9.0
+schedulefree==1.4.1
 
 axolotl-contribs-lgpl==0.0.6
 axolotl-contribs-mit==0.0.3
```
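Since these are hard pins, an environment that has drifted from the file can fail in confusing ways. A minimal sketch of a sanity check that installed versions match the new pins, using only the standard library (the pin table is taken from the diff above; extend it as needed):

```python
from importlib.metadata import PackageNotFoundError, version

# Pins changed in this commit.
PINS = {
    "gradio": "5.23.3",
    "torchao": "0.9.0",
    "schedulefree": "1.4.1",
}

for name, expected in PINS.items():
    try:
        installed = version(name)
    except PackageNotFoundError:
        print(f"{name}: not installed (expected {expected})")
        continue
    status = "ok" if installed == expected else f"MISMATCH (expected {expected})"
    print(f"{name}=={installed}: {status}")
```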

tests/e2e/multigpu/test_llama.py

Lines changed: 17 additions & 17 deletions
```diff
@@ -44,7 +44,7 @@ def test_lora_ddp(self, temp_dir):
         "lora_alpha": 16,
         "lora_dropout": 0.05,
         "lora_target_linear": True,
-        "val_set_size": 0.05,
+        "val_set_size": 0.01,
         "special_tokens": {
             "pad_token": "<|endoftext|>",
         },
@@ -58,7 +58,7 @@ def test_lora_ddp(self, temp_dir):
         "max_steps": 2,
         "micro_batch_size": 4,
         "gradient_accumulation_steps": 4,
-        "gradient_checkpointing": True,
+        # "gradient_checkpointing": True,
         "output_dir": temp_dir,
         "learning_rate": 0.00001,
         "optimizer": "adamw_8bit",
@@ -108,7 +108,7 @@ def test_lora_ddp_packed(self, temp_dir, gradient_accumulation_steps):
         "lora_alpha": 16,
         "lora_dropout": 0.05,
         "lora_target_linear": True,
-        "val_set_size": 0.05,
+        "val_set_size": 0.01,
         "special_tokens": {
             "pad_token": "<|endoftext|>",
         },
@@ -122,7 +122,7 @@ def test_lora_ddp_packed(self, temp_dir, gradient_accumulation_steps):
         "max_steps": 2,
         "micro_batch_size": 1,
         "gradient_accumulation_steps": gradient_accumulation_steps,
-        "gradient_checkpointing": True,
+        # "gradient_checkpointing": True,
         "output_dir": temp_dir,
         "learning_rate": 0.00001,
         "optimizer": "adamw_8bit",
@@ -169,7 +169,7 @@ def test_dpo_lora_ddp(self, temp_dir):
         "lora_alpha": 16,
         "lora_dropout": 0.05,
         "lora_target_linear": True,
-        "val_set_size": 0.05,
+        "val_set_size": 0.01,
         "special_tokens": {
             "pad_token": "<|endoftext|>",
         },
@@ -195,7 +195,7 @@ def test_dpo_lora_ddp(self, temp_dir):
         "max_steps": 2,
         "micro_batch_size": 4,
         "gradient_accumulation_steps": 4,
-        "gradient_checkpointing": True,
+        # "gradient_checkpointing": True,
         "output_dir": temp_dir,
         "warmup_steps": 0,
         "learning_rate": 0.00001,
@@ -247,7 +247,7 @@ def test_dpo_qlora_ddp(self, temp_dir):
         "lora_alpha": 16,
         "lora_dropout": 0.05,
         "lora_target_linear": True,
-        "val_set_size": 0.05,
+        "val_set_size": 0.01,
         "special_tokens": {
             "pad_token": "<|endoftext|>",
         },
@@ -273,7 +273,7 @@ def test_dpo_qlora_ddp(self, temp_dir):
         "max_steps": 2,
         "micro_batch_size": 2,
         "gradient_accumulation_steps": 4,
-        "gradient_checkpointing": True,
+        # "gradient_checkpointing": True,
         "output_dir": temp_dir,
         "warmup_steps": 0,
         "learning_rate": 0.00001,
@@ -334,7 +334,7 @@ def test_fsdp(self, temp_dir, gradient_accumulation_steps):
         "max_steps": 2,
         "micro_batch_size": 2,
         "gradient_accumulation_steps": gradient_accumulation_steps,
-        "gradient_checkpointing": True,
+        # "gradient_checkpointing": True,
         "output_dir": temp_dir,
         "learning_rate": 0.00001,
         "optimizer": "adamw_torch_fused",
@@ -391,7 +391,7 @@ def test_fsdp_packed(self, temp_dir, fsdp_state_dict_type):
         "sample_packing": True,
         "pad_to_sequence_len": True,
         "sequence_len": 2048,
-        "val_set_size": 0.05,
+        "val_set_size": 0.01,
         "special_tokens": {
             "pad_token": "<|endoftext|>",
         },
@@ -405,7 +405,7 @@ def test_fsdp_packed(self, temp_dir, fsdp_state_dict_type):
         "max_steps": 2,
         "micro_batch_size": 4,
         "gradient_accumulation_steps": 2,
-        "gradient_checkpointing": True,
+        # "gradient_checkpointing": True,
         "output_dir": temp_dir,
         "learning_rate": 0.00001,
         "optimizer": "adamw_torch_fused",
@@ -470,7 +470,7 @@ def test_fsdp_qlora_prequant_packed(self, temp_dir):
         "eval_sample_packing": False,
         "pad_to_sequence_len": True,
         "sequence_len": 2048,
-        "val_set_size": 0.05,
+        "val_set_size": 0.01,
         "special_tokens": {
             "pad_token": "<|endoftext|>",
         },
@@ -485,7 +485,7 @@ def test_fsdp_qlora_prequant_packed(self, temp_dir):
         "max_steps": 2,
         "micro_batch_size": 4,
         "gradient_accumulation_steps": 2,
-        "gradient_checkpointing": True,
+        # "gradient_checkpointing": True,
         "output_dir": temp_dir,
         "learning_rate": 0.00001,
         "optimizer": "adamw_torch_fused",
@@ -567,7 +567,7 @@ def test_ds_zero3_packed(
         "sample_packing": True,
         "pad_to_sequence_len": True,
         "sequence_len": 2048,
-        "val_set_size": 0.05,
+        "val_set_size": 0.01,
         "special_tokens": {
             "pad_token": "<|endoftext|>",
         },
@@ -640,7 +640,7 @@ def test_ds_zero2_packed(self, temp_dir, gradient_accumulation_steps, qlora):
         "sample_packing": True,
         "pad_to_sequence_len": True,
         "sequence_len": 2048,
-        "val_set_size": 0.05,
+        "val_set_size": 0.01,
         "special_tokens": {
             "pad_token": "<|endoftext|>",
         },
@@ -713,7 +713,7 @@ def test_ds_zero1_packed(self, temp_dir, gradient_accumulation_steps, qlora):
         "sample_packing": True,
         "pad_to_sequence_len": True,
         "sequence_len": 2048,
-        "val_set_size": 0.05,
+        "val_set_size": 0.01,
         "special_tokens": {
             "pad_token": "<|endoftext|>",
         },
@@ -788,7 +788,7 @@ def test_fix_untrained_tokens(self, temp_dir):
         "max_steps": 2,
         "micro_batch_size": 1,
         "gradient_accumulation_steps": 1,
-        "gradient_checkpointing": True,
+        # "gradient_checkpointing": True,
         "output_dir": temp_dir,
         "learning_rate": 0.00001,
         "optimizer": "adamw_torch_fused",
```

tests/e2e/multigpu/test_qwen2.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -37,7 +37,7 @@ def test_qlora_fsdp_dpo(self, base_model, temp_dir):
         "lora_alpha": 16,
         "lora_dropout": 0.05,
         "lora_target_linear": True,
-        "val_set_size": 0.05,
+        "val_set_size": 0.01,
         "datasets": [
             {
                 "path": "Intel/orca_dpo_pairs",
@@ -57,7 +57,7 @@ def test_qlora_fsdp_dpo(self, base_model, temp_dir):
         "flash_attention": True,
         "bf16": "auto",
         "tf32": True,
-        "gradient_checkpointing": True,
+        # "gradient_checkpointing": True,
         "gradient_checkpointing_kwargs": {
             "use_reentrant": False,
         },
```
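A fractional `val_set_size` like this means 0.05 → 0.01 cuts the held-out eval set, and with it the per-test eval time, by 5x. A sketch of what such a fractional hold-out amounts to, using Hugging Face `datasets` (this mirrors the idea, not axolotl's internal split code; the dataset is the one named in the diff above):

```python
from datasets import load_dataset

# Hold out 1% of rows for eval, analogous to val_set_size=0.01.
ds = load_dataset("Intel/orca_dpo_pairs", split="train")
split = ds.train_test_split(test_size=0.01, seed=42)
train_ds, eval_ds = split["train"], split["test"]
print(len(train_ds), len(eval_ds))
```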
