@@ -44,7 +44,7 @@ def test_lora_ddp(self, temp_dir):
                 "lora_alpha": 16,
                 "lora_dropout": 0.05,
                 "lora_target_linear": True,
-                "val_set_size": 0.05,
+                "val_set_size": 0.01,
                 "special_tokens": {
                     "pad_token": "<|endoftext|>",
                 },
@@ -58,7 +58,7 @@ def test_lora_ddp(self, temp_dir):
                 "max_steps": 2,
                 "micro_batch_size": 4,
                 "gradient_accumulation_steps": 4,
-                "gradient_checkpointing": True,
+                # "gradient_checkpointing": True,
                 "output_dir": temp_dir,
                 "learning_rate": 0.00001,
                 "optimizer": "adamw_8bit",
@@ -108,7 +108,7 @@ def test_lora_ddp_packed(self, temp_dir, gradient_accumulation_steps):
                 "lora_alpha": 16,
                 "lora_dropout": 0.05,
                 "lora_target_linear": True,
-                "val_set_size": 0.05,
+                "val_set_size": 0.01,
                 "special_tokens": {
                     "pad_token": "<|endoftext|>",
                 },
@@ -122,7 +122,7 @@ def test_lora_ddp_packed(self, temp_dir, gradient_accumulation_steps):
                 "max_steps": 2,
                 "micro_batch_size": 1,
                 "gradient_accumulation_steps": gradient_accumulation_steps,
-                "gradient_checkpointing": True,
+                # "gradient_checkpointing": True,
                 "output_dir": temp_dir,
                 "learning_rate": 0.00001,
                 "optimizer": "adamw_8bit",
@@ -169,7 +169,7 @@ def test_dpo_lora_ddp(self, temp_dir):
                 "lora_alpha": 16,
                 "lora_dropout": 0.05,
                 "lora_target_linear": True,
-                "val_set_size": 0.05,
+                "val_set_size": 0.01,
                 "special_tokens": {
                     "pad_token": "<|endoftext|>",
                 },
@@ -195,7 +195,7 @@ def test_dpo_lora_ddp(self, temp_dir):
                 "max_steps": 2,
                 "micro_batch_size": 4,
                 "gradient_accumulation_steps": 4,
-                "gradient_checkpointing": True,
+                # "gradient_checkpointing": True,
                 "output_dir": temp_dir,
                 "warmup_steps": 0,
                 "learning_rate": 0.00001,
@@ -247,7 +247,7 @@ def test_dpo_qlora_ddp(self, temp_dir):
                 "lora_alpha": 16,
                 "lora_dropout": 0.05,
                 "lora_target_linear": True,
-                "val_set_size": 0.05,
+                "val_set_size": 0.01,
                 "special_tokens": {
                     "pad_token": "<|endoftext|>",
                 },
@@ -273,7 +273,7 @@ def test_dpo_qlora_ddp(self, temp_dir):
                 "max_steps": 2,
                 "micro_batch_size": 2,
                 "gradient_accumulation_steps": 4,
-                "gradient_checkpointing": True,
+                # "gradient_checkpointing": True,
                 "output_dir": temp_dir,
                 "warmup_steps": 0,
                 "learning_rate": 0.00001,
@@ -334,7 +334,7 @@ def test_fsdp(self, temp_dir, gradient_accumulation_steps):
                 "max_steps": 2,
                 "micro_batch_size": 2,
                 "gradient_accumulation_steps": gradient_accumulation_steps,
-                "gradient_checkpointing": True,
+                # "gradient_checkpointing": True,
                 "output_dir": temp_dir,
                 "learning_rate": 0.00001,
                 "optimizer": "adamw_torch_fused",
@@ -391,7 +391,7 @@ def test_fsdp_packed(self, temp_dir, fsdp_state_dict_type):
                 "sample_packing": True,
                 "pad_to_sequence_len": True,
                 "sequence_len": 2048,
-                "val_set_size": 0.05,
+                "val_set_size": 0.01,
                 "special_tokens": {
                     "pad_token": "<|endoftext|>",
                 },
@@ -405,7 +405,7 @@ def test_fsdp_packed(self, temp_dir, fsdp_state_dict_type):
                 "max_steps": 2,
                 "micro_batch_size": 4,
                 "gradient_accumulation_steps": 2,
-                "gradient_checkpointing": True,
+                # "gradient_checkpointing": True,
                 "output_dir": temp_dir,
                 "learning_rate": 0.00001,
                 "optimizer": "adamw_torch_fused",
@@ -470,7 +470,7 @@ def test_fsdp_qlora_prequant_packed(self, temp_dir):
                 "eval_sample_packing": False,
                 "pad_to_sequence_len": True,
                 "sequence_len": 2048,
-                "val_set_size": 0.05,
+                "val_set_size": 0.01,
                 "special_tokens": {
                     "pad_token": "<|endoftext|>",
                 },
@@ -485,7 +485,7 @@ def test_fsdp_qlora_prequant_packed(self, temp_dir):
                 "max_steps": 2,
                 "micro_batch_size": 4,
                 "gradient_accumulation_steps": 2,
-                "gradient_checkpointing": True,
+                # "gradient_checkpointing": True,
                 "output_dir": temp_dir,
                 "learning_rate": 0.00001,
                 "optimizer": "adamw_torch_fused",
@@ -567,7 +567,7 @@ def test_ds_zero3_packed(
                 "sample_packing": True,
                 "pad_to_sequence_len": True,
                 "sequence_len": 2048,
-                "val_set_size": 0.05,
+                "val_set_size": 0.01,
                 "special_tokens": {
                     "pad_token": "<|endoftext|>",
                 },
@@ -640,7 +640,7 @@ def test_ds_zero2_packed(self, temp_dir, gradient_accumulation_steps, qlora):
                 "sample_packing": True,
                 "pad_to_sequence_len": True,
                 "sequence_len": 2048,
-                "val_set_size": 0.05,
+                "val_set_size": 0.01,
                 "special_tokens": {
                     "pad_token": "<|endoftext|>",
                 },
@@ -713,7 +713,7 @@ def test_ds_zero1_packed(self, temp_dir, gradient_accumulation_steps, qlora):
                 "sample_packing": True,
                 "pad_to_sequence_len": True,
                 "sequence_len": 2048,
-                "val_set_size": 0.05,
+                "val_set_size": 0.01,
                 "special_tokens": {
                     "pad_token": "<|endoftext|>",
                 },
@@ -788,7 +788,7 @@ def test_fix_untrained_tokens(self, temp_dir):
                 "max_steps": 2,
                 "micro_batch_size": 1,
                 "gradient_accumulation_steps": 1,
-                "gradient_checkpointing": True,
+                # "gradient_checkpointing": True,
                 "output_dir": temp_dir,
                 "learning_rate": 0.00001,
                 "optimizer": "adamw_torch_fused",