
Commit ce6fa69

Merge remote-tracking branch 'upstream/main'

2 parents 1aef802 + 0e619dc

File tree

2 files changed: +7 -3 lines changed

tests/fms/resources/config_meta_llama3_70b_instruct_lora.json

Lines changed: 3 additions & 1 deletion
@@ -7,13 +7,15 @@
 "per_device_train_batch_size": 1,
 "per_device_eval_batch_size": 4,
 "gradient_accumulation_steps": 4,
+"gradient_checkpointing": true,
 "save_strategy": "no",
 "learning_rate": 1e-5,
 "weight_decay": 0.0,
 "lr_scheduler_type": "cosine",
 "include_tokens_per_second": true,
 "response_template": "\n### Response:",
 "dataset_text_field": "output",
-"use_flash_attn": false,
+"use_flash_attn": true,
+"fast_kernels": [true, true, true],
 "peft_method": "lora"
 }

tests/fms/resources/config_mixtral_8x7b_instruct_v01_lora.json

Lines changed: 4 additions & 2 deletions
@@ -6,15 +6,17 @@
 "num_train_epochs": 1.0,
 "per_device_train_batch_size": 1,
 "per_device_eval_batch_size": 4,
-"gradient_accumulation_steps": 1,
+"gradient_accumulation_steps": 4,
+"gradient_checkpointing": true,
 "save_strategy": "no",
 "learning_rate": 1e-5,
 "weight_decay": 0.0,
 "lr_scheduler_type": "cosine",
 "include_tokens_per_second": true,
 "response_template": "\n### Response:",
 "dataset_text_field": "output",
-"use_flash_attn": false,
+"use_flash_attn": true,
+"fast_kernels": [true, true, true],
 "peft_method": "lora"
 }
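Both test configs receive the same change: gradient checkpointing is turned on, Flash Attention is switched from false to true, and a `fast_kernels` triple is added (the Mixtral config additionally bumps `gradient_accumulation_steps` from 1 to 4, matching the Llama 3 config). `gradient_checkpointing` trades extra forward-pass compute for lower activation memory, and `use_flash_attn` selects a Flash Attention kernel for the model; the three booleans in `fast_kernels` plausibly toggle fused-kernel options (such as fused loss, RMSNorm, and RoPE) supplied by an acceleration plugin, though the exact mapping is defined by the test harness and not shown in this diff. As a rough illustration only, here is a minimal sketch of what the first two flags typically correspond to in plain Hugging Face transformers, assuming the harness forwards them along these lines; the model id is purely an example:

```python
# Illustrative sketch, not code from this repo: the usual transformers-level
# effect of "use_flash_attn": true and "gradient_checkpointing": true.
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-70B-Instruct",  # example model id only
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",  # what "use_flash_attn": true selects
)
# Recompute activations during the backward pass instead of storing them,
# mirroring "gradient_checkpointing": true in the JSON configs.
model.gradient_checkpointing_enable()
```

Checkpointing is what makes the larger accumulation setting practical on 70B/8x7B LoRA runs: each micro-batch holds less activation memory, and the optimizer steps once every 4 micro-batches.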
