@@ -52,7 +52,7 @@ This example has been validated with the following configurations:
### Llama 3.1 8B Instruct - GSM8k Dataset - LoRA - 8x NVIDIA A100/80G

* Infrastructure:
- * OpenShift AI 2.17
+ * OpenShift AI 2.19
  * 8x NVIDIA-A100-SXM4-80GB
* Configuration:
``` yaml
@@ -61,7 +61,7 @@ This example has been validated with the following configurations:
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2
- use_liger: false
+ use_liger_kernel: true

# PEFT / LoRA
use_peft: true
@@ -79,12 +79,13 @@ This example has been validated with the following configurations:
dataset_config: main

# SFT
- max_seq_length: 1024
+ max_length: 4096
packing: false
+ padding_free: true

# Training
- per_device_train_batch_size: 64
- per_device_eval_batch_size: 64
+ per_device_train_batch_size: 128
+ per_device_eval_batch_size: 128

bf16: true
tf32: false
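For orientation, here is a minimal sketch (not part of this diff) of how the renamed keys above are typically passed to TRL's `SFTConfig`/`SFTTrainer`. The field names follow recent TRL releases, where `max_seq_length` was renamed to `max_length` and Liger kernels are toggled with the `use_liger_kernel` flag; the model id, output path, and GSM8k text formatting are assumptions for illustration:

``` python
from datasets import load_dataset
from trl import SFTConfig, SFTTrainer

# GSM8k with the "main" config, matching the dataset settings above;
# map question/answer pairs into a single "text" column for this sketch
train_dataset = load_dataset("gsm8k", "main", split="train")
train_dataset = train_dataset.map(
    lambda x: {"text": x["question"] + "\n" + x["answer"]}
)

# `max_seq_length` became `max_length`, and `padding_free` batches
# sequences without padding tokens, in recent TRL versions
training_args = SFTConfig(
    output_dir="sft-llama-3-1-8b",  # placeholder output path
    max_length=4096,
    packing=False,
    padding_free=True,
    use_liger_kernel=True,          # was `use_liger` in older configs
    per_device_train_batch_size=128,
    bf16=True,
    tf32=False,
)

trainer = SFTTrainer(
    model="meta-llama/Llama-3.1-8B-Instruct",  # assumed model id
    args=training_args,
    train_dataset=train_dataset,
)
trainer.train()
```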
@@ -108,8 +109,8 @@ This example has been validated with the following configurations:
resources_per_worker:
  "nvidia.com/gpu": 1
  "memory": 96Gi
-   "cpu": 4
- base_image: quay.io/modh/training:py311-cuda121-torch241
+   "cpu": 8
+ base_image: quay.io/modh/training:py311-cuda124-torch251
env_vars:
  "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True"
  "NCCL_DEBUG": "INFO"
@@ -189,7 +190,7 @@ This example has been validated with the following configurations:
### Llama 3.1 8B Instruct - GSM8k Dataset - LoRA - 8x AMD Instinct MI300X

* Infrastructure:
- * OpenShift AI 2.17
+ * OpenShift AI 2.19
  * 8x AMD Instinct MI300X
* Configuration:
``` yaml
@@ -198,15 +199,14 @@ This example has been validated with the following configurations:
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2
- use_liger: true
+ use_liger_kernel: true

# PEFT / LoRA
use_peft: true
lora_r: 16
lora_alpha: 8
lora_dropout: 0.05
lora_target_modules: ["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
- lora_modules_to_save: []

# QLoRA (BitsAndBytes)
load_in_4bit: false
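The LoRA block above corresponds one-to-one to a `peft.LoraConfig`; a minimal sketch, where the `task_type` is an assumption for causal-LM fine-tuning:

``` python
from peft import LoraConfig

# Mirrors the lora_* keys in the config above; note the diff drops the
# empty `lora_modules_to_save` entry, which matches the default anyway
peft_config = LoraConfig(
    r=16,
    lora_alpha=8,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",  # assumed task type for this example
)
```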
@@ -217,12 +217,13 @@ This example has been validated with the following configurations:
dataset_config: main

# SFT
- max_seq_length: 4096
+ max_length: 8192
packing: false
+ padding_free: true

# Training
- per_device_train_batch_size: 128
- per_device_eval_batch_size: 128
+ per_device_train_batch_size: 512
+ per_device_eval_batch_size: 512

bf16: true
tf32: false
@@ -245,18 +246,15 @@ This example has been validated with the following configurations:
num_procs_per_worker: 1
resources_per_worker:
  "amd.com/gpu": 1
-   "memory": 96Gi
+   "memory": 128Gi
  "cpu": 4
- base_image: quay.io/modh/training:py311-rocm62-torch241
+ base_image: quay.io/modh/training:py311-rocm62-torch251
env_vars:
  "PYTORCH_HIP_ALLOC_CONF": "expandable_segments:True"
  "NCCL_DEBUG": "INFO"
```
* Metrics:


- Blue: with Liger kernels
-
- Orange: without Liger kernels

### Llama 3.3 70B Instruct - GSM8k Dataset - LoRA - 8x NVIDIA A100/80G