@@ -99,20 +99,20 @@ This example has been validated with the following configurations:
  # FSDP
  fsdp: "full_shard auto_wrap offload"
  fsdp_config:
-   activation_checkpointing: true
+   activation_checkpointing: true
  ```
* Job:
  ```yaml
  num_workers: 8
  num_procs_per_worker: 1
  resources_per_worker:
-   "nvidia.com/gpu": 1
-   "memory": 96Gi
-   "cpu": 4
+   "nvidia.com/gpu": 1
+   "memory": 96Gi
+   "cpu": 4
  base_image: quay.io/modh/training:py311-cuda121-torch241
  env_vars:
-   "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True"
-   "NCCL_DEBUG": "INFO"
+   "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True"
+   "NCCL_DEBUG": "INFO"
  ```
* Metrics:

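For reference, the Job settings above map directly onto the Kubeflow Training SDK's `TrainingClient.create_job()`. A minimal sketch, assuming a recent `kubeflow-training` SDK (one where `create_job()` accepts `num_procs_per_worker` and `env_vars`); the job name and training function are illustrative placeholders, not part of the validated example:

```python
# Minimal sketch: submitting the Job above with the Kubeflow Training SDK.
# Assumes kubeflow-training >= 1.8, where create_job() accepts
# num_procs_per_worker and env_vars. Job name and train function are
# illustrative placeholders.
from kubeflow.training import TrainingClient


def main():
    # Training entry point: dataset loading, SFTTrainer setup, etc.
    ...


client = TrainingClient()
client.create_job(
    name="sft-llama",
    train_func=main,
    num_workers=8,
    num_procs_per_worker=1,
    resources_per_worker={
        "nvidia.com/gpu": 1,
        "memory": "96Gi",
        "cpu": 4,
    },
    base_image="quay.io/modh/training:py311-cuda121-torch241",
    env_vars={
        "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
        "NCCL_DEBUG": "INFO",
    },
)
```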
@@ -133,11 +133,11 @@ This example has been validated with the following configurations:

  # PEFT / LoRA
  use_peft: true
- lora_target_modules: ["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
- lora_modules_to_save: []
  lora_r: 16
  lora_alpha: 8
  lora_dropout: 0.05
+ lora_target_modules: ["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
+ lora_modules_to_save: []

  # QLoRA (BitsAndBytes)
  load_in_4bit: false
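The PEFT block above corresponds one-to-one to Hugging Face PEFT's `LoraConfig`. A minimal sketch of the equivalent object; the `task_type` value is an assumption for causal-LM fine-tuning and is not part of the YAML above:

```python
# Minimal sketch: the PEFT / LoRA settings above expressed as a
# Hugging Face peft.LoraConfig (task_type assumed for causal-LM SFT).
from peft import LoraConfig

lora_config = LoraConfig(
    r=16,                      # lora_r
    lora_alpha=8,              # lora_alpha
    lora_dropout=0.05,         # lora_dropout
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    modules_to_save=[],        # lora_modules_to_save
    task_type="CAUSAL_LM",     # assumption, not in the YAML above
)
```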
@@ -168,20 +168,20 @@ This example has been validated with the following configurations:
  # FSDP
  fsdp: "full_shard auto_wrap"
  fsdp_config:
-   activation_checkpointing: true
+   activation_checkpointing: true
  ```
* Job:
  ```yaml
  num_workers: 16
  num_procs_per_worker: 1
  resources_per_worker:
-   "amd.com/gpu": 1
-   "memory": 192Gi
-   "cpu": 4
+   "amd.com/gpu": 1
+   "memory": 192Gi
+   "cpu": 4
  base_image: quay.io/modh/training:py311-cuda121-torch241
  env_vars:
-   "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True"
-   "NCCL_DEBUG": "INFO"
+   "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True"
+   "NCCL_DEBUG": "INFO"
  ```
* Metrics:

@@ -202,11 +202,11 @@ This example has been validated with the following configurations:

  # PEFT / LoRA
  use_peft: true
- lora_target_modules: ["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
- lora_modules_to_save: []
  lora_r: 16
  lora_alpha: 8
  lora_dropout: 0.05
+ lora_target_modules: ["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
+ lora_modules_to_save: []

  # QLoRA (BitsAndBytes)
  load_in_4bit: false
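Both QLoRA flags are disabled in these validated runs. If you do want to try QLoRA, `load_in_4bit` / `load_in_8bit` correspond to a Transformers `BitsAndBytesConfig`; a minimal sketch, where the quant type, compute dtype, and model id are illustrative assumptions rather than validated settings:

```python
# Minimal sketch: enabling 4-bit QLoRA via Transformers' BitsAndBytesConfig.
# Quant type, compute dtype, and model id are illustrative assumptions;
# the validated configurations above keep both flags set to false.
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # flips load_in_4bit to true
    bnb_4bit_quant_type="nf4",              # assumption
    bnb_4bit_compute_dtype=torch.bfloat16,  # assumption
)
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.3-70B-Instruct",    # illustrative model id
    quantization_config=bnb_config,
)
```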
@@ -237,23 +237,92 @@ This example has been validated with the following configurations:
  # FSDP
  fsdp: "full_shard auto_wrap"
  fsdp_config:
-   activation_checkpointing: true
+   activation_checkpointing: true
  ```
* Job:
  ```yaml
  num_workers: 8
  num_procs_per_worker: 1
  resources_per_worker:
-   "amd.com/gpu": 1
-   "memory": 96Gi
-   "cpu": 4
+   "amd.com/gpu": 1
+   "memory": 96Gi
+   "cpu": 4
  base_image: quay.io/modh/training:py311-rocm62-torch241
  env_vars:
-   "PYTORCH_HIP_ALLOC_CONF": "expandable_segments:True"
-   "NCCL_DEBUG": "INFO"
+   "PYTORCH_HIP_ALLOC_CONF": "expandable_segments:True"
+   "NCCL_DEBUG": "INFO"
  ```
* Metrics:

  Blue: with Liger kernels

  Orange: without Liger kernels
+
+ ### Llama 3.3 70B Instruct - GSM8k Dataset - LoRA - 8x NVIDIA A100/80G
+
+ * Infrastructure:
+   * OpenShift AI 2.17
+   * 8x NVIDIA-A100-SXM4-80GB
+ * Configuration:
+   ```yaml
+   # Model
+   model_name_or_path: meta-llama/Llama-3.3-70B-Instruct
+   model_revision: main
+   torch_dtype: bfloat16
+   attn_implementation: flash_attention_2
+   use_liger: true
+
+   # PEFT / LoRA
+   use_peft: true
+   lora_r: 16
+   lora_alpha: 8
+   lora_dropout: 0.05
+   lora_target_modules: ["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
+   lora_modules_to_save: []
+
+   # QLoRA (BitsAndBytes)
+   load_in_4bit: false
+   load_in_8bit: false
+
+   # Dataset
+   dataset_name: gsm8k
+   dataset_config: main
+
+   # SFT
+   max_seq_length: 2048
+   packing: false
+
+   # Training
+   per_device_train_batch_size: 32
+   per_device_eval_batch_size: 32
+
+   bf16: true
+   tf32: false
+
+   learning_rate: 2.0e-4
+   warmup_steps: 10
+   lr_scheduler_type: inverse_sqrt
+
+   optim: adamw_torch_fused
+   max_grad_norm: 1.0
+
+   # FSDP
+   fsdp: "full_shard auto_wrap"
+   fsdp_config:
+     activation_checkpointing: true
+   ```
+ * Job:
+   ```yaml
+   num_workers: 8
+   num_procs_per_worker: 1
+   resources_per_worker:
+     "nvidia.com/gpu": 1
+     "memory": 256Gi
+     "cpu": 4
+   base_image: quay.io/modh/training:py311-cuda121-torch241
+   env_vars:
+     "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True"
+     "NCCL_DEBUG": "INFO"
+   ```
+ * Metrics:
+
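One derived number worth keeping in mind for this new example: with no gradient accumulation set in the configuration, the effective global batch size is per_device_train_batch_size × num_workers × num_procs_per_worker = 32 × 8 × 1 = 256. A one-liner to sanity-check it:

```python
# Effective global batch size implied by the Llama 3.3 70B job above,
# assuming no gradient accumulation (none is set in the configuration).
per_device_train_batch_size = 32
num_workers = 8            # one process per worker, one GPU per process
num_procs_per_worker = 1
global_batch = per_device_train_batch_size * num_workers * num_procs_per_worker
print(global_batch)        # 256 sequences per optimizer step
```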