
Commit 3f96837

Merge remote-tracking branch 'upstream/main'

2 parents: 7b4357e + 1230079

File tree

2 files changed: +91, -22 lines

examples/kfto-sft-llm/README.md

Lines changed: 91 additions & 22 deletions
@@ -99,20 +99,20 @@ This example has been validated with the following configurations:
   # FSDP
   fsdp: "full_shard auto_wrap offload"
   fsdp_config:
-  activation_checkpointing: true
+    activation_checkpointing: true
   ```
 * Job:
   ```yaml
   num_workers: 8
   num_procs_per_worker: 1
   resources_per_worker:
-  "nvidia.com/gpu": 1
-  "memory": 96Gi
-  "cpu": 4
+    "nvidia.com/gpu": 1
+    "memory": 96Gi
+    "cpu": 4
   base_image: quay.io/modh/training:py311-cuda121-torch241
   env_vars:
-  "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True"
-  "NCCL_DEBUG": "INFO"
+    "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True"
+    "NCCL_DEBUG": "INFO"
   ```
 * Metrics:
   ![](./docs/run01.png)
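The Job block in the hunk above is not raw Kubernetes YAML; its keys appear to mirror the arguments of the Kubeflow Training SDK's `TrainingClient.create_job`. A minimal sketch of how that block translates into a job submission, assuming the `kubeflow-training` package is installed; the job name and the `main` training function are illustrative placeholders, not part of this commit:

```python
from kubeflow.training import TrainingClient


def main():
    # Placeholder for the example's SFT training function (assumed name).
    ...


client = TrainingClient()
client.create_job(
    job_kind="PyTorchJob",
    name="sft",                 # hypothetical job name
    train_func=main,
    num_workers=8,              # matches the Job block above
    num_procs_per_worker=1,
    resources_per_worker={
        "nvidia.com/gpu": 1,
        "memory": "96Gi",
        "cpu": 4,
    },
    base_image="quay.io/modh/training:py311-cuda121-torch241",
    env_vars={
        "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
        "NCCL_DEBUG": "INFO",
    },
)
```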
@@ -133,11 +133,11 @@ This example has been validated with the following configurations:
 
   # PEFT / LoRA
   use_peft: true
-  lora_target_modules: ["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
-  lora_modules_to_save: []
   lora_r: 16
   lora_alpha: 8
   lora_dropout: 0.05
+  lora_target_modules: ["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
+  lora_modules_to_save: []
 
   # QLoRA (BitsAndBytes)
   load_in_4bit: false
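The reorder in this hunk is cosmetic: these keys all land in TRL's `ModelConfig`, from which `get_peft_config` builds the `peft.LoraConfig`, so their order in the YAML does not matter. A minimal sketch, assuming `trl` and `peft` are installed:

```python
from trl import ModelConfig, get_peft_config

# The same lora_* keys as in the README's YAML, passed as dataclass fields.
model_args = ModelConfig(
    use_peft=True,
    lora_r=16,
    lora_alpha=8,
    lora_dropout=0.05,
    lora_target_modules=["q_proj", "v_proj", "k_proj", "o_proj",
                         "gate_proj", "up_proj", "down_proj"],
    lora_modules_to_save=[],
)
peft_config = get_peft_config(model_args)  # -> peft.LoraConfig(r=16, ...)
print(peft_config)
```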
@@ -168,20 +168,20 @@ This example has been validated with the following configurations:
   # FSDP
   fsdp: "full_shard auto_wrap"
   fsdp_config:
-  activation_checkpointing: true
+    activation_checkpointing: true
   ```
 * Job:
   ```yaml
   num_workers: 16
   num_procs_per_worker: 1
   resources_per_worker:
-  "amd.com/gpu": 1
-  "memory": 192Gi
-  "cpu": 4
+    "amd.com/gpu": 1
+    "memory": 192Gi
+    "cpu": 4
   base_image: quay.io/modh/training:py311-cuda121-torch241
   env_vars:
-  "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True"
-  "NCCL_DEBUG": "INFO"
+    "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True"
+    "NCCL_DEBUG": "INFO"
   ```
 * Metrics:
   ![](./docs/run02.png)
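Once a job like the one above is submitted, the same SDK client can stream its logs while training runs. A short sketch; the job name is a placeholder, and the exact `get_job_logs` behavior can differ between SDK versions:

```python
from kubeflow.training import TrainingClient

client = TrainingClient()
# Stream logs from the master pod until the job finishes; "sft" is a
# hypothetical job name, not one this commit defines.
client.get_job_logs("sft", follow=True)
```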
@@ -202,11 +202,11 @@ This example has been validated with the following configurations:
 
   # PEFT / LoRA
   use_peft: true
-  lora_target_modules: ["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
-  lora_modules_to_save: []
   lora_r: 16
   lora_alpha: 8
   lora_dropout: 0.05
+  lora_target_modules: ["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
+  lora_modules_to_save: []
 
   # QLoRA (BitsAndBytes)
   load_in_4bit: false
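The QLoRA keys sit in the same `ModelConfig`: with both flags false, TRL's `get_quantization_config` returns `None` and the model loads unquantized, which matches these validated runs. A hedged sketch, assuming `trl` is installed:

```python
from trl import ModelConfig, get_quantization_config

model_args = ModelConfig(load_in_4bit=False, load_in_8bit=False)
quantization_config = get_quantization_config(model_args)
assert quantization_config is None  # no BitsAndBytes quantization in these runs
```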
@@ -237,23 +237,92 @@ This example has been validated with the following configurations:
   # FSDP
   fsdp: "full_shard auto_wrap"
   fsdp_config:
-  activation_checkpointing: true
+    activation_checkpointing: true
   ```
 * Job:
   ```yaml
   num_workers: 8
   num_procs_per_worker: 1
   resources_per_worker:
-  "amd.com/gpu": 1
-  "memory": 96Gi
-  "cpu": 4
+    "amd.com/gpu": 1
+    "memory": 96Gi
+    "cpu": 4
   base_image: quay.io/modh/training:py311-rocm62-torch241
   env_vars:
-  "PYTORCH_HIP_ALLOC_CONF": "expandable_segments:True"
-  "NCCL_DEBUG": "INFO"
+    "PYTORCH_HIP_ALLOC_CONF": "expandable_segments:True"
+    "NCCL_DEBUG": "INFO"
   ```
 * Metrics:
   ![](./docs/run03.png)
   Blue: with Liger kernels
 
   Orange: without Liger kernels
+
+### Llama 3.3 70B Instruct - GSM8k Dataset - LoRA - 8x NVIDIA A100/80G
+
+* Infrastructure:
+  * OpenShift AI 2.17
+  * 8x NVIDIA-A100-SXM4-80GB
+* Configuration:
+  ```yaml
+  # Model
+  model_name_or_path: meta-llama/Llama-3.3-70B-Instruct
+  model_revision: main
+  torch_dtype: bfloat16
+  attn_implementation: flash_attention_2
+  use_liger: true
+
+  # PEFT / LoRA
+  use_peft: true
+  lora_r: 16
+  lora_alpha: 8
+  lora_dropout: 0.05
+  lora_target_modules: ["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
+  lora_modules_to_save: []
+
+  # QLoRA (BitsAndBytes)
+  load_in_4bit: false
+  load_in_8bit: false
+
+  # Dataset
+  dataset_name: gsm8k
+  dataset_config: main
+
+  # SFT
+  max_seq_length: 2048
+  packing: false
+
+  # Training
+  per_device_train_batch_size: 32
+  per_device_eval_batch_size: 32
+
+  bf16: true
+  tf32: false
+
+  learning_rate: 2.0e-4
+  warmup_steps: 10
+  lr_scheduler_type: inverse_sqrt
+
+  optim: adamw_torch_fused
+  max_grad_norm: 1.0
+
+  # FSDP
+  fsdp: "full_shard auto_wrap"
+  fsdp_config:
+    activation_checkpointing: true
+  ```
+* Job:
+  ```yaml
+  num_workers: 8
+  num_procs_per_worker: 1
+  resources_per_worker:
+    "nvidia.com/gpu": 1
+    "memory": 256Gi
+    "cpu": 4
+  base_image: quay.io/modh/training:py311-cuda121-torch241
+  env_vars:
+    "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True"
+    "NCCL_DEBUG": "INFO"
+  ```
+* Metrics:
+  ![](./docs/run04.png)
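The new section follows the same recipe as the earlier runs, scaled to a 70B model: under FSDP `full_shard`, the roughly 140 GB of bf16 weights (70B parameters × 2 bytes) shard to about 17.5 GB per GPU across the 8 workers, which, together with activation checkpointing and LoRA's small trainable state, is what makes an 80 GB card workable. A minimal sketch of how a YAML file with these keys is typically loaded in TRL-based SFT scripts; the three-dataclass split and the config file name are assumptions, not something this commit shows:

```python
from trl import ModelConfig, ScriptArguments, SFTConfig, TrlParser

# ScriptArguments picks up dataset_name/dataset_config, SFTConfig the SFT and
# training keys, ModelConfig the model and LoRA keys from the YAML.
parser = TrlParser((ScriptArguments, SFTConfig, ModelConfig))
script_args, training_args, model_args = parser.parse_args_and_config()

# e.g. run as: python sft.py --config llama33_70b_lora.yaml  (hypothetical path)
```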

examples/kfto-sft-llm/docs/run04.png

572 KB