Commit 9f986f5

Add Llama4 maverick examples (axolotl-ai-cloud#2512)
1 parent f85861a commit 9f986f5

2 files changed: 96 additions, 1 deletion


examples/llama-4/README.md

Lines changed: 7 additions & 1 deletion

@@ -7,4 +7,10 @@
 - [Text Single GPU (H100) QLoRA](./scout-qlora-single-h100.yaml)
 - [Text Multi GPU QLoRA w/ FSDP1](./scout-qlora-fsdp1.yaml)

-Our Single GPU implementation for Llama 4 Scout uses only 68.5GB VRAM for post-training with 4k context length @ 546 tokens/second.
+Our Single H100 implementation for Llama 4 Scout uses only 68.5GB VRAM for post-training with 4k context length @ 546 tokens/second. [WandB logs here](https://wandb.ai/axolotl-ai/llama4-sft/runs/zic56rhd)
+
+### Llama 4 Maverick 17Bx128Experts (400B)
+
+- [Text Multi GPU QLoRA w/FSDP1](./maverick-qlora-fsdp1.yaml)
+
+Our 4xH100 implementation for Llama 4 Maverick uses 79.5GB VRAM/GPU for post-training with 4k context length @ 206 tokens/second. [WandB logs here.](https://wandb.ai/axolotl-ai/llama-sft/runs/siyvwuxc?nw=nwuserwinglian)
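As a usage note for the new Maverick example, a minimal launch sketch (assuming the standard axolotl CLI and accelerate launcher documented in the main repo; neither command is part of this commit, and paths are relative to the repo root):

    # optional: tokenize and pack the dataset once before the multi-GPU run
    python -m axolotl.cli.preprocess examples/llama-4/maverick-qlora-fsdp1.yaml

    # 4xH100 QLoRA + FSDP1 training run
    accelerate launch --num_processes 4 -m axolotl.cli.train examples/llama-4/maverick-qlora-fsdp1.yaml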
examples/llama-4/maverick-qlora-fsdp1.yaml (new file)

Lines changed: 89 additions & 0 deletions
base_model: axolotl-quants/Llama-4-Maverick-17B-128E-Linearized-bnb-nf4-bf16
model_type: Llama4ForConditionalGeneration
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

strict: false

plugins:
  - axolotl.integrations.liger.LigerPlugin

liger_glu_activation: true
liger_rms_norm: true
liger_layer_norm: true

llama4_linearized_experts: true
load_in_4bit: true
adapter: qlora
lora_r: 32
lora_alpha: 64
lora_target_modules:
  - self_attn.q_proj
  - self_attn.k_proj
  - self_attn.v_proj
  - self_attn.o_proj
  - shared_expert.gate_proj
  - shared_expert.up_proj
  - shared_expert.down_proj
  # - experts.gate_projs.[0-9]+$
  # - experts.up_projs.[0-9]+$
  # - experts.down_projs.[0-9]+$
lora_modules_to_save:
  # - lm_head
  # - embed_tokens

chat_template: llama4
datasets:
  - path: mlabonne/FineTome-100k
    type: chat_template
    split: train[:20%]
    field_messages: conversations
    message_property_mappings:
      role: from
      content: value

dataset_prepared_path: last_run_prepared
val_set_size: 0.0
output_dir: ./outputs/out

sequence_len: 4096
sample_packing: true
pad_to_sequence_len: true

gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 1e-4

bf16: true
tf32: true

logging_steps: 1
flash_attention: true

gradient_checkpointing: offload
gradient_checkpointing_kwargs:
  use_reentrant: false

warmup_steps: 20
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0
fsdp:
  - auto_wrap
  - full_shard
fsdp_config:
  fsdp_transformer_layer_cls_to_wrap: Llama4TextDecoderLayer
  fsdp_limit_all_gathers: true
  fsdp_sync_module_states: true
  fsdp_offload_params: true
  fsdp_use_orig_params: false
  fsdp_cpu_ram_efficient_loading: true
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_sharding_strategy: FULL_SHARD
special_tokens:
  pad_token: <|finetune_right_pad_id|>
  eos_token: <|eot|>
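The config above writes the trained QLoRA adapter under output_dir (./outputs/out). A hedged post-training sketch, assuming axolotl's documented inference and merge_lora entry points rather than anything added by this commit; whether merging into the 4-bit linearized base works cleanly is not covered here and may require the full-precision weights:

    # smoke-test the trained adapter against the base model
    python -m axolotl.cli.inference examples/llama-4/maverick-qlora-fsdp1.yaml --lora_model_dir="./outputs/out"

    # optionally merge the adapter back into the base weights
    python -m axolotl.cli.merge_lora examples/llama-4/maverick-qlora-fsdp1.yaml --lora_model_dir="./outputs/out"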
