
Commit bf9efe2

[llama4] fix the mm yaml, add scout single gpu yaml (axolotl-ai-cloud#2510)
* [llama4] fix the mm yaml, add scout single gpu yaml
* add README for llama4
* rename to specify fsdp
1 parent 0dac2dd commit bf9efe2

File tree: 3 files changed, +133 −23 lines

examples/llama-4/README.md

Lines changed: 10 additions & 0 deletions
# Llama 4 by Meta AI

## Available Examples

### Llama 4 Scout 17Bx16Experts (109B)

- [Multi-Modal/Vision QLoRA w/ FSDP1](./scout-vision-qlora-fsdp.yaml)
- [Text Single GPU (H100) QLoRA](./scout-qlora-single-h100.yaml)
- [Text Multi GPU QLoRA w/ FSDP1](./scout-qlora-fsdp1.yaml)

Our Single GPU implementation for Llama 4 Scout uses only 68.5GB VRAM for post-training with 4k context length @ 546 tokens/second.
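The 68.5 GB figure relies on several memory savers working together rather than any single trick: a pre-quantized, linearized base model, QLoRA adapters, a 4-bit optimizer, and offloaded activation checkpointing. As a quick orientation, here are the memory-relevant settings excerpted from the single-GPU config below (an excerpt only, not a standalone config):

```yaml
# Memory-relevant settings from scout-qlora-single-h100.yaml (excerpt only)
base_model: axolotl-quants/Llama-4-Scout-17B-16E-Linearized-bnb-nf4-bf16  # NF4 pre-quantized weights
llama4_linearized_experts: true   # Axolotl's linearized-experts model variant
load_in_4bit: true
adapter: qlora                    # train LoRA adapters on top of the 4-bit base
optimizer: adamw_torch_4bit       # 4-bit optimizer states
gradient_checkpointing: offload   # offload checkpointed activations
sequence_len: 4096                # the config notes up to 8k fits on a single H100
```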
examples/llama-4/scout-qlora-single-h100.yaml

Lines changed: 86 additions & 0 deletions
base_model: axolotl-quants/Llama-4-Scout-17B-16E-Linearized-bnb-nf4-bf16
model_type: Llama4ForConditionalGeneration
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

strict: false

plugins:
  - axolotl.integrations.liger.LigerPlugin

liger_glu_activation: true
liger_rms_norm: true
liger_layer_norm: true

llama4_linearized_experts: true
load_in_4bit: true
adapter: qlora
lora_r: 32
lora_alpha: 64
lora_target_modules:
  - self_attn.q_proj
  - self_attn.k_proj
  - self_attn.v_proj
  - self_attn.o_proj
  - shared_expert.gate_proj
  - shared_expert.up_proj
  - shared_expert.down_proj
#  - experts.gate_projs.[0-9]+$
#  - experts.up_projs.[0-9]+$
#  - experts.down_projs.[0-9]+$
lora_modules_to_save:
#  - lm_head
#  - embed_tokens

lora_mlp_kernel: true
lora_qkv_kernel: true
lora_o_kernel: true

chat_template: llama4
datasets:
  - path: mlabonne/FineTome-100k
    type: chat_template
    split: train[:20%]
    field_messages: conversations
    message_property_mappings:
      role: from
      content: value

dataset_prepared_path: last_run_prepared
val_set_size: 0.0
output_dir: ./outputs/out

sequence_len: 4096 # up to 8k will work on a single H100
sample_packing: true
pad_to_sequence_len: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_torch_4bit
lr_scheduler: cosine
learning_rate: 1e-4

bf16: true
tf32: true

logging_steps: 1
flash_attention: true

gradient_checkpointing: offload
gradient_checkpointing_kwargs:
  use_reentrant: false

warmup_steps: 20
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:
  pad_token: <|finetune_right_pad_id|>
  eos_token: <|eot|>
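To reuse this single-GPU recipe on your own conversation data, only the datasets block needs to change. A minimal sketch, assuming a local ShareGPT-style JSONL file whose records contain a conversations list with from/value keys; the path and file below are placeholders, not part of this commit:

```yaml
datasets:
  - path: ./data/my_conversations.jsonl  # hypothetical local file (placeholder)
    ds_type: json                        # load from a local JSON/JSONL file
    type: chat_template                  # render with the chat_template set above (llama4)
    field_messages: conversations        # key that holds the message list
    message_property_mappings:
      role: from                         # map "from" -> role
      content: value                     # map "value" -> content
```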

examples/llama-4/scout-lora.yaml renamed to examples/llama-4/scout-vision-qlora-fsdp.yaml

Lines changed: 37 additions & 23 deletions
@@ -1,74 +1,88 @@
-base_model: meta-llama/Llama-4-Scout-17B-16E
+base_model: axolotl-quants/Llama-4-Scout-17B-16E-Linearized-bnb-nf4-bf16
 model_type: Llama4ForConditionalGeneration
+processor_type: Llama4Processor
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
 
 strict: false
 
-# torch_compile: true
+# these 3 lines are needed for now to handle vision chat templates w images
+skip_prepare_dataset: true
+remove_unused_columns: false
+sample_packing: false
 
-adapter: lora
+sequence_len: 4096
+
+plugins:
+  - axolotl.integrations.liger.LigerPlugin
+
+liger_glu_activation: true
+liger_rms_norm: true
+liger_layer_norm: true
+
+llama4_linearized_experts: true # use Axolotl's customized model
+load_in_4bit: true
+adapter: qlora
 lora_r: 32
 lora_alpha: 64
 lora_target_modules:
   - self_attn.q_proj
   - self_attn.k_proj
   - self_attn.v_proj
   - self_attn.o_proj
+  - shared_expert.gate_proj
+  - shared_expert.up_proj
+  - shared_expert.down_proj
+  - vision_adapter.mlp.fc1
+  - vision_adapter.mlp.fc2
+#  - experts.gate_projs.[0-9]+$
+#  - experts.up_projs.[0-9]+$
+#  - experts.down_projs.[0-9]+$
 lora_modules_to_save:
   - lm_head
   - embed_tokens
 
 chat_template: llama4
 datasets:
-  - path: mlabonne/FineTome-100k
+  - path: HuggingFaceH4/llava-instruct-mix-vsft
     type: chat_template
-    split: train[:20%]
-    field_messages: conversations
-    message_property_mappings:
-      role: from
-      content: value
+    split: train[:1%]
+    field_messages: messages
 
 dataset_prepared_path: last_run_prepared
 val_set_size: 0.0
 output_dir: ./outputs/out
 
-sequence_len: 4096
-sample_packing: true
-pad_to_sequence_len: true
-
 gradient_accumulation_steps: 1
 micro_batch_size: 1
 num_epochs: 1
-optimizer: adamw_torch_8bit
+optimizer: adamw_torch_4bit
 lr_scheduler: cosine
 learning_rate: 2e-5
 
 bf16: true
 tf32: true
 
-# gradient_checkpointing: true
-# gradient_checkpointing_kwargs:
-#   use_reentrant: false
 logging_steps: 1
 flash_attention: true
 
 warmup_steps: 100
-evals_per_epoch: 2
+evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
 fsdp:
   - auto_wrap
   - full_shard
 fsdp_config:
-  fsdp_version: 2
-  fsdp_offload_params: false
+  fsdp_transformer_layer_cls_to_wrap: Llama4TextDecoderLayer
+  fsdp_limit_all_gathers: true
+  fsdp_sync_module_states: true
+  fsdp_offload_params: true
+  fsdp_use_orig_params: false
   fsdp_cpu_ram_efficient_loading: true
   fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
-  fsdp_transformer_layer_cls_to_wrap: Llama4TextDecoderLayer
-  fsdp_state_dict_type: SHARDED_STATE_DICT
+  fsdp_state_dict_type: FULL_STATE_DICT
   fsdp_sharding_strategy: FULL_SHARD
-  fsdp_reshard_after_forward: true
   fsdp_activation_checkpointing: true
 special_tokens:
   pad_token: <|finetune_right_pad_id|>
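To point this vision example at a different image-chat dataset, keep the three workaround flags from the top of the config (skip_prepare_dataset, remove_unused_columns, sample_packing: false) as they are; only the datasets entry changes. A minimal sketch with a placeholder dataset id that is not part of this commit:

```yaml
# keep the vision chat-template workarounds from this example unchanged
skip_prepare_dataset: true
remove_unused_columns: false
sample_packing: false

chat_template: llama4
datasets:
  - path: your-org/your-image-chat-dataset  # hypothetical HF dataset (placeholder)
    type: chat_template
    split: train[:5%]                       # start with a small slice to validate the pipeline
    field_messages: messages                # OpenAI-style messages with interleaved image content
```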
