base_model: axolotl-quants/Llama-4-Maverick-17B-128E-Linearized-bnb-nf4-bf16
model_type: Llama4ForConditionalGeneration
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

strict: false

plugins:
  - axolotl.integrations.liger.LigerPlugin

liger_glu_activation: true
liger_rms_norm: true
liger_layer_norm: true
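
# The Liger plugin registered above swaps the GLU activation, RMSNorm, and
# LayerNorm ops for Liger's fused Triton kernels, which should reduce
# activation memory and per-step time; dropping the three liger_* flags
# falls back to the stock Hugging Face implementations.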

llama4_linearized_experts: true
load_in_4bit: true
adapter: qlora
lora_r: 32
lora_alpha: 64
lora_target_modules:
  - self_attn.q_proj
  - self_attn.k_proj
  - self_attn.v_proj
  - self_attn.o_proj
  - shared_expert.gate_proj
  - shared_expert.up_proj
  - shared_expert.down_proj
  # - experts.gate_projs.[0-9]+$
  # - experts.up_projs.[0-9]+$
  # - experts.down_projs.[0-9]+$
lora_modules_to_save:
# - lm_head
# - embed_tokens
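
# The commented experts.* entries are regex patterns over the per-expert
# projections exposed by the linearized-experts checkpoint; uncommenting
# them would presumably attach LoRA adapters to all 128 routed experts as
# well, at a noticeably higher trainable-parameter and memory cost.
# Likewise, lm_head / embed_tokens only need to go into
# lora_modules_to_save if new or special tokens must be trained.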

chat_template: llama4
datasets:
  - path: mlabonne/FineTome-100k
    type: chat_template
    split: train[:20%]
    field_messages: conversations
    message_property_mappings:
      role: from
      content: value
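
# FineTome-100k stores its turns ShareGPT-style: a "conversations" list whose
# entries use "from"/"value" keys. The mapping above renames those to the
# "role"/"content" fields the llama4 chat template expects, and only the
# first 20% of the train split is tokenized.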

dataset_prepared_path: last_run_prepared
val_set_size: 0.0
output_dir: ./outputs/out

sequence_len: 4096
sample_packing: true
pad_to_sequence_len: true

gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 1e-4

bf16: true
tf32: true

logging_steps: 1
flash_attention: true

gradient_checkpointing: offload
gradient_checkpointing_kwargs:
  use_reentrant: false
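
# gradient_checkpointing: offload recomputes activations during the backward
# pass and parks the checkpointed tensors in host RAM, trading extra
# CPU<->GPU traffic for a much smaller activation footprint;
# use_reentrant: false selects the non-reentrant torch.utils.checkpoint
# variant, which is the one that composes cleanly with FSDP.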

warmup_steps: 20
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0
fsdp:
  - auto_wrap
  - full_shard
fsdp_config:
  fsdp_transformer_layer_cls_to_wrap: Llama4TextDecoderLayer
  fsdp_limit_all_gathers: true
  fsdp_sync_module_states: true
  fsdp_offload_params: true
  fsdp_use_orig_params: false
  fsdp_cpu_ram_efficient_loading: true
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_sharding_strategy: FULL_SHARD
special_tokens:
  pad_token: <|finetune_right_pad_id|>
  eos_token: <|eot|>
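
# A minimal launch sketch (the filename below is an assumption, not part of
# this config): save the file as e.g. llama4-maverick-qlora.yaml and run
#   axolotl train llama4-maverick-qlora.yaml
# or the module entry point
#   accelerate launch -m axolotl.cli.train llama4-maverick-qlora.yaml
# Even with the NF4-quantized base and fsdp_offload_params, expect a
# 128-expert Maverick checkpoint to use substantial host RAM across ranks.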