diff --git a/examples/glm45/README.md b/examples/glm45/README.md
new file mode 100644
index 0000000000..ec9be2f8b7
--- /dev/null
+++ b/examples/glm45/README.md
@@ -0,0 +1,48 @@
+# Finetune GLM4.5 with Axolotl
+
+[UNSTABLE]
+
+```bash
+# LoRA SFT (4xH200 @ 84GB/GPU)
+axolotl train examples/glm45/glm4.5-lora-fsdp2.yaml
+
+# FFT SFT (4xH200)
+# Currently fails with a checkpointing error on the backward pass
+# Without checkpointing => OOM
+axolotl train examples/glm45/glm4.5-fft-fsdp2.yaml
+```
+
+## Dataset
+
+In addition to the normal OpenAI Messages format, GLM4.5 supports an extra parameter for thinking in the assistant message.
+
+```json
+{
+  "role": "assistant",
+  "reasoning_content": "...",  // or keep the thinking inside `content`
+  "content": "...",
+}
+```
+
+Note:
+- The role name for tools in this template is `tool`.
+- You will see the following Axolotl WARNING. This is expected, as the template does not use the EOS token.
+```bash
+EOS token '<|endoftext|>' not found in chat_template. Please check if your template/EOS token is correct.
+```
+- Make sure you set the extra attributes below if needed:
+```yaml
+datasets:
+  - path: ...
+    type: chat_template
+    message_property_mappings:
+      role: role
+      content: content
+
+      # tool_calls: tool_calls  # uncomment if using tools
+      # reasoning_content: reasoning_content  # uncomment if using reasoning_content
+
+# Uncomment if training on the tool role (you would rarely, if ever, need this)
+# eot_tokens:
+#   - <|observation|>
+```
diff --git a/examples/glm45/glm4.5-fft-fsdp2.yaml b/examples/glm45/glm4.5-fft-fsdp2.yaml
new file mode 100644
index 0000000000..6dc62f04d6
--- /dev/null
+++ b/examples/glm45/glm4.5-fft-fsdp2.yaml
@@ -0,0 +1,59 @@
+base_model: zai-org/GLM-4.5-Air
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
+
+plugins:
+  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
+
+experimental_skip_move_to_device: true  # prevent OOM by NOT putting the model on GPU before sharding
+
+datasets:
+  - path: winglian/pirate-ultrachat-10k
+    type: chat_template
+dataset_prepared_path: last_run_prepared
+val_set_size: 0
+output_dir: ./outputs/qlora-out
+
+sequence_len: 2048
+sample_packing: true
+eval_sample_packing: true
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+gradient_accumulation_steps: 1
+micro_batch_size: 1
+num_epochs: 1
+optimizer: adamw_torch_4bit
+lr_scheduler: cosine
+learning_rate: 0.0002
+
+bf16: auto
+tf32: false
+
+# gradient_checkpointing: true
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+loss_watchdog_threshold: 5.0
+loss_watchdog_patience: 3
+
+warmup_ratio: 0.1
+evals_per_epoch: 1
+saves_per_epoch: 1
+weight_decay: 0.0
+special_tokens:
+
+fsdp_version: 2
+fsdp_config:
+  offload_params: false
+  cpu_ram_efficient_loading: true
+  auto_wrap_policy: TRANSFORMER_BASED_WRAP
+  transformer_layer_cls_to_wrap: Glm4MoeDecoderLayer
+  state_dict_type: SHARDED_STATE_DICT
+  reshard_after_forward: true
+  activation_checkpointing: true
diff --git a/examples/glm45/glm4.5-lora-fsdp2.yaml b/examples/glm45/glm4.5-lora-fsdp2.yaml
new file mode 100644
index 0000000000..bdef9465dc
--- /dev/null
+++ b/examples/glm45/glm4.5-lora-fsdp2.yaml
@@ -0,0 +1,74 @@
+base_model: zai-org/GLM-4.5-Air
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
+
+plugins:
+  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
+
+experimental_skip_move_to_device: true  # prevent OOM by NOT putting the model on GPU before sharding
+
+datasets:
+  - path: winglian/pirate-ultrachat-10k
+    type: chat_template
+dataset_prepared_path: last_run_prepared
+val_set_size: 0
+output_dir: ./outputs/qlora-out
+
+adapter: lora
+lora_model_dir:
+
+lora_r: 16
+lora_alpha: 32
+lora_dropout: 0.05
+lora_target_modules:
+  - gate_proj
+  - down_proj
+  - up_proj
+  - q_proj
+  - v_proj
+  - k_proj
+  - o_proj
+
+sequence_len: 2048
+sample_packing: true
+eval_sample_packing: true
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+gradient_accumulation_steps: 1
+micro_batch_size: 1
+num_epochs: 1
+optimizer: adamw_torch_4bit
+lr_scheduler: cosine
+learning_rate: 0.0002
+
+bf16: auto
+tf32: false
+
+# gradient_checkpointing: true
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+loss_watchdog_threshold: 5.0
+loss_watchdog_patience: 3
+
+warmup_ratio: 0.1
+evals_per_epoch: 1
+saves_per_epoch: 1
+weight_decay: 0.0
+special_tokens:
+
+fsdp_version: 2
+fsdp_config:
+  offload_params: false
+  cpu_ram_efficient_loading: true
+  auto_wrap_policy: TRANSFORMER_BASED_WRAP
+  transformer_layer_cls_to_wrap: Glm4MoeDecoderLayer
+  state_dict_type: SHARDED_STATE_DICT
+  reshard_after_forward: true
+  # activation_checkpointing: false
diff --git a/src/axolotl/common/architectures.py b/src/axolotl/common/architectures.py
index ce945e670f..616b4159bc 100644
--- a/src/axolotl/common/architectures.py
+++ b/src/axolotl/common/architectures.py
@@ -13,5 +13,7 @@
     "qwen2_moe": "Qwen2MoeSparseMoeBlock",
     "qwen3_moe": "Qwen3MoeSparseMoeBlock",
     "deepseek_v2": "DeepseekV2MoE",
-    "gpt_oss": "GptOssDecoderLayer",
+    "gpt_oss": "GptOssExperts",
+    "deepseek_v3": "DeepseekV3MoE",
+    "glm4_moe": "Glm4MoeMoE",
 }
diff --git a/src/axolotl/integrations/cut_cross_entropy/README.md b/src/axolotl/integrations/cut_cross_entropy/README.md
index 02e4e6686c..ac67ebf935 100644
--- a/src/axolotl/integrations/cut_cross_entropy/README.md
+++ b/src/axolotl/integrations/cut_cross_entropy/README.md
@@ -34,6 +34,7 @@ plugins:
 - arcee
 - cohere
 - cohere2
+- deepseek_v3
 - gemma
 - gemma2
 - gemma3
@@ -42,6 +43,7 @@
 - gemma3n_text
 - glm
 - glm4
+- glm_moe
 - gpt_oss
 - granite
 - granitemoe
diff --git a/src/axolotl/monkeypatch/multipack.py b/src/axolotl/monkeypatch/multipack.py
index 7df9877d78..791f551bc7 100644
--- a/src/axolotl/monkeypatch/multipack.py
+++ b/src/axolotl/monkeypatch/multipack.py
@@ -35,6 +35,7 @@
     "deepseek_v3",
     "glm",
     "glm4",
+    "glm4_moe",
     "smollm3",
     "gpt_oss",
     "arcee",
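
For reference alongside the new README, a complete training sample for the `chat_template` dataset type might look like the sketch below. It is illustrative only: the conversation text is invented, and it assumes the conversation is stored under a `messages` field; only the `role`, `content`, and `reasoning_content` keys come from the README in this diff.

```json
{
  "messages": [
    {
      "role": "user",
      "content": "What is the capital of France?"
    },
    {
      "role": "assistant",
      "reasoning_content": "The user asks a simple factual question; the answer is Paris.",
      "content": "The capital of France is Paris."
    }
  ]
}
```

If samples also contain tool responses, the README notes that the role name must be `tool`, and that `tool_calls` / `reasoning_content` should be mapped explicitly via `message_property_mappings`.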