48 changes: 48 additions & 0 deletions examples/glm45/README.md
@@ -0,0 +1,48 @@
# Finetune GLM4.5 with Axolotl

[UNSTABLE]

```bash
# LoRA SFT (4xH200 @ 84GB/GPU)
axolotl train examples/glm45/glm4.5-lora-fsdp2.yaml

# FFT SFT (4xH200)
# Known issue: activation checkpointing currently errors on the backward pass,
# but disabling checkpointing causes OOM
axolotl train examples/glm45/glm4.5-fft-fsdp2.yaml
```

## Dataset

In addition to the standard OpenAI Messages format, GLM4.5 supports an extra parameter for thinking in the assistant turn.

```json
{
  "role": "assistant",
  "reasoning_content": "...", // or include <think>...</think> in `content`
  "content": "..."
}
```
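
Put together, a training row holds the entire conversation. Below is a minimal sketch of one row as a Python dict, assuming the default `messages` field for the `chat_template` type; the field values are hypothetical:

```python
# One hypothetical dataset row for chat_template training.
sample = {
    "messages": [
        {"role": "user", "content": "What is 2 + 2?"},
        {
            "role": "assistant",
            # Thinking trace kept separate from the visible reply.
            "reasoning_content": "The user wants a simple sum: 2 + 2 = 4.",
            "content": "2 + 2 = 4.",
        },
    ]
}
```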

Note:
- The role name for tools in this template is `tool`.
- You will see the following Axolotl warning. This is expected, as the template does not use an EOS token.
```bash
EOS token '<|endoftext|>' not found in chat_template. Please check if your template/EOS token is correct.
```
- Make sure to set the extra attributes below if needed:
```yaml
datasets:
- path: ...
type: chat_template
message_property_mappings:
role: role
content: content

# tool_calls: tool_calls # uncomment if using tools
      # reasoning_content: reasoning_content # uncomment if your data includes reasoning

# Uncomment if training on the tool role (you would rarely, if ever, need this)
# eot_tokens:
# - <|observation|>
```
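
To check how the template actually renders `reasoning_content` before kicking off a run, you can render a sample conversation with the tokenizer. A quick sketch, assuming the GLM4.5 chat template consumes the field as described above:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("zai-org/GLM-4.5-Air")

messages = [
    {"role": "user", "content": "Hello!"},
    {
        "role": "assistant",
        "reasoning_content": "A short greeting is enough.",
        "content": "Hi there!",
    },
]

# Render to text (no tokenization) to inspect where the thinking block lands.
print(tokenizer.apply_chat_template(messages, tokenize=False))
```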
59 changes: 59 additions & 0 deletions examples/glm45/glm4.5-fft-fsdp2.yaml
@@ -0,0 +1,59 @@
base_model: zai-org/GLM-4.5-Air
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

plugins:
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

experimental_skip_move_to_device: true # prevent OOM by NOT putting model to GPU before sharding

datasets:
- path: winglian/pirate-ultrachat-10k
type: chat_template
dataset_prepared_path: last_run_prepared
val_set_size: 0
output_dir: ./outputs/fft-out

sequence_len: 2048
sample_packing: true
eval_sample_packing: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 1
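# low-memory AdamW variant that keeps optimizer states in 4-bit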
optimizer: adamw_torch_4bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

# gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:

fsdp_version: 2
fsdp_config:
offload_params: false
cpu_ram_efficient_loading: true
auto_wrap_policy: TRANSFORMER_BASED_WRAP
transformer_layer_cls_to_wrap: Glm4MoeDecoderLayer
state_dict_type: SHARDED_STATE_DICT
reshard_after_forward: true
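  # recompute activations during backward instead of storing them (trades compute for memory)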
activation_checkpointing: true
74 changes: 74 additions & 0 deletions examples/glm45/glm4.5-lora-fsdp2.yaml
@@ -0,0 +1,74 @@
base_model: zai-org/GLM-4.5-Air
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

plugins:
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

experimental_skip_move_to_device: true # prevent OOM by NOT putting model to GPU before sharding

datasets:
- path: winglian/pirate-ultrachat-10k
type: chat_template
dataset_prepared_path: last_run_prepared
val_set_size: 0
output_dir: ./outputs/lora-out

adapter: lora
lora_model_dir:

lora_r: 16
lora_alpha: 32
lora_dropout: 0.05
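# apply LoRA to the attention and MLP projection matrices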
lora_target_modules:
- gate_proj
- down_proj
- up_proj
- q_proj
- v_proj
- k_proj
- o_proj

sequence_len: 2048
sample_packing: true
eval_sample_packing: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_torch_4bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

# gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:

fsdp_version: 2
fsdp_config:
offload_params: false
cpu_ram_efficient_loading: true
auto_wrap_policy: TRANSFORMER_BASED_WRAP
transformer_layer_cls_to_wrap: Glm4MoeDecoderLayer
state_dict_type: SHARDED_STATE_DICT
reshard_after_forward: true
# activation_checkpointing: false
4 changes: 3 additions & 1 deletion src/axolotl/common/architectures.py
@@ -13,5 +13,7 @@
"qwen2_moe": "Qwen2MoeSparseMoeBlock",
"qwen3_moe": "Qwen3MoeSparseMoeBlock",
"deepseek_v2": "DeepseekV2MoE",
"gpt_oss": "GptOssDecoderLayer",
"gpt_oss": "GptOssExperts",
"deepseek_v3": "DeepseekV3MoE",
"glm4_moe": "Glm4MoeMoE",
}
2 changes: 2 additions & 0 deletions src/axolotl/integrations/cut_cross_entropy/README.md
@@ -34,6 +34,7 @@ plugins:
- arcee
- cohere
- cohere2
- deepseek_v3
- gemma
- gemma2
- gemma3
@@ -42,6 +43,7 @@
- gemma3n_text
- glm
- glm4
- glm4_moe
- gpt_oss
- granite
- granitemoe
1 change: 1 addition & 0 deletions src/axolotl/monkeypatch/multipack.py
@@ -35,6 +35,7 @@
"deepseek_v3",
"glm",
"glm4",
"glm4_moe",
"smollm3",
"gpt_oss",
"arcee",