Skip to content

Commit 2586ae5

Browse files
committed
initial GLM-4.5 integration
1 parent 69d1c58 commit 2586ae5

File tree

4 files changed

+14
-4
lines changed

4 files changed

+14
-4
lines changed

src/llama-arch.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ enum llm_arch {
6666
LLM_ARCH_DEEPSEEK2,
6767
LLM_ARCH_CHATGLM,
6868
LLM_ARCH_GLM4,
69+
LLM_ARCH_GLM4_MOE,
6970
LLM_ARCH_BITNET,
7071
LLM_ARCH_T5,
7172
LLM_ARCH_T5ENCODER,

src/llama-graph.cpp

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -749,8 +749,10 @@ ggml_tensor * llm_graph_context::build_ffn(
749749

750750
if (down) {
751751
cur = build_lora_mm(down, cur);
752-
if (arch == LLM_ARCH_GLM4) {
753-
// GLM4 seems to have numerical issues with half-precision accumulators
752+
if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE) {
753+
// GLM4 FFNs seem to have numerical issues with half-precision accumulators
754+
// -- ref: https://github.com/ggml-org/llama.cpp/pull/13101
755+
// (GLM4_MOE uses some GLM4 FFNs, so we need to match it too)
754756
ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
755757
}
756758
}
@@ -1391,8 +1393,10 @@ ggml_tensor * llm_graph_context::build_attn(
13911393

13921394
if (wo) {
13931395
cur = build_lora_mm(wo, cur);
1394-
if (arch == LLM_ARCH_GLM4) {
1395-
// GLM4 seems to have numerical issues with half-precision accumulators
1396+
if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE) {
1397+
// GLM4's attention output projection seems to have numerical issues with half-precision accumulators
1398+
// -- ref: https://github.com/ggml-org/llama.cpp/pull/13101
1399+
// (GLM4_MOE uses some GLM4 FFNs, so we need to match it too)
13961400
ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
13971401
}
13981402
}

src/llama-model.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,8 @@ const char * llm_type_name(llm_type type) {
111111
case LLM_TYPE_30B_A3B: return "30B.A3B";
112112
case LLM_TYPE_235B_A22B: return "235B.A22B";
113113
case LLM_TYPE_300B_A47B: return "300B.A47B";
114+
case LLM_TYPE_355B_A32B: return "355B.A32B (GLM-4.5)";
115+
case LLM_TYPE_106B_A12B: return "106B.A12B (GLM-4.5)";
114116
case LLM_TYPE_E2B: return "E2B";
115117
case LLM_TYPE_E4B: return "E4B";
116118
default: return "?B";
@@ -18153,6 +18155,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
1815318155
case LLM_ARCH_PLM:
1815418156
case LLM_ARCH_CHATGLM:
1815518157
case LLM_ARCH_GLM4:
18158+
case LLM_ARCH_GLM4_MOE:
1815618159
case LLM_ARCH_GRANITE:
1815718160
case LLM_ARCH_GRANITE_MOE:
1815818161
case LLM_ARCH_GRANITE_HYBRID:

src/llama-model.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,8 @@ enum llm_type {
103103
LLM_TYPE_30B_A3B,
104104
LLM_TYPE_235B_A22B,
105105
LLM_TYPE_300B_A47B, // Ernie MoE big
106+
LLM_TYPE_355B_A32B, // GLM-4.5
107+
LLM_TYPE_106B_A12B, // GLM-4.5-Air
106108
LLM_TYPE_E2B,
107109
LLM_TYPE_E4B,
108110
};

0 commit comments

Comments
 (0)