Set the GLM4 attention-output matmul (blk.*.attn_output.weight, kqv_out-*) to GGML_PREC_F32 to fix infinity values in the output

0cc4m · 0cc4m · commit adefa9853c29 · 2025-05-19T19:09:13.000+02:00
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
@@ -1488,6 +1488,10 @@ ggml_tensor * llm_graph_context::build_attn(
 
     if (wo) {
         cur = build_lora_mm(wo, cur);
+        if (arch == LLM_ARCH_GLM4) {
+            // GLM4 seems to have numerical issues with half-precision accumulators
+            ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
+        }
     }
 
     if (wo_b) {

Original file line number	Diff line number	Diff line change
`@@ -1488,6 +1488,10 @@ ggml_tensor * llm_graph_context::build_attn(`
`1488`	`1488`
`1489`	`1489`	`if (wo) {`
`1490`	`1490`	`cur = build_lora_mm(wo, cur);`
	`1491`	`+ if (arch == LLM_ARCH_GLM4) {`
	`1492`	`+ // GLM4 seems to have numerical issues with half-precision accumulators`
	`1493`	`+ ggml_mul_mat_set_prec(cur, GGML_PREC_F32);`
	`1494`	`+ }`
`1491`	`1495`	`}`
`1492`	`1496`
`1493`	`1497`	`if (wo_b) {`