
Commit 5d38297

glm4 unclamp for all except vulkan
1 parent 9fdec02 commit 5d38297

2 files changed: +8 −6 lines changed

gpttype_adapter.cpp

Lines changed: 7 additions & 5 deletions
@@ -1907,16 +1907,18 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
         printf("Warning: Only GGUF models can use max context above 16k. Max context lowered to 16k.\n");
         clamped_max_context_length = 16384;
     }
-    if (isGguf && file_format_meta.model_architecture == GGUFArch::ARCH_GLM4 && kcpp_data->n_batch > 16) {
+
+    #if defined(GGML_USE_VULKAN)
+    if (isGguf && file_format_meta.model_architecture == GGUFArch::ARCH_GLM4 && kcpp_data->n_ubatch > 16) {
         if(debugmode==1)
         {
-            printf("GLM-4 is broken on larger batch sizes. Clamp ignored in debug.\n");
+            printf("GLM-4 is broken on larger batch sizes in Vulkan. Clamp ignored in debug.\n");
         } else {
-            printf("GLM-4 is broken on larger batch sizes. Clamping batch size to 16.\n");
-            kcpp_data->n_batch = kcpp_data->n_ubatch = 16;
+            printf("GLM-4 is broken on larger batch sizes in Vulkan. Clamping ubatch size to 16.\n");
+            kcpp_data->n_ubatch = 16;
         }
-
     }
+    #endif
 
     kcpp_data->n_ctx = clamped_max_context_length;
     max_context_limit_at_load = clamped_max_context_length;
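The effect of this hunk is that the GLM-4 micro-batch clamp now applies only in Vulkan builds; on other backends the user's configured batch sizes are left alone. A minimal standalone sketch of that pattern is below, assuming a C++11 compiler and using simplified stand-in types (KcppParams, Arch, apply_glm4_clamp are illustrative names, not the actual koboldcpp structures); compile with -DGGML_USE_VULKAN to exercise the clamp path.

#include <cstdio>

// Simplified stand-ins for the real koboldcpp data (assumption for illustration only).
struct KcppParams {
    int n_batch  = 512;
    int n_ubatch = 512;
};

enum class Arch { GLM4, OTHER };

// Clamp the micro-batch size for GLM-4 only when building against Vulkan,
// mirroring the #if defined(GGML_USE_VULKAN) guard added in this commit.
static void apply_glm4_clamp(KcppParams &p, Arch arch, bool is_gguf, int debugmode) {
#if defined(GGML_USE_VULKAN)
    if (is_gguf && arch == Arch::GLM4 && p.n_ubatch > 16) {
        if (debugmode == 1) {
            printf("GLM-4 is broken on larger batch sizes in Vulkan. Clamp ignored in debug.\n");
        } else {
            printf("GLM-4 is broken on larger batch sizes in Vulkan. Clamping ubatch size to 16.\n");
            p.n_ubatch = 16;
        }
    }
#else
    // Non-Vulkan backends: no clamp, parameters are untouched.
    (void)p; (void)arch; (void)is_gguf; (void)debugmode;
#endif
}

int main() {
    KcppParams params;
    apply_glm4_clamp(params, Arch::GLM4, /*is_gguf=*/true, /*debugmode=*/0);
    printf("n_batch=%d n_ubatch=%d\n", params.n_batch, params.n_ubatch);
    return 0;
}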

klite.embd

Lines changed: 1 addition & 1 deletion
@@ -3440,7 +3440,7 @@ Current version indicated by LITEVER below.
     "name":"GLM-4",
     "user":"<|user|>\\n",
     "user_end":"",
-    "assistant":"<|assistant|>",
+    "assistant":"<|assistant|>\\n",
     "assistant_end":"",
     "system":"<|system|>\\n",
     "system_end":"",

0 commit comments
