From e49cfff302f6df60e81b2854475e7b97f84848c4 Mon Sep 17 00:00:00 2001
From: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Date: Fri, 7 Nov 2025 13:56:24 +0200
Subject: [PATCH] Fix PPL increase caused by mmq_id

---
 ggml/src/ggml-cuda/mmq_id_common.cuh | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cuda/mmq_id_common.cuh b/ggml/src/ggml-cuda/mmq_id_common.cuh
index 89baa31b6..0bb1a1a81 100644
--- a/ggml/src/ggml-cuda/mmq_id_common.cuh
+++ b/ggml/src/ggml-cuda/mmq_id_common.cuh
@@ -3960,7 +3960,10 @@ template <ggml_type type, int mmq_x>
 static void launch_mul_mat_q_id(ggml_backend_cuda_context & ctx, const mmq_args_id & args, cudaStream_t stream) {
     const int id = ggml_cuda_get_device();
     const int cc = ggml_cuda_info().devices[id].cc;
-    const int nsm = ggml_cuda_info().devices[id].nsm;
+    const int nsm_max = ggml_cuda_info().devices[id].nsm;
+    int nsm = 1;
+    // Round nsm up to the smallest power of two >= nsm_max; rounding down instead would be: while (nsm*2 <= nsm_max) nsm *= 2;
+    while (nsm < nsm_max) nsm *= 2;
     const int warp_size = ggml_cuda_get_physical_warp_size_host(); //ggml_cuda_info().devices[id].warp_size;
     const int nwarps = mmq_get_nwarps_host(cc, warp_size);
     const int mmq_y = get_mmq_y_host(cc);