Commit 9da98e3

Port of ggml-org/llama.cpp#14741 to ollama

1 parent 191d942

File tree

4 files changed, +63 -5 lines

llama/patches/0019-metal-add-mean-kernel-14267.patch

Lines changed: 1 addition & 1 deletion
@@ -16,7 +16,7 @@ ggml-ci
  2 files changed, 67 insertions(+), 14 deletions(-)
 
 diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
-index ee4f2dcb..f20f5615 100644
+index a9eeebc6..110c9ece 100644
 --- a/ggml/src/ggml-metal/ggml-metal.m
 +++ b/ggml/src/ggml-metal/ggml-metal.m
 @@ -489,6 +489,7 @@ enum ggml_metal_kernel_type {

llama/patches/0020-CUDA-add-mean-operation-14313.patch

Lines changed: 1 addition & 1 deletion
@@ -52,7 +52,7 @@ index 64fb4ff4..5b9a0fe3 100644
 static __device__ __forceinline__ float warp_reduce_max(float x) {
 #pragma unroll
 diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index 4c829153..9e64e5ae 100644
+index d6960174..2b9fabf4 100644
 --- a/ggml/src/ggml-cuda/ggml-cuda.cu
 +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
 @@ -35,6 +35,7 @@
Lines changed: 52 additions & 0 deletions
@@ -0,0 +1,52 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Oliver Simons <[email protected]>
+Date: Fri, 18 Jul 2025 13:35:32 +0200
+Subject: [PATCH] cuda : Fix Gemma3n not executed as CUDA_GRAPH on NVGPUs
+ (#14741)
+
+* Fix Gemma3n not executed as CUDA_GRAPH on NVGPUs
+
+Gemma3n uses matrix-matrix addition as part of its input processing,
+wrongly triggering CUDA_GRAPH disablement on NVGPUs even when a batch
+size of 1 is used.
+
+* Exclude `project_per_layer_input` by matching node names
+
+This ensures that all other graphs which don't exhibit this pattern do
+not have their behavior changed.
+
+* Revert unnecessary formatting changes
+---
+ ggml/src/ggml-cuda/ggml-cuda.cu | 12 +++++++++---
+ 1 file changed, 9 insertions(+), 3 deletions(-)
+
+diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
+index 2b9fabf4..c1dfee76 100644
+--- a/ggml/src/ggml-cuda/ggml-cuda.cu
++++ b/ggml/src/ggml-cuda/ggml-cuda.cu
+@@ -2474,6 +2474,9 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
+     // Loop over nodes in GGML graph to obtain info needed for CUDA graph
+     cuda_ctx->cuda_graph->cpy_dest_ptrs.clear();
+ 
++    const std::string gemma3n_per_layer_proj_src0_name = "inp_per_layer_selected";
++    const std::string gemma3n_per_layer_proj_src1_name = "per_layer_proj";
++
+     for (int i = 0; i < cgraph->n_nodes; i++) {
+         ggml_tensor * node = cgraph->nodes[i];
+ 
+@@ -2495,9 +2498,12 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
+ #endif
+         }
+ 
+-        if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1) {
+-            // disable CUDA graphs for batch size > 1 for now.
+-            // Changes in batch size or context size can cause changes to the grid size of some kernels.
++        if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1 && (node->src[0] ? node->src[0]->name != gemma3n_per_layer_proj_src0_name : true) && (node->src[1] ? node->src[1]->name != gemma3n_per_layer_proj_src1_name : true)) {
++            // disable CUDA graphs for batch size > 1 for now, excluding the matrix-matrix addition of Gemma3n's `project_per_layer_input` operation,
++            // which is matched by node name. See
++            // https://github.com/ggml-org/llama.cpp/blob/f9a31eea06a859e34cecb88b4d020c7f03d86cc4/src/llama-model.cpp#L10199-L10241 and
++            // https://github.com/huggingface/transformers/blob/bda75b4011239d065de84aa3e744b67ebfa7b245/src/transformers/models/gemma3n/modeling_gemma3n.py#L1773.
++            // Generally, changes in batch size or context size can cause changes to the grid size of some kernels.
+             use_cuda_graph = false;
+ #ifndef NDEBUG
+             GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
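
The substance of the patch is the widened `if` above: a `GGML_OP_ADD` node whose `src[1]` spans more than one row (`ne[1] > 1`) normally forces CUDA graph capture off, unless an operand carries one of Gemma3n's `project_per_layer_input` node names. Below is a minimal, self-contained sketch of that predicate in isolation; `node_view` is a hypothetical stand-in for `ggml_tensor`, reduced to the two fields the check actually reads, and is not part of ggml.

// Minimal sketch (not the ggml API) of the node check added by this patch.
#include <iostream>
#include <string>

struct node_view {          // hypothetical stand-in for ggml_tensor
    std::string name;       // node name assigned at graph-build time
    long long   ne1;        // ne[1]: number of rows the operand spans
};

// Mirrors the patched condition: an ADD with src1->ne[1] > 1 disables CUDA
// graph capture unless an operand carries a Gemma3n per-layer-projection name.
static bool add_disables_cuda_graph(const node_view *src0, const node_view *src1) {
    if (!src1 || src1->ne1 <= 1) {
        return false;       // single-row addition: never a reason to disable
    }
    const bool src0_is_gemma3n = src0 && src0->name == "inp_per_layer_selected";
    const bool src1_is_gemma3n = src1->name == "per_layer_proj";
    return !src0_is_gemma3n && !src1_is_gemma3n;
}

int main() {
    node_view sel  = {"inp_per_layer_selected", 256};
    node_view proj = {"per_layer_proj", 256};
    node_view misc = {"some_other_add_operand", 256};

    std::cout << add_disables_cuda_graph(&sel,  &proj) << "\n";  // 0: excluded, capture stays on
    std::cout << add_disables_cuda_graph(&misc, &misc) << "\n";  // 1: heuristic still disables capture
}

Because the two name tests in the patched condition are joined with `&&`, a match on either operand is enough to keep graph capture enabled; in Gemma3n's graph both names match at once, so the distinction does not matter in practice.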

ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 9 additions & 3 deletions
@@ -2474,6 +2474,9 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
     // Loop over nodes in GGML graph to obtain info needed for CUDA graph
     cuda_ctx->cuda_graph->cpy_dest_ptrs.clear();
 
+    const std::string gemma3n_per_layer_proj_src0_name = "inp_per_layer_selected";
+    const std::string gemma3n_per_layer_proj_src1_name = "per_layer_proj";
+
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
 
@@ -2495,9 +2498,12 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
 #endif
         }
 
-        if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1) {
-            // disable CUDA graphs for batch size > 1 for now.
-            // Changes in batch size or context size can cause changes to the grid size of some kernels.
+        if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1 && (node->src[0] ? node->src[0]->name != gemma3n_per_layer_proj_src0_name : true) && (node->src[1] ? node->src[1]->name != gemma3n_per_layer_proj_src1_name : true)) {
+            // disable CUDA graphs for batch size > 1 for now, excluding the matrix-matrix addition of Gemma3n's `project_per_layer_input` operation,
+            // which is matched by node name. See
+            // https://github.com/ggml-org/llama.cpp/blob/f9a31eea06a859e34cecb88b4d020c7f03d86cc4/src/llama-model.cpp#L10199-L10241 and
+            // https://github.com/huggingface/transformers/blob/bda75b4011239d065de84aa3e744b67ebfa7b245/src/transformers/models/gemma3n/modeling_gemma3n.py#L1773.
+            // Generally, changes in batch size or context size can cause changes to the grid size of some kernels.
             use_cuda_graph = false;
 #ifndef NDEBUG
             GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
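
For context on why matching by name is dependable here: every ggml tensor carries a name assigned when the graph is built, and llama.cpp's Gemma3n builder assigns exactly the two strings the backend compares against (see the llama-model.cpp link in the comment above). The toy program below illustrates that naming pattern using ggml's public `ggml_set_name`, `ggml_new_tensor_2d`, and `ggml_add`; the shapes, memory sizing, and the graph itself are illustrative assumptions, not Gemma3n's actual builder code.

// Toy example of tagging graph nodes by name so a backend can recognize
// them later. The API calls are real ggml; the graph is illustrative.
#include "ggml.h"

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16 * 1024 * 1024,   // arbitrary scratch size for this toy
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,               // only the graph structure is needed here
    };
    struct ggml_context * ctx = ggml_init(params);

    // Hypothetical shapes; Gemma3n's real dimensions differ.
    struct ggml_tensor * sel  = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 256);
    struct ggml_tensor * proj = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 256);

    // The two names the patched CUDA backend matches against:
    ggml_set_name(sel,  "inp_per_layer_selected");
    ggml_set_name(proj, "per_layer_proj");

    // The multi-row ADD (ne[1] > 1) that would otherwise trip the
    // batch-size heuristic and disable CUDA graph capture:
    struct ggml_tensor * out = ggml_add(ctx, sel, proj);
    (void) out;

    ggml_free(ctx);
    return 0;
}

Matching on names rather than graph topology keeps the exclusion narrowly scoped: any other model whose ADD nodes lack these names still hits the conservative batch-size check unchanged.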
