@@ -2589,6 +2589,7 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
 
     // Loop over nodes in GGML graph to obtain info needed for CUDA graph
     cuda_ctx->cuda_graph->cpy_dest_ptrs.clear();
+    std::uint8_t batch_size_counter = 0;
 
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
@@ -2612,12 +2613,18 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
         }
 
         if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1) {
-            // disable CUDA graphs for batch size > 1 for now.
-            // Changes in batch size or context size can cause changes to the grid size of some kernels.
-            use_cuda_graph = false;
-#ifndef NDEBUG
-            GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
-#endif
+            // Disable CUDA graphs for batch size > 1 for now. This heuristic still allows CUDA graphs for
+            // Gemma3n, which uses a single matrix-matrix addition as part of `project_per_layer_input`, while
+            // detecting batched execution for graphs containing more than one such GGML_OP_ADD node. See also
+            // https://github.com/huggingface/transformers/blob/bda75b4011239d065de84aa3e744b67ebfa7b245/src/transformers/models/gemma3n/modeling_gemma3n.py#L1773.
+            // In general, changes in batch size or context size can cause changes to the grid size of some kernels.
+            ++batch_size_counter;
+            if (batch_size_counter > 1) {
+                use_cuda_graph = false;
+#ifndef NDEBUG
+                GGML_LOG_DEBUG("%s: disabling CUDA graphs due to repeated batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
+#endif
+            }
         }
 
         if (node->op == GGML_OP_CPY) {