From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Oliver Simons <[email protected]>
Date: Fri, 18 Jul 2025 13:35:32 +0200
Subject: [PATCH] cuda : Fix Gemma3n not executed as CUDA_GRAPH on NVGPUs
 (#14741)

* Fix Gemma3n not executed as CUDA_GRAPH on NVGPUs

Gemma3n uses matrix-matrix addition as part of its input processing,
wrongly triggering CUDA_GRAPH disablement on NVGPUs even when a batch
size of 1 is used.

* Exclude `project_per_layer_input` by matching node names

This ensures that all other graphs that don't exhibit this pattern do
not have their behavior changed.

* Revert unnecessary formatting changes
---
 ggml/src/ggml-cuda/ggml-cuda.cu | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 2b9fabf4..c1dfee76 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2474,6 +2474,9 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
     // Loop over nodes in GGML graph to obtain info needed for CUDA graph
     cuda_ctx->cuda_graph->cpy_dest_ptrs.clear();
 
+    const std::string gemma3n_per_layer_proj_src0_name = "inp_per_layer_selected";
+    const std::string gemma3n_per_layer_proj_src1_name = "per_layer_proj";
+
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
 
@@ -2495,9 +2498,12 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
 #endif
         }
 
-        if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1) {
-            // disable CUDA graphs for batch size > 1 for now.
-            // Changes in batch size or context size can cause changes to the grid size of some kernels.
+        if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1 && (node->src[0] ? node->src[0]->name != gemma3n_per_layer_proj_src0_name : true) && (node->src[1] ? node->src[1]->name != gemma3n_per_layer_proj_src1_name : true)) {
+            // disable CUDA graphs for batch size > 1 for now while excluding the matrix-matrix addition as part of Gemma3n's `project_per_layer_input` operation
+            // by means of matching node names. See
+            // https://github.com/ggml-org/llama.cpp/blob/f9a31eea06a859e34cecb88b4d020c7f03d86cc4/src/llama-model.cpp#L10199-L10241 and
+            // https://github.com/huggingface/transformers/blob/bda75b4011239d065de84aa3e744b67ebfa7b245/src/transformers/models/gemma3n/modeling_gemma3n.py#L1773,
+            // Generally, changes in batch size or context size can cause changes to the grid size of some kernels.
             use_cuda_graph = false;
 #ifndef NDEBUG
             GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
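
A note on the condition above: because of the ternary chain, a batched ADD node is exempted when either source name matches, not only when both do. Below is a minimal standalone sketch of that predicate, not the patch itself: the `tensor` struct, the `GGML_OP_ADD` value, the helper name `add_disables_cuda_graph`, and the node name "proj_add" are all simplified stand-ins for the real ggml types.

// Standalone sketch (not part of the patch): mirrors the exemption logic
// above with a simplified stand-in struct instead of ggml_tensor.
#include <cstdint>
#include <cstdio>
#include <string>

enum { GGML_OP_ADD = 1 }; // placeholder value, not ggml's real enum

struct tensor {
    std::string name;
    int64_t     ne[4];   // dimensions; src[1]->ne[1] > 1 indicates batch size > 1
    tensor *    src[2];  // source operands (may be null)
    int         op;
};

// Returns true when an ADD node should disable CUDA graph capture.
// Mirrors the patched condition: a batched ADD disables graphs unless
// src0 is named "inp_per_layer_selected" or src1 is named "per_layer_proj";
// matching either name exempts the node, as in the ternary chain above.
static bool add_disables_cuda_graph(const tensor * node) {
    static const std::string src0_name = "inp_per_layer_selected";
    static const std::string src1_name = "per_layer_proj";

    if (node->op != GGML_OP_ADD || !node->src[1] || node->src[1]->ne[1] <= 1) {
        return false; // not a batched ADD; this check does not apply
    }
    const bool src0_matches = node->src[0] && node->src[0]->name == src0_name;
    const bool src1_matches = node->src[1]->name == src1_name;
    return !(src0_matches || src1_matches);
}

int main() {
    tensor src0 = {"inp_per_layer_selected", {8, 4, 1, 1}, {nullptr, nullptr}, 0};
    tensor src1 = {"per_layer_proj",         {8, 4, 1, 1}, {nullptr, nullptr}, 0};
    tensor add  = {"proj_add",               {8, 4, 1, 1}, {&src0, &src1}, GGML_OP_ADD};
    // Gemma3n's per-layer-projection ADD keeps CUDA graphs enabled: prints "no".
    std::printf("disables graphs: %s\n", add_disables_cuda_graph(&add) ? "yes" : "no");
    return 0;
}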