Commit 9da98e3

Port of ggml-org/llama.cpp#14741 to ollama

1 parent 191d942

File tree

4 files changed, +63 -5 lines

llama/patches/0019-metal-add-mean-kernel-14267.patch

Lines changed: 1 addition & 1 deletion
@@ -16,7 +16,7 @@ ggml-ci
  2 files changed, 67 insertions(+), 14 deletions(-)
 
 diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
-index ee4f2dcb..f20f5615 100644
+index a9eeebc6..110c9ece 100644
 --- a/ggml/src/ggml-metal/ggml-metal.m
 +++ b/ggml/src/ggml-metal/ggml-metal.m
 @@ -489,6 +489,7 @@ enum ggml_metal_kernel_type {

llama/patches/0020-CUDA-add-mean-operation-14313.patch

Lines changed: 1 addition & 1 deletion
@@ -52,7 +52,7 @@ index 64fb4ff4..5b9a0fe3 100644
 static __device__ __forceinline__ float warp_reduce_max(float x) {
 #pragma unroll
 diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index 4c829153..9e64e5ae 100644
+index d6960174..2b9fabf4 100644
 --- a/ggml/src/ggml-cuda/ggml-cuda.cu
 +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
 @@ -35,6 +35,7 @@
Lines changed: 52 additions & 0 deletions
@@ -0,0 +1,52 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Oliver Simons <[email protected]>
+Date: Fri, 18 Jul 2025 13:35:32 +0200
+Subject: [PATCH] cuda : Fix Gemma3n not executed as CUDA_GRAPH on NVGPUs
+ (#14741)
+
+* Fix Gemma3n not executed as CUDA_GRAPH on NVGPUs
+
+Gemma3n uses matrix-matrix addition as part of its input processing,
+wrongly triggering CUDA_GRAPH disablement on NVGPUs even when a batch
+size of 1 is used.
+
+* Exclude `project_per_layer_input` by matching node names
+
+This ensures that all other graphs which don't exhibit this pattern do
+not have their behavior changed.
+
+* Revert unnecessary formatting changes
+---
+ ggml/src/ggml-cuda/ggml-cuda.cu | 12 +++++++++---
+ 1 file changed, 9 insertions(+), 3 deletions(-)
+
+diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
+index 2b9fabf4..c1dfee76 100644
+--- a/ggml/src/ggml-cuda/ggml-cuda.cu
++++ b/ggml/src/ggml-cuda/ggml-cuda.cu
+@@ -2474,6 +2474,9 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
+     // Loop over nodes in GGML graph to obtain info needed for CUDA graph
+     cuda_ctx->cuda_graph->cpy_dest_ptrs.clear();
+ 
++    const std::string gemma3n_per_layer_proj_src0_name = "inp_per_layer_selected";
++    const std::string gemma3n_per_layer_proj_src1_name = "per_layer_proj";
++
+     for (int i = 0; i < cgraph->n_nodes; i++) {
+         ggml_tensor * node = cgraph->nodes[i];
+ 
+@@ -2495,9 +2498,12 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
+ #endif
+         }
+ 
+-        if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1) {
+-            // disable CUDA graphs for batch size > 1 for now.
+-            // Changes in batch size or context size can cause changes to the grid size of some kernels.
++        if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1 && (node->src[0] ? node->src[0]->name != gemma3n_per_layer_proj_src0_name : true) && (node->src[1] ? node->src[1]->name != gemma3n_per_layer_proj_src1_name : true)) {
++            // disable CUDA graphs for batch size > 1 for now, excluding the matrix-matrix addition of Gemma3n's `project_per_layer_input` operation,
++            // which is matched by node name. See
++            // https://github.com/ggml-org/llama.cpp/blob/f9a31eea06a859e34cecb88b4d020c7f03d86cc4/src/llama-model.cpp#L10199-L10241 and
++            // https://github.com/huggingface/transformers/blob/bda75b4011239d065de84aa3e744b67ebfa7b245/src/transformers/models/gemma3n/modeling_gemma3n.py#L1773.
++            // Generally, changes in batch size or context size can cause changes to the grid size of some kernels.
+             use_cuda_graph = false;
+ #ifndef NDEBUG
+             GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
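
The substance of the patch is the widened `if` above: a `GGML_OP_ADD` node whose `src[1]` spans more than one row (`ne[1] > 1`) normally forces CUDA graph capture off, unless an operand carries one of Gemma3n's `project_per_layer_input` node names. Below is a minimal, self-contained sketch of that predicate in isolation; `node_view` is a hypothetical stand-in for `ggml_tensor`, reduced to the two fields the check actually reads, and is not part of ggml.

// Minimal sketch (not the ggml API) of the node check added by this patch.
#include <iostream>
#include <string>

struct node_view {          // hypothetical stand-in for ggml_tensor
    std::string name;       // node name assigned at graph-build time
    long long   ne1;        // ne[1]: number of rows the operand spans
};

// Mirrors the patched condition: an ADD with src1->ne[1] > 1 disables CUDA
// graph capture unless an operand carries a Gemma3n per-layer-projection name.
static bool add_disables_cuda_graph(const node_view *src0, const node_view *src1) {
    if (!src1 || src1->ne1 <= 1) {
        return false;       // single-row addition: never a reason to disable
    }
    const bool src0_is_gemma3n = src0 && src0->name == "inp_per_layer_selected";
    const bool src1_is_gemma3n = src1->name == "per_layer_proj";
    return !src0_is_gemma3n && !src1_is_gemma3n;
}

int main() {
    node_view sel  = {"inp_per_layer_selected", 256};
    node_view proj = {"per_layer_proj", 256};
    node_view misc = {"some_other_add_operand", 256};

    std::cout << add_disables_cuda_graph(&sel,  &proj) << "\n";  // 0: excluded, capture stays on
    std::cout << add_disables_cuda_graph(&misc, &misc) << "\n";  // 1: heuristic still disables capture
}

Because the two name tests in the patched condition are joined with `&&`, a match on either operand is enough to keep graph capture enabled; in Gemma3n's graph both names match at once, so the distinction does not matter in practice.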

ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 9 additions & 3 deletions
@@ -2474,6 +2474,9 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
     // Loop over nodes in GGML graph to obtain info needed for CUDA graph
     cuda_ctx->cuda_graph->cpy_dest_ptrs.clear();
 
+    const std::string gemma3n_per_layer_proj_src0_name = "inp_per_layer_selected";
+    const std::string gemma3n_per_layer_proj_src1_name = "per_layer_proj";
+
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
 
@@ -2495,9 +2498,12 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
 #endif
         }
 
-        if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1) {
-            // disable CUDA graphs for batch size > 1 for now.
-            // Changes in batch size or context size can cause changes to the grid size of some kernels.
+        if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1 && (node->src[0] ? node->src[0]->name != gemma3n_per_layer_proj_src0_name : true) && (node->src[1] ? node->src[1]->name != gemma3n_per_layer_proj_src1_name : true)) {
+            // disable CUDA graphs for batch size > 1 for now, excluding the matrix-matrix addition of Gemma3n's `project_per_layer_input` operation,
+            // which is matched by node name. See
+            // https://github.com/ggml-org/llama.cpp/blob/f9a31eea06a859e34cecb88b4d020c7f03d86cc4/src/llama-model.cpp#L10199-L10241 and
+            // https://github.com/huggingface/transformers/blob/bda75b4011239d065de84aa3e744b67ebfa7b245/src/transformers/models/gemma3n/modeling_gemma3n.py#L1773.
+            // Generally, changes in batch size or context size can cause changes to the grid size of some kernels.
             use_cuda_graph = false;
 #ifndef NDEBUG
             GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
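
For context on why matching by name is dependable here: every ggml tensor carries a name assigned when the graph is built, and llama.cpp's Gemma3n builder assigns exactly the two strings the backend compares against (see the llama-model.cpp link in the comment above). The toy program below illustrates that naming pattern using ggml's public `ggml_set_name`, `ggml_new_tensor_2d`, and `ggml_add`; the shapes, memory sizing, and the graph itself are illustrative assumptions, not Gemma3n's actual builder code.

// Toy example of tagging graph nodes by name so a backend can recognize
// them later. The API calls are real ggml; the graph is illustrative.
#include "ggml.h"

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16 * 1024 * 1024,   // arbitrary scratch size for this toy
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,               // only the graph structure is needed here
    };
    struct ggml_context * ctx = ggml_init(params);

    // Hypothetical shapes; Gemma3n's real dimensions differ.
    struct ggml_tensor * sel  = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 256);
    struct ggml_tensor * proj = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 256);

    // The two names the patched CUDA backend matches against:
    ggml_set_name(sel,  "inp_per_layer_selected");
    ggml_set_name(proj, "per_layer_proj");

    // The multi-row ADD (ne[1] > 1) that would otherwise trip the
    // batch-size heuristic and disable CUDA graph capture:
    struct ggml_tensor * out = ggml_add(ctx, sel, proj);
    (void) out;

    ggml_free(ctx);
    return 0;
}

Matching on names rather than graph topology keeps the exclusion narrowly scoped: any other model whose ADD nodes lack these names still hits the conservative batch-size check unchanged.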
