From 61cff1bd42182f7183729771358e644dcccd8257 Mon Sep 17 00:00:00 2001
From: Arsh Maknojia
Date: Wed, 29 Oct 2025 06:31:23 -0700
Subject: [PATCH] Enable CUDA graphs for embed gemma 300m

---
 ggml/src/ggml-cuda/ggml-cuda.cu | 6 +++++-
 src/llama-model.cpp             | 4 ++--
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index fcff5d7cdc1f5..f05c973c92f5e 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2806,6 +2806,8 @@ static bool check_node_graph_compatibility(ggml_cgraph * cgraph,
     const std::string ffn_moe_down_bias_prefix = "ffn_moe_down_biased";
     const std::string nemotron_h_block_out_prefix = "nemotron_h_block_out";
     const std::string mamba2_y_add_d_prefix = "mamba2_y_add_d";
+    const std::string emGemma_sa_out_prefix = "emGemma_sa_out";
+    const std::string emGemma_l_out_prefix = "emGemma_l_out";
 
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
@@ -2836,7 +2838,9 @@ static bool check_node_graph_compatibility(ggml_cgraph * cgraph,
             strncmp(node->name, ffn_moe_up_bias_prefix.c_str(), ffn_moe_up_bias_prefix.size()) != 0 &&
             strncmp(node->name, ffn_moe_down_bias_prefix.c_str(), ffn_moe_down_bias_prefix.size()) != 0 &&
             strncmp(node->name, nemotron_h_block_out_prefix.c_str(), nemotron_h_block_out_prefix.size()) != 0 &&
-            strncmp(node->name, mamba2_y_add_d_prefix.c_str(), mamba2_y_add_d_prefix.size()) != 0) {
+            strncmp(node->name, mamba2_y_add_d_prefix.c_str(), mamba2_y_add_d_prefix.size()) != 0 &&
+            strncmp(node->name, emGemma_sa_out_prefix.c_str(), emGemma_sa_out_prefix.size()) != 0 &&
+            strncmp(node->name, emGemma_l_out_prefix.c_str(), emGemma_l_out_prefix.size()) != 0) {
             // disable CUDA graphs for batch size > 1 for now while excluding the matrix-matrix addition as part of Gemma3n's `project_per_layer_input` operation
             // by means of matching node names. See
             // https://github.com/ggml-org/llama.cpp/blob/f9a31eea06a859e34cecb88b4d020c7f03d86cc4/src/llama-model.cpp#L10199-L10241 and
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index ea6f59ed482bb..cda5e45e8aea3 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -11573,7 +11573,7 @@ struct llm_build_gemma_embedding : public llm_graph_context {
         cb(cur, "attn_post_norm", il);
 
         ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
-        cb(sa_out, "sa_out", il);
+        cb(sa_out, "emGemma_sa_out", il);
 
         cur = build_norm(sa_out,
                 model.layers[il].ffn_norm, NULL,
@@ -11599,7 +11599,7 @@ struct llm_build_gemma_embedding : public llm_graph_context {
         cur = ggml_add(ctx0, cur, sa_out);
 
         cur = build_cvec(cur, il);
-        cb(cur, "l_out", il);
+        cb(cur, "emGemma_l_out", il);
 
         // input for next layer
         inpL = cur;
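
For context on the mechanism this patch extends: check_node_graph_compatibility() disables CUDA graphs for batch size > 1 when it encounters certain GGML_OP_ADD nodes, unless the node name starts with one of a list of whitelisted prefixes. Renaming the Gemma embedding graph's generic "sa_out"/"l_out" callback names to the distinctive "emGemma_*" prefixes makes them safe to whitelist without touching other models that emit the generic names. Below is a minimal, self-contained sketch of that prefix-whitelist check; it is an illustration, not the ggml code, and the name_has_whitelisted_prefix helper plus the sample node names are invented for the example. Node names set via cb() typically carry a layer-index suffix (e.g. "emGemma_sa_out-12"), which is presumably why the check uses strncmp prefix matching rather than exact comparison.

#include <cstdio>
#include <cstring>
#include <string>
#include <vector>

// Illustrative stand-in for the prefix checks inside
// check_node_graph_compatibility(): a GGML_OP_ADD node whose name begins
// with one of these prefixes does not force CUDA graphs off for batch > 1.
// The helper name is invented for this sketch; the prefix strings are the
// ones visible in the patch hunks above.
static bool name_has_whitelisted_prefix(const char * node_name) {
    static const std::vector<std::string> prefixes = {
        "ffn_moe_down_biased",
        "nemotron_h_block_out",
        "mamba2_y_add_d",
        "emGemma_sa_out",  // added by this patch
        "emGemma_l_out",   // added by this patch
    };
    for (const std::string & p : prefixes) {
        // Same comparison style as the patch: strncmp over the prefix length,
        // so a trailing layer suffix on the node name still matches.
        if (strncmp(node_name, p.c_str(), p.size()) == 0) {
            return true;
        }
    }
    return false;
}

int main() {
    // The "-12" suffix mimics a layer index appended to the callback name.
    printf("%d\n", name_has_whitelisted_prefix("emGemma_sa_out-12")); // 1: whitelisted
    printf("%d\n", name_has_whitelisted_prefix("sa_out-12"));         // 0: generic name, not whitelisted
    return 0;
}

Matching on a distinctive per-architecture prefix keeps the whitelist narrow: many other model graphs also name nodes "sa_out" and "l_out", and whitelisting those generic names would re-enable CUDA graphs for additions the check is meant to reject.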