Skip to content

Commit 6ee86e5

Browse files
committed
graph : restore ubatch in build_cb
ggml-ci
1 parent f63aeec commit 6ee86e5

File tree

4 files changed

+6
-5
lines changed

4 files changed

+6
-5
lines changed

src/llama-context.cpp

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,7 @@ bool llama_context::apply_adapter_cvec(
196196
void llama_context::build_cb(
197197
ggml_tensor * cur,
198198
const char * name,
199+
const llama_ubatch & ubatch,
199200
int il) {
200201
if (il >= 0) {
201202
ggml_format_name(cur, "%s-%d", name, il);
@@ -213,10 +214,7 @@ void llama_context::build_cb(
213214
// norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
214215
// FIXME: fix in ggml_backend_sched
215216
const bool full_offload = model.params.n_gpu_layers > (int) model.hparams.n_layer;
216-
// TODO: during #11213, the requirement for ubatch.n_tokens < 32 was removed to simplify
217-
// not sure if this is still needed, but it can be brought back if needed
218-
//if (ubatch.n_tokens < 32 || full_offload) {
219-
if (full_offload) {
217+
if (ubatch.n_tokens < 32 || full_offload) {
220218
if (il != -1 && strcmp(name, "norm") == 0) {
221219
const auto & dev_layer = model.dev_layer(il);
222220
for (auto & backend : backends) {

src/llama-context.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,7 @@ struct llama_context : public llama_graph_i {
8585
virtual void build_cb(
8686
ggml_tensor * cur,
8787
const char * name,
88+
const llama_ubatch & ubatch,
8889
int il);
8990

9091
// TODO: add encode/decode graphs

src/llama-graph.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ class llama_graph_i {
1414
virtual void build_cb(
1515
ggml_tensor * cur,
1616
const char * name,
17+
const llama_ubatch & ubatch,
1718
int il) = 0;
1819

1920
// apply control vector for layer il

src/llama-model.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,7 @@ static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hpara
248248
return cur_buft;
249249
}
250250
}
251+
251252
return nullptr;
252253
}
253254

@@ -3888,7 +3889,7 @@ struct llm_build_context {
38883889

38893890
// TODO: tmp
38903891
void cb(struct ggml_tensor * cur, const char * name, int il) {
3891-
lgf.build_cb(cur, name, il);
3892+
lgf.build_cb(cur, name, ubatch, il);
38923893
}
38933894

38943895
// TODO: tmp

0 commit comments

Comments (0)