@@ -20,7 +20,7 @@ llama_context::llama_context(
         const llama_model & model,
               llama_context_params params) :
     model(model),
-    batch_allocr(std::make_unique<llama_batch_allocr>()) {
+    balloc(std::make_unique<llama_batch_allocr>()) {
     LLAMA_LOG_INFO("%s: constructing llama_context\n", __func__);

     t_start_us = model.t_start_us;
@@ -734,14 +734,14 @@ int llama_context::encode(const llama_batch & batch_inp) {
     const int64_t n_embd = hparams.n_embd;

     // note: during encode, we always pass the full sequence starting from pos = 0
-    if (!batch_allocr->init(batch_inp, model.vocab, nullptr, n_embd, true)) {
+    if (!balloc->init(batch_inp, model.vocab, nullptr, n_embd, true)) {
         LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
         return -1;
     }

-    const uint32_t n_tokens = batch_allocr->get_n_tokens();
+    const uint32_t n_tokens = balloc->get_n_tokens();

-    const llama_ubatch ubatch = batch_allocr->split_simple(n_tokens);
+    const llama_ubatch ubatch = balloc->split_simple(n_tokens);

     // micro-batching is not possible for non-causal encoding, so we process the batch in a single shot
     GGML_ASSERT(cparams.n_ubatch >= n_tokens && "encoder requires n_ubatch >= n_tokens");
@@ -859,7 +859,7 @@ int llama_context::encode(const llama_batch & batch_inp) {
         cross.v_embd.resize(cross.n_embd*cross.n_enc);
         memcpy(cross.v_embd.data(), embd, ggml_nbytes(t_embd));

-        const auto & batch = batch_allocr->get_batch();
+        const auto & batch = balloc->get_batch();

         // remember the sequence ids used during the encoding - needed for cross attention later
         cross.seq_ids_enc.resize(n_tokens);
@@ -897,13 +897,13 @@ int llama_context::decode(const llama_batch & batch_inp) {
     // when computing embeddings, all tokens are output
     const bool output_all = cparams.embeddings;

-    if (!batch_allocr->init(batch_inp, vocab, memory.get(), n_embd, output_all)) {
+    if (!balloc->init(batch_inp, vocab, memory.get(), n_embd, output_all)) {
         LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
         return -1;
     }

-    const uint32_t n_tokens_all  = batch_allocr->get_n_tokens();
-    const uint32_t n_outputs_all = batch_allocr->get_n_outputs();
+    const uint32_t n_tokens_all  = balloc->get_n_tokens();
+    const uint32_t n_outputs_all = balloc->get_n_outputs();

     if (output_all) {
         // require that all tokens are output
@@ -934,7 +934,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
     llama_memory_state_ptr mstate;

     while (true) {
-        mstate = memory->init_batch(batch_allocr.get(), cparams.n_ubatch, output_all);
+        mstate = memory->init_batch(*balloc, cparams.n_ubatch, output_all);
         if (!mstate) {
             return -2;
         }
@@ -955,19 +955,19 @@ int llama_context::decode(const llama_batch & batch_inp) {
                         did_optimize = true;

                         if (kv_self_update(true)) {
-                            LLAMA_LOG_DEBUG("%s: retrying batch size %d after cache optimization\n", __func__, batch_allocr->get_n_tokens());
+                            LLAMA_LOG_DEBUG("%s: retrying batch size %d after cache optimization\n", __func__, balloc->get_n_tokens());

                             continue;
                         }
                     }

-                    LLAMA_LOG_WARN("%s: failed to find a memory slot for batch of size %d\n", __func__, batch_allocr->get_n_tokens());
+                    LLAMA_LOG_WARN("%s: failed to find a memory slot for batch of size %d\n", __func__, balloc->get_n_tokens());

                     return 1;
                 }
             case LLAMA_MEMORY_STATUS_FAILED_COMPUTE:
                 {
-                    LLAMA_LOG_ERROR("%s: compute failed while preparing batch of size %d\n", __func__, batch_allocr->get_n_tokens());
+                    LLAMA_LOG_ERROR("%s: compute failed while preparing batch of size %d\n", __func__, balloc->get_n_tokens());

                     return -2;
                 }
@@ -1133,7 +1133,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
     if (n_outputs > 0) {
         bool sorted_output = true;

-        auto & out_ids = batch_allocr->get_out_ids();
+        auto & out_ids = balloc->get_out_ids();

         GGML_ASSERT(out_ids.size() == (size_t) n_outputs);

@@ -1306,8 +1306,8 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u

     this->n_outputs = n_outputs;

-    llama_batch_allocr batch_allocr;
-    llama_ubatch ubatch = batch_allocr.ubatch_reserve(n_tokens/n_seqs, n_seqs);
+    llama_batch_allocr balloc;
+    llama_ubatch ubatch = balloc.ubatch_reserve(n_tokens/n_seqs, n_seqs);

     auto * gf = graph_init();
     auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT, mstate);
@@ -2027,12 +2027,12 @@ void llama_context::opt_epoch_iter(
             batch.logits[pos_batch] = true;
         }

-        if (!batch_allocr->init(batch, model.vocab, nullptr, model.hparams.n_embd, true)) {
+        if (!balloc->init(batch, model.vocab, nullptr, model.hparams.n_embd, true)) {
             LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
             return;
         }

-        const uint32_t n_tokens_all = batch_allocr->get_n_tokens();
+        const uint32_t n_tokens_all = balloc->get_n_tokens();

         n_queued_tokens += n_tokens_all;

@@ -2041,7 +2041,7 @@ void llama_context::opt_epoch_iter(
         uint32_t n_outputs_all = n_tokens_all;

         // TODO: fix
-        auto mstate = memory->init_batch(batch_allocr.get(), cparams.n_ubatch, true);
+        auto mstate = memory->init_batch(*balloc, cparams.n_ubatch, true);
         if (!mstate || mstate->get_status() != LLAMA_MEMORY_STATUS_SUCCESS) {
             LLAMA_LOG_ERROR("%s: could not initialize batch\n", __func__);
             break;