Commit 47086fa: "apply to the rest"

1 parent 4aabf4e, commit 47086fa

18 files changed: +243 -324 lines changed

This commit continues the migration from the old flat llama_batch struct to the new opaque llama_batch_ext API, applying it to the remaining common helpers and examples (llama_batch_ext_init, llama_batch_ext_add_text, llama_decode_ext, and the llama_batch_ext_ptr RAII wrapper from llama-cpp.h).

common/common.cpp (0 additions, 37 deletions)

@@ -582,43 +582,6 @@ std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens)
     return buf.str();
 }
 
-/*
-std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch) {
-    std::stringstream buf;
-
-    buf << "[ ";
-
-    bool first = true;
-    for (int i = 0; i < batch.n_tokens; ++i) {
-        if (!first) {
-            buf << ", ";
-        } else {
-            first = false;
-        }
-
-        auto detokenized = common_token_to_piece(ctx, batch.token[i]);
-
-        detokenized.erase(
-            std::remove_if(
-                detokenized.begin(),
-                detokenized.end(),
-                [](const unsigned char c) { return !std::isprint(c); }),
-            detokenized.end());
-
-        buf << "\n" << std::to_string(i)
-            << ", token '" << detokenized << "'"
-            << ", pos " << std::to_string(batch.pos[i])
-            << ", n_seq_id " << std::to_string(batch.n_seq_id[i])
-            << ", seq_id " << std::to_string(batch.seq_id[i][0])
-            << ", logits " << std::to_string(batch.logits[i]);
-    }
-
-    buf << " ]";
-
-    return buf.str();
-}
-*/
-
 void string_process_escapes(std::string & input) {
     std::size_t input_len = input.length();
     std::size_t output_idx = 0;
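The deleted helper was already commented out, and llama_batch_ext is opaque, so there are no pos/seq_id arrays left to print. If an equivalent debug dump is ever wanted, a minimal sketch could iterate the shadow token list that common_batch keeps; the helper name below is hypothetical and not part of this commit:

    #include <sstream>

    // Hypothetical replacement (not in this commit): dumps the CPU-side
    // shadow copy kept by common_batch. Positions and seq_ids are no
    // longer visible through the opaque llama_batch_ext.
    static std::string common_batch_str(const struct llama_context * ctx, const common_batch & batch) {
        std::stringstream buf;
        buf << "[ ";
        for (size_t i = 0; i < batch.tokens.size(); ++i) {
            const auto piece = common_token_to_piece(ctx, batch.tokens[i].token);
            buf << "\n" << i << ", token '" << piece << "'"
                << ", logits " << (batch.tokens[i].logits ? 1 : 0);
        }
        buf << " ]";
        return buf.str();
    }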

common/common.h (15 additions, 3 deletions)

@@ -516,7 +516,6 @@ void string_process_escapes(std::string & input);
 std::string string_from(bool value);
 std::string string_from(const std::vector<int> & values);
 std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens);
-std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch);
 
 //
 // Filesystem utils
@@ -587,10 +586,10 @@ struct common_batch {
     llama_batch_ext_ptr batch;
     struct batch_token {
         llama_token token;
-        llama_seq_id seq_id;
         bool logits;
     };
     std::vector<batch_token> tokens;
+    int n_outputs = 0;
     common_batch() = default;
     common_batch(int32_t n_tokens, int32_t n_seq_max) {
         batch.reset(llama_batch_ext_init(n_tokens, n_seq_max));
@@ -602,7 +601,17 @@ struct common_batch {
     }
     void add_text(llama_token token, llama_pos pos, llama_seq_id seq_id, bool logits) {
         llama_batch_ext_add_text(batch.get(), token, pos, &seq_id, 1, logits);
-        tokens.push_back({token, seq_id, logits});
+        tokens.push_back({token, logits});
+        if (logits) {
+            n_outputs++;
+        }
+    }
+    void add_text(llama_token token, llama_pos pos, std::vector<llama_seq_id> seq_ids, bool logits) {
+        llama_batch_ext_add_text(batch.get(), token, pos, seq_ids.data(), seq_ids.size(), logits);
+        tokens.push_back({token, logits});
+        if (logits) {
+            n_outputs++;
+        }
     }
     void set_logits_last() {
         if (!tokens.empty()) {
@@ -622,6 +631,9 @@ struct common_batch {
         view.tokens.reserve(n_tokens);
         for (int32_t i = 0; i < n_tokens; i++) {
             view.tokens.push_back(tokens[offset + i]);
+            if (tokens[offset + i].logits) {
+                view.n_outputs++;
+            }
         }
         return view;
     }
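A note on the common_batch changes: the per-token seq_id is dropped from the shadow batch_token (sequence routing now lives only inside the opaque llama_batch_ext), while an n_outputs counter and a multi-sequence add_text overload are added. A minimal usage sketch, assuming only the header above (the token ids and sizes are placeholders):

    // Sketch: one batch, two sequences; only the second token requests logits.
    common_batch batch(/*n_tokens=*/512, /*n_seq_max=*/2);

    llama_token tok_a = 1, tok_b = 2; // placeholder token ids
    batch.add_text(tok_a, /*pos=*/0, /*seq_id=*/0, /*logits=*/false);
    batch.add_text(tok_b, /*pos=*/1, std::vector<llama_seq_id>{0, 1}, /*logits=*/true);

    // batch.n_outputs == 1: callers can size their logits buffers without
    // inspecting the batch, and the view-slicing helper in the last hunk
    // carries the same bookkeeping into sub-views.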

examples/llava/gemma3-cli.cpp (14 additions, 44 deletions)

@@ -5,6 +5,7 @@
 #include "clip.h"
 #include "stb_image.h"
 #include "llama.h"
+#include "llama-cpp.h"
 #include "ggml.h"
 #include "console.h"
@@ -63,7 +64,7 @@ struct gemma3_context {
     llama_model * model;
     llama_context * lctx;
     const llama_vocab * vocab;
-    llama_batch batch;
+    llama_batch_ext_ptr batch;
 
     int n_threads = 1;
     llama_pos n_past = 0;
@@ -73,7 +74,7 @@
         lctx = llama_init.context.get();
         vocab = llama_model_get_vocab(model);
         n_threads = params.cpuparams.n_threads;
-        batch = llama_batch_init(params.n_batch, 0, 1);
+        batch.reset(llama_batch_ext_init(params.n_batch, 1));
         init_clip_model(params);
     }
 
@@ -87,50 +88,18 @@
     }
 };
 
-struct decode_embd_batch {
-    std::vector<llama_pos> pos;
-    std::vector<int32_t> n_seq_id;
-    std::vector<llama_seq_id> seq_id_0;
-    std::vector<llama_seq_id *> seq_ids;
-    std::vector<int8_t> logits;
-    llama_batch batch;
-    decode_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
-        pos     .resize(n_tokens);
-        n_seq_id.resize(n_tokens);
-        seq_ids .resize(n_tokens + 1);
-        logits  .resize(n_tokens);
-        seq_id_0.resize(1);
-        seq_id_0[0] = seq_id;
-        seq_ids [n_tokens] = nullptr;
-        batch = {
-            /*n_tokens =*/ n_tokens,
-            /*tokens   =*/ nullptr,
-            /*embd     =*/ embd,
-            /*pos      =*/ pos.data(),
-            /*n_seq_id =*/ n_seq_id.data(),
-            /*seq_id   =*/ seq_ids.data(),
-            /*logits   =*/ logits.data(),
-        };
-        for (int i = 0; i < n_tokens; i++) {
-            batch.pos     [i] = pos_0 + i;
-            batch.n_seq_id[i] = 1;
-            batch.seq_id  [i] = seq_id_0.data();
-            batch.logits  [i] = false;
-        }
-    }
-};
-
 static int eval_text(gemma3_context & ctx, std::string input, bool logits_last = false) {
     llama_tokens tokens = common_tokenize(ctx.lctx, input, false, true);
-    common_batch_clear(ctx.batch);
+    llama_batch_ext_clear(ctx.batch.get());
     for (llama_token & t : tokens) {
-        common_batch_add(ctx.batch, t, ctx.n_past++, {0}, false);
+        llama_seq_id seq_id = 0;
+        llama_batch_ext_add_text(ctx.batch.get(), t, 0, &seq_id, 1, false);
     }
     if (logits_last) {
-        ctx.batch.logits[ctx.batch.n_tokens - 1] = true;
+        llama_batch_ext_set_output_last(ctx.batch.get());
     }
     // LOG("eval_text (n_tokens = %d): %s\n", (int)tokens.size(), input.c_str());
-    if (llama_decode(ctx.lctx, ctx.batch)) {
+    if (llama_decode_ext(ctx.lctx, ctx.batch.get())) {
         LOG_ERR("Failed to decode text\n");
         return 1;
     }
@@ -179,8 +148,8 @@ static int eval_image(gemma3_context & ctx, std::string & fname) {
     int64_t t1 = ggml_time_ms();
     eval_text(ctx, "<start_of_image>");
     llama_set_causal_attn(ctx.lctx, false);
-    decode_embd_batch batch_img(image_embd_v.data(), n_tokens, ctx.n_past, 0);
-    if (llama_decode(ctx.lctx, batch_img.batch)) {
+    llama_batch_ext_ptr batch_img(llama_batch_ext_init_from_embd(image_embd_v.data(), n_tokens, ctx.n_past, 0));
+    if (llama_decode_ext(ctx.lctx, batch_img.get())) {
         LOG_ERR("failed to decode image\n");
         return 1;
     }
@@ -210,9 +179,10 @@ static int generate_response(gemma3_context & ctx, common_sampler * smpl, int n_
     fflush(stdout);
 
     // eval the token
-    common_batch_clear(ctx.batch);
-    common_batch_add(ctx.batch, token_id, ctx.n_past++, {0}, true);
-    if (llama_decode(ctx.lctx, ctx.batch)) {
+    llama_batch_ext_clear(ctx.batch.get());
+    llama_seq_id seq_id = 0;
+    llama_batch_ext_add_text(ctx.batch.get(), token_id, ctx.n_past++, &seq_id, 1, true);
+    if (llama_decode_ext(ctx.lctx, ctx.batch.get())) {
         LOG_ERR("failed to decode token\n");
         return 1;
     }
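Two patterns recur here and in the rest of the commit: the batch is now held in llama_batch_ext_ptr (the smart-pointer wrapper from llama-cpp.h), so no explicit free is needed, and every common_batch_add call becomes llama_batch_ext_add_text with a pointer-plus-count pair of sequence ids. A condensed sketch of the single-token decode step under those APIs (the function name and parameters are illustrative, not from this commit):

    // Sketch: decode one sampled token with the ext batch API.
    // The batch is owned by llama_batch_ext_ptr and freed on scope exit,
    // so no llama_batch_ext_free() call is required.
    static int eval_one_token(llama_context * lctx, llama_token token_id, llama_pos pos) {
        llama_batch_ext_ptr batch(llama_batch_ext_init(/*n_tokens=*/1, /*n_seq_max=*/1));
        llama_seq_id seq_id = 0;
        llama_batch_ext_add_text(batch.get(), token_id, pos, &seq_id, 1, /*logits=*/true);
        if (llama_decode_ext(lctx, batch.get())) {
            return 1; // decode failed
        }
        return 0;
    }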

examples/llava/llava.cpp (3 additions, 35 deletions)

@@ -2,6 +2,7 @@
 #include "llava.h"
 
 #include "llama.h"
+#include "llama-cpp.h"
 
 #include <algorithm>
 #include <cerrno>
@@ -438,39 +439,6 @@ bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, co
     return true;
 }
 
-struct llava_embd_batch {
-    std::vector<llama_pos> pos;
-    std::vector<int32_t> n_seq_id;
-    std::vector<llama_seq_id> seq_id_0;
-    std::vector<llama_seq_id *> seq_ids;
-    std::vector<int8_t> logits;
-    llama_batch batch;
-    llava_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
-        pos     .resize(n_tokens);
-        n_seq_id.resize(n_tokens);
-        seq_ids .resize(n_tokens + 1);
-        logits  .resize(n_tokens);
-        seq_id_0.resize(1);
-        seq_id_0[0] = seq_id;
-        seq_ids [n_tokens] = nullptr;
-        batch = {
-            /*n_tokens =*/ n_tokens,
-            /*tokens   =*/ nullptr,
-            /*embd     =*/ embd,
-            /*pos      =*/ pos.data(),
-            /*n_seq_id =*/ n_seq_id.data(),
-            /*seq_id   =*/ seq_ids.data(),
-            /*logits   =*/ logits.data(),
-        };
-        for (int i = 0; i < n_tokens; i++) {
-            batch.pos     [i] = pos_0 + i;
-            batch.n_seq_id[i] = 1;
-            batch.seq_id  [i] = seq_id_0.data();
-            batch.logits  [i] = false;
-        }
-    }
-};
-
 bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed, int n_batch, int * n_past) {
     int n_embd = llama_model_n_embd(llama_get_model(ctx_llama));
 
@@ -480,8 +448,8 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
             n_eval = n_batch;
         }
         float * embd = image_embed->embed+i*n_embd;
-        llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, *n_past, 0);
-        if (llama_decode(ctx_llama, llava_batch.batch)) {
+        llama_batch_ext_ptr batch(llama_batch_ext_init_from_embd(embd, n_eval, 0, 0));
+        if (llama_decode_ext(ctx_llama, batch.get())) {
            LOG_ERR("%s : failed to eval\n", __func__);
            return false;
        }
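Both hand-rolled embedding-batch structs removed in this commit (decode_embd_batch in gemma3-cli and llava_embd_batch here) existed only to wrap a raw float pointer in a llama_batch; llama_batch_ext_init_from_embd collapses that boilerplate into one call. A sketch of the chunked decode loop under the signature used in this diff (the wrapper function is illustrative; the position argument follows the new call site above, which passes 0):

    #include <algorithm> // std::min

    // Sketch: feed image embeddings to the model in n_batch-sized chunks,
    // one llama_batch_ext per chunk, each freed automatically by the
    // smart pointer when it goes out of scope.
    static bool eval_image_embd_chunks(llama_context * ctx_llama, const llava_image_embed * image_embed, int n_batch) {
        const int n_embd = llama_model_n_embd(llama_get_model(ctx_llama));
        for (int i = 0; i < image_embed->n_image_pos; i += n_batch) {
            const int n_eval = std::min(n_batch, image_embed->n_image_pos - i);
            float * embd = image_embed->embed + i * n_embd;
            llama_batch_ext_ptr batch(llama_batch_ext_init_from_embd(embd, n_eval, /*pos_0=*/0, /*seq_id=*/0));
            if (llama_decode_ext(ctx_llama, batch.get())) {
                return false; // decode failed
            }
        }
        return true;
    }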

examples/llava/qwen2vl-cli.cpp (1 addition, 0 deletions)

@@ -66,6 +66,7 @@ static bool qwen2vl_eval_image_embed(llama_context * ctx_llama, const struct lla
         memcpy(&batch_mrope_pos[n_eval * 2], &mrope_pos[img_tokens * 2 + processed], n_eval * sizeof(llama_pos));
         memcpy(&batch_mrope_pos[n_eval * 3], &mrope_pos[img_tokens * 3 + processed], n_eval * sizeof(llama_pos));
 
+        // TODO: move this to llama_batch_ext API
         llama_batch batch = {
             int32_t(n_eval), // n_tokens
             nullptr, // token
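This is the one call site the commit leaves on the raw llama_batch struct, hence the TODO: Qwen2-VL's M-RoPE needs several position values per token, while llama_batch_ext_add_text, as used everywhere else in this commit, takes a single llama_pos. A rough gloss of the buffer layout implied by the memcpys above (the per-section reading is my interpretation, not stated in this diff):

    // Rough gloss: the raw llama_batch points its pos field at four
    // contiguous sections of n_eval positions, one per M-RoPE dimension.
    std::vector<llama_pos> batch_mrope_pos(n_eval * 4);
    // [0,        n_eval)   : section 0 (copied above the hunk, not shown)
    // [n_eval,   n_eval*2) : section 1 (likewise copied just above)
    // [n_eval*2, n_eval*3) : section 2 (first memcpy shown)
    // [n_eval*3, n_eval*4) : section 3 (second memcpy shown)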

examples/lookahead/lookahead.cpp (12 additions, 9 deletions)

@@ -115,7 +115,7 @@ int main(int argc, char ** argv) {
     // seq_id == 0           : the current input token
     // seq_id [1, W]         : tokens from the past N - 1 Jacobi iterations
     // seq_id [W + 1, W + G] : verification n-grams
-    llama_batch batch = llama_batch_init(params.n_ctx, 0, W + G + 1);
+    llama_batch_ext * batch = llama_batch_ext_init(params.n_ctx, W + G + 1);
 
     // target model sampling context
     struct common_sampler * smpl = common_sampler_init(model, params.sampling);
@@ -204,10 +204,10 @@
         //  V  V  V  V  V  V
         // id
         {
-            common_batch_clear(batch);
+            llama_batch_ext_clear(batch);
 
             // current token - first token of the first level
-            common_batch_add(batch, id, n_past, seq_id_all, true);
+            llama_batch_ext_add_text(batch, id, n_past, seq_id_all.data(), seq_id_all.size(), true);
 
             // verification n-grams - queue this before the lookahead tokens for less KV cache fragmentation
             {
@@ -230,9 +230,10 @@
                         const llama_token t = ngrams_observed.tokens[idx + j];
 
                         ngrams_cur[g].tokens [j + 1] = t;
-                        ngrams_cur[g].i_batch[j + 1] = batch.n_tokens;
+                        ngrams_cur[g].i_batch[j + 1] = llama_batch_ext_get_n_tokens(batch);
 
-                        common_batch_add(batch, t, n_past + j + 1, { W + 1 + g }, true);
+                        llama_seq_id seq_id = W + 1 + g;
+                        llama_batch_ext_add_text(batch, t, n_past + j + 1, &seq_id, 1, true);
                     }
                 }
             }
@@ -244,18 +245,20 @@
                    seq_id_look[j] = i + j + 1;
                }
 
-                common_batch_add(batch, tokens_j[0][i], n_past + i, seq_id_look, false);
+                llama_batch_ext_add_text(batch, tokens_j[0][i], n_past + i,
+                        seq_id_look.data(), seq_id_look.size(), false);
            }
 
            // fill the rest of the levels
            for (int j = 1; j < N - 1; j++) {
                for (int i = 0; i < W; i++) {
-                    common_batch_add(batch, tokens_j[j][i], n_past + j + i, { i + 1 }, j == N - 2);
+                    llama_seq_id seq_id = i + 1;
+                    llama_batch_ext_add_text(batch, tokens_j[j][i], n_past + j + i, &seq_id, 1, j == N - 2);
                }
            }
        }
 
-        if (llama_decode(ctx, batch) != 0) {
+        if (llama_decode_ext(ctx, batch) != 0) {
            LOG_ERR("\n\n%s: llama_decode failed - increase KV cache size\n", __func__);
            return 1;
        }
@@ -475,7 +478,7 @@
 
     llama_kv_cache_view_free(&kvc_view);
 
-    llama_batch_free(batch);
+    llama_batch_ext_free(batch);
 
     llama_backend_free();
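The mechanical rule applied across these hunks: common_batch_add accepted a std::vector<llama_seq_id> (often a braced temporary like { W + 1 + g }), while llama_batch_ext_add_text takes a pointer plus a count, so temporaries become named locals and existing vectors are passed as data()/size(). A side-by-side sketch using the variables from the hunks above:

    // Single sequence: the braced temporary becomes an addressable local.
    llama_seq_id seq_id = W + 1 + g;
    llama_batch_ext_add_text(batch, t, n_past + j + 1, &seq_id, 1, /*logits=*/true);

    // Multiple sequences: the vector is passed as data() + size().
    llama_batch_ext_add_text(batch, tokens_j[0][i], n_past + i,
            seq_id_look.data(), seq_id_look.size(), /*logits=*/false);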
