@@ -740,7 +740,7 @@ int llama_context::encode(const llama_batch & batch_inp) {
     const int64_t n_embd = hparams.n_embd;

     // note: during encode, we always pass the full sequence starting from pos = 0
-    if (!balloc->init(batch_inp, model.vocab, nullptr, n_embd, true)) {
+    if (!balloc->init(batch_inp, model.vocab, nullptr, n_embd, cparams.n_seq_max, true)) {
         LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
         return -1;
     }
@@ -907,7 +907,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
     // when computing embeddings, all tokens are output
     const bool output_all = cparams.embeddings;

-    if (!balloc->init(batch_inp, vocab, memory.get(), n_embd, output_all)) {
+    if (!balloc->init(batch_inp, vocab, memory.get(), n_embd, cparams.n_seq_max, output_all)) {
         LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
         return -1;
     }
@@ -2036,7 +2036,7 @@ void llama_context::opt_epoch_iter(
         batch.logits[pos_batch] = true;
     }

-    if (!balloc->init(batch, model.vocab, nullptr, model.hparams.n_embd, true)) {
+    if (!balloc->init(batch, model.vocab, nullptr, model.hparams.n_embd, cparams.n_seq_max, true)) {
         LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
         return;
     }
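
All three call sites thread cparams.n_seq_max into llama_batch_allocr::init, presumably so the allocator can reject batches whose seq_id values exceed the context's configured sequence limit up front, instead of failing later inside the memory/graph code. Below is a minimal sketch of that kind of bound check, using a hypothetical validate_seq_ids helper and a stub batch type; it is an illustration of the idea, not the actual llama.cpp implementation:

#include <cstdint>
#include <cstdio>

// Stub mirroring the llama_batch fields relevant to sequence validation
// (assumed shape, for illustration only).
struct batch_stub {
    int32_t    n_tokens;
    int32_t  * n_seq_id; // number of sequence ids attached to each token
    int32_t ** seq_id;   // per-token list of sequence ids
};

// Hypothetical helper: the kind of check that passing n_seq_max into
// init() makes possible.
static bool validate_seq_ids(const batch_stub & batch, uint32_t n_seq_max) {
    for (int32_t i = 0; i < batch.n_tokens; ++i) {
        for (int32_t s = 0; s < batch.n_seq_id[i]; ++s) {
            const int32_t seq = batch.seq_id[i][s];
            if (seq < 0 || (uint32_t) seq >= n_seq_max) {
                fprintf(stderr, "%s: invalid seq_id %d (n_seq_max = %u)\n",
                        __func__, (int) seq, (unsigned) n_seq_max);
                return false; // caller then logs "failed to initialize batch" and bails out
            }
        }
    }
    return true;
}

int main() {
    int32_t ids0[] = { 0 };
    int32_t ids1[] = { 5 }; // out of range when n_seq_max == 4

    int32_t   n_seq_id[] = { 1, 1 };
    int32_t * seq_id[]   = { ids0, ids1 };

    const batch_stub batch = { 2, n_seq_id, seq_id };
    return validate_seq_ids(batch, 4) ? 0 : 1; // fails on seq_id 5
}

Validating against n_seq_max at batch-init time keeps the error message tied to the offending input, which is cheaper to diagnose than a failure surfacing deeper in decode.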