Commit b67fe0d

cont : auto-gen positions + verify multi-seq input
ggml-ci
1 parent 42b2ae3 commit b67fe0d

File tree

4 files changed: 84 additions, 36 deletions
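The gist of the change: instead of a single caller-supplied start position `p0`, the batch allocator now derives one start position per sequence from the memory (KV cache). A minimal standalone sketch of that idea, where `seq_pos_max_in_memory` is a hypothetical stand-in for `memory->seq_pos_max(s)` (this is an illustration, not the committed code):

```cpp
#include <cstdint>
#include <vector>

using llama_pos    = int32_t;
using llama_seq_id = int32_t;

constexpr int32_t MAX_SEQ = 64; // mirrors LLAMA_MAX_PARALLEL_SEQUENCES

// hypothetical: highest position stored for sequence s, or -1 if the sequence is empty
llama_pos seq_pos_max_in_memory(llama_seq_id s) {
    (void) s;
    return -1; // pretend the memory is empty
}

// mirrors the new block in llama_batch_allocr::init(): as in the commit,
// pos[i] = p0[seq_id] + i assumes the batch is a single contiguous run
// (e.g. a batch produced by llama_batch_get_one)
std::vector<llama_pos> auto_gen_pos(const std::vector<llama_seq_id> & seq) {
    llama_pos p0[MAX_SEQ];
    for (int32_t s = 0; s < MAX_SEQ; ++s) {
        p0[s] = seq_pos_max_in_memory(s) + 1;
    }

    std::vector<llama_pos> pos(seq.size());
    for (size_t i = 0; i < seq.size(); ++i) {
        pos[i] = p0[seq[i]] + (llama_pos) i;
    }
    return pos;
}

int main() {
    // a 3-token continuation of sequence 0 gets positions 0, 1, 2 here
    const std::vector<llama_seq_id> seq = {0, 0, 0};
    const std::vector<llama_pos> pos = auto_gen_pos(seq);
    return pos[2] == 2 ? 0 : 1;
}
```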

src/llama-batch.cpp

Lines changed: 76 additions & 31 deletions

```diff
@@ -3,6 +3,7 @@
 #include "llama-impl.h"
 #include "llama-cparams.h"
 #include "llama-vocab.h"
+#include "llama-memory.h"
 
 #include <cassert>
 #include <cstring>
@@ -295,7 +296,10 @@ llama_batch_allocr::llama_batch_allocr() {
     }
 }
 
-bool llama_batch_allocr::init(const llama_batch & batch_inp, const llama_vocab & vocab, llama_pos p0) {
+bool llama_batch_allocr::init(
+        const llama_batch & batch_inp,
+        const llama_vocab & vocab,
+        const llama_memory_i * memory) {
     clear();
 
     batch = batch_inp;
@@ -306,14 +310,6 @@ bool llama_batch_allocr::init(const llama_batch & batch_inp, const llama_vocab &
     // validate input batch
     //
 
-    // TODO: remove
-    if (!batch.pos) {
-        if (batch.seq_id) {
-            LLAMA_LOG_ERROR("%s: pos == NULL, but seq_id != NULL\n", __func__);
-            return false;
-        }
-    }
-
     if (batch.token) {
         for (int32_t i = 0; i < batch.n_tokens; ++i) {
             if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= vocab.n_tokens()) {
@@ -338,15 +334,6 @@ bool llama_batch_allocr::init(const llama_batch & batch_inp, const llama_vocab &
     // auto-generate missing fields
     //
 
-    if (!batch.pos) {
-        assert(p0 >= 0);
-        pos.resize(batch.n_tokens);
-        for (int32_t i = 0; i < batch.n_tokens; i++) {
-            pos[i] = p0 + i;
-        }
-        batch.pos = pos.data();
-    }
-
     if (!batch.n_seq_id) {
         n_seq_id.resize(batch.n_tokens);
         for (int32_t i = 0; i < batch.n_tokens; i++) {
@@ -364,6 +351,27 @@ bool llama_batch_allocr::init(const llama_batch & batch_inp, const llama_vocab &
         batch.seq_id = seq_id.data();
     }
 
+    if (!batch.pos) {
+        pos.resize(batch.n_tokens);
+
+        // initialize the starting position for each sequence based on the positions in the memory
+        llama_pos p0[LLAMA_MAX_PARALLEL_SEQUENCES];
+        for (int32_t s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
+            if (!memory) {
+                p0[s] = 0;
+            } else {
+                p0[s] = memory->seq_pos_max(s) + 1;
+            }
+        }
+
+        for (int32_t i = 0; i < batch.n_tokens; i++) {
+            const llama_seq_id seq_id = batch.seq_id[i][0];
+            pos[i] = p0[seq_id] + i;
+        }
+
+        batch.pos = pos.data();
+    }
+
     if (!batch.logits) {
         // by default return the output only for the last token
         output.resize(batch.n_tokens);
@@ -379,24 +387,54 @@ bool llama_batch_allocr::init(const llama_batch & batch_inp, const llama_vocab &
             n_outputs += batch.logits[i] != 0;
         }
 
+    // determine coupled sequences
+    // these are pairs of sequences that have at least one token in the input batch that is assigned to both of them
     for (int32_t i = 0; i < batch.n_tokens; ++i) {
         for (int32_t s = 0; s < batch.n_seq_id[i]; ++s) {
             seq_pos[batch.seq_id[i][s]].insert(batch.pos[i]);
 
             if (s > 0) {
-                seq_cpl[batch.seq_id[i][0]][batch.seq_id[i][s]] = true;
+                const llama_seq_id s0 = batch.seq_id[i][0];
+                const llama_seq_id s1 = batch.seq_id[i][s];
+
+                seq_cpl[s1][s0] = true;
             }
         }
     }
 
-    // TODO:
-    //  - verify that coupled sequences have same "position contexts"
-    //  - verify that input sequences are "contiguous" (no position gaps)
-    //  - verify that input sequences begin from the last poition currently in the context
+    for (int32_t s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
+        if (seq_pos[s].empty()) {
+            continue;
+        }
+
+        if (memory && seq_pos_min(s) != memory->seq_pos_max(s) + 1) {
+            LLAMA_LOG_ERROR("%s: sequence %d does not start from the last position stored in the memory\n", __func__, s);
+            return false;
+        }
+
+        if (seq_pos_max(s) - seq_pos_min(s) + 1 > (int) seq_pos[s].size()) {
+            LLAMA_LOG_ERROR("%s: sequence %d is not contiguous\n", __func__, s);
+            return false;
+        }
+    }
+
+    if (memory) {
+        for (int32_t s0 = 0; s0 < LLAMA_MAX_PARALLEL_SEQUENCES; ++s0) {
+            for (int32_t s1 = 0; s1 < LLAMA_MAX_PARALLEL_SEQUENCES; ++s1) {
+                if (seq_cpl[s0][s1]) {
+                    if (memory->seq_pos_min(s0) != memory->seq_pos_min(s1) ||
+                        memory->seq_pos_max(s0) != memory->seq_pos_max(s1)) {
+                        LLAMA_LOG_ERROR("%s: sequence %d is coupled to %d in the input batch, but they have diverged\n", __func__, s0, s1);
+                        return false;
+                    }
+                }
+            }
+        }
+    }
 
     if (debug > 0) {
-        LLAMA_LOG_DEBUG("%s: input batch info (p0 = %d):\n", __func__, p0);
-        LLAMA_LOG_DEBUG("%s: n_tokens = %d\n", __func__, batch.n_tokens);
+        LLAMA_LOG_DEBUG("%s: input batch info:\n", __func__);
+        LLAMA_LOG_DEBUG("%s: n_tokens = %d\n", __func__, batch.n_tokens);
         LLAMA_LOG_DEBUG("%s: token    = %p\n", __func__, (void *) batch.token);
         LLAMA_LOG_DEBUG("%s: embd     = %p\n", __func__, (void *) batch.embd);
         LLAMA_LOG_DEBUG("%s: pos      = %p\n", __func__, (void *) batch.pos);
@@ -439,14 +477,21 @@ bool llama_batch_allocr::init(const llama_batch & batch_inp, const llama_vocab &
         }
         LLAMA_LOG_DEBUG("%s: ]\n", __func__);
 
-        LLAMA_LOG_DEBUG("%s: seq_pos = [\n", __func__);
-        for (int s = 0; s < (int) seq_pos.size(); ++s) {
-            const auto & cur = seq_pos[s];
-            if (cur.empty()) {
+        LLAMA_LOG_DEBUG("%s: seq = [\n", __func__);
+        for (int s0 = 0; s0 < (int) seq_pos.size(); ++s0) {
+            if (seq_pos[s0].empty()) {
                 continue;
             }
 
-            LLAMA_LOG_DEBUG("%s: %4d: [%4d, %4d]\n", __func__, s, seq_pos_min(s), seq_pos_max(s));
+            std::stringstream ss;
+            for (int s1 = 0; s1 < (int) seq_cpl[s0].size(); ++s1) {
+                if (seq_cpl[s0][s1]) {
+                    ss << s1 << " ";
+                }
+            }
+
+            LLAMA_LOG_DEBUG("%s: %4d: pos = [%4d, %4d], cpl = %s\n",
+                    __func__, s0, seq_pos_min(s0), seq_pos_max(s0), ss.str().empty() ? "-" : ss.str().c_str());
         }
         LLAMA_LOG_DEBUG("%s: ]\n", __func__);
     }
@@ -485,7 +530,7 @@ void llama_batch_allocr::clear() {
     }
 
     for (auto & cur : seq_cpl) {
-        cur.clear();
+        std::fill(cur.begin(), cur.end(), false);
     }
 }
 
```
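The contiguity check works because `seq_pos` stores each sequence's positions in a `std::set<llama_pos>`: duplicates collapse, so a gap-free run must contain exactly `max - min + 1` distinct positions. A tiny self-contained illustration of that invariant (not the committed code):

```cpp
#include <cassert>
#include <cstdint>
#include <set>

int main() {
    std::set<int32_t> seq_pos = {5, 6, 7, 9}; // position 8 is missing

    const int32_t pos_min = *seq_pos.begin();
    const int32_t pos_max = *seq_pos.rbegin();

    // the committed check: a contiguous sequence covers exactly
    // (pos_max - pos_min + 1) distinct positions
    const bool has_gap = pos_max - pos_min + 1 > (int32_t) seq_pos.size();
    assert(has_gap); // 9 - 5 + 1 = 5 > 4 distinct positions -> gap detected

    return 0;
}
```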

src/llama-batch.h

Lines changed: 5 additions & 3 deletions

```diff
@@ -84,8 +84,10 @@ class llama_batch_allocr {
     llama_batch_allocr();
 
     // optionally fulfill the batch returned by llama_batch_get_one
-    // TODO: extend p0 to be per-sequence: provide `seq_pos_min` and `seq_pos_max` from the memory
-    bool init(const llama_batch & batch_inp, const llama_vocab & vocab, llama_pos p0);
+    bool init(
+            const llama_batch & batch_inp,
+            const llama_vocab & vocab,
+            const llama_memory_i * memory);
 
     const llama_batch & get_batch() const;
 
@@ -109,7 +111,7 @@ class llama_batch_allocr {
     std::vector<int8_t> output;
 
     std::vector<std::set<llama_pos>> seq_pos; // the positions of each sequence
-    std::vector<std::vector<bool>>   seq_cpl; // if sequences i and j are coupled
+    std::vector<std::vector<bool>>   seq_cpl; // if sequence i is coupled to sequence j
 
     int debug;
 };
```
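Note the deliberately one-directional convention behind the updated comment: for a token tagged with several sequence ids, each secondary sequence is marked as coupled to the primary (first) one, i.e. `seq_cpl[s1][s0] = true`, not the symmetric pair. A small standalone sketch of the fill, where the 64 mirrors `LLAMA_MAX_PARALLEL_SEQUENCES`:

```cpp
#include <vector>

int main() {
    const int n_seq_max = 64; // mirrors LLAMA_MAX_PARALLEL_SEQUENCES

    std::vector<std::vector<bool>> seq_cpl(n_seq_max, std::vector<bool>(n_seq_max, false));

    // one token assigned to sequences {0, 2}: 0 is the primary (first) id
    const std::vector<int> token_seq_ids = {0, 2};

    for (size_t s = 1; s < token_seq_ids.size(); ++s) {
        const int s0 = token_seq_ids[0];
        const int s1 = token_seq_ids[s];
        seq_cpl[s1][s0] = true; // "sequence s1 is coupled to sequence s0"
    }

    // result: seq_cpl[2][0] == true, while seq_cpl[0][2] stays false
    return seq_cpl[2][0] && !seq_cpl[0][2] ? 0 : 1;
}
```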

src/llama-context.cpp

Lines changed: 2 additions & 2 deletions

```diff
@@ -729,7 +729,7 @@ int llama_context::encode(const llama_batch & batch_inp) {
 
     // temporary allocate memory for the input batch if needed
     // note: during encode, we always pass the full sequence starting from pos = 0
-    if (!batch_allocr->init(batch_inp, model.vocab, batch_inp.pos ? -1 : 0)) {
+    if (!batch_allocr->init(batch_inp, model.vocab, nullptr)) {
         LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
         return -1;
     }
@@ -896,7 +896,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
     }
 
     // temporary allocate memory for the input batch if needed
-    if (!batch_allocr->init(batch_inp, model.vocab, batch_inp.pos ? -1 : memory->seq_pos_max(0) + 1)) {
+    if (!batch_allocr->init(batch_inp, model.vocab, memory.get())) {
         LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
         return -1;
     }
```
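From the caller's perspective nothing changes in the common path: a batch from `llama_batch_get_one` still carries `pos == NULL`, and `init` now fills in positions from the memory state rather than from a scalar `p0`. A hedged usage sketch, assuming an already-initialized context and placeholder tokens:

```cpp
#include "llama.h"

#include <vector>

// assumes `ctx` was created elsewhere and `toks` holds the next tokens to feed
void step(llama_context * ctx, std::vector<llama_token> & toks) {
    // llama_batch_get_one() leaves batch.pos == NULL ...
    llama_batch batch = llama_batch_get_one(toks.data(), (int32_t) toks.size());

    // ... so positions now continue from memory->seq_pos_max(s) + 1 per sequence;
    // with this commit, non-contiguous or diverged multi-sequence inputs fail here
    if (llama_decode(ctx, batch) != 0) {
        // handle the error
    }
}
```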

src/llama-cparams.h

Lines changed: 1 addition & 0 deletions

```diff
@@ -4,6 +4,7 @@
 
 #include <cstdint>
 
+// TODO: rename to something shorter
 #define LLAMA_MAX_PARALLEL_SEQUENCES 64
 
 struct llama_cparams {
```
