cparams : rename LLAMA_MAX_PARALLEL_SEQUENCES to LLAMA_MAX_SEQ (ggml-org#14188)

ggerganov · qnixsynapse · commit 0235499d81b3 · 2025-07-10T08:05:27.000+05:30
ggml-ci
diff --git a/src/llama-batch.cpp b/src/llama-batch.cpp
@@ -289,10 +289,10 @@ llama_batch_allocr::llama_batch_allocr() {
     const char * LLAMA_BATCH_DEBUG = getenv("LLAMA_BATCH_DEBUG");
     debug = LLAMA_BATCH_DEBUG ? atoi(LLAMA_BATCH_DEBUG) : 0;
 
-    seq_pos.resize(LLAMA_MAX_PARALLEL_SEQUENCES);
-    seq_cpl.resize(LLAMA_MAX_PARALLEL_SEQUENCES);
+    seq_pos.resize(LLAMA_MAX_SEQ);
+    seq_cpl.resize(LLAMA_MAX_SEQ);
     for (auto & cur : seq_cpl) {
-        cur.resize(LLAMA_MAX_PARALLEL_SEQUENCES);
+        cur.resize(LLAMA_MAX_SEQ);
     }
 }
 
@@ -322,8 +322,8 @@ bool llama_batch_allocr::init(
     if (batch.seq_id) {
         for (int32_t i = 0; i < batch.n_tokens; ++i) {
             for (int32_t s = 0; s < batch.n_seq_id[i]; ++s) {
-                if (batch.seq_id && (batch.seq_id[i][s] < 0 || batch.seq_id[i][s] >= LLAMA_MAX_PARALLEL_SEQUENCES)) {
-                    LLAMA_LOG_ERROR("%s: invalid seq_id[%d][%d] = %d > %d\n", __func__, i, s, batch.seq_id[i][s], LLAMA_MAX_PARALLEL_SEQUENCES);
+                if (batch.seq_id && (batch.seq_id[i][s] < 0 || batch.seq_id[i][s] >= LLAMA_MAX_SEQ)) {
+                    LLAMA_LOG_ERROR("%s: invalid seq_id[%d][%d] = %d > %d\n", __func__, i, s, batch.seq_id[i][s], LLAMA_MAX_SEQ);
                     return false;
                 }
             }
@@ -355,8 +355,8 @@ bool llama_batch_allocr::init(
         pos.resize(batch.n_tokens);
 
         // initialize the starting position for each sequence based on the positions in the memory
-        llama_pos p0[LLAMA_MAX_PARALLEL_SEQUENCES];
-        for (int32_t s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
+        llama_pos p0[LLAMA_MAX_SEQ];
+        for (int32_t s = 0; s < LLAMA_MAX_SEQ; ++s) {
             if (!memory) {
                 p0[s] = 0;
             } else {
@@ -480,7 +480,7 @@ bool llama_batch_allocr::init(
     // consistency checks
     //
 
-    for (int32_t s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
+    for (int32_t s = 0; s < LLAMA_MAX_SEQ; ++s) {
         if (seq_pos[s].empty()) {
             continue;
         }
@@ -497,8 +497,8 @@ bool llama_batch_allocr::init(
     }
 
     if (memory) {
-        for (int32_t s0 = 0; s0 < LLAMA_MAX_PARALLEL_SEQUENCES; ++s0) {
-            for (int32_t s1 = 0; s1 < LLAMA_MAX_PARALLEL_SEQUENCES; ++s1) {
+        for (int32_t s0 = 0; s0 < LLAMA_MAX_SEQ; ++s0) {
+            for (int32_t s1 = 0; s1 < LLAMA_MAX_SEQ; ++s1) {
                 if (seq_cpl[s0][s1]) {
                     if (memory->seq_pos_min(s0) != memory->seq_pos_min(s1) ||
                         memory->seq_pos_max(s0) != memory->seq_pos_max(s1)) {
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
@@ -29,8 +29,8 @@ llama_context::llama_context(
     const auto & hparams = model.hparams;
 
     cparams.n_seq_max = std::max(1u, params.n_seq_max);
-    if (cparams.n_seq_max > LLAMA_MAX_PARALLEL_SEQUENCES) {
-        throw std::runtime_error("n_seq_max must be <= " + std::to_string(LLAMA_MAX_PARALLEL_SEQUENCES));
+    if (cparams.n_seq_max > LLAMA_MAX_SEQ) {
+        throw std::runtime_error("n_seq_max must be <= " + std::to_string(LLAMA_MAX_SEQ));
     }
 
     cparams.n_threads        = params.n_threads;
@@ -1023,8 +1023,8 @@ int llama_context::decode(const llama_batch & batch_inp) {
 
         if (!res) {
             // the last ubatch failed or was aborted -> remove all positions of that ubatch from the KV cache
-            llama_pos pos_min[LLAMA_MAX_PARALLEL_SEQUENCES];
-            for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
+            llama_pos pos_min[LLAMA_MAX_SEQ];
+            for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
                 pos_min[s] = std::numeric_limits<llama_pos>::max();
             }
 
@@ -1035,7 +1035,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
                 pos_min[seq_id] = std::min(pos_min[seq_id], ubatch.pos[i]);
             }
 
-            for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
+            for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
                 if (pos_min[s] == std::numeric_limits<llama_pos>::max()) {
                     continue;
                 }
diff --git a/src/llama-cparams.h b/src/llama-cparams.h
@@ -4,8 +4,7 @@
 
 #include <cstdint>
 
-// TODO: rename to something shorter
-#define LLAMA_MAX_PARALLEL_SEQUENCES 64
+#define LLAMA_MAX_SEQ 64
 
 struct llama_cparams {
     uint32_t n_ctx;           // context size used during inference
diff --git a/src/llama-kv-cache-unified.cpp b/src/llama-kv-cache-unified.cpp
@@ -572,7 +572,7 @@ int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const {
             LLAMA_LOG_DEBUG("\n%s\n", ss.c_str());
         }
 
-        for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
+        for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
             if (cells.seq_pos_min(s) < 0) {
                 continue;
             }
@@ -652,8 +652,8 @@ void llama_kv_cache_unified::apply_ubatch(uint32_t head_cur, const llama_ubatch
 
     // keep track of the max sequence position that we would overwrite with this ubatch
     // for non-SWA cache, this would be always empty
-    llama_seq_id seq_pos_max_rm[LLAMA_MAX_PARALLEL_SEQUENCES];
-    for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
+    llama_seq_id seq_pos_max_rm[LLAMA_MAX_SEQ];
+    for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
         seq_pos_max_rm[s] = -1;
     }
 
@@ -684,7 +684,7 @@ void llama_kv_cache_unified::apply_ubatch(uint32_t head_cur, const llama_ubatch
     // note: we want to preserve the invariant that all positions between [pos_min, pos_max] for each sequence
     //       will be present in the cache. so we have to purge any position which is less than those we would overwrite
     //       ref: https://github.com/ggml-org/llama.cpp/pull/13746#issuecomment-2916057092
-    for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
+    for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
         if (seq_pos_max_rm[s] == -1) {
             continue;
         }
diff --git a/src/llama-kv-cells.h b/src/llama-kv-cells.h
@@ -7,7 +7,6 @@
 #include <cassert>
 #include <vector>
 #include <set>
-#include <map>
 
 // meta information about KV cells that can be part of multiple sequences at the same time
 // TODO: add unit tests
@@ -165,7 +164,7 @@ class llama_kv_cells_unified {
         assert(seq_id >= 0);
 
         seq[i].reset(seq_id);
-        seq_pos_dec(seq_id, pos[i]);
+        seq_pos[seq_id].erase(pos[i]);
 
         if (seq[i].none()) {
             pos[i] = -1;
@@ -188,7 +187,7 @@ class llama_kv_cells_unified {
             seq[i].reset();
 
             seq[i].set(seq_id);
-            seq_pos_inc(seq_id, pos[i]);
+            seq_pos[seq_id].insert(pos[i]);
 
             return false;
         }
@@ -233,7 +232,7 @@ class llama_kv_cells_unified {
         assert(!seq[i].test(seq_id));
 
         seq[i].set(seq_id);
-        seq_pos_inc(seq_id, pos[i]);
+        seq_pos[seq_id].insert(pos[i]);
     }
 
     // return the sequence id of this cell
@@ -260,9 +259,7 @@ class llama_kv_cells_unified {
             return -1;
         }
 
-        assert(seq_pos[seq_id].begin()->second > 0);
-
-        return seq_pos[seq_id].begin()->first;
+        return *seq_pos[seq_id].begin();
     }
 
     // the maximum position of sequence seq_id currently present in any of the cells
@@ -275,9 +272,7 @@ class llama_kv_cells_unified {
             return -1;
         }
 
-        assert(seq_pos[seq_id].rbegin()->second > 0);
-
-        return seq_pos[seq_id].rbegin()->first;
+        return *seq_pos[seq_id].rbegin();
     }
 
     // note: call only if the cell is not empty
@@ -389,41 +384,22 @@ class llama_kv_cells_unified {
     //
     std::vector<llama_pos> shift;
 
-    using seq_set_t = std::bitset<LLAMA_MAX_SEQ>;
+    using bits_t = std::bitset<LLAMA_MAX_SEQ>;
 
     // the bitset seq[i] tells us which sequences are currently occupying the i-th cell
-    std::vector<seq_set_t> seq;
+    std::vector<bits_t> seq;
 
-    // the set seq_pos[s][p] tells us how many times the position p is currently present for sequence s
-    // if the position p is not present, seq_pos[s][p] is not set
+    // the set seq_pos[s] tells us which positions are currently present for sequence s
     // this way seq_pos[s].begin() and seq_pos[s].rbegin() give us the min/max positions currently in the cache
-    //
-    // note that we cannot a use an std::set because in some cases a position can occur more than once for the same seq:
-    //  - during performing a cache reuse via (rm + add)
-    //  - some vision models have input embeddings with repeating positions
-    //
-    std::map<llama_pos, int> seq_pos[LLAMA_MAX_SEQ];
+    std::set<llama_pos> seq_pos[LLAMA_MAX_SEQ];
 
     // helper functions for updating `seq_pos`, once cell at a time:
 
-    void seq_pos_dec(llama_seq_id s, llama_pos p) {
-        auto it = seq_pos[s].find(p);
-        assert(it != seq_pos[s].end());
-
-        if (--it->second == 0) {
-            seq_pos[s].erase(it);
-        }
-    }
-
-    void seq_pos_inc(llama_seq_id s, llama_pos p) {
-        seq_pos[s][p]++;
-    }
-
     // remove cell i
     void seq_pos_rm(uint32_t i) {
         for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
             if (seq[i].test(s)) {
-                seq_pos_dec(s, pos[i]);
+                seq_pos[s].erase(pos[i]);
             }
         }
     }
@@ -432,7 +408,7 @@ class llama_kv_cells_unified {
     void seq_pos_add(uint32_t i) {
         for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
             if (seq[i].test(s)) {
-                seq_pos_inc(s, pos[i]);
+                seq_pos[s].insert(pos[i]);
             }
         }
     }

Original file line number	Diff line number	Diff line change
`@@ -29,8 +29,8 @@ llama_context::llama_context(`
`29`	`29`	`const auto & hparams = model.hparams;`
`30`	`30`
`31`	`31`	`cparams.n_seq_max = std::max(1u, params.n_seq_max);`
`32`		`- if (cparams.n_seq_max > LLAMA_MAX_PARALLEL_SEQUENCES) {`
`33`		`- throw std::runtime_error("n_seq_max must be <= " + std::to_string(LLAMA_MAX_PARALLEL_SEQUENCES));`
	`32`	`+ if (cparams.n_seq_max > LLAMA_MAX_SEQ) {`
	`33`	`+ throw std::runtime_error("n_seq_max must be <= " + std::to_string(LLAMA_MAX_SEQ));`
`34`	`34`	`}`
`35`	`35`
`36`	`36`	`cparams.n_threads = params.n_threads;`
`@@ -1023,8 +1023,8 @@ int llama_context::decode(const llama_batch & batch_inp) {`
`1023`	`1023`
`1024`	`1024`	`if (!res) {`
`1025`	`1025`	`// the last ubatch failed or was aborted -> remove all positions of that ubatch from the KV cache`
`1026`		`- llama_pos pos_min[LLAMA_MAX_PARALLEL_SEQUENCES];`
`1027`		`- for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {`
	`1026`	`+ llama_pos pos_min[LLAMA_MAX_SEQ];`
	`1027`	`+ for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {`
`1028`	`1028`	`pos_min[s] = std::numeric_limits<llama_pos>::max();`
`1029`	`1029`	`}`
`1030`	`1030`
`@@ -1035,7 +1035,7 @@ int llama_context::decode(const llama_batch & batch_inp) {`
`1035`	`1035`	`pos_min[seq_id] = std::min(pos_min[seq_id], ubatch.pos[i]);`
`1036`	`1036`	`}`
`1037`	`1037`
`1038`		`- for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {`
	`1038`	`+ for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {`
`1039`	`1039`	`if (pos_min[s] == std::numeric_limits<llama_pos>::max()) {`
`1040`	`1040`	`continue;`
`1041`	`1041`	`}`
Original file line number	Diff line number	Diff line change
`@@ -572,7 +572,7 @@ int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const {`
`572`	`572`	`LLAMA_LOG_DEBUG("\n%s\n", ss.c_str());`
`573`	`573`	`}`
`574`	`574`
`575`		`- for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {`
	`575`	`+ for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {`
`576`	`576`	`if (cells.seq_pos_min(s) < 0) {`
`577`	`577`	`continue;`
`578`	`578`	`}`
`@@ -652,8 +652,8 @@ void llama_kv_cache_unified::apply_ubatch(uint32_t head_cur, const llama_ubatch`
`652`	`652`
`653`	`653`	`// keep track of the max sequence position that we would overwrite with this ubatch`
`654`	`654`	`// for non-SWA cache, this would be always empty`
`655`		`- llama_seq_id seq_pos_max_rm[LLAMA_MAX_PARALLEL_SEQUENCES];`
`656`		`- for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {`
	`655`	`+ llama_seq_id seq_pos_max_rm[LLAMA_MAX_SEQ];`
	`656`	`+ for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {`
`657`	`657`	`seq_pos_max_rm[s] = -1;`
`658`	`658`	`}`
`659`	`659`
`@@ -684,7 +684,7 @@ void llama_kv_cache_unified::apply_ubatch(uint32_t head_cur, const llama_ubatch`
`684`	`684`	`// note: we want to preserve the invariant that all positions between [pos_min, pos_max] for each sequence`
`685`	`685`	`// will be present in the cache. so we have to purge any position which is less than those we would overwrite`
`686`	`686`	`// ref: https://github.com/ggml-org/llama.cpp/pull/13746#issuecomment-2916057092`
`687`		`- for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {`
	`687`	`+ for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {`
`688`	`688`	`if (seq_pos_max_rm[s] == -1) {`
`689`	`689`	`continue;`
`690`	`690`	`}`