@@ -130,42 +130,20 @@ bool llama_batch_allocr::init(
                 warn = true;
             }
         }
-
-        if (warn) {
-            LLAMA_LOG_WARN("%s: embeddings required but some input tokens were not marked as outputs -> overriding\n", __func__);
-
-            output.resize(batch.n_tokens, true);
-            batch.logits = output.data();
-        }
-    }
-
-    //
-    // compute stats
-    //
-
-    this->n_embd = n_embd;
-
-    // count the outputs in this batch
-    for (int32_t i = 0; i < batch.n_tokens; ++i) {
-        n_outputs += batch.logits[i] != 0;
     }
-
-    // determine coupled sequences
-    // these are pairs of sequences that have at least one token in the input batch that is assigned to both of them
-    for (int32_t i = 0; i < batch.n_tokens; ++i) {
-        const llama_seq_id s0 = batch.seq_id[i][0];
-
-        for (int32_t s = 0; s < batch.n_seq_id[i]; ++s) {
-            const llama_seq_id s1 = batch.seq_id[i][s];
-
-            seq_pos[s1].insert(batch.pos[i]);
-
-            if (s > 0) {
-                // mark that sequence s1 is coupled to s0
-                seq_cpl[s1][s0] = true;
-
-                // note: tracking the other way around is not necessary for now
-                // seq_cpl[s0][s1] = true;
+    if (batch->logits) {
+        if (ubatch.equal_seqs) {
+            for (size_t i = 0; i < length; ++i) {
+                size_t id = ids[seq.offset + i];
+                int8_t is_output = batch->logits[id];
+                ubatch.output[ubatch.n_tokens + i] = is_output;
+                if (is_output) { out_ids.push_back(id); }
+            }
+        } else {
+            // simple split
+            ubatch.output = batch->logits + seq.offset;
+            for (size_t i = 0; i < length; ++i) {
+                if (ubatch.output[i] != 0) { out_ids.push_back(seq.offset + i); }
             }
         }
     }
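
The `equal_seqs` branch restored above gathers tokens through the `ids[]` permutation, so each token's output flag has to be copied individually and the original batch index of every output token remembered in `out_ids`. The following standalone sketch (not llama.cpp code; the array contents are invented for illustration) shows just that bookkeeping:

```cpp
// Toy illustration of the equal_seqs output gathering: copy output flags
// through a permutation and record the original batch indices in out_ids.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    // hypothetical batch of 6 tokens; logits[i] != 0 marks token i as an output
    std::vector<int8_t> logits = {0, 0, 1, 0, 1, 1};
    // ids[] reorders the batch (e.g. after grouping by sequence)
    std::vector<size_t> ids = {2, 0, 1, 5, 3, 4};

    std::vector<int8_t> ubatch_output; // per-token output flags in ubatch order
    std::vector<size_t> out_ids;       // original batch indices of output tokens

    for (size_t i = 0; i < ids.size(); ++i) {
        const size_t id        = ids[i];
        const int8_t is_output = logits[id];
        ubatch_output.push_back(is_output);
        if (is_output) { out_ids.push_back(id); }
    }

    // out_ids lets the caller scatter results computed in ubatch order
    // back to the positions requested in the original batch
    for (size_t id : out_ids) { printf("output token: batch index %zu\n", id); }
    return 0;
}
```
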
@@ -281,141 +259,49 @@ bool llama_batch_allocr::init(
         }
     }
 
-    if (memory) {
-        for (int32_t s0 = 0; s0 < LLAMA_MAX_SEQ; ++s0) {
-            for (int32_t s1 = 0; s1 < LLAMA_MAX_SEQ; ++s1) {
-                if (seq_cpl[s0][s1]) {
-                    if (memory->seq_pos_min(s0) != memory->seq_pos_min(s1) ||
-                        memory->seq_pos_max(s0) != memory->seq_pos_max(s1)) {
-                        LLAMA_LOG_ERROR("%s: sequence %d is coupled to %d in the input batch, but have divereged\n", __func__, s0, s1);
-                        return false;
-                    }
-                }
+llama_ubatch llama_sbatch::split_equal(size_t n_ubatch) {
+    n_ubatch = n_tokens < n_ubatch ? n_tokens : n_ubatch;
+    llama_ubatch ubatch = reserve_ubatch(n_ubatch, /* has_embd */ batch->embd != nullptr);
+    if (!seq.empty()) {
+        size_t length = 0;
+        size_t n_tokens_in_ubatch = 0;
+        GGML_ASSERT(seq[0].n_seq_id > 0); // should not be mixed with simple splits
+        // smallest first, because it's easier to split this way;
+        // starting from the end to pop in constant time.
+        for (size_t i = seq.size(); i-- > 0;) {
+            llama_sbatch_seq & s = seq[i];
+            GGML_ASSERT(s.length > 0);
+            if (length == 0) {
+                length = s.length < n_ubatch ? s.length : n_ubatch;
             }
+            add_seq_to_ubatch(ubatch, s, length);
+            n_tokens_in_ubatch += length;
+            // shared prompts can't be mixed with any of their sequences,
+            // so it's safer to compute them in their own ubatch
+            if (s.n_seq_id > 1) { break; }
+            // stop when there isn't enough space for another sequence
+            if (length + n_tokens_in_ubatch > n_ubatch) { break; }
         }
     }
-
-    // disallow partial sequence sub-sets:
-    //
-    //       invalid:     x
-    //            i: 0 1 2 ...
-    // ---------------------------------------
-    // seq_id[i][0]: 0 0 1
-    // seq_id[i][1]: 1 1 2
-    // seq_id[i][2]: 2
-    //
-    // disallow decreasing sequence positions:
-    //
-    //       invalid:           x
-    //            i: 0 1 2 3 4 5 6 ...
-    // ---------------------------------------
-    //       pos[i]: 4 5 0 1 6 2 3
-    // seq_id[i][0]: 0 0 1 1 0 1 0
-    //
-    {
-        seq_set_t cur_seq_set[LLAMA_MAX_SEQ];
-        for (int32_t s = 0; s < LLAMA_MAX_SEQ; ++s) {
-            cur_seq_set[s].set();
-        }
-
-        llama_pos cur_seq_pos[LLAMA_MAX_SEQ];
-        for (int32_t s = 0; s < LLAMA_MAX_SEQ; ++s) {
-            cur_seq_pos[s] = -1;
-        }
-
-        for (int32_t i = 0; i < batch.n_tokens; ++i) {
-            const llama_pos pos = batch.pos[i];
-
-            for (int32_t s = 0; s < batch.n_seq_id[i]; ++s) {
-                const llama_seq_id seq_id = batch.seq_id[i][s];
-
-                cur_seq_set[seq_id] &= seq_set[i];
-
-                if (cur_seq_set[seq_id].none()) {
-                    LLAMA_LOG_ERROR("%s: sequence %d belongs to incompatible sequence sets (not allowed)\n", __func__, seq_id);
-                    return false;
-                }
-
-                if (pos < cur_seq_pos[seq_id]) {
-                    LLAMA_LOG_ERROR("%s: sequence %d positions are decreasing (not allowed)\n", __func__, seq_id);
-                    return false;
-                }
-            }
-        }
-    }
-
-    split_reset();
-
-    return true;
+    return ubatch;
 }
 
-llama_ubatch llama_batch_allocr::ubatch_reserve(uint32_t n_seq_tokens, uint32_t n_seqs) {
-    const uint32_t n_tokens = n_seq_tokens*n_seqs;
-
-    clear();
-    split_reset();
-
-    ubatches.emplace_back();
-
-    auto & ubatch = ubatches.back();
-
-    ubatch.token     .resize(n_tokens);
-    ubatch.embd      .clear();
-    ubatch.pos       .resize(n_tokens);
-    ubatch.n_seq_id  .resize(n_tokens);
-    ubatch.seq_id    .resize(n_tokens);
-    ubatch.seq_id_unq.resize(0);
-    ubatch.seq_idx   .resize(LLAMA_MAX_SEQ, -1);
-    ubatch.output    .resize(n_tokens);
-
-    for (uint32_t s = 0; s < n_seqs; ++s) {
-        ubatch.seq_idx[s] = s;
-        ubatch.seq_id_unq.push_back(s);
+llama_ubatch llama_sbatch::split_seq(size_t n_ubatch) {
+    n_ubatch = n_tokens < n_ubatch ? n_tokens : n_ubatch;
+    llama_ubatch ubatch = reserve_ubatch(n_ubatch, /* has_embd */ batch->embd != nullptr);
+    if (!seq.empty()) {
+        llama_sbatch_seq & s = seq[seq.size() - 1];
+        size_t length = s.length < n_ubatch ? s.length : n_ubatch;
+        GGML_ASSERT(s.n_seq_id > 0); // should not be mixed with simple splits
+        add_seq_to_ubatch(ubatch, s, length);
     }
-
-    llama_ubatch res {
-        /*.equal_seqs   =*/ true,
-        /*.n_tokens     =*/ n_tokens,
-        /*.n_seq_tokens =*/ n_seq_tokens,
-        /*.n_seqs       =*/ n_seqs,
-        /*.n_seqs_unq   =*/ n_seqs,
-
-        /*.token        =*/ ubatch.token.data(),
-        /*.embd         =*/ nullptr,
-        /*.pos          =*/ ubatch.pos.data(),
-        /*.n_seq_id     =*/ ubatch.n_seq_id.data(),
-        /*.seq_id       =*/ ubatch.seq_id.data(),
-        /*.seq_id_unq   =*/ ubatch.seq_id_unq.data(),
-        /*.seq_idx      =*/ ubatch.seq_idx.data(),
-        /*.output       =*/ ubatch.output.data(),
-    };
-
-    return res;
+    return ubatch;
 }
 
-const llama_batch & llama_batch_allocr::get_batch() const {
-    return batch;
-}
-
-uint32_t llama_batch_allocr::get_n_tokens() const {
-    return batch.n_tokens;
-}
-
-uint32_t llama_batch_allocr::get_n_outputs() const {
-    return n_outputs;
-}
-
-std::vector<int32_t> & llama_batch_allocr::get_out_ids() {
-    return out_ids;
-}
-
-llama_pos llama_batch_allocr::seq_pos_min(llama_seq_id seq_id) const {
-    return seq_pos[seq_id].empty() ? -1 : *seq_pos[seq_id].begin();
-}
-
-llama_pos llama_batch_allocr::seq_pos_max(llama_seq_id seq_id) const {
-    return seq_pos[seq_id].empty() ? -1 : *seq_pos[seq_id].rbegin();
-}
+llama_sbatch::llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split) {
+    GGML_ASSERT(batch.n_tokens >= 0);
+    this->batch = &batch;
+    this->n_embd = n_embd;
 
 void llama_batch_allocr::split_reset() {
     out_ids.clear();
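
The `split_equal()` restored in this diff packs slices of equal length into one ubatch, starting from the smallest sequence and stopping when the token budget would overflow or a shared prompt is hit. The standalone sketch below mimics that packing loop on plain vectors; the `seq_info` struct, the sequence lengths and the 16-token budget are invented for illustration and are not part of llama.cpp:

```cpp
// Toy model of the greedy packing in llama_sbatch::split_equal: walk the
// sequences from smallest to largest, give every sequence the same slice
// length, and stop on a shared prompt or when the budget runs out.
#include <cstdio>
#include <vector>

struct seq_info {
    size_t length;   // tokens remaining in this sequence
    int    n_seq_id; // >1 means a shared prompt (kept in its own ubatch)
};

int main() {
    // sequences sorted by descending length, so the smallest is at the back
    std::vector<seq_info> seqs = {{32, 1}, {8, 1}, {5, 1}, {3, 1}};
    const size_t n_ubatch = 16; // token budget per micro-batch

    size_t length = 0;             // per-sequence slice, fixed by the smallest sequence
    size_t n_tokens_in_ubatch = 0;

    for (size_t i = seqs.size(); i-- > 0;) {
        const seq_info & s = seqs[i];
        if (length == 0) {
            length = s.length < n_ubatch ? s.length : n_ubatch;
        }
        n_tokens_in_ubatch += length;
        printf("take %zu token(s) from sequence %zu\n", length, i);
        if (s.n_seq_id > 1) { break; }                          // shared prompt: own ubatch
        if (length + n_tokens_in_ubatch > n_ubatch) { break; }  // budget exhausted
    }
    printf("ubatch holds %zu tokens (budget %zu)\n", n_tokens_in_ubatch, n_ubatch);
    return 0;
}
```
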