From bd697ca77df238f0ee7382c0a4e575f18dc5df57 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Mon, 21 Oct 2024 00:09:56 +0200
Subject: [PATCH 1/5] llama : fix empty batch causing llama_batch_allocr to crash

---
 src/llama.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/llama.cpp b/src/llama.cpp
index 1813dd29be2b2..d64200402b034 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -21139,6 +21139,10 @@ struct llama_batch_allocr {
     // optionally fulfill the batch returned by llama_batch_get_one
     llama_batch_allocr(struct llama_context * ctx, struct llama_batch in_batch) {
         batch = in_batch;
+        if (batch.n_tokens == 0) {
+            // llama_(de|en)code_internal will return an error in this case
+            return;
+        }
         if (!batch.pos) {
             // determine the last position in KV cache
             llama_pos last_pos = -1;

From 6ab116ac5a90ebc4fd51e0683703db5cfaf88dfa Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Tue, 22 Oct 2024 13:01:22 +0200
Subject: [PATCH 2/5] move batch_allocr inside decode/encode_internal

---
 src/llama.cpp | 33 ++++++++++++++++-----------------
 1 file changed, 16 insertions(+), 17 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index d64200402b034..4d424bfaf19a6 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -17108,16 +17108,19 @@ static void llama_graph_compute(
 //
 static int llama_decode_internal(
          llama_context & lctx,
-          llama_batch     batch) {
+          llama_batch     inp_batch) {
 
     lctx.is_encoding = false;
-    const uint32_t n_tokens_all = batch.n_tokens;
 
-    if (n_tokens_all == 0) {
+    if (inp_batch.n_tokens == 0) {
         LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__);
         return -1;
     }
 
+    llama_batch_allocr batch_allocr(lctx, inp_batch);
+    llama_batch batch = batch_allocr.batch;
+    const uint32_t n_tokens_all = batch.n_tokens;
+
     const auto & model   = lctx.model;
     const auto & hparams = model.hparams;
     const auto & cparams = lctx.cparams;
@@ -17422,17 +17425,19 @@ static int llama_decode_internal(
 //
 static int llama_encode_internal(
          llama_context & lctx,
-          llama_batch     batch) {
+          llama_batch     inp_batch) {
 
     lctx.is_encoding = true;
 
-    const uint32_t n_tokens = batch.n_tokens;
-
-    if (n_tokens == 0) {
+    if (inp_batch.n_tokens == 0) {
         LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__);
         return -1;
     }
 
+    llama_batch_allocr batch_allocr(lctx, inp_batch);
+    llama_batch batch = batch_allocr.batch;
+    const uint32_t n_tokens = batch.n_tokens;
+
     const auto & model   = lctx.model;
     const auto & hparams = model.hparams;
     const auto & cparams = lctx.cparams;
@@ -21137,16 +21142,12 @@ struct llama_batch_allocr {
     std::vector<int8_t> logits;
     struct llama_batch batch;
     // optionally fulfill the batch returned by llama_batch_get_one
-    llama_batch_allocr(struct llama_context * ctx, struct llama_batch in_batch) {
+    llama_batch_allocr(llama_context & ctx, struct llama_batch in_batch) {
         batch = in_batch;
-        if (batch.n_tokens == 0) {
-            // llama_(de|en)code_internal will return an error in this case
-            return;
-        }
         if (!batch.pos) {
             // determine the last position in KV cache
             llama_pos last_pos = -1;
-            for (const auto & cell : ctx->kv_self.cells) {
+            for (const auto & cell : ctx.kv_self.cells) {
                 if (cell.has_seq_id(batch_default_seq_id)) {
                     last_pos = std::max(last_pos, cell.pos);
                 }
             }
@@ -21184,8 +21185,7 @@
 int32_t llama_encode(
         struct llama_context * ctx,
           struct llama_batch   batch) {
-    llama_batch_allocr batch_allocr(ctx, batch);
-    const int ret = llama_encode_internal(*ctx, batch_allocr.batch);
+    const int ret = llama_encode_internal(*ctx, batch);
 
     if (ret != 0) {
         LLAMA_LOG_ERROR("%s: failed to encode, ret = %d\n", __func__, ret);
     }
@@ -21196,8 +21196,7 @@ int32_t llama_encode(
 int32_t llama_decode(
         struct llama_context * ctx,
           struct llama_batch   batch) {
-    llama_batch_allocr batch_allocr(ctx, batch);
-    const int ret = llama_decode_internal(*ctx, batch_allocr.batch);
+    const int ret = llama_decode_internal(*ctx, batch);
 
     if (ret != 0) {
         LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
     }

From 540c3016d8808c40684ecaa5caf594a1dcceda6b Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Tue, 22 Oct 2024 13:10:40 +0200
Subject: [PATCH 3/5] fix build

---
 src/llama.cpp | 102 +++++++++++++++++++++++++++----------------------------
 1 file changed, 52 insertions(+), 50 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 4d424bfaf19a6..3c6d9c363dfe8 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -5190,6 +5190,56 @@ struct llama_model_loader {
     }
 };
 
+// temporary allocate memory for the input batch if needed
+static const llama_seq_id batch_default_seq_id = 0;
+struct llama_batch_allocr {
+    std::array<llama_seq_id, 1> seq_id_0 = {batch_default_seq_id};
+    std::vector<llama_pos>      pos;
+    std::vector<int32_t>        n_seq_id;
+    std::vector<llama_seq_id *> seq_id;
+    std::vector<int8_t>         logits;
+    struct llama_batch batch;
+    // optionally fulfill the batch returned by llama_batch_get_one
+    llama_batch_allocr(llama_context & ctx, struct llama_batch in_batch) {
+        batch = in_batch;
+        if (!batch.pos) {
+            // determine the last position in KV cache
+            llama_pos last_pos = -1;
+            for (const auto & cell : ctx.kv_self.cells) {
+                if (cell.has_seq_id(batch_default_seq_id)) {
+                    last_pos = std::max(last_pos, cell.pos);
+                }
+            }
+            last_pos++; // next position
+            pos.resize(batch.n_tokens);
+            for (int32_t i = 0; i < batch.n_tokens; i++) {
+                pos[i] = i+last_pos;
+            }
+            batch.pos = pos.data();
+        }
+        if (!batch.n_seq_id) {
+            n_seq_id.resize(batch.n_tokens);
+            for (int32_t i = 0; i < batch.n_tokens; i++) {
+                n_seq_id[i] = seq_id_0.size();
+            }
+            batch.n_seq_id = n_seq_id.data();
+        }
+        if (!batch.seq_id) {
+            seq_id.resize(batch.n_tokens + 1);
+            seq_id[batch.n_tokens] = NULL;
+            for (int32_t i = 0; i < batch.n_tokens; i++) {
+                seq_id[i] = seq_id_0.data();
+            }
+            batch.seq_id = seq_id.data();
+        }
+        if (!batch.logits) {
+            logits.resize(batch.n_tokens);
+            logits[logits.size() - 1] = true;
+            batch.logits = logits.data();
+        }
+    }
+};
+
 template<>
 bool llama_model_loader::get_key(const enum llm_kv kid, enum llama_pooling_type & result, const bool required) {
     uint32_t tmp;
@@ -17117,6 +17167,7 @@ static int llama_decode_internal(
         return -1;
     }
 
+    // temporary allocate memory for the input batch if needed
     llama_batch_allocr batch_allocr(lctx, inp_batch);
     llama_batch batch = batch_allocr.batch;
     const uint32_t n_tokens_all = batch.n_tokens;
@@ -17434,6 +17485,7 @@ static int llama_encode_internal(
         return -1;
     }
 
+    // temporary allocate memory for the input batch if needed
     llama_batch_allocr batch_allocr(lctx, inp_batch);
     llama_batch batch = batch_allocr.batch;
     const uint32_t n_tokens = batch.n_tokens;
@@ -21132,56 +21184,6 @@ void llama_batch_free(struct llama_batch batch) {
     if (batch.logits)   free(batch.logits);
 }
 
-// temporary allocate memory for the input batch if needed
-static const llama_seq_id batch_default_seq_id = 0;
-struct llama_batch_allocr {
-    std::array<llama_seq_id, 1> seq_id_0 = {batch_default_seq_id};
-    std::vector<llama_pos>      pos;
-    std::vector<int32_t>        n_seq_id;
-    std::vector<llama_seq_id *> seq_id;
-    std::vector<int8_t>         logits;
-    struct llama_batch batch;
-    // optionally fulfill the batch returned by llama_batch_get_one
-    llama_batch_allocr(llama_context & ctx, struct llama_batch in_batch) {
-        batch = in_batch;
-        if (!batch.pos) {
-            // determine the last position in KV cache
-            llama_pos last_pos = -1;
-            for (const auto & cell : ctx.kv_self.cells) {
-                if (cell.has_seq_id(batch_default_seq_id)) {
-                    last_pos = std::max(last_pos, cell.pos);
-                }
-            }
-            last_pos++; // next position
-            pos.resize(batch.n_tokens);
-            for (int32_t i = 0; i < batch.n_tokens; i++) {
-                pos[i] = i+last_pos;
-            }
-            batch.pos = pos.data();
-        }
-        if (!batch.n_seq_id) {
-            n_seq_id.resize(batch.n_tokens);
-            for (int32_t i = 0; i < batch.n_tokens; i++) {
-                n_seq_id[i] = seq_id_0.size();
-            }
-            batch.n_seq_id = n_seq_id.data();
-        }
-        if (!batch.seq_id) {
-            seq_id.resize(batch.n_tokens + 1);
-            seq_id[batch.n_tokens] = NULL;
-            for (int32_t i = 0; i < batch.n_tokens; i++) {
-                seq_id[i] = seq_id_0.data();
-            }
-            batch.seq_id = seq_id.data();
-        }
-        if (!batch.logits) {
-            logits.resize(batch.n_tokens);
-            logits[logits.size() - 1] = true;
-            batch.logits = logits.data();
-        }
-    }
-};
-
 int32_t llama_encode(
         struct llama_context * ctx,
           struct llama_batch   batch) {

From 11b1564efb0690ff9df764e6b8dba700f5ec6f27 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Tue, 22 Oct 2024 13:14:38 +0200
Subject: [PATCH 4/5] add GGML_ASSERT

---
 src/llama.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/llama.cpp b/src/llama.cpp
index 3c6d9c363dfe8..669c5f28e2446 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -5202,6 +5202,7 @@ struct llama_batch_allocr {
     // optionally fulfill the batch returned by llama_batch_get_one
     llama_batch_allocr(llama_context & ctx, struct llama_batch in_batch) {
         batch = in_batch;
+        GGML_ASSERT(batch.n_tokens > 0);
         if (!batch.pos) {
             // determine the last position in KV cache
             llama_pos last_pos = -1;

From f7320036227b870e993d734e47762ac0cdb4dbbb Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Tue, 22 Oct 2024 15:23:42 +0200
Subject: [PATCH 5/5] Apply suggestions from code review

Co-authored-by: Georgi Gerganov
---
 src/llama.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 669c5f28e2446..8f21c39d123c9 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -17170,7 +17170,7 @@ static int llama_decode_internal(
 
     // temporary allocate memory for the input batch if needed
     llama_batch_allocr batch_allocr(lctx, inp_batch);
-    llama_batch batch = batch_allocr.batch;
+    const llama_batch & batch = batch_allocr.batch;
     const uint32_t n_tokens_all = batch.n_tokens;
 
     const auto & model   = lctx.model;
@@ -17488,7 +17488,7 @@ static int llama_encode_internal(
 
     // temporary allocate memory for the input batch if needed
     llama_batch_allocr batch_allocr(lctx, inp_batch);
-    llama_batch batch = batch_allocr.batch;
+    const llama_batch & batch = batch_allocr.batch;
     const uint32_t n_tokens = batch.n_tokens;
 
     const auto & model   = lctx.model;
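For context: llama_batch_allocr lets the partial batch returned by llama_batch_get_one (which sets only token and n_tokens) be passed directly to llama_decode/llama_encode; positions continue from the KV cache and a single default sequence id is assumed. A minimal caller-side sketch, illustrative only (the helper decode_prompt is hypothetical, and the two-argument llama_batch_get_one signature from the preceding refactor is assumed):

// Illustrative sketch, not part of the patch series. The internal
// llama_batch_allocr fills pos, n_seq_id, seq_id and logits for us,
// and an empty batch now returns -1 instead of crashing the allocator.
#include "llama.h"

#include <cstdio>
#include <vector>

static int decode_prompt(llama_context * ctx, std::vector<llama_token> & tokens) {
    // guard mirrors the new GGML_ASSERT(batch.n_tokens > 0) in the allocator
    if (tokens.empty()) {
        return -1;
    }
    llama_batch batch = llama_batch_get_one(tokens.data(), (int32_t) tokens.size());
    const int ret = llama_decode(ctx, batch);
    if (ret != 0) {
        fprintf(stderr, "llama_decode failed, ret = %d\n", ret);
    }
    return ret;
}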