From bd697ca77df238f0ee7382c0a4e575f18dc5df57 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Mon, 21 Oct 2024 00:09:56 +0200
Subject: [PATCH 1/5] llama : fix empty batch causing llama_batch_allocr to crash

---
 src/llama.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/llama.cpp b/src/llama.cpp
index 1813dd29be2b2..d64200402b034 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -21139,6 +21139,10 @@ struct llama_batch_allocr {
     // optionally fulfill the batch returned by llama_batch_get_one
     llama_batch_allocr(struct llama_context * ctx, struct llama_batch in_batch) {
         batch = in_batch;
+        if (batch.n_tokens == 0) {
+            // llama_(de|en)code_internal will return an error in this case
+            return;
+        }
         if (!batch.pos) {
             // determine the last position in KV cache
             llama_pos last_pos = -1;

From 6ab116ac5a90ebc4fd51e0683703db5cfaf88dfa Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Tue, 22 Oct 2024 13:01:22 +0200
Subject: [PATCH 2/5] move batch_allocr inside decode/encode_internal

---
 src/llama.cpp | 33 ++++++++++++++++-----------------
 1 file changed, 16 insertions(+), 17 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index d64200402b034..4d424bfaf19a6 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -17108,16 +17108,19 @@ static void llama_graph_compute(
 //
 static int llama_decode_internal(
          llama_context & lctx,
-          llama_batch     batch) {
+          llama_batch     inp_batch) {
 
     lctx.is_encoding = false;
-    const uint32_t n_tokens_all = batch.n_tokens;
 
-    if (n_tokens_all == 0) {
+    if (inp_batch.n_tokens == 0) {
         LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__);
         return -1;
     }
 
+    llama_batch_allocr batch_allocr(lctx, inp_batch);
+    llama_batch batch = batch_allocr.batch;
+    const uint32_t n_tokens_all = batch.n_tokens;
+
     const auto & model   = lctx.model;
     const auto & hparams = model.hparams;
     const auto & cparams = lctx.cparams;
@@ -17422,17 +17425,19 @@ static int llama_decode_internal(
 //
 static int llama_encode_internal(
          llama_context & lctx,
-          llama_batch     batch) {
+          llama_batch     inp_batch) {
 
     lctx.is_encoding = true;
 
-    const uint32_t n_tokens = batch.n_tokens;
-
-    if (n_tokens == 0) {
+    if (inp_batch.n_tokens == 0) {
         LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__);
         return -1;
     }
 
+    llama_batch_allocr batch_allocr(lctx, inp_batch);
+    llama_batch batch = batch_allocr.batch;
+    const uint32_t n_tokens = batch.n_tokens;
+
     const auto & model   = lctx.model;
     const auto & hparams = model.hparams;
     const auto & cparams = lctx.cparams;
@@ -21137,16 +21142,12 @@ struct llama_batch_allocr {
     std::vector<int8_t> logits;
     struct llama_batch batch;
     // optionally fulfill the batch returned by llama_batch_get_one
-    llama_batch_allocr(struct llama_context * ctx, struct llama_batch in_batch) {
+    llama_batch_allocr(llama_context & ctx, struct llama_batch in_batch) {
         batch = in_batch;
-        if (batch.n_tokens == 0) {
-            // llama_(de|en)code_internal will return an error in this case
-            return;
-        }
         if (!batch.pos) {
             // determine the last position in KV cache
             llama_pos last_pos = -1;
-            for (const auto & cell : ctx->kv_self.cells) {
+            for (const auto & cell : ctx.kv_self.cells) {
                 if (cell.has_seq_id(batch_default_seq_id)) {
                     last_pos = std::max(last_pos, cell.pos);
                 }
             }
@@ -21184,8 +21185,7 @@
 int32_t llama_encode(
         struct llama_context * ctx,
           struct llama_batch   batch) {
-    llama_batch_allocr batch_allocr(ctx, batch);
-    const int ret = llama_encode_internal(*ctx, batch_allocr.batch);
+    const int ret = llama_encode_internal(*ctx, batch);
 
     if (ret != 0) {
         LLAMA_LOG_ERROR("%s: failed to encode, ret = %d\n", __func__, ret);
     }
@@ -21196,8 +21196,7 @@ int32_t llama_encode(
 int32_t llama_decode(
         struct llama_context * ctx,
           struct llama_batch   batch) {
-    llama_batch_allocr batch_allocr(ctx, batch);
-    const int ret = llama_decode_internal(*ctx, batch_allocr.batch);
+    const int ret = llama_decode_internal(*ctx, batch);
 
     if (ret != 0) {
         LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
     }

From 540c3016d8808c40684ecaa5caf594a1dcceda6b Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Tue, 22 Oct 2024 13:10:40 +0200
Subject: [PATCH 3/5] fix build

---
 src/llama.cpp | 102 +++++++++++++++++++++++++++----------------------------
 1 file changed, 52 insertions(+), 50 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 4d424bfaf19a6..3c6d9c363dfe8 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -5190,6 +5190,56 @@ struct llama_model_loader {
     }
 };
 
+// temporary allocate memory for the input batch if needed
+static const llama_seq_id batch_default_seq_id = 0;
+struct llama_batch_allocr {
+    std::array<llama_seq_id, 1> seq_id_0 = {batch_default_seq_id};
+    std::vector<llama_pos>      pos;
+    std::vector<int32_t>        n_seq_id;
+    std::vector<llama_seq_id *> seq_id;
+    std::vector<int8_t>         logits;
+    struct llama_batch batch;
+    // optionally fulfill the batch returned by llama_batch_get_one
+    llama_batch_allocr(llama_context & ctx, struct llama_batch in_batch) {
+        batch = in_batch;
+        if (!batch.pos) {
+            // determine the last position in KV cache
+            llama_pos last_pos = -1;
+            for (const auto & cell : ctx.kv_self.cells) {
+                if (cell.has_seq_id(batch_default_seq_id)) {
+                    last_pos = std::max(last_pos, cell.pos);
+                }
+            }
+            last_pos++; // next position
+            pos.resize(batch.n_tokens);
+            for (int32_t i = 0; i < batch.n_tokens; i++) {
+                pos[i] = i+last_pos;
+            }
+            batch.pos = pos.data();
+        }
+        if (!batch.n_seq_id) {
+            n_seq_id.resize(batch.n_tokens);
+            for (int32_t i = 0; i < batch.n_tokens; i++) {
+                n_seq_id[i] = seq_id_0.size();
+            }
+            batch.n_seq_id = n_seq_id.data();
+        }
+        if (!batch.seq_id) {
+            seq_id.resize(batch.n_tokens + 1);
+            seq_id[batch.n_tokens] = NULL;
+            for (int32_t i = 0; i < batch.n_tokens; i++) {
+                seq_id[i] = seq_id_0.data();
+            }
+            batch.seq_id = seq_id.data();
+        }
+        if (!batch.logits) {
+            logits.resize(batch.n_tokens);
+            logits[logits.size() - 1] = true;
+            batch.logits = logits.data();
+        }
+    }
+};
+
 template<>
 bool llama_model_loader::get_key(const enum llm_kv kid, enum llama_pooling_type & result, const bool required) {
     uint32_t tmp;
@@ -17117,6 +17167,7 @@ static int llama_decode_internal(
         return -1;
     }
 
+    // temporary allocate memory for the input batch if needed
     llama_batch_allocr batch_allocr(lctx, inp_batch);
     llama_batch batch = batch_allocr.batch;
     const uint32_t n_tokens_all = batch.n_tokens;
@@ -17434,6 +17485,7 @@ static int llama_encode_internal(
         return -1;
     }
 
+    // temporary allocate memory for the input batch if needed
     llama_batch_allocr batch_allocr(lctx, inp_batch);
     llama_batch batch = batch_allocr.batch;
     const uint32_t n_tokens = batch.n_tokens;
@@ -21132,56 +21184,6 @@ void llama_batch_free(struct llama_batch batch) {
     if (batch.logits)   free(batch.logits);
 }
 
-// temporary allocate memory for the input batch if needed
-static const llama_seq_id batch_default_seq_id = 0;
-struct llama_batch_allocr {
-    std::array<llama_seq_id, 1> seq_id_0 = {batch_default_seq_id};
-    std::vector<llama_pos>      pos;
-    std::vector<int32_t>        n_seq_id;
-    std::vector<llama_seq_id *> seq_id;
-    std::vector<int8_t>         logits;
-    struct llama_batch batch;
-    // optionally fulfill the batch returned by llama_batch_get_one
-    llama_batch_allocr(llama_context & ctx, struct llama_batch in_batch) {
-        batch = in_batch;
-        if (!batch.pos) {
-            // determine the last position in KV cache
-            llama_pos last_pos = -1;
-            for (const auto & cell : ctx.kv_self.cells) {
-                if (cell.has_seq_id(batch_default_seq_id)) {
-                    last_pos = std::max(last_pos, cell.pos);
-                }
-            }
-            last_pos++; // next position
-            pos.resize(batch.n_tokens);
-            for (int32_t i = 0; i < batch.n_tokens; i++) {
-                pos[i] = i+last_pos;
-            }
-            batch.pos = pos.data();
-        }
-        if (!batch.n_seq_id) {
-            n_seq_id.resize(batch.n_tokens);
-            for (int32_t i = 0; i < batch.n_tokens; i++) {
-                n_seq_id[i] = seq_id_0.size();
-            }
-            batch.n_seq_id = n_seq_id.data();
-        }
-        if (!batch.seq_id) {
-            seq_id.resize(batch.n_tokens + 1);
-            seq_id[batch.n_tokens] = NULL;
-            for (int32_t i = 0; i < batch.n_tokens; i++) {
-                seq_id[i] = seq_id_0.data();
-            }
-            batch.seq_id = seq_id.data();
-        }
-        if (!batch.logits) {
-            logits.resize(batch.n_tokens);
-            logits[logits.size() - 1] = true;
-            batch.logits = logits.data();
-        }
-    }
-};
-
 int32_t llama_encode(
         struct llama_context * ctx,
           struct llama_batch   batch) {

From 11b1564efb0690ff9df764e6b8dba700f5ec6f27 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Tue, 22 Oct 2024 13:14:38 +0200
Subject: [PATCH 4/5] add GGML_ASSERT

---
 src/llama.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/llama.cpp b/src/llama.cpp
index 3c6d9c363dfe8..669c5f28e2446 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -5202,6 +5202,7 @@ struct llama_batch_allocr {
     // optionally fulfill the batch returned by llama_batch_get_one
     llama_batch_allocr(llama_context & ctx, struct llama_batch in_batch) {
         batch = in_batch;
+        GGML_ASSERT(batch.n_tokens > 0);
         if (!batch.pos) {
             // determine the last position in KV cache
             llama_pos last_pos = -1;

From f7320036227b870e993d734e47762ac0cdb4dbbb Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Tue, 22 Oct 2024 15:23:42 +0200
Subject: [PATCH 5/5] Apply suggestions from code review

Co-authored-by: Georgi Gerganov
---
 src/llama.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 669c5f28e2446..8f21c39d123c9 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -17170,7 +17170,7 @@ static int llama_decode_internal(
 
     // temporary allocate memory for the input batch if needed
     llama_batch_allocr batch_allocr(lctx, inp_batch);
-    llama_batch batch = batch_allocr.batch;
+    const llama_batch & batch = batch_allocr.batch;
     const uint32_t n_tokens_all = batch.n_tokens;
 
     const auto & model   = lctx.model;
@@ -17488,7 +17488,7 @@ static int llama_encode_internal(
 
     // temporary allocate memory for the input batch if needed
     llama_batch_allocr batch_allocr(lctx, inp_batch);
-    llama_batch batch = batch_allocr.batch;
+    const llama_batch & batch = batch_allocr.batch;
     const uint32_t n_tokens = batch.n_tokens;
 
     const auto & model   = lctx.model;
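For context: llama_batch_allocr lets the partial batch returned by llama_batch_get_one (which sets only token and n_tokens) be passed directly to llama_decode/llama_encode; positions continue from the KV cache and a single default sequence id is assumed. A minimal caller-side sketch, illustrative only (the helper decode_prompt is hypothetical, and the two-argument llama_batch_get_one signature from the preceding refactor is assumed):

// Illustrative sketch, not part of the patch series. The internal
// llama_batch_allocr fills pos, n_seq_id, seq_id and logits for us,
// and an empty batch now returns -1 instead of crashing the allocator.
#include "llama.h"

#include <cstdio>
#include <vector>

static int decode_prompt(llama_context * ctx, std::vector<llama_token> & tokens) {
    // guard mirrors the new GGML_ASSERT(batch.n_tokens > 0) in the allocator
    if (tokens.empty()) {
        return -1;
    }
    llama_batch batch = llama_batch_get_one(tokens.data(), (int32_t) tokens.size());
    const int ret = llama_decode(ctx, batch);
    if (ret != 0) {
        fprintf(stderr, "llama_decode failed, ret = %d\n", ret);
    }
    return ret;
}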