diff --git a/include/llama.h b/include/llama.h
index a0a660bff88da..bf729471847a9 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -218,6 +218,7 @@ extern "C" {
     //   - token  : the token ids of the input (used when embd is NULL)
     //   - embd   : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
     //   - pos    : the positions of the respective token in the sequence
+    //              (for M-RoPE, the first `n_tokens` positions are linearly increasing, followed by `n_pos_per_embd * n_tokens` positions used for RoPE)
     //              (if set to NULL, the token position will be tracked automatically by llama_encode/llama_decode)
     //   - seq_id : the sequence to which the respective token belongs
     //              (if set to NULL, the sequence ID will be assumed to be 0)
diff --git a/src/llama-batch.cpp b/src/llama-batch.cpp
index 55d89eca0ad94..c2aab4c91625c 100644
--- a/src/llama-batch.cpp
+++ b/src/llama-batch.cpp
@@ -259,23 +259,7 @@ bool llama_batch_allocr::init(
             const llama_pos p0 = memory ? memory->seq_pos_max(s) : -1;
 
             if (p0 >= 0) {
-                bool ok = true;
-
-                if (batch.token) {
-                    if (seq_pos_min(s) != p0 + 1) {
-                        ok = false;
-                    }
-                } else {
-                    assert(batch.embd);
-
-                    // for embeddings (typically used as vision input), we allow them to have repeating positions
-                    // ref: https://github.com/ggml-org/llama.cpp/issues/13694#issuecomment-2983871762
-                    if (seq_pos_min(s) != p0 && seq_pos_min(s) != p0 + 1) {
-                        ok = false;
-                    }
-                }
-
-                if (!ok) {
+                if (seq_pos_min(s) != p0 + 1) {
                     LLAMA_LOG_ERROR(
                             "%s: the tokens of sequence %d in the input batch have inconsistent sequence positions:\n"
                             " - the last position stored in the memory module of the context (i.e. the KV cache) for sequence %d is X = %d\n"
@@ -655,7 +639,10 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
 
     auto udata = std::make_shared<llama_ubatch::data_t>();
 
-    const int32_t n_pos_cur = batch.embd ? n_pos_per_embd : 1;
+    const int32_t n_pos_per_embd_inp = n_pos_per_embd > 1
+        ? (n_pos_per_embd + 1) // include the extra linearly increasing positions for M-RoPE
+        : 1;                   // standard RoPE
+    const int32_t n_pos_cur = batch.embd ? n_pos_per_embd_inp : 1;
 
     const int64_t n_embd_all = batch.embd ? (int64_t) n_tokens*n_embd : 0;
     const int64_t n_pos_all  =              (int64_t) n_tokens*n_pos_cur;
@@ -681,7 +668,7 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
         }
 
         for (int j = 0; j < n_pos_cur; ++j) {
-            udata->pos[j*n_tokens + i] = batch.pos[j*batch.n_tokens + idxs[i]];
+            udata->pos[j * n_tokens + i] = batch.pos[j * batch.n_tokens + idxs[i]];
         }
 
         udata->n_seq_id[i] = batch.n_seq_id[idxs[i]];
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index 112d195f2911e..e0b6d6f9f58aa 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -54,7 +54,10 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
         }
         ggml_backend_tensor_set(pos, pos_data.data(), 0, pos_data.size()*ggml_element_size(pos));
     } else {
-        ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_embd*ggml_element_size(pos));
+        const bool has_mrope = ubatch->embd && n_pos_per_embd > 1;
+        ggml_backend_tensor_set(pos,
+            ubatch->pos + (has_mrope ? n_tokens : 0), // skip the first n_tokens positions for M-RoPE
+            0, n_tokens*n_pos_per_embd*ggml_element_size(pos));
     }
 }
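For reference, a minimal sketch (not part of this patch) of the new `llama_batch.pos` layout for an embedding batch under M-RoPE, as described by the updated comment in llama.h: the first `n_tokens` entries are plain linearly increasing positions, followed by `n_pos_per_embd` sections of `n_tokens` entries each. The helper name `make_mrope_pos` is illustrative, and `n_pos_per_embd == 4` (temporal/height/width/unused) is assumed, mirroring `decode_embd_batch::set_position_mrope_2d` further down in this patch:

#include "llama.h"
#include <vector>

// Illustrative sketch: build the pos array for an nx*ny image embedding batch.
// The buffer holds (1 + n_pos_per_embd) * n_tokens entries.
static std::vector<llama_pos> make_mrope_pos(int nx, int ny, llama_pos pos_0) {
    const int n_pos_per_embd = 4; // temporal, height, width, unused (Qwen2-VL-style M-RoPE)
    const int n_tokens       = nx * ny;
    std::vector<llama_pos> pos((size_t) (1 + n_pos_per_embd) * n_tokens, 0);
    for (int i = 0; i < n_tokens; i++) {
        pos[i] = pos_0 + i;                    // linearly increasing, used for position tracking
    }
    for (int y = 0; y < ny; y++) {
        for (int x = 0; x < nx; x++) {
            const int i = y * nx + x;
            pos[i + n_tokens    ] = pos_0;     // temporal section
            pos[i + n_tokens * 2] = pos_0 + y; // height section
            pos[i + n_tokens * 3] = pos_0 + x; // width section
            pos[i + n_tokens * 4] = 0;         // last section is unused
        }
    }
    return pos;
}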
diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp
index fd1fb6581b163..43ab2f3c94244 100644
--- a/tools/mtmd/mtmd-cli.cpp
+++ b/tools/mtmd/mtmd-cli.cpp
@@ -191,7 +191,8 @@ static int generate_response(mtmd_cli_context & ctx, int n_predict) {
 
         // eval the token
         common_batch_clear(ctx.batch);
-        common_batch_add(ctx.batch, token_id, ctx.n_past++, {0}, true);
+        int max_pos = llama_memory_seq_pos_max(llama_get_memory(ctx.lctx), 0);
+        common_batch_add(ctx.batch, token_id, max_pos+1, {0}, true);
         if (llama_decode(ctx.lctx, ctx.batch)) {
             LOG_ERR("failed to decode token\n");
             return 1;
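The mtmd-cli change above replaces the running `ctx.n_past++` counter with a query of the memory module, because with the new M-RoPE accounting the last position stored in the KV cache after an image chunk can be larger than the number of tokens decoded so far. A minimal sketch of that pattern (the helper name `next_pos` is illustrative):

#include "llama.h"

// Illustrative sketch: derive the next decode position from the KV cache rather than
// from a running counter; an empty sequence is expected to report -1, which yields 0.
static llama_pos next_pos(llama_context * lctx, llama_seq_id seq_id) {
    const llama_pos max_pos = llama_memory_seq_pos_max(llama_get_memory(lctx), seq_id);
    return max_pos + 1;
}

// usage, mirroring generate_response():
//   common_batch_clear(ctx.batch);
//   common_batch_add(ctx.batch, token_id, next_pos(ctx.lctx, 0), {0}, true);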
diff --git a/tools/mtmd/mtmd-helper.cpp b/tools/mtmd/mtmd-helper.cpp
index 686f42f3960fe..caf7ca6c336ec 100644
--- a/tools/mtmd/mtmd-helper.cpp
+++ b/tools/mtmd/mtmd-helper.cpp
@@ -58,7 +58,8 @@ llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks) {
 struct decode_embd_batch {
     int n_pos_per_embd;
     int n_mmproj_embd;
-    std::vector<llama_pos>      pos;
+    std::vector<llama_pos>      pos; // for M-RoPE, this will have (1+n_pos_per_embd)*n_tokens elements
+                                     // the extra n_tokens are for linearly increasing positions
     std::vector<llama_pos>      pos_view; // used by mrope
     std::vector<int32_t>        n_seq_id;
     std::vector<llama_seq_id>   seq_id_0;
@@ -66,7 +67,7 @@ struct decode_embd_batch {
     std::vector<int8_t>         logits;
     llama_batch batch;
     decode_embd_batch(float * embd, int32_t n_tokens, int n_pos_per_embd, int n_mmproj_embd) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
-        pos     .resize(n_tokens * n_pos_per_embd);
+        pos     .resize(n_tokens * (n_pos_per_embd + 1));
         n_seq_id.resize(n_tokens);
         seq_ids .resize(n_tokens + 1);
         logits  .resize(n_tokens);
@@ -100,13 +101,14 @@ struct decode_embd_batch {
         for (int y = 0; y < ny; y++) {
             for (int x = 0; x < nx; x++) {
                 int i = y * nx + x;
-                pos[i                     ] = pos_0;
-                pos[i + batch.n_tokens    ] = pos_0 + y;
-                pos[i + batch.n_tokens * 2] = pos_0 + x;
-                pos[i + batch.n_tokens * 3] = 0; // last pos dim is unused
+                pos[i + batch.n_tokens    ] = pos_0;
+                pos[i + batch.n_tokens * 2] = pos_0 + y;
+                pos[i + batch.n_tokens * 3] = pos_0 + x;
+                pos[i + batch.n_tokens * 4] = 0; // last pos dim is unused
             }
         }
         for (int i = 0; i < batch.n_tokens; i++) {
+            batch.pos     [i] = pos_0 + i;
             batch.n_seq_id[i] = 1;
             batch.seq_id  [i] = seq_id_0.data();
             batch.logits  [i] = false;
@@ -118,12 +120,13 @@ struct decode_embd_batch {
         GGML_ASSERT(n_pos_per_embd == 4);
         seq_id_0[0] = seq_id;
         for (int i = 0; i < batch.n_tokens; i++) {
-            pos[i                     ] = pos_0 + i;
             pos[i + batch.n_tokens    ] = pos_0 + i;
             pos[i + batch.n_tokens * 2] = pos_0 + i;
-            pos[i + batch.n_tokens * 3] = 0; // last pos dim is unused
+            pos[i + batch.n_tokens * 3] = pos_0 + i;
+            pos[i + batch.n_tokens * 4] = 0; // last pos dim is unused
         }
         for (int i = 0; i < batch.n_tokens; i++) {
+            batch.pos     [i] = pos_0 + i;
             batch.n_seq_id[i] = 1;
             batch.seq_id  [i] = seq_id_0.data();
             batch.logits  [i] = false;
@@ -133,12 +136,12 @@ struct decode_embd_batch {
     llama_batch get_view(int offset, int n_tokens) {
         llama_pos * pos_ptr;
         pos_view.clear();
-        pos_view.reserve(n_tokens * n_pos_per_embd);
+        pos_view.reserve(n_tokens * (n_pos_per_embd + 1));
         if (n_pos_per_embd > 1) {
             // mrope
             // for example, with layout of src: 1234...1234...1234...1234...
             // offset 2 will give us dst:       34...34...34...34...
-            for (int i = 0; i < n_pos_per_embd; i++) {
+            for (int i = 0; i <= n_pos_per_embd; i++) {
                 // assume n_tokens is less than or equal to batch.n_tokens
                 // batch.n_tokens is number of **total** tokens
                 // n_tokens is number of viewed token
@@ -164,6 +167,59 @@ struct decode_embd_batch {
     }
 };
 
+// helper struct to make working with a text batch easier
+struct decode_text_batch {
+    std::vector<llama_token>    tokens;
+    std::vector<int32_t>        n_seq_id;
+    std::vector<llama_seq_id>   seq_id_0;
+    std::vector<llama_seq_id *> seq_ids;
+    std::vector<int8_t>         logits;
+    llama_seq_id seq_id;
+    llama_batch batch;
+    decode_text_batch(int32_t n_tokens, llama_seq_id seq_id) : seq_id(seq_id) {
+        tokens  .resize(n_tokens);
+        n_seq_id.resize(n_tokens);
+        seq_ids .resize(n_tokens + 1);
+        logits  .resize(n_tokens);
+        seq_ids[n_tokens] = nullptr;
+        for (int32_t i = 0; i < n_tokens; i++) {
+            n_seq_id[i] = 1;
+            seq_ids [i] = &this->seq_id;
+        }
+        batch = {
+            /*n_tokens =*/ n_tokens,
+            /*tokens   =*/ tokens.data(),
+            /*embd     =*/ nullptr,
+            /*pos      =*/ nullptr, // position is tracked automatically
+            /*n_seq_id =*/ n_seq_id.data(),
+            /*seq_id   =*/ seq_ids.data(),
+            /*logits   =*/ logits.data(),
+        };
+    }
+
+    void clear() {
+        batch.n_tokens = 0;
+    }
+
+    bool is_full() const {
+        return batch.n_tokens >= (int32_t) tokens.size();
+    }
+
+    void add_token(llama_token tok, bool output) {
+        GGML_ASSERT(!is_full());
+        int32_t j = batch.n_tokens;
+        batch.token [j] = tok;
+        batch.logits[j] = output;
+        batch.n_tokens++;
+    }
+
+    void set_logits_last() {
+        if (batch.n_tokens > 0) {
+            batch.logits[batch.n_tokens - 1] = true;
+        }
+    }
+};
+
 // Helper function for decoding an image whose embeddings have already been calculated
 int32_t mtmd_helper_decode_image_chunk(
         mtmd_context * ctx,
@@ -252,7 +308,7 @@ int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
         bool logits_last,
         llama_pos * new_n_past) {
     int32_t ret;
-    llama_batch text_batch = llama_batch_init(n_batch, 0, 1);
+    decode_text_batch text_batch(n_batch, seq_id);
     auto chunk_type = mtmd_input_chunk_get_type(chunk);
 
     if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
@@ -261,28 +317,20 @@ int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
         // LOG_INF("decoding text chunk, n_tokens = %zu\n", n_tokens);
         size_t i = 0;
         while (i < n_tokens) { // split into batches
-            text_batch.n_tokens = 0; // clear the batch
-            for (; i < n_tokens && text_batch.n_tokens < n_batch; i++) {
-                int32_t j = text_batch.n_tokens;
-                text_batch.token   [j]    = tokens[i];
-                text_batch.pos     [j]    = n_past++;
-                text_batch.n_seq_id[j]    = 1;
-                text_batch.seq_id  [j][0] = seq_id;
-                text_batch.logits  [j]    = false;
-
-                text_batch.n_tokens++;
+            text_batch.clear();
+            for (; i < n_tokens && !text_batch.is_full(); i++) {
+                text_batch.add_token(tokens[i], false);
             }
             bool is_last_token = (i == n_tokens);
             if (logits_last && is_last_token) {
-                text_batch.logits[text_batch.n_tokens - 1] = true;
+                text_batch.set_logits_last();
             }
-            ret = llama_decode(lctx, text_batch);
+            ret = llama_decode(lctx, text_batch.batch);
             if (ret != 0) {
                 LOG_ERR("failed to decode text\n");
-                llama_batch_free(text_batch);
                 return ret;
             }
-            *new_n_past += text_batch.n_tokens;
+            *new_n_past += text_batch.batch.n_tokens;
         }
 
     } else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE || chunk_type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
@@ -294,7 +342,6 @@ int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
         ret = mtmd_encode_chunk(ctx, chunk);
         if (ret != 0) {
             LOG_ERR("failed to encode %s slice\n", name);
-            llama_batch_free(text_batch);
             return ret;
         }
 
@@ -304,14 +351,12 @@ int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
         ret = mtmd_helper_decode_image_chunk(ctx, lctx, chunk, embd, n_past, seq_id, n_batch, new_n_past);
         if (ret != 0) {
             LOG_ERR("failed to decode %s\n", name);
-            llama_batch_free(text_batch);
             return ret;
         }
     } else {
         GGML_ABORT("chunk type not supported");
     }
 
-    llama_batch_free(text_batch);
     return 0;
 }
 
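A minimal standalone sketch (illustrative names, plain STL) of the slicing that `decode_embd_batch::get_view()` performs after this change: the source buffer now holds `1 + n_pos_per_embd` contiguous sections of `n_total` positions each, the extra leading section being the linear positions, and the view keeps entries `[offset, offset + n_view)` from every section, hence the loop bound changing from `<` to `<=`:

#include <cstdint>
#include <vector>

typedef int32_t llama_pos; // matches the typedef in llama.h

// e.g. with src laid out as 1234...1234...1234...1234...1234... and offset = 2,
// the result is 34...34...34...34...34...
static std::vector<llama_pos> slice_pos_sections(const std::vector<llama_pos> & src,
                                                 int n_total, int n_pos_per_embd,
                                                 int offset, int n_view) {
    std::vector<llama_pos> dst;
    dst.reserve((size_t) n_view * (n_pos_per_embd + 1));
    for (int i = 0; i <= n_pos_per_embd; i++) { // <= : include the extra linear section
        const size_t src_idx = (size_t) i * n_total + offset;
        dst.insert(dst.end(), src.begin() + src_idx, src.begin() + src_idx + n_view);
    }
    return dst;
}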
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index 3b901bfac8215..1a48f903efac9 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -5,6 +5,15 @@
 
 #include "llama.h"
 
+// fix problem with std::min and std::max
+#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+#   define NOMINMAX
+#endif
+#include <windows.h>
+#endif
+
 #include <algorithm>
 #include <cerrno>
 #include <cstdio>
@@ -1031,7 +1040,9 @@ const char * mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens) {
 
 llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens) {
     if (image_tokens->use_mrope_pos) {
-        return 1; // for M-RoPE, the whole image is 1 in temporal dimension
+        // for M-RoPE, temporal dimension = max(t,h,w)
+        // t is omitted as we don't support video input
+        return std::max(image_tokens->nx, image_tokens->ny);
     }
     return image_tokens->n_tokens();
 }
diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h
index f4ea07d3ad521..521d1dfe5a384 100644
--- a/tools/mtmd/mtmd.h
+++ b/tools/mtmd/mtmd.h
@@ -171,7 +171,7 @@ MTMD_API size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * i
 MTMD_API size_t       mtmd_image_tokens_get_nx    (const mtmd_image_tokens * image_tokens);
 MTMD_API size_t       mtmd_image_tokens_get_ny    (const mtmd_image_tokens * image_tokens);
 MTMD_API const char * mtmd_image_tokens_get_id    (const mtmd_image_tokens * image_tokens); // TODO: deprecate
-// number of temporal positions (always 1 for M-RoPE, n_tokens otherwise)
+// number of temporal positions (equal to max(t,h,w) for M-RoPE; equal to n_tokens otherwise)
 MTMD_API llama_pos    mtmd_image_tokens_get_n_pos (const mtmd_image_tokens * image_tokens); // TODO: deprecate
 
 // tokenize an input text prompt and a list of bitmaps (images/audio)
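Finally, a small sketch of how the changed return value propagates to callers: `mtmd_image_tokens_get_n_pos()` (and `mtmd_helper_get_n_pos()`, which is built on top of it) now reports `max(nx, ny)` temporal positions for an M-RoPE image instead of 1, so code that advances `n_past` per chunk keeps working unchanged. The helper name `advance_past_image` is illustrative only:

#include "mtmd.h"

// Illustrative sketch: advance n_past across an image chunk; under M-RoPE this now
// adds max(nx, ny) instead of 1, and n_tokens for non-M-RoPE models as before.
static llama_pos advance_past_image(llama_pos n_past, const mtmd_image_tokens * image_tokens) {
    return n_past + mtmd_image_tokens_get_n_pos(image_tokens);
}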