Qwen vl bounding box and overall vision fix #16745
Changes from all commits: b428300, 5f91c45, f05fdf9, 4d51baa
@@ -224,6 +224,7 @@ bool llama_batch_allocr::init(
         /*.seq_idx               =*/ this->seq_idx.data(),
         /*.output                =*/ batch.logits,
         /*.data                  =*/ {},
+        /*.kv_position_of_token  =*/ {},
     };

     ubatch_print(ubatch, debug);

@@ -256,36 +257,38 @@ bool llama_batch_allocr::init(
             continue;
         }

-        const llama_pos p0 = memory ? memory->seq_pos_max(s) : -1;
-
-        if (p0 >= 0) {
-            bool ok = true;
-
-            if (batch.token) {
-                if (seq_pos_min(s) != p0 + 1) {
-                    ok = false;
-                }
-            } else {
-                assert(batch.embd);
-
-                // for embeddings (typically used as vision input), we allow them to have repeating positions
-                // ref: https://github.com/ggml-org/llama.cpp/issues/13694#issuecomment-2983871762
-                if (seq_pos_min(s) != p0 && seq_pos_min(s) != p0 + 1) {
-                    ok = false;
-                }
-            }
-
-            if (!ok) {
-                LLAMA_LOG_ERROR(
-                    "%s: the tokens of sequence %d in the input batch have inconsistent sequence positions:\n"
-                    " - the last position stored in the memory module of the context (i.e. the KV cache) for sequence %d is X = %d\n"
-                    " - the tokens for sequence %d in the input batch have a starting position of Y = %d\n"
-                    " it is required that the sequence positions remain consecutive: Y = X + 1\n",
-                    __func__, s, s, p0, s, seq_pos_min(s));
-
-                return false;
-            }
-        }
+        //@fmayran: these checks don't make sense with models using position encoding such as Qwen VL, because the position stored in the KV cache can jump around (it is not even always increasing).
+        //it is not enough to let them be repeating. Within an image embedding, arbitrary jumps are expected.
+        //const llama_pos p0 = memory ? memory->seq_pos_max(s) : -1;
+        //
+        //if (p0 >= 0) {
+        //    bool ok = true;
+        //
+        //    if (batch.token) {
+        //        if (seq_pos_min(s) != p0 + 1) {
+        //            ok = false;
+        //        }
+        //    } else {
+        //        assert(batch.embd);
+        //
+        //        // for embeddings (typically used as vision input), we allow them to have repeating positions
+        //        // ref: https://github.com/ggml-org/llama.cpp/issues/13694#issuecomment-2983871762
+        //        if (seq_pos_min(s) != p0 && seq_pos_min(s) != p0 + 1) {
+        //            ok = false;
+        //        }
+        //    }
+        //
+        //    if (!ok) {
+        //        LLAMA_LOG_ERROR(
+        //            "%s: the tokens of sequence %d in the input batch have inconsistent sequence positions:\n"
+        //            " - the last position stored in the memory module of the context (i.e. the KV cache) for sequence %d is X = %d\n"
+        //            " - the tokens for sequence %d in the input batch have a starting position of Y = %d\n"
+        //            " it is required that the sequence positions remain consecutive: Y = X + 1\n",
+        //            __func__, s, s, p0, s, seq_pos_min(s));
+        //
+        //        return false;
+        //    }
+        //}

         if (seq_pos_max(s) - seq_pos_min(s) + 1 > (int) seq_pos[s].size()) {
             LLAMA_LOG_ERROR("%s: sequence %d positions are not continuous\n", __func__, s);

@@ -369,36 +372,38 @@ llama_ubatch llama_batch_allocr::ubatch_reserve(uint32_t n_seq_tokens, uint32_t

     auto udata = std::make_shared<llama_ubatch::data_t>();

-    udata->token     .resize(n_tokens);
-    udata->embd      .clear();
-    udata->pos       .resize(n_tokens);
-    udata->n_seq_id  .resize(n_tokens);
-    udata->seq_id    .resize(n_tokens);
-    udata->seq_id_unq.resize(0);
-    udata->seq_idx   .resize(LLAMA_MAX_SEQ, -1);
-    udata->output    .resize(n_tokens);
+    udata->token               .resize(n_tokens);
+    udata->embd                .clear();
+    udata->pos                 .resize(n_tokens);
+    udata->n_seq_id            .resize(n_tokens);
+    udata->seq_id              .resize(n_tokens);
+    udata->seq_id_unq          .resize(0);
+    udata->seq_idx             .resize(LLAMA_MAX_SEQ, -1);
+    udata->output              .resize(n_tokens);
+    udata->kv_position_of_token.resize(n_tokens, -1);

     for (uint32_t s = 0; s < n_seqs; ++s) {
         udata->seq_idx[s] = s;
         udata->seq_id_unq.push_back(s);
     }

     llama_ubatch res {
-        /*.b_equal_seqs =*/ true,
-        /*.n_tokens     =*/ n_tokens,
-        /*.n_seq_tokens =*/ n_seq_tokens,
-        /*.n_seqs       =*/ n_seqs,
-        /*.n_seqs_unq   =*/ n_seqs,
-
-        /*.token      =*/ udata->token.data(),
-        /*.embd       =*/ nullptr,
-        /*.pos        =*/ udata->pos.data(),
-        /*.n_seq_id   =*/ udata->n_seq_id.data(),
-        /*.seq_id     =*/ udata->seq_id.data(),
-        /*.seq_id_unq =*/ udata->seq_id_unq.data(),
-        /*.seq_idx    =*/ udata->seq_idx.data(),
-        /*.output     =*/ udata->output.data(),
-        /*.data       =*/ std::move(udata),
+        /*.b_equal_seqs =*/ true,
+        /*.n_tokens     =*/ n_tokens,
+        /*.n_seq_tokens =*/ n_seq_tokens,
+        /*.n_seqs       =*/ n_seqs,
+        /*.n_seqs_unq   =*/ n_seqs,
+
+        /*.token                =*/ udata->token.data(),
+        /*.embd                 =*/ nullptr,
+        /*.pos                  =*/ udata->pos.data(),
+        /*.n_seq_id             =*/ udata->n_seq_id.data(),
+        /*.seq_id               =*/ udata->seq_id.data(),
+        /*.seq_id_unq           =*/ udata->seq_id_unq.data(),
+        /*.seq_idx              =*/ udata->seq_idx.data(),
+        /*.output               =*/ udata->output.data(),
+        /*.kv_position_of_token =*/ udata->kv_position_of_token.data(),
+        /*.data                 =*/ std::move(udata),
     };

     return res;

@@ -660,14 +665,15 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
     const int64_t n_embd_all = batch.embd ? (int64_t) n_tokens*n_embd : 0;
     const int64_t n_pos_all  = (int64_t) n_tokens*n_pos_cur;

-    udata->token     .resize(n_tokens);
-    udata->embd      .resize(n_embd_all);
-    udata->pos       .resize(n_pos_all);
-    udata->n_seq_id  .resize(n_tokens);
-    udata->seq_id    .resize(n_tokens);
-    udata->seq_id_unq.resize(0);
-    udata->seq_idx   .resize(LLAMA_MAX_SEQ, -1);
-    udata->output    .resize(n_tokens);
+    udata->token               .resize(n_tokens);
+    udata->embd                .resize(n_embd_all);
+    udata->pos                 .resize(n_pos_all);
+    udata->n_seq_id            .resize(n_tokens);
+    udata->seq_id              .resize(n_tokens);
+    udata->seq_id_unq          .resize(0);
+    udata->seq_idx             .resize(LLAMA_MAX_SEQ, -1);
+    udata->output              .resize(n_tokens);
+    udata->kv_position_of_token.resize(n_tokens, -1);

     seq_set_t seq_set_unq;

@@ -705,21 +711,23 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
     }

     llama_ubatch res {
-        /*.b_equal_seqs =*/ equal_seqs,
-        /*.n_tokens     =*/ n_tokens,
-        /*.n_seq_tokens =*/ n_tokens/n_seqs,
-        /*.n_seqs       =*/ n_seqs,
-        /*.n_seqs_unq   =*/ (uint32_t) udata->seq_id_unq.size(),
-
-        /*.token      =*/ batch.token ? udata->token.data() : nullptr,
-        /*.embd       =*/ batch.embd ? udata->embd.data() : nullptr,
-        /*.pos        =*/ udata->pos.data(),
-        /*.n_seq_id   =*/ udata->n_seq_id.data(),
-        /*.seq_id     =*/ udata->seq_id.data(),
-        /*.seq_id_unq =*/ udata->seq_id_unq.data(),
-        /*.seq_idx    =*/ udata->seq_idx.data(),
-        /*.output     =*/ udata->output.data(),
-        /*.data       =*/ std::move(udata),
+        /*.b_equal_seqs =*/ equal_seqs,
+        /*.n_tokens     =*/ n_tokens,
+        /*.n_seq_tokens =*/ n_tokens/n_seqs,
+        /*.n_seqs       =*/ n_seqs,
+        /*.n_seqs_unq   =*/ (uint32_t) udata->seq_id_unq.size(),
+
+        /*.token                =*/ batch.token ? udata->token.data() : nullptr,
+        /*.embd                 =*/ batch.embd ? udata->embd.data() : nullptr,
+        /*.pos                  =*/ udata->pos.data(),
+        /*.n_seq_id             =*/ udata->n_seq_id.data(),
+        /*.seq_id               =*/ udata->seq_id.data(),
+        /*.seq_id_unq           =*/ udata->seq_id_unq.data(),
+        /*.seq_idx              =*/ udata->seq_idx.data(),
+        /*.output               =*/ udata->output.data(),
+        /*.kv_position_of_token =*/ udata->kv_position_of_token.data(),
+        /*.data                 =*/ std::move(udata),
Review comment: Nitpick: reorder so that kv_position_of_token is last.

Reply: I am not sure about what you mean by that. It can't be after .data because it should be treated like other members of the update struct.
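To illustrate the ordering constraint behind this exchange, here is a minimal standalone sketch (hypothetical toy types, not the actual llama_ubatch definition): positional aggregate initialization must follow the declaration order of the struct members, so a pointer member declared before the owning data member also has to be initialized before it.

```cpp
// Minimal sketch (toy types): initializer order must match member declaration order.
#include <memory>
#include <vector>

struct toy_data {
    std::vector<int> output;
    std::vector<int> kv_position_of_token;
};

struct toy_ubatch {
    int * output;                    // points into toy_data::output
    int * kv_position_of_token;      // points into toy_data::kv_position_of_token
    std::shared_ptr<toy_data> data;  // owning storage, declared (and therefore initialized) last
};

int main() {
    auto udata = std::make_shared<toy_data>();
    udata->output.resize(4);
    udata->kv_position_of_token.resize(4, -1);

    toy_ubatch res {
        /*.output               =*/ udata->output.data(),
        /*.kv_position_of_token =*/ udata->kv_position_of_token.data(),
        /*.data                 =*/ std::move(udata), // stays last because the member is declared last
    };
    return res.data ? 0 : 1;
}
```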
     };

     if (debug > 0) {

@@ -30,15 +30,16 @@ struct llama_ubatch {
     // seq_idx: indices of the unique sequence ids in the ubatch in [0, n_seqs_unq)
     //          used for extracting sequence pooled embeddings

-    // //                               size               | idx | val
-    llama_token  *  token;           // [n_tokens]         | i   | id, token
-    float        *  embd;            // [n_embd, n_tokens] | i   | embd
-    llama_pos    *  pos;             // [n_tokens]         | i   | pos
-    int32_t      *  n_seq_id;        // [n_tokens]         | i   | -
-    llama_seq_id ** seq_id;          // [n_tokens]         | s   | s0, s1, seq_id
-    llama_seq_id *  seq_id_unq;      // [n_seqs_unq]       | s   | seq_id
-    int32_t      *  seq_idx;         // [LLAMA_MAX_SEQ]    | -   | seq_idx
-    int8_t       *  output;          // [n_tokens]         | i   | -
+    // //                                        size               | idx | val
+    llama_token  *  token;                    // [n_tokens]         | i   | id, token
+    float        *  embd;                     // [n_embd, n_tokens] | i   | embd
+    llama_pos    *  pos;                      // [n_tokens]         | i   | pos
+    int32_t      *  n_seq_id;                 // [n_tokens]         | i   | -
+    llama_seq_id ** seq_id;                   // [n_tokens]         | s   | s0, s1, seq_id
+    llama_seq_id *  seq_id_unq;               // [n_seqs_unq]       | s   | seq_id
+    int32_t      *  seq_idx;                  // [LLAMA_MAX_SEQ]    | -   | seq_idx
+    int8_t       *  output;                   // [n_tokens]         | i   | -
+    int32_t      *  kv_position_of_token;     // [n_tokens]         | i   | kv position where the token was inserted

     struct data_t {
         std::vector<llama_token> token;

@@ -49,6 +50,7 @@ struct llama_ubatch {
         std::vector<llama_seq_id> seq_id_unq;
         std::vector<int32_t>      seq_idx;
         std::vector<int8_t>       output;
+        std::vector<int32_t>      kv_position_of_token; // when pushed to the kv cache, where is the token pushed (used for causal masking)
Review comment: I'd suggest a different naming for it, either …

Reply: Yes, I was extra pedantic here :-)
     };

     // the llama_ubatch pointers above point to this data if set. otherwise - points to non-owning data

@@ -895,6 +895,7 @@ void llama_kv_cache::apply_ubatch(const slot_info & sinfo, const llama_ubatch &
         }

         cells.pos_set(idx, ubatch.pos[i]);
+        ubatch.kv_position_of_token[i] = (int32_t) idx; // set the position in the kv cache as a property for this token (needed for proper causal masking)
Review comment: This does not work - we should never use the idx in the KV cache buffer for causality checks. Even if this works sometimes, in general nothing guarantees that the cache is ordered causally in memory. The simplest scenario in which this breaks is multiple sequences with a unified KV cache.
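To make the failure mode concrete, here is a small illustrative sketch (toy structures, not llama.cpp code): in a unified cache shared by several sequences, the slot a token happens to occupy says nothing about causal order, so ordering by cell index can contradict ordering by position within a sequence.

```cpp
// Toy illustration: in a unified KV cache shared by multiple sequences,
// the buffer index (cell idx) of a token does not encode causal order.
#include <cstdio>

struct toy_cell {
    int seq; // sequence id that owns the cell
    int pos; // position of the token within that sequence
};

int main() {
    // Cells are assigned wherever free slots happen to be (e.g. after another
    // sequence was partially erased), so a later token can land in a lower slot.
    const toy_cell cells[4] = {
        /* idx 0 */ {0, 5}, // seq 0, pos 5 -- stored in a low slot
        /* idx 1 */ {1, 2}, // seq 1, pos 2
        /* idx 2 */ {0, 3}, // seq 0, pos 3 -- causally earlier than idx 0, yet at a higher idx
        /* idx 3 */ {1, 7}, // seq 1, pos 7
    };

    for (int i = 0; i < 4; ++i) {
        std::printf("idx %d -> seq %d, pos %d\n", i, cells[i].seq, cells[i].pos);
    }
    // Masking by idx would treat idx 0 as "before" idx 2, but within seq 0 the
    // token stored at idx 2 (pos 3) precedes the one stored at idx 0 (pos 5).
    return 0;
}
```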
|
|
         for (int32_t s = 0; s < ubatch.n_seq_id[i]; s++) {
             cells.seq_add(idx, ubatch.seq_id[i][s]);

@@ -1215,6 +1216,12 @@ void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * u

     std::fill(data, data + ggml_nelements(dst), -INFINITY);

+    std::vector<int32_t> map_kv_to_batch(n_kv, -1); // for each token in the cache, either (-1) or the position in the current ubatch
+    for (uint32_t i = 0; i < n_tokens; ++i) // invert the batch -> kv position map into a kv -> batch position map
+    {
+        if (ubatch->kv_position_of_token[i] != -1)
+            map_kv_to_batch[ubatch->kv_position_of_token[i]] = i;
+    }
Review comment: You're filling a vector of int32_t's but iterating over uint32_t's, this is bound to trigger type conversion checks. Just iterate over int32_t.

Reply: I will check to minimize casts, thanks for the reminder.
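For reference, a minimal standalone example of the conversion being pointed out (illustrative values only, not the actual llama.cpp code): storing an unsigned loop counter into an int32_t element is an implicit sign conversion that strict conversion checks flag, while a signed counter already matches the element type.

```cpp
// Minimal sketch of the signed/unsigned mix the review comment points at.
#include <cstdint>
#include <vector>

int main() {
    const uint32_t n_tokens = 8;
    std::vector<int32_t> map_kv_to_batch(n_tokens, -1);

    // Unsigned counter: storing i into an int32_t element is a sign
    // conversion that -Wconversion style checks will flag.
    for (uint32_t i = 0; i < n_tokens; ++i) {
        map_kv_to_batch[i] = (int32_t) i; // explicit cast needed to silence the warning
    }

    // Signed counter: the stored value already matches the element type,
    // so the assignment itself needs no cast.
    for (int32_t i = 0; i < (int32_t) n_tokens; ++i) {
        map_kv_to_batch[i] = i;
    }
    return 0;
}
```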
     // Use only the previous KV cells of the correct sequence for each token of the ubatch.
     // It's assumed that if a token in the batch has multiple sequences, they are equivalent.
     // Example with a cache of 10 tokens, 2 tokens populated in cache and 3 tokens in batch:

@@ -1254,8 +1261,10 @@ void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * u
             const llama_pos p0 = cells.pos_get(j);

             // mask future tokens
-            if (causal_attn && p0 > p1) {
-                continue;
+            if (causal_attn)
+            {
+                if (map_kv_to_batch[j] != -1 && map_kv_to_batch[j] > (int32_t) i) // if the kv cache token is in the current batch AND its position in the batch is higher than i
+                    continue;
             }

             // apply SWA if any

@@ -1026,7 +1026,7 @@ const char * mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens) {

 llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens) {
     if (image_tokens->use_mrope_pos) {
-        return 1; // for M-RoPE, the whole image is 1 in temporal dimension
+        return (::std::max)(image_tokens->nx, image_tokens->ny); // assuming image, not video // for M-RoPE, the whole image is 1 in temporal dimension
Reply: Regarding (::std::max), I am being extra careful because sometimes on Windows max is a preprocessor definition and it breaks std::max. Parentheses stop that behavior. If this is prevented for sure in llama.cpp, we can remove them. And I will remove the useless :: scope.

Reply: Regarding the comment, it is not exactly "t", more like k*t with k a frame-rate-dependent constant, and I will update the comment with the actual expression. I have some needs for video encoding on my side and may try to implement it in the future.
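For context on the parentheses, a standalone sketch (not part of this PR): on Windows, <windows.h> defines a function-like max macro unless NOMINMAX is set, and that macro expansion breaks a plain std::max call; wrapping the name in parentheses prevents the expansion, which is what (std::max) relies on. A common alternative is to define NOMINMAX before including windows.h.

```cpp
// Sketch of the Windows min/max macro issue behind the (std::max) parentheses.
#include <algorithm>
#include <cstdio>

// Simulate what <windows.h> does when NOMINMAX is not defined:
#define max(a, b) (((a) > (b)) ? (a) : (b))

int main() {
    int nx = 32, ny = 48;

    // std::max(nx, ny);   // would not compile here: the preprocessor rewrites
    //                     // "max(nx, ny)" before the compiler ever sees std::max

    // Parentheses around the name prevent function-like macro expansion:
    int n = (std::max)(nx, ny);

    std::printf("%d\n", n);
    return 0;
}
```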
     }
     return image_tokens->n_tokens();
 }

Review comment: If this code block is safe to remove, better remove it altogether.