@@ -221,12 +221,11 @@ bool llama_batch_allocr::init(
             /*.n_seq_id     =*/ batch.n_seq_id,
             /*.seq_id       =*/ batch.seq_id,
             /*.seq_id_unq   =*/ this->seq_id_unq.data(),
-            /*.seq_idx      =*/ this->seq_idx.data(),
-            /*.output       =*/ batch.logits,
-            /*.data         =*/ {},
-        };
-
-        ubatch_print(ubatch, debug);
+            /*.seq_idx      =*/ this->seq_idx.data(),
+            /*.output       =*/ batch.logits,
+            /*.kv_position_of_token=*/ {},
+            /*.data         =*/ {},
+        }; ubatch_print(ubatch, debug);

         LLAMA_LOG_DEBUG("%s: seq = [\n", __func__);
         for (int s0 = 0; s0 < (int) seq_pos.size(); ++s0) {
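Note on the change pattern repeated in the hunks below: llama_ubatch is built with positional aggregate initialization, and the /*.name =*/ comments are only labels, so the new kv_position_of_token member has to be listed at its declaration position in every initializer (here with an empty {} since the batch-level debug view carries no per-token KV positions). A minimal stand-alone sketch of why the position matters, using a made-up struct rather than the real llama_ubatch:

    // illustrative stand-in, not the real llama_ubatch
    #include <cstdint>

    struct ubatch_sketch {
        uint32_t  n_tokens;
        int32_t * pos;
        int32_t * kv_position_of_token; // newly added member
        void    * data;
    };

    int main() {
        // positional aggregate init: the comments are labels only, so the new field
        // must appear in declaration order or later values shift into the wrong members
        ubatch_sketch u {
            /*.n_tokens             =*/ 0,
            /*.pos                  =*/ nullptr,
            /*.kv_position_of_token =*/ nullptr,
            /*.data                 =*/ nullptr,
        };
        return u.n_tokens == 0 ? 0 : 1;
    }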
@@ -256,36 +255,38 @@ bool llama_batch_allocr::init(
             continue;
         }

-        const llama_pos p0 = memory ? memory->seq_pos_max(s) : -1;
-
-        if (p0 >= 0) {
-            bool ok = true;
-
-            if (batch.token) {
-                if (seq_pos_min(s) != p0 + 1) {
-                    ok = false;
-                }
-            } else {
-                assert(batch.embd);
-
-                // for embeddings (typically used as vision input), we allow them to have repeating positions
-                // ref: https://github.com/ggml-org/llama.cpp/issues/13694#issuecomment-2983871762
-                if (seq_pos_min(s) != p0 && seq_pos_min(s) != p0 + 1) {
-                    ok = false;
-                }
-            }
-
-            if (!ok) {
-                LLAMA_LOG_ERROR(
-                        "%s: the tokens of sequence %d in the input batch have inconsistent sequence positions:\n"
-                        " - the last position stored in the memory module of the context (i.e. the KV cache) for sequence %d is X = %d\n"
-                        " - the tokens for sequence %d in the input batch have a starting position of Y = %d\n"
-                        " it is required that the sequence positions remain consecutive: Y = X + 1\n",
-                        __func__, s, s, p0, s, seq_pos_min(s));
-
-                return false;
-            }
-        }
+        // @fmayran: these checks don't make sense for models with a multimodal position encoding such as Qwen VL, where the position stored in the KV cache can jump around (it is not even always increasing).
+        //           allowing repeated positions is not enough: within an image embedding, arbitrary jumps are expected.
+        //const llama_pos p0 = memory ? memory->seq_pos_max(s) : -1;
+        //
+        //if (p0 >= 0) {
+        //    bool ok = true;
+        //
+        //    if (batch.token) {
+        //        if (seq_pos_min(s) != p0 + 1) {
+        //            ok = false;
+        //        }
+        //    } else {
+        //        assert(batch.embd);
+        //
+        //        // for embeddings (typically used as vision input), we allow them to have repeating positions
+        //        // ref: https://github.com/ggml-org/llama.cpp/issues/13694#issuecomment-2983871762
+        //        if (seq_pos_min(s) != p0 && seq_pos_min(s) != p0 + 1) {
+        //            ok = false;
+        //        }
+        //    }
+        //
+        //    if (!ok) {
+        //        LLAMA_LOG_ERROR(
+        //                "%s: the tokens of sequence %d in the input batch have inconsistent sequence positions:\n"
+        //                " - the last position stored in the memory module of the context (i.e. the KV cache) for sequence %d is X = %d\n"
+        //                " - the tokens for sequence %d in the input batch have a starting position of Y = %d\n"
+        //                " it is required that the sequence positions remain consecutive: Y = X + 1\n",
+        //                __func__, s, s, p0, s, seq_pos_min(s));
+        //
+        //        return false;
+        //    }
+        //}

         if (seq_pos_max(s) - seq_pos_min(s) + 1 > (int) seq_pos[s].size()) {
             LLAMA_LOG_ERROR("%s: sequence %d positions are not continuous\n", __func__, s);
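For context on the comment added above: with a multi-axis position scheme like the one used for Qwen VL image inputs, the scalar position attached to each image token is derived from patch coordinates rather than from a running counter, so within one image embedding the values can repeat and can also drop back down. The toy program below is not llama.cpp code and its position formula is invented purely for illustration; it only shows why the old "Y = X + 1" consecutiveness check cannot hold for such inputs:

    #include <cstdio>

    int main() {
        const int rows = 3, cols = 4;   // a tiny image patch grid
        const int p_base = 10;          // made-up position right after the preceding text tokens

        int prev = -1;
        for (int r = 0; r < rows; ++r) {
            for (int c = 0; c < cols; ++c) {
                // hypothetical per-token position keyed on the patch column:
                // it increases along a row, then jumps back down at the next row
                const int pos = p_base + c;
                std::printf("patch(%d,%d) -> pos %d%s\n", r, c, pos,
                            pos <= prev ? "  (not greater than the previous position)" : "");
                prev = pos;
            }
        }
        return 0;
    }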
@@ -369,36 +370,38 @@ llama_ubatch llama_batch_allocr::ubatch_reserve(uint32_t n_seq_tokens, uint32_t

     auto udata = std::make_shared<llama_ubatch::data_t>();

-    udata->token     .resize(n_tokens);
-    udata->embd      .clear();
-    udata->pos       .resize(n_tokens);
-    udata->n_seq_id  .resize(n_tokens);
-    udata->seq_id    .resize(n_tokens);
-    udata->seq_id_unq.resize(0);
-    udata->seq_idx   .resize(LLAMA_MAX_SEQ, -1);
-    udata->output    .resize(n_tokens);
+    udata->token               .resize(n_tokens);
+    udata->embd                .clear();
+    udata->pos                 .resize(n_tokens);
+    udata->n_seq_id            .resize(n_tokens);
+    udata->seq_id              .resize(n_tokens);
+    udata->seq_id_unq          .resize(0);
+    udata->seq_idx             .resize(LLAMA_MAX_SEQ, -1);
+    udata->output              .resize(n_tokens);
+    udata->kv_position_of_token.resize(n_tokens, -1);

     for (uint32_t s = 0; s < n_seqs; ++s) {
         udata->seq_idx[s] = s;
         udata->seq_id_unq.push_back(s);
     }

     llama_ubatch res {
-        /*.b_equal_seqs =*/ true,
-        /*.n_tokens     =*/ n_tokens,
-        /*.n_seq_tokens =*/ n_seq_tokens,
-        /*.n_seqs       =*/ n_seqs,
-        /*.n_seqs_unq   =*/ n_seqs,
-
-        /*.token        =*/ udata->token.data(),
-        /*.embd         =*/ nullptr,
-        /*.pos          =*/ udata->pos.data(),
-        /*.n_seq_id     =*/ udata->n_seq_id.data(),
-        /*.seq_id       =*/ udata->seq_id.data(),
-        /*.seq_id_unq   =*/ udata->seq_id_unq.data(),
-        /*.seq_idx      =*/ udata->seq_idx.data(),
-        /*.output       =*/ udata->output.data(),
-        /*.data         =*/ std::move(udata),
+        /*.b_equal_seqs        =*/ true,
+        /*.n_tokens            =*/ n_tokens,
+        /*.n_seq_tokens        =*/ n_seq_tokens,
+        /*.n_seqs              =*/ n_seqs,
+        /*.n_seqs_unq          =*/ n_seqs,
+
+        /*.token               =*/ udata->token.data(),
+        /*.embd                =*/ nullptr,
+        /*.pos                 =*/ udata->pos.data(),
+        /*.n_seq_id            =*/ udata->n_seq_id.data(),
+        /*.seq_id              =*/ udata->seq_id.data(),
+        /*.seq_id_unq          =*/ udata->seq_id_unq.data(),
+        /*.seq_idx             =*/ udata->seq_idx.data(),
+        /*.output              =*/ udata->output.data(),
+        /*.kv_position_of_token=*/ udata->kv_position_of_token.data(),
+        /*.data                =*/ std::move(udata),
     };

     return res;
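The resize(n_tokens, -1) above suggests that -1 is meant as a "no KV cell recorded for this token" sentinel, with real values filled in later by whichever component places the ubatch into the cache; the diff itself does not show that consumer. A hypothetical sketch of that convention, with stand-in types rather than actual llama.cpp code:

    #include <cstdint>
    #include <vector>

    using llama_pos = int32_t;

    struct ubatch_view {
        uint32_t    n_tokens;
        llama_pos * kv_position_of_token; // -1 = not placed yet, >= 0 = assumed KV cell index
    };

    // hypothetical helper: record where each token of the ubatch was written in the KV cache
    static void record_kv_positions(ubatch_view & ub, const std::vector<llama_pos> & cells) {
        for (uint32_t i = 0; i < ub.n_tokens && i < cells.size(); ++i) {
            ub.kv_position_of_token[i] = cells[i];
        }
    }

    int main() {
        std::vector<llama_pos> slots(4, -1);        // mirrors the resize(n_tokens, -1) default
        ubatch_view ub { 4, slots.data() };

        record_kv_positions(ub, {7, 8, 9, 10});     // made-up cell indices
        return ub.kv_position_of_token[0] == 7 ? 0 : 1;
    }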
@@ -660,14 +663,15 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
     const int64_t n_embd_all = batch.embd ? (int64_t) n_tokens*n_embd : 0;
     const int64_t n_pos_all  =              (int64_t) n_tokens*n_pos_cur;

-    udata->token     .resize(n_tokens);
-    udata->embd      .resize(n_embd_all);
-    udata->pos       .resize(n_pos_all);
-    udata->n_seq_id  .resize(n_tokens);
-    udata->seq_id    .resize(n_tokens);
-    udata->seq_id_unq.resize(0);
-    udata->seq_idx   .resize(LLAMA_MAX_SEQ, -1);
-    udata->output    .resize(n_tokens);
+    udata->token               .resize(n_tokens);
+    udata->embd                .resize(n_embd_all);
+    udata->pos                 .resize(n_pos_all);
+    udata->n_seq_id            .resize(n_tokens);
+    udata->seq_id              .resize(n_tokens);
+    udata->seq_id_unq          .resize(0);
+    udata->seq_idx             .resize(LLAMA_MAX_SEQ, -1);
+    udata->output              .resize(n_tokens);
+    udata->kv_position_of_token.resize(n_tokens, -1);

     seq_set_t seq_set_unq;

@@ -705,21 +709,22 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
     }

     llama_ubatch res {
-        /*.b_equal_seqs =*/ equal_seqs,
-        /*.n_tokens     =*/ n_tokens,
-        /*.n_seq_tokens =*/ n_tokens/n_seqs,
-        /*.n_seqs       =*/ n_seqs,
-        /*.n_seqs_unq   =*/ (uint32_t) udata->seq_id_unq.size(),
-
-        /*.token        =*/ batch.token ? udata->token.data() : nullptr,
-        /*.embd         =*/ batch.embd ? udata->embd.data() : nullptr,
-        /*.pos          =*/ udata->pos.data(),
-        /*.n_seq_id     =*/ udata->n_seq_id.data(),
-        /*.seq_id       =*/ udata->seq_id.data(),
-        /*.seq_id_unq   =*/ udata->seq_id_unq.data(),
-        /*.seq_idx      =*/ udata->seq_idx.data(),
-        /*.output       =*/ udata->output.data(),
-        /*.data         =*/ std::move(udata),
+        /*.b_equal_seqs        =*/ equal_seqs,
+        /*.n_tokens            =*/ n_tokens,
+        /*.n_seq_tokens        =*/ n_tokens/n_seqs,
+        /*.n_seqs              =*/ n_seqs,
+        /*.n_seqs_unq          =*/ (uint32_t) udata->seq_id_unq.size(),
+
+        /*.token               =*/ batch.token ? udata->token.data() : nullptr,
+        /*.embd                =*/ batch.embd ? udata->embd.data() : nullptr,
+        /*.pos                 =*/ udata->pos.data(),
+        /*.n_seq_id            =*/ udata->n_seq_id.data(),
+        /*.seq_id              =*/ udata->seq_id.data(),
+        /*.seq_id_unq          =*/ udata->seq_id_unq.data(),
+        /*.seq_idx             =*/ udata->seq_idx.data(),
+        /*.output              =*/ udata->output.data(),
+        /*.kv_position_of_token=*/ udata->kv_position_of_token.data(),
+        /*.data                =*/ std::move(udata),
     };

     if (debug > 0) {