@@ -224,6 +224,7 @@ bool llama_batch_allocr::init(
             /* .seq_idx      =*/   this->seq_idx.data(),
             /* .output       =*/   batch.logits,
             /* .data         =*/   {},
+            /* .kv_position_of_token=*/   {},
         };
 
         ubatch_print(ubatch, debug);
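
The new designated-initializer entry above implies a corresponding per-token member on llama_ubatch. A minimal sketch of what that declaration might look like, inferred only from the field order and the -1 defaults used later in this diff (the element type and exact placement in llama-batch.h are assumptions):

    // hypothetical excerpt of llama_ubatch (see llama-batch.h for the real definition)
    struct llama_ubatch {
        // ... existing members: token, embd, pos, n_seq_id, seq_id, seq_id_unq, seq_idx, output ...

        // assumed: per-token index of the KV cell the token is written to;
        // -1 (the default filled in by ubatch_reserve/ubatch_add below) would mean "no cell assigned yet"
        llama_pos * kv_position_of_token;

        // ... shared data_t member that owns the backing vectors ...
    };
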
@@ -256,36 +257,38 @@ bool llama_batch_allocr::init(
             continue;
         }
 
-        const llama_pos p0 = memory ? memory->seq_pos_max(s) : -1;
-
-        if (p0 >= 0) {
-            bool ok = true;
-
-            if (batch.token) {
-                if (seq_pos_min(s) != p0 + 1) {
-                    ok = false;
-                }
-            } else {
-                assert(batch.embd);
-
-                // for embeddings (typically used as vision input), we allow them to have repeating positions
-                // ref: https://github.com/ggml-org/llama.cpp/issues/13694#issuecomment-2983871762
-                if (seq_pos_min(s) != p0 && seq_pos_min(s) != p0 + 1) {
-                    ok = false;
-                }
-            }
-
-            if (!ok) {
-                LLAMA_LOG_ERROR(
-                        "%s: the tokens of sequence %d in the input batch have inconsistent sequence positions:\n"
-                        " - the last position stored in the memory module of the context (i.e. the KV cache) for sequence %d is X = %d\n"
-                        " - the tokens for sequence %d in the input batch have a starting position of Y = %d\n"
-                        " it is required that the sequence positions remain consecutive: Y = X + 1\n",
-                        __func__, s, s, p0, s, seq_pos_min(s));
-
-                return false;
-            }
-        }
+        // @fmayran: these checks don't make sense for models such as Qwen VL, whose position encoding lets the positions stored in the KV cache jump around (they are not even always increasing).
+        // allowing repeated positions is not enough either: within an image embedding, arbitrary position jumps are expected.
+        // const llama_pos p0 = memory ? memory->seq_pos_max(s) : -1;
+        //
+        // if (p0 >= 0) {
+        //     bool ok = true;
+        //
+        //     if (batch.token) {
+        //         if (seq_pos_min(s) != p0 + 1) {
+        //             ok = false;
+        //         }
+        //     } else {
+        //         assert(batch.embd);
+        //
+        //         // for embeddings (typically used as vision input), we allow them to have repeating positions
+        //         // ref: https://github.com/ggml-org/llama.cpp/issues/13694#issuecomment-2983871762
+        //         if (seq_pos_min(s) != p0 && seq_pos_min(s) != p0 + 1) {
+        //             ok = false;
+        //         }
+        //     }
+        //
+        //     if (!ok) {
+        //         LLAMA_LOG_ERROR(
+        //                 "%s: the tokens of sequence %d in the input batch have inconsistent sequence positions:\n"
+        //                 " - the last position stored in the memory module of the context (i.e. the KV cache) for sequence %d is X = %d\n"
+        //                 " - the tokens for sequence %d in the input batch have a starting position of Y = %d\n"
+        //                 " it is required that the sequence positions remain consecutive: Y = X + 1\n",
+        //                 __func__, s, s, p0, s, seq_pos_min(s));
+        //
+        //         return false;
+        //     }
+        // }
 
         if (seq_pos_max(s) - seq_pos_min(s) + 1 > (int) seq_pos[s].size()) {
             LLAMA_LOG_ERROR("%s: sequence %d positions are not continuous\n", __func__, s);
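
For reference, the rule that the disabled block enforced can be restated as a small predicate. This is only an illustrative sketch, with p0 standing for memory->seq_pos_max(s) and p_min for seq_pos_min(s) as in the code above; the helper name is made up:

    // sketch of the old consecutiveness rule implemented by the commented-out block
    static bool old_position_rule(llama_pos p0, llama_pos p_min, bool is_token_batch) {
        if (p0 < 0) {
            return true;                           // nothing stored yet for this sequence
        }
        if (is_token_batch) {
            return p_min == p0 + 1;                // token batches must continue consecutively
        }
        return p_min == p0 || p_min == p0 + 1;     // embedding batches may repeat the last position
    }

As the new comment explains, even this relaxed rule is too strict for models whose positions jump within an image embedding, which is why the check is disabled rather than special-cased further.
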
@@ -369,36 +372,38 @@ llama_ubatch llama_batch_allocr::ubatch_reserve(uint32_t n_seq_tokens, uint32_t
 
     auto udata = std::make_shared<llama_ubatch::data_t>();
 
-    udata->token      .resize(n_tokens);
-    udata->embd       .clear();
-    udata->pos        .resize(n_tokens);
-    udata->n_seq_id   .resize(n_tokens);
-    udata->seq_id     .resize(n_tokens);
-    udata->seq_id_unq .resize(0);
-    udata->seq_idx    .resize(LLAMA_MAX_SEQ, -1);
-    udata->output     .resize(n_tokens);
+    udata->token                 .resize(n_tokens);
+    udata->embd                  .clear();
+    udata->pos                   .resize(n_tokens);
+    udata->n_seq_id              .resize(n_tokens);
+    udata->seq_id                .resize(n_tokens);
+    udata->seq_id_unq            .resize(0);
+    udata->seq_idx               .resize(LLAMA_MAX_SEQ, -1);
+    udata->output                .resize(n_tokens);
+    udata->kv_position_of_token  .resize(n_tokens, -1);
 
     for (uint32_t s = 0; s < n_seqs; ++s) {
         udata->seq_idx[s] = s;
         udata->seq_id_unq.push_back(s);
     }
 
     llama_ubatch res {
-        /* .b_equal_seqs =*/   true,
-        /* .n_tokens     =*/   n_tokens,
-        /* .n_seq_tokens =*/   n_seq_tokens,
-        /* .n_seqs       =*/   n_seqs,
-        /* .n_seqs_unq   =*/   n_seqs,
-
-        /* .token        =*/   udata->token.data(),
-        /* .embd         =*/   nullptr,
-        /* .pos          =*/   udata->pos.data(),
-        /* .n_seq_id     =*/   udata->n_seq_id.data(),
-        /* .seq_id       =*/   udata->seq_id.data(),
-        /* .seq_id_unq   =*/   udata->seq_id_unq.data(),
-        /* .seq_idx      =*/   udata->seq_idx.data(),
-        /* .output       =*/   udata->output.data(),
-        /* .data         =*/   std::move(udata),
+        /* .b_equal_seqs =*/          true,
+        /* .n_tokens     =*/          n_tokens,
+        /* .n_seq_tokens =*/          n_seq_tokens,
+        /* .n_seqs       =*/          n_seqs,
+        /* .n_seqs_unq   =*/          n_seqs,
+
+        /* .token        =*/          udata->token.data(),
+        /* .embd         =*/          nullptr,
+        /* .pos          =*/          udata->pos.data(),
+        /* .n_seq_id     =*/          udata->n_seq_id.data(),
+        /* .seq_id       =*/          udata->seq_id.data(),
+        /* .seq_id_unq   =*/          udata->seq_id_unq.data(),
+        /* .seq_idx      =*/          udata->seq_idx.data(),
+        /* .output       =*/          udata->output.data(),
+        /* .kv_position_of_token=*/   udata->kv_position_of_token.data(),
+        /* .data         =*/          std::move(udata),
     };
 
     return res;
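
Since ubatch_reserve (and ubatch_add below) fill kv_position_of_token with -1, a consumer can presumably treat a negative value as "no KV cell recorded for this token". A hedged usage sketch, with the function name invented purely for illustration:

    // sketch: iterate only over tokens that already have a KV cell recorded
    static void visit_placed_tokens(const llama_ubatch & ubatch) {
        for (uint32_t i = 0; i < ubatch.n_tokens; ++i) {
            if (ubatch.kv_position_of_token[i] < 0) {
                continue; // -1 sentinel from ubatch_reserve/ubatch_add: not placed yet
            }
            // ... use ubatch.kv_position_of_token[i] as the cell index in the KV cache ...
        }
    }
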
@@ -660,14 +665,15 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
     const int64_t n_embd_all = batch.embd ? (int64_t) n_tokens*n_embd : 0;
     const int64_t n_pos_all  =              (int64_t) n_tokens*n_pos_cur;
 
-    udata->token      .resize(n_tokens);
-    udata->embd       .resize(n_embd_all);
-    udata->pos        .resize(n_pos_all);
-    udata->n_seq_id   .resize(n_tokens);
-    udata->seq_id     .resize(n_tokens);
-    udata->seq_id_unq .resize(0);
-    udata->seq_idx    .resize(LLAMA_MAX_SEQ, -1);
-    udata->output     .resize(n_tokens);
+    udata->token                 .resize(n_tokens);
+    udata->embd                  .resize(n_embd_all);
+    udata->pos                   .resize(n_pos_all);
+    udata->n_seq_id              .resize(n_tokens);
+    udata->seq_id                .resize(n_tokens);
+    udata->seq_id_unq            .resize(0);
+    udata->seq_idx               .resize(LLAMA_MAX_SEQ, -1);
+    udata->output                .resize(n_tokens);
+    udata->kv_position_of_token  .resize(n_tokens, -1);
 
     seq_set_t seq_set_unq;
 
@@ -705,21 +711,23 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
     }
 
     llama_ubatch res {
-        /* .b_equal_seqs =*/   equal_seqs,
-        /* .n_tokens     =*/   n_tokens,
-        /* .n_seq_tokens =*/   n_tokens/n_seqs,
-        /* .n_seqs       =*/   n_seqs,
-        /* .n_seqs_unq   =*/   (uint32_t) udata->seq_id_unq.size(),
-
-        /* .token        =*/   batch.token ? udata->token.data() : nullptr,
-        /* .embd         =*/   batch.embd ? udata->embd.data() : nullptr,
-        /* .pos          =*/   udata->pos.data(),
-        /* .n_seq_id     =*/   udata->n_seq_id.data(),
-        /* .seq_id       =*/   udata->seq_id.data(),
-        /* .seq_id_unq   =*/   udata->seq_id_unq.data(),
-        /* .seq_idx      =*/   udata->seq_idx.data(),
-        /* .output       =*/   udata->output.data(),
-        /* .data         =*/   std::move(udata),
+        /* .b_equal_seqs =*/          equal_seqs,
+        /* .n_tokens     =*/          n_tokens,
+        /* .n_seq_tokens =*/          n_tokens/n_seqs,
+        /* .n_seqs       =*/          n_seqs,
+        /* .n_seqs_unq   =*/          (uint32_t) udata->seq_id_unq.size(),
+
+        /* .token        =*/          batch.token ? udata->token.data() : nullptr,
+        /* .embd         =*/          batch.embd ? udata->embd.data() : nullptr,
+        /* .pos          =*/          udata->pos.data(),
+        /* .n_seq_id     =*/          udata->n_seq_id.data(),
+        /* .seq_id       =*/          udata->seq_id.data(),
+        /* .seq_id_unq   =*/          udata->seq_id_unq.data(),
+        /* .seq_idx      =*/          udata->seq_idx.data(),
+        /* .output       =*/          udata->output.data(),
+        /* .kv_position_of_token=*/   udata->kv_position_of_token.data(),
+        /* .data         =*/          std::move(udata),
+
     };
 
     if (debug > 0) {