2 changes: 1 addition & 1 deletion include/llama.h
@@ -226,7 +226,7 @@ extern "C" {

llama_token * token;
float * embd;
llama_pos * pos;
llama_pos * pos; // the first `n_tokens` elements always hold linearly increasing positions, as used by traditional LLMs
int32_t * n_seq_id;
llama_seq_id ** seq_id;
int8_t * logits; // TODO: rename this to "output"
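To make the new `pos` contract concrete, here is a minimal standalone sketch (not llama.cpp code; the helper name and values are illustrative) of how a caller could fill the combined buffer for an image-embedding batch, assuming `n_pos_per_embd = 4` (the M-RoPE case): the linear positions come first, followed by one block per model-specific position dimension.

#include <cstdint>
#include <vector>

using llama_pos = int32_t; // same underlying type as in llama.h

// illustrative helper: fill the combined position buffer for an image of
// nx * ny patches whose first token sits at absolute position pos_0
static std::vector<llama_pos> make_positions_2d(int nx, int ny, llama_pos pos_0) {
    const int n_tokens       = nx * ny;
    const int n_pos_per_embd = 4; // M-RoPE: temporal, height, width, unused
    std::vector<llama_pos> pos(n_tokens * (n_pos_per_embd + 1));
    for (int y = 0; y < ny; y++) {
        for (int x = 0; x < nx; x++) {
            const int i = y * nx + x;
            pos[i]                = pos_0 + i; // linear positions, used for causal masking
            pos[i + n_tokens]     = pos_0;     // temporal: the whole image is one step
            pos[i + n_tokens * 2] = pos_0 + y; // height
            pos[i + n_tokens * 3] = pos_0 + x; // width
            pos[i + n_tokens * 4] = 0;         // last pos dim is unused
        }
    }
    return pos;
}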
22 changes: 3 additions & 19 deletions src/llama-batch.cpp
@@ -259,23 +259,7 @@ bool llama_batch_allocr::init(
const llama_pos p0 = memory ? memory->seq_pos_max(s) : -1;

if (p0 >= 0) {
bool ok = true;

if (batch.token) {
if (seq_pos_min(s) != p0 + 1) {
ok = false;
}
} else {
assert(batch.embd);

// for embeddings (typically used as vision input), we allow them to have repeating positions
// ref: https://github.com/ggml-org/llama.cpp/issues/13694#issuecomment-2983871762
if (seq_pos_min(s) != p0 && seq_pos_min(s) != p0 + 1) {
ok = false;
}
}

if (!ok) {
if (seq_pos_min(s) != p0 + 1) {
LLAMA_LOG_ERROR(
"%s: the tokens of sequence %d in the input batch have inconsistent sequence positions:\n"
" - the last position stored in the memory module of the context (i.e. the KV cache) for sequence %d is X = %d\n"
@@ -655,7 +639,7 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u

auto udata = std::make_shared<llama_ubatch::data_t>();

const int32_t n_pos_cur = batch.embd ? n_pos_per_embd : 1;
const int32_t n_pos_cur = batch.embd ? (n_pos_per_embd + 1) : 1;

const int64_t n_embd_all = batch.embd ? (int64_t) n_tokens*n_embd : 0;
const int64_t n_pos_all = (int64_t) n_tokens*n_pos_cur;
@@ -681,7 +665,7 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
}

for (int j = 0; j < n_pos_cur; ++j) {
udata->pos[j*n_tokens + i] = batch.pos[j*batch.n_tokens + idxs[i]];
udata->pos[j * n_tokens + i] = batch.pos[j * batch.n_tokens + idxs[i]];
}

udata->n_seq_id[i] = batch.n_seq_id[idxs[i]];
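For a concrete sense of what the `n_pos_cur` change does to the allocation in ubatch_add, here is a small standalone example; the token count and embedding size are made-up values, not taken from this PR.

#include <cstdint>
#include <cstdio>

int main() {
    const int32_t n_tokens       = 16;   // hypothetical ubatch size
    const int32_t n_embd         = 1536; // hypothetical model embedding size
    const int32_t n_pos_per_embd = 4;    // M-RoPE
    const bool    has_embd       = true; // embedding (e.g. vision) batch

    // mirrors the sizing logic in ubatch_add() after this change
    const int32_t n_pos_cur  = has_embd ? (n_pos_per_embd + 1) : 1;
    const int64_t n_embd_all = has_embd ? (int64_t) n_tokens*n_embd : 0;
    const int64_t n_pos_all  = (int64_t) n_tokens*n_pos_cur;

    // prints: n_pos_cur = 5, n_embd_all = 24576, n_pos_all = 80
    printf("n_pos_cur = %d, n_embd_all = %lld, n_pos_all = %lld\n",
           n_pos_cur, (long long) n_embd_all, (long long) n_pos_all);
    return 0;
}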
8 changes: 7 additions & 1 deletion src/llama-graph.cpp
@@ -54,7 +54,13 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
}
ggml_backend_tensor_set(pos, pos_data.data(), 0, pos_data.size()*ggml_element_size(pos));
} else {
ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_embd*ggml_element_size(pos));
llama_pos * pos_ptr = ubatch->pos;
// Normally, ubatch->pos stores linearly increasing positions.
// However, some multi-modal models require special position embeddings (e.g. M-RoPE in Qwen2-VL and Qwen2.5-VL),
// while linearly increasing positions are still needed for proper causal attention masking.
// So we store both: the first n_tokens elements are unchanged, and the model-specific positions are appended after them.
if (ubatch->embd && n_pos_per_embd > 1) pos_ptr += n_tokens; // use mrope positions
ggml_backend_tensor_set(pos, pos_ptr, 0, n_tokens * n_pos_per_embd * ggml_element_size(pos));
}
}
}
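The snippet below is an isolated illustration (not llama.cpp code) of the pointer arithmetic above: for an embedding batch, the graph input skips the first `n_tokens` linear positions and feeds only the model-specific blocks into the `pos` tensor. The values assume a 1x3 image whose first token sits at position 10.

#include <cstdint>
#include <cstdio>
#include <vector>

using llama_pos = int32_t;

int main() {
    const int n_tokens       = 3;
    const int n_pos_per_embd = 4; // M-RoPE: temporal, height, width, unused

    // combined layout: [linear | temporal | height | width | unused]
    const std::vector<llama_pos> pos = {
        10, 11, 12, // linear positions, used for causal attention masking
        10, 10, 10, // temporal: the whole image occupies one step
        10, 10, 10, // height (single row)
        10, 11, 12, // width (three columns)
         0,  0,  0, // unused dimension
    };

    const bool is_embd = true;
    const llama_pos * pos_ptr = pos.data();
    if (is_embd && n_pos_per_embd > 1) {
        pos_ptr += n_tokens; // skip the linear block, keep only the M-RoPE blocks
    }

    // the graph receives n_tokens * n_pos_per_embd = 12 positions
    for (int i = 0; i < n_tokens*n_pos_per_embd; ++i) {
        printf("%d ", pos_ptr[i]);
    }
    printf("\n");
    return 0;
}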
25 changes: 16 additions & 9 deletions tools/mtmd/mtmd-helper.cpp
@@ -55,6 +55,11 @@ llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks) {

// helper struct to make working with embd batch easier
// note: this will be removed after llama_batch_ext refactoring
// note 2: normally, the batch's `pos` stores linearly increasing positions.
// However, some multi-modal models require special position embeddings (e.g. M-RoPE in Qwen2-VL and Qwen2.5-VL),
// while linearly increasing positions are still needed for proper causal attention masking.
// So we store both: the first n_tokens elements are unchanged, and the model-specific positions are appended after them.
// As a result, `pos` has `n_tokens * (n_pos_per_embd + 1)` elements
struct decode_embd_batch {
int n_pos_per_embd;
int n_mmproj_embd;
@@ -66,7 +71,7 @@ struct decode_embd_batch {
std::vector<int8_t> logits;
llama_batch batch;
decode_embd_batch(float * embd, int32_t n_tokens, int n_pos_per_embd, int n_mmproj_embd) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
pos .resize(n_tokens * n_pos_per_embd);
pos .resize(n_tokens * (n_pos_per_embd + 1));
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
logits .resize(n_tokens);
@@ -100,13 +105,14 @@ struct decode_embd_batch {
for (int y = 0; y < ny; y++) {
for (int x = 0; x < nx; x++) {
int i = y * nx + x;
pos[i ] = pos_0;
pos[i + batch.n_tokens ] = pos_0 + y;
pos[i + batch.n_tokens * 2] = pos_0 + x;
pos[i + batch.n_tokens * 3] = 0; // last pos dim is unused
pos[i + batch.n_tokens ] = pos_0;
pos[i + batch.n_tokens * 2] = pos_0 + y;
pos[i + batch.n_tokens * 3] = pos_0 + x;
pos[i + batch.n_tokens * 4] = 0; // last pos dim is unused
}
}
for (int i = 0; i < batch.n_tokens; i++) {
batch.pos [i] = pos_0 + i;
batch.n_seq_id[i] = 1;
batch.seq_id [i] = seq_id_0.data();
batch.logits [i] = false;
@@ -118,12 +124,13 @@
GGML_ASSERT(n_pos_per_embd == 4);
seq_id_0[0] = seq_id;
for (int i = 0; i < batch.n_tokens; i++) {
pos[i ] = pos_0 + i;
pos[i + batch.n_tokens ] = pos_0 + i;
pos[i + batch.n_tokens * 2] = pos_0 + i;
pos[i + batch.n_tokens * 3] = 0; // last pos dim is unused
pos[i + batch.n_tokens * 3] = pos_0 + i;
pos[i + batch.n_tokens * 4] = 0; // last pos dim is unused
}
for (int i = 0; i < batch.n_tokens; i++) {
batch.pos [i] = pos_0 + i;
batch.n_seq_id[i] = 1;
batch.seq_id [i] = seq_id_0.data();
batch.logits [i] = false;
@@ -133,12 +140,12 @@
llama_batch get_view(int offset, int n_tokens) {
llama_pos * pos_ptr;
pos_view.clear();
pos_view.reserve(n_tokens * n_pos_per_embd);
pos_view.reserve(n_tokens * (n_pos_per_embd + 1));
if (n_pos_per_embd > 1) {
// mrope
// for example, with layout of src: 1234...1234...1234...1234...
// offset 2 will give us dst: 34...34...34...34...
for (int i = 0; i < n_pos_per_embd; i++) {
for (int i = 0; i <= n_pos_per_embd; i++) {
// assume n_tokens is less than or equal to batch.n_tokens
// batch.n_tokens is number of **total** tokens
// n_tokens is number of viewed token
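To round out the `get_view()` change, here is a hedged, standalone sketch of the slicing done by the M-RoPE branch (the helper name is illustrative, and it assumes the combined `n_pos_per_embd + 1` layout): every block is copied at the same token offset, so a view preserves the block structure of the source buffer.

#include <cstdint>
#include <vector>

using llama_pos = int32_t;

// illustrative helper, not part of mtmd-helper.cpp
static std::vector<llama_pos> view_positions(const std::vector<llama_pos> & pos_all,
                                             int n_tokens_all, int n_pos_per_embd,
                                             int offset, int n_tokens_view) {
    std::vector<llama_pos> pos_view;
    pos_view.reserve(n_tokens_view * (n_pos_per_embd + 1));
    // with a per-block layout of src: 1234...1234...1234... an offset of 2
    // yields dst: 34...34...34... (same blocks, shifted start)
    for (int b = 0; b <= n_pos_per_embd; ++b) {
        const auto begin = pos_all.begin() + (size_t) b*n_tokens_all + offset;
        pos_view.insert(pos_view.end(), begin, begin + n_tokens_view);
    }
    return pos_view;
}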
3 changes: 0 additions & 3 deletions tools/mtmd/mtmd.cpp
@@ -1024,9 +1024,6 @@ const char * mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens) {
}

llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens) {
if (image_tokens->use_mrope_pos) {
return 1; // for M-RoPE, the whole image is 1 in temporal dimension
}
Collaborator comment on lines -1027 to -1029:

So based on what we have discussed so far, here we should return max(h, w).

However, I don't know how it will impact the rest of the code. Make sure to test it.

return image_tokens->n_tokens();
}
Collaborator comment:

Removing this will cause unexpected behavior.

