6 changes: 6 additions & 0 deletions .gitignore
@@ -10,3 +10,9 @@
/trace.json

/*.log

# Test binaries
/test_kv_cache_fix
/test_qwen2vl
/test_sampling_comprehensive
/test_sampling_flags
219 changes: 219 additions & 0 deletions llama.cpp/llama.cpp
@@ -153,6 +153,7 @@ enum llm_arch {
LLM_ARCH_QWEN,
LLM_ARCH_QWEN2,
LLM_ARCH_QWEN2MOE,
LLM_ARCH_QWEN2VL,
LLM_ARCH_QWEN3,
LLM_ARCH_QWEN3MOE,
LLM_ARCH_PHI2,
@@ -205,6 +206,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_QWEN, "qwen" },
{ LLM_ARCH_QWEN2, "qwen2" },
{ LLM_ARCH_QWEN2MOE, "qwen2moe" },
{ LLM_ARCH_QWEN2VL, "qwen2vl" },
{ LLM_ARCH_QWEN3, "qwen3" },
{ LLM_ARCH_QWEN3MOE, "qwen3moe" },
{ LLM_ARCH_PHI2, "phi2" },
@@ -298,6 +300,7 @@ enum llm_kv {
LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
LLM_KV_ROPE_SCALING_FINETUNED,
LLM_KV_ROPE_SCALING_YARN_LOG_MUL,
LLM_KV_ROPE_DIMENSION_SECTIONS,

LLM_KV_SPLIT_NO,
LLM_KV_SPLIT_COUNT,
@@ -399,6 +402,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
{ LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },
{ LLM_KV_ROPE_SCALING_YARN_LOG_MUL, "%s.rope.scaling.yarn_log_multiplier" },
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },

{ LLM_KV_SPLIT_NO, "split.no" },
{ LLM_KV_SPLIT_COUNT, "split.count" },
@@ -465,6 +469,10 @@ enum llm_tensor {
LLM_TENSOR_ATTN_V,
LLM_TENSOR_ATTN_QKV,
LLM_TENSOR_ATTN_OUT,
LLM_TENSOR_ATTN_Q_BIAS,
LLM_TENSOR_ATTN_K_BIAS,
LLM_TENSOR_ATTN_V_BIAS,
LLM_TENSOR_ATTN_OUT_BIAS,
LLM_TENSOR_ATTN_NORM,
LLM_TENSOR_ATTN_NORM_2,
LLM_TENSOR_ATTN_OUT_NORM,
@@ -848,6 +856,27 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
},
},
{
LLM_ARCH_QWEN2VL,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_Q_BIAS, "blk.%d.attn_q_bias" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_K_BIAS, "blk.%d.attn_k_bias" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_V_BIAS, "blk.%d.attn_v_bias" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_ATTN_OUT_BIAS, "blk.%d.attn_output_bias" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
{
LLM_ARCH_QWEN3,
{
@@ -1973,6 +2002,7 @@ enum e_model {
MODEL_40B,
MODEL_65B,
MODEL_70B,
MODEL_72B,
MODEL_236B,
MODEL_314B,
MODEL_SMALL,
@@ -2038,6 +2068,9 @@ struct llama_hparams {
float rope_freq_scale_train_swa;
uint32_t n_ctx_orig_yarn;
float rope_yarn_log_mul;

// for qwen2vl - rope dimension sections
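// (expected to hold 4 entries; in upstream Qwen2-VL conversions these split the
// rotary dims into temporal/height/width/extra M-RoPE components)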
std::vector<int32_t> rope_sections;

// for State Space Models
uint32_t ssm_d_conv = 0;
@@ -4411,6 +4444,7 @@ static const char * llama_model_type_name(e_model type) {
case MODEL_40B: return "40B";
case MODEL_65B: return "65B";
case MODEL_70B: return "70B";
case MODEL_72B: return "72B";
case MODEL_236B: return "236B";
case MODEL_314B: return "314B";
case MODEL_SMALL: return "0.1B";
@@ -4768,6 +4802,31 @@ static void llm_load_hparams(
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
case LLM_ARCH_QWEN2VL:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

// Try to load rope dimension sections (optional for qwen2vl)
try {
int key_idx = gguf_find_key(ml.meta, llm_kv(LLM_KV_ROPE_DIMENSION_SECTIONS).c_str());
if (key_idx >= 0) {
auto arr_info = GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(ml.meta, key_idx);
if (arr_info.gt == GGUF_TYPE_INT32 && arr_info.length == 4) {
hparams.rope_sections.resize(4);
memcpy(hparams.rope_sections.data(), arr_info.data, 4 * sizeof(int32_t));
}
}
} catch (...) {
// rope_sections are optional - ignore errors
}

switch (hparams.n_layer) {
case 32: model.type = e_model::MODEL_2B; break;
case 40: model.type = e_model::MODEL_7B; break;
case 80: model.type = e_model::MODEL_72B; break;
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
case LLM_ARCH_QWEN3:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -6691,6 +6750,46 @@ static bool llm_load_tensors(
layer.ffn_up_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp});
}
} break;
case LLM_ARCH_QWEN2VL:
{
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});

// output
{
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
// if output is NULL, init from the input tok embed
if (model.output == NULL) {
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
}
}

for (int i = 0; i < n_layer; ++i) {
ggml_context * ctx_layer = ctx_for_layer(i);
ggml_context * ctx_split = ctx_for_layer_split(i);

auto & layer = model.layers[i];

layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});

layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});

// bias tensors for qwen2vl
layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
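// note: tn(LLM_TENSOR_ATTN_Q, "bias", i) resolves to "blk.%d.attn_q.bias" via the
// LLM_TENSOR_ATTN_Q name-map entry, so the separate *_BIAS entries added to
// LLM_TENSOR_NAMES above are not the names looked up here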

layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});

layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
}
} break;
case LLM_ARCH_QWEN3:
{
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -10898,6 +10997,121 @@ struct llm_build_context {
return gf;
}

struct ggml_cgraph * build_qwen2vl() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);

const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);

struct ggml_tensor * cur;
struct ggml_tensor * inpL;

inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);

// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();

// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();

for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL;

// norm
cur = llm_build_norm(ctx0, inpL, hparams,
model.layers[il].attn_norm, NULL,
LLM_NORM_RMS, cb, il);
cb(cur, "attn_norm", il);

// self-attention
{
// compute Q and K and RoPE them
struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
cb(Qcur, "Qcur", il);

struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
cb(Kcur, "Kcur", il);

struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
cb(Vcur, "Vcur", il);

Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);

// Apply rope - qwen2vl uses standard rope for now
// TODO: Implement rope_multi with sections (hparams.rope_sections) when available
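// (for reference: upstream ggml exposes this as ggml_rope_multi with GGML_ROPE_TYPE_MROPE)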
Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);

Kcur = ggml_rope_ext(
ctx0, Kcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);

cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);

cur = llm_build_kv(ctx0, lctx, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
}

if (il == n_layer - 1) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}

struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);

// feed-forward network
cur = llm_build_norm(ctx0, ffn_inp, hparams,
model.layers[il].ffn_norm, NULL,
LLM_NORM_RMS, cb, il);
cb(cur, "ffn_norm", il);

cur = llm_build_ffn(ctx0, lctx, cur,
model.layers[il].ffn_up, NULL, NULL,
model.layers[il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il);

cur = ggml_add(ctx0, cur, ffn_inp);
cur = lctx.cvec.apply_to(ctx0, cur, il);
cb(cur, "l_out", il);

// input for next layer
inpL = cur;
}

cur = inpL;

cur = llm_build_norm(ctx0, cur, hparams,
model.output_norm, NULL,
LLM_NORM_RMS, cb, -1);
cb(cur, "result_norm", -1);

// lm_head
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
cb(cur, "result_output", -1);

ggml_build_forward_expand(gf, cur);

return gf;
}

struct ggml_cgraph * build_qwen3() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);

@@ -14736,6 +14950,10 @@ static struct ggml_cgraph * llama_build_graph(
{
result = llm.build_qwen2moe();
} break;
case LLM_ARCH_QWEN2VL:
{
result = llm.build_qwen2vl();
} break;
case LLM_ARCH_QWEN3:
{
result = llm.build_qwen3();
@@ -17963,6 +18181,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_QWEN:
case LLM_ARCH_QWEN2:
case LLM_ARCH_QWEN2MOE:
case LLM_ARCH_QWEN2VL:
case LLM_ARCH_QWEN3:
case LLM_ARCH_QWEN3MOE:
case LLM_ARCH_PHI2:
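Note on the TODO in build_qwen2vl() above: the loader stores hparams.rope_sections, but the graph still applies standard RoPE. As a rough illustration only (not part of the PR, and assuming the upstream convention that the four entries split the rotary dimension pairs into temporal/height/width/extra groups), this standalone sketch shows how the sections would assign each rotary pair to a position component:

#include <cstdint>
#include <cstdio>
#include <vector>

// Map a rotary dimension pair index to the position component that rotates it:
// 0 = temporal, 1 = height, 2 = width, 3 = extra.
static int mrope_section_for_dim(const std::vector<int32_t> & sections, int d) {
    int acc = 0;
    for (size_t s = 0; s < sections.size(); ++s) {
        acc += sections[s];
        if (d < acc) {
            return (int) s;
        }
    }
    return (int) sections.size() - 1; // clamp any trailing dims to the last section
}

int main() {
    // Illustrative layout for a 128-dim head (64 rotary pairs): 16 t + 24 h + 24 w.
    const std::vector<int32_t> sections = {16, 24, 24, 0};
    const int dims[] = {0, 15, 16, 39, 40, 63};
    for (int d : dims) {
        std::printf("dim pair %2d -> position component %d\n", d, mrope_section_for_dim(sections, d));
    }
    return 0;
}

Upstream's ggml_rope_multi performs this sectioned assignment on the fly; the numbers above mirror the published Qwen2-VL mrope_section of [16, 24, 24] and are illustrative only.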
7 changes: 6 additions & 1 deletion llama.cpp/server/server.cpp
@@ -1711,7 +1711,12 @@ struct llama_server_context
slot.cache_tokens[i - n_discard] = slot.cache_tokens[i];
}

- slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard);
+ // Prevent integer underflow that causes std::length_error
+ if (n_discard >= 0 && (size_t)n_discard < slot.cache_tokens.size()) {
+     slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard);
+ } else {
+     slot.cache_tokens.clear();
+ }

slot.n_past -= n_discard;

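For context on the guard above: with std::vector's unsigned size_type, size() - n_discard wraps around whenever n_discard exceeds the current size, and the resulting huge request makes resize() throw std::length_error. A minimal standalone reproduction (not from this PR):

#include <cstdio>
#include <stdexcept>
#include <vector>

int main() {
    std::vector<int> cache_tokens = {1, 2, 3};
    const int n_discard = 5; // discard more tokens than the cache holds

    try {
        // size_t(3) - 5 wraps to 18446744073709551614; resize() cannot satisfy it
        cache_tokens.resize(cache_tokens.size() - n_discard);
    } catch (const std::length_error & e) {
        std::printf("length_error: %s\n", e.what());
    }
    return 0;
}

The n_discard >= 0 half of the check also keeps a negative value from silently growing the cache.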
41 changes: 41 additions & 0 deletions test_kv_cache_fix.cpp
@@ -0,0 +1,41 @@
#include <iostream>
#include <vector>
#include <cassert>

// Simulated test for the KV cache fix
void test_cache_tokens_resize_fix() {
std::cout << "Testing KV cache resize fix..." << std::endl;

// Simulate the problematic condition
std::vector<int> cache_tokens = {1, 2, 3, 4, 5};
size_t original_size = cache_tokens.size();

// Test cases that could cause integer underflow
int n_discard_cases[] = {-1, 0, 3, 5, 10};

for (int n_discard : n_discard_cases) {
std::vector<int> test_tokens = cache_tokens;

std::cout << "Testing n_discard = " << n_discard
<< " with cache size = " << test_tokens.size() << std::endl;

// Apply the fixed logic
if (n_discard >= 0 && (size_t)n_discard < test_tokens.size()) {
test_tokens.resize(test_tokens.size() - n_discard);
std::cout << " Resized to: " << test_tokens.size() << std::endl;
} else {
test_tokens.clear();
std::cout << " Cleared to: " << test_tokens.size() << std::endl;
}

// Verify no crash occurred
assert(test_tokens.size() <= original_size);
}

std::cout << "All test cases passed! KV cache fix works correctly." << std::endl;
}

int main() {
test_cache_tokens_resize_fix();
return 0;
}
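The test is self-contained, so it can be built and run directly; a typical invocation (the exact compiler command is an assumption, not part of the PR):

c++ -std=c++11 test_kv_cache_fix.cpp -o test_kv_cache_fix && ./test_kv_cache_fix

The produced binary is what the new /test_kv_cache_fix entry in .gitignore excludes.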