From 4d1ff87115291194bbe38a37d15af76a4397e882 Mon Sep 17 00:00:00 2001
From: dinhhuy
Date: Mon, 2 Jun 2025 00:19:50 +0900
Subject: [PATCH 1/2] qwen3 get embedding from logits

---
 src/llama-context.cpp   | 39 ++++++++++++++++++++++++++++++++++-----
 src/llama-model.cpp     |  4 ++--
 tools/server/server.cpp | 14 +++++++++++++-
 3 files changed, 49 insertions(+), 8 deletions(-)

diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index e153351af3809..086be04363298 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -887,7 +887,12 @@ int llama_context::decode(llama_batch & inp_batch) {
     const int32_t n_vocab = vocab.n_tokens();
 
     const int64_t n_tokens_all = batch.n_tokens;
-    const int64_t n_embd = hparams.n_embd;
+    int64_t n_embd = hparams.n_embd;
+
+    if (model.arch == LLM_ARCH_QWEN3 || model.arch == LLM_ARCH_QWEN3MOE) {
+        // Qwen3 uses a different embedding size
+        n_embd = n_vocab;
+    }
 
     llama_kv_cache_guard kv_guard(kv_self);
 
@@ -1021,7 +1026,15 @@ int llama_context::decode(llama_batch & inp_batch) {
 
             if (n_outputs) {
                 GGML_ASSERT( n_outputs_prev + n_outputs <= n_outputs_all);
-                GGML_ASSERT((n_outputs_prev + n_outputs)*n_vocab <= (int64_t) logits_size);
+
+                if (model.arch == LLM_ARCH_QWEN3 && cparams.embeddings) {
+                    // For Qwen3 with embeddings enabled, we share the tensor between logits and embeddings
+                    GGML_ASSERT(n_outputs * n_vocab <= (int64_t) logits_size);
+                } else {
+                    // Standard check for other model architectures
+                    GGML_ASSERT((n_outputs_prev + n_outputs) * n_vocab <= (int64_t) logits_size);
+                }
+
                 ggml_backend_tensor_get_async(backend_res, t_logits, logits_out, 0, n_outputs*n_vocab*sizeof(float));
             }
         }
@@ -1170,7 +1183,12 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
 
     const auto n_batch = cparams.n_batch;
     const auto n_vocab = vocab.n_tokens();
-    const auto n_embd = hparams.n_embd;
+    int64_t n_embd = hparams.n_embd;
+
+    // For Qwen3, n_embd is equal to n_vocab
+    if (model.arch == LLM_ARCH_QWEN3) {
+        n_embd = n_vocab;
+    }
 
     // TODO: use a per-batch flag for logits presence instead
     bool has_logits = !cparams.embeddings;
@@ -1182,8 +1200,19 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
         has_embd = true;
     }
 
-    logits_size = has_logits ? n_vocab*n_outputs_max : 0;
-    embd_size = has_embd ? n_embd*n_outputs_max : 0;
+    // For Qwen3 models, both logits and embeddings point to the same tensor
+    bool shared_tensor = (model.arch == LLM_ARCH_QWEN3);
+
+    // Adjust buffer sizes for the case where both tensors are shared
+    if (shared_tensor && has_logits && has_embd) {
+        // For Qwen3, we only need one buffer since logits and embeddings share the same tensor
+        logits_size = n_vocab * n_outputs_max;
+        embd_size = 0; // No need for a separate embedding buffer
+    } else {
+        // Normal case - separate buffers
+        logits_size = has_logits ? n_vocab * n_outputs_max : 0;
+        embd_size = has_embd ? n_embd * n_outputs_max : 0;
+    }
 
     if (output_ids.empty()) {
         // init, never resized afterwards
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 3f1f6c9bf3b06..2a7b30d634ae0 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -7074,13 +7074,13 @@ struct llm_build_qwen3 : public llm_graph_context {
                 LLM_NORM_RMS, -1);
 
         cb(cur, "result_norm", -1);
-        res->t_embd = cur;
 
         // lm_head
         cur = build_lora_mm(model.output, cur);
 
         cb(cur, "result_output", -1);
         res->t_logits = cur;
+        res->t_embd = cur;
 
         ggml_build_forward_expand(gf, cur);
     }
@@ -7202,13 +7202,13 @@ struct llm_build_qwen3moe : public llm_graph_context {
                 LLM_NORM_RMS, -1);
 
         cb(cur, "result_norm", -1);
-        res->t_embd = cur;
 
         // lm_head
         cur = build_lora_mm(model.output, cur);
 
         cb(cur, "result_output", -1);
         res->t_logits = cur;
+        res->t_embd = cur;
 
         ggml_build_forward_expand(gf, cur);
     }
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index 5d03dc3dc790a..ac27ef61fb4cd 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -2545,7 +2545,18 @@ struct server_context {
         res->n_tokens = slot.n_prompt_tokens;
         res->oaicompat = slot.params.oaicompat;
 
-        const int n_embd = llama_model_n_embd(model);
+        int n_embd = llama_model_n_embd(model);
+        // For Qwen3 specific handling
+        bool is_qwen3 = false;
+        char arch_name[128] = {0};
+        if (llama_model_meta_val_str(model, "general.architecture", arch_name, sizeof(arch_name)) > 0) {
+            is_qwen3 = (strcmp(arch_name, "qwen3") == 0 || strcmp(arch_name, "qwen3moe") == 0);
+            if (is_qwen3) {
+                // Get vocabulary size for Qwen3 models - they use n_vocab as embedding size
+                n_embd = llama_vocab_n_tokens(vocab);
+                SLT_INF(slot, "Qwen3 model embedding size: %d\n", n_embd);
+            }
+        }
 
         std::vector<float> embd_res(n_embd, 0.0f);
 
@@ -2556,6 +2567,7 @@ struct server_context {
 
             const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
             if (embd == NULL) {
+                fprintf(stderr, "Failed to get embeddings\n");
                 embd = llama_get_embeddings_ith(ctx, i);
             }
 

From 6a1a9c97c51179a19191c800222aa73dcea3663e Mon Sep 17 00:00:00 2001
From: dinhhuy
Date: Mon, 2 Jun 2025 00:55:20 +0900
Subject: [PATCH 2/2] fix lint

---
 src/llama-context.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 8ad35de20ff3b..d78254211b9bd 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -1072,7 +1072,7 @@ int llama_context::decode(llama_batch & inp_batch) {
 
             if (n_outputs) {
                 GGML_ASSERT( n_outputs_prev + n_outputs <= n_outputs_all);
-
+
                 if (model.arch == LLM_ARCH_QWEN3 && cparams.embeddings) {
                     // For Qwen3 with embeddings enabled, we share the tensor between logits and embeddings
                     GGML_ASSERT(n_outputs * n_vocab <= (int64_t) logits_size);
@@ -1080,7 +1080,7 @@ int llama_context::decode(llama_batch & inp_batch) {
                     // Standard check for other model architectures
                     GGML_ASSERT((n_outputs_prev + n_outputs) * n_vocab <= (int64_t) logits_size);
                 }
-
+
                 ggml_backend_tensor_get_async(backend_res, t_logits, logits_out, 0, n_outputs*n_vocab*sizeof(float));
             }
         }
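
A minimal caller-side sketch of what these patches imply, for illustration only: with embeddings enabled, a Qwen3 context now reports its per-sequence output as n_vocab floats (the logits) instead of hparams.n_embd. The sketch reuses only API calls that appear in the patch (llama_get_embeddings_seq, llama_get_embeddings_ith, llama_vocab_n_tokens); the helper name read_qwen3_logit_embedding and the use of llama_model_get_vocab() to obtain the vocab are assumptions, not part of the change.

#include "llama.h"
#include <vector>

// Read the logits-as-embedding vector for one sequence; for Qwen3 its size is n_vocab.
static std::vector<float> read_qwen3_logit_embedding(llama_context * ctx, const llama_model * model,
                                                     llama_seq_id seq, int32_t i_batch) {
    const llama_vocab * vocab = llama_model_get_vocab(model);
    const int32_t n_out = llama_vocab_n_tokens(vocab); // Qwen3: output size == n_vocab

    // Prefer the pooled per-sequence embedding; fall back to the per-token output,
    // mirroring the fallback used in server.cpp above.
    const float * embd = llama_get_embeddings_seq(ctx, seq);
    if (embd == NULL) {
        embd = llama_get_embeddings_ith(ctx, i_batch);
    }

    return embd ? std::vector<float>(embd, embd + n_out)
                : std::vector<float>(n_out, 0.0f);
}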