@@ -887,7 +887,12 @@ int llama_context::decode(llama_batch & inp_batch) {
     const int32_t n_vocab = vocab.n_tokens();

     const int64_t n_tokens_all = batch.n_tokens;
-    const int64_t n_embd = hparams.n_embd;
+    int64_t n_embd = hparams.n_embd;
+
+    if (model.arch == LLM_ARCH_QWEN3 || model.arch == LLM_ARCH_QWEN3MOE) {
+        // Qwen3 uses a different embedding size
+        n_embd = n_vocab;
+    }

     llama_kv_cache_guard kv_guard(kv_self);

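The override in this hunk (and the matching one in output_reserve() further down) makes the per-output row width architecture-dependent. A minimal standalone sketch of that selection, with a stand-in enum and a hypothetical helper since llama.cpp's internal llm_arch type is not reproduced here, and with purely illustrative sizes:

```cpp
#include <cstdint>
#include <cstdio>

// Stand-in for llama.cpp's llm_arch enum; only the values needed for this sketch.
enum class arch_t { qwen3, qwen3moe, other };

// Mirrors the override above: Qwen3-family rows are sized by n_vocab, not n_embd.
static int64_t row_width(arch_t arch, int64_t n_embd, int64_t n_vocab) {
    if (arch == arch_t::qwen3 || arch == arch_t::qwen3moe) {
        return n_vocab;
    }
    return n_embd;
}

int main() {
    // 4096 and 151936 are made-up example values, not taken from a real model file.
    printf("qwen3: %lld, other: %lld\n",
           (long long) row_width(arch_t::qwen3, 4096, 151936),
           (long long) row_width(arch_t::other, 4096, 151936));
    return 0;
}
```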
@@ -1021,7 +1026,15 @@ int llama_context::decode(llama_batch & inp_batch) {

             if (n_outputs) {
                 GGML_ASSERT( n_outputs_prev + n_outputs <= n_outputs_all);
-                GGML_ASSERT((n_outputs_prev + n_outputs)*n_vocab <= (int64_t) logits_size);
+
+                if (model.arch == LLM_ARCH_QWEN3 && cparams.embeddings) {
+                    // For Qwen3 with embeddings enabled, we share the tensor between logits and embeddings
+                    GGML_ASSERT(n_outputs * n_vocab <= (int64_t) logits_size);
+                } else {
+                    // Standard check for other model architectures
+                    GGML_ASSERT((n_outputs_prev + n_outputs) * n_vocab <= (int64_t) logits_size);
+                }
+
                 ggml_backend_tensor_get_async(backend_res, t_logits, logits_out, 0, n_outputs*n_vocab*sizeof(float));
             }
         }
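The only difference between the two assertions in this hunk is which row count is bounded against logits_size: the cumulative output count across micro-batches, or just the current micro-batch. A worked check of both bounds with made-up sizes (none of these numbers come from a real model):

```cpp
#include <cassert>
#include <cstdint>
#include <initializer_list>

int main() {
    const int64_t n_vocab       = 151936;                // illustrative vocabulary size
    const int64_t n_outputs_all = 8;                     // outputs requested for the whole batch
    const int64_t logits_size   = n_vocab*n_outputs_all; // floats reserved by output_reserve()

    int64_t n_outputs_prev = 0;
    for (int64_t n_outputs : {3, 5}) {                   // two micro-batches
        // standard path: all rows written so far must fit the buffer
        assert((n_outputs_prev + n_outputs)*n_vocab <= logits_size);
        // shared-tensor path: only the current micro-batch's rows are checked
        assert(n_outputs*n_vocab <= logits_size);
        n_outputs_prev += n_outputs;
    }
    return 0;
}
```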
@@ -1170,7 +1183,12 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {

     const auto n_batch = cparams.n_batch;
     const auto n_vocab = vocab.n_tokens();
-    const auto n_embd = hparams.n_embd;
+    int64_t n_embd = hparams.n_embd;
+
+    // For Qwen3, n_embd is equal to n_vocab
+    if (model.arch == LLM_ARCH_QWEN3) {
+        n_embd = n_vocab;
+    }

     // TODO: use a per-batch flag for logits presence instead
     bool has_logits = !cparams.embeddings;
@@ -1182,8 +1200,19 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
         has_embd = true;
     }

-    logits_size = has_logits ? n_vocab*n_outputs_max : 0;
-    embd_size   = has_embd   ? n_embd*n_outputs_max  : 0;
+    // For Qwen3 models, both logits and embeddings point to the same tensor
+    bool shared_tensor = (model.arch == LLM_ARCH_QWEN3);
+
+    // Adjust buffer sizes for the case where both tensors are shared
+    if (shared_tensor && has_logits && has_embd) {
+        // For Qwen3, we only need one buffer since logits and embeddings share the same tensor
+        logits_size = n_vocab * n_outputs_max;
+        embd_size   = 0; // No need for a separate embedding buffer
+    } else {
+        // Normal case - separate buffers
+        logits_size = has_logits ? n_vocab * n_outputs_max : 0;
+        embd_size   = has_embd   ? n_embd  * n_outputs_max : 0;
+    }

     if (output_ids.empty()) {
         // init, never resized afterwards
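To make the sizing arithmetic in this last hunk concrete, here is a self-contained sketch of the two paths with assumed flag values and illustrative sizes (the vocabulary size and output count are not taken from a real model):

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t n_vocab       = 151936; // illustrative Qwen3-style vocabulary size
    const int64_t n_embd        = n_vocab; // per the patch, Qwen3 uses n_vocab as the row width here
    const int64_t n_outputs_max = 8;       // assumed number of reserved output rows

    const bool has_logits    = true;       // assumed flag values for illustration
    const bool has_embd      = true;
    const bool shared_tensor = true;       // Qwen3: logits and embeddings share one tensor

    int64_t logits_size = 0;
    int64_t embd_size   = 0;

    if (shared_tensor && has_logits && has_embd) {
        // shared path: one vocab-sized buffer serves both logits and embeddings
        logits_size = n_vocab * n_outputs_max;
        embd_size   = 0;
    } else {
        // standard path: independent buffers per output kind
        logits_size = has_logits ? n_vocab * n_outputs_max : 0;
        embd_size   = has_embd   ? n_embd  * n_outputs_max : 0;
    }

    printf("logits_size = %lld floats, embd_size = %lld floats\n",
           (long long) logits_size, (long long) embd_size);
    return 0;
}
```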