1 change: 1 addition & 0 deletions include/llama.h
@@ -482,6 +482,7 @@ extern "C" {

LLAMA_API int32_t llama_model_n_ctx_train(const struct llama_model * model);
LLAMA_API int32_t llama_model_n_embd (const struct llama_model * model);
LLAMA_API int32_t llama_model_n_embd_inp (const struct llama_model * model);
LLAMA_API int32_t llama_model_n_layer (const struct llama_model * model);
LLAMA_API int32_t llama_model_n_head (const struct llama_model * model);
LLAMA_API int32_t llama_model_n_head_kv (const struct llama_model * model);
8 changes: 4 additions & 4 deletions src/llama-context.cpp
@@ -620,7 +620,7 @@ float * llama_context::get_embeddings_ith(int32_t i) {
throw std::runtime_error(format("corrupt output buffer (j=%" PRId64 ", n_outputs=%d)", j, n_outputs));
}

return embd + j*model.hparams.n_embd;
return embd + j*model.hparams.n_embd_inp();
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, err.what());
#ifndef NDEBUG
@@ -808,7 +808,7 @@ int llama_context::encode(const llama_batch & batch_inp) {

const auto & hparams = model.hparams;

const int64_t n_embd = hparams.n_embd;
const int64_t n_embd = hparams.n_embd_inp();
const int64_t n_vocab = model.vocab.n_tokens();

// note: during encode, we always pass the full sequence starting from pos = 0
@@ -977,7 +977,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
const auto & hparams = model.hparams;

const int64_t n_vocab = vocab.n_tokens();
const int64_t n_embd = hparams.n_embd;
const int64_t n_embd = hparams.n_embd_inp();

// when computing embeddings, all tokens are output
const bool output_all = cparams.embeddings;
@@ -2135,7 +2135,7 @@ void llama_context::opt_epoch_iter(
batch.logits [pos_batch] = true;
}

if (!balloc->init(batch, model.vocab, nullptr, model.hparams.n_embd, cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, true)) {
if (!balloc->init(batch, model.vocab, nullptr, model.hparams.n_embd_inp(), cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, true)) {
LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
return;
}
4 changes: 2 additions & 2 deletions src/llama-graph.cpp
@@ -1142,7 +1142,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn(

// input embeddings with optional lora
ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
const int64_t n_embd = hparams.n_embd;
const int64_t n_embd = hparams.n_embd_inp();

auto inp = std::make_unique<llm_graph_input_embd>();

@@ -1279,7 +1279,7 @@ ggml_tensor * llm_graph_context::build_inp_cross_embd() const {
// return cur;
//}

const auto n_embd = !cross->v_embd.empty() ? cross->n_embd : hparams.n_embd;
const auto n_embd = !cross->v_embd.empty() ? cross->n_embd : hparams.n_embd_inp();
const auto n_enc = !cross->v_embd.empty() ? cross->n_enc : hparams.n_ctx_train;

cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_enc);
10 changes: 10 additions & 0 deletions src/llama-hparams.cpp
@@ -60,6 +60,16 @@ uint32_t llama_hparams::n_gqa(uint32_t il) const {
return n_head/n_head_kv;
}

uint32_t llama_hparams::n_embd_inp() const {
uint32_t n_embd_inp = n_embd;

if (n_deepstack_layers > 0) {
n_embd_inp += n_embd * n_deepstack_layers;
}

return n_embd_inp;
}

uint32_t llama_hparams::n_embd_k_gqa(uint32_t il) const {
const uint32_t n_head_kv = this->n_head_kv(il);

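Note: for models without deepstack layers the new helper simply returns n_embd, while for Qwen3-VL-style models it returns the main embedding width plus one extra n_embd per deepstack layer. A minimal standalone sketch of the same arithmetic (the concrete widths below are hypothetical, not taken from any real checkpoint):

#include <cstdint>
#include <cstdio>

// mirrors llama_hparams::n_embd_inp(): main embedding width plus one extra
// n_embd-sized slot per deepstack layer stacked along the feature dimension
static uint32_t n_embd_inp(uint32_t n_embd, uint32_t n_deepstack_layers) {
    return n_embd + n_embd * n_deepstack_layers;
}

int main() {
    printf("%u\n", n_embd_inp(2048, 3)); // hypothetical Qwen3-VL-like config: 2048 * (3 + 1) = 8192
    printf("%u\n", n_embd_inp(4096, 0)); // text-only model: no deepstack layers, equal to n_embd
    return 0;
}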
3 changes: 3 additions & 0 deletions src/llama-hparams.h
@@ -227,6 +227,9 @@ struct llama_hparams {

uint32_t n_gqa(uint32_t il = 0) const;

// dimension of main + auxiliary input embeddings
uint32_t n_embd_inp() const;

// dimension of key embeddings across all k-v heads
uint32_t n_embd_k_gqa(uint32_t il = 0) const;

28 changes: 10 additions & 18 deletions src/llama-model.cpp
@@ -276,8 +276,8 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w
} break;
case GGML_OP_IM2COL:
{
const int n_embd = hparams.n_embd;
ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd, w->ne[1], 1, 1);
const int n_embd_inp = hparams.n_embd_inp();
ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd_inp, w->ne[1], 1, 1);

Collaborator Author commented:
I'm unsure if this is correct as well...

Collaborator replied:
I think it doesn't matter much whether it's n_embd_inp or n_embd here; at least for now, models that use this op will have n_embd_inp == n_embd.

op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16);
} break;
case GGML_OP_SCALE:
@@ -1039,9 +1039,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
case 64: type = LLM_TYPE_32B; break;
default: type = LLM_TYPE_UNKNOWN;
}
// since vision model stacks deepstack features along feature dim
// we also create a fake "n_embd" for text model to be the main embd + deepstack embds
hparams.n_embd *= hparams.n_deepstack_layers + 1;
} break;
case LLM_ARCH_QWEN3MOE:
{
@@ -1065,9 +1062,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
case 94: type = LLM_TYPE_235B_A22B; break;
default: type = LLM_TYPE_UNKNOWN;
}
// since vision model stacks deepstack features along feature dim
// we also create a fake "n_embd" for text model to be the main embd + deepstack embds
hparams.n_embd *= hparams.n_deepstack_layers + 1;
} break;
case LLM_ARCH_PHI2:
{
@@ -3332,10 +3326,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
case LLM_ARCH_QWEN3:
case LLM_ARCH_QWEN3VL:
{
// for model loading, the weights only have the main embd
// so we need to divide by the number of deepstack layers + 1
// n_embd is const int so we declare a new variable
int64_t n_embd = hparams.n_embd / (hparams.n_deepstack_layers + 1);
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

// output
@@ -3371,10 +3361,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
case LLM_ARCH_QWEN3MOE:
case LLM_ARCH_QWEN3VLMOE:
{
// for model loading, the weights only have the main embd
// so we need to divide by the number of deepstack layers + 1
// n_embd is const int so we declare a new variable
int64_t n_embd = hparams.n_embd / (hparams.n_deepstack_layers + 1);
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

// output
@@ -6482,6 +6468,7 @@ void llama_model::print_info() const {
if (!hparams.vocab_only) {
LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd);
LLAMA_LOG_INFO("%s: n_embd_inp = %u\n", __func__, hparams.n_embd_inp());
LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
LLAMA_LOG_INFO("%s: n_head = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head(il); }, hparams.n_layer).c_str());
LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());
@@ -6681,8 +6668,9 @@ ggml_backend_buffer_type_t llama_model::select_buft(int il) const {
return ::select_buft(
*pimpl->dev_layer.at(il).buft_list,
[&](ggml_context * ctx) {
ggml_tensor * cur = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
ggml_tensor * layer_dir = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
const int n_embd_inp = hparams.n_embd_inp();
ggml_tensor * cur = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd_inp);
ggml_tensor * layer_dir = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd_inp);
return ggml_add(ctx, cur, layer_dir);
});
}
@@ -7329,6 +7317,10 @@ int32_t llama_model_n_embd(const llama_model * model) {
return model->hparams.n_embd;
}

int32_t llama_model_n_embd_inp(const llama_model * model) {
return model->hparams.n_embd_inp();
}

int32_t llama_model_n_layer(const llama_model * model) {
return model->hparams.n_layer;
}
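Note: with the fake n_embd multiplication removed from load_hparams, llama_model_n_embd() keeps reporting the width of the token-embedding weights, while the new llama_model_n_embd_inp() reports the (possibly larger) per-token input/output row width. A hedged consumer-side sketch, assuming a model and context created through the usual llama.h entry points (function and variable names are placeholders):

#include "llama.h"

#include <cstdio>

// sketch: query both widths and read one output-embedding row; after this
// change the rows returned by llama_get_embeddings_ith() are strided by
// n_embd_inp floats (equal to n_embd unless the model has deepstack layers,
// as in Qwen3-VL)
void inspect_embd_dims(const llama_model * model, llama_context * ctx, int32_t i) {
    const int32_t n_embd     = llama_model_n_embd(model);
    const int32_t n_embd_inp = llama_model_n_embd_inp(model);

    const float * row = llama_get_embeddings_ith(ctx, i);

    printf("n_embd = %d, n_embd_inp = %d, first value = %f\n",
           n_embd, n_embd_inp, row ? row[0] : 0.0f);
}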
3 changes: 1 addition & 2 deletions src/models/qwen3vl-moe.cpp
@@ -1,9 +1,8 @@
#include "models.h"

llm_build_qwen3vlmoe::llm_build_qwen3vlmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_full = hparams.n_embd; // main embd + deepstack embds
const size_t n_deepstack_layers = hparams.n_deepstack_layers;
const int64_t n_embd = n_embd_full / (n_deepstack_layers + 1);
const int64_t n_embd = hparams.n_embd;
const int64_t n_embd_head = hparams.n_embd_head_v;

GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
5 changes: 1 addition & 4 deletions src/models/qwen3vl.cpp
@@ -1,13 +1,10 @@
#include "models.h"

llm_build_qwen3vl::llm_build_qwen3vl(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {

const int64_t n_embd_full = hparams.n_embd; // main embd + deepstack embds
const size_t n_deepstack_layers = hparams.n_deepstack_layers;
const int64_t n_embd = n_embd_full / (n_deepstack_layers + 1);
const int64_t n_embd = hparams.n_embd;
const int64_t n_embd_head = hparams.n_embd_head_v;


GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);

2 changes: 1 addition & 1 deletion tools/mtmd/mtmd.cpp
@@ -151,7 +151,7 @@ struct mtmd_context {
print_timings(ctx_params.print_timings),
n_threads (ctx_params.n_threads),
media_marker (ctx_params.media_marker),
n_embd_text (llama_model_n_embd(text_model))
n_embd_text (llama_model_n_embd_inp(text_model))
{
if (std::string(ctx_params.image_marker) != MTMD_DEFAULT_IMAGE_MARKER) {
throw std::runtime_error("custom image_marker is not supported anymore, use media_marker instead");
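Note: mtmd now sizes its text-embedding dimension with llama_model_n_embd_inp(), so multimodal embeddings handed to the text model must provide n_embd_inp floats per token rather than n_embd. A rough sketch of that calling pattern, assuming an already-computed buffer of vision embeddings (the helper name and n_mm_tokens are placeholders; this is not code from this PR):

#include "llama.h"

#include <algorithm>
#include <cstddef>

// sketch: submit precomputed multimodal embeddings; llama_batch_init with
// embd != 0 allocates batch.embd as n_tokens * embd floats
void submit_mm_embeddings(const llama_model * model, llama_context * ctx,
                          const float * mm_embd, int32_t n_mm_tokens) {
    const int32_t n_embd_inp = llama_model_n_embd_inp(model);

    llama_batch batch = llama_batch_init(n_mm_tokens, n_embd_inp, 1);
    batch.n_tokens = n_mm_tokens;
    std::copy(mm_embd, mm_embd + (size_t) n_mm_tokens * n_embd_inp, batch.embd);
    // a real caller also fills batch.pos, batch.n_seq_id, batch.seq_id and batch.logits

    llama_decode(ctx, batch);
    llama_batch_free(batch);
}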