Commit ba2135c

gemma : allow offloading the output tensor (ggml-org#5646)
1 parent 89febfe commit ba2135c

File tree: 1 file changed (3 additions, 1 deletion)

llama.cpp

Lines changed: 3 additions & 1 deletion
@@ -4394,6 +4394,8 @@ static bool llm_load_tensors(
 
                 // output
                 model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // same as tok_embd, duplicated to allow offloading
+                ml.n_created--; // artificial tensor
 
                 const int64_t n_ff = hparams.n_ff;
                 const int64_t n_embd_head_k = hparams.n_embd_head_k;
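
Loader side: Gemma GGUF files carry a single tied weight matrix that serves as both the token embedding and the output projection, so there is no separate output tensor to load. The added lines register the token-embedding weights a second time as model.output, placed in ctx_output, so the existing placement and offloading logic for output-layer tensors applies to the lm_head as well. Because this entry is, as the in-diff comment says, an "artificial tensor" backed by data already accounted for, ml.n_created-- keeps the loader's created-tensor count consistent with the number of tensors actually present in the file.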
@@ -7525,7 +7527,7 @@ struct llm_build_context {
         cb(cur, "result_norm", -1);
 
         // lm_head
-        cur = ggml_mul_mat(ctx0, model.tok_embd, cur);
+        cur = ggml_mul_mat(ctx0, model.output, cur);
         cb(cur, "result_output", -1);
 
         ggml_build_forward_expand(gf, cur);
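
To see why the duplicated name is harmless for the computation, here is a minimal standalone sketch against the plain ggml C API (not llama.cpp itself, CPU only); the names tok_embd, output, n_embd and n_vocab are illustrative stand-ins, and ggml_graph_compute_with_ctx is the simple compute path available in ggml around the time of this commit. The lm_head is just a ggml_mul_mat against the embedding matrix, so referencing that matrix under a second name such as output yields identical logits; what the duplicate entry changes in llama.cpp is only where the weight may be placed, i.e. whether it can be offloaded.

// Minimal sketch, plain ggml API, CPU only (not llama.cpp code).
// "tok_embd", "output", n_embd, n_vocab are illustrative stand-ins.
#include "ggml.h"
#include <cstdio>

int main() {
    const int n_embd  = 4;
    const int n_vocab = 8;

    struct ggml_init_params params = {
        /*.mem_size   =*/ 16u * 1024 * 1024,
        /*.mem_buffer =*/ nullptr,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // one weight matrix, two names: "output" is simply an alias of "tok_embd"
    struct ggml_tensor * tok_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
    struct ggml_tensor * output   = tok_embd;
    ggml_set_f32(tok_embd, 0.5f);                  // dummy tied weights

    struct ggml_tensor * cur = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
    ggml_set_f32(cur, 1.0f);                       // dummy final hidden state

    // lm_head: logits over the vocabulary, shape [n_vocab]
    struct ggml_tensor * logits = ggml_mul_mat(ctx, output, cur);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, logits);
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads =*/ 1);

    // every logit is n_embd * 0.5 * 1.0 = 2.0, regardless of which alias was used
    printf("logit[0] = %f\n", ggml_get_f32_1d(logits, 0));

    ggml_free(ctx);
    return 0;
}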
