@@ -3985,7 +3985,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
                     // output
                     output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+                    // if output is NULL, init from the input tok embed
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                    }
 
                     for (int i = 0; i < n_layer; ++i) {
                         auto & layer = layers[i];
@@ -18007,9 +18011,6 @@ struct llm_build_seed_oss : public llm_graph_context {
 
             // self-attention
             {
-                // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
-
                 // compute Q and K and RoPE them
                 ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
                 cb(Qcur, "Qcur", il);
@@ -18037,13 +18038,13 @@ struct llm_build_seed_oss : public llm_graph_context {
                 Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
 
                 Qcur = ggml_rope_ext(
-                        ctx0, Qcur, inp_pos, rope_factors,
+                        ctx0, Qcur, inp_pos, nullptr,
                         n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                         ext_factor, attn_factor, beta_fast, beta_slow
                         );
 
                 Kcur = ggml_rope_ext(
-                        ctx0, Kcur, inp_pos, rope_factors,
+                        ctx0, Kcur, inp_pos, nullptr,
                         n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                         ext_factor, attn_factor, beta_fast, beta_slow
                         );
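
The first hunk makes the LM head optional for this architecture: with TENSOR_NOT_REQUIRED the loader returns NULL when the model file has no output.weight, and the fallback then reuses the input token embedding as the output head (TENSOR_DUPLICATED), i.e. tied embeddings. A minimal standalone sketch of that fallback pattern, using hypothetical toy types rather than the real create_tensor machinery:

    #include <cstdio>

    // Illustrative stand-in for the loader state: a pointer is nullptr when the
    // corresponding tensor is absent from the model file. Names mirror the GGUF
    // tensors referenced in the diff; the types and functions are hypothetical.
    struct toy_model {
        const float * token_embd = nullptr; // "token_embd.weight", always present
        const float * output     = nullptr; // "output.weight", absent when embeddings are tied
    };

    static const float * resolve_output_head(const toy_model & m) {
        // Prefer a dedicated output head when the file provides one ...
        if (m.output != nullptr) {
            return m.output;
        }
        // ... otherwise fall back to the input token embedding, which is what
        // the TENSOR_DUPLICATED path in the diff expresses.
        return m.token_embd;
    }

    int main() {
        toy_model m;
        const float embd[4] = {0.0f, 0.0f, 0.0f, 0.0f};
        m.token_embd = embd; // file ships only the token embedding
        std::printf("tied output head: %s\n",
                    resolve_output_head(m) == m.token_embd ? "yes" : "no");
        return 0;
    }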