@@ -451,7 +451,6 @@ void llama_model::load_arch(llama_model_loader & ml) {
 }
 
 void llama_model::load_hparams(llama_model_loader & ml) {
-
     const gguf_context * ctx = ml.meta.get();
 
     // get metadata as string
@@ -465,7 +464,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
         gguf_kv.emplace(name, value);
     }
 
-
     // get general kv
     ml.get_key(LLM_KV_GENERAL_NAME, name, false);
 
@@ -586,7 +584,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     }
 
     // arch-specific KVs
-    LLAMA_LOG_INFO("Switching Arch\n");
     switch (arch) {
         case LLM_ARCH_LLAMA:
             {
@@ -1901,6 +1898,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
 
 void llama_model::load_vocab(llama_model_loader & ml) {
     const auto kv = LLM_KV(arch);
+
     vocab.load(ml, kv);
 }
 
@@ -2045,7 +2043,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
     auto create_tensor = [&](const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags) -> ggml_tensor * {
         ggml_tensor * t_meta = ml.get_tensor_meta(tn.str().c_str());
-        LLAMA_LOG_INFO("Creating Tensor: %s\n", tn.str().c_str());
+
         if (!t_meta) {
             if (flags & TENSOR_NOT_REQUIRED) {
                 return nullptr;
@@ -2120,6 +2118,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         }
 
         ggml_backend_buffer_type_t buft = nullptr;
+
         // check overrides
         if (ml.tensor_buft_overrides) {
             std::string tensor_name = tn.str();
@@ -2167,6 +2166,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 first_moved_to_buft = buft;
             }
         }
+
         ggml_context * ctx = ctx_for_buft(buft);
 
         // if duplicated, check if the original tensor was allocated in the same buffer type context and avoid creating a new one
@@ -2624,26 +2624,26 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             case LLM_ARCH_NOMIC_BERT_MOE:
                 {
                     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-                    tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
-
                     type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, TENSOR_NOT_REQUIRED);
-                    tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
-
 
                     if (arch == LLM_ARCH_BERT) {
                         pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
+
                         cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
                         cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
 
                         cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
                         cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
                     }
 
+                    tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
+                    tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
+
                     for (int i = 0; i < n_layer; ++i) {
                         auto & layer = layers[i];
 
                         layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
-
+                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
 
                         if (!layer.wqkv) {
                             layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
@@ -2657,8 +2657,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         }
 
                         layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-
-                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
+
                         layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
                         layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);
 
@@ -2668,7 +2667,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                             layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
                             layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
                         } else {
-
                             layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                             layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
 
@@ -2683,7 +2681,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
                         layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
                         layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
-
                     }
                 } break;
             case LLM_ARCH_MODERN_BERT:
@@ -7549,7 +7546,6 @@ struct llm_build_modern_bert : public llm_graph_context {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
         const int64_t n_tokens = ubatch.n_tokens;
-        const int64_t n_ff = hparams.n_ff();
 
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
 