@@ -2708,8 +2708,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, 3 * n_embd }, 0);
         layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
 
-        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_ff, n_embd }, 0); // [3072, 384]
-        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_embd, 2 * n_ff }, 0);
+        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, 2 * n_ff }, 0);
+        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd }, 0);
         layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
     }
 } break;
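The corrected shapes follow ggml's convention that a weight tensor is stored as {n_in, n_out}, so ggml_mul_mat(W, x) with x of shape {n_in, n_tokens} yields {n_out, n_tokens}. A minimal sketch of the shapes this implies for the fused up+gate projection (names and comments are illustrative, not from the model config):

    // shape convention assumed by the change above
    // ffn_up   : {n_embd, 2*n_ff}  -> ggml_mul_mat(ffn_up,   x) : {2*n_ff, n_tokens}  (fused up+gate)
    // ffn_down : {n_ff,   n_embd}  -> ggml_mul_mat(ffn_down, h) : {n_embd, n_tokens}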
@@ -7548,6 +7548,7 @@ struct llm_build_modern_bert : public llm_graph_context {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa  = hparams.n_embd_v_gqa(); // == n_head_kv * n_embd_head
         const int64_t n_tokens    = ubatch.n_tokens;
+        const int64_t n_ff        = hparams.n_ff();
 
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
 
@@ -7667,30 +7668,63 @@ struct llm_build_modern_bert : public llm_graph_context {
 
         // MLP (prefer GEGLU if gate exists or up has 2*n_ff rows)
         ggml_tensor * mlp_out = nullptr;
-        const bool has_gate_tensor = (model.layers[il].ffn_gate != nullptr);
-        const bool up_is_2x        = (model.layers[il].ffn_up && model.layers[il].ffn_up->ne[0] == 2*hparams.n_ff());
+        ggml_tensor * ffn_gate_view = model.layers[il].ffn_gate;
+        ggml_tensor * ffn_up_view   = model.layers[il].ffn_up;
+
+        if (ffn_gate_view == nullptr && ffn_up_view) {
+
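+            // ModernBERT's Wi fuses the up and gate projections into a single tensor, so the GGUF
+            // carries no separate ffn_gate; split ffn_up into two views instead. Two storage
+            // orientations are handled below, depending on how the fused weight was written at
+            // conversion time.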
+            // Case A: weight stored as {2*n_ff, n_embd} -> split dim 0 into two {n_ff, n_embd} views
+            if (ffn_up_view->ne[0] == 2 * n_ff && ffn_up_view->ne[1] == n_embd) {
+                // first n_ff columns of each row -> up
+                ffn_up_view = ggml_view_2d(ctx0, model.layers[il].ffn_up,
+                        /*ne0*/ n_ff, /*ne1*/ n_embd,
+                        /*nb1*/ model.layers[il].ffn_up->nb[1],
+                        /*offset_bytes*/ 0);
+                // remaining n_ff columns -> gate (offset = n_ff elements along dim 0, in bytes)
+                ffn_gate_view = ggml_view_2d(ctx0, model.layers[il].ffn_up,
+                        /*ne0*/ n_ff, /*ne1*/ n_embd,
+                        /*nb1*/ model.layers[il].ffn_up->nb[1],
+                        /*offset_bytes*/ ggml_row_size(model.layers[il].ffn_up->type, n_ff));
+            }
+            // Case B: weight stored as {n_embd, 2*n_ff} -> split dim 1 into two {n_embd, n_ff} views
+            else if (ffn_up_view->ne[0] == n_embd && ffn_up_view->ne[1] == 2 * n_ff) {
+                // first n_ff rows -> up
+                ffn_up_view = ggml_view_2d(ctx0, model.layers[il].ffn_up,
+                        n_embd, n_ff,
+                        model.layers[il].ffn_up->nb[1],
+                        0);
+                ffn_up_view = ggml_cont(ctx0, ffn_up_view);
+
+                // next n_ff rows -> gate (offset = n_ff full rows, in bytes)
+                ffn_gate_view = ggml_view_2d(ctx0, model.layers[il].ffn_up,
+                        n_embd, n_ff,
+                        model.layers[il].ffn_up->nb[1],
+                        (size_t) n_ff * model.layers[il].ffn_up->nb[1]);
+                ffn_gate_view = ggml_cont(ctx0, ffn_gate_view);
+            }
+
+            ggml_tensor * ffn_down_view = model.layers[il].ffn_down;
+            LLAMA_LOG_INFO("ffn shapes: up: {%lld, %lld}, gate: {%lld, %lld}, down: {%lld, %lld}\n",
+                    ffn_up_view->ne[0], ffn_up_view->ne[1], ffn_gate_view->ne[0], ffn_gate_view->ne[1], ffn_down_view->ne[0], ffn_down_view->ne[1]);
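+            // After the split, up and gate should have identical shapes ({n_embd, n_ff} for Case B,
+            // {n_ff, n_embd} for Case A) and ffn_down should be {n_ff, n_embd}; the log above is a
+            // quick sanity check that one of the two expected layouts was matched.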
 
-            if (has_gate_tensor || up_is_2x) {
             mlp_out = build_ffn(
                     h,
-                    model.layers[il].ffn_up,   /*up_b*/ NULL, /*up_shexp*/ NULL,
-                    model.layers[il].ffn_gate, /*gate_b*/ NULL, /*gate_shexp*/ NULL,
+                    ffn_up_view,               /*up_b*/ NULL, /*up_shexp*/ NULL,
+                    ffn_gate_view,             /*gate_b*/ NULL, /*gate_shexp*/ NULL,
                     model.layers[il].ffn_down, /*down_b*/ NULL, /*down_shexp*/ NULL,
                     /*expert_scores*/ NULL,
-                    LLM_FFN_GEGLU, LLM_FFN_PAR, il);
-            cb(mlp_out, "ffn_out_geglu", il);
+                    LLM_FFN_GEGLU, LLM_FFN_PAR, il
+            );
+            cb(mlp_out, "ffn_out_geglu", il);
         } else {
-
-            LLAMA_LOG_INFO("Ffn_up : {%lld, %lld}, ffn_down : {%lld, %lld}\n", model.layers[il].ffn_up->ne[0], model.layers[il].ffn_up->ne[1],
-                    model.layers[il].ffn_down->ne[0], model.layers[il].ffn_down->ne[0]);
             mlp_out = build_ffn(
                     h,
-                    model.layers[il].ffn_up,   /*up_b*/ NULL, /*up_shexp*/ NULL,
-                    /*gate*/ NULL,             /*gate_b*/ NULL, /*gate_shexp*/ NULL,
-                    model.layers[il].ffn_down, /*down_b*/ NULL, /*down_shexp*/ NULL,
-                    /*expert_scores*/ NULL,
-                    LLM_FFN_GELU, LLM_FFN_SEQ, il);
-            cb(mlp_out, "ffn_out_gelu", il);
+                    model.layers[il].ffn_up,   NULL, NULL,
+                    model.layers[il].ffn_gate, NULL, NULL,
+                    model.layers[il].ffn_down, NULL, NULL,
+                    NULL,
+                    LLM_FFN_GEGLU, LLM_FFN_PAR, il
+            );
+            cb(mlp_out, "ffn_out_geglu", il);
         }
 
         // Residual after MLP
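For reference, a standalone sketch of the split performed in Case B above: given a fused projection stored as {n_embd, 2*n_ff}, two ggml_view_2d calls with the original row stride (nb[1]) and a byte offset of n_ff rows yield separate up and gate halves. All names, toy sizes, and the choice of which half is activated are illustrative assumptions, not taken from the model or the conversion script; the ggml calls themselves are standard API.

    #include "ggml.h"

    int main(void) {
        const int64_t n_embd = 8, n_ff = 16, n_tokens = 4;   // toy sizes for illustration

        ggml_init_params params = { /*mem_size*/ 16*1024*1024, /*mem_buffer*/ nullptr, /*no_alloc*/ false };
        ggml_context * ctx = ggml_init(params);

        // fused up+gate weight, laid out like ffn_up after this change: {n_embd, 2*n_ff}
        ggml_tensor * w_fused = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, 2*n_ff);
        ggml_tensor * x       = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);

        // first n_ff rows -> up, next n_ff rows -> gate (offset = n_ff rows, in bytes)
        ggml_tensor * w_up   = ggml_view_2d(ctx, w_fused, n_embd, n_ff, w_fused->nb[1], 0);
        ggml_tensor * w_gate = ggml_view_2d(ctx, w_fused, n_embd, n_ff, w_fused->nb[1],
                                            (size_t) n_ff * w_fused->nb[1]);

        // gated MLP up-projection: gelu(gate(x)) * up(x); both projections are {n_ff, n_tokens}
        ggml_tensor * up   = ggml_mul_mat(ctx, w_up,   x);
        ggml_tensor * gate = ggml_mul_mat(ctx, w_gate, x);
        ggml_tensor * h    = ggml_mul(ctx, ggml_gelu(ctx, gate), up);

        ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, h);
        ggml_graph_compute_with_ctx(ctx, gf, /*n_threads*/ 1);   // data is uninitialized; only shapes are checked

        GGML_ASSERT(h->ne[0] == n_ff && h->ne[1] == n_tokens);

        ggml_free(ctx);
        return 0;
    }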