b4rtaz
diff --git a/src/llm.cpp b/src/llm.cpp
Lines changed: 4 additions & 0 deletions
diff --git a/src/nn/nn-core.cpp b/src/nn/nn-core.cpp
Lines changed: 2 additions & 1 deletion
diff --git a/src/nn/nn-cpu-ops.cpp b/src/nn/nn-cpu-ops.cpp
Lines changed: 9 additions & 2 deletions
@@ -621,6 +621,7 @@ void loadLlmNetWeight(const char *path, LlmNet *net, NnRootWeightLoader *loader)
     printf("💿 Loading weights...\n");
 #endif
 
+    Timer timer;
     NnByte *data = (NnByte *)file.data;
     NnByte *b = &data[net->header->headerSize];
     b += loader->loadRoot("embedding", 0, net->tokenEmbeddingSize.nBytes, b);
@@ -651,6 +652,9 @@ void loadLlmNetWeight(const char *path, LlmNet *net, NnRootWeightLoader *loader)
 
         b += loader->loadAll("block_norm_0", layerIndex, net->rmsNormSize.nBytes, b);
         b += loader->loadAll("block_norm_1", layerIndex, net->rmsNormSize.nBytes, b);
+
+        if (timer.elapsedMiliseconds() > 10000)
+            printf("💿 Loaded %u/%u\n", layerIndex + 1, net->header->nLayers);
     }
 
     b += loader->loadAll("final_norm", 0u, net->rmsNormSize.nBytes, b);
 
@@ -86,8 +86,9 @@ const char *opCodeToString(NnOpCode code) {
     if (code == OP_CAST) return "CAST";
     if (code == OP_REPEAT_Z) return "REPEAT_Z";
     if (code == OP_SHIFT) return "SHIFT";
+    if (code == OP_SOFTMAX) return "SOFTMAX";
     if (code == OP_MOE_GATE) return "MOE_GATE";
-    throw std::invalid_argument("Unknown op code");
+    throw std::invalid_argument("Unknown op code: " + std::to_string(code));
 }
 
 const char *opQuantTypeToString(NnOpQuantType type) {
 
@@ -1149,8 +1149,9 @@ static void matmulForward_F32_F32_F32(NnUint nThreads, NnUint threadIndex, NnUin
                 ? 0u
                 : (NnUint)activeExpertIndexes[y * config->nActiveExperts + e];
 
+            float *output = (float *)context->output[e * context->outputSize.y + y];
             matmul_F32_F32_F32(
-                (float *)context->output[e * context->outputSize.y + y],
+                output,
                 (float *)context->input[e * context->inputSize.y + y],
                 (float *)&context->weight[activeExpertIndex * context->weightSize.nBytesXY],
                 context->weightSize.y,
@@ -1176,14 +1177,16 @@ static void matmulForward_Q80_Q40_F32(NnUint nThreads, NnUint threadIndex, NnUin
                 ? 0u
                 : (NnUint)activeExpertIndexes[y * config->nActiveExperts + e];
 
+            float *output = (float *)context->output[e * context->outputSize.y + y];
             matmul_Q80_Q40_F32(
-                (float *)context->output[e * context->outputSize.y + y],
+                output,
                 (NnBlockQ80 *)context->input[e * context->inputSize.y + y],
                 (NnBlockQ40 *)&context->weight[activeExpertIndex * context->weightSize.nBytesXY],
                 context->weightSize.y,
                 context->weightSize.x,
                 nThreads,
                 threadIndex);
+            DEBUG_VECTOR(context, "output", output);
         }
     }
 }
@@ -1478,9 +1481,13 @@ static void moeGateForward_F32_F32(NnUint nThreads, NnUint threadIndex, NnUint b
         for (NnUint k = 0u; k < config->k; k++) {
             const NnUint p = pos[k];
             indexes[y * config->k + k] = (float)p;
+
+            // (nActiveExperts, nBatches, 1)
             float *output = (float *)context->output[k * context->outputSize.y + y];
             *output = input[p] / sum;
         }
+
+        DEBUG_VECTOR(context, "indexes", (&indexes[y * config->k]));
     }
 }