@@ -1149,8 +1149,9 @@ static void matmulForward_F32_F32_F32(NnUint nThreads, NnUint threadIndex, NnUin
                 ? 0u
                 : (NnUint)activeExpertIndexes[y * config->nActiveExperts + e];
 
+            float *output = (float *)context->output[e * context->outputSize.y + y];
             matmul_F32_F32_F32(
-                (float *)context->output[e * context->outputSize.y + y],
+                output,
                 (float *)context->input[e * context->inputSize.y + y],
                 (float *)&context->weight[activeExpertIndex * context->weightSize.nBytesXY],
                 context->weightSize.y,
@@ -1176,14 +1177,16 @@ static void matmulForward_Q80_Q40_F32(NnUint nThreads, NnUint threadIndex, NnUin
                 ? 0u
                 : (NnUint)activeExpertIndexes[y * config->nActiveExperts + e];
 
+            float *output = (float *)context->output[e * context->outputSize.y + y];
             matmul_Q80_Q40_F32(
-                (float *)context->output[e * context->outputSize.y + y],
+                output,
                 (NnBlockQ80 *)context->input[e * context->inputSize.y + y],
                 (NnBlockQ40 *)&context->weight[activeExpertIndex * context->weightSize.nBytesXY],
                 context->weightSize.y,
                 context->weightSize.x,
                 nThreads,
                 threadIndex);
+            DEBUG_VECTOR(context, "output", output);
         }
     }
 }
@@ -1478,9 +1481,13 @@ static void moeGateForward_F32_F32(NnUint nThreads, NnUint threadIndex, NnUint b
         for (NnUint k = 0u; k < config->k; k++) {
             const NnUint p = pos[k];
             indexes[y * config->k + k] = (float)p;
+
+            // (nActiveExperts, nBatches, 1)
             float *output = (float *)context->output[k * context->outputSize.y + y];
             *output = input[p] / sum;
         }
+
+        DEBUG_VECTOR(context, "indexes", (&indexes[y * config->k]));
     }
 }
 
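Note on the DEBUG_VECTOR calls added above: the macro itself is defined elsewhere in the tree and is not part of this diff. As a rough illustration only, a minimal sketch of such a helper could look like the following; the DEBUG_OP_OUTPUT guard name, the fixed element count, and the unused context argument are assumptions, not the project's actual definition.

#include <stdio.h>

/* Hypothetical sketch of a DEBUG_VECTOR-style helper (not the real macro from this repo).
 * Prints a short label followed by the first few elements of a float vector, and compiles
 * to nothing when the (assumed) DEBUG_OP_OUTPUT flag is not defined. */
#ifdef DEBUG_OP_OUTPUT
#define DEBUG_VECTOR(context, label, vec) do { \
    (void)(context); /* the real macro may also use the op context, e.g. for its name */ \
    const float *v_ = (const float *)(vec); \
    printf("%s:", (label)); \
    for (int i_ = 0; i_ < 4; i_++) \
        printf(" %f", v_[i_]); \
    printf("\n"); \
} while (0)
#else
#define DEBUG_VECTOR(context, label, vec) ((void)0)
#endif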