b4rtaz
diff --git a/converter/convert-hf.py b/converter/convert-hf.py
Lines changed: 16 additions & 9 deletions
diff --git a/converter/writer.py b/converter/writer.py
Lines changed: 2 additions & 1 deletion
diff --git a/launch.py b/launch.py
Lines changed: 5 additions & 0 deletions
diff --git a/src/llm.cpp b/src/llm.cpp
Lines changed: 197 additions & 72 deletions
diff --git a/src/llm.hpp b/src/llm.hpp
Lines changed: 9 additions & 5 deletions
diff --git a/src/nn/nn-config-builder.hpp b/src/nn/nn-config-builder.hpp
Lines changed: 3 additions & 3 deletions
diff --git a/src/nn/nn-core.cpp b/src/nn/nn-core.cpp
Lines changed: 17 additions & 7 deletions
diff --git a/src/nn/nn-core.hpp b/src/nn/nn-core.hpp
Lines changed: 48 additions & 16 deletions
@@ -8,6 +8,7 @@
 class ArchType:
     LLAMA = 0xABCD00
     QWEN3 = 0xABCD01
+    QWEN3_MOE = 0xABCD02
 
 def permute(tensor, nHeads: int, nKvHeads: int):
     if nHeads != nKvHeads:
@@ -71,22 +72,23 @@ def __preparePlan(self):
                 f'model.layers.{l}.self_attn.o_proj.weight'])
 
             if (self.config['n_experts'] > 0):
+                p.append([FloatType.F32, f'model.layers.{l}.mlp.gate.weight'])
                 for e in range(self.config['n_experts']):
                     p.append([wt,
-                        f'model.layers.{l}.block_sparse_moe.experts.{e}.w3.weight']) # up
+                        f'model.layers.{l}.mlp.experts.{e}.gate_proj.weight'])
                     p.append([wt,
-                        f'model.layers.{l}.block_sparse_moe.experts.{e}.w1.weight']) # gate
+                        f'model.layers.{l}.mlp.experts.{e}.down_proj.weight'])
                     p.append([wt,
-                        f'model.layers.{l}.block_sparse_moe.experts.{e}.w2.weight']) # down
+                        f'model.layers.{l}.mlp.experts.{e}.up_proj.weight'])
             else:
                 p.append([wt,
-                    f'model.layers.{l}.mlp.gate_proj.weight']) # gate
+                    f'model.layers.{l}.mlp.gate_proj.weight'])
                 p.append([wt,
-                    f'model.layers.{l}.mlp.down_proj.weight']) # down
+                    f'model.layers.{l}.mlp.down_proj.weight'])
                 p.append([wt,
-                    f'model.layers.{l}.mlp.up_proj.weight']) # up
+                    f'model.layers.{l}.mlp.up_proj.weight'])
 
-            if (self.archType == ArchType.QWEN3):
+            if (self.archType == ArchType.QWEN3 or self.archType == ArchType.QWEN3_MOE):
                 p.append([FloatType.F32,
                     f'model.layers.{l}.self_attn.q_norm.weight'])
                 p.append([FloatType.F32,
@@ -146,6 +148,7 @@ def parseArchType(type: str):
         'llama': ArchType.LLAMA,
         'mistral': ArchType.LLAMA,
         'qwen3': ArchType.QWEN3,
+        'qwen3_moe': ArchType.QWEN3_MOE,
     }.get(type)
     if (archType is None):
         raise Exception(f'Unsupported arch type: {type}')
@@ -202,8 +205,8 @@ def loadConfig(folderPath: str, weightsFloatType: int):
         'files': files,
     }
 
-    nExperts = config.get('num_local_experts')
-    nActiveExperts = config.get('num_active_local_experts') or config.get('num_experts_per_tok')
+    nExperts = config.get('num_experts')
+    nActiveExperts = config.get('num_experts_per_tok')
     result['n_experts'] = int(nExperts) if nExperts is not None else 0
     result['n_active_experts'] = int(nActiveExperts) if nActiveExperts is not None else 0
 
@@ -226,6 +229,10 @@ def loadConfig(folderPath: str, weightsFloatType: int):
     rmsNormEps = config.get('rms_norm_eps')
     if (rmsNormEps is not None):
         result['norm_epsilon'] = parseRmsNormEpsilon(rmsNormEps)
+
+    moeHiddenDim = config.get('moe_intermediate_size')
+    if (moeHiddenDim is not None):
+        result['moe_hidden_dim'] = int(moeHiddenDim)
     return result
 
 def printUsage():
 
@@ -128,7 +128,8 @@ def writeHeader(file, params):
         'rope_scaling_orig_max_seq_len': 17,
         'rope_type': 18,
         'head_dim': 19,
-        'norm_epsilon': 20
+        'norm_epsilon': 20,
+        'moe_hidden_dim': 21,
     }
     header = struct.pack('i', 0xA00ABCD)
 
 
@@ -65,6 +65,11 @@ def parts(length):
         'https://huggingface.co/b4rtaz/Qwen3-14B-Q40-Distributed-Llama/resolve/main/dllama_tokenizer_qwen3_14b.t?download=true',
         'q40', 'q80', 'chat', '--max-seq-len 4096'
     ],
+    'qwen3_30b_a3b_q40': [
+        list(map(lambda suffix : f'https://huggingface.co/b4rtaz/Qwen3-30B-A3B-Q40-Distributed-Llama/resolve/main/dllama_model_qwen3_30b_a3b_{suffix}?download=true', parts(5))),
+        'https://huggingface.co/b4rtaz/Qwen3-30B-A3B-Q40-Distributed-Llama/resolve/main/dllama_tokenizer_qwen3_30b_a3b.t?download=true',
+        'q40', 'q80', 'chat', '--max-seq-len 4096'
+    ],
 }
 
 def confirm(message: str):
 
@@ -26,7 +26,8 @@ enum LlmHeaderKey {
     ROPE_SCALING_ORIG_MAX_SEQ_LEN = 17,
     ROPE_TYPE = 18,
     HEAD_DIM = 19,
-    NORM_EPSILON = 20
+    NORM_EPSILON = 20,
+    MOE_HIDDEN_DIM = 21,
 };
 
 enum LlmHiddenAct {
@@ -36,7 +37,8 @@ enum LlmHiddenAct {
 
 enum LlmArchType {
     LLAMA = 0xABCD00,
-    QWEN3 = 0xABCD01
+    QWEN3 = 0xABCD01,
+    QWEN3_MOE = 0xABCD02,
 };
 
 typedef struct {
@@ -54,6 +56,7 @@ typedef struct {
     NnUint origSeqLen; // Original model context length
     NnUint seqLen; // Limited context length by the `--max-seq-len` argument
     NnUint hiddenDim;
+    NnUint moeHiddenDim;
     LlmHiddenAct hiddenAct;
     NnUint qDim;
     NnUint kvDim;
@@ -86,9 +89,10 @@ typedef struct {
     NnUint tokenPipeIndex;
     NnUint xPipeIndex;
     NnUint logitsPipeIndex;
-    NnSize2D tokenEmbeddingSize;
-    NnSize2D rmsNormSize;
-    NnSize2D qkRmsNormSize;
+    NnSize3D tokenEmbeddingSize;
+    NnSize3D rmsNormSize;
+    NnSize3D qkRmsNormSize;
+    NnSize3D moeGateSize;
 } LlmNet;
 
 LlmHeader loadLlmHeader(const char* path, const unsigned int maxSeqLen, NnFloatType syncType);
 
@@ -24,7 +24,7 @@ class NnNetConfigBuilder {
         this->nBatches = nBatches;
     }
 
-    NnUint addPipe(const char *name, NnSize2D size) {
+    NnUint addPipe(const char *name, NnSize3D size) {
         NnUint pipeIndex = pipes.size();
         pipes.push_back({ cloneString(name), size });
         return pipeIndex;
@@ -62,7 +62,7 @@ class NnNodeConfigBuilder {
         this->nodeIndex = nodeIndex;
     }
 
-    NnUint addBuffer(const char *name, NnSize2D size) {
+    NnUint addBuffer(const char *name, NnSize3D size) {
         NnUint bufferIndex = buffers.size();
         buffers.push_back({ cloneString(name), size });
         return bufferIndex;
@@ -98,7 +98,7 @@ class NnSegmentConfigBuilder {
 
 public:
     template <typename T>
-    void addOp(NnOpCode code, const char *name, NnUint index, NnPointerConfig input, NnPointerConfig output, NnSize2D weightSize, T config) {
+    void addOp(NnOpCode code, const char *name, NnUint index, NnPointerConfig input, NnPointerConfig output, NnSize3D weightSize, T config) {
         NnUint configSize = sizeof(T);
         NnByte *configCopy = new NnByte[configSize];
         std::memcpy(configCopy, &config, configSize);
 
@@ -72,6 +72,7 @@ NnOpQuantType getOpQuantType(NnFloatType input, NnFloatType weight, NnFloatType
 
 const char *opCodeToString(NnOpCode code) {
     if (code == OP_MERGE_ADD) return "MERGE_ADD";
+    if (code == OP_MERGE_SUM) return "MERGE_SUM";
     if (code == OP_EMBEDDING) return "EMBEDDING";
     if (code == OP_INV_RMS) return "INV_RMS";
     if (code == OP_RMS_NORM) return "RMS_NORM";
@@ -81,7 +82,12 @@ const char *opCodeToString(NnOpCode code) {
     if (code == OP_GELU) return "GELU";
     if (code == OP_SILU) return "SILU";
     if (code == OP_MUL) return "MUL";
+    if (code == OP_SCALE) return "SCALE";
     if (code == OP_CAST) return "CAST";
+    if (code == OP_REPEAT_Z) return "REPEAT_Z";
+    if (code == OP_SHIFT) return "SHIFT";
+    if (code == OP_SOFTMAX) return "SOFTMAX";
+    if (code == OP_MOE_GATE) return "MOE_GATE";
     throw std::invalid_argument("Unknown op code");
 }
 
@@ -97,17 +103,23 @@ const char *opQuantTypeToString(NnOpQuantType type) {
     throw std::invalid_argument("Unknown op quant type");
 }
 
-NnSize2D size0() {
-    return { F_UNK, 0, 0, 0, 0 };
+NnSize3D size0() {
+    return { F_UNK, 0, 0, 0, 0, 0, 0 };
 }
 
-NnSize2D size1D(NnFloatType floatType, NnUint x) {
-    return size2D(floatType, 1, x);
+NnSize3D size1D(NnFloatType floatType, NnUint x) {
+    return size3D(floatType, 1, 1, x);
 }
 
-NnSize2D size2D(NnFloatType floatType, NnUint y, NnUint x) {
-    NnSize length = y * x;
-    return { floatType, y, x, length, getBytes(floatType, length) };
+NnSize3D size2D(NnFloatType floatType, NnUint y, NnUint x) {
+    return size3D(floatType, 1, y, x);
+}
+
+NnSize3D size3D(NnFloatType floatType, NnUint z, NnUint y, NnUint x) {
+    // Widen to NnSize before multiplying, so the product is not truncated if NnUint is narrower than NnSize.
+    NnSize len = (NnSize)z * y * x;
+    NnSize lenXY = (NnSize)y * x;
+    return { floatType, z, y, x, len, getBytes(floatType, len), getBytes(floatType, lenXY) };
 }
 
 NnPointerConfig pointerBatchConfig(NnPointerSource source, NnUint index) {
 
@@ -11,27 +11,29 @@
 
 typedef struct {
     NnFloatType floatType;
+    NnUint z;
     NnUint y;
     NnUint x;
     NnSize length;
     NnSize nBytes;
-} NnSize2D;
+    NnSize nBytesXY;
+} NnSize3D;
 
 // slices
 
 typedef struct {
     NnUint kvDim0;
-    NnSize2D keySize;
-    NnSize2D valueSize;
+    NnSize3D keySize;
+    NnSize3D valueSize;
 } NnKvCacheSlice;
 
 typedef struct {
     NnFloatType type;
     NnUint nNodes;
     NnUint d0;
     NnUint n;
-    NnSize2D size;
-    NnSize2D sliceSize;
+    NnSize3D size;
+    NnSize3D sliceSize;
 } NnRowMatmulSlice;
 
 typedef struct {
@@ -40,8 +42,8 @@ typedef struct {
     NnUint n;
     NnUint n0;
     NnUint d;
-    NnSize2D size;
-    NnSize2D sliceSize;
+    NnSize3D size;
+    NnSize3D sliceSize;
 } NnColMatmulSlice;
 
 typedef struct {
@@ -57,19 +59,20 @@ typedef struct {
     NnUint headDim;
     NnUint nKvHeads;
     float ropeTheta;
-    NnSize2D cacheSize;
+    NnSize3D cacheSize;
 } NnRopeSlice;
 
 typedef struct {
     NnUint nHeads;
     NnUint nHeads0;
-    NnSize2D attSize;
+    NnSize3D attSize;
 } NnMultiHeadAttSlice;
 
 // base enums
 
 enum NnOpCode {
     OP_MERGE_ADD,
+    OP_MERGE_SUM,
     OP_EMBEDDING,
     OP_INV_RMS,
     OP_RMS_NORM,
@@ -79,8 +82,12 @@ enum NnOpCode {
     OP_GELU,
     OP_SILU,
     OP_MUL,
+    OP_SCALE,
     OP_CAST,
+    OP_REPEAT_Z,
     OP_SHIFT,
+    OP_SOFTMAX,
+    OP_MOE_GATE,
 };
 
 enum NnOpQuantType {
@@ -125,12 +132,12 @@ enum NnRopeType {
 
 typedef struct {
     char *name;
-    NnSize2D size;
+    NnSize3D size;
 } NnPipeConfig;
 
 typedef struct {
     char *name;
-    NnSize2D size;
+    NnSize3D size;
 } NnBufferConfig;
 
 typedef struct {
@@ -145,7 +152,7 @@ typedef struct {
     NnUint index;
     NnPointerConfig input;
     NnPointerConfig output;
-    NnSize2D weightSize;
+    NnSize3D weightSize;
     NnByte *config;
     NnUint configSize;
 } NnOpConfig;
@@ -200,7 +207,9 @@ typedef struct {
 } NnRmsNormOpConfig;
 
 typedef struct {
-    // empty
+    NnUint nExperts;
+    NnUint nActiveExperts;
+    NnUint activeExpertIndexesBufferIndex;
 } NnMatmulOpConfig;
 
 typedef struct {
@@ -234,6 +243,10 @@ typedef struct {
     // empty
 } NnMergeAddOpCodeConfig;
 
+typedef struct {
+    // empty
+} NnMergeSumOpCodeConfig;
+
 typedef struct {
     // empty
 } NnSiluOpCodeConfig;
@@ -242,14 +255,32 @@ typedef struct {
     NnUint multiplierBufferIndex;
 } NnMulOpCodeConfig;
 
+typedef struct {
+    NnUint scaleBufferIndex;
+} NnScaleOpCodeConfig;
+
 typedef struct {
     // empty
 } NnCastOpCodeConfig;
 
+typedef struct {
+    // empty
+} NnRepeatZOpCodeConfig;
+
 typedef struct {
     NnUint indexPipeIndex;
 } NnShiftOpCodeConfig;
 
+typedef struct {
+    // empty
+} NnSoftmaxOpCodeConfig;
+
+typedef struct {
+    NnUint k;
+    NnUint normTopk;
+    NnUint indexesBufferIndex;
+} NnMoeGateOpCodeConfig;
+
 // utility functions
 
 const char *opCodeToString(NnOpCode code);
@@ -258,9 +289,10 @@ const char *opQuantTypeToString(NnOpQuantType type);
 NnSize getBytes(NnFloatType floatType, NnSize n);
 NnSize getBlockSize(NnFloatType floatType);
 NnOpQuantType getOpQuantType(NnFloatType input, NnFloatType weight, NnFloatType output);
-NnSize2D size0();
-NnSize2D size1D(NnFloatType floatType, NnUint x);
-NnSize2D size2D(NnFloatType floatType, NnUint y, NnUint x);
+NnSize3D size0();
+NnSize3D size1D(NnFloatType floatType, NnUint x);
+NnSize3D size2D(NnFloatType floatType, NnUint y, NnUint x);
+NnSize3D size3D(NnFloatType floatType, NnUint z, NnUint y, NnUint x);
 NnPointerConfig pointerBatchConfig(NnPointerSource source, NnUint index);
 NnPointerConfig pointerBatchedSliceConfig(NnPointerSource source, NnUint index);
 NnPointerConfig pointerRawConfig(NnPointerSource source, NnUint index);
Original file line number	Diff line number	Diff line change
`@@ -128,7 +128,8 @@ def writeHeader(file, params):`
`128`	`128`	`'rope_scaling_orig_max_seq_len': 17,`
`129`	`129`	`'rope_type': 18,`
`130`	`130`	`'head_dim': 19,`
`131`		`- 'norm_epsilon': 20`
	`131`	`+ 'norm_epsilon': 20,`
	`132`	`+ 'moe_hidden_dim': 21,`
`132`	`133`	`}`
`133`	`134`	`header = struct.pack('i', 0xA00ABCD)`
`134`	`135`