@@ -5520,6 +5520,9 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
 
     const uint64_t nei0 = ids->ne[0];
     const uint64_t nei1 = ids->ne[1];
+    if (nei0*nei1 > 4096) {
+        fprintf(stderr, "%s: nei0 = %d, nei1 = %d\n", __func__, (int)nei0, (int)nei1);
+    }
     GGML_ASSERT(nei0 * nei1 <= 4096);
 
     const uint32_t nbi1 = ids->nb[1];
@@ -5915,7 +5918,30 @@ static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context& subctx
     if (src2->ne[1] == 1 && (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type))) {
         ggml_vk_mul_mat_vec_id_q_f16(ctx, subctx, src0, src1, src2, dst, dryrun);
     } else {
-        ggml_vk_mul_mat_id_q_f16(ctx, subctx, src0, src1, src2, dst, dryrun);
+        // Split based on number of ids, to fit in shared memory
+        const uint32_t nei0 = (uint32_t)src2->ne[0];
+        const uint32_t nei1 = (uint32_t)src2->ne[1];
+
+        GGML_ASSERT(nei0 <= 4096);
+        const uint32_t split_size = std::min(nei1, 4096u / nei0);
+
+        ggml_tensor src1_copy = *src1;
+        ggml_tensor src2_copy = *src2;
+        ggml_tensor dst_copy = *dst;
+
+        for (uint32_t token_start = 0; token_start < nei1; token_start += split_size) {
+            const uint32_t n_tokens = std::min(split_size, nei1 - token_start);
+
+            src1_copy.view_offs = src1->view_offs + token_start * src1_copy.nb[2];
+            src2_copy.view_offs = src2->view_offs + token_start * src2_copy.nb[1];
+            dst_copy.view_offs = dst->view_offs + token_start * dst_copy.nb[2];
+
+            src1_copy.ne[2] = n_tokens;
+            src2_copy.ne[1] = n_tokens;
+            dst_copy.ne[2] = n_tokens;
+
+            ggml_vk_mul_mat_id_q_f16(ctx, subctx, src0, &src1_copy, &src2_copy, &dst_copy, dryrun);
+        }
     }
 }
 
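A minimal standalone sketch of the chunking above (illustrative code, not part of the patch; the 4096 figure and the nei0/nei1 names are taken from the diff): the id tensor provides nei0 expert ids per token for nei1 tokens, and each call to ggml_vk_mul_mat_id_q_f16 is assumed to handle at most 4096 ids, so the token dimension is walked in chunks of at most 4096 / nei0.

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>
    #include <utility>
    #include <vector>

    // Returns (token_start, n_tokens) pairs covering nei1 tokens in chunks of at
    // most 4096 / nei0 tokens, mirroring the loop added in ggml_vk_mul_mat_id.
    // nei0 is assumed to be in [1, 4096], as enforced by the GGML_ASSERT and the
    // supports_op check elsewhere in the patch.
    static std::vector<std::pair<uint32_t, uint32_t>> split_id_chunks(uint32_t nei0, uint32_t nei1) {
        const uint32_t split_size = std::min(nei1, 4096u / nei0);
        std::vector<std::pair<uint32_t, uint32_t>> chunks;
        for (uint32_t token_start = 0; token_start < nei1; token_start += split_size) {
            chunks.emplace_back(token_start, std::min(split_size, nei1 - token_start));
        }
        return chunks;
    }

    int main() {
        // e.g. 8 experts used per token, 1000 tokens -> chunks (0, 512) and (512, 488)
        for (const auto & c : split_id_chunks(8, 1000)) {
            printf("token_start = %u, n_tokens = %u\n", c.first, c.second);
        }
        return 0;
    }

Each chunk then reuses shallow copies of src1, src2 and dst with adjusted ne and view_offs, as the loop above does, so no tensor data is copied or moved.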
@@ -9510,9 +9536,15 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
                 ggml_type src0_type = op->src[0]->type;
                 ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
                 const vk_device& device = ctx->device;
-                if (op->op == GGML_OP_MUL_MAT_ID && !device->mul_mat_id_s[src0_type] && !device->mul_mat_id_m[src0_type] && !device->mul_mat_id_l[src0_type]) {
-                    // If there's not enough shared memory for row_ids and the result tile, fallback to CPU
-                    return false;
+                if (op->op == GGML_OP_MUL_MAT_ID) {
+                    if (!device->mul_mat_id_s[src0_type] && !device->mul_mat_id_m[src0_type] && !device->mul_mat_id_l[src0_type]) {
+                        // If there's not enough shared memory for row_ids and the result tile, fallback to CPU
+                        return false;
+                    }
+                    // Check against size of shared memory variable
+                    if (op->src[2]->ne[0] > 4096) {
+                        return false;
+                    }
                 }
                 switch (src0_type) {
                     case GGML_TYPE_F32:
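The new supports_op check only needs to bound src[2]->ne[0] (the number of ids per token, nei0 above), because the split added in ggml_vk_mul_mat_id caps the tokens per dispatch at 4096 / nei0; the product that actually has to fit in the shared-memory row-id table is then asserted in ggml_vk_mul_mat_id_q_f16.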
@@ -9580,6 +9612,10 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
                     default:
                         return false;
                 }
+                if (op->src[1]->ne[0] != op->src[2]->ne[0]) {
+                    // different head sizes of K and V are not supported yet
+                    return false;
+                }
                 if (op->src[0]->type != GGML_TYPE_F32) {
                     return false;
                 }
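The added check compares the head sizes of K and V, which in ggml's flash-attention op are src[1] and src[2] with ne[0] holding the per-head size; when they differ, supports_op returns false so the operation falls back to another backend until the Vulkan shaders handle mismatched K/V head sizes.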