contiguous checks, better formatting, use std::vector instead of array

am17an · am17an · commit e765d9adbc4a · 2025-10-31T14:02:13.000+08:00
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -3180,7 +3180,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
                         }
 
                         while (current_node < cgraph->n_nodes && cgraph->nodes[current_node]->op == GGML_OP_ADD &&
-                            num_adds < num_views - 1) {
+                                num_adds < num_views - 1) {
                             num_adds++;
                             current_node++;
                         }
diff --git a/ggml/src/ggml-cuda/moe-expert-reduce.cu b/ggml/src/ggml-cuda/moe-expert-reduce.cu
@@ -1,16 +1,14 @@
 #include "moe-expert-reduce.cuh"
 
-// This kernel is fusion of the expert weight reduce, common in MoE models
+// This kernel is a fusion of the expert weight reduce, common in MoE models
 
 template <int n_expert_used_template>
 __global__ void moe_expert_reduce_cuda(const float * __restrict__ experts,
                                        const float * __restrict__ weights,
                                        float * __restrict__ dst,
                                        const int n_expert_used,
                                        const int n_cols) {
-    const int row             = blockIdx.x;
-    const int n_expert_used_t = n_expert_used_template == 0 ? n_expert_used : n_expert_used_template;
-
+    const int row = blockIdx.x;
     const int col = blockIdx.y * blockDim.x + threadIdx.x;
     if (col >= n_cols) {
         return;
@@ -22,7 +20,7 @@ __global__ void moe_expert_reduce_cuda(const float * __restrict__ experts,
 
     float acc = 0.f;
     if constexpr (n_expert_used_template == 0) {
-        for (int expert = 0; expert < n_expert_used_t; ++expert) {
+        for (int expert = 0; expert < n_expert_used; ++expert) {
             ggml_cuda_mad(acc, experts[col], weights[expert]);
             experts += n_cols;
         }
@@ -98,37 +96,34 @@ static void launch_moe_expert_reduce(ggml_backend_cuda_context & ctx,
 }
 
 bool ggml_cuda_should_use_moe_expert_reduce(const ggml_cgraph * cgraph, int start_index, int end_index) {
-    const ggml_tensor * experts = cgraph->nodes[start_index];
-    if (experts->op != GGML_OP_MUL) {
+    const ggml_tensor * mul = cgraph->nodes[start_index];
+
+    if (mul->op != GGML_OP_MUL || !ggml_is_contiguous(mul->src[0]) || !ggml_is_contiguous(mul->src[1])) {
         return false;
     }
 
     int    current_node   = start_index + 1;
     size_t current_offset = 0;
 
-    const ggml_tensor * view_nodes[32];
-    int                 num_views = 0;
+    std::vector<const ggml_tensor *> view_nodes;
     //check if all are views of the expert in increasing order
     while (current_node < end_index && cgraph->nodes[current_node]->op == GGML_OP_VIEW) {
         const ggml_tensor * node = cgraph->nodes[current_node];
-        if (node->view_src != experts) {
+        if (node->view_src != mul) {
             return false;
         }
         if (node->view_offs < current_offset) {
             return false;
         }
         current_offset = node->view_offs;
         current_node++;
-        view_nodes[num_views++] = node;
-
-        if (num_views >= 32) {
-            return false;
-        }
+        view_nodes.push_back(node);
     }
 
     //check if all the adds are in increasing order
-    const ggml_tensor * prev_add_src = view_nodes[0];
+    const ggml_tensor * prev_add_src = view_nodes.size() ? view_nodes[0] : nullptr;
     int                 num_adds     = 0;
+    int                 num_views    = view_nodes.size();
     while (current_node < end_index && cgraph->nodes[current_node]->op == GGML_OP_ADD) {
         const ggml_tensor * add_node = cgraph->nodes[current_node];
 

Original file line number	Diff line number	Diff line change
`@@ -3180,7 +3180,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx`
`3180`	`3180`	`}`
`3181`	`3181`
`3182`	`3182`	`while (current_node < cgraph->n_nodes && cgraph->nodes[current_node]->op == GGML_OP_ADD &&`
`3183`		`- num_adds < num_views - 1) {`
	`3183`	`+ num_adds < num_views - 1) {`
`3184`	`3184`	`num_adds++;`
`3185`	`3185`	`current_node++;`
`3186`	`3186`	`}`