Thireus
diff --git a/common/common.cpp b/common/common.cpp
Lines changed: 2 additions & 2 deletions
diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
Lines changed: 39 additions & 3 deletions
@@ -1152,7 +1152,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         return true;
     }
     if (arg == "--cpu-moe" || arg == "-cmoe") {
-        params.tensor_buft_overrides.push_back({strdup("\\.ffn_(up|down|gate)_exps"), ggml_backend_cpu_buffer_type()});
+        params.tensor_buft_overrides.push_back({strdup("\\.ffn_(up|down|gate)_exps\\.weight"), ggml_backend_cpu_buffer_type()});
         return true;
     }
     if (arg == "--n-cpu-moe" || arg == "-ncmoe") {
@@ -1164,7 +1164,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
             return true;
         }
         for (int32_t l = 0; l < n_layers; ++l) {
-            std::string pattern = "blk\\." + std::to_string(l) + "\\.(ffn_(up|down|gate)_exps)";
+            std::string pattern = "blk\\." + std::to_string(l) + "\\.(ffn_(up|down|gate)_exps\\.weight)";
             params.tensor_buft_overrides.push_back({strdup(pattern.c_str()), ggml_backend_cpu_buffer_type()});
         }
         return true;
 
@@ -3173,7 +3173,25 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
                     ggml_cuda_op_relu(ctx, dst);
                     break;
                 case GGML_UNARY_OP_SIGMOID:
-                    ggml_cuda_op_sigmoid(ctx, dst);
+                    if (i + 5 < cgraph->n_nodes &&
+                        cgraph->nodes[i+1]->op == GGML_OP_RESHAPE &&
+                        cgraph->nodes[i+2]->op == GGML_OP_ADD &&
+                        cgraph->nodes[i+3]->op == GGML_OP_ARGSORT &&
+                        cgraph->nodes[i+4]->op == GGML_OP_VIEW &&
+                        cgraph->nodes[i+5]->op == GGML_OP_GET_ROWS) {
+                        cuda_glm45moe_experts(ctx, cgraph->nodes[i+5], cgraph->nodes[i+4]);
+                        i += 5;
+                    }
+                    else if (i + 4 < cgraph->n_nodes &&
+                        cgraph->nodes[i+1]->op == GGML_OP_RESHAPE &&
+                        cgraph->nodes[i+2]->op == GGML_OP_ADD &&
+                        cgraph->nodes[i+3]->op == GGML_OP_GROUPED_TOPK &&
+                        cgraph->nodes[i+4]->op == GGML_OP_GET_ROWS) {
+                        cuda_bailingmoev2_experts(ctx, cgraph->nodes[i+4], cgraph->nodes[i+3]); // GET_ROWS node, then the GROUPED_TOPK node matched above
+                        i += 4;
+                    } else {
+                        ggml_cuda_op_sigmoid(ctx, dst);
+                    }
                     break;
                 case GGML_UNARY_OP_HARDSIGMOID:
                     ggml_cuda_op_hardsigmoid(ctx, dst);
@@ -3315,10 +3333,28 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
             ggml_cuda_op_pool2d(ctx, dst);
             break;
         case GGML_OP_SUM_ROWS:
-            ggml_cuda_op_sum_rows(ctx, dst);
+            if (i + 1 < cgraph->n_nodes &&
+                cgraph->nodes[i+1]->op == GGML_OP_DIV &&
+                cgraph->nodes[i+1]->src[1] == dst &&
+                cgraph->nodes[i+1]->src[0] == dst->src[0]) {
+                ggml_cuda_op_sum_rows_div(ctx, cgraph->nodes[i+1]);
+                ++i;
+            } else {
+                ggml_cuda_op_sum_rows(ctx, dst);
+            }
             break;
         case GGML_OP_ARGSORT:
-            ggml_cuda_op_argsort(ctx, dst);
+            if (i + 5 < cgraph->n_nodes &&
+                cgraph->nodes[i+1]->op == GGML_OP_VIEW &&
+                cgraph->nodes[i+2]->op == GGML_OP_GET_ROWS &&
+                cgraph->nodes[i+3]->op == GGML_OP_RESHAPE &&
+                cgraph->nodes[i+4]->op == GGML_OP_SOFT_MAX &&
+                cgraph->nodes[i+5]->op == GGML_OP_RESHAPE) {
+                cuda_openai_experts(ctx, dst, cgraph->nodes[i+4]);
+                i += 5;
+            } else {
+                ggml_cuda_op_argsort(ctx, dst);
+            }
             break;
         case GGML_OP_ARGSORT_THRESH:
             ggml_cuda_op_argsort_thresh(ctx, dst);
Original file line number	Diff line number	Diff line change
`@@ -1152,7 +1152,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa`
`1152`	`1152`	`return true;`
`1153`	`1153`	`}`
`1154`	`1154`	`if (arg == "--cpu-moe" \|\| arg == "-cmoe") {`
`1155`		`- params.tensor_buft_overrides.push_back({strdup("\\.ffn_(up\|down\|gate)_exps"), ggml_backend_cpu_buffer_type()});`
	`1155`	`+ params.tensor_buft_overrides.push_back({strdup("\\.ffn_(up\|down\|gate)_exps\\.weight"), ggml_backend_cpu_buffer_type()});`
`1156`	`1156`	`return true;`
`1157`	`1157`	`}`
`1158`	`1158`	`if (arg == "--n-cpu-moe" \|\| arg == "-ncmoe") {`
`@@ -1164,7 +1164,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa`
`1164`	`1164`	`return true;`
`1165`	`1165`	`}`
`1166`	`1166`	`for (int32_t l = 0; l < n_layers; ++l) {`
`1167`		`- std::string pattern = "blk\\." + std::to_string(l) + "\\.(ffn_(up\|down\|gate)_exps)";`
	`1167`	`+ std::string pattern = "blk\\." + std::to_string(l) + "\\.(ffn_(up\|down\|gate)_exps\\.weight)";`
`1168`	`1168`	`params.tensor_buft_overrides.push_back({strdup(pattern.c_str()), ggml_backend_cpu_buffer_type()});`
`1169`	`1169`	`}`
`1170`	`1170`	`return true;`