Skip to content

Commit 5f6f378

Browse files
authored
Merge branch 'ikawrakow:main' into main
2 parents fa54b5b + 2f5dae2 commit 5f6f378

File tree

10 files changed

+716
-67
lines changed

10 files changed

+716
-67
lines changed

common/common.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1152,7 +1152,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
11521152
return true;
11531153
}
11541154
if (arg == "--cpu-moe" || arg == "-cmoe") {
1155-
params.tensor_buft_overrides.push_back({strdup("\\.ffn_(up|down|gate)_exps"), ggml_backend_cpu_buffer_type()});
1155+
params.tensor_buft_overrides.push_back({strdup("\\.ffn_(up|down|gate)_exps\\.weight"), ggml_backend_cpu_buffer_type()});
11561156
return true;
11571157
}
11581158
if (arg == "--n-cpu-moe" || arg == "-ncmoe") {
@@ -1164,7 +1164,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
11641164
return true;
11651165
}
11661166
for (int32_t l = 0; l < n_layers; ++l) {
1167-
std::string pattern = "blk\\." + std::to_string(l) + "\\.(ffn_(up|down|gate)_exps)";
1167+
std::string pattern = "blk\\." + std::to_string(l) + "\\.(ffn_(up|down|gate)_exps\\.weight)";
11681168
params.tensor_buft_overrides.push_back({strdup(pattern.c_str()), ggml_backend_cpu_buffer_type()});
11691169
}
11701170
return true;

ggml/src/ggml-cuda.cu

Lines changed: 39 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3173,7 +3173,25 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
31733173
ggml_cuda_op_relu(ctx, dst);
31743174
break;
31753175
case GGML_UNARY_OP_SIGMOID:
3176-
ggml_cuda_op_sigmoid(ctx, dst);
3176+
if (i + 5 < cgraph->n_nodes &&
3177+
cgraph->nodes[i+1]->op == GGML_OP_RESHAPE &&
3178+
cgraph->nodes[i+2]->op == GGML_OP_ADD &&
3179+
cgraph->nodes[i+3]->op == GGML_OP_ARGSORT &&
3180+
cgraph->nodes[i+4]->op == GGML_OP_VIEW &&
3181+
cgraph->nodes[i+5]->op == GGML_OP_GET_ROWS) {
3182+
cuda_glm45moe_experts(ctx, cgraph->nodes[i+5], cgraph->nodes[i+4]);
3183+
i += 5;
3184+
}
3185+
else if (i + 4 < cgraph->n_nodes &&
3186+
cgraph->nodes[i+1]->op == GGML_OP_RESHAPE &&
3187+
cgraph->nodes[i+2]->op == GGML_OP_ADD &&
3188+
cgraph->nodes[i+3]->op == GGML_OP_GROUPED_TOPK &&
3189+
cgraph->nodes[i+4]->op == GGML_OP_GET_ROWS) {
3190+
cuda_bailingmoev2_experts(ctx, cgraph->nodes[i+4], cgraph->nodes[i+4]);
3191+
i += 4;
3192+
} else {
3193+
ggml_cuda_op_sigmoid(ctx, dst);
3194+
}
31773195
break;
31783196
case GGML_UNARY_OP_HARDSIGMOID:
31793197
ggml_cuda_op_hardsigmoid(ctx, dst);
@@ -3315,10 +3333,28 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
33153333
ggml_cuda_op_pool2d(ctx, dst);
33163334
break;
33173335
case GGML_OP_SUM_ROWS:
3318-
ggml_cuda_op_sum_rows(ctx, dst);
3336+
if (i + 1 < cgraph->n_nodes &&
3337+
cgraph->nodes[i+1]->op == GGML_OP_DIV &&
3338+
cgraph->nodes[i+1]->src[1] == dst &&
3339+
cgraph->nodes[i+1]->src[0] == dst->src[0]) {
3340+
ggml_cuda_op_sum_rows_div(ctx, cgraph->nodes[i+1]);
3341+
++i;
3342+
} else {
3343+
ggml_cuda_op_sum_rows(ctx, dst);
3344+
}
33193345
break;
33203346
case GGML_OP_ARGSORT:
3321-
ggml_cuda_op_argsort(ctx, dst);
3347+
if (i + 5 < cgraph->n_nodes &&
3348+
cgraph->nodes[i+1]->op == GGML_OP_VIEW &&
3349+
cgraph->nodes[i+2]->op == GGML_OP_GET_ROWS &&
3350+
cgraph->nodes[i+3]->op == GGML_OP_RESHAPE &&
3351+
cgraph->nodes[i+4]->op == GGML_OP_SOFT_MAX &&
3352+
cgraph->nodes[i+5]->op == GGML_OP_RESHAPE) {
3353+
cuda_openai_experts(ctx, dst, cgraph->nodes[i+4]);
3354+
i += 5;
3355+
} else {
3356+
ggml_cuda_op_argsort(ctx, dst);
3357+
}
33223358
break;
33233359
case GGML_OP_ARGSORT_THRESH:
33243360
ggml_cuda_op_argsort_thresh(ctx, dst);

0 commit comments

Comments
 (0)