Skip to content

Commit 3449842

Browse files
committed
CUDA: Fix bug in topk-moe for gpt-oss
When using ggml_can_fuse_subgraph, the output nodes which are passed are wrong. This causes `test-backend-ops` to still fuse nodes (because the nodes are not used elsewhere in the graph), but fusion does not actually happen in the real gpt-oss model.
1 parent 1c1409e commit 3449842

File tree

1 file changed

+12
-1
lines changed

1 file changed

+12
-1
lines changed

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 12 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2993,7 +2993,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
29932993
}
29942994

29952995
if (ops.size() == topk_moe_ops_delayed_softmax.size() &&
2996-
ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 2, node_idx + 5 })) {
2996+
ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 1, node_idx + 5 })) {
29972997
ggml_tensor * softmax = cgraph->nodes[node_idx + 4];
29982998
ggml_tensor * weights = cgraph->nodes[node_idx + 5];
29992999

@@ -3114,9 +3114,20 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
31143114
// With the use of CUDA graphs, the execution will be performed by the graph launch.
31153115
if (!use_cuda_graph || cuda_graph_update_required) {
31163116

3117+
[[maybe_unused]] int prev_i = 0;
3118+
31173119
for (int i = 0; i < cgraph->n_nodes; i++) {
31183120
ggml_tensor * node = cgraph->nodes[i];
31193121

3122+
3123+
#ifndef GGML_CUDA_DEBUG
3124+
const int nodes_fused = i - prev_i - 1;
3125+
prev_i = i;
3126+
if (nodes_fused > 0) {
3127+
GGML_LOG_INFO("nodes_fused: %d\n", nodes_fused);
3128+
}
3129+
#endif
3130+
31203131
if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
31213132
continue;
31223133
}

0 commit comments

Comments (0)