@@ -2978,7 +2978,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
29782978 ggml_cuda_topk_moe_ops (/* with_norm=*/ false , /* delayed_softmax=*/ true );
29792979
29802980 if (ops.size () == topk_moe_ops_with_norm.size () &&
2981- ggml_can_fuse_subgraph (cgraph, node_idx, ops, { node_idx + 3 , node_idx + 8 })) {
2981+ ggml_can_fuse_subgraph (cgraph, node_idx, ops, { node_idx + 3 , node_idx + 9 })) {
29822982 ggml_tensor * softmax = cgraph->nodes [node_idx];
29832983 ggml_tensor * weights = cgraph->nodes [node_idx + 9 ];
29842984
@@ -2997,7 +2997,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
29972997 }
29982998
29992999 if (ops.size () == topk_moe_ops_delayed_softmax.size () &&
3000- ggml_can_fuse_subgraph (cgraph, node_idx, ops, { node_idx + 2 , node_idx + 5 })) {
3000+ ggml_can_fuse_subgraph (cgraph, node_idx, ops, { node_idx + 1 , node_idx + 5 })) {
30013001 ggml_tensor * softmax = cgraph->nodes [node_idx + 4 ];
30023002 ggml_tensor * weights = cgraph->nodes [node_idx + 5 ];
30033003
@@ -3118,9 +3118,20 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
31183118 // With the use of CUDA graphs, the execution will be performed by the graph launch.
31193119 if (!use_cuda_graph || cuda_graph_update_required) {
31203120
3121+ [[maybe_unused]] int prev_i = 0 ;
3122+
31213123 for (int i = 0 ; i < cgraph->n_nodes ; i++) {
31223124 ggml_tensor * node = cgraph->nodes [i];
31233125
3126+
3127+ #ifdef GGML_CUDA_DEBUG
3128+ const int nodes_fused = i - prev_i - 1 ;
3129+ prev_i = i;
3130+ if (nodes_fused > 0 ) {
3131+ GGML_LOG_INFO (" nodes_fused: %d\n " , nodes_fused);
3132+ }
3133+ #endif
3134+
31243135 if (ggml_is_empty (node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
31253136 continue ;
31263137 }
0 commit comments