Skip to content

Commit a0c91e8

Browse files
gaugarg-nv, JohannesGaessler, and am17an
authored
Improve CUDA graph capture (ggml-org#19754)
* Improve CUDA graph capture Currently, CUDA graphs are eagerly enabled on the first call to ggml_backend_cuda_graph_compute. If the graph properties keep changing (4+ consecutive updates), the graph is permanently disabled. This is suboptimal because: - The first call always incurs CUDA graph capture overhead even if the graph is unstable - Once permanently disabled, CUDA graphs never re-enable even after the graph stabilizes (e.g., switching from prompt processing to decode) The new approach delays CUDA graph activation until warmup completes: the same cgraph must be called at least twice with matching properties before CUDA graph capture begins. This avoids wasted capture overhead on volatile graphs and allows graphs to become eligible once they stabilize. This also fixes issues such as ggml-org#19708 * Update ggml/src/ggml-cuda/ggml-cuda.cu Co-authored-by: Johannes Gäßler <johannesg@5d6.de> * Remove EM dashes * Update ggml/src/ggml-cuda/ggml-cuda.cu Co-authored-by: Aman Gupta <amangupta052@gmail.com> --------- Co-authored-by: Johannes Gäßler <johannesg@5d6.de> Co-authored-by: Aman Gupta <amangupta052@gmail.com>
1 parent 07968d5 commit a0c91e8

File tree

2 files changed

+28
-24
lines changed

2 files changed

+28
-24
lines changed

ggml/src/ggml-cuda/common.cuh

Lines changed: 2 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1149,8 +1149,7 @@ struct ggml_cuda_graph {
11491149
size_t num_nodes = 0;
11501150
std::vector<cudaGraphNode_t> nodes;
11511151
bool disable_due_to_gpu_arch = false;
1152-
bool disable_due_to_too_many_updates = false;
1153-
int number_consecutive_updates = 0;
1152+
bool warmup_complete = false;
11541153
std::vector<ggml_cuda_graph_node_properties> props;
11551154

11561155
// these are extra tensors (inputs) that participate in the ggml graph but are not nodes
@@ -1159,21 +1158,9 @@ struct ggml_cuda_graph {
11591158
// ref: https://github.com/ggml-org/llama.cpp/pull/19165
11601159
std::vector<ggml_cuda_graph_node_properties> extra;
11611160

1162-
void record_update(bool use_graph, bool update_required) {
1163-
if (use_graph && update_required) {
1164-
number_consecutive_updates++;
1165-
} else {
1166-
number_consecutive_updates = 0;
1167-
}
1168-
if (number_consecutive_updates >= 4) {
1169-
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
1170-
disable_due_to_too_many_updates = true;
1171-
}
1172-
}
1173-
11741161
bool is_enabled() const {
11751162
static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);
1176-
return !(disable_due_to_gpu_arch || disable_cuda_graphs_due_to_env || disable_due_to_too_many_updates);
1163+
return !(disable_due_to_gpu_arch || disable_cuda_graphs_due_to_env);
11771164
}
11781165
#endif
11791166
};

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 26 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2979,10 +2979,6 @@ static bool ggml_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx
29792979
const void * graph_key = ggml_cuda_graph_get_key(cgraph);
29802980
ggml_cuda_graph * graph = cuda_ctx->cuda_graph(graph_key);
29812981

2982-
if (graph->instance == nullptr) {
2983-
res = true;
2984-
}
2985-
29862982
// Check if the graph size has changed
29872983
if (graph->props.size() != (size_t)cgraph->n_nodes) {
29882984
res = true;
@@ -3931,14 +3927,35 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
39313927
#ifdef USE_CUDA_GRAPH
39323928
graph_key = ggml_cuda_graph_get_key(cgraph);
39333929

3934-
use_cuda_graph = ggml_cuda_graph_set_enabled(cuda_ctx, graph_key);
3930+
ggml_cuda_graph_set_enabled(cuda_ctx, graph_key);
39353931

39363932
ggml_cuda_graph * graph = cuda_ctx->cuda_graph(graph_key);
39373933
if (graph->is_enabled()) {
3938-
cuda_graph_update_required = ggml_cuda_graph_update_required(cuda_ctx, cgraph);
3939-
use_cuda_graph = ggml_cuda_graph_check_compability(cgraph);
3940-
3941-
graph->record_update(use_cuda_graph, cuda_graph_update_required);
3934+
const bool graph_compatible = ggml_cuda_graph_check_compability(cgraph);
3935+
if (graph_compatible) {
3936+
const bool properties_changed = ggml_cuda_graph_update_required(cuda_ctx, cgraph);
3937+
3938+
if (!graph->warmup_complete) {
3939+
// Warmup: need at least 2 calls with no property change on the 2nd call
3940+
if (!properties_changed) {
3941+
graph->warmup_complete = true;
3942+
GGML_LOG_DEBUG("%s: CUDA graph warmup complete\n", __func__);
3943+
use_cuda_graph = true;
3944+
cuda_graph_update_required = true;
3945+
}
3946+
// else: properties changed or first call - execute directly (use_cuda_graph stays false)
3947+
} else {
3948+
// Post-warmup: normal CUDA graph operation
3949+
if (properties_changed) {
3950+
// Properties changed - reset warmup, execute directly until stable again
3951+
graph->warmup_complete = false;
3952+
GGML_LOG_DEBUG("%s: CUDA graph warmup reset\n", __func__);
3953+
} else {
3954+
use_cuda_graph = true;
3955+
cuda_graph_update_required = graph->instance == nullptr;
3956+
}
3957+
}
3958+
}
39423959
}
39433960
#endif // USE_CUDA_GRAPH
39443961

0 commit comments

Comments (0)