Commit 1bcb140

aendk authored and jammm committed
Overlap CUDA graph building and execution to minimize GPU idle time and improve tokens-per-second performance.
Co-authored-by: Aaryaman Vasishta <[email protected]>
1 parent ee09828 commit 1bcb140

File tree

3 files changed: +117 -73 lines changed

- ggml/src/ggml-cuda/common.cuh
- ggml/src/ggml-cuda/ggml-cuda.cu
- ggml/src/ggml-hip/CMakeLists.txt

ggml/src/ggml-cuda/common.cuh

Lines changed: 1 addition & 1 deletion
```diff
@@ -955,7 +955,7 @@ struct ggml_backend_cuda_context {
     cudaStream_t streams[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { { nullptr } };
     cublasHandle_t cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};

-    std::unique_ptr<ggml_cuda_graph> cuda_graph;
+    std::vector<std::unique_ptr<ggml_cuda_graph>> cuda_graphs;

    explicit ggml_backend_cuda_context(int device) :
        device(device),
```
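The structural change in this file: the context used to own a single CUDA graph and now owns a pool of them, one per range of cgraph nodes. Below is a minimal, self-contained sketch of the new ownership model, assuming a stub type and a hypothetical pool size (the real size comes from the splitting heuristic in ggml-cuda.cu further down):

```cpp
#include <cstdio>
#include <memory>
#include <vector>

struct ggml_cuda_graph {};  // stub: the real struct lives in common.cuh

int main() {
    // The member changed from std::unique_ptr<ggml_cuda_graph> cuda_graph
    // to a vector of graphs, one per node range.
    std::vector<std::unique_ptr<ggml_cuda_graph>> cuda_graphs;

    // Resized per compute call; existing entries survive a same-size resize,
    // so captured graphs can be reused or updated across tokens.
    const int num_cuda_graphs_required = 5;  // hypothetical value
    cuda_graphs.resize(num_cuda_graphs_required);
    for (auto & g : cuda_graphs) {
        if (g == nullptr) {
            g = std::make_unique<ggml_cuda_graph>();  // lazily created on first use
        }
    }
    std::printf("graph pool size: %zu\n", cuda_graphs.size());
    return 0;
}
```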

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 113 additions & 72 deletions
```diff
@@ -2641,9 +2641,15 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
     GGML_UNUSED(backend);
 }

+// groups cgraph->nodes offsets per cuda_graph
+struct cgraph_offset {
+    int begin;
+    int end;
+};
+
 #ifdef USE_CUDA_GRAPH
 static bool check_node_graph_compatibility(ggml_cgraph * cgraph,
-    bool use_cuda_graph) {
+    bool use_cuda_graph, cgraph_offset & offset) {

     // Loop over nodes in GGML graph to obtain info needed for CUDA graph

@@ -2655,7 +2661,7 @@ static bool check_node_graph_compatibility(ggml_cgraph * cgraph,
     const std::string nemotron_h_block_out_prefix = "nemotron_h_block_out";
     const std::string mamba2_y_add_d_prefix = "mamba2_y_add_d";

-    for (int i = 0; i < cgraph->n_nodes; i++) {
+    for (int i = offset.begin; i < offset.end; i++) {
         ggml_tensor * node = cgraph->nodes[i];

         if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
@@ -2753,45 +2759,45 @@ static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_gra
     return true;
 }

-static bool is_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph) {
+static bool is_cuda_graph_update_required(std::unique_ptr<ggml_cuda_graph> & cuda_graph, ggml_cgraph * cgraph, cgraph_offset & offset) {

     bool cuda_graph_update_required = false;

-    if (cuda_ctx->cuda_graph->instance == nullptr) {
+    if (cuda_graph->instance == nullptr) {
         cuda_graph_update_required = true;
     }

     // Check if the graph size has changed
-    if (cuda_ctx->cuda_graph->ggml_graph_properties.size() != (size_t)cgraph->n_nodes) {
+    if (cuda_graph->ggml_graph_properties.size() != (size_t)(offset.end - offset.begin)) {
         cuda_graph_update_required = true;
-        cuda_ctx->cuda_graph->ggml_graph_properties.resize(cgraph->n_nodes);
+        cuda_graph->ggml_graph_properties.resize((offset.end - offset.begin));
     }

     // Loop over nodes in GGML graph to determine if CUDA graph update is required
     // and store properties to allow this comparison for the next token
-    for (int i = 0; i < cgraph->n_nodes; i++) {
+    for (int i = offset.begin; i < offset.end; i++) {
         bool has_matching_properties = true;
         if (!cuda_graph_update_required) {
-            has_matching_properties = ggml_graph_node_has_matching_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]);
+            has_matching_properties = ggml_graph_node_has_matching_properties(cgraph->nodes[i], &cuda_graph->ggml_graph_properties[i - offset.begin]);
         }
         if (!has_matching_properties) {
             cuda_graph_update_required = true;
         }
-        set_ggml_graph_node_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]);
+        set_ggml_graph_node_properties(cgraph->nodes[i], &cuda_graph->ggml_graph_properties[i - offset.begin]);
     }

     return cuda_graph_update_required;
 }

-static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) {
+static void update_cuda_graph_executable(std::unique_ptr<ggml_cuda_graph> & cuda_graph) {

 #if CUDART_VERSION >= 12000
     cudaGraphExecUpdateResultInfo result_info;
-    cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
+    cudaError_t stat = cudaGraphExecUpdate(cuda_graph->instance, cuda_graph->graph, &result_info);
 #else
     cudaGraphNode_t errorNode;
     cudaGraphExecUpdateResult result_info;
-    cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &errorNode, &result_info);
+    cudaError_t stat = cudaGraphExecUpdate(cuda_graph->instance, cuda_graph->graph, &errorNode, &result_info);
 #endif // CUDART_VERSION >= 12000

     if (stat == cudaErrorGraphExecUpdateFailure) {
@@ -2802,9 +2808,9 @@ static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) {
         // The pre-existing graph exec cannot be updated due to violated constraints
         // so instead clear error and re-instantiate
         (void)cudaGetLastError();
-        CUDA_CHECK(cudaGraphExecDestroy(cuda_ctx->cuda_graph->instance));
-        cuda_ctx->cuda_graph->instance = nullptr;
-        CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
+        CUDA_CHECK(cudaGraphExecDestroy(cuda_graph->instance));
+        cuda_graph->instance = nullptr;
+        CUDA_CHECK(cudaGraphInstantiate(&cuda_graph->instance, cuda_graph->graph, NULL, NULL, 0));
     } else {
         GGML_ASSERT(stat == cudaSuccess);
     }
@@ -2925,8 +2931,8 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
         return false;
     }

-static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
-    bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) {
+static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, [[maybe_unused]] std::unique_ptr<ggml_cuda_graph> & cuda_graph,
+    ggml_cgraph * cgraph, bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required, cgraph_offset & offset) {
     // flag used to determine whether it is an integrated_gpu
     const bool integrated = ggml_cuda_info().devices[cuda_ctx->device].integrated;

@@ -2935,7 +2941,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
     // With the use of CUDA graphs, the execution will be performed by the graph launch.
     if (!use_cuda_graph || cuda_graph_update_required) {

-        for (int i = 0; i < cgraph->n_nodes; i++) {
+        for (int i = offset.begin; i < offset.end; i++) {
             ggml_tensor * node = cgraph->nodes[i];

             if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
@@ -3034,12 +3040,12 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx

 #ifdef USE_CUDA_GRAPH
     if (use_cuda_graph && cuda_graph_update_required) { // End CUDA graph capture
-        if (cuda_ctx->cuda_graph->graph != nullptr) {
-            CUDA_CHECK(cudaGraphDestroy(cuda_ctx->cuda_graph->graph));
-            cuda_ctx->cuda_graph->graph = nullptr;
+        if (cuda_graph->graph != nullptr) {
+            CUDA_CHECK(cudaGraphDestroy(cuda_graph->graph));
+            cuda_graph->graph = nullptr;
         }

-        CUDA_CHECK(cudaStreamEndCapture(cuda_ctx->stream(), &cuda_ctx->cuda_graph->graph));
+        CUDA_CHECK(cudaStreamEndCapture(cuda_ctx->stream(), &cuda_graph->graph));
         graph_evaluated_or_captured = true; // CUDA graph has been captured

         std::lock_guard<std::mutex> lock(ggml_cuda_lock);
@@ -3052,14 +3058,14 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
     }

     if (use_cuda_graph) {
-        if (cuda_ctx->cuda_graph->instance == nullptr) { // Create executable graph from captured graph.
-            CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
+        if (cuda_graph->instance == nullptr) { // Create executable graph from captured graph.
+            CUDA_CHECK(cudaGraphInstantiate(&cuda_graph->instance, cuda_graph->graph, NULL, NULL, 0));
         }
         if (cuda_graph_update_required) { // Update graph executable
-            update_cuda_graph_executable(cuda_ctx);
+            update_cuda_graph_executable(cuda_graph);
         }
         // Launch graph
-        CUDA_CHECK(cudaGraphLaunch(cuda_ctx->cuda_graph->instance, cuda_ctx->stream()));
+        CUDA_CHECK(cudaGraphLaunch(cuda_graph->instance, cuda_ctx->stream()));
 #else
     graph_evaluated_or_captured = true;
 #endif // USE_CUDA_GRAPH
```
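The launch above is where the overlap announced in the commit message comes from: cudaGraphLaunch is asynchronous with respect to the host, so once one chunk's graph is launched the CPU can immediately start building and capturing the next chunk while the GPU executes the current one on the same stream. Here is a minimal sketch of that pipelining pattern in plain CUDA, not the ggml code; dummy_kernel, enqueue_chunk, and num_chunks are hypothetical stand-ins, and error checking is omitted:

```cpp
#include <cuda_runtime.h>
#include <vector>

__global__ void dummy_kernel() {}

// Stand-in for evaluating one chunk of graph nodes during stream capture.
static void enqueue_chunk(cudaStream_t stream) {
    dummy_kernel<<<1, 1, 0, stream>>>();
}

int main() {
    const int num_chunks = 4;  // hypothetical chunk count
    cudaStream_t stream;
    cudaStreamCreate(&stream);

    std::vector<cudaGraph_t>     graphs(num_chunks);
    std::vector<cudaGraphExec_t> instances(num_chunks);

    for (int i = 0; i < num_chunks; i++) {
        // Capture records work without running it, so while the CPU captures
        // chunk i the GPU is free to execute the graph launched for chunk i-1.
        cudaStreamBeginCapture(stream, cudaStreamCaptureModeRelaxed);
        enqueue_chunk(stream);
        cudaStreamEndCapture(stream, &graphs[i]);
        cudaGraphInstantiate(&instances[i], graphs[i], NULL, NULL, 0);

        // Asynchronous: returns immediately, so the CPU moves on to chunk i+1.
        cudaGraphLaunch(instances[i], stream);
    }

    cudaStreamSynchronize(stream);
    return 0;
}
```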
```diff
@@ -3071,74 +3077,107 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,

     ggml_cuda_set_device(cuda_ctx->device);

+    // Heuristic to minimize GPU idle time. Work is split over several CUDA graphs,
+    // to overlap graph building (CPU) and graph execution (GPU).
+    // The first graphs are small to minimize the time in which the CPU prepares work and the GPU is idle.
+    // After that, graph building (CPU) is done in parallel to the execution of another previously built graph (GPU).
+    int first_graph_subset = 20;
+    int second_graph_subset = 50;
+    int remaining_graph_subset = 100;
+    int remaining_nodes = (cgraph->n_nodes - first_graph_subset) - second_graph_subset;
+    int num_cuda_graphs_required = 2 + (remaining_nodes / remaining_graph_subset);
+    cuda_ctx->cuda_graphs.resize(num_cuda_graphs_required);
+    cgraph_offset offset {0,0};
+
+    for (size_t i = 0; i < cuda_ctx->cuda_graphs.size(); i++) {
+        auto & cuda_graph = cuda_ctx->cuda_graphs[i];
+
+        offset.begin = offset.end;
+        if (i == 0) offset.end += first_graph_subset;
+        if (i == 1) offset.end += second_graph_subset;
+        if (i >= 2) offset.end += remaining_graph_subset;
+
+        // last graph does the rest
+        if ((i + 1) == cuda_ctx->cuda_graphs.size()) offset.end = cgraph->n_nodes;
+
+        // special case for graphs smaller than the ramp-up heuristic
+        if (cgraph->n_nodes <= first_graph_subset + second_graph_subset) {
+            offset.end = cgraph->n_nodes;
+            if (i > 0) break;
+        }
+
+
+
 #ifdef USE_CUDA_GRAPH
-    static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);
+        static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);

-    // Objects required for CUDA Graph
-    if (cuda_ctx->cuda_graph == nullptr) {
-        cuda_ctx->cuda_graph.reset(new ggml_cuda_graph());
-    }
+        // Objects required for CUDA Graph
+        if (cuda_graph == nullptr) {
+            cuda_graph = std::make_unique<ggml_cuda_graph>();
+        }

-    bool use_cuda_graph = true;
-    bool cuda_graph_update_required = false;
+        bool use_cuda_graph = true;
+        bool cuda_graph_update_required = false;

-    if (cuda_ctx->cuda_graph->graph == nullptr) {
-        if (ggml_cuda_info().devices[cuda_ctx->device].cc < GGML_CUDA_CC_AMPERE) {
-            cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
+        if (cuda_graph->graph == nullptr) {
+            if (ggml_cuda_info().devices[cuda_ctx->device].cc < GGML_CUDA_CC_AMPERE) {
+                cuda_graph->disable_due_to_gpu_arch = true;
 #ifndef NDEBUG
-            GGML_LOG_DEBUG("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
+                GGML_LOG_DEBUG("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
 #endif
+            }
         }
-    }

-    // Disable CUDA graphs in presence of env var, old GPU, use-case which is changing too rapidly,
-    // or previous graph capture failure.
-    // Also disable for multi-gpu for now. TO DO investigate
-    if (disable_cuda_graphs_due_to_env
-        || cuda_ctx->cuda_graph->disable_due_to_gpu_arch
-        || cuda_ctx->cuda_graph->disable_due_to_too_many_updates
-        || cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture) {
-        use_cuda_graph = false;
-    }
+        // Disable CUDA graphs in presence of env var, old GPU, use-case which is changing too rapidly,
+        // or previous graph capture failure.
+        // Also disable for multi-gpu for now. TO DO investigate
+        if (disable_cuda_graphs_due_to_env
+            || cuda_graph->disable_due_to_gpu_arch
+            || cuda_graph->disable_due_to_too_many_updates
+            || cuda_graph->disable_due_to_failed_graph_capture) {
+            use_cuda_graph = false;
+        }

-    if (use_cuda_graph) {
-        cuda_graph_update_required = is_cuda_graph_update_required(cuda_ctx, cgraph);
+        if (use_cuda_graph) {
+            cuda_graph_update_required = is_cuda_graph_update_required(cuda_graph, cgraph, offset);

-        use_cuda_graph = check_node_graph_compatibility(cgraph, use_cuda_graph);
+            use_cuda_graph = check_node_graph_compatibility(cgraph, use_cuda_graph, offset);

-        // Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
-        if (use_cuda_graph && cuda_graph_update_required) {
-            cuda_ctx->cuda_graph->number_consecutive_updates++;
-        } else {
-            cuda_ctx->cuda_graph->number_consecutive_updates = 0;
-        }
+            // Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
+            if (use_cuda_graph && cuda_graph_update_required) {
+                cuda_graph->number_consecutive_updates++;
+            } else {
+                cuda_graph->number_consecutive_updates = 0;
+            }

-        if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
-            cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
+            if (cuda_graph->number_consecutive_updates >= 4) {
+                cuda_graph->disable_due_to_too_many_updates = true;
 #ifndef NDEBUG
-            GGML_LOG_DEBUG("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
+                GGML_LOG_DEBUG("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
 #endif
+            }
         }
-    }

-    if (use_cuda_graph && cuda_graph_update_required) {
-        // Start CUDA graph capture
-        {
-            std::lock_guard<std::mutex> lock(ggml_cuda_lock);
-            ggml_cuda_lock_counter.fetch_add(1, std::memory_order_relaxed);
-        }
+        if (use_cuda_graph && cuda_graph_update_required) {
+            // Start CUDA graph capture
+            {
+                std::lock_guard<std::mutex> lock(ggml_cuda_lock);
+                ggml_cuda_lock_counter.fetch_add(1, std::memory_order_relaxed);
+            }

-        CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));
-    }
+            CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));
+        }

 #else
-    bool use_cuda_graph = false;
-    bool cuda_graph_update_required = false;
+        bool use_cuda_graph = false;
+        bool cuda_graph_update_required = false;
 #endif // USE_CUDA_GRAPH

-    bool graph_evaluated_or_captured = false;
+        bool graph_evaluated_or_captured = false;

-    evaluate_and_capture_cuda_graph(cuda_ctx, cgraph, graph_evaluated_or_captured, use_cuda_graph, cuda_graph_update_required);
+        evaluate_and_capture_cuda_graph(cuda_ctx, cuda_graph, cgraph,
+            graph_evaluated_or_captured, use_cuda_graph, cuda_graph_update_required, offset);
+    }

     return GGML_STATUS_SUCCESS;
 }
```
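To make the ramp-up concrete, here is a standalone sketch of the same partitioning arithmetic for a hypothetical 435-node graph (the small-graph special case is left out; the 20/50/100 constants are the ones hard-coded above):

```cpp
#include <cstdio>

int main() {
    const int n_nodes = 435;  // hypothetical cgraph->n_nodes
    const int first_graph_subset     = 20;
    const int second_graph_subset    = 50;
    const int remaining_graph_subset = 100;

    const int remaining_nodes = (n_nodes - first_graph_subset) - second_graph_subset;
    // Integer division: any remainder is folded into the last graph.
    const int num_cuda_graphs_required = 2 + (remaining_nodes / remaining_graph_subset);

    int begin = 0;
    int end   = 0;
    for (int i = 0; i < num_cuda_graphs_required; i++) {
        begin = end;
        if (i == 0) end += first_graph_subset;
        if (i == 1) end += second_graph_subset;
        if (i >= 2) end += remaining_graph_subset;
        if (i + 1 == num_cuda_graphs_required) end = n_nodes;  // last graph does the rest
        std::printf("graph %d: nodes [%d, %d)\n", i, begin, end);
    }
    // Prints: [0, 20), [20, 70), [70, 170), [170, 270), [270, 435)
    return 0;
}
```

The two small leading graphs get the GPU busy quickly; after that, each roughly 100-node capture on the CPU runs while the previously launched graph is still executing.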
```diff
@@ -3896,6 +3935,8 @@ ggml_backend_t ggml_backend_cuda_init(int device) {
         /* .context = */ ctx,
     };

+    cublasHandle_t cublas_handle = ctx->cublas_handle(device);
+
     return cuda_backend;
 }
```

ggml/src/ggml-hip/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
```diff
@@ -48,6 +48,9 @@ endif()

 message(STATUS "HIP and hipBLAS found")

+message(STATUS "hipBLAS version: ${hipblas_VERSION}")
+
+
 # Workaround old compilers
 set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} --gpu-max-threads-per-block=1024")
```
