
Commit 71faa94

Revert "Reapply test " Akieslinger/reduce cuda graph cpu overhead #332""
This reverts commit 1ae6694.
1 parent 1ae6694 commit 71faa94

File tree

2 files changed (+42, -203 lines)

ggml/src/ggml-cuda/common.cuh

Lines changed: 1 addition & 1 deletion
@@ -958,7 +958,7 @@ struct ggml_backend_cuda_context {
     cudaStream_t streams[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { { nullptr } };
     cublasHandle_t cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};

-    std::vector<std::unique_ptr<ggml_cuda_graph>> cuda_graphs;
+    std::unique_ptr<ggml_cuda_graph> cuda_graph;

     explicit ggml_backend_cuda_context(int device) :
         device(device),
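Note: the change above reverts the per-chunk graph vector (cuda_graphs) back to a single lazily created graph per backend context. A minimal sketch of the restored lifecycle, with a hypothetical Graph/Context pair standing in for ggml_cuda_graph and ggml_backend_cuda_context:

    #include <cuda_runtime.h>
    #include <memory>

    // Sketch only: 'Graph' stands in for ggml_cuda_graph, 'Context' for ggml_backend_cuda_context.
    struct Graph {
        cudaGraph_t     graph    = nullptr;  // captured graph
        cudaGraphExec_t instance = nullptr;  // executable instantiation
    };

    struct Context {
        std::unique_ptr<Graph> cuda_graph;   // one graph per context after the revert

        Graph & graph() {
            if (!cuda_graph) {
                cuda_graph = std::make_unique<Graph>();  // created on first compute
            }
            return *cuda_graph;
        }
    };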

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 41 additions & 202 deletions
@@ -2814,22 +2814,14 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
     GGML_UNUSED(backend);
 }

-// groups cgraph->nodes offsets per cuda_graph
-struct cgraph_offset {
-    int begin;
-    int end;
-};
-
 #ifdef USE_CUDA_GRAPH
-
-static bool check_node_graph_compatibility_and_refresh_copy_ops(std::unique_ptr<ggml_cuda_graph> & cuda_graph, ggml_cgraph * cgraph,
-    bool use_cuda_graph, cgraph_offset & offset) {
+static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
+    bool use_cuda_graph) {

     // Loop over nodes in GGML graph to obtain info needed for CUDA graph
-    cuda_graph->cpy_dest_ptrs.clear();
-
-    for (int i = offset.begin; i < offset.end; i++) {
+    cuda_ctx->cuda_graph->cpy_dest_ptrs.clear();
+
+    for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];

         if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
@@ -2872,7 +2864,7 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(std::unique_ptr<

             // Store the pointers which are updated for each token, such that these can be sent
             // to the device and accessed using indirection from CUDA graph
-            cuda_graph->cpy_dest_ptrs.push_back((char *) node->src[1]->data);
+            cuda_ctx->cuda_graph->cpy_dest_ptrs.push_back((char *) node->src[1]->data);

             // store a pointer to each copy op CUDA kernel to identify it later
             void * ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]);
@@ -2949,108 +2941,47 @@ static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_gra
     return true;
 }

-<<<<<<< HEAD
-static bool is_cuda_graph_update_required(ggml_cuda_graph> & cuda_graph, ggml_cgraph * cgraph) {
-=======
-static void maintain_cuda_graph(std::unique_ptr<ggml_cuda_graph> & cuda_graph, std::vector<void *> & ggml_cuda_cpy_fn_ptrs,
-    bool cuda_graph_update_required) {
-
-    if (cuda_graph_update_required) {
-        // Extract nodes from graph
-        // First call with null argument gets number of nodes in graph
-        CUDA_CHECK(cudaGraphGetNodes(cuda_graph->graph, nullptr, &cuda_graph->num_nodes));
-        // Subsequent call with non-null argument gets nodes
-        cuda_graph->nodes.clear();
-        cuda_graph->nodes.resize(cuda_graph->num_nodes);
-        cuda_graph->params.clear();
-        cuda_graph->params.resize(cuda_graph->num_nodes);
-        if (cuda_graph->num_nodes > 0) {
-            CUDA_CHECK(cudaGraphGetNodes(cuda_graph->graph, cuda_graph->nodes.data(), &cuda_graph->num_nodes));
-
-            // Loop over nodes, and extract kernel parameters from each node
-            for (size_t i = 0; i < cuda_graph->num_nodes; i++) {
-                cudaGraphNodeType node_type;
-                CUDA_CHECK(cudaGraphNodeGetType(cuda_graph->nodes[i], &node_type));
-                if (node_type == cudaGraphNodeTypeKernel) {
-                    // Get params using runtime
-                    cudaError_t stat = cudaGraphKernelNodeGetParams(cuda_graph->nodes[i], &cuda_graph->params[i]);
-                    if (stat == cudaErrorInvalidDeviceFunction) {
-                        // Fails due to incorrect handling by CUDA runtime of CUDA BLAS node.
-                        // We don't need to update blas nodes, so clear error and move on.
-                        (void)cudaGetLastError();
-                    } else {
-                        GGML_ASSERT(stat == cudaSuccess);
-                    }
-                }
-            }
-        }
-    } else {
-        // One of the arguments to the copy kernel is updated for each token, hence we need to
-        // replace that argument with the updated value in the CUDA graph
-        // on update steps, the live parameters will already be captured
-        int k = 0;
-        for (size_t i = 0; i < cuda_graph->num_nodes; i++) {
-            if(count(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), cuda_graph->params[i].func) > 0) {
-                char ** updated_kernel_arg_ptr = cuda_graph->updated_kernel_arg.at(k++);
-                cuda_graph->params[i].kernelParams[1] = updated_kernel_arg_ptr;
-                CUDA_CHECK(cudaGraphKernelNodeSetParams(cuda_graph->nodes[i], &cuda_graph->params[i]));
-            }
-        }
-    }
-}
-
-static bool is_cuda_graph_update_required(std::unique_ptr<ggml_cuda_graph> & cuda_graph, ggml_cgraph * cgraph,
-    cgraph_offset & offset) {
->>>>>>> parent of 7eea329f6 (Revert " Akieslinger/reduce cuda graph cpu overhead #332")
+static bool is_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph) {

     bool cuda_graph_update_required = false;

-    if (cuda_graph->instance == nullptr) {
+    if (cuda_ctx->cuda_graph->instance == nullptr) {
         cuda_graph_update_required = true;
     }

     // Check if the graph size has changed
-    if (cuda_graph->ggml_graph_properties.size() != (size_t)(offset.end - offset.begin)) {
+    if (cuda_ctx->cuda_graph->ggml_graph_properties.size() != (size_t)cgraph->n_nodes) {
         cuda_graph_update_required = true;
-        cuda_graph->ggml_graph_properties.resize((offset.end - offset.begin));
+        cuda_ctx->cuda_graph->ggml_graph_properties.resize(cgraph->n_nodes);
     }

     // Loop over nodes in GGML graph to determine if CUDA graph update is required
     // and store properties to allow this comparison for the next token
-    for (int i = offset.begin; i < offset.end; i++) {
+    for (int i = 0; i < cgraph->n_nodes; i++) {
         bool has_matching_properties = true;
         if (!cuda_graph_update_required) {
-            has_matching_properties = ggml_graph_node_has_matching_properties(cgraph->nodes[i], &cuda_graph->ggml_graph_properties[i - offset.begin]);
+            has_matching_properties = ggml_graph_node_has_matching_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]);
         }
         if (!has_matching_properties) {
             cuda_graph_update_required = true;
         }
-        set_ggml_graph_node_properties(cgraph->nodes[i], &cuda_graph->ggml_graph_properties[i - offset.begin]);
+        set_ggml_graph_node_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]);
     }

     return cuda_graph_update_required;
 }

-static void update_cuda_graph_executable(std::unique_ptr<ggml_cuda_graph> & cuda_graph) {
+static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) {

 #if CUDART_VERSION >= 12000
     cudaGraphExecUpdateResultInfo result_info;
-<<<<<<< HEAD
     cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
 #else
     cudaGraphNode_t errorNode;
     cudaGraphExecUpdateResult result_info;
     cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &errorNode, &result_info);
 #endif // CUDART_VERSION >= 12000

-=======
-#ifdef __HIP_PLATFORM_AMD__
-    hipGraphNode_t errorNode;
-    hipError_t stat = hipGraphExecUpdate(cuda_graph->instance, cuda_graph->graph, &errorNode, &result_info);
-#else
-    cudaError_t stat = cudaGraphExecUpdate(cuda_graph->instance, cuda_graph->graph, &result_info);
-#endif
->>>>>>> parent of 7eea329f6 (Revert " Akieslinger/reduce cuda graph cpu overhead #332")
     if (stat == cudaErrorGraphExecUpdateFailure) {
 #ifndef NDEBUG
         GGML_LOG_DEBUG("%s: CUDA graph update failed\n", __func__);
@@ -3059,31 +2990,25 @@ static void update_cuda_graph_executable(std::unique_ptr<ggml_cuda_graph> & cuda
         // The pre-existing graph exec cannot be updated due to violated constraints
         // so instead clear error and re-instantiate
         (void)cudaGetLastError();
-        CUDA_CHECK(cudaGraphExecDestroy(cuda_graph->instance));
-        cuda_graph->instance = nullptr;
-        CUDA_CHECK(cudaGraphInstantiate(&cuda_graph->instance, cuda_graph->graph, NULL, NULL, 0));
+        CUDA_CHECK(cudaGraphExecDestroy(cuda_ctx->cuda_graph->instance));
+        cuda_ctx->cuda_graph->instance = nullptr;
+        CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
     } else {
         GGML_ASSERT(stat == cudaSuccess);
     }
 }
 #endif

-<<<<<<< HEAD
 static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
     bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) {
     // flag used to determine whether it is an integrated_gpu
     const bool integrated = ggml_cuda_info().devices[cuda_ctx->device].integrated;
-=======
-static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, [[maybe_unused]] std::unique_ptr<ggml_cuda_graph> & cuda_graph,
-    ggml_cgraph * cgraph, [[maybe_unused]] std::vector<void *> & ggml_cuda_cpy_fn_ptrs,
-    bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required, cgraph_offset & offset) {
->>>>>>> parent of 7eea329f6 (Revert " Akieslinger/reduce cuda graph cpu overhead #332")

     while (!graph_evaluated_or_captured) {
         // Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph.
         // With the use of CUDA graphs, the execution will be performed by the graph launch.
         if (!use_cuda_graph || cuda_graph_update_required) {
-            for (int i = offset.begin; i < offset.end; i++) {
+            for (int i = 0; i < cgraph->n_nodes; i++) {
                 ggml_tensor * node = cgraph->nodes[i];

                 if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
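Note: the surviving update_cuda_graph_executable keeps the usual try-update-then-reinstantiate fallback. A condensed sketch, assuming the CUDA 12 cudaGraphExecUpdate signature and eliding the debug logging:

    #include <cuda_runtime.h>

    // Sketch only: prefer a cheap in-place exec update; rebuild the executable if topology changed.
    static void refresh_graph_exec(cudaGraph_t graph, cudaGraphExec_t & instance) {
        cudaGraphExecUpdateResultInfo result_info;
        cudaError_t stat = cudaGraphExecUpdate(instance, graph, &result_info);
        if (stat == cudaErrorGraphExecUpdateFailure) {
            // Constraints were violated (e.g. node set changed):
            // clear the sticky error and re-instantiate from scratch.
            (void)cudaGetLastError();
            cudaGraphExecDestroy(instance);
            instance = nullptr;
            cudaGraphInstantiate(&instance, graph, NULL, NULL, 0);
        }
    }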
@@ -3113,12 +3038,12 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx

 #ifdef USE_CUDA_GRAPH
     if (use_cuda_graph && cuda_graph_update_required) { // End CUDA graph capture
-        if (cuda_graph->graph != nullptr) {
-            CUDA_CHECK(cudaGraphDestroy(cuda_graph->graph));
-            cuda_graph->graph = nullptr;
+        if (cuda_ctx->cuda_graph->graph != nullptr) {
+            CUDA_CHECK(cudaGraphDestroy(cuda_ctx->cuda_graph->graph));
+            cuda_ctx->cuda_graph->graph = nullptr;
         }

-        CUDA_CHECK(cudaStreamEndCapture(cuda_ctx->stream(), &cuda_graph->graph));
+        CUDA_CHECK(cudaStreamEndCapture(cuda_ctx->stream(), &cuda_ctx->cuda_graph->graph));
         graph_evaluated_or_captured = true; // CUDA graph has been captured

         std::lock_guard<std::mutex> lock(ggml_cuda_lock);
@@ -3131,24 +3056,14 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
     }

     if (use_cuda_graph) {
-        if (cuda_graph->instance == nullptr) { // Create executable graph from captured graph.
-            CUDA_CHECK(cudaGraphInstantiate(&cuda_graph->instance, cuda_graph->graph, NULL, NULL, 0));
+        if (cuda_ctx->cuda_graph->instance == nullptr) { // Create executable graph from captured graph.
+            CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
         }
-<<<<<<< HEAD
         if (cuda_graph_update_required) { // Update graph executable
             update_cuda_graph_executable(cuda_ctx);
         }
-=======
-
-        // Perform update to graph (if required for this token), and change copy parameter (required for every token)
-        maintain_cuda_graph(cuda_graph, ggml_cuda_cpy_fn_ptrs, cuda_graph_update_required);
-
-        // Update graph executable
-        update_cuda_graph_executable(cuda_graph);
-
->>>>>>> parent of 7eea329f6 (Revert " Akieslinger/reduce cuda graph cpu overhead #332")
         // Launch graph
-        CUDA_CHECK(cudaGraphLaunch(cuda_graph->instance, cuda_ctx->stream()));
+        CUDA_CHECK(cudaGraphLaunch(cuda_ctx->cuda_graph->instance, cuda_ctx->stream()));
 #else
     graph_evaluated_or_captured = true;
 #endif // USE_CUDA_GRAPH
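Note: taken together, the capture path above is the standard CUDA Graphs replay loop. A minimal sketch, with a caller-supplied record_work callback standing in for the node evaluation loop and error checks elided:

    #include <cuda_runtime.h>

    // Sketch only: capture once (or on update), instantiate once, then replay cheaply.
    static void run_via_graph(cudaStream_t stream, cudaGraph_t & graph, cudaGraphExec_t & instance,
                              bool update_required, void (*record_work)(cudaStream_t)) {
        if (update_required) {
            if (graph != nullptr) {
                cudaGraphDestroy(graph);                 // drop the stale capture
                graph = nullptr;
            }
            cudaStreamBeginCapture(stream, cudaStreamCaptureModeRelaxed);
            record_work(stream);                         // kernels are recorded, not executed
            cudaStreamEndCapture(stream, &graph);
        }
        if (instance == nullptr) {
            cudaGraphInstantiate(&instance, graph, NULL, NULL, 0);
        } else if (update_required) {
            // in the real code: update_cuda_graph_executable(cuda_ctx)
        }
        cudaGraphLaunch(instance, stream);               // replay the whole graph in one launch
    }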
@@ -3160,60 +3075,23 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,

     ggml_cuda_set_device(cuda_ctx->device);

-<<<<<<< HEAD
-=======
-    // vector of pointers to CUDA cpy kernels, which are required to identify
-    // kernel parameters which need updated in the graph for each token
-    std::vector<void *> ggml_cuda_cpy_fn_ptrs;
-
-    // Heuristic to minimize GPU idle time. Work is split over several CUDA graphs,
-    // to overlap graph building (CPU) and graph execution (GPU).
-    // The first graphs are small to minimize the time in which the CPU prepares work and the GPU is idle.
-    // After that, graph building (CPU) is done in parallel to the execution of another previously built graph (GPU).
-    int first_graph_subset = 20;
-    int second_graph_subset = 50;
-    int remaining_graph_subset = 100;
-    int remaining_nodes = (cgraph->n_nodes - first_graph_subset) - second_graph_subset;
-    int num_cuda_graphs_required = 2 + (remaining_nodes / remaining_graph_subset);
-    cuda_ctx->cuda_graphs.resize(num_cuda_graphs_required);
-    cgraph_offset offset {0,0};
-
-    for (size_t i = 0; i < cuda_ctx->cuda_graphs.size(); i++) {
-        auto & cuda_graph = cuda_ctx->cuda_graphs[i];
-
-        offset.begin = offset.end;
-        if (i == 0) offset.end += first_graph_subset;
-        if (i == 1) offset.end += second_graph_subset;
-        if (i >= 2) offset.end += remaining_graph_subset;
-
-        // last graph does the rest
-        if ((i + 1) == cuda_ctx->cuda_graphs.size()) offset.end = cgraph->n_nodes;
-
-        // special case for graphs smaller than the ramp-up heuristic
-        if (cgraph->n_nodes <= first_graph_subset + second_graph_subset) {
-            offset.end = cgraph->n_nodes;
-            if (i > 0) break;
-        }
-
->>>>>>> parent of 7eea329f6 (Revert " Akieslinger/reduce cuda graph cpu overhead #332")
 #ifdef USE_CUDA_GRAPH
-        static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);
+    static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);

-        // Objects required for CUDA Graph
-        if (cuda_graph == nullptr) {
-            cuda_graph = std::make_unique<ggml_cuda_graph>();
-        }
+    // Objects required for CUDA Graph
+    if (cuda_ctx->cuda_graph == nullptr) {
+        cuda_ctx->cuda_graph.reset(new ggml_cuda_graph());
+    }

-        bool use_cuda_graph = true;
-        bool cuda_graph_update_required = false;
+    bool use_cuda_graph = true;
+    bool cuda_graph_update_required = false;

-        if (cuda_graph->graph == nullptr) {
-            if (ggml_cuda_info().devices[cuda_ctx->device].cc < GGML_CUDA_CC_AMPERE) {
-                cuda_graph->disable_due_to_gpu_arch = true;
+    if (cuda_ctx->cuda_graph->graph == nullptr) {
+        if (ggml_cuda_info().devices[cuda_ctx->device].cc < GGML_CUDA_CC_AMPERE) {
+            cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
 #ifndef NDEBUG
-                GGML_LOG_DEBUG("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
+            GGML_LOG_DEBUG("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
 #endif
-<<<<<<< HEAD
         }
     }

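Note: the ramp-up heuristic removed above is worth spelling out: small chunks first so the GPU starts working while the CPU is still capturing, then 100-node chunks so capture and execution overlap. A standalone sketch of the offset computation (split_nodes is a hypothetical helper, not part of the source):

    #include <utility>
    #include <vector>

    // Sketch only: chunk sizes 20, 50, then 100; the last chunk takes whatever remains.
    static std::vector<std::pair<int, int>> split_nodes(int n_nodes) {
        const int first = 20, second = 50, rest = 100;
        std::vector<std::pair<int, int>> chunks;
        if (n_nodes <= first + second) {                  // small graphs are not split at all
            chunks.push_back({0, n_nodes});
            return chunks;
        }
        const int n_graphs = 2 + ((n_nodes - first) - second) / rest;
        int begin = 0;
        for (int i = 0; i < n_graphs; i++) {
            int end = begin + (i == 0 ? first : (i == 1 ? second : rest));
            if (i + 1 == n_graphs) {
                end = n_nodes;                            // last graph does the rest
            }
            chunks.push_back({begin, end});
            begin = end;
        }
        return chunks;
    }

For a hypothetical 470-node cgraph this yields {0,20}, {20,70}, {70,170}, {170,270}, {270,370}, {370,470}: six graphs, matching num_cuda_graphs_required = 2 + 400/100.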
@@ -3237,43 +3115,16 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
             cuda_ctx->cuda_graph->number_consecutive_updates++;
         } else {
             cuda_ctx->cuda_graph->number_consecutive_updates = 0;
-=======
-        }
->>>>>>> parent of 7eea329f6 (Revert " Akieslinger/reduce cuda graph cpu overhead #332")
-    }
-
-    // Disable CUDA graphs in presence of env var, old GPU, use-case which is changing too rapidly,
-    // or previous graph capture failure.
-    // Also disable for multi-gpu for now. TO DO investigate
-    if (disable_cuda_graphs_due_to_env
-        || cuda_graph->disable_due_to_gpu_arch
-        || cuda_graph->disable_due_to_too_many_updates
-        || cuda_graph->disable_due_to_failed_graph_capture) {
-        use_cuda_graph = false;
     }

-    if (use_cuda_graph) {
-        cuda_graph_update_required = is_cuda_graph_update_required(cuda_graph, cgraph, offset);
-
-        use_cuda_graph = check_node_graph_compatibility_and_refresh_copy_ops(cuda_graph, cgraph,
-            ggml_cuda_cpy_fn_ptrs, use_cuda_graph, offset);
-
-        // Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
-        if (use_cuda_graph && cuda_graph_update_required) {
-            cuda_graph->number_consecutive_updates++;
-        } else {
-            cuda_graph->number_consecutive_updates = 0;
-        }
-
-        if (cuda_graph->number_consecutive_updates >= 4) {
-            cuda_graph->disable_due_to_too_many_updates = true;
+        if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
+            cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
 #ifndef NDEBUG
-            GGML_LOG_DEBUG("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
+            GGML_LOG_DEBUG("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
 #endif
-        }
     }
+    }

-<<<<<<< HEAD
     if (use_cuda_graph && cuda_graph_update_required) {
         // Start CUDA graph capture
         {
@@ -3283,32 +3134,20 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,

             CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));
         }
-=======
-        if (use_cuda_graph && cuda_graph_update_required) { // Start CUDA graph capture
-            CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));
-        }
->>>>>>> parent of 7eea329f6 (Revert " Akieslinger/reduce cuda graph cpu overhead #332")

     if (!use_cuda_graph) {
         cuda_ctx->cuda_graph->use_cpy_indirection = false;
     }

 #else
-        bool use_cuda_graph = false;
-        bool cuda_graph_update_required = false;
+    bool use_cuda_graph = false;
+    bool cuda_graph_update_required = false;
 #endif // USE_CUDA_GRAPH

-<<<<<<< HEAD
     bool graph_evaluated_or_captured = false;

     evaluate_and_capture_cuda_graph(cuda_ctx, cgraph, graph_evaluated_or_captured, use_cuda_graph, cuda_graph_update_required);
-=======
-    bool graph_evaluated_or_captured = false;
->>>>>>> parent of 7eea329f6 (Revert " Akieslinger/reduce cuda graph cpu overhead #332")

-    evaluate_and_capture_cuda_graph(cuda_ctx, cuda_graph, cgraph, ggml_cuda_cpy_fn_ptrs,
-        graph_evaluated_or_captured, use_cuda_graph, cuda_graph_update_required, offset);
-    }
     return GGML_STATUS_SUCCESS;
 }
