@@ -2814,22 +2814,14 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
     GGML_UNUSED(backend);
 }
 
-// groups cgraph->nodes offsets per cuda_graph
-struct cgraph_offset {
-    int begin;
-    int end;
-};
-
 #ifdef USE_CUDA_GRAPH
-
-static bool check_node_graph_compatibility_and_refresh_copy_ops(std::unique_ptr<ggml_cuda_graph> & cuda_graph, ggml_cgraph * cgraph,
-    bool use_cuda_graph, cgraph_offset & offset) {
+static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
+    bool use_cuda_graph) {
 
     // Loop over nodes in GGML graph to obtain info needed for CUDA graph
-    cuda_graph->cpy_dest_ptrs.clear();
-
-    for (int i = offset.begin; i < offset.end; i++) {
+    cuda_ctx->cuda_graph->cpy_dest_ptrs.clear();
 
+    for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
 
         if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
@@ -2872,7 +2864,7 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(std::unique_ptr<
 
             // Store the pointers which are updated for each token, such that these can be sent
             // to the device and accessed using indirection from CUDA graph
-            cuda_graph->cpy_dest_ptrs.push_back((char *) node->src[1]->data);
+            cuda_ctx->cuda_graph->cpy_dest_ptrs.push_back((char *) node->src[1]->data);
 
             // store a pointer to each copy op CUDA kernel to identify it later
             void * ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]);
@@ -2949,108 +2941,47 @@ static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_gra
     return true;
 }
 
-<<<<<<< HEAD
-static bool is_cuda_graph_update_required(std::unique_ptr<ggml_cuda_graph> & cuda_graph, ggml_cgraph * cgraph) {
-=======
-static void maintain_cuda_graph(std::unique_ptr<ggml_cuda_graph> & cuda_graph, std::vector<void *> & ggml_cuda_cpy_fn_ptrs,
-    bool cuda_graph_update_required) {
-
-    if (cuda_graph_update_required) {
-        // Extract nodes from graph
-        // First call with null argument gets number of nodes in graph
-        CUDA_CHECK(cudaGraphGetNodes(cuda_graph->graph, nullptr, &cuda_graph->num_nodes));
-        // Subsequent call with non-null argument gets nodes
-        cuda_graph->nodes.clear();
-        cuda_graph->nodes.resize(cuda_graph->num_nodes);
-        cuda_graph->params.clear();
-        cuda_graph->params.resize(cuda_graph->num_nodes);
-        if (cuda_graph->num_nodes > 0) {
-            CUDA_CHECK(cudaGraphGetNodes(cuda_graph->graph, cuda_graph->nodes.data(), &cuda_graph->num_nodes));
-
-            // Loop over nodes, and extract kernel parameters from each node
-            for (size_t i = 0; i < cuda_graph->num_nodes; i++) {
-                cudaGraphNodeType node_type;
-                CUDA_CHECK(cudaGraphNodeGetType(cuda_graph->nodes[i], &node_type));
-                if (node_type == cudaGraphNodeTypeKernel) {
-                    // Get params using runtime
-                    cudaError_t stat = cudaGraphKernelNodeGetParams(cuda_graph->nodes[i], &cuda_graph->params[i]);
-                    if (stat == cudaErrorInvalidDeviceFunction) {
-                        // Fails due to incorrect handling by CUDA runtime of CUDA BLAS node.
-                        // We don't need to update blas nodes, so clear error and move on.
-                        (void)cudaGetLastError();
-                    } else {
-                        GGML_ASSERT(stat == cudaSuccess);
-                    }
-                }
-            }
-        }
-    } else {
-        // One of the arguments to the copy kernel is updated for each token, hence we need to
-        // replace that argument with the updated value in the CUDA graph
-        // on update steps, the live parameters will already be captured
-        int k = 0;
-        for (size_t i = 0; i < cuda_graph->num_nodes; i++) {
-            if (count(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), cuda_graph->params[i].func) > 0) {
-                char ** updated_kernel_arg_ptr = cuda_graph->updated_kernel_arg.at(k++);
-                cuda_graph->params[i].kernelParams[1] = updated_kernel_arg_ptr;
-                CUDA_CHECK(cudaGraphKernelNodeSetParams(cuda_graph->nodes[i], &cuda_graph->params[i]));
-            }
-        }
-    }
-}
-
-static bool is_cuda_graph_update_required(std::unique_ptr<ggml_cuda_graph> & cuda_graph, ggml_cgraph * cgraph,
-    cgraph_offset & offset) {
->>>>>>> parent of 7eea329f6 (Revert "Akieslinger/reduce cuda graph cpu overhead #332")
+static bool is_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph) {
 
     bool cuda_graph_update_required = false;
 
-    if (cuda_graph->instance == nullptr) {
+    if (cuda_ctx->cuda_graph->instance == nullptr) {
         cuda_graph_update_required = true;
     }
 
     // Check if the graph size has changed
-    if (cuda_graph->ggml_graph_properties.size() != (size_t)(offset.end - offset.begin)) {
+    if (cuda_ctx->cuda_graph->ggml_graph_properties.size() != (size_t)cgraph->n_nodes) {
         cuda_graph_update_required = true;
-        cuda_graph->ggml_graph_properties.resize((offset.end - offset.begin));
+        cuda_ctx->cuda_graph->ggml_graph_properties.resize(cgraph->n_nodes);
     }
 
     // Loop over nodes in GGML graph to determine if CUDA graph update is required
     // and store properties to allow this comparison for the next token
-    for (int i = offset.begin; i < offset.end; i++) {
+    for (int i = 0; i < cgraph->n_nodes; i++) {
         bool has_matching_properties = true;
         if (!cuda_graph_update_required) {
-            has_matching_properties = ggml_graph_node_has_matching_properties(cgraph->nodes[i], &cuda_graph->ggml_graph_properties[i - offset.begin]);
+            has_matching_properties = ggml_graph_node_has_matching_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]);
         }
         if (!has_matching_properties) {
             cuda_graph_update_required = true;
         }
-        set_ggml_graph_node_properties(cgraph->nodes[i], &cuda_graph->ggml_graph_properties[i - offset.begin]);
+        set_ggml_graph_node_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]);
     }
 
     return cuda_graph_update_required;
 }
 
-static void update_cuda_graph_executable(std::unique_ptr<ggml_cuda_graph> & cuda_graph) {
+static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) {
 
 #if CUDART_VERSION >= 12000
     cudaGraphExecUpdateResultInfo result_info;
-<<<<<<< HEAD
     cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
 #else
     cudaGraphNode_t errorNode;
     cudaGraphExecUpdateResult result_info;
     cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &errorNode, &result_info);
 #endif // CUDART_VERSION >= 12000
 
-=======
-#ifdef __HIP_PLATFORM_AMD__
-    hipGraphNode_t errorNode;
-    hipError_t stat = hipGraphExecUpdate(cuda_graph->instance, cuda_graph->graph, &errorNode, &result_info);
-#else
-    cudaError_t stat = cudaGraphExecUpdate(cuda_graph->instance, cuda_graph->graph, &result_info);
-#endif
->>>>>>> parent of 7eea329f6 (Revert "Akieslinger/reduce cuda graph cpu overhead #332")
     if (stat == cudaErrorGraphExecUpdateFailure) {
 #ifndef NDEBUG
         GGML_LOG_DEBUG("%s: CUDA graph update failed\n", __func__);
@@ -3059,31 +2990,25 @@ static void update_cuda_graph_executable(std::unique_ptr<ggml_cuda_graph> & cuda
         // The pre-existing graph exec cannot be updated due to violated constraints
         // so instead clear error and re-instantiate
         (void)cudaGetLastError();
-        CUDA_CHECK(cudaGraphExecDestroy(cuda_graph->instance));
-        cuda_graph->instance = nullptr;
-        CUDA_CHECK(cudaGraphInstantiate(&cuda_graph->instance, cuda_graph->graph, NULL, NULL, 0));
+        CUDA_CHECK(cudaGraphExecDestroy(cuda_ctx->cuda_graph->instance));
+        cuda_ctx->cuda_graph->instance = nullptr;
+        CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
     } else {
         GGML_ASSERT(stat == cudaSuccess);
     }
 }
 #endif
 
-<<<<<<< HEAD
 static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
     bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) {
     // flag used to determine whether it is an integrated_gpu
     const bool integrated = ggml_cuda_info().devices[cuda_ctx->device].integrated;
-=======
-static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, [[maybe_unused]] std::unique_ptr<ggml_cuda_graph> & cuda_graph,
-    ggml_cgraph * cgraph, [[maybe_unused]] std::vector<void *> & ggml_cuda_cpy_fn_ptrs,
-    bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required, cgraph_offset & offset) {
->>>>>>> parent of 7eea329f6 (Revert "Akieslinger/reduce cuda graph cpu overhead #332")
 
     while (!graph_evaluated_or_captured) {
         // Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph.
         // With the use of CUDA graphs, the execution will be performed by the graph launch.
         if (!use_cuda_graph || cuda_graph_update_required) {
-            for (int i = offset.begin; i < offset.end; i++) {
+            for (int i = 0; i < cgraph->n_nodes; i++) {
                 ggml_tensor * node = cgraph->nodes[i];
 
                 if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
@@ -3113,12 +3038,12 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
 
 #ifdef USE_CUDA_GRAPH
     if (use_cuda_graph && cuda_graph_update_required) { // End CUDA graph capture
-        if (cuda_graph->graph != nullptr) {
-            CUDA_CHECK(cudaGraphDestroy(cuda_graph->graph));
-            cuda_graph->graph = nullptr;
+        if (cuda_ctx->cuda_graph->graph != nullptr) {
+            CUDA_CHECK(cudaGraphDestroy(cuda_ctx->cuda_graph->graph));
+            cuda_ctx->cuda_graph->graph = nullptr;
         }
 
-        CUDA_CHECK(cudaStreamEndCapture(cuda_ctx->stream(), &cuda_graph->graph));
+        CUDA_CHECK(cudaStreamEndCapture(cuda_ctx->stream(), &cuda_ctx->cuda_graph->graph));
         graph_evaluated_or_captured = true; // CUDA graph has been captured
 
         std::lock_guard<std::mutex> lock(ggml_cuda_lock);
@@ -3131,24 +3056,14 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
     }
 
     if (use_cuda_graph) {
-        if (cuda_graph->instance == nullptr) { // Create executable graph from captured graph.
-            CUDA_CHECK(cudaGraphInstantiate(&cuda_graph->instance, cuda_graph->graph, NULL, NULL, 0));
+        if (cuda_ctx->cuda_graph->instance == nullptr) { // Create executable graph from captured graph.
+            CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
         }
-<<<<<<< HEAD
         if (cuda_graph_update_required) { // Update graph executable
             update_cuda_graph_executable(cuda_ctx);
         }
-=======
-
-        // Perform update to graph (if required for this token), and change copy parameter (required for every token)
-        maintain_cuda_graph(cuda_graph, ggml_cuda_cpy_fn_ptrs, cuda_graph_update_required);
-
-        // Update graph executable
-        update_cuda_graph_executable(cuda_graph);
-
->>>>>>> parent of 7eea329f6 (Revert "Akieslinger/reduce cuda graph cpu overhead #332")
         // Launch graph
-        CUDA_CHECK(cudaGraphLaunch(cuda_graph->instance, cuda_ctx->stream()));
+        CUDA_CHECK(cudaGraphLaunch(cuda_ctx->cuda_graph->instance, cuda_ctx->stream()));
 #else
     graph_evaluated_or_captured = true;
 #endif // USE_CUDA_GRAPH
@@ -3160,60 +3075,23 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
 
     ggml_cuda_set_device(cuda_ctx->device);
 
-<<<<<<< HEAD
-=======
-    // vector of pointers to CUDA cpy kernels, which are required to identify
-    // kernel parameters which need updated in the graph for each token
-    std::vector<void *> ggml_cuda_cpy_fn_ptrs;
-
-    // Heuristic to minimize GPU idle time. Work is split over several CUDA graphs,
-    // to overlap graph building (CPU) and graph execution (GPU).
-    // The first graphs are small to minimize the time in which the CPU prepares work and the GPU is idle.
-    // After that, graph building (CPU) is done in parallel to the execution of another previously built graph (GPU).
-    int first_graph_subset = 20;
-    int second_graph_subset = 50;
-    int remaining_graph_subset = 100;
-    int remaining_nodes = (cgraph->n_nodes - first_graph_subset) - second_graph_subset;
-    int num_cuda_graphs_required = 2 + (remaining_nodes / remaining_graph_subset);
-    cuda_ctx->cuda_graphs.resize(num_cuda_graphs_required);
-    cgraph_offset offset {0,0};
-
-    for (size_t i = 0; i < cuda_ctx->cuda_graphs.size(); i++) {
-        auto & cuda_graph = cuda_ctx->cuda_graphs[i];
-
-        offset.begin = offset.end;
-        if (i == 0) offset.end += first_graph_subset;
-        if (i == 1) offset.end += second_graph_subset;
-        if (i >= 2) offset.end += remaining_graph_subset;
-
-        // last graph does the rest
-        if ((i + 1) == cuda_ctx->cuda_graphs.size()) offset.end = cgraph->n_nodes;
-
-        // special case for graphs smaller than the ramp-up heuristic
-        if (cgraph->n_nodes <= first_graph_subset + second_graph_subset) {
-            offset.end = cgraph->n_nodes;
-            if (i > 0) break;
-        }
-
->>>>>>> parent of 7eea329f6 (Revert "Akieslinger/reduce cuda graph cpu overhead #332")
 #ifdef USE_CUDA_GRAPH
-        static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);
+    static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);
 
-        // Objects required for CUDA Graph
-        if (cuda_graph == nullptr) {
-            cuda_graph = std::make_unique<ggml_cuda_graph>();
-        }
+    // Objects required for CUDA Graph
+    if (cuda_ctx->cuda_graph == nullptr) {
+        cuda_ctx->cuda_graph.reset(new ggml_cuda_graph());
+    }
 
-        bool use_cuda_graph = true;
-        bool cuda_graph_update_required = false;
+    bool use_cuda_graph = true;
+    bool cuda_graph_update_required = false;
 
-        if (cuda_graph->graph == nullptr) {
-            if (ggml_cuda_info().devices[cuda_ctx->device].cc < GGML_CUDA_CC_AMPERE) {
-                cuda_graph->disable_due_to_gpu_arch = true;
+    if (cuda_ctx->cuda_graph->graph == nullptr) {
+        if (ggml_cuda_info().devices[cuda_ctx->device].cc < GGML_CUDA_CC_AMPERE) {
+            cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
 #ifndef NDEBUG
-                GGML_LOG_DEBUG("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
+            GGML_LOG_DEBUG("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
 #endif
-<<<<<<< HEAD
         }
     }
 
@@ -3237,43 +3115,16 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
             cuda_ctx->cuda_graph->number_consecutive_updates++;
         } else {
             cuda_ctx->cuda_graph->number_consecutive_updates = 0;
-=======
-        }
->>>>>>> parent of 7eea329f6 (Revert "Akieslinger/reduce cuda graph cpu overhead #332")
-    }
-
-    // Disable CUDA graphs in presence of env var, old GPU, use-case which is changing too rapidly,
-    // or previous graph capture failure.
-    // Also disable for multi-gpu for now. TO DO investigate
-    if (disable_cuda_graphs_due_to_env
-        || cuda_graph->disable_due_to_gpu_arch
-        || cuda_graph->disable_due_to_too_many_updates
-        || cuda_graph->disable_due_to_failed_graph_capture) {
-        use_cuda_graph = false;
         }
 
-        if (use_cuda_graph) {
-            cuda_graph_update_required = is_cuda_graph_update_required(cuda_graph, cgraph, offset);
-
-            use_cuda_graph = check_node_graph_compatibility_and_refresh_copy_ops(cuda_graph, cgraph,
-                ggml_cuda_cpy_fn_ptrs, use_cuda_graph, offset);
-
-            // Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
-            if (use_cuda_graph && cuda_graph_update_required) {
-                cuda_graph->number_consecutive_updates++;
-            } else {
-                cuda_graph->number_consecutive_updates = 0;
-            }
-
-            if (cuda_graph->number_consecutive_updates >= 4) {
-                cuda_graph->disable_due_to_too_many_updates = true;
+        if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
+            cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
 #ifndef NDEBUG
-                GGML_LOG_DEBUG("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
+            GGML_LOG_DEBUG("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
 #endif
-            }
         }
+    }
 
-<<<<<<< HEAD
     if (use_cuda_graph && cuda_graph_update_required) {
         // Start CUDA graph capture
         {
@@ -3283,32 +3134,20 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
 
             CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));
         }
-=======
-        if (use_cuda_graph && cuda_graph_update_required) { // Start CUDA graph capture
-            CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));
-        }
->>>>>>> parent of 7eea329f6 (Revert "Akieslinger/reduce cuda graph cpu overhead #332")
 
     if (!use_cuda_graph) {
         cuda_ctx->cuda_graph->use_cpy_indirection = false;
     }
 
 #else
-        bool use_cuda_graph = false;
-        bool cuda_graph_update_required = false;
+    bool use_cuda_graph = false;
+    bool cuda_graph_update_required = false;
 #endif // USE_CUDA_GRAPH
 
-<<<<<<< HEAD
     bool graph_evaluated_or_captured = false;
 
     evaluate_and_capture_cuda_graph(cuda_ctx, cgraph, graph_evaluated_or_captured, use_cuda_graph, cuda_graph_update_required);
-=======
-        bool graph_evaluated_or_captured = false;
->>>>>>> parent of 7eea329f6 (Revert "Akieslinger/reduce cuda graph cpu overhead #332")
 
-        evaluate_and_capture_cuda_graph(cuda_ctx, cuda_graph, cgraph, ggml_cuda_cpy_fn_ptrs,
-            graph_evaluated_or_captured, use_cuda_graph, cuda_graph_update_required, offset);
-    }
     return GGML_STATUS_SUCCESS;
 }
 
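For reference, the retained code path follows the standard CUDA graph lifecycle: capture work issued on a stream into a cudaGraph_t, instantiate it into a cudaGraphExec_t once, then on later tokens attempt a cheap in-place cudaGraphExecUpdate and fall back to destroy-and-reinstantiate when the update fails. Below is a minimal standalone sketch of that pattern, not part of the patch: the scale kernel, sizes, and CUDA_TRY macro are illustrative, and the CUDA 12 cudaGraphExecUpdate signature is assumed.

```cpp
#include <cuda_runtime.h>
#include <cstdio>

#define CUDA_TRY(call)                                                         \
    do {                                                                       \
        cudaError_t err_ = (call);                                             \
        if (err_ != cudaSuccess) {                                             \
            fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err_));     \
            return 1;                                                          \
        }                                                                      \
    } while (0)

__global__ void scale(float * x, float s, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        x[i] *= s;
    }
}

int main() {
    const int n = 1 << 20;
    float * data = nullptr;
    CUDA_TRY(cudaMalloc(&data, n * sizeof(float)));

    cudaStream_t stream;
    CUDA_TRY(cudaStreamCreate(&stream));

    cudaGraphExec_t instance = nullptr;

    for (int iter = 0; iter < 3; ++iter) {
        // Capture the kernel launch issued on the stream into a graph.
        cudaGraph_t graph = nullptr;
        CUDA_TRY(cudaStreamBeginCapture(stream, cudaStreamCaptureModeRelaxed));
        scale<<<(n + 255) / 256, 256, 0, stream>>>(data, 1.001f, n);
        CUDA_TRY(cudaStreamEndCapture(stream, &graph));

        if (instance == nullptr) {
            // First iteration: create the executable graph.
            CUDA_TRY(cudaGraphInstantiate(&instance, graph, NULL, NULL, 0));
        } else {
            // Later iterations: patch the executable in place; re-instantiate
            // only if the topology changed (mirrors update_cuda_graph_executable).
            cudaGraphExecUpdateResultInfo result_info;
            cudaError_t stat = cudaGraphExecUpdate(instance, graph, &result_info);
            if (stat == cudaErrorGraphExecUpdateFailure) {
                (void) cudaGetLastError();
                CUDA_TRY(cudaGraphExecDestroy(instance));
                CUDA_TRY(cudaGraphInstantiate(&instance, graph, NULL, NULL, 0));
            }
        }

        // Replay the recorded work with a single launch.
        CUDA_TRY(cudaGraphLaunch(instance, stream));
        CUDA_TRY(cudaStreamSynchronize(stream));
        CUDA_TRY(cudaGraphDestroy(graph));
    }

    CUDA_TRY(cudaGraphExecDestroy(instance));
    CUDA_TRY(cudaFree(data));
    return 0;
}
```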
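The deleted hunks above implemented a ramp-up heuristic that split cgraph->nodes into several consecutive CUDA graphs (20 nodes, then 50, then 100-node chunks, with the last graph taking the remainder) so graph building on the CPU could overlap graph execution on the GPU. For clarity, here is a condensed, hypothetical restatement of that offset computation as a standalone function; split_graph and main are illustrative names, not from the source.

```cpp
#include <cstdio>
#include <utility>
#include <vector>

// Returns the [begin, end) node ranges the removed heuristic assigned to each
// CUDA graph: small first graphs to get the GPU busy quickly, larger ones after.
static std::vector<std::pair<int, int>> split_graph(int n_nodes) {
    const int first_graph_subset     = 20;
    const int second_graph_subset    = 50;
    const int remaining_graph_subset = 100;

    std::vector<std::pair<int, int>> ranges;

    // special case for graphs smaller than the ramp-up heuristic
    if (n_nodes <= first_graph_subset + second_graph_subset) {
        ranges.push_back({0, n_nodes});
        return ranges;
    }

    int remaining_nodes = (n_nodes - first_graph_subset) - second_graph_subset;
    int num_graphs      = 2 + (remaining_nodes / remaining_graph_subset);

    int begin = 0;
    for (int i = 0; i < num_graphs; i++) {
        int end = begin;
        if (i == 0) end += first_graph_subset;
        if (i == 1) end += second_graph_subset;
        if (i >= 2) end += remaining_graph_subset;

        // last graph does the rest
        if (i + 1 == num_graphs) {
            end = n_nodes;
        }
        ranges.push_back({begin, end});
        begin = end;
    }
    return ranges;
}

int main() {
    // For 330 nodes this prints [0, 20) [20, 70) [70, 170) [170, 330).
    for (auto [b, e] : split_graph(330)) {
        printf("[%d, %d)\n", b, e);
    }
    return 0;
}
```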