@@ -243,10 +243,10 @@ static ggml_cuda_device_info ggml_cuda_init() {
 
         info.default_tensor_split[id] = total_vram;
         total_vram += prop.totalGlobalMem;
-
-        info.devices[id].nsm       = prop.multiProcessorCount;
-        info.devices[id].smpb      = prop.sharedMemPerBlock;
-        info.devices[id].warp_size = prop.warpSize;
+        info.devices[id].integrated = prop.integrated;
+        info.devices[id].nsm        = prop.multiProcessorCount;
+        info.devices[id].smpb       = prop.sharedMemPerBlock;
+        info.devices[id].warp_size  = prop.warpSize;
 #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
         info.devices[id].smpbo = prop.sharedMemPerBlock;
 
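For context on the new field: `integrated` is a standard member of `cudaDeviceProp` (and of `hipDeviceProp_t` on the HIP path), non-zero when the GPU shares physical memory with the host, as on Jetson boards or AMD APUs. A minimal standalone query in the spirit of what `ggml_cuda_init()` records per device (error handling simplified here; ggml uses its `CUDA_CHECK` macro instead):

    #include <cstdio>
    #include <cuda_runtime.h>

    int main() {
        int count = 0;
        if (cudaGetDeviceCount(&count) != cudaSuccess) {
            return 1;
        }
        for (int id = 0; id < count; ++id) {
            cudaDeviceProp prop;
            if (cudaGetDeviceProperties(&prop, id) != cudaSuccess) {
                continue;
            }
            // prop.integrated is non-zero when the GPU shares physical
            // memory with the host (e.g. Jetson, or APUs under HIP).
            printf("device %d (%s): integrated = %d\n", id, prop.name, prop.integrated);
        }
        return 0;
    }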
@@ -1065,6 +1065,10 @@ static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
     GGML_UNUSED(buft);
 }
 
+static bool ggml_backend_buft_is_cuda_host(ggml_backend_buffer_type_t buft) {
+    return buft->iface.get_name == ggml_backend_cuda_host_buffer_type_name;
+}
+
 static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     CUDA_CHECK(cudaFreeHost(buffer->context));
 }
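The new helper follows the same identity convention as the existing `ggml_backend_buft_is_cuda` and `ggml_backend_buft_is_cuda_split` checks: a buffer type is recognized by comparing its `iface.get_name` function pointer rather than the name string. A self-contained sketch of the pattern; the `buft_iface` and `host_buffer_name` names below are illustrative stand-ins, not ggml API:

    #include <cstdio>

    // Hypothetical stand-in for ggml's buffer-type interface.
    struct buft_iface {
        const char * (*get_name)(void);
    };

    static const char * host_buffer_name(void)   { return "ExampleHost"; }
    static const char * device_buffer_name(void) { return "ExampleDevice"; }

    static bool is_host_buffer(const buft_iface & iface) {
        // Each buffer type installs a distinct get_name function, so pointer
        // equality is an exact, O(1) identity test (no strcmp needed).
        return iface.get_name == host_buffer_name;
    }

    int main() {
        buft_iface host = { host_buffer_name };
        buft_iface dev  = { device_buffer_name };
        printf("%d %d\n", is_host_buffer(host), is_host_buffer(dev)); // 1 0
        return 0;
    }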
@@ -2641,6 +2645,8 @@ static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) {
 
 static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
     bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) {
+    // whether this device is an integrated GPU (shares physical memory with the host)
+    const bool integrated = ggml_cuda_info().devices[cuda_ctx->device].integrated;
 
     while (!graph_evaluated_or_captured) {
         // Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph.
@@ -2659,7 +2665,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
             if (node->src[j] != nullptr) {
                 assert(node->src[j]->buffer);
                 assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) ||
-                       ggml_backend_buft_is_cuda_split(node->src[j]->buffer->buft));
+                       ggml_backend_buft_is_cuda_split(node->src[j]->buffer->buft) || (integrated && ggml_backend_buft_is_cuda_host(node->src[j]->buffer->buft)));
             }
         }
 #endif
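The relaxed assert relies on the CUDA host buffer type being backed by pinned memory (allocated with `cudaMallocHost` and freed with `cudaFreeHost`, as in the free function above), which an integrated GPU can address directly because device and host share physical memory. A hedged sketch of that zero-copy property, assuming a 64-bit build with unified virtual addressing so a mapped host pointer can be passed straight to a kernel:

    #include <cstdio>
    #include <cuda_runtime.h>

    __global__ void scale(float * x, int n) {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) {
            x[i] *= 2.0f;
        }
    }

    int main() {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, 0);
        const int n = 256;
        float * x = nullptr;
        // Pinned, device-mappable host allocation; with UVA the same pointer
        // is valid on both sides, so no cudaMemcpy is needed on an
        // integrated GPU (on a discrete GPU the reads go over the bus).
        cudaHostAlloc((void **) &x, n * sizeof(float), cudaHostAllocMapped);
        for (int i = 0; i < n; ++i) {
            x[i] = 1.0f;
        }
        scale<<<(n + 255) / 256, 256>>>(x, n);
        cudaDeviceSynchronize();
        printf("integrated = %d, x[0] = %.1f\n", prop.integrated, x[0]); // expect 2.0
        cudaFreeHost(x);
        return 0;
    }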
@@ -3266,7 +3272,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
 }
 
 static bool ggml_backend_cuda_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
-    return (ggml_backend_buft_is_cuda(buft) || ggml_backend_buft_is_cuda_split(buft)) && buft->device == dev;
+    ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) dev->context;
+    const bool integrated = ggml_cuda_info().devices[dev_ctx->device].integrated;
+    return (((ggml_backend_buft_is_cuda(buft) || ggml_backend_buft_is_cuda_split(buft)) && buft->device == dev) || (integrated && ggml_backend_buft_is_cuda_host(buft)));
 }
 
 static int64_t get_op_batch_size(const ggml_tensor * op) {
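Net effect of the new predicate: discrete GPUs behave as before (only dedicated and split buffers owned by this device are supported), while an integrated device additionally reports pinned host buffers as usable, so the scheduler can keep tensors in shared memory without a device copy. A small truth-table sketch with the ggml predicates abstracted to booleans (`supports_buft` below is a hypothetical distillation, not the real function):

    #include <cstdio>

    // Hypothetical distillation of ggml_backend_cuda_device_supports_buft.
    static bool supports_buft(bool is_cuda, bool is_split, bool is_host,
                              bool same_device, bool integrated) {
        // Dedicated/split buffers must belong to this device; pinned host
        // buffers are additionally accepted on integrated devices.
        return ((is_cuda || is_split) && same_device) || (integrated && is_host);
    }

    int main() {
        // A discrete GPU does not report host buffers as supported...
        printf("%d\n", supports_buft(false, false, true, false, false)); // 0
        // ...while an integrated GPU accepts them, regardless of buft->device.
        printf("%d\n", supports_buft(false, false, true, false, true));  // 1
        return 0;
    }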