Commit 34b473c

Implement and use cuda graph plans.

1 parent c7be9fe · commit 34b473c
8 files changed (+333, −197 lines)

ggml/include/ggml-backend.h

Lines changed: 1 addition & 0 deletions
@@ -95,6 +95,7 @@ extern "C" {
 
     GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph);
     GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+    GGML_API void ggml_backend_graph_plan_update(ggml_backend_t backend, ggml_backend_graph_plan_t plan, const struct ggml_cgraph * cgraph);
 
     GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
     GGML_API enum ggml_status ggml_backend_graph_compute      (ggml_backend_t backend, struct ggml_cgraph * cgraph);
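Together with the existing entry points, this completes the plan lifecycle: create a plan for a graph once, replay it cheaply, refresh it when the graph changes, and free it. A minimal usage sketch under the new API (the backend/cgraph setup is assumed; the supports_* helpers are the ones added in ggml-backend.cpp below):

    // minimal sketch, assuming `backend` and `cgraph` were created elsewhere
    if (ggml_backend_supports_graph_plan(backend)) {
        ggml_backend_graph_plan_t plan = ggml_backend_graph_plan_create(backend, cgraph);
        GGML_ASSERT(ggml_backend_graph_plan_compute(backend, plan) == GGML_STATUS_SUCCESS);

        // ... cgraph changes (e.g. new tensor addresses) ...
        if (ggml_backend_supports_graph_plan_update(backend)) {
            ggml_backend_graph_plan_update(backend, plan, cgraph); // refresh in place
        } else {
            ggml_backend_graph_plan_free(backend, plan);           // fall back: rebuild
            plan = ggml_backend_graph_plan_create(backend, cgraph);
        }
        GGML_ASSERT(ggml_backend_graph_plan_compute(backend, plan) == GGML_STATUS_SUCCESS);
        ggml_backend_graph_plan_free(backend, plan);
    } else {
        ggml_backend_graph_compute(backend, cgraph); // no plan support: compute directly
    }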

ggml/src/ggml-backend.cpp

Lines changed: 111 additions & 1 deletion
@@ -327,6 +327,18 @@ void ggml_backend_synchronize(ggml_backend_t backend) {
     backend->iface.synchronize(backend);
 }
 
+bool ggml_backend_supports_graph_plan(ggml_backend_t backend) {
+    GGML_ASSERT(backend);
+
+    return (bool) backend->iface.graph_plan_create;
+}
+
+bool ggml_backend_supports_graph_plan_update(ggml_backend_t backend) {
+    GGML_ASSERT(backend);
+
+    return (bool) backend->iface.graph_plan_update;
+}
+
 ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     GGML_ASSERT(backend);
     GGML_ASSERT(backend->iface.graph_plan_create != NULL);
@@ -341,6 +353,13 @@ void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     backend->iface.graph_plan_free(backend, plan);
 }
 
+void ggml_backend_graph_plan_update(ggml_backend_t backend, ggml_backend_graph_plan_t plan, const struct ggml_cgraph * cgraph) {
+    GGML_ASSERT(backend);
+    GGML_ASSERT(backend->iface.graph_plan_update != NULL);
+
+    backend->iface.graph_plan_update(backend, plan, cgraph);
+}
+
 enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     GGML_ASSERT(backend);
     GGML_ASSERT(backend->iface.graph_plan_compute != NULL);
@@ -675,6 +694,11 @@ struct ggml_backend_sched_split {
     struct ggml_cgraph graph;
 };
 
+struct ggml_backend_sched_plan {
+    int backend_id;
+    ggml_backend_graph_plan_t plan;
+};
+
 struct ggml_backend_sched {
     bool is_reset; // true if the scheduler has been reset since the last graph split
     bool is_alloc;
@@ -704,6 +728,12 @@ struct ggml_backend_sched {
     int n_splits;
     int splits_capacity;
 
+    // graph plans
+    struct ggml_backend_sched_plan * plans;
+    int n_plans;
+    int plans_capacity;
+    bool plan_dirty;
+
     // pipeline parallelism support
     int n_copies;
     int cur_copy;
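The plans array is kept strictly parallel to the splits array: once ggml_backend_sched_update_plans() (added below) has run, n_plans equals n_splits and plans[i] refers to the same backend as splits[i], with plan left as NULL when that backend has no plan support. A purely illustrative debug helper (not part of this commit) states the invariant:

    // hypothetical invariant check, for illustration only
    static void ggml_backend_sched_assert_plans(ggml_backend_sched_t sched) {
        GGML_ASSERT(sched->n_plans == sched->n_splits);
        for (int i = 0; i < sched->n_splits; i++) {
            // plans[i] mirrors splits[i]; plan == NULL means the split's
            // backend does not implement graph plans
            GGML_ASSERT(sched->plans[i].backend_id == sched->splits[i].backend_id);
        }
    }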
@@ -908,6 +938,16 @@ static void ggml_backend_sched_set_if_supported(ggml_backend_sched_t sched, struct ggml_tensor * node, int cur_backend_id, int * node_backend_id) {
     }
 }
 
+static void ggml_backend_sched_free_plans(ggml_backend_sched_t sched) {
+    for (int i = 0; i < sched->n_plans; i++) {
+        ggml_backend_t backend = sched->backends[sched->plans[i].backend_id];
+        if (ggml_backend_supports_graph_plan(backend)) {
+            ggml_backend_graph_plan_free(backend, sched->plans[i].plan);
+        }
+    }
+    sched->n_plans = 0;
+}
+
 // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
 void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
     // reset splits
@@ -1372,6 +1412,7 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
         assert(graph_copy->size > graph_copy->n_leafs);
         graph_copy->leafs[graph_copy->n_leafs++] = leaf;
     }
+    sched->plan_dirty = true;
 }
 
 static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
@@ -1413,6 +1454,62 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
     return true;
 }
 
+static void ggml_backend_sched_update_plans(ggml_backend_sched_t sched) {
+    // create graph plans
+    if (sched->plan_dirty) {
+        bool create_new_plans;
+        if (sched->n_plans == sched->n_splits) {
+            create_new_plans = false;
+            for (int i = 0; i < sched->n_splits; i++) {
+                if (sched->splits[i].backend_id != sched->plans[i].backend_id) {
+                    create_new_plans = true;
+                    break;
+                }
+            }
+        } else {
+            create_new_plans = true;
+        }
+        if (create_new_plans) {
+            // free previous and recreate new plans
+            ggml_backend_sched_free_plans(sched);
+            if (sched->plans_capacity < sched->n_splits) {
+                while (sched->plans_capacity < sched->n_splits) {
+                    sched->plans_capacity *= 2;
+                }
+                sched->plans = (ggml_backend_sched_plan *) realloc(
+                    sched->plans, sched->plans_capacity * sizeof(struct ggml_backend_sched_plan));
+                GGML_ASSERT(sched->plans);
+            }
+            sched->n_plans = sched->n_splits;
+            for (int i = 0; i < sched->n_splits; i++) {
+                ggml_backend_t backend = sched->backends[sched->splits[i].backend_id];
+                sched->plans[i].backend_id = sched->splits[i].backend_id;
+                if (ggml_backend_supports_graph_plan(backend)) {
+                    sched->plans[i].plan = ggml_backend_graph_plan_create(backend, &sched->splits[i].graph);
+                } else {
+                    sched->plans[i].plan = nullptr;
+                }
+            }
+        } else {
+            // update existing plans
+            for (int i = 0; i < sched->n_splits; i++) {
+                ggml_backend_t backend = sched->backends[sched->splits[i].backend_id];
+                if (ggml_backend_supports_graph_plan(backend)) {
+                    if (ggml_backend_supports_graph_plan_update(backend)) {
+                        ggml_backend_graph_plan_update(backend, sched->plans[i].plan, &sched->splits[i].graph);
+                    } else {
+                        ggml_backend_graph_plan_free(backend, sched->plans[i].plan);
+                        sched->plans[i].plan = ggml_backend_graph_plan_create(backend, &sched->splits[i].graph);
+                    }
+                } else {
+                    sched->plans[i].plan = nullptr;
+                }
+            }
+        }
+        sched->plan_dirty = false;
+    }
+}
+
 static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
     GGML_ASSERT(sched);
     struct ggml_backend_sched_split * splits = sched->splits;
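The update-vs-recreate split above is what makes plans pay off for the CUDA backend: CUDA distinguishes a cheap in-place update of an instantiated executable graph from a full re-instantiation. The ggml-cuda.cu side of the commit is not part of this excerpt, so the following is only a sketch of the general pattern such a graph_plan_update can follow; the helper name, the capture placeholder, and the CUDA 12 signatures (cudaGraphExecUpdate taking a cudaGraphExecUpdateResultInfo, three-argument cudaGraphInstantiate) are assumptions:

    // hypothetical sketch of a CUDA plan refresh; not the commit's actual code
    static void cuda_graph_plan_refresh(ggml_cuda_graph * cg, cudaStream_t stream) {
        // re-record the split's kernel launches into a fresh cudaGraph_t
        cudaGraph_t new_graph;
        CUDA_CHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeRelaxed));
        // ... replay the split's kernel launches on `stream` here ...
        CUDA_CHECK(cudaStreamEndCapture(stream, &new_graph));

        // try the cheap in-place update of the instantiated executable first
        cudaGraphExecUpdateResultInfo info;
        if (cudaGraphExecUpdate(cg->instance, new_graph, &info) != cudaSuccess) {
            (void) cudaGetLastError(); // clear the error before retrying
            // topology changed: fall back to full re-instantiation
            CUDA_CHECK(cudaGraphExecDestroy(cg->instance));
            CUDA_CHECK(cudaGraphInstantiate(&cg->instance, new_graph, 0));
        }
        if (cg->graph) {
            CUDA_CHECK(cudaGraphDestroy(cg->graph));
        }
        cg->graph = new_graph;
    }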
@@ -1421,6 +1518,8 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
     std::vector<int32_t> ids;
     std::vector<ggml_bitset_t> used_ids;
 
+    ggml_backend_sched_update_plans(sched);
+
     for (int split_id = 0; split_id < sched->n_splits; split_id++) {
         struct ggml_backend_sched_split * split = &splits[split_id];
         int split_backend_id = split->backend_id;
@@ -1550,7 +1649,12 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
         }
 
         if (!sched->callback_eval) {
-            enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
+            enum ggml_status ec;
+            if (ggml_backend_supports_graph_plan(split_backend) && sched->plans[split_id].plan) {
+                ec = ggml_backend_graph_plan_compute(split_backend, sched->plans[split_id].plan);
+            } else {
+                ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
+            }
             if (ec != GGML_STATUS_SUCCESS) {
                 return ec;
             }
@@ -1637,6 +1741,10 @@ ggml_backend_sched_t ggml_backend_sched_new(
     sched->splits = (ggml_backend_sched_split *) calloc(initial_splits_capacity, sizeof(sched->splits[0]));
     sched->splits_capacity = initial_splits_capacity;
 
+    const int initial_plans_capacity = 16;
+    sched->plans = (ggml_backend_sched_plan *) calloc(initial_plans_capacity, sizeof(sched->plans[0]));
+    sched->plans_capacity = initial_plans_capacity;
+
     for (int b = 0; b < n_backends; b++) {
         sched->backends[b] = backends[b];
         sched->bufts[b] = bufts ? bufts[b] : ggml_backend_get_default_buffer_type(backends[b]);
@@ -1670,6 +1778,8 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
     ggml_free(sched->ctx);
     ggml_hash_set_free(&sched->hash_set);
     free(sched->splits);
+    ggml_backend_sched_free_plans(sched);
+    free(sched->plans);
     free(sched->hv_tensor_backend_ids);
     free(sched->hv_tensor_copies);
     free(sched->node_backend_ids);

ggml/src/ggml-cuda/common.cuh

Lines changed: 5 additions & 5 deletions
@@ -936,13 +936,10 @@ struct ggml_cuda_graph {
     }
     cudaGraph_t graph = nullptr;
     cudaGraphExec_t instance = nullptr;
+    const ggml_cgraph * cgraph;
     size_t num_nodes = 0;
     std::vector<cudaGraphNode_t> nodes;
    std::vector<cudaKernelNodeParams> params;
-    bool disable_due_to_gpu_arch = false;
-    bool disable_due_to_too_many_updates = false;
-    bool disable_due_to_failed_graph_capture = false;
-    int number_consecutive_updates = 0;
     std::vector<ggml_graph_node_properties> ggml_graph_properties;
     bool use_cpy_indirection = false;
     std::vector<char *> cpy_dest_ptrs;
@@ -962,7 +959,10 @@ struct ggml_backend_cuda_context {
     cudaStream_t streams[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { { nullptr } };
     cublasHandle_t cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
 
-    std::unique_ptr<ggml_cuda_graph> cuda_graph;
+#ifdef USE_CUDA_GRAPH
+    bool cuda_graph_initialized = false;
+    bool disable_due_to_gpu_arch = false;
+#endif
 
     explicit ggml_backend_cuda_context(int device) :
         device(device),
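Per-graph CUDA state thus moves off the context: the std::unique_ptr<ggml_cuda_graph> member is gone, the too-many-updates and failed-capture heuristics are dropped along with their counter, and ggml_cuda_graph now remembers the ggml_cgraph it was captured from. Only device-wide flags stay on the context. Presumably (the ggml-cuda.cu changes are not shown in this excerpt) each backend graph plan now owns its own ggml_cuda_graph, roughly:

    // hypothetical shape of the CUDA backend's plan object; not shown in this diff
    struct ggml_backend_cuda_graph_plan {
        ggml_cuda_graph cuda_graph; // per-plan capture state; cuda_graph.cgraph
                                    // records the ggml_cgraph the plan was built from
    };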

ggml/src/ggml-cuda/cpy.cu

Lines changed: 8 additions & 8 deletions
@@ -277,7 +277,7 @@ static void ggml_cpy_f32_iq4_nl_cuda(
         (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
 }
 
-void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1, bool disable_indirection_for_this_node) {
+void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, ggml_cuda_graph * cuda_graph, const ggml_tensor * src0, ggml_tensor * src1, bool disable_indirection_for_this_node) {
     const int64_t ne = ggml_nelements(src0);
     GGML_ASSERT(ne == ggml_nelements(src1));
 
@@ -314,9 +314,9 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1, bool disable_indirection_for_this_node) {
     char ** dest_ptrs_d = nullptr;
     int graph_cpynode_index = -1;
 #if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) || defined(GGML_MUSA_GRAPHS)
-    if(ctx.cuda_graph->use_cpy_indirection && !disable_indirection_for_this_node) {
-        dest_ptrs_d = ctx.cuda_graph->dest_ptrs_d;
-        graph_cpynode_index = ctx.cuda_graph->graph_cpynode_index;
+    if (cuda_graph && cuda_graph->use_cpy_indirection && !disable_indirection_for_this_node) {
+        dest_ptrs_d = cuda_graph->dest_ptrs_d;
+        graph_cpynode_index = cuda_graph->graph_cpynode_index;
     }
 #else
     GGML_UNUSED(disable_indirection_for_this_node);
@@ -387,19 +387,19 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1, bool disable_indirection_for_this_node) {
             ggml_type_name(src0->type), ggml_type_name(src1->type));
     }
 #if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) || defined(GGML_MUSA_GRAPHS)
-    if(ctx.cuda_graph->use_cpy_indirection && !disable_indirection_for_this_node) {
-        ctx.cuda_graph->graph_cpynode_index = graph_cpynode_index;
+    if (cuda_graph && cuda_graph->use_cpy_indirection && !disable_indirection_for_this_node) {
+        cuda_graph->graph_cpynode_index = graph_cpynode_index;
     }
 #else
     GGML_UNUSED(disable_indirection_for_this_node);
 #endif
 
 }
 
-void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_cuda_graph * cuda_graph, ggml_tensor * dst) {
     const ggml_tensor * src0 = dst->src[0];
     bool disable_indirection = true;
-    ggml_cuda_cpy(ctx, src0, dst, disable_indirection);
+    ggml_cuda_cpy(ctx, cuda_graph, src0, dst, disable_indirection);
 }
 
 void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
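With the ggml_cuda_graph threaded through as an argument instead of reached via ctx.cuda_graph, a null pointer now cleanly selects the eager path, and the same copy kernels serve both modes. A hedged call-site sketch (`plan` is the hypothetical plan object sketched earlier; real call sites live in ggml-cuda.cu, outside this excerpt):

    ggml_cuda_cpy(ctx, nullptr, src0, src1);           // eager: no graph capture, no indirection
    ggml_cuda_cpy(ctx, &plan->cuda_graph, src0, src1); // captured: indirection bookkeeping enabled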

ggml/src/ggml-cuda/cpy.cuh

Lines changed: 2 additions & 2 deletions
@@ -2,9 +2,9 @@
 
 #define CUDA_CPY_BLOCK_SIZE 64
 
-void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1, bool disable_indirection = false);
+void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, ggml_cuda_graph * cuda_graph, const ggml_tensor * src0, ggml_tensor * src1, bool disable_indirection = false);
 
-void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_cuda_graph * cuda_graph, ggml_tensor * dst);
 
 void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1);
