From 0c908590fb49f69a486564f3d188b2dfd0f2b3b2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?=
Date: Sat, 17 May 2025 12:52:07 +0200
Subject: [PATCH 01/65] WIP

---
 ggml/include/ggml-backend.h     | 1 +
 ggml/src/ggml-backend-impl.h    | 1 +
 ggml/src/ggml-backend.cpp       | 6 ++++++
 ggml/src/ggml-cuda/ggml-cuda.cu | 5 +++++
 4 files changed, 13 insertions(+)

diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index 778927f68217a..77d8cf364ed11 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -41,6 +41,7 @@ extern "C" {
     GGML_API size_t             ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
     GGML_API bool               ggml_backend_buft_is_host       (ggml_backend_buffer_type_t buft);
     GGML_API ggml_backend_dev_t ggml_backend_buft_get_device    (ggml_backend_buffer_type_t buft);
+    GGML_API bool               ggml_backend_buft_is_split      (ggml_backend_buffer_type_t buft);
 
     //
     // Backend buffer

diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h
index c36c12d6579ac..db8b4ba64e86b 100644
--- a/ggml/src/ggml-backend-impl.h
+++ b/ggml/src/ggml-backend-impl.h
@@ -26,6 +26,7 @@ extern "C" {
         size_t (*get_alloc_size)(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
         // (optional) check if tensor data is in host memory and uses standard ggml tensor layout (defaults to false)
         bool   (*is_host)       (ggml_backend_buffer_type_t buft);
+        bool   (*is_split)      (ggml_backend_buffer_type_t buft);
     };
 
     struct ggml_backend_buffer_type {

diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index b30b4cb386f9f..f9ddc07453400 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -77,6 +77,10 @@ ggml_backend_dev_t ggml_backend_buft_get_device(ggml_backend_buffer_type_t buft)
     return buft->device;
 }
 
+bool ggml_backend_buft_is_split(ggml_backend_buffer_type_t buft) {
+    return buft->iface.is_split && buft->iface.is_split(buft);
+}
+
 // backend buffer
 
 ggml_backend_buffer_t ggml_backend_buffer_init(
@@ -1971,6 +1975,7 @@ ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
             /* .get_max_size   = */ NULL, // defaults to SIZE_MAX
             /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
             /* .is_host        = */ ggml_backend_cpu_buffer_type_is_host,
+            /* .is_split       = */ NULL,
         },
         /* .device  = */ NULL, // FIXME ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
         /* .context = */ NULL,
@@ -1994,6 +1999,7 @@ static ggml_backend_buffer_type_t ggml_backend_cpu_buffer_from_ptr_type(void) {
             /* .get_max_size   = */ NULL, // defaults to SIZE_MAX
             /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
             /* .is_host        = */ ggml_backend_cpu_buffer_type_is_host,
+            /* .is_split       = */ NULL,
         },
         /* .device  = */ NULL, // FIXME ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
         /* .context = */ NULL,

diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 02dc8c12dbd8c..fc27203d56a9c 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -689,6 +689,8 @@ static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_t
     GGML_UNUSED(buft);
 }
 
+static bool ggml_backend_buft_is_cuda_split(ggml_backend_buffer_type_t buft);
+
 static const ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
     /* .get_name       = */ ggml_backend_cuda_buffer_type_get_name,
     /* .alloc_buffer   = */ ggml_backend_cuda_buffer_type_alloc_buffer,
@@ -696,6 +698,7 @@ static const ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface
     /* .get_max_size   = */ NULL, // defaults to SIZE_MAX
     /* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
     /* .is_host        = */ NULL,
+    /* .is_split       = */ ggml_backend_buft_is_cuda_split,
 };
 
 ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
@@ -1013,6 +1016,7 @@ static const ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_inte
     /* .get_max_size   = */ NULL, // defaults to SIZE_MAX
     /* .get_alloc_size = */ ggml_backend_cuda_split_buffer_type_get_alloc_size,
     /* .is_host        = */ ggml_backend_cuda_split_buffer_type_is_host,
+    /* .is_split       = */ ggml_backend_buft_is_cuda_split,
 };
 
 ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split) {
@@ -1111,6 +1115,7 @@ ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
             /* .get_max_size   = */ NULL, // defaults to SIZE_MAX
             /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
             /* .is_host        = */ ggml_backend_cpu_buffer_type()->iface.is_host,
+            /* .is_split       = */ NULL,
         },
         /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), 0),
         /* .context = */ nullptr,
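Patch 01 introduces `is_split` as an optional hook in `ggml_backend_buffer_type_i`, with a public wrapper that treats a missing hook as false. A minimal usage sketch of the new API (the tensor name `w` is illustrative, not part of the patch):

    // query whether a tensor's buffer type shards its data across devices;
    // buffer types that do not implement the hook default to false
    ggml_backend_buffer_type_t buft = ggml_backend_buffer_get_type(w->buffer);
    if (ggml_backend_buft_is_split(buft)) {
        // per-device handling goes here
    }

In the CUDA backend both the regular and the split buffer type point the hook at `ggml_backend_buft_is_cuda_split`, which is only forward-declared here; the same predicate is expected to report true only for the split buffer type.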
From 838f5779c4a3deb825a09ad4ee66a3c7c1cc6124 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?=
Date: Sat, 17 May 2025 13:34:14 +0200
Subject: [PATCH 02/65] WIP

---
 ggml/src/ggml-backend.cpp       | 24 +++++++++++++++++++++++-
 ggml/src/ggml-cuda/ggml-cuda.cu |  6 +++---
 2 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index f9ddc07453400..ab5b288296f29 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -1401,7 +1401,29 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
             }
         }
 
-        if (!sched->callback_eval) {
+        constexpr bool tp = true;
+
+        if (tp) {
+            for (int j0 = 0; j0 < split->graph.n_nodes; j0++) {
+                struct ggml_tensor * dst  = split->graph.nodes[j0];
+                struct ggml_tensor * src0 = dst->src[0];
+                bool split_src0 = src0 && ggml_backend_buft_is_split(ggml_backend_buffer_get_type(src0->buffer));
+                int j1 = j0;
+
+                while (j1 < split->graph.n_nodes - 1 && !split_src0) {
+                    dst = split->graph.nodes[++j1];
+                    split_src0 = ggml_backend_buft_is_split(ggml_backend_buffer_get_type(src0->buffer));
+                }
+
+                struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);
+                enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv);
+                if (ec != GGML_STATUS_SUCCESS) {
+                    return ec;
+                }
+
+                j0 = j1;
+            }
+        } else if (!sched->callback_eval) {
             enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
             if (ec != GGML_STATUS_SUCCESS) {
                 return ec;

diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index fc27203d56a9c..45d41f5645355 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -1912,7 +1912,7 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
 }
 
 static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);
+    const bool split = false && ggml_backend_buft_is_cuda_split(src0->buffer->buft);
 
     // If src0 is a temporary compute buffer it may have some padding that needs to be cleared for mul_mat_vec_q or mul_mat_q.
     // But if src0 is also a view of another tensor then this cannot be done safely because it may overwrite valid tensor data.
@@ -2129,7 +2129,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
 
 static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst) {
     // why is this here instead of mul_mat?
-    if (dst->src[0] != nullptr && ggml_backend_buft_is_cuda_split(dst->src[0]->buffer->buft)) {
+    if (false && dst->src[0] != nullptr && ggml_backend_buft_is_cuda_split(dst->src[0]->buffer->buft)) {
        ggml_cuda_set_peer_access(dst->src[1]->ne[1], ctx.device);
     }
 
@@ -2997,7 +2997,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
             struct ggml_tensor * b = op->src[1];
             // for small weight matrices the active device can end up without any rows, don't use row split in those cases
             // this avoids some edge cases (and the performance would not be good anyways)
-            if (a->buffer && ggml_backend_buft_is_cuda_split(a->buffer->buft)) {
+            if (false && a->buffer && ggml_backend_buft_is_cuda_split(a->buffer->buft)) {
                 ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) a->buffer->buft->context;
                 int64_t row_low;
                 int64_t row_high;
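With the CUDA-internal split handling disabled via `false &&`, the scheduler starts taking over: it walks each split's graph and dispatches maximal runs of consecutive nodes as one asynchronous subgraph, cutting a run at nodes whose `src0` lives in a split buffer. The grouping idiom, sketched in isolation (`node_uses_split_src0` and `dispatch_range` are hypothetical stand-ins, not functions from the patch):

    // partition [0, n_nodes) into runs; a flagged node terminates its run
    for (int j0 = 0; j0 < n_nodes;) {
        int j1 = j0;
        bool flagged = false;
        for (; !flagged && j1 < n_nodes; ++j1) {
            flagged = node_uses_split_src0(j1); // hypothetical predicate
        }
        dispatch_range(j0, j1); // hypothetical: submit nodes [j0, j1) as one subgraph
        j0 = j1;
    }

Note that the loop as written in this patch still evaluates the predicate against the `src0` captured at the start of the run; the next three commits rework it so the check is re-evaluated per node.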
From aedc3f72854ce3c79ed691210a155a8c05680424 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?=
Date: Sat, 17 May 2025 13:51:37 +0200
Subject: [PATCH 03/65] WIP

---
 ggml/src/ggml-backend.cpp              | 29 +++++++++++++++-----------
 ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp |  1 +
 2 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index ab5b288296f29..ac74f0e29c339 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -1404,21 +1404,26 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
         constexpr bool tp = true;
 
         if (tp) {
-            for (int j0 = 0; j0 < split->graph.n_nodes; j0++) {
-                struct ggml_tensor * dst  = split->graph.nodes[j0];
-                struct ggml_tensor * src0 = dst->src[0];
-                bool split_src0 = src0 && ggml_backend_buft_is_split(ggml_backend_buffer_get_type(src0->buffer));
-                int j1 = j0;
+            for (int j0 = 0; j0 < split->graph.n_nodes;) {
+                bool split_src0 = false;
 
-                while (j1 < split->graph.n_nodes - 1 && !split_src0) {
-                    dst = split->graph.nodes[++j1];
-                    split_src0 = ggml_backend_buft_is_split(ggml_backend_buffer_get_type(src0->buffer));
+                int j1 = j0;
+                for (;j1 < split->graph.n_nodes; ++j1) {
+                    struct ggml_tensor * dst  = split->graph.nodes[j1];
+                    struct ggml_tensor * src0 = dst->src[0];
+                    split_src0 = src0 && ggml_backend_buft_is_split(ggml_backend_buffer_get_type(src0->buffer));
                 }
+                GGML_ASSERT(j1 > j0);
 
-                struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);
-                enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv);
-                if (ec != GGML_STATUS_SUCCESS) {
-                    return ec;
+                if (split_src0) {
+                    GGML_ASSERT(j1 == j0 + 1);
+                    GGML_ASSERT(false);
+                } else {
+                    struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1);
+                    enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv);
+                    if (ec != GGML_STATUS_SUCCESS) {
+                        return ec;
+                    }
                 }
 
                 j0 = j1;

diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp b/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp
index 8ff6d64a4d0d1..68f864342ac45 100644
--- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp
+++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp
@@ -6422,6 +6422,7 @@ ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void) {
         /* .get_max_size   = */ nullptr, // defaults to SIZE_MAX
         /* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes
         /* .is_host        = */ nullptr,
+        /* .is_split       = */ nullptr,
     },
     /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
    /* .context = */ new ggml::cpu::aarch64::extra_buffer_type(),
From 99bb015884de306aa4155e1147d1ad779bf8f0d3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?=
Date: Sat, 17 May 2025 14:12:34 +0200
Subject: [PATCH 04/65] try fix

---
 ggml/src/ggml-backend.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index ac74f0e29c339..759383dcd8f06 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -1408,7 +1408,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
                 bool split_src0 = false;
 
                 int j1 = j0;
-                for (;j1 < split->graph.n_nodes; ++j1) {
+                for (;!split_src0 && j1 < split->graph.n_nodes; ++j1) {
                     struct ggml_tensor * dst  = split->graph.nodes[j1];
                     struct ggml_tensor * src0 = dst->src[0];
                     split_src0 = src0 && ggml_backend_buft_is_split(ggml_backend_buffer_get_type(src0->buffer));

From 06d7a8881dd83c9350ce400661bd3e0d9b4ad48b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?=
Date: Sat, 17 May 2025 14:29:41 +0200
Subject: [PATCH 05/65] WIP

---
 ggml/src/ggml-backend.cpp | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 759383dcd8f06..a9d16ca79dd24 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -1416,15 +1416,18 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
                 GGML_ASSERT(j1 > j0);
 
                 if (split_src0) {
-                    GGML_ASSERT(j1 == j0 + 1);
-                    GGML_ASSERT(false);
-                } else {
+                    j1--;
+                }
+                if (j1 > j0) {
                     struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1);
                     enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv);
                     if (ec != GGML_STATUS_SUCCESS) {
                         return ec;
                     }
                 }
+                if (split_src0) {
+                    GGML_ASSERT(false);
+                }
 
                 j0 = j1;
From 0a695558615cc0be11c5458dc4d0a62515778a27 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?=
Date: Sat, 17 May 2025 17:42:25 +0200
Subject: [PATCH 06/65] WIP

---
 ggml/src/ggml-alloc.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
index 5fd379f6a9461..30aa3fef8cfb5 100644
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@@ -758,8 +758,9 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
            }
         }
 
-        size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0;
-        size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);
+        // FIXME
+        size_t cur_size = (galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0) + 1024*1024*1024;
+        size_t new_size = (ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i])) + 1024*1024*1024;
 
         // even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
         if (new_size > cur_size || galloc->buffers[i] == NULL) {

From 7563db81e35408ee007a1358898a172c1cd6eeb0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?=
Date: Sat, 17 May 2025 20:22:35 +0200
Subject: [PATCH 07/65] WIP

---
 ggml/src/ggml-alloc.c     |  5 ++---
 ggml/src/ggml-backend.cpp | 32 +-------------------------------
 2 files changed, 3 insertions(+), 34 deletions(-)

diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
index 30aa3fef8cfb5..5fd379f6a9461 100644
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@@ -758,9 +758,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
             }
         }
 
-        // FIXME
-        size_t cur_size = (galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0) + 1024*1024*1024;
-        size_t new_size = (ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i])) + 1024*1024*1024;
+        size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0;
+        size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);
 
         // even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
         if (new_size > cur_size || galloc->buffers[i] == NULL) {

diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index a9d16ca79dd24..f9ddc07453400 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -1401,37 +1401,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
             }
         }
 
-        constexpr bool tp = true;
-
-        if (tp) {
-            for (int j0 = 0; j0 < split->graph.n_nodes;) {
-                bool split_src0 = false;
-
-                int j1 = j0;
-                for (;!split_src0 && j1 < split->graph.n_nodes; ++j1) {
-                    struct ggml_tensor * dst  = split->graph.nodes[j1];
-                    struct ggml_tensor * src0 = dst->src[0];
-                    split_src0 = src0 && ggml_backend_buft_is_split(ggml_backend_buffer_get_type(src0->buffer));
-                }
-                GGML_ASSERT(j1 > j0);
-
-                if (split_src0) {
-                    j1--;
-                }
-                if (j1 > j0) {
-                    struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1);
-                    enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv);
-                    if (ec != GGML_STATUS_SUCCESS) {
-                        return ec;
-                    }
-                }
-                if (split_src0) {
-                    GGML_ASSERT(false);
-                }
-
-                j0 = j1;
-            }
-        } else if (!sched->callback_eval) {
+        if (!sched->callback_eval) {
             enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
             if (ec != GGML_STATUS_SUCCESS) {
                 return ec;
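Patches 06 and 07 are a matched experiment and revert. `ggml_gallocr_reserve_n()` compares the currently allocated buffer size against what the measured graph needs and reallocates only on growth; since both sides are padded identically, the comparison is unchanged, but every (re)allocation then reserves an extra gigabyte of headroom so slightly larger graph variants scheduled later do not immediately force another reallocation. The idea in isolation (a sketch of the reverted FIXME, not upstream behavior):

    // padded reservation: allocate headroom beyond the measured requirement
    const size_t headroom = 1024ull*1024*1024; // 1 GiB, as in the FIXME
    size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]) + headroom;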
From 363e2379f0b0faa18e4b6e57a12f818dc292b3b2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?=
Date: Mon, 19 May 2025 21:36:38 +0200
Subject: [PATCH 08/65] WIP

---
 ggml/src/ggml-backend.cpp | 82 ++++++++++++++++++++++++++++++++-------
 ggml/src/ggml-impl.h      | 62 +++++++++++++++++++++++++++++
 ggml/src/ggml-opt.cpp     | 64 +-----------------------------
 3 files changed, 131 insertions(+), 77 deletions(-)

diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index f9ddc07453400..e3bf11f8d4486 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -624,10 +624,12 @@ static bool ggml_is_view_op(enum ggml_op op) {
 #define GGML_SCHED_MAX_COPIES 4
 #endif
 
+// slice of a graph for nodes [i_start, i_end) running on backend sched->backends[backend_id]
 struct ggml_backend_sched_split {
     int backend_id;
     int i_start;
     int i_end;
+    bool tensor_parallel; // whether the nodes in this split are shared with splits for other backends
     struct ggml_tensor * inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
     int n_inputs;
     // graph view of this split
@@ -927,6 +929,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
     // expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend)
     // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
     // ops unsupported by the backend being expanded will be left unassigned so that they can be assigned later when the locations of its inputs are known
+
     // expand gpu down
     {
         int cur_backend_id = -1;
@@ -1100,6 +1103,8 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             struct ggml_tensor * node = graph->nodes[i];
             if (!ggml_is_view_op(node->op)) {
                 split->backend_id = tensor_backend_id(node);
+                split->tensor_parallel = node->src[0] && ggml_backend_buft_is_split(
+                    ggml_backend_buffer_get_type(node->src[0]->buffer));
                 break;
             }
         }
@@ -1125,10 +1130,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 if (src == NULL) {
                     continue;
                 }
+
                 // check if a weight is on a different and incompatible backend
                 // by starting a new split, the memory of the previously offloaded weights can be reused
                 if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
-                    int src_backend_id = tensor_backend_id(src);
+                    const int src_backend_id = tensor_backend_id(src);
                     if (src_backend_id != cur_backend_id && !ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) {
                         need_new_split = true;
                         break;
@@ -1138,8 +1144,8 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 // FIXME: count the number of inputs instead of only checking when full
                 if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) {
                     const size_t id = hash_id(src);
-                    int src_backend_id = sched->hv_tensor_backend_ids[id];
-                    bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
+                    const int src_backend_id = sched->hv_tensor_backend_ids[id];
+                    const bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
                     if (src_backend_id != cur_backend_id && tensor_id_copy(id, cur_backend_id, 0) == NULL && !supported) {
                         need_new_split = true;
                         break;
                     }
                 }
            }
 
+            const bool src0_on_split_buffer = node->src[0] && node->src[0]->buffer &&
+                ggml_backend_buft_is_split(ggml_backend_buffer_get_type(node->src[0]->buffer));
+            GGML_ASSERT(!src0_on_split_buffer || node->op == GGML_OP_MUL_MAT);
+            need_new_split = src0_on_split_buffer != split->tensor_parallel;
+
+            if (need_new_split && !src0_on_split_buffer && split->tensor_parallel) {
+                split->i_end = i;
+
+                // FIXME
+                const int n_gpus = sched->n_backends - 1;
+                GGML_ASSERT(split->backend_id != sched->n_backends - 1);
+                for (int j = 0; j < n_gpus; j++) {
+                    if (j == split->backend_id) {
+                        continue;
+                    }
+
+                    i_split++;
+                    if (i_split >= sched->splits_capacity) {
+                        sched->splits_capacity *= 2;
+                        sched->splits = (ggml_backend_sched_split *)
+                            realloc(sched->splits, sched->splits_capacity * sizeof(struct ggml_backend_sched_split));
+                        GGML_ASSERT(sched->splits != NULL);
+                    }
+                    sched->splits[i_split] = *split;
+                    sched->splits[i_split].backend_id = j;
+                }
+            }
             if (node_backend_id != cur_backend_id || need_new_split) {
                 split->i_end = i;
                 i_split++;
@@ -1161,6 +1194,7 @@ static void
ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 split->backend_id = node_backend_id;
                 split->i_start = i;
                 split->n_inputs = 0;
+                split->tensor_parallel = src0_on_split_buffer;
                 cur_backend_id = node_backend_id;
             }
 
@@ -1171,12 +1205,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                     continue;
                 }
 
-                size_t src_id = hash_id(src);
+                const size_t src_id = hash_id(src);
                 const int src_backend_id = sched->hv_tensor_backend_ids[src_id];
                 assert(src_backend_id != -1); // all inputs should be assigned by now
 
                 if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
-                    if (tensor_id_copy(src_id, src_backend_id, 0) == NULL) {
+                    if (tensor_id_copy(src_id, src_backend_id, 0) == nullptr) {
                         ggml_backend_t backend = sched->backends[src_backend_id];
                         for (int c = 0; c < sched->n_copies; c++) {
                             struct ggml_tensor * tensor_copy;
@@ -1193,7 +1227,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                             tensor_id_copy(src_id, src_backend_id, c) = tensor_copy;
                             SET_CAUSE(tensor_copy, "4.cpy");
                         }
-                        int n_graph_inputs = sched->n_graph_inputs++;
+                        const int n_graph_inputs = sched->n_graph_inputs++;
                         GGML_ASSERT(n_graph_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
                         sched->graph_inputs[n_graph_inputs] = src;
                     }
@@ -1213,7 +1247,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                             tensor_id_copy(src_id, cur_backend_id, c) = tensor_copy;
                             SET_CAUSE(tensor_copy, "4.cpy");
                         }
-                        int n_inputs = split->n_inputs++;
+                        const int n_inputs = split->n_inputs++;
                         GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
                         split->inputs[n_inputs] = src;
                     }
@@ -1229,6 +1263,24 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         ggml_backend_sched_print_assignments(sched, graph);
     }
 
+#ifndef NDEBUG
+    // assert that the splits are in the expected order
+    // subsequent splits are expected to either contain a contiguous slice of the graph
+    // or to contain the same nodes but executed on subsequent backends
+    assert(sched->n_splits >= 1);
+    assert(sched->splits[0].i_start == 0);
+    for (int i = 1; i < sched->n_splits; i++) {
+        const ggml_backend_sched_split & split_now  = sched->splits[i];
+        const ggml_backend_sched_split & split_prev = sched->splits[i - 1];
+        const bool splits_sequential = split_now.i_start == split_prev.i_end;
+        const bool splits_parallel   = split_now.tensor_parallel && split_prev.tensor_parallel &&
+            split_now.backend_id != split_prev.backend_id &&
+            split_now.i_start == split_prev.i_start && split_now.i_end == split_prev.i_end;
+        assert(splits_sequential || splits_parallel);
+    }
+    assert(sched->splits[sched->n_splits-1].i_end == graph->n_nodes);
+#endif // NDEBUG
+
     // swap node_backend_ids and leaf _backend_ids with prevs
     {
         int * tmp = sched->node_backend_ids;
@@ -1240,13 +1292,13 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         sched->prev_leaf_backend_ids = tmp;
     }
 
-    int graph_size = std::max(graph->n_nodes, graph->n_leafs) + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sched->n_copies;
+    const int graph_size = std::max(graph->n_nodes, graph->n_leafs) + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sched->n_copies;
     if (sched->graph.size < graph_size) {
         sched->graph.size = graph_size;
         sched->graph.nodes = (ggml_tensor **) realloc(sched->graph.nodes, graph_size * sizeof(struct ggml_tensor *));
         sched->graph.leafs = (ggml_tensor **) realloc(sched->graph.leafs, graph_size * sizeof(struct ggml_tensor *));
-        GGML_ASSERT(sched->graph.nodes != NULL);
-        GGML_ASSERT(sched->graph.leafs != NULL);
+        GGML_ASSERT(sched->graph.nodes != nullptr);
+        GGML_ASSERT(sched->graph.leafs != nullptr);
     }
     sched->graph.n_nodes = 0;
     sched->graph.n_leafs = 0;
 
     // add input copies as leafs so that they are allocated first
     for (int i = 0; i < sched->n_graph_inputs; i++) {
         struct ggml_tensor * input = sched->graph_inputs[i];
-        size_t id = hash_id(input);
-        int backend_id = tensor_backend_id(input);
+        const size_t id = hash_id(input);
+        const int backend_id = tensor_backend_id(input);
         for (int c = 0; c < sched->n_copies; c++) {
             struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
             sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
 
     for (int i = 0; i < sched->n_splits; i++) {
         struct ggml_backend_sched_split * split = &sched->splits[i];
-        int backend_id = split->backend_id;
+        const int backend_id = split->backend_id;
         for (int j = 0; j < split->n_inputs; j++) {
             struct ggml_tensor * input = split->inputs[j];
-            size_t id = hash_id(input);
+            const size_t id = hash_id(input);
             for (int c = 0; c < sched->n_copies; c++) {
                 struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
                 sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
@@ -1363,7 +1415,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
 
     for (int i = 0; i < sched->n_splits; i++) {
         struct ggml_backend_sched_split * split = &splits[i];
-        int split_backend_id = split->backend_id;
+        const int split_backend_id = split->backend_id;
         ggml_backend_t split_backend = sched->backends[split_backend_id];
 
         // copy the input tensors to the split backend

diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
index a19cfb14e0f9f..a1d91eefc157d 100644
--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
@@ -593,9 +593,71 @@ static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
 
 #ifdef __cplusplus
 #include <vector>
+#include <map>
 
 // expose GGUF internals for test code
 GGML_API size_t gguf_type_size(enum gguf_type type);
 GGML_API struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params);
 GGML_API void gguf_write_to_buf(const struct gguf_context * ctx, std::vector<int8_t> & buf, bool only_meta);
+
+static ggml_tensor * map_tensor(std::map<ggml_tensor *, ggml_tensor *> & tensor_map, ggml_context * ctx, ggml_tensor * tensor) {
+    if (!tensor) {
+        return nullptr;
+    }
+
+    if (tensor_map.find(tensor) != tensor_map.end()) {
+        return tensor_map[tensor];
+    }
+
+    ggml_tensor * new_tensor = ggml_dup_tensor(ctx, tensor);
+    tensor_map[tensor] = new_tensor;
+
+    new_tensor->op = tensor->op;
+    for (int i = 0; i < GGML_MAX_DIMS; i++) {
+        new_tensor->nb[i] = tensor->nb[i];
+    }
+    new_tensor->flags = tensor->flags;
+    memcpy(new_tensor->op_params, tensor->op_params, sizeof(tensor->op_params));
+    strcpy(new_tensor->name, tensor->name);
+    new_tensor->data = tensor->data;
+    new_tensor->buffer = tensor->buffer;
+    new_tensor->extra = tensor->extra;
+    new_tensor->view_offs = tensor->view_offs;
+    new_tensor->view_src = map_tensor(tensor_map, ctx, tensor->view_src);
+    for (int i = 0; i < GGML_MAX_SRC; i++) {
+        new_tensor->src[i] = map_tensor(tensor_map, ctx, tensor->src[i]);
+    }
+
+    return new_tensor;
+}
+
+static void dup_graph(ggml_context * ctx, const ggml_cgraph * src, ggml_cgraph * dst) {
+    std::map<ggml_tensor *, ggml_tensor *> tensor_map;
+
+    for (int i = 0; i < src->n_leafs; i++) {
+        ggml_build_forward_expand(dst, map_tensor(tensor_map, ctx, src->leafs[i]));
+    }
+    GGML_ASSERT(dst->n_leafs == src->n_leafs);
+
+    for (int i = 0; i < src->n_nodes; i++) {
+        ggml_build_forward_expand(dst, map_tensor(tensor_map, ctx, src->nodes[i]));
+    }
+    GGML_ASSERT(dst->n_nodes == src->n_nodes);
+
+    if (src->grads) {
+        GGML_ASSERT(dst->grads);
+        for (int i = 0; i < src->n_nodes; ++i) {
+            const size_t igrad_src = ggml_hash_find(&src->visited_hash_set, src->nodes[i]);
+            const size_t igrad_dst = ggml_hash_find(&dst->visited_hash_set, dst->nodes[i]);
+
+            GGML_ASSERT(igrad_src != GGML_HASHSET_FULL);
+            GGML_ASSERT(ggml_bitset_get(src->visited_hash_set.used, igrad_src));
+            GGML_ASSERT(igrad_dst != GGML_HASHSET_FULL);
+            GGML_ASSERT(ggml_bitset_get(dst->visited_hash_set.used, igrad_dst));
+
+            dst->grads[igrad_dst]     = src->grads[igrad_src];
+            dst->grad_accs[igrad_dst] = src->grad_accs[igrad_src];
+        }
+    }
+}
 #endif // __cplusplus

diff --git a/ggml/src/ggml-opt.cpp b/ggml/src/ggml-opt.cpp
index 58d77578f458d..09914ce96bd8d 100644
--- a/ggml/src/ggml-opt.cpp
+++ b/ggml/src/ggml-opt.cpp
@@ -9,7 +9,6 @@
 #include <cmath>
 #include <cstdint>
 #include <cinttypes>
-#include <map>
 #include <random>
 #include <vector>
 
@@ -252,66 +251,6 @@ struct ggml_opt_params ggml_opt_default_params(
     };
 }
 
-static ggml_tensor * map_tensor(std::map<ggml_tensor *, ggml_tensor *> & tensor_map, ggml_context * ctx, ggml_tensor * tensor) {
-    if (!tensor) {
-        return nullptr;
-    }
-
-    if (tensor_map.find(tensor) != tensor_map.end()) {
-        return tensor_map[tensor];
-    }
-
-    ggml_tensor * new_tensor = ggml_dup_tensor(ctx, tensor);
-    tensor_map[tensor] = new_tensor;
-
-    new_tensor->op = tensor->op;
-    for (int i = 0; i < GGML_MAX_DIMS; i++) {
-        new_tensor->nb[i] = tensor->nb[i];
-    }
-    new_tensor->flags = tensor->flags;
-    memcpy(new_tensor->op_params, tensor->op_params, sizeof(tensor->op_params));
-    strcpy(new_tensor->name, tensor->name);
-    new_tensor->data = tensor->data;
-    new_tensor->buffer = tensor->buffer;
-    new_tensor->extra = tensor->extra;
-    new_tensor->view_offs = tensor->view_offs;
-    new_tensor->view_src = map_tensor(tensor_map, ctx, tensor->view_src);
-    for (int i = 0; i < GGML_MAX_SRC; i++) {
-        new_tensor->src[i] = map_tensor(tensor_map, ctx, tensor->src[i]);
-    }
-
-    return new_tensor;
-}
-
-static ggml_cgraph * dup_graph(ggml_context * ctx, ggml_cgraph * src) {
-    std::map<ggml_tensor *, ggml_tensor *> tensor_map;
-
-    ggml_cgraph * dst = ggml_new_graph_custom(ctx, src->size, /*grads =*/ true);
-
-    for (int i = 0; i < src->n_leafs; i++) {
-        ggml_build_forward_expand(dst, map_tensor(tensor_map, ctx, src->leafs[i]));
-    }
-    GGML_ASSERT(dst->n_leafs == src->n_leafs);
-    for (int i = 0; i < src->n_nodes; i++) {
-        ggml_build_forward_expand(dst, map_tensor(tensor_map, ctx, src->nodes[i]));
-    }
-    GGML_ASSERT(dst->n_nodes == src->n_nodes);
-    for (int i = 0; i < src->n_nodes; ++i) {
-        const size_t igrad_src = ggml_hash_find(&src->visited_hash_set, src->nodes[i]);
-        const size_t igrad_dst = ggml_hash_find(&dst->visited_hash_set, dst->nodes[i]);
-
-        GGML_ASSERT(igrad_src != GGML_HASHSET_FULL);
-        GGML_ASSERT(ggml_bitset_get(src->visited_hash_set.used, igrad_src));
-        GGML_ASSERT(igrad_dst != GGML_HASHSET_FULL);
-        GGML_ASSERT(ggml_bitset_get(dst->visited_hash_set.used, igrad_dst));
-
-        dst->grads[igrad_dst]     = src->grads[igrad_src];
-        dst->grad_accs[igrad_dst] = src->grad_accs[igrad_src];
-    }
-
-    return dst;
-}
-
 static void ggml_opt_build(ggml_opt_context_t opt_ctx) {
     GGML_ASSERT(opt_ctx->ctx_compute && "no compute context set, either use static graphs or set one with ggml_opt_prepare_alloc");
     GGML_ASSERT((!opt_ctx->static_graphs || opt_ctx->inputs->data) && "when using static graphs the inputs must be allocated statically");
@@ -738,7 +677,8 @@ void ggml_opt_alloc(ggml_opt_context_t opt_ctx, bool backward) {
         ggml_free(opt_ctx->ctx_copy);
         opt_ctx->ctx_copy = ggml_init(params);
 
-        opt_ctx->allocated_graph_copy = dup_graph(opt_ctx->ctx_copy, graph);
+        opt_ctx->allocated_graph_copy = ggml_new_graph_custom(opt_ctx->ctx_copy, graph->size, /*grads =*/ true);
+        dup_graph(opt_ctx->ctx_copy, graph, opt_ctx->allocated_graph_copy);
     } else {
         opt_ctx->allocated_graph_copy = graph;
     }
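`map_tensor()` performs a memoized deep copy: the `tensor_map` ensures every tensor reachable through `src`/`view_src` edges is duplicated exactly once, while `data`, `buffer`, and `extra` still alias the original allocations, so the copy describes the same computation over the same memory. `dup_graph()` rebuilds the graph from the copied tensors and carries the gradient bookkeeping over. Moving both helpers into ggml-impl.h makes them reusable outside ggml-opt.cpp; the call site there now follows this pattern (taken from the diff above):

    // clone a graph into a fresh context so its metadata can be edited freely
    ggml_cgraph * copy = ggml_new_graph_custom(ctx_copy, graph->size, /*grads =*/ true);
    dup_graph(ctx_copy, graph, copy);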
inputs must be allocated statically"); @@ -738,7 +677,8 @@ void ggml_opt_alloc(ggml_opt_context_t opt_ctx, bool backward) { ggml_free(opt_ctx->ctx_copy); opt_ctx->ctx_copy = ggml_init(params); - opt_ctx->allocated_graph_copy = dup_graph(opt_ctx->ctx_copy, graph); + opt_ctx->allocated_graph_copy = ggml_new_graph_custom(opt_ctx->ctx_copy, graph->size, /*grads =*/ true); + dup_graph(opt_ctx->ctx_copy, graph, opt_ctx->allocated_graph_copy); } else { opt_ctx->allocated_graph_copy = graph; } From 47e6d245dc19e285ce86eb821618f4fa2b090db8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Tue, 20 May 2025 11:34:20 +0200 Subject: [PATCH 09/65] fix --- ggml/src/ggml-backend.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index e3bf11f8d4486..e8797c8436165 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -1157,7 +1157,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg const bool src0_on_split_buffer = node->src[0] && node->src[0]->buffer && ggml_backend_buft_is_split(ggml_backend_buffer_get_type(node->src[0]->buffer)); GGML_ASSERT(!src0_on_split_buffer || node->op == GGML_OP_MUL_MAT); - need_new_split = src0_on_split_buffer != split->tensor_parallel; + if (src0_on_split_buffer != split->tensor_parallel) { + need_new_split = true; + } if (need_new_split && !src0_on_split_buffer && split->tensor_parallel) { split->i_end = i; From 751e488638172358e1ebad152cc870e6780c0135 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Tue, 20 May 2025 11:38:15 +0200 Subject: [PATCH 10/65] WIP --- ggml/src/ggml-backend.cpp | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index e8797c8436165..08bc4c70e3db1 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -1161,28 +1161,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg need_new_split = true; } - if (need_new_split && !src0_on_split_buffer && split->tensor_parallel) { - split->i_end = i; - - // FIXME - const int n_gpus = sched->n_backends - 1; - GGML_ASSERT(split->backend_id != sched->n_backends - 1); - for (int j = 0; j < n_gpus; j++) { - if (j == split->backend_id) { - continue; - } - - i_split++; - if (i_split >= sched->splits_capacity) { - sched->splits_capacity *= 2; - sched->splits = (ggml_backend_sched_split *) - realloc(sched->splits, sched->splits_capacity * sizeof(struct ggml_backend_sched_split)); - GGML_ASSERT(sched->splits != NULL); - } - sched->splits[i_split] = *split; - sched->splits[i_split].backend_id = j; - } - } if (node_backend_id != cur_backend_id || need_new_split) { split->i_end = i; i_split++; From 3f8f323ad8e47c1b55a7bc2685f6e5aa9b172bcd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Tue, 20 May 2025 13:31:23 +0200 Subject: [PATCH 11/65] WIP --- ggml/src/ggml-backend.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 08bc4c70e3db1..3c3a94890d8a6 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -1277,8 +1277,8 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg sched->graph.size = graph_size; sched->graph.nodes = (ggml_tensor **) realloc(sched->graph.nodes, graph_size * sizeof(struct ggml_tensor *)); sched->graph.leafs = (ggml_tensor **) 
From 3f8f323ad8e47c1b55a7bc2685f6e5aa9b172bcd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?=
Date: Tue, 20 May 2025 13:31:23 +0200
Subject: [PATCH 11/65] WIP

---
 ggml/src/ggml-backend.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 08bc4c70e3db1..3c3a94890d8a6 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -1277,8 +1277,8 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         sched->graph.size = graph_size;
         sched->graph.nodes = (ggml_tensor **) realloc(sched->graph.nodes, graph_size * sizeof(struct ggml_tensor *));
         sched->graph.leafs = (ggml_tensor **) realloc(sched->graph.leafs, graph_size * sizeof(struct ggml_tensor *));
-        GGML_ASSERT(sched->graph.nodes != nullptr);
-        GGML_ASSERT(sched->graph.leafs != nullptr);
+        GGML_ASSERT(sched->graph.nodes != NULL);
+        GGML_ASSERT(sched->graph.leafs != NULL);
     }
     sched->graph.n_nodes = 0;
     sched->graph.n_leafs = 0;

From 316ef4e8541fde76c4669d8022806521d1503639 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?=
Date: Tue, 20 May 2025 18:25:02 +0200
Subject: [PATCH 12/65] WIP

---
 ggml/src/ggml-backend.cpp | 136 +++++++++++++++++++++++++++++++++++---
 1 file changed, 127 insertions(+), 9 deletions(-)

diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 3c3a94890d8a6..667a9f9a79e59 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -650,6 +650,7 @@ struct ggml_backend_sched {
     struct ggml_hash_set hash_set;
     int * hv_tensor_backend_ids; // [hash_set.size]
     struct ggml_tensor ** hv_tensor_copies;   // [hash_set.size][n_backends][n_copies]
+    struct ggml_tensor ** hv_tensor_parallel; // [hash_set.size][n_backends]
 
     int * node_backend_ids; // [graph_size]
     int * leaf_backend_ids; // [graph_size]
@@ -689,6 +690,7 @@ struct ggml_backend_sched {
 #define tensor_backend_id(tensor) sched->hv_tensor_backend_ids[hash_id(tensor)]
 #define tensor_id_copy(id, backend_id, copy_id) sched->hv_tensor_copies[(id) * sched->n_backends * sched->n_copies + (backend_id) * sched->n_copies + (copy_id)]
 #define tensor_copy(tensor, backend_id, copy_id) tensor_id_copy(hash_id(tensor), backend_id, copy_id)
+#define tensor_id_tp(id, backend_id) sched->hv_tensor_parallel[(id) * sched->n_backends + (backend_id)]
 
 // returns the priority of the backend, lower id is higher priority
 static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backend_t backend) {
@@ -1097,6 +1099,10 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
     {
         int i_split = 0;
         struct ggml_backend_sched_split * split = &sched->splits[0];
+
+        // FIXME
+        const int n_gpus = sched->n_backends - 1;
+
         // find the backend of the first split, skipping view ops
         int i = 0;
         for (; i < graph->n_nodes; i++) {
@@ -1110,6 +1116,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         }
         split->i_start = 0;
         split->n_inputs = 0;
+        GGML_ASSERT(!split->tensor_parallel);
         int cur_backend_id = split->backend_id;
         for (; i < graph->n_nodes; i++) {
             struct ggml_tensor * node = graph->nodes[i];
@@ -1146,7 +1153,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 const size_t id = hash_id(src);
                 const int src_backend_id = sched->hv_tensor_backend_ids[id];
                 const bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
-                if (src_backend_id != cur_backend_id && tensor_id_copy(id, cur_backend_id, 0) == NULL && !supported) {
+                if (src_backend_id != cur_backend_id && tensor_id_copy(id, cur_backend_id, 0) == nullptr && !supported) {
                     need_new_split = true;
                     break;
 
             if (node_backend_id != cur_backend_id || need_new_split) {
                 split->i_end = i;
+                {
+                    const ggml_cgraph tmp = ggml_graph_view(graph, split->i_start, split->i_end);
+                    if (split->tensor_parallel && split->backend_id > 0) {
+                        dup_graph(sched->ctx, &tmp, &split->graph);
+                    } else {
+                        split->graph = tmp;
+                    }
+                }
+                if (split->tensor_parallel) {
+                    for (int n = 0; n < split->graph.n_nodes; n++) {
+                        ggml_tensor * node_orig = graph->nodes[split->i_start + n];
+                        ggml_tensor * node_sg   = split->graph.nodes[n];
+                        assert(node_orig == node_sg || split->backend_id != 0);
+
+                        const size_t node_id = hash_id(node_orig);
+                        assert(tensor_id_tp(node_id, split->backend_id) == nullptr);
+                        tensor_id_tp(node_id, split->backend_id) = node_sg;
+                    }
+
+                    const int i_gpu = split->backend_id;
+                    for (int n = 0; n < split->graph.n_nodes; n++) {
+                        ggml_tensor * dst  = split->graph.nodes[n];
+                        GGML_ASSERT(dst->op == GGML_OP_MUL_MAT);
+                        ggml_tensor * src0 = dst->src[0];
+                        ggml_tensor * src1 = dst->src[1];
+
+                        GGML_ASSERT(ggml_is_contiguous(src0));
+                        GGML_ASSERT(ggml_is_contiguous(src1));
+                        GGML_ASSERT(ggml_is_contiguous(dst));
+
+                        GGML_TENSOR_BINARY_OP_LOCALS;
+
+                        GGML_ASSERT(ne01 == ne0);
+                        GGML_ASSERT(ne01 % n_gpus == 0);
+
+                        const int64_t row_low  = ne01 * i_gpu   /n_gpus;
+                        const int64_t row_high = ne01 * (i_gpu+1)/n_gpus;
+                        const int64_t row_diff = row_high - row_low;
+
+                        src0->data = ((void **) src0->extra)[i_gpu];
+                        src0->ne[1] = row_diff;
+                        src0->nb[2] = src0->ne[1]*src0->nb[1];
+                        src0->nb[3] = src0->ne[2]*src0->nb[2];
+
+                        dst->ne[0] = row_diff;
+                        dst->nb[1] = ggml_row_size(dst->type, dst->ne[0]);
+                        dst->nb[2] = dst->ne[1]*dst->nb[1];
+                        dst->nb[3] = dst->ne[2]*dst->nb[2];
+                    }
+                }
+
+                const ggml_backend_sched_split * prev_split = split;
+
                 i_split++;
                 if (i_split >= sched->splits_capacity) {
                     sched->splits_capacity *= 2;
                     sched->splits = (ggml_backend_sched_split *)
                         realloc(sched->splits, sched->splits_capacity * sizeof(struct ggml_backend_sched_split));
                     GGML_ASSERT(sched->splits != NULL);
                 }
                 split = &sched->splits[i_split];
-                split->backend_id = node_backend_id;
-                split->i_start = i;
-                split->n_inputs = 0;
-                split->tensor_parallel = src0_on_split_buffer;
-                cur_backend_id = node_backend_id;
+
+                if (src0_on_split_buffer && !prev_split->tensor_parallel) {
+                    split->tensor_parallel = src0_on_split_buffer;
+                    split->backend_id = 0;
+                    split->i_start = i;
+                    split->n_inputs = 0;
+                    cur_backend_id = 0;
+                } else if (prev_split->tensor_parallel && prev_split->backend_id < n_gpus - 1) {
+                    *split = *prev_split;
+                    split->backend_id++;
+
+
+                    i = split->i_start;
+                    node = graph->nodes[i];
+                    cur_backend_id = split->backend_id;
+                } else {
+                    split->tensor_parallel = src0_on_split_buffer;
+                    split->backend_id = node_backend_id;
+                    split->i_start = i;
+                    split->n_inputs = 0;
+                    cur_backend_id = node_backend_id;
+                }
             }
 
             // find inputs that are not on the same backend
 
-            if (src_backend_id != cur_backend_id && !ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) {
+            if (tensor_id_tp(src_id, 0) != nullptr) {
+                if (tensor_id_copy(src_id, cur_backend_id, 0) == nullptr) {
+                    GGML_ASSERT(n_gpus == 2);
+                    ggml_backend_t backend = sched->backends[cur_backend_id];
+
+                    for (int c = 0; c < sched->n_copies; c++) {
+                        ggml_tensor * a = tensor_id_tp(src_id, 0);
+                        ggml_tensor * b = tensor_id_tp(src_id, 1);
+                        if (cur_backend_id == 0) {
+                            ggml_tensor * b_copy = ggml_dup_tensor_layout(sched->ctx, b);
+                            ggml_format_name(b_copy, "%s#%s part1#%d", ggml_backend_name(backend), src->name, c);
+                            SET_CAUSE(b_copy, "5.cpy for gather");
+                            if (sched->n_copies > 1) {
+                                ggml_set_input(b_copy);
+                                ggml_set_output(b_copy); // prevent ggml-alloc from overwriting the tensor
+                            }
+                            b = b_copy;
+                        } else {
+                            ggml_tensor * a_copy = ggml_dup_tensor_layout(sched->ctx, a);
+                            ggml_format_name(a_copy, "%s#%s part0#%d", ggml_backend_name(backend), src->name, c);
+                            SET_CAUSE(a_copy, "5.cpy for gather");
+                            if (sched->n_copies > 1) {
+                                ggml_set_input(a_copy);
+                                ggml_set_output(a_copy); // prevent ggml-alloc from overwriting the tensor
+                            }
+                            a = a_copy;
+                        }
+                        ggml_tensor * concat = ggml_concat(sched->ctx, a, b, /*dim =*/ 0);
+
+                        ggml_format_name(concat, "%s#%s concat#%d", ggml_backend_name(backend), src->name, c);
+                        tensor_id_copy(src_id, cur_backend_id, c) = concat;
+                    }
+                    const int n_inputs = split->n_inputs++;
+                    GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
+                    split->inputs[n_inputs] = src;
+                }
+                node->src[j] = tensor_id_copy(src_id, cur_backend_id, sched->cur_copy);
+            } else if (src_backend_id != cur_backend_id && !ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) {
                 // create a copy of the input in the split's backend
-                if (tensor_id_copy(src_id, cur_backend_id, 0) == NULL) {
+                if (tensor_id_copy(src_id, cur_backend_id, 0) == nullptr) {
                     ggml_backend_t backend = sched->backends[cur_backend_id];
                     for (int c = 0; c < sched->n_copies; c++) {
                         struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
 
         split->i_end = graph->n_nodes;
+        GGML_ASSERT(!split->tensor_parallel);
+        split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
         sched->n_splits = i_split + 1;
     }
 
     for (int i = 0; i < sched->n_splits; i++) {
         struct ggml_backend_sched_split * split = &sched->splits[i];
-        split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
 
         // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
         for (int j = 0; j < split->n_inputs; j++) {
@@ -1508,6 +1623,7 @@ ggml_backend_sched_t ggml_backend_sched_new(
     sched->hash_set = ggml_hash_set_new(graph_size);
     sched->hv_tensor_backend_ids = (int *) malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
     sched->hv_tensor_copies = (ggml_tensor **) malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *));
+    sched->hv_tensor_parallel = (ggml_tensor **) calloc(sched->hash_set.size * sched->n_backends, sizeof(struct ggml_tensor *));
 
     const size_t ggml_sched_max_splits = graph_size; // at most there is one split for each node in the graph
     const size_t nodes_size = graph_size + ggml_sched_max_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2;
 
     free(sched->splits);
     free(sched->hv_tensor_backend_ids);
     free(sched->hv_tensor_copies);
+    free(sched->hv_tensor_parallel);
     free(sched->node_backend_ids);
     free(sched->leaf_backend_ids);
     free(sched->prev_node_backend_ids);
 
         ggml_hash_set_reset(&sched->hash_set);
         memset(sched->hv_tensor_backend_ids, -1, sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
         memset(sched->hv_tensor_copies, 0, sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *));
+        memset(sched->hv_tensor_parallel, 0, sched->hash_set.size * sched->n_backends * sizeof(struct ggml_tensor *));
         sched->is_reset = true;
     }
     sched->is_alloc = false;
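Patch 12 contains the core tensor-parallel mechanics. For each backend's copy of a tensor-parallel split, the matrix-multiplication weights and outputs are re-sliced by rows: with `ne01` output rows and `n_gpus` devices, device `i_gpu` owns rows `[ne01*i_gpu/n_gpus, ne01*(i_gpu+1)/n_gpus)`, its `src0->data` is repointed at that device's shard via `extra`, and `dst` is shrunk to the matching row count. A worked example under the patch's assumptions (`ne01 % n_gpus == 0`):

    // ne01 = 4096 weight rows, n_gpus = 2:
    //   i_gpu = 0: row_low = 0,    row_high = 2048, row_diff = 2048
    //   i_gpu = 1: row_low = 2048, row_high = 4096, row_diff = 2048
    // each device computes a [2048, ne11] slice of the full [4096, ne11] dst

When a later node consumes such a partial result on a single backend, the scheduler stitches the halves back together with `ggml_concat(ctx, a, b, /*dim =*/ 0)`, copying the remote half to the consuming backend first; the `GGML_ASSERT(n_gpus == 2)` marks that this gather path is hard-coded to two devices for now.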
From 47b228f359f00e0f457c74448081022c1f187d34 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?=
Date: Tue, 20 May 2025 18:33:37 +0200
Subject: [PATCH 13/65] try fix

---
 ggml/src/ggml-backend.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 667a9f9a79e59..b7427cd8c2dca 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -1121,7 +1121,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         for (; i < graph->n_nodes; i++) {
             struct ggml_tensor * node = graph->nodes[i];
 
-            if (ggml_is_view_op(node->op)) {
+            if (ggml_is_view_op(node->op) && !split->tensor_parallel) {
                 continue;
             }

From cf4d0b6b734bb3e5000188417125a59803b01e62 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?=
Date: Tue, 20 May 2025 23:46:13 +0200
Subject: [PATCH 14/65] try fix

---
 ggml/src/ggml-backend.cpp |  2 +-
 ggml/src/ggml-impl.h      | 22 +++++++++++++++-------
 ggml/src/ggml-opt.cpp     |  2 +-
 3 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index b7427cd8c2dca..55e8041b942d7 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -1173,7 +1173,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 {
                     const ggml_cgraph tmp = ggml_graph_view(graph, split->i_start, split->i_end);
                     if (split->tensor_parallel && split->backend_id > 0) {
-                        dup_graph(sched->ctx, &tmp, &split->graph);
+                        dup_graph(sched->ctx, &tmp, &split->graph, /*expand =*/ false);
                     } else {
                         split->graph = tmp;
                     }

diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
index a1d91eefc157d..7b2f64acc7a78 100644
--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
@@ -631,17 +631,25 @@ static ggml_tensor * map_tensor(std::map<ggml_tensor *, ggml_tensor *> & tensor_
     return new_tensor;
 }
 
-static void dup_graph(ggml_context * ctx, const ggml_cgraph * src, ggml_cgraph * dst) {
+static void dup_graph(ggml_context * ctx, const ggml_cgraph * src, ggml_cgraph * dst, bool expand) {
     std::map<ggml_tensor *, ggml_tensor *> tensor_map;
 
-    for (int i = 0; i < src->n_leafs; i++) {
-        ggml_build_forward_expand(dst, map_tensor(tensor_map, ctx, src->leafs[i]));
+    if (expand) {
+        for (int i = 0; i < src->n_leafs; i++) {
+            ggml_build_forward_expand(dst, map_tensor(tensor_map, ctx, src->leafs[i]));
+        }
+        for (int i = 0; i < src->n_nodes; i++) {
+            ggml_build_forward_expand(dst, map_tensor(tensor_map, ctx, src->nodes[i]));
+        }
+    } else {
+        for (int i = 0; i < src->n_leafs; i++) {
+            dst->leafs[dst->n_leafs++] = map_tensor(tensor_map, ctx, src->leafs[i]);
+        }
+        for (int i = 0; i < src->n_nodes; i++) {
+            dst->nodes[dst->n_nodes++] = map_tensor(tensor_map, ctx, src->nodes[i]);
+        }
     }
     GGML_ASSERT(dst->n_leafs == src->n_leafs);
-
-    for (int i = 0; i < src->n_nodes; i++) {
-        ggml_build_forward_expand(dst, map_tensor(tensor_map, ctx, src->nodes[i]));
-    }
     GGML_ASSERT(dst->n_nodes == src->n_nodes);
 
     if (src->grads) {

diff --git a/ggml/src/ggml-opt.cpp b/ggml/src/ggml-opt.cpp
index 09914ce96bd8d..1fa35ec271ee3 100644
--- a/ggml/src/ggml-opt.cpp
+++ b/ggml/src/ggml-opt.cpp
@@ -678,7 +678,7 @@ void ggml_opt_alloc(ggml_opt_context_t opt_ctx, bool backward) {
         opt_ctx->ctx_copy = ggml_init(params);
 
         opt_ctx->allocated_graph_copy = ggml_new_graph_custom(opt_ctx->ctx_copy, graph->size, /*grads =*/ true);
-        dup_graph(opt_ctx->ctx_copy, graph, opt_ctx->allocated_graph_copy);
+        dup_graph(opt_ctx->ctx_copy, graph, opt_ctx->allocated_graph_copy, /*expand =*/ true);
     } else {
         opt_ctx->allocated_graph_copy = graph;
     }
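The new `expand` flag selects between two copy modes: with `expand = true` the duplicate is rebuilt through `ggml_build_forward_expand()`, which re-derives dependency order and is what ggml-opt needs for a full training graph; with `expand = false` leafs and nodes are copied positionally, so `dst->nodes[n]` always corresponds to `src->nodes[n]` — the property the scheduler relies on when it indexes a tensor-parallel split's copy by node position. The two call sites from these patches:

    // scheduler: positional copy of a graph slice, node order preserved exactly
    dup_graph(sched->ctx, &tmp, &split->graph, /*expand =*/ false);

    // ggml-opt: full rebuild with dependency expansion
    dup_graph(opt_ctx->ctx_copy, graph, opt_ctx->allocated_graph_copy, /*expand =*/ true);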
From bb48a906fc86c55c4c5555c6425d2075c9736c40 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?=
Date: Wed, 21 May 2025 00:06:13 +0200
Subject: [PATCH 15/65] try fix

---
 ggml/src/ggml-backend.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 55e8041b942d7..43bee69b5261b 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -1173,6 +1173,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 {
                     const ggml_cgraph tmp = ggml_graph_view(graph, split->i_start, split->i_end);
                     if (split->tensor_parallel && split->backend_id > 0) {
+                        split->graph = *ggml_new_graph_custom(sched->ctx, tmp.size, /*grads =*/ false);
                         dup_graph(sched->ctx, &tmp, &split->graph, /*expand =*/ false);
                     } else {
                         split->graph = tmp;

From 016405b26225f67e7324fb84bdfce3de9432ce35 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?=
Date: Wed, 21 May 2025 23:35:47 +0200
Subject: [PATCH 16/65] WIP

---
 ggml/src/ggml-backend.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 43bee69b5261b..c22317d20fe31 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -1328,7 +1328,8 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                     split->inputs[n_inputs] = src;
                 }
                 node->src[j] = tensor_id_copy(src_id, cur_backend_id, sched->cur_copy);
-            } else if (src_backend_id != cur_backend_id && !ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) {
+            } else if (src_backend_id != cur_backend_id && !ggml_backend_sched_buffer_supported(sched, src, cur_backend_id) &&
+                !ggml_backend_buft_is_split(ggml_backend_buffer_get_type(src->buffer))) {
                 // create a copy of the input in the split's backend
                 if (tensor_id_copy(src_id, cur_backend_id, 0) == nullptr) {
                     ggml_backend_t backend = sched->backends[cur_backend_id];

From 7c17ff1acbfb37c8dcb910b3832bd9839f061a7f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?=
Date: Wed, 21 May 2025 23:37:51 +0200
Subject: [PATCH 17/65] WIP

---
 ggml/src/ggml-backend.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index c22317d20fe31..6ba382838a4fe 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -1329,7 +1329,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 }
                 node->src[j] = tensor_id_copy(src_id, cur_backend_id, sched->cur_copy);
             } else if (src_backend_id != cur_backend_id && !ggml_backend_sched_buffer_supported(sched, src, cur_backend_id) &&
-                !ggml_backend_buft_is_split(ggml_backend_buffer_get_type(src->buffer))) {
+                !(src->buffer && ggml_backend_buft_is_split(ggml_backend_buffer_get_type(src->buffer)))) {

From deda9c26bedac6ddf5a8c1c008d0910df61419bd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?=
Date: Thu, 22 May 2025 15:03:57 +0200
Subject: [PATCH 18/65] WIP

---
 ggml/src/ggml-backend.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 6ba382838a4fe..609ed442ab833 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -1188,6 +1188,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                         const size_t node_id = hash_id(node_orig);
                         assert(tensor_id_tp(node_id, split->backend_id) == nullptr);
                         tensor_id_tp(node_id, split->backend_id) = node_sg;
+                        tensor_backend_id(node_sg) = split->backend_id;
                     }
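Patches 16-18 patch up the generic input-copy path: a source tensor that lives in a split buffer must not be duplicated with `ggml_dup_tensor_layout()`, because it has no single-device layout to copy — in the CUDA backend its per-device shards are reached through `tensor->extra`, as patch 12 does when re-slicing:

    // per-device shard access for a row-split tensor (pattern from patch 12)
    src0->data = ((void **) src0->extra)[i_gpu];

Patch 17 additionally guards against `src->buffer == NULL` (a tensor that is not yet allocated has no buffer to query), and patch 18 records a backend id for each duplicated node so later passes can look it up through `tensor_backend_id()`.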
From 3c1291ff5a78e1d0541362b03b8a913ca5c49068 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?=
Date: Thu, 22 May 2025 15:25:50 +0200
Subject: [PATCH 19/65] WIP

---
 ggml/src/ggml-backend.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 609ed442ab833..a72da7ced359c 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -1328,6 +1328,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                     GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
                     split->inputs[n_inputs] = src;
                 }
+                fprintf(stderr, "%s: 100 replacing src%d=%s of %s\n", __func__, j, node->src[j]->name, node->name);
                 node->src[j] = tensor_id_copy(src_id, cur_backend_id, sched->cur_copy);
             } else if (src_backend_id != cur_backend_id && !ggml_backend_sched_buffer_supported(sched, src, cur_backend_id) &&
                 !(src->buffer && ggml_backend_buft_is_split(ggml_backend_buffer_get_type(src->buffer)))) {
@@ -1348,6 +1349,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                     GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
                     split->inputs[n_inputs] = src;
                 }
+                fprintf(stderr, "%s: 200 replacing src%d=%s of %s\n", __func__, j, node->src[j]->name, node->name);
                 node->src[j] = tensor_id_copy(src_id, cur_backend_id, sched->cur_copy);
             }
         }

From 16d29fe9e4282e1ce752656f75f2d3db1ae51119 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?=
Date: Thu, 22 May 2025 15:52:00 +0200
Subject: [PATCH 20/65] WIP

---
 ggml/src/ggml-backend.cpp | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index a72da7ced359c..4fc609c3afab6 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -1172,11 +1172,18 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 split->i_end = i;
                 {
                     const ggml_cgraph tmp = ggml_graph_view(graph, split->i_start, split->i_end);
-                    if (split->tensor_parallel && split->backend_id > 0) {
+                    int isplit_main = i_split;
+                    while (sched->splits[isplit_main].tensor_parallel) {
+                        isplit_main--;
+                        assert(isplit_main >= 0);
+                    }
+                    const int backend_id_main = sched->splits[isplit_main].backend_id;
+
+                    if (split->backend_id == backend_id_main) {
+                        split->graph = tmp;
+                    } else {
                         split->graph = *ggml_new_graph_custom(sched->ctx, tmp.size, /*grads =*/ false);
                         dup_graph(sched->ctx, &tmp, &split->graph, /*expand =*/ false);
-                    } else {
-                        split->graph = tmp;
                     }
                 }

From 7468e9d4392e471bc774692b03478c54f6883d21 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?=
Date: Thu, 22 May 2025 15:53:30 +0200
Subject: [PATCH 21/65] WIP

---
 ggml/src/ggml-backend.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 4fc609c3afab6..48f222dff1542 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -1190,7 +1190,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                     for (int n = 0; n < split->graph.n_nodes; n++) {
                         ggml_tensor * node_orig = graph->nodes[split->i_start + n];
                         ggml_tensor * node_sg   = split->graph.nodes[n];
-                        assert(node_orig == node_sg || split->backend_id != 0);
 
                         const size_t node_id = hash_id(node_orig);
                         assert(tensor_id_tp(node_id, split->backend_id) == nullptr);
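Patch 20 generalizes which backend keeps the original nodes: instead of assuming backend 0, the scheduler walks backwards past the tensor-parallel splits to the last sequential split and reuses its backend as the "main" one; only that backend's tensor-parallel split aliases the original graph nodes, every other backend gets a positional deep copy (patch 21 drops the assertion this invalidates). An illustrative split layout for two GPUs:

    // splits[k]   : sequential,      backend 1, nodes [a, b)  <- isplit_main stops here
    // splits[k+1] : tensor_parallel, backend 1, nodes [b, c)  -> plain view of the graph
    // splits[k+2] : tensor_parallel, backend 0, nodes [b, c)  -> deep copy via dup_graph

The debug `fprintf` calls from patch 19 trace which `src` pointers get rewritten to copies while this logic is being brought up.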
From 119657a5768523d0a75e7ac602028abbcf226a3f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?=
Date: Thu, 22 May 2025 17:39:24 +0200
Subject: [PATCH 22/65] WIP

---
 ggml/src/ggml-backend.cpp | 146 ++------------------------------------
 1 file changed, 7 insertions(+), 139 deletions(-)

diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 48f222dff1542..26d487ece5f05 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -1099,29 +1099,22 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
     {
         int i_split = 0;
         struct ggml_backend_sched_split * split = &sched->splits[0];
-
-        // FIXME
-        const int n_gpus = sched->n_backends - 1;
-
         // find the backend of the first split, skipping view ops
         int i = 0;
         for (; i < graph->n_nodes; i++) {
             struct ggml_tensor * node = graph->nodes[i];
             if (!ggml_is_view_op(node->op)) {
                 split->backend_id = tensor_backend_id(node);
-                split->tensor_parallel = node->src[0] && ggml_backend_buft_is_split(
-                    ggml_backend_buffer_get_type(node->src[0]->buffer));
                 break;
             }
         }
         split->i_start = 0;
         split->n_inputs = 0;
-        GGML_ASSERT(!split->tensor_parallel);
         int cur_backend_id = split->backend_id;
         for (; i < graph->n_nodes; i++) {
             struct ggml_tensor * node = graph->nodes[i];
 
-            if (ggml_is_view_op(node->op) && !split->tensor_parallel) {
+            if (ggml_is_view_op(node->op)) {
                 continue;
             }
 
                 }
             }
 
-            const bool src0_on_split_buffer = node->src[0] && node->src[0]->buffer &&
-                ggml_backend_buft_is_split(ggml_backend_buffer_get_type(node->src[0]->buffer));
-            GGML_ASSERT(!src0_on_split_buffer || node->op == GGML_OP_MUL_MAT);
-            if (src0_on_split_buffer != split->tensor_parallel) {
-                need_new_split = true;
-            }
-
             if (node_backend_id != cur_backend_id || need_new_split) {
                 split->i_end = i;
-                {
-                    const ggml_cgraph tmp = ggml_graph_view(graph, split->i_start, split->i_end);
-                    int isplit_main = i_split;
-                    while (sched->splits[isplit_main].tensor_parallel) {
-                        isplit_main--;
-                        assert(isplit_main >= 0);
-                    }
-                    const int backend_id_main = sched->splits[isplit_main].backend_id;
-
-                    if (split->backend_id == backend_id_main) {
-                        split->graph = tmp;
-                    } else {
-                        split->graph = *ggml_new_graph_custom(sched->ctx, tmp.size, /*grads =*/ false);
-                        dup_graph(sched->ctx, &tmp, &split->graph, /*expand =*/ false);
-                    }
-                }
-                if (split->tensor_parallel) {
-                    for (int n = 0; n < split->graph.n_nodes; n++) {
-                        ggml_tensor * node_orig = graph->nodes[split->i_start + n];
-                        ggml_tensor * node_sg   = split->graph.nodes[n];
-
-                        const size_t node_id = hash_id(node_orig);
-                        assert(tensor_id_tp(node_id, split->backend_id) == nullptr);
-                        tensor_id_tp(node_id, split->backend_id) = node_sg;
-                        tensor_backend_id(node_sg) = split->backend_id;
-                    }
-
-                    const int i_gpu = split->backend_id;
-                    for (int n = 0; n < split->graph.n_nodes; n++) {
-                        ggml_tensor * dst = split->graph.nodes[n];
-                        GGML_ASSERT(dst->op == GGML_OP_MUL_MAT);
-                        ggml_tensor * src0 = dst->src[0];
-                        ggml_tensor * src1 = dst->src[1];
-
-                        GGML_ASSERT(ggml_is_contiguous(src0));
-                        GGML_ASSERT(ggml_is_contiguous(src1));
-                        GGML_ASSERT(ggml_is_contiguous(dst));
-
-                        GGML_TENSOR_BINARY_OP_LOCALS;
-
-                        GGML_ASSERT(ne01 == ne0);
-                        GGML_ASSERT(ne01 % n_gpus == 0);
-
-                        const int64_t row_low  = ne01 * i_gpu   /n_gpus;
-                        const int64_t row_high = ne01 * (i_gpu+1)/n_gpus;
-                        const int64_t row_diff = row_high - row_low;
-
-                        src0->data = ((void **) src0->extra)[i_gpu];
-                        src0->ne[1] = row_diff;
-                        src0->nb[2] = src0->ne[1]*src0->nb[1];
-                        src0->nb[3] = src0->ne[2]*src0->nb[2];
-
-                        dst->ne[0] = row_diff;
-                        dst->nb[1] = ggml_row_size(dst->type, dst->ne[0]);
-                        dst->nb[2] = dst->ne[1]*dst->nb[1];
-                        dst->nb[3] = dst->ne[2]*dst->nb[2];
- } - } - - const ggml_backend_sched_split * prev_split = split; - i_split++; if (i_split >= sched->splits_capacity) { sched->splits_capacity *= 2; @@ -1239,28 +1164,10 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg GGML_ASSERT(sched->splits != NULL); } split = &sched->splits[i_split]; - - if (src0_on_split_buffer && !prev_split->tensor_parallel) { - split->tensor_parallel = src0_on_split_buffer; - split->backend_id = 0; - split->i_start = i; - split->n_inputs = 0; - cur_backend_id = 0; - } else if (prev_split->tensor_parallel && prev_split->backend_id < n_gpus - 1) { - *split = *prev_split; - split->backend_id++; - - - i = split->i_start; - node = graph->nodes[i]; - cur_backend_id = split->backend_id; - } else { - split->tensor_parallel = src0_on_split_buffer; - split->backend_id = node_backend_id; - split->i_start = i; - split->n_inputs = 0; - cur_backend_id = node_backend_id; - } + split->backend_id = node_backend_id; + split->i_start = i; + split->n_inputs = 0; + cur_backend_id = node_backend_id; } // find inputs that are not on the same backend @@ -1292,52 +1199,13 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg tensor_id_copy(src_id, src_backend_id, c) = tensor_copy; SET_CAUSE(tensor_copy, "4.cpy"); } - const int n_graph_inputs = sched->n_graph_inputs++; + int n_graph_inputs = sched->n_graph_inputs++; GGML_ASSERT(n_graph_inputs < GGML_SCHED_MAX_SPLIT_INPUTS); sched->graph_inputs[n_graph_inputs] = src; } } - if (tensor_id_tp(src_id, 0) != nullptr) { - if (tensor_id_copy(src_id, cur_backend_id, 0) == nullptr) { - GGML_ASSERT(n_gpus == 2); - ggml_backend_t backend = sched->backends[cur_backend_id]; - - for (int c = 0; c < sched->n_copies; c++) { - ggml_tensor * a = tensor_id_tp(src_id, 0); - ggml_tensor * b = tensor_id_tp(src_id, 1); - if (cur_backend_id == 0) { - ggml_tensor * b_copy = ggml_dup_tensor_layout(sched->ctx, b); - ggml_format_name(b_copy, "%s#%s part1#%d", ggml_backend_name(backend), src->name, c); - SET_CAUSE(b_copy, "5.cpy for gather"); - if (sched->n_copies > 1) { - ggml_set_input(b_copy); - ggml_set_output(b_copy); // prevent ggml-alloc from overwriting the tensor - } - b = b_copy; - } else { - ggml_tensor * a_copy = ggml_dup_tensor_layout(sched->ctx, a); - ggml_format_name(a_copy, "%s#%s part0#%d", ggml_backend_name(backend), src->name, c); - SET_CAUSE(a_copy, "5.cpy for gather"); - if (sched->n_copies > 1) { - ggml_set_input(a_copy); - ggml_set_output(a_copy); // prevent ggml-alloc from overwriting the tensor - } - a = a_copy; - } - ggml_tensor * concat = ggml_concat(sched->ctx, a, b, /*dim =*/ 0); - - ggml_format_name(concat, "%s#%s concat#%d", ggml_backend_name(backend), src->name, c); - tensor_id_copy(src_id, cur_backend_id, c) = concat; - } - const int n_inputs = split->n_inputs++; - GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS); - split->inputs[n_inputs] = src; - } - fprintf(stderr, "%s: 100 replacing src%d=%s of %s\n", __func__, j, node->src[j]->name, node->name); - node->src[j] = tensor_id_copy(src_id, cur_backend_id, sched->cur_copy); - } else if (src_backend_id != cur_backend_id && !ggml_backend_sched_buffer_supported(sched, src, cur_backend_id) && - !(src->buffer && ggml_backend_buft_is_split(ggml_backend_buffer_get_type(src->buffer)))) { + if (src_backend_id != cur_backend_id && !ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) { // create a copy of the input in the split's backend if (tensor_id_copy(src_id, cur_backend_id, 0) == nullptr) { ggml_backend_t backend 
= sched->backends[cur_backend_id]; From 50d2c5e225bcbc3902fb9251f198ef883aa7f386 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Thu, 22 May 2025 18:00:55 +0200 Subject: [PATCH 23/65] WIP --- ggml/src/ggml-backend.cpp | 61 +++++++++++++++++++++------------------ 1 file changed, 33 insertions(+), 28 deletions(-) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 26d487ece5f05..65311e395dc11 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -1096,21 +1096,21 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } // pass 5: split graph, find tensors that need to be copied + std::vector splits_no_tp; { - int i_split = 0; - struct ggml_backend_sched_split * split = &sched->splits[0]; + ggml_backend_sched_split split; // find the backend of the first split, skipping view ops - int i = 0; + int i = 0; // graph node index for (; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; if (!ggml_is_view_op(node->op)) { - split->backend_id = tensor_backend_id(node); + split.backend_id = tensor_backend_id(node); break; } } - split->i_start = 0; - split->n_inputs = 0; - int cur_backend_id = split->backend_id; + split.i_start = 0; + split.n_inputs = 0; + int cur_backend_id = split.backend_id; for (; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; @@ -1124,10 +1124,10 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg // check if we should start a new split based on the sources of the current node bool need_new_split = false; - if (node_backend_id == cur_backend_id && split->n_inputs > 0) { + if (node_backend_id == cur_backend_id && split.n_inputs > 0) { for (int j = 0; j < GGML_MAX_SRC; j++) { struct ggml_tensor * src = node->src[j]; - if (src == NULL) { + if (src == nullptr) { continue; } @@ -1142,7 +1142,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } // check if the split has too many inputs // FIXME: count the number of inputs instead of only checking when full - if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) { + if (split.n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) { const size_t id = hash_id(src); const int src_backend_id = sched->hv_tensor_backend_ids[id]; const bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id); @@ -1155,18 +1155,13 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } if (node_backend_id != cur_backend_id || need_new_split) { - split->i_end = i; - i_split++; - if (i_split >= sched->splits_capacity) { - sched->splits_capacity *= 2; - sched->splits = (ggml_backend_sched_split *) - realloc(sched->splits, sched->splits_capacity * sizeof(struct ggml_backend_sched_split)); - GGML_ASSERT(sched->splits != NULL); - } - split = &sched->splits[i_split]; - split->backend_id = node_backend_id; - split->i_start = i; - split->n_inputs = 0; + split.i_end = i; + split.graph = ggml_graph_view(graph, split.i_start, split.i_end); + splits_no_tp.push_back(split); + + split.backend_id = node_backend_id; + split.i_start = i; + split.n_inputs = 0; cur_backend_id = node_backend_id; } @@ -1219,20 +1214,30 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg tensor_id_copy(src_id, cur_backend_id, c) = tensor_copy; SET_CAUSE(tensor_copy, "4.cpy"); } - const int n_inputs = split->n_inputs++; + const int n_inputs = split.n_inputs++; GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS); - 
split->inputs[n_inputs] = src; + split.inputs[n_inputs] = src; } fprintf(stderr, "%s: 200 replacing src%d=%s of %s\n", __func__, j, node->src[j]->name, node->name); node->src[j] = tensor_id_copy(src_id, cur_backend_id, sched->cur_copy); } } } - split->i_end = graph->n_nodes; - GGML_ASSERT(!split->tensor_parallel); - split->graph = ggml_graph_view(graph, split->i_start, split->i_end); - sched->n_splits = i_split + 1; + split.i_end = graph->n_nodes; + GGML_ASSERT(!split.tensor_parallel); + split.graph = ggml_graph_view(graph, split.i_start, split.i_end); + } + + for (size_t i_split = 0; i_split < splits_no_tp.size(); i_split++) { + if (int(i_split) >= sched->splits_capacity) { + sched->splits_capacity *= 2; + sched->splits = (ggml_backend_sched_split *) + realloc(sched->splits, sched->splits_capacity * sizeof(struct ggml_backend_sched_split)); + GGML_ASSERT(sched->splits != NULL); + } + sched->splits[i_split] = splits_no_tp[i_split]; } + sched->n_splits = splits_no_tp.size(); if (sched->debug) { ggml_backend_sched_print_assignments(sched, graph); From fe2747e227c3223303a027d20ed58c87c162caeb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Thu, 22 May 2025 18:04:08 +0200 Subject: [PATCH 24/65] try fix --- ggml/src/ggml-backend.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 65311e395dc11..157f3409d4540 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -1218,13 +1218,13 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS); split.inputs[n_inputs] = src; } - fprintf(stderr, "%s: 200 replacing src%d=%s of %s\n", __func__, j, node->src[j]->name, node->name); + // fprintf(stderr, "%s: 200 replacing src%d=%s of %s\n", __func__, j, node->src[j]->name, node->name); node->src[j] = tensor_id_copy(src_id, cur_backend_id, sched->cur_copy); } } } split.i_end = graph->n_nodes; - GGML_ASSERT(!split.tensor_parallel); + // GGML_ASSERT(!split.tensor_parallel); split.graph = ggml_graph_view(graph, split.i_start, split.i_end); } From 6ddf206968c1c802e71ea8bc4147b6ff15a040ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Thu, 22 May 2025 18:09:15 +0200 Subject: [PATCH 25/65] try fix --- ggml/src/ggml-backend.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 157f3409d4540..03b83f8f07313 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -1226,6 +1226,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg split.i_end = graph->n_nodes; // GGML_ASSERT(!split.tensor_parallel); split.graph = ggml_graph_view(graph, split.i_start, split.i_end); + splits_no_tp.push_back(split); } for (size_t i_split = 0; i_split < splits_no_tp.size(); i_split++) { From 996d263325d5adb419eeef3e4918dc4e1fc9ba8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Thu, 22 May 2025 20:47:49 +0200 Subject: [PATCH 26/65] WIP --- ggml/src/ggml-backend.cpp | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 03b83f8f07313..6da82c082cbed 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -1110,7 +1110,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } split.i_start = 0; split.n_inputs = 0; - int 
cur_backend_id = split.backend_id; for (; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; @@ -1124,7 +1123,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg // check if we should start a new split based on the sources of the current node bool need_new_split = false; - if (node_backend_id == cur_backend_id && split.n_inputs > 0) { + if (node_backend_id == split.backend_id && split.n_inputs > 0) { for (int j = 0; j < GGML_MAX_SRC; j++) { struct ggml_tensor * src = node->src[j]; if (src == nullptr) { @@ -1135,7 +1134,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg // by starting a new split, the memory of the previously offloaded weights can be reused if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { const int src_backend_id = tensor_backend_id(src); - if (src_backend_id != cur_backend_id && !ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) { + if (src_backend_id != split.backend_id && !ggml_backend_sched_buffer_supported(sched, src, split.backend_id)) { need_new_split = true; break; } @@ -1145,8 +1144,8 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg if (split.n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) { const size_t id = hash_id(src); const int src_backend_id = sched->hv_tensor_backend_ids[id]; - const bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id); - if (src_backend_id != cur_backend_id && tensor_id_copy(id, cur_backend_id, 0) == nullptr && !supported) { + const bool supported = ggml_backend_sched_buffer_supported(sched, src, split.backend_id); + if (src_backend_id != split.backend_id && tensor_id_copy(id, split.backend_id, 0) == nullptr && !supported) { need_new_split = true; break; } @@ -1154,7 +1153,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } } - if (node_backend_id != cur_backend_id || need_new_split) { + if (node_backend_id != split.backend_id || need_new_split) { split.i_end = i; split.graph = ggml_graph_view(graph, split.i_start, split.i_end); splits_no_tp.push_back(split); @@ -1162,7 +1161,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg split.backend_id = node_backend_id; split.i_start = i; split.n_inputs = 0; - cur_backend_id = node_backend_id; } // find inputs that are not on the same backend @@ -1200,10 +1198,10 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } } - if (src_backend_id != cur_backend_id && !ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) { + if (src_backend_id != split.backend_id && !ggml_backend_sched_buffer_supported(sched, src, split.backend_id)) { // create a copy of the input in the split's backend - if (tensor_id_copy(src_id, cur_backend_id, 0) == nullptr) { - ggml_backend_t backend = sched->backends[cur_backend_id]; + if (tensor_id_copy(src_id, split.backend_id, 0) == nullptr) { + ggml_backend_t backend = sched->backends[split.backend_id]; for (int c = 0; c < sched->n_copies; c++) { struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src); ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c); @@ -1211,7 +1209,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg ggml_set_input(tensor_copy); ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor } - tensor_id_copy(src_id, cur_backend_id, c) = tensor_copy; + 
tensor_id_copy(src_id, split.backend_id, c) = tensor_copy; SET_CAUSE(tensor_copy, "4.cpy"); } const int n_inputs = split.n_inputs++; @@ -1219,7 +1217,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg split.inputs[n_inputs] = src; } // fprintf(stderr, "%s: 200 replacing src%d=%s of %s\n", __func__, j, node->src[j]->name, node->name); - node->src[j] = tensor_id_copy(src_id, cur_backend_id, sched->cur_copy); + node->src[j] = tensor_id_copy(src_id, split.backend_id, sched->cur_copy); } } } From 3a432abfeddf143960266ee721e14b8bfc912214 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Thu, 22 May 2025 23:07:45 +0200 Subject: [PATCH 27/65] WIP --- ggml/src/ggml-backend.cpp | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 6da82c082cbed..c1c17407bd8d3 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -1162,6 +1162,16 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg split.i_start = i; split.n_inputs = 0; } + } + split.i_end = graph->n_nodes; + // GGML_ASSERT(!split.tensor_parallel); + split.graph = ggml_graph_view(graph, split.i_start, split.i_end); + splits_no_tp.push_back(split); + } + + for (ggml_backend_sched_split & split : splits_no_tp) { + for (int i = 0; i < split.graph.n_nodes; i++) { + ggml_tensor * node = split.graph.nodes[i]; // find inputs that are not on the same backend for (int j = 0; j < GGML_MAX_SRC; j++) { @@ -1221,10 +1231,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } } } - split.i_end = graph->n_nodes; - // GGML_ASSERT(!split.tensor_parallel); - split.graph = ggml_graph_view(graph, split.i_start, split.i_end); - splits_no_tp.push_back(split); } for (size_t i_split = 0; i_split < splits_no_tp.size(); i_split++) { From 9c6550e6bc8e219493257a2a93345a3026364545 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Fri, 23 May 2025 12:39:50 +0200 Subject: [PATCH 28/65] WIP --- ggml/src/ggml-backend.cpp | 198 ++++++++++++++++++++++++++++++++------ 1 file changed, 171 insertions(+), 27 deletions(-) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index c1c17407bd8d3..5cd98fda4b0fe 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -1105,15 +1105,18 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg struct ggml_tensor * node = graph->nodes[i]; if (!ggml_is_view_op(node->op)) { split.backend_id = tensor_backend_id(node); + split.tensor_parallel = node->src[0] && ggml_backend_buft_is_split( + ggml_backend_buffer_get_type(node->src[0]->buffer)); break; } } split.i_start = 0; split.n_inputs = 0; + GGML_ASSERT(!split.tensor_parallel); for (; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; - if (ggml_is_view_op(node->op)) { + if (ggml_is_view_op(node->op) && !split.tensor_parallel) { continue; } @@ -1153,6 +1156,13 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } } + const bool src0_on_split_buffer = node->src[0] && node->src[0]->buffer && + ggml_backend_buft_is_split(ggml_backend_buffer_get_type(node->src[0]->buffer)); + GGML_ASSERT(!src0_on_split_buffer || node->op == GGML_OP_MUL_MAT); + if (src0_on_split_buffer != split.tensor_parallel) { + need_new_split = true; + } + if (node_backend_id != split.backend_id || need_new_split) { split.i_end = i; split.graph = ggml_graph_view(graph, 
split.i_start, split.i_end); @@ -1164,19 +1174,132 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } } split.i_end = graph->n_nodes; - // GGML_ASSERT(!split.tensor_parallel); + GGML_ASSERT(!split.tensor_parallel); split.graph = ggml_graph_view(graph, split.i_start, split.i_end); splits_no_tp.push_back(split); } - for (ggml_backend_sched_split & split : splits_no_tp) { +#ifndef NDEBUG + // assert that the splits are in the expected order and contain the correct nodes + // splits_no_tp should contain all graph nodes in sequential order + assert(!splits_no_tp.empty()); + assert(splits_no_tp.front().i_start == 0); + assert(splits_no_tp.front().graph.n_nodes == splits_no_tp.front().i_end - splits_no_tp.front().i_start); + for (size_t i = 1; i < splits_no_tp.size(); i++) { + const ggml_backend_sched_split & split_now = splits_no_tp[i]; + const ggml_backend_sched_split & split_prev = splits_no_tp[i - 1]; + const bool splits_sequential = split_now.i_start == split_prev.i_end; + assert(splits_sequential); + assert(split_now.graph.n_nodes == split_now.i_end - split_now.i_start); + for (int j = 0; j < split_now.graph.n_nodes; j++) { + assert(split_now.graph.nodes[j] == graph->nodes[split_now.i_start + j]); + } + } + assert(splits_no_tp.back().i_end == graph->n_nodes); +#endif // NDEBUG + + std::vector splits_tp; + const int n_gpus = sched->n_backends - 1; // FIXME + splits_tp.reserve(n_gpus * splits_no_tp.size()); + + { + for (const ggml_backend_sched_split & split_main : splits_no_tp) { + if (split_main.tensor_parallel) { + for (int i_gpu = 0; i_gpu < n_gpus; i_gpu++) { + if (i_gpu == split_main.backend_id) { + continue; + } + + ggml_backend_sched_split split = split_main; + split.backend_id = i_gpu; + split.graph = *ggml_new_graph_custom(sched->ctx, split_main.graph.size, /*grads =*/ false); + dup_graph(sched->ctx, &split_main.graph, &split.graph, /*expand =*/ false); + } + } + + // add split on same backend last so that splits on other backends don't have to wait for it + splits_tp.push_back(split_main); + } + + for (ggml_backend_sched_split & split : splits_tp) { + if (!split.tensor_parallel) { + continue; + } + + const int i_gpu = split.backend_id; + for (int n = 0; n < split.graph.n_nodes; n++) { + ggml_tensor * dst = split.graph.nodes[n]; + GGML_ASSERT(dst->op == GGML_OP_MUL_MAT); + ggml_tensor * src0 = dst->src[0]; + ggml_tensor * src1 = dst->src[1]; + + GGML_ASSERT(src0->buffer); + GGML_ASSERT(ggml_backend_buft_is_split(ggml_backend_buffer_get_type(src0->buffer))); + + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(src1)); + GGML_ASSERT(ggml_is_contiguous(dst)); + + GGML_TENSOR_BINARY_OP_LOCALS; + + GGML_ASSERT(ne01 == ne0); + GGML_ASSERT(ne01 % n_gpus == 0); + + const int64_t row_low = ne01 * i_gpu /n_gpus; + const int64_t row_high = ne01 * (i_gpu+1)/n_gpus; + const int64_t row_diff = row_high - row_low; + + src0->data = ((void **) src0->extra)[i_gpu]; + src0->ne[1] = row_diff; + src0->nb[2] = src0->ne[1]*src0->nb[1]; + src0->nb[3] = src0->ne[2]*src0->nb[2]; + + dst->ne[0] = row_diff; + dst->nb[1] = ggml_row_size(dst->type, dst->ne[0]); + dst->nb[2] = dst->ne[1]*dst->nb[1]; + dst->nb[3] = dst->ne[2]*dst->nb[2]; + } + + for (int n = 0; n < split.graph.n_nodes; n++) { + ggml_tensor * node = split.graph.nodes[n]; + tensor_id_tp(hash_id(node), split.backend_id) = node; + } + } + } + +#ifndef NDEBUG + // assert that the splits are in the expected order and that parallel splits have different tensors with same ops + 
assert(!splits_tp.empty()); + assert(splits_tp.front().i_start == 0); + assert(splits_tp.front().graph.n_nodes == splits_tp.front().i_end - splits_tp.front().i_start); + for (size_t i = 1; i < splits_tp.size(); i++) { + const ggml_backend_sched_split & split_now = splits_tp[i]; + const ggml_backend_sched_split & split_prev = splits_tp[i - 1]; + const bool splits_sequential = split_now.i_start == split_prev.i_end; + const bool splits_parallel = split_now.tensor_parallel && split_prev.tensor_parallel && + split_now.backend_id != split_prev.backend_id && + split_now.i_start == split_prev.i_start && split_now.i_end == split_prev.i_end; + assert(splits_sequential || splits_parallel); + assert(split_now.graph.n_nodes == split_now.i_end - split_now.i_start); + if (splits_parallel) { + assert(split_now.graph.n_nodes == split_prev.graph.n_nodes); + for (int j = 0; j < split_now.graph.n_nodes; j++) { + assert(split_now.graph.nodes[j] != split_prev.graph.nodes[j]); + assert(split_now.graph.nodes[j]->op == split_prev.graph.nodes[j]->op); + } + } + } + assert(splits_tp.back().i_end == graph->n_nodes); +#endif // NDEBUG + + for (ggml_backend_sched_split & split : splits_tp) { for (int i = 0; i < split.graph.n_nodes; i++) { ggml_tensor * node = split.graph.nodes[i]; // find inputs that are not on the same backend for (int j = 0; j < GGML_MAX_SRC; j++) { struct ggml_tensor * src = node->src[j]; - if (src == NULL) { + if (src == nullptr) { continue; } @@ -1208,7 +1331,46 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } } - if (src_backend_id != split.backend_id && !ggml_backend_sched_buffer_supported(sched, src, split.backend_id)) { + if (tensor_id_tp(src_id, 0) != nullptr) { + if (tensor_id_copy(src_id, split.backend_id, 0) == nullptr) { + GGML_ASSERT(n_gpus == 2); + ggml_backend_t backend = sched->backends[split.backend_id]; + + for (int c = 0; c < sched->n_copies; c++) { + ggml_tensor * a = tensor_id_tp(src_id, 0); + ggml_tensor * b = tensor_id_tp(src_id, 1); + if (split.backend_id == 0) { + ggml_tensor * b_copy = ggml_dup_tensor_layout(sched->ctx, b); + ggml_format_name(b_copy, "%s#%s part1#%d", ggml_backend_name(backend), src->name, c); + SET_CAUSE(b_copy, "5.cpy for gather"); + if (sched->n_copies > 1) { + ggml_set_input(b_copy); + ggml_set_output(b_copy); // prevent ggml-alloc from overwriting the tensor + } + b = b_copy; + } else { + GGML_ASSERT(split.backend_id == 1); + ggml_tensor * a_copy = ggml_dup_tensor_layout(sched->ctx, a); + ggml_format_name(a_copy, "%s#%s part0#%d", ggml_backend_name(backend), src->name, c); + SET_CAUSE(a_copy, "5.cpy for gather"); + if (sched->n_copies > 1) { + ggml_set_input(a_copy); + ggml_set_output(a_copy); // prevent ggml-alloc from overwriting the tensor + } + a = a_copy; + } + ggml_tensor * concat = ggml_concat(sched->ctx, a, b, /*dim =*/ 0); + + ggml_format_name(concat, "%s#%s concat#%d", ggml_backend_name(backend), src->name, c); + tensor_id_copy(src_id, split.backend_id, c) = concat; + } + const int n_inputs = split.n_inputs++; + GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS); + split.inputs[n_inputs] = src; + } + fprintf(stderr, "%s: 100 replacing src%d=%s of %s\n", __func__, j, node->src[j]->name, node->name); + node->src[j] = tensor_id_copy(src_id, split.backend_id, sched->cur_copy); + } else if (src_backend_id != split.backend_id && !ggml_backend_sched_buffer_supported(sched, src, split.backend_id)) { // create a copy of the input in the split's backend if (tensor_id_copy(src_id, split.backend_id, 0) == nullptr) { 
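// Annotation on the gather path entered here: src was produced by a
// tensor-parallel split, so each backend only holds a row slice of it
// (part0 on backend 0, part1 on backend 1; n_gpus == 2 is asserted just
// below). The consuming split duplicates the slice it is missing, has it
// copied over from the other backend, and rebuilds the full tensor with
// ggml_concat along dim 0. That concat node is then registered as the
// input copy for this (src, backend) pair, so downstream nodes read the
// gathered tensor instead of a partial slice.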
ggml_backend_t backend = sched->backends[split.backend_id]; @@ -1226,46 +1388,28 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS); split.inputs[n_inputs] = src; } - // fprintf(stderr, "%s: 200 replacing src%d=%s of %s\n", __func__, j, node->src[j]->name, node->name); + fprintf(stderr, "%s: 200 replacing src%d=%s of %s\n", __func__, j, node->src[j]->name, node->name); node->src[j] = tensor_id_copy(src_id, split.backend_id, sched->cur_copy); } } } } - for (size_t i_split = 0; i_split < splits_no_tp.size(); i_split++) { + for (size_t i_split = 0; i_split < splits_tp.size(); i_split++) { if (int(i_split) >= sched->splits_capacity) { sched->splits_capacity *= 2; sched->splits = (ggml_backend_sched_split *) realloc(sched->splits, sched->splits_capacity * sizeof(struct ggml_backend_sched_split)); GGML_ASSERT(sched->splits != NULL); } - sched->splits[i_split] = splits_no_tp[i_split]; + sched->splits[i_split] = splits_tp[i_split]; } - sched->n_splits = splits_no_tp.size(); + sched->n_splits = splits_tp.size(); if (sched->debug) { ggml_backend_sched_print_assignments(sched, graph); } -#ifndef NDEBUG - // assert that the splits are in the expected order - // subsequent splits are expected to either contain a contiguous slice of the graph - // or to contain the same nodes but executed on subsequent backends - assert(sched->n_splits >= 1); - assert(sched->splits[0].i_start == 0); - for (int i = 1; i < sched->n_splits; i++) { - const ggml_backend_sched_split & split_now = sched->splits[i]; - const ggml_backend_sched_split & split_prev = sched->splits[i - 1]; - const bool splits_sequential = split_now.i_start == split_prev.i_end; - const bool splits_parallel = split_now.tensor_parallel && split_prev.tensor_parallel && - split_now.backend_id != split_prev.backend_id && - split_now.i_start == split_prev.i_start && split_now.i_end == split_prev.i_end; - assert(splits_sequential || splits_parallel); - } - assert(sched->splits[sched->n_splits-1].i_end == graph->n_nodes); -#endif // NDEBUG - // swap node_backend_ids and leaf _backend_ids with prevs { int * tmp = sched->node_backend_ids; From 2da2cc36ef122abd507767f2d8e9c690b3a0fc6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Fri, 23 May 2025 14:56:49 +0200 Subject: [PATCH 29/65] WIP --- ggml/src/ggml-backend.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 5cd98fda4b0fe..8f9730bc52ec1 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -1456,10 +1456,10 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg graph_copy->nodes[graph_copy->n_nodes++] = input_cpy; } - for (int j = split->i_start; j < split->i_end; j++) { + for (int j = 0; j < split->graph.n_nodes; j++) { assert(graph_copy->size > graph_copy->n_nodes); - sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(graph->nodes[j]); - graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j]; + sched->node_backend_ids[graph_copy->n_nodes] = split->backend_id; + graph_copy->nodes[graph_copy->n_nodes++] = split->graph.nodes[j]; } } From 67f02bf6bbab2e5cacaaa457e56e417a336ef0de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Fri, 23 May 2025 15:18:42 +0200 Subject: [PATCH 30/65] WIP --- ggml/src/ggml-backend.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/ggml/src/ggml-backend.cpp 
b/ggml/src/ggml-backend.cpp index 8f9730bc52ec1..2507d3c44d6da 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -1232,6 +1232,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg GGML_ASSERT(dst->op == GGML_OP_MUL_MAT); ggml_tensor * src0 = dst->src[0]; ggml_tensor * src1 = dst->src[1]; + fprintf(stderr, "%s: 025 src0={%ld, %ld, %ld, %ld} src1={%ld, %ld, %ld, %ld} dst={%ld, %ld, %ld, %ld}\n", + __func__, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], + dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]); GGML_ASSERT(src0->buffer); GGML_ASSERT(ggml_backend_buft_is_split(ggml_backend_buffer_get_type(src0->buffer))); @@ -1258,6 +1261,10 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg dst->nb[1] = ggml_row_size(dst->type, dst->ne[0]); dst->nb[2] = dst->ne[1]*dst->nb[1]; dst->nb[3] = dst->ne[2]*dst->nb[2]; + + fprintf(stderr, "%s: 050 src0={%ld, %ld, %ld, %ld} src1={%ld, %ld, %ld, %ld} dst={%ld, %ld, %ld, %ld}\n", + __func__, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], + dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]); } for (int n = 0; n < split.graph.n_nodes; n++) { From 2e282d58be289924f5b2b0667966e0174522da6c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Fri, 23 May 2025 15:21:13 +0200 Subject: [PATCH 31/65] WIP --- ggml/src/ggml-backend.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 2507d3c44d6da..83ac8ec1421c9 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -1171,6 +1171,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg split.backend_id = node_backend_id; split.i_start = i; split.n_inputs = 0; + split.tensor_parallel = src0_on_split_buffer; } } split.i_end = graph->n_nodes; From 886012220ba2bedc6f9eafaa7774f0389d1b176d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Fri, 23 May 2025 16:30:44 +0200 Subject: [PATCH 32/65] WIP --- src/llama-model.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 7fd094b63f269..eb9d08eee9f5f 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -4705,6 +4705,7 @@ struct llm_build_llama : public llm_graph_context { cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); + cur = ggml_scale(ctx0, cur, 1.0f); res->t_logits = cur; ggml_build_forward_expand(gf, cur); From 3d96528f24d4cec18fefd076fb2320a676db8ff8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Fri, 23 May 2025 16:34:06 +0200 Subject: [PATCH 33/65] WIP --- ggml/src/ggml-backend.cpp | 5 +++-- src/llama-model.cpp | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 83ac8ec1421c9..8b615cb792502 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -1269,8 +1269,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } for (int n = 0; n < split.graph.n_nodes; n++) { - ggml_tensor * node = split.graph.nodes[n]; - tensor_id_tp(hash_id(node), split.backend_id) = node; + ggml_tensor * node_orig = graph->nodes[split.i_start + n]; + ggml_tensor * node_sg = split.graph.nodes[n]; + tensor_id_tp(hash_id(node_orig), split.backend_id) = node_sg; } } } diff --git a/src/llama-model.cpp 
b/src/llama-model.cpp index eb9d08eee9f5f..5c311e8e13b70 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -4705,7 +4705,7 @@ struct llm_build_llama : public llm_graph_context { cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - cur = ggml_scale(ctx0, cur, 1.0f); + cur = ggml_scale(ctx0, cur, 1.0f); // FIXME res->t_logits = cur; ggml_build_forward_expand(gf, cur); From 2d2ef89511c3aaf8ac12f57b7effd904a647f264 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Fri, 23 May 2025 16:55:20 +0200 Subject: [PATCH 34/65] WIP --- ggml/src/ggml-backend.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 8b615cb792502..6c400780c909f 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -1215,6 +1215,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg split.backend_id = i_gpu; split.graph = *ggml_new_graph_custom(sched->ctx, split_main.graph.size, /*grads =*/ false); dup_graph(sched->ctx, &split_main.graph, &split.graph, /*expand =*/ false); + splits_tp.push_back(split); } } From 6b836c829b07b889afbd8b0c82846bb8919974c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Fri, 23 May 2025 17:13:55 +0200 Subject: [PATCH 35/65] WIP --- ggml/src/ggml-backend.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 6c400780c909f..6fd0b4103834e 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -1273,6 +1273,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg ggml_tensor * node_orig = graph->nodes[split.i_start + n]; ggml_tensor * node_sg = split.graph.nodes[n]; tensor_id_tp(hash_id(node_orig), split.backend_id) = node_sg; + tensor_backend_id(node_sg) = split.backend_id; } } } From f5a515572f738bc089f8470df64cdb068dfee740 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Fri, 23 May 2025 18:32:33 +0200 Subject: [PATCH 36/65] WIP --- ggml/src/ggml-backend.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 6fd0b4103834e..b70ff0c46d0d8 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -1250,11 +1250,16 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg GGML_ASSERT(ne01 == ne0); GGML_ASSERT(ne01 % n_gpus == 0); + GGML_ASSERT(src0->op == GGML_OP_NONE); + src0 = ggml_dup_tensor_layout(sched->ctx, src0); + src0->data = ((void **) dst->src[0]->extra)[i_gpu]; // FIXME + dst->src[0] = src0; + tensor_backend_id(src0) = split.backend_id; + const int64_t row_low = ne01 * i_gpu /n_gpus; const int64_t row_high = ne01 * (i_gpu+1)/n_gpus; const int64_t row_diff = row_high - row_low; - src0->data = ((void **) src0->extra)[i_gpu]; src0->ne[1] = row_diff; src0->nb[2] = src0->ne[1]*src0->nb[1]; src0->nb[3] = src0->ne[2]*src0->nb[2]; From cc91ca153d1f7b2c6044cc47cd2a99033c7ed8fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Fri, 23 May 2025 21:15:55 +0200 Subject: [PATCH 37/65] WIP --- ggml/src/ggml-backend.cpp | 4 ++-- ggml/src/ggml-impl.h | 23 +++++++++++++---------- ggml/src/ggml-opt.cpp | 2 +- 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index b70ff0c46d0d8..d1d1c9ac3aca3 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ 
-1214,7 +1214,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg ggml_backend_sched_split split = split_main; split.backend_id = i_gpu; split.graph = *ggml_new_graph_custom(sched->ctx, split_main.graph.size, /*grads =*/ false); - dup_graph(sched->ctx, &split_main.graph, &split.graph, /*expand =*/ false); + dup_graph(sched->ctx, &split_main.graph, &split.graph, /*deep =*/ false); splits_tp.push_back(split); } } @@ -1228,7 +1228,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg continue; } - const int i_gpu = split.backend_id; + const int i_gpu = split.backend_id; for (int n = 0; n < split.graph.n_nodes; n++) { ggml_tensor * dst = split.graph.nodes[n]; GGML_ASSERT(dst->op == GGML_OP_MUL_MAT); diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h index 7b2f64acc7a78..5f041252b251a 100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -600,7 +600,7 @@ GGML_API size_t gguf_type_size(enum gguf_type type); GGML_API struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params); GGML_API void gguf_write_to_buf(const struct gguf_context * ctx, std::vector & buf, bool only_meta); -static ggml_tensor * map_tensor(std::map & tensor_map, ggml_context * ctx, ggml_tensor * tensor) { +static ggml_tensor * map_tensor(std::map & tensor_map, ggml_context * ctx, ggml_tensor * tensor, bool deep) { if (!tensor) { return nullptr; } @@ -623,30 +623,33 @@ static ggml_tensor * map_tensor(std::map & tensor_ new_tensor->buffer = tensor->buffer; new_tensor->extra = tensor->extra; new_tensor->view_offs = tensor->view_offs; - new_tensor->view_src = map_tensor(tensor_map, ctx, tensor->view_src); - for (int i = 0; i < GGML_MAX_SRC; i++) { - new_tensor->src[i] = map_tensor(tensor_map, ctx, tensor->src[i]); + + if (deep) { + new_tensor->view_src = map_tensor(tensor_map, ctx, tensor->view_src, deep); + for (int i = 0; i < GGML_MAX_SRC; i++) { + new_tensor->src[i] = map_tensor(tensor_map, ctx, tensor->src[i], deep); + } } return new_tensor; } -static void dup_graph(ggml_context * ctx, const ggml_cgraph * src, ggml_cgraph * dst, bool expand) { +static void dup_graph(ggml_context * ctx, const ggml_cgraph * src, ggml_cgraph * dst, bool deep) { std::map tensor_map; - if (expand) { + if (deep) { for (int i = 0; i < src->n_leafs; i++) { - ggml_build_forward_expand(dst, map_tensor(tensor_map, ctx, src->leafs[i])); + ggml_build_forward_expand(dst, map_tensor(tensor_map, ctx, src->leafs[i], deep)); } for (int i = 0; i < src->n_nodes; i++) { - ggml_build_forward_expand(dst, map_tensor(tensor_map, ctx, src->nodes[i])); + ggml_build_forward_expand(dst, map_tensor(tensor_map, ctx, src->nodes[i], deep)); } } else { for (int i = 0; i < src->n_leafs; i++) { - dst->leafs[dst->n_leafs++] = map_tensor(tensor_map, ctx, src->leafs[i]); + dst->leafs[dst->n_leafs++] = map_tensor(tensor_map, ctx, src->leafs[i], deep); } for (int i = 0; i < src->n_nodes; i++) { - dst->nodes[dst->n_nodes++] = map_tensor(tensor_map, ctx, src->nodes[i]); + dst->nodes[dst->n_nodes++] = map_tensor(tensor_map, ctx, src->nodes[i], deep); } } GGML_ASSERT(dst->n_leafs == src->n_leafs); diff --git a/ggml/src/ggml-opt.cpp b/ggml/src/ggml-opt.cpp index 1fa35ec271ee3..fc0324acd5cd7 100644 --- a/ggml/src/ggml-opt.cpp +++ b/ggml/src/ggml-opt.cpp @@ -678,7 +678,7 @@ void ggml_opt_alloc(ggml_opt_context_t opt_ctx, bool backward) { opt_ctx->ctx_copy = ggml_init(params); opt_ctx->allocated_graph_copy = ggml_new_graph_custom(opt_ctx->ctx_copy, graph->size, /*grads =*/ true); 
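// Note on the expand -> deep rename in this hunk: map_tensor now takes a
// `deep` flag. With deep == true it recursively clones view_src and every
// src tensor, so the duplicated graph is fully self-contained -- which is
// what this ggml-opt path needs. With deep == false the clone is shallow:
// the src/view_src pointers end up referencing the original graph's
// tensors (see the follow-up tweak to map_tensor in the next patch), which
// is enough for the scheduler's per-GPU tensor-parallel split copies,
// whose sources get rewritten afterwards anyway.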
- dup_graph(opt_ctx->ctx_copy, graph, opt_ctx->allocated_graph_copy, /*expand =*/ true); + dup_graph(opt_ctx->ctx_copy, graph, opt_ctx->allocated_graph_copy, /*deep =*/ true); } else { opt_ctx->allocated_graph_copy = graph; } From 6ee4d0ed9a6b4028783d93ae49da85db7db2d602 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Fri, 23 May 2025 21:23:35 +0200 Subject: [PATCH 38/65] WIP --- ggml/src/ggml-impl.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h index 5f041252b251a..e7917bd97779e 100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -629,6 +629,11 @@ static ggml_tensor * map_tensor(std::map & tensor_ for (int i = 0; i < GGML_MAX_SRC; i++) { new_tensor->src[i] = map_tensor(tensor_map, ctx, tensor->src[i], deep); } + } else { + new_tensor->view_src = tensor->view_src; + for (int i = 0; i < GGML_MAX_SRC; i++) { + new_tensor->src[i] = tensor->src[i]; + } } return new_tensor; From 935d6524a5158b6e18e8445e0f5ec0dba31f5ff9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Fri, 23 May 2025 21:30:57 +0200 Subject: [PATCH 39/65] WIP --- ggml/src/ggml-backend.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index d1d1c9ac3aca3..04e6b222cdda7 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -1253,6 +1253,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg GGML_ASSERT(src0->op == GGML_OP_NONE); src0 = ggml_dup_tensor_layout(sched->ctx, src0); src0->data = ((void **) dst->src[0]->extra)[i_gpu]; // FIXME + src0->buffer = dst->src[0]->buffer; dst->src[0] = src0; tensor_backend_id(src0) = split.backend_id; From 4dacb2f0063b0413d87bbdb05f7a2f8566cd4284 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Fri, 23 May 2025 21:39:34 +0200 Subject: [PATCH 40/65] WIP --- ggml/src/ggml-backend.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 04e6b222cdda7..9c3e00e783acc 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -1234,9 +1234,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg GGML_ASSERT(dst->op == GGML_OP_MUL_MAT); ggml_tensor * src0 = dst->src[0]; ggml_tensor * src1 = dst->src[1]; - fprintf(stderr, "%s: 025 src0={%ld, %ld, %ld, %ld} src1={%ld, %ld, %ld, %ld} dst={%ld, %ld, %ld, %ld}\n", - __func__, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], - dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]); + // fprintf(stderr, "%s: 025 src0={%ld, %ld, %ld, %ld} src1={%ld, %ld, %ld, %ld} dst={%ld, %ld, %ld, %ld}\n", + // __func__, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], + // dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]); GGML_ASSERT(src0->buffer); GGML_ASSERT(ggml_backend_buft_is_split(ggml_backend_buffer_get_type(src0->buffer))); @@ -1270,9 +1270,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg dst->nb[2] = dst->ne[1]*dst->nb[1]; dst->nb[3] = dst->ne[2]*dst->nb[2]; - fprintf(stderr, "%s: 050 src0={%ld, %ld, %ld, %ld} src1={%ld, %ld, %ld, %ld} dst={%ld, %ld, %ld, %ld}\n", - __func__, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], - dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]); + // 
fprintf(stderr, "%s: 050 src0={%ld, %ld, %ld, %ld} src1={%ld, %ld, %ld, %ld} dst={%ld, %ld, %ld, %ld}\n", + // __func__, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], + // dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]); } for (int n = 0; n < split.graph.n_nodes; n++) { @@ -1385,7 +1385,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS); split.inputs[n_inputs] = src; } - fprintf(stderr, "%s: 100 replacing src%d=%s of %s\n", __func__, j, node->src[j]->name, node->name); + // fprintf(stderr, "%s: 100 replacing src%d=%s of %s\n", __func__, j, node->src[j]->name, node->name); node->src[j] = tensor_id_copy(src_id, split.backend_id, sched->cur_copy); } else if (src_backend_id != split.backend_id && !ggml_backend_sched_buffer_supported(sched, src, split.backend_id)) { // create a copy of the input in the split's backend @@ -1405,7 +1405,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS); split.inputs[n_inputs] = src; } - fprintf(stderr, "%s: 200 replacing src%d=%s of %s\n", __func__, j, node->src[j]->name, node->name); + // fprintf(stderr, "%s: 200 replacing src%d=%s of %s\n", __func__, j, node->src[j]->name, node->name); node->src[j] = tensor_id_copy(src_id, split.backend_id, sched->cur_copy); } } From 7b7f39912b2ef2c86bc7b2d8463f58b55f4310ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Fri, 23 May 2025 22:50:05 +0200 Subject: [PATCH 41/65] WIP --- ggml/src/ggml-backend.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 9c3e00e783acc..d6c8aab539c5d 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -1423,10 +1423,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } sched->n_splits = splits_tp.size(); - if (sched->debug) { - ggml_backend_sched_print_assignments(sched, graph); - } - // swap node_backend_ids and leaf _backend_ids with prevs { int * tmp = sched->node_backend_ids; @@ -1480,6 +1476,10 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } } + if (sched->debug) { + ggml_backend_sched_print_assignments(sched, &sched->graph); + } + if (sched->n_copies > 1) { // add input copies as leafs so that they are allocated first for (int i = 0; i < sched->n_graph_inputs; i++) { From aeda7e07b9735eb25499bb3452c9dc5476d9068a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Fri, 23 May 2025 23:11:08 +0200 Subject: [PATCH 42/65] WIP --- ggml/src/ggml-backend.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index d6c8aab539c5d..c32b91ebcb03e 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -1420,6 +1420,8 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg GGML_ASSERT(sched->splits != NULL); } sched->splits[i_split] = splits_tp[i_split]; + fprintf(stderr, "%s: split%d backend_id=%d i_start=%d i_end=%d\n", + __func__, int(i_split), sched->splits[i_split].backend_id, sched->splits[i_split].i_start, sched->splits[i_split].i_end); } sched->n_splits = splits_tp.size(); From 95f1caf740820704b2bc4dd165b133c35434bb58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Fri, 23 May 2025 23:34:44 +0200 Subject: 
[PATCH 43/65] WIP --- ggml/src/ggml-backend.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index c32b91ebcb03e..b9a1f738da13d 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -1422,6 +1422,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg sched->splits[i_split] = splits_tp[i_split]; fprintf(stderr, "%s: split%d backend_id=%d i_start=%d i_end=%d\n", __func__, int(i_split), sched->splits[i_split].backend_id, sched->splits[i_split].i_start, sched->splits[i_split].i_end); + for (int n = 0; n < sched->splits[i_split].graph.n_nodes; n++) { + fprintf(stderr, "%s: - %d: %s\n", __func__, n, sched->splits[i_split].graph.nodes[n]->name); + } } sched->n_splits = splits_tp.size(); From 1f648bafd9960f290227d949420407e354aff9a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Fri, 23 May 2025 23:40:45 +0200 Subject: [PATCH 44/65] WIP --- ggml/src/ggml-backend.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index b9a1f738da13d..45c8e95192a02 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -1422,8 +1422,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg sched->splits[i_split] = splits_tp[i_split]; fprintf(stderr, "%s: split%d backend_id=%d i_start=%d i_end=%d\n", __func__, int(i_split), sched->splits[i_split].backend_id, sched->splits[i_split].i_start, sched->splits[i_split].i_end); + for (int n = 0; n < sched->splits[i_split].n_inputs; n++) { + fprintf(stderr, "%s: - input %d: %s\n", __func__, n, sched->graph_inputs[n]->name); + } for (int n = 0; n < sched->splits[i_split].graph.n_nodes; n++) { - fprintf(stderr, "%s: - %d: %s\n", __func__, n, sched->splits[i_split].graph.nodes[n]->name); + fprintf(stderr, "%s: - node %d: %s\n", __func__, n, sched->splits[i_split].graph.nodes[n]->name); } } sched->n_splits = splits_tp.size(); From e18d1ef6871de0730c0acfdff8b135a3f243ef49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Fri, 23 May 2025 23:42:36 +0200 Subject: [PATCH 45/65] WIP --- ggml/src/ggml-backend.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 45c8e95192a02..dfc1cb22bd4dc 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -1423,7 +1423,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg fprintf(stderr, "%s: split%d backend_id=%d i_start=%d i_end=%d\n", __func__, int(i_split), sched->splits[i_split].backend_id, sched->splits[i_split].i_start, sched->splits[i_split].i_end); for (int n = 0; n < sched->splits[i_split].n_inputs; n++) { - fprintf(stderr, "%s: - input %d: %s\n", __func__, n, sched->graph_inputs[n]->name); + fprintf(stderr, "%s: - input %d: %s\n", __func__, n, sched->splits[i_split].inputs[n]->name); } for (int n = 0; n < sched->splits[i_split].graph.n_nodes; n++) { fprintf(stderr, "%s: - node %d: %s\n", __func__, n, sched->splits[i_split].graph.nodes[n]->name); From 66c8eec0f011f92cfb105697f8a6a8cf66ccaa73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Fri, 23 May 2025 23:53:51 +0200 Subject: [PATCH 46/65] WIP --- ggml/src/ggml-backend.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index dfc1cb22bd4dc..086fe5c60c86c 100644 --- 
a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -1215,6 +1215,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg split.backend_id = i_gpu; split.graph = *ggml_new_graph_custom(sched->ctx, split_main.graph.size, /*grads =*/ false); dup_graph(sched->ctx, &split_main.graph, &split.graph, /*deep =*/ false); + + for (int n = 0; n < split.graph.n_nodes; n++) { + ggml_format_name(split.graph.nodes[n], "%s (parallel %d)", split.graph.nodes[n]->name, i_gpu); + } + splits_tp.push_back(split); } } From 206ab5881d87dc0a89532866cc4de62c00c1b1e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Fri, 23 May 2025 23:58:52 +0200 Subject: [PATCH 47/65] WIP --- ggml/src/ggml-backend.cpp | 3 ++- ggml/src/ggml.c | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 086fe5c60c86c..155500708c215 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -1217,7 +1217,8 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg dup_graph(sched->ctx, &split_main.graph, &split.graph, /*deep =*/ false); for (int n = 0; n < split.graph.n_nodes; n++) { - ggml_format_name(split.graph.nodes[n], "%s (parallel %d)", split.graph.nodes[n]->name, i_gpu); + std::string name = split.graph.nodes[n]->name; + ggml_format_name(split.graph.nodes[n], "%s (parallel %d)", name.c_str(), i_gpu); } splits_tp.push_back(split); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 8a6546240f46f..087d7c31da67d 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -1714,6 +1714,7 @@ struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * nam struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...) 
{ va_list args; va_start(args, fmt); + assert(tensor->name != fmt); vsnprintf(tensor->name, sizeof(tensor->name), fmt, args); va_end(args); return tensor; From ae1617c9a1106d96d2870d7814985199bfa94e52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Sat, 24 May 2025 16:18:14 +0200 Subject: [PATCH 48/65] WIP --- ggml/src/ggml-backend.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 155500708c215..4cffdc8517774 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -1116,7 +1116,8 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg for (; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; - if (ggml_is_view_op(node->op) && !split.tensor_parallel) { + if (ggml_is_view_op(node->op)) { + GGML_ASSERT(!split.tensor_parallel); continue; } From f617bbb62facbb5161e14c4eff313f55506ee98f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Sat, 24 May 2025 16:23:09 +0200 Subject: [PATCH 49/65] WIP --- src/llama-model.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 5c311e8e13b70..c01b843a73500 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -4566,6 +4566,11 @@ struct llm_build_llama : public llm_graph_context { cb(Vcur, "Vcur", il); } + // FIXME + Qcur = ggml_scale(ctx0, Qcur, 1.0f); + Kcur = ggml_scale(ctx0, Kcur, 1.0f); + Vcur = ggml_scale(ctx0, Vcur, 1.0f); + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); From 528dd51f5a10a2289ef8a1d7f23e923955506e7f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Sat, 24 May 2025 16:29:47 +0200 Subject: [PATCH 50/65] WIP --- ggml/src/ggml-backend.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 4cffdc8517774..2c6c7beffc93a 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -1427,14 +1427,14 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg GGML_ASSERT(sched->splits != NULL); } sched->splits[i_split] = splits_tp[i_split]; - fprintf(stderr, "%s: split%d backend_id=%d i_start=%d i_end=%d\n", - __func__, int(i_split), sched->splits[i_split].backend_id, sched->splits[i_split].i_start, sched->splits[i_split].i_end); - for (int n = 0; n < sched->splits[i_split].n_inputs; n++) { - fprintf(stderr, "%s: - input %d: %s\n", __func__, n, sched->splits[i_split].inputs[n]->name); - } - for (int n = 0; n < sched->splits[i_split].graph.n_nodes; n++) { - fprintf(stderr, "%s: - node %d: %s\n", __func__, n, sched->splits[i_split].graph.nodes[n]->name); - } + // fprintf(stderr, "%s: split%d backend_id=%d i_start=%d i_end=%d\n", + // __func__, int(i_split), sched->splits[i_split].backend_id, sched->splits[i_split].i_start, sched->splits[i_split].i_end); + // for (int n = 0; n < sched->splits[i_split].n_inputs; n++) { + // fprintf(stderr, "%s: - input %d: %s\n", __func__, n, sched->splits[i_split].inputs[n]->name); + // } + // for (int n = 0; n < sched->splits[i_split].graph.n_nodes; n++) { + // fprintf(stderr, "%s: - node %d: %s\n", __func__, n, sched->splits[i_split].graph.nodes[n]->name); + // } } sched->n_splits = splits_tp.size(); From 
400629323cd8f10e892fe1cc409423c632a1e660 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Sat, 24 May 2025 16:49:46 +0200 Subject: [PATCH 51/65] WIP --- ggml/src/ggml-backend.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 2c6c7beffc93a..248042a1d1dac 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -805,7 +805,7 @@ static char * fmt_size(size_t size) { static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { int cur_split = 0; for (int i = 0; i < graph->n_nodes; i++) { - if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) { + while (cur_split < sched->n_splits && i >= sched->splits[cur_split].i_start) { ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id]; GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs", cur_split, ggml_backend_name(split_backend), sched->splits[cur_split].n_inputs); From 943456b02e5284b45755d048647d806719a20fb6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Sat, 24 May 2025 17:03:11 +0200 Subject: [PATCH 52/65] WIP --- ggml/src/ggml-backend.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 248042a1d1dac..b446c514994f5 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -1222,11 +1222,13 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg ggml_format_name(split.graph.nodes[n], "%s (parallel %d)", name.c_str(), i_gpu); } + fprintf(stderr, "%s: 10 index=%d backend_id=%d\n", __func__, int(splits_tp.size()), split.backend_id); splits_tp.push_back(split); } } // add split on same backend last so that splits on other backends don't have to wait for it + fprintf(stderr, "%s: 10 index=%d backend_id=%d\n", __func__, int(splits_tp.size()), split_main.backend_id); splits_tp.push_back(split_main); } From 25c25eab065cf9dabe3baae2e7c1c41eaa80056c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Sat, 24 May 2025 17:15:30 +0200 Subject: [PATCH 53/65] WIP --- ggml/src/ggml-backend.cpp | 60 +++++++++++++++++++-------------------- 1 file changed, 29 insertions(+), 31 deletions(-) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index b446c514994f5..a4afa8a66ce19 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -802,41 +802,39 @@ static char * fmt_size(size_t size) { return buffer; } -static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { - int cur_split = 0; - for (int i = 0; i < graph->n_nodes; i++) { - while (cur_split < sched->n_splits && i >= sched->splits[cur_split].i_start) { - ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id]; - GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs", cur_split, ggml_backend_name(split_backend), - sched->splits[cur_split].n_inputs); - for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) { - if (j == 0) { - GGML_LOG_DEBUG(": "); - } - GGML_LOG_DEBUG("[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name, - fmt_size(ggml_nbytes(sched->splits[cur_split].inputs[j]))); +static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgraph * /*graph*/) { // FIXME + for (int i_split = 0; i_split < sched->n_splits; i_split++) { + const ggml_backend_sched_split & split = sched->splits[i_split]; + 
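// Annotation on the rewritten printer: assignments are now reported per
// split instead of by scanning the monolithic graph, because with tensor
// parallelism several splits can share the same [i_start, i_end) node
// range (one per GPU), so a graph index alone no longer identifies a
// unique split. Iterating the split graphs also shows the duplicated
// "(parallel N)" nodes, which exist only in the per-GPU copies.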
ggml_backend_t split_backend = sched->backends[split.backend_id]; + GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs", i_split, ggml_backend_name(split_backend), split.n_inputs); + for (int j = 0; j < split.n_inputs; j++) { + if (j == 0) { + GGML_LOG_DEBUG(": "); } - GGML_LOG_DEBUG("\n"); - cur_split++; - } - struct ggml_tensor * node = graph->nodes[i]; - if (ggml_is_view_op(node->op)) { - continue; + GGML_LOG_DEBUG("[%s (%5.5s)] ", split.inputs[j]->name, fmt_size(ggml_nbytes(split.inputs[j]))); } - if (sched->debug > 1) { - ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node); - GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name, - fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node)); - for (int j = 0; j < GGML_MAX_SRC; j++) { - struct ggml_tensor * src = node->src[j]; - if (src == NULL) { - continue; + GGML_LOG_DEBUG("\n"); + for (int i = 0; i < split.graph.n_nodes; i++) { + // const ggml_tensor * node = split.graph.nodes[i]; // FIXME + ggml_tensor * node = split.graph.nodes[i]; + if (ggml_is_view_op(node->op)) { + continue; + } + if (sched->debug > 1) { + ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node); + GGML_LOG_DEBUG("node #%3d#%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, split.i_start + i, ggml_op_name(node->op), node->name, + fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node)); + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * src = node->src[j]; + if (src == NULL) { + continue; + } + ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src); + GGML_LOG_DEBUG(" %20.20s (%5.5s) [%5.5s %8.8s]", src->name, + fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src)); } - ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src); - GGML_LOG_DEBUG(" %20.20s (%5.5s) [%5.5s %8.8s]", src->name, - fmt_size(ggml_nbytes(src)), src_backend ? 
ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src)); + GGML_LOG_DEBUG("\n"); } - GGML_LOG_DEBUG("\n"); } } } From 739d9025722c9537eb0cf724c90044fa03c73466 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Sat, 24 May 2025 17:20:18 +0200 Subject: [PATCH 54/65] WIP --- ggml/src/ggml-backend.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index a4afa8a66ce19..c135387e32177 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -1220,13 +1220,13 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg ggml_format_name(split.graph.nodes[n], "%s (parallel %d)", name.c_str(), i_gpu); } - fprintf(stderr, "%s: 10 index=%d backend_id=%d\n", __func__, int(splits_tp.size()), split.backend_id); + // fprintf(stderr, "%s: 10 index=%d backend_id=%d\n", __func__, int(splits_tp.size()), split.backend_id); splits_tp.push_back(split); } } // add split on same backend last so that splits on other backends don't have to wait for it - fprintf(stderr, "%s: 10 index=%d backend_id=%d\n", __func__, int(splits_tp.size()), split_main.backend_id); + // fprintf(stderr, "%s: 10 index=%d backend_id=%d\n", __func__, int(splits_tp.size()), split_main.backend_id); splits_tp.push_back(split_main); } From 26807a99bd99627ce2b991515256306bc8db536f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Sat, 24 May 2025 17:23:13 +0200 Subject: [PATCH 55/65] WIP --- ggml/src/ggml-backend.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index c135387e32177..af0b8e62b81d8 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -1385,7 +1385,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } ggml_tensor * concat = ggml_concat(sched->ctx, a, b, /*dim =*/ 0); - ggml_format_name(concat, "%s#%s concat#%d", ggml_backend_name(backend), src->name, c); + ggml_format_name(concat, "%s#%s cc#%d", ggml_backend_name(backend), src->name, c); tensor_id_copy(src_id, split.backend_id, c) = concat; } const int n_inputs = split.n_inputs++; From 1c9dcde49f233fa1f441a439886607c4db9b10f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Sat, 24 May 2025 17:25:06 +0200 Subject: [PATCH 56/65] WIP --- src/llama-model.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index c01b843a73500..6ef279c33e817 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -4570,6 +4570,9 @@ struct llm_build_llama : public llm_graph_context { Qcur = ggml_scale(ctx0, Qcur, 1.0f); Kcur = ggml_scale(ctx0, Kcur, 1.0f); Vcur = ggml_scale(ctx0, Vcur, 1.0f); + cb(Qcur, "QcurFIXME", il); + cb(Kcur, "KcurFIXME", il); + cb(Vcur, "VcurFIXME", il); Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); From 3c21fddc11b38c0baaaa65d9a677c9bae279acc3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Sat, 24 May 2025 17:33:41 +0200 Subject: [PATCH 57/65] WIP --- ggml/src/ggml-backend.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index af0b8e62b81d8..21e7b40b69f0c 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -1217,7 +1217,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t 
sched, struct gg for (int n = 0; n < split.graph.n_nodes; n++) { std::string name = split.graph.nodes[n]->name; - ggml_format_name(split.graph.nodes[n], "%s (parallel %d)", name.c_str(), i_gpu); + ggml_format_name(split.graph.nodes[n], "%s#tp%d", name.c_str(), i_gpu); } // fprintf(stderr, "%s: 10 index=%d backend_id=%d\n", __func__, int(splits_tp.size()), split.backend_id); @@ -1261,6 +1261,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg src0 = ggml_dup_tensor_layout(sched->ctx, src0); src0->data = ((void **) dst->src[0]->extra)[i_gpu]; // FIXME src0->buffer = dst->src[0]->buffer; + ggml_format_name(src0, "%s#tp%d", dst->src[0]->name, i_gpu); dst->src[0] = src0; tensor_backend_id(src0) = split.backend_id; From 97190037428b5e7f1bca5f50caa0f6be771135a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Sat, 24 May 2025 22:52:49 +0200 Subject: [PATCH 58/65] WIP --- ggml/src/ggml-backend.cpp | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 21e7b40b69f0c..da435040c8dbc 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -1357,13 +1357,13 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } if (tensor_id_tp(src_id, 0) != nullptr) { - if (tensor_id_copy(src_id, split.backend_id, 0) == nullptr) { + if (tensor_id_copy(src_id, split.backend_id, sched->cur_copy) == nullptr) { GGML_ASSERT(n_gpus == 2); ggml_backend_t backend = sched->backends[split.backend_id]; + ggml_tensor * a = tensor_id_tp(src_id, 0); + ggml_tensor * b = tensor_id_tp(src_id, 1); for (int c = 0; c < sched->n_copies; c++) { - ggml_tensor * a = tensor_id_tp(src_id, 0); - ggml_tensor * b = tensor_id_tp(src_id, 1); if (split.backend_id == 0) { ggml_tensor * b_copy = ggml_dup_tensor_layout(sched->ctx, b); ggml_format_name(b_copy, "%s#%s part1#%d", ggml_backend_name(backend), src->name, c); @@ -1372,6 +1372,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg ggml_set_input(b_copy); ggml_set_output(b_copy); // prevent ggml-alloc from overwriting the tensor } + tensor_id_copy(hash_id(b), split.backend_id, c) = b_copy; b = b_copy; } else { GGML_ASSERT(split.backend_id == 1); @@ -1382,16 +1383,28 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg ggml_set_input(a_copy); ggml_set_output(a_copy); // prevent ggml-alloc from overwriting the tensor } + tensor_id_copy(hash_id(a), split.backend_id, c) = a_copy; a = a_copy; } ggml_tensor * concat = ggml_concat(sched->ctx, a, b, /*dim =*/ 0); - ggml_format_name(concat, "%s#%s cc#%d", ggml_backend_name(backend), src->name, c); tensor_id_copy(src_id, split.backend_id, c) = concat; } - const int n_inputs = split.n_inputs++; - GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS); - split.inputs[n_inputs] = src; + { + const int n_inputs = split.n_inputs++; + GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS); + if (split.backend_id == 0) { + split.inputs[n_inputs] = b; + } else { + GGML_ASSERT(split.backend_id == 1); + split.inputs[n_inputs] = a; + } + } + { + const int n_inputs = split.n_inputs++; + GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS); + split.inputs[n_inputs] = src; + } } // fprintf(stderr, "%s: 100 replacing src%d=%s of %s\n", __func__, j, node->src[j]->name, node->name); node->src[j] = tensor_id_copy(src_id, split.backend_id, sched->cur_copy); @@ -1584,6 +1597,9 @@ static enum ggml_status 
ggml_backend_sched_compute_splits(ggml_backend_sched_t s ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]); struct ggml_tensor * input = split->inputs[j]; struct ggml_tensor * input_cpy = tensor_copy(input, split_backend_id, sched->cur_copy); + if (input_cpy->op != GGML_OP_NONE) { + continue; + } if (input->flags & GGML_TENSOR_FLAG_INPUT) { // inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done From 1c37a20c2e1d742631b558854075b68471364654 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Sat, 24 May 2025 23:00:04 +0200 Subject: [PATCH 59/65] WIP --- ggml/src/ggml-backend.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index da435040c8dbc..f845d84b0ba17 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -1484,8 +1484,10 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg assert(graph_copy->size > (graph_copy->n_nodes + 1)); struct ggml_tensor * input = split->inputs[j]; + assert(input); const size_t input_id = hash_id(input); struct ggml_tensor * input_cpy = tensor_id_copy(input_id, split->backend_id, sched->cur_copy); + assert(input_cpy); // add a dependency to the input source so that it is not freed before the copy is done struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input); From f6dd08e10a4d87a721cba1abadb981bb2be682cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Sat, 24 May 2025 23:04:07 +0200 Subject: [PATCH 60/65] WIP --- ggml/src/ggml-backend.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index f845d84b0ba17..9626cb7200dde 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -1361,9 +1361,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg GGML_ASSERT(n_gpus == 2); ggml_backend_t backend = sched->backends[split.backend_id]; - ggml_tensor * a = tensor_id_tp(src_id, 0); - ggml_tensor * b = tensor_id_tp(src_id, 1); for (int c = 0; c < sched->n_copies; c++) { + ggml_tensor * a = tensor_id_tp(src_id, 0); + ggml_tensor * b = tensor_id_tp(src_id, 1); if (split.backend_id == 0) { ggml_tensor * b_copy = ggml_dup_tensor_layout(sched->ctx, b); ggml_format_name(b_copy, "%s#%s part1#%d", ggml_backend_name(backend), src->name, c); @@ -1394,9 +1394,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg const int n_inputs = split.n_inputs++; GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS); if (split.backend_id == 0) { + ggml_tensor * b = tensor_id_tp(src_id, 1); split.inputs[n_inputs] = b; } else { GGML_ASSERT(split.backend_id == 1); + ggml_tensor * a = tensor_id_tp(src_id, 0); split.inputs[n_inputs] = a; } } From 02e4af1ad8ed2f41e41ffdace7c11cba80ef926c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Sat, 24 May 2025 23:15:44 +0200 Subject: [PATCH 61/65] WIP --- ggml/src/ggml-backend.cpp | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 9626cb7200dde..305060119f91e 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -1633,6 +1633,23 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s } } } + { + ggml_cgraph graph_inputs = { + /*.size =*/ 0, + /*.n_nodes =*/ split->n_inputs, 
+                /*.n_leafs =*/ 0,
+                /*.nodes =*/ split->inputs,
+                /*.grads =*/ NULL, // gradients would need visited_hash_set
+                /*.grad_accs =*/ NULL,
+                /*.leafs =*/ NULL,
+                /*.visited_hash_set =*/ { 0, NULL, NULL },
+                /*.order =*/ split->graph.order,
+            };
+            enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &graph_inputs);
+            if (ec != GGML_STATUS_SUCCESS) {
+                return ec;
+            }
+        }
 
         if (!sched->callback_eval) {
             enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph);

From 07ca4b86f8dde1a740adf0c883407be72feccad5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?=
Date: Sat, 24 May 2025 23:18:45 +0200
Subject: [PATCH 62/65] WIP

---
 ggml/src/ggml-backend.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 305060119f91e..3da139d5a8181 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -1596,12 +1596,14 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
         const int split_backend_id = split->backend_id;
         ggml_backend_t split_backend = sched->backends[split_backend_id];
 
+        bool execute_inputs = false;
         // copy the input tensors to the split backend
         for (int j = 0; j < split->n_inputs; j++) {
             ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]);
             struct ggml_tensor * input = split->inputs[j];
             struct ggml_tensor * input_cpy = tensor_copy(input, split_backend_id, sched->cur_copy);
             if (input_cpy->op != GGML_OP_NONE) {
+                execute_inputs = true;
                 continue;
             }
 
@@ -1633,7 +1635,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
                 }
             }
         }
-        {
+        if (execute_inputs) {
             ggml_cgraph graph_inputs = {
                 /*.size =*/ 0,
                 /*.n_nodes =*/ split->n_inputs,

From c0358bdb424e6b1499be603b7832eccfe131ce13 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?=
Date: Sat, 24 May 2025 23:24:04 +0200
Subject: [PATCH 63/65] WIP

---
 ggml/src/ggml-cuda/ggml-cuda.cu | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 45d41f5645355..c80d49f9d24d4 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2656,7 +2656,8 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
         }
 
 #ifndef NDEBUG
-        assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
+        assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) ||
+               ggml_backend_buft_is_cuda_split(node->buffer->buft));
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             if (node->src[j] != nullptr) {
                 assert(node->src[j]->buffer);

From ea3cab5ffbb864753fb8e2a40b6457f7df0de415 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?=
Date: Sat, 24 May 2025 23:27:59 +0200
Subject: [PATCH 64/65] WIP

---
 ggml/src/ggml-backend.cpp       | 10 +++++-----
 ggml/src/ggml-cuda/ggml-cuda.cu |  3 +--
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 3da139d5a8181..614749acd9206 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -1596,14 +1596,14 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
         const int split_backend_id = split->backend_id;
         ggml_backend_t split_backend = sched->backends[split_backend_id];
 
-        bool execute_inputs = false;
+        std::vector<ggml_tensor *> active_inputs;
         // copy the input tensors to the split backend
        for (int j = 0; j < 
split->n_inputs; j++) { ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]); struct ggml_tensor * input = split->inputs[j]; struct ggml_tensor * input_cpy = tensor_copy(input, split_backend_id, sched->cur_copy); if (input_cpy->op != GGML_OP_NONE) { - execute_inputs = true; + active_inputs.push_back(input_cpy); continue; } @@ -1635,12 +1635,12 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s } } } - if (execute_inputs) { + if (!active_inputs.empty()) { ggml_cgraph graph_inputs = { /*.size =*/ 0, - /*.n_nodes =*/ split->n_inputs, + /*.n_nodes =*/ int(active_inputs.size()), /*.n_leafs =*/ 0, - /*.nodes =*/ split->inputs, + /*.nodes =*/ active_inputs.data(), /*.grads =*/ NULL, // gradients would need visited_hash_set /*.grad_accs =*/ NULL, /*.leafs =*/ NULL, diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index c80d49f9d24d4..45d41f5645355 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -2656,8 +2656,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx } #ifndef NDEBUG - assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || - ggml_backend_buft_is_cuda_split(node->buffer->buft)); + assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device)); for (int j = 0; j < GGML_MAX_SRC; j++) { if (node->src[j] != nullptr) { assert(node->src[j]->buffer); From 027d97e3d1aa93ec84b63c05444cad184186065e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Sun, 25 May 2025 11:18:24 +0200 Subject: [PATCH 65/65] WIP --- src/llama-context.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index a3b84a6a82e74..b0de3a73d49fa 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -221,7 +221,7 @@ llama_context::llama_context( bool pipeline_parallel = model.n_devices() > 1 && model.params.n_gpu_layers > (int) model.hparams.n_layer && - model.params.split_mode == LLAMA_SPLIT_MODE_LAYER && + (model.params.split_mode == LLAMA_SPLIT_MODE_LAYER || model.params.split_mode == LLAMA_SPLIT_MODE_ROW) && cparams.offload_kqv && !model.has_tensor_overrides();
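
The `while` loop that PATCH 51/65 introduces in ggml_backend_sched_print_assignments matters because, after the tensor-parallel pass, several consecutive splits can begin at the same node index; a plain `if` would print at most one split header per node, and the split counter would then lag behind for the remainder of the graph. A minimal standalone sketch of that traversal, using a stand-in `split_t` struct and hypothetical hard-coded data rather than the real scheduler state:

// sketch only: split_t approximates the i_start/backend fields of
// ggml_backend_sched_split that the printing loop actually reads
#include <cstdio>
#include <vector>

struct split_t {
    int          i_start; // index of the first graph node in this split
    const char * backend; // stand-in for ggml_backend_name(...)
};

int main() {
    // hypothetical data: the second and third split both start at node 3,
    // as can happen once parallel splits are interleaved
    const std::vector<split_t> splits = { {0, "CUDA0"}, {3, "CUDA0"}, {3, "CUDA1"} };
    const int n_nodes = 6;

    size_t cur_split = 0;
    for (int i = 0; i < n_nodes; i++) {
        // `while` drains every split header that is due at this node;
        // an `if` here would print only the first and desynchronize cur_split
        while (cur_split < splits.size() && i >= splits[cur_split].i_start) {
            std::printf("## SPLIT #%zu: %s\n", cur_split, splits[cur_split].backend);
            cur_split++;
        }
        std::printf("node #%d\n", i);
    }
    return 0;
}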
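The input handling that PATCH 58/65 through PATCH 64/65 converge on distinguishes two kinds of split inputs: copies whose op is GGML_OP_NONE are moved with an ordinary asynchronous copy, while copies that are materialized ops (such as the concat of tensor-parallel partial results) are collected into `active_inputs` and computed as a temporary graph before the split's own graph runs. A minimal standalone sketch of that control flow, with stand-in `tensor`/`graph` types and stub `copy_async`/`compute_async` functions in place of the real ggml API:

// sketch only: stand-ins for ggml_tensor / ggml_cgraph; op_t::none marks a
// tensor that is raw data rather than the result of an operation
#include <cstdio>
#include <string>
#include <vector>

enum class op_t { none, concat };

struct tensor {
    std::string name;
    op_t        op = op_t::none;
};

struct graph {
    std::vector<tensor *> nodes;
};

// stand-ins for ggml_backend_tensor_copy_async / ggml_backend_graph_compute_async
static void copy_async(const tensor * t) { std::printf("copy %s\n", t->name.c_str()); }
static bool compute_async(const graph & g) {
    for (const tensor * t : g.nodes) { std::printf("eval %s\n", t->name.c_str()); }
    return true;
}

// analogous to the per-split input loop in ggml_backend_sched_compute_splits
static bool process_split_inputs(const std::vector<tensor *> & input_copies) {
    std::vector<tensor *> active_inputs; // input copies that are ops, not raw data
    for (tensor * cpy : input_copies) {
        if (cpy->op != op_t::none) {
            active_inputs.push_back(cpy); // deferred: computed as a graph below
            continue;
        }
        copy_async(cpy); // plain data: asynchronous copy, as before
    }
    if (!active_inputs.empty()) {
        // build a throwaway graph over just the op inputs and run it before
        // the split's own graph (the real code fills in every ggml_cgraph field)
        const graph graph_inputs = { active_inputs };
        return compute_async(graph_inputs);
    }
    return true;
}

int main() {
    tensor plain = { "inp_tokens",         op_t::none   }; // ordinary input copy
    tensor fused = { "CUDA0#ffn_out cc#0", op_t::concat }; // concat of partial results
    return process_split_inputs({ &plain, &fused }) ? 0 : 1;
}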