Changes from all commits (65 commits, all by JohannesGaessler)
0c90859  WIP      May 17, 2025
838f577  WIP      May 17, 2025
aedc3f7  WIP      May 17, 2025
99bb015  try fix  May 17, 2025
06d7a88  WIP      May 17, 2025
0a69555  WIP      May 17, 2025
7563db8  WIP      May 17, 2025
363e237  WIP      May 19, 2025
47e6d24  fix      May 20, 2025
751e488  WIP      May 20, 2025
3f8f323  WIP      May 20, 2025
316ef4e  WIP      May 20, 2025
47b228f  try fix  May 20, 2025
cf4d0b6  try fix  May 20, 2025
bb48a90  try fix  May 20, 2025
016405b  WIP      May 21, 2025
7c17ff1  WIP      May 21, 2025
deda9c2  WIP      May 22, 2025
3c1291f  WIP      May 22, 2025
16d29fe  WIP      May 22, 2025
7468e9d  WIP      May 22, 2025
119657a  WIP      May 22, 2025
50d2c5e  WIP      May 22, 2025
fe2747e  try fix  May 22, 2025
6ddf206  try fix  May 22, 2025
996d263  WIP      May 22, 2025
3a432ab  WIP      May 22, 2025
9c6550e  WIP      May 23, 2025
2da2cc3  WIP      May 23, 2025
67f02bf  WIP      May 23, 2025
2e282d5  WIP      May 23, 2025
8860122  WIP      May 23, 2025
3d96528  WIP      May 23, 2025
2d2ef89  WIP      May 23, 2025
6b836c8  WIP      May 23, 2025
f5a5155  WIP      May 23, 2025
cc91ca1  WIP      May 23, 2025
6ee4d0e  WIP      May 23, 2025
935d652  WIP      May 23, 2025
4dacb2f  WIP      May 23, 2025
7b7f399  WIP      May 23, 2025
aeda7e0  WIP      May 23, 2025
95f1caf  WIP      May 23, 2025
1f648ba  WIP      May 23, 2025
e18d1ef  WIP      May 23, 2025
66c8eec  WIP      May 23, 2025
206ab58  WIP      May 23, 2025
ae1617c  WIP      May 24, 2025
f617bbb  WIP      May 24, 2025
528dd51  WIP      May 24, 2025
4006293  WIP      May 24, 2025
943456b  WIP      May 24, 2025
25c25ea  WIP      May 24, 2025
739d902  WIP      May 24, 2025
26807a9  WIP      May 24, 2025
1c9dcde  WIP      May 24, 2025
3c21fdd  WIP      May 24, 2025
9719003  WIP      May 24, 2025
1c37a20  WIP      May 24, 2025
f6dd08e  WIP      May 24, 2025
02e4af1  WIP      May 24, 2025
07ca4b8  WIP      May 24, 2025
c0358bd  WIP      May 24, 2025
ea3cab5  WIP      May 24, 2025
027d97e  WIP      May 25, 2025
1 change: 1 addition & 0 deletions ggml/include/ggml-backend.h
@@ -41,6 +41,7 @@ extern "C" {
GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
GGML_API ggml_backend_dev_t ggml_backend_buft_get_device (ggml_backend_buffer_type_t buft);
GGML_API bool ggml_backend_buft_is_split (ggml_backend_buffer_type_t buft);

//
// Backend buffer
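The new predicate lets generic code ask whether a buffer type is split across devices. A minimal usage sketch, assuming only the public API above (tensor_is_split is a hypothetical helper, not part of this PR):

#include "ggml-backend.h"

// Hypothetical helper: report whether a tensor's data lives in a split buffer type.
static bool tensor_is_split(const struct ggml_tensor * t) {
    if (t->buffer == NULL) {
        return false; // unallocated tensors cannot be split
    }
    return ggml_backend_buft_is_split(ggml_backend_buffer_get_type(t->buffer));
}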
1 change: 1 addition & 0 deletions ggml/src/ggml-backend-impl.h
@@ -26,6 +26,7 @@ extern "C" {
size_t (*get_alloc_size)(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
// (optional) check if tensor data is in host memory and uses standard ggml tensor layout (defaults to false)
bool (*is_host) (ggml_backend_buffer_type_t buft);
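// (optional) check if the buffer type is split across multiple devices (presumably defaults to false, like is_host)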
bool (*is_split) (ggml_backend_buffer_type_t buft);
};

struct ggml_backend_buffer_type {
429 changes: 345 additions & 84 deletions ggml/src/ggml-backend.cpp

Large diffs are not rendered by default.
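Since this diff is not rendered, here is a hedged sketch of what the new public wrapper plausibly looks like, mirroring how the optional is_host hook in ggml-backend-impl.h is dispatched (an assumption, not the rendered code):

bool ggml_backend_buft_is_split(ggml_backend_buffer_type_t buft) {
    // optional hook: buffer types that do not implement it are never split
    if (buft->iface.is_split) {
        return buft->iface.is_split(buft);
    }
    return false;
}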

1 change: 1 addition & 0 deletions ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp
@@ -6422,6 +6422,7 @@ ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void) {
/* .get_max_size = */ nullptr, // defaults to SIZE_MAX
/* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes
/* .is_host = */ nullptr,
/* .is_split = */ nullptr,
},
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
/* .context = */ new ggml::cpu::aarch64::extra_buffer_type(),
11 changes: 8 additions & 3 deletions ggml/src/ggml-cuda/ggml-cuda.cu
@@ -689,13 +689,16 @@ static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_t
GGML_UNUSED(buft);
}

static bool ggml_backend_buft_is_cuda_split(ggml_backend_buffer_type_t buft);

static const ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
/* .get_name = */ ggml_backend_cuda_buffer_type_get_name,
/* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
/* .is_host = */ NULL,
/* .is_split = */ ggml_backend_buft_is_cuda_split,
};

ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
@@ -1013,6 +1016,7 @@ static const ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_inte
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ ggml_backend_cuda_split_buffer_type_get_alloc_size,
/* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
/* .is_split = */ ggml_backend_buft_is_cuda_split,
};

ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split) {
@@ -1111,6 +1115,7 @@ ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
/* .is_split = */ NULL,
},
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), 0),
/* .context = */ nullptr,
@@ -1907,7 +1912,7 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
}

static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);
const bool split = false && ggml_backend_buft_is_cuda_split(src0->buffer->buft);

// If src0 is a temporary compute buffer it may have some padding that needs to be cleared for mul_mat_vec_q or mul_mat_q.
// But if src0 is also a view of another tensor then this cannot be done safely because it may overwrite valid tensor data.
@@ -2124,7 +2129,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *

static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst) {
// why is this here instead of mul_mat?
if (dst->src[0] != nullptr && ggml_backend_buft_is_cuda_split(dst->src[0]->buffer->buft)) {
if (false && dst->src[0] != nullptr && ggml_backend_buft_is_cuda_split(dst->src[0]->buffer->buft)) {
ggml_cuda_set_peer_access(dst->src[1]->ne[1], ctx.device);
}

@@ -2992,7 +2997,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
struct ggml_tensor * b = op->src[1];
// for small weight matrices the active device can end up without any rows, don't use row split in those cases
// this avoids some edge cases (and the performance would not be good anyways)
if (a->buffer && ggml_backend_buft_is_cuda_split(a->buffer->buft)) {
if (false && a->buffer && ggml_backend_buft_is_cuda_split(a->buffer->buft)) {
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) a->buffer->buft->context;
int64_t row_low;
int64_t row_high;
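The repeated false && guards above act as temporary kill-switches: short-circuit evaluation keeps the right-hand call from running, disabling the CUDA-internal split paths while split handling moves into the generic backend layer. A hedged sketch of how such a switch could be centralized (disable_cuda_split and use_cuda_split are hypothetical, not code from this PR):

// Flip to re-enable the CUDA-internal split paths.
static const bool disable_cuda_split = true;

static bool use_cuda_split(const struct ggml_tensor * src0) {
    return !disable_cuda_split
        && src0->buffer != NULL
        && ggml_backend_buft_is_cuda_split(src0->buffer->buft);
}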
78 changes: 78 additions & 0 deletions ggml/src/ggml-impl.h
@@ -593,9 +593,87 @@ static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {

#ifdef __cplusplus
#include <vector>
#include <map>

// expose GGUF internals for test code
GGML_API size_t gguf_type_size(enum gguf_type type);
GGML_API struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params);
GGML_API void gguf_write_to_buf(const struct gguf_context * ctx, std::vector<int8_t> & buf, bool only_meta);

static ggml_tensor * map_tensor(std::map<ggml_tensor *, ggml_tensor *> & tensor_map, ggml_context * ctx, ggml_tensor * tensor, bool deep) {
if (!tensor) {
return nullptr;
}

if (tensor_map.find(tensor) != tensor_map.end()) {
return tensor_map[tensor];
}

ggml_tensor * new_tensor = ggml_dup_tensor(ctx, tensor);
tensor_map[tensor] = new_tensor;

new_tensor->op = tensor->op;
for (int i = 0; i < GGML_MAX_DIMS; i++) {
new_tensor->nb[i] = tensor->nb[i];
}
new_tensor->flags = tensor->flags;
memcpy(new_tensor->op_params, tensor->op_params, sizeof(tensor->op_params));
strcpy(new_tensor->name, tensor->name);
new_tensor->data = tensor->data;
new_tensor->buffer = tensor->buffer;
new_tensor->extra = tensor->extra;
new_tensor->view_offs = tensor->view_offs;

if (deep) {
new_tensor->view_src = map_tensor(tensor_map, ctx, tensor->view_src, deep);
for (int i = 0; i < GGML_MAX_SRC; i++) {
new_tensor->src[i] = map_tensor(tensor_map, ctx, tensor->src[i], deep);
}
} else {
new_tensor->view_src = tensor->view_src;
for (int i = 0; i < GGML_MAX_SRC; i++) {
new_tensor->src[i] = tensor->src[i];
}
}

return new_tensor;
}

static void dup_graph(ggml_context * ctx, const ggml_cgraph * src, ggml_cgraph * dst, bool deep) {
std::map<ggml_tensor *, ggml_tensor *> tensor_map;

if (deep) {
for (int i = 0; i < src->n_leafs; i++) {
ggml_build_forward_expand(dst, map_tensor(tensor_map, ctx, src->leafs[i], deep));
}
for (int i = 0; i < src->n_nodes; i++) {
ggml_build_forward_expand(dst, map_tensor(tensor_map, ctx, src->nodes[i], deep));
}
} else {
for (int i = 0; i < src->n_leafs; i++) {
dst->leafs[dst->n_leafs++] = map_tensor(tensor_map, ctx, src->leafs[i], deep);
}
for (int i = 0; i < src->n_nodes; i++) {
dst->nodes[dst->n_nodes++] = map_tensor(tensor_map, ctx, src->nodes[i], deep);
}
}
GGML_ASSERT(dst->n_leafs == src->n_leafs);
GGML_ASSERT(dst->n_nodes == src->n_nodes);

if (src->grads) {
GGML_ASSERT(dst->grads);
for (int i = 0; i < src->n_nodes; ++i) {
const size_t igrad_src = ggml_hash_find(&src->visited_hash_set, src->nodes[i]);
const size_t igrad_dst = ggml_hash_find(&dst->visited_hash_set, dst->nodes[i]);

GGML_ASSERT(igrad_src != GGML_HASHSET_FULL);
GGML_ASSERT(ggml_bitset_get(src->visited_hash_set.used, igrad_src));
GGML_ASSERT(igrad_dst != GGML_HASHSET_FULL);
GGML_ASSERT(ggml_bitset_get(dst->visited_hash_set.used, igrad_dst));

dst->grads[igrad_dst] = src->grads[igrad_src];
dst->grad_accs[igrad_dst] = src->grad_accs[igrad_src];
}
}
}
#endif // __cplusplus
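A short usage sketch for the relocated helpers, following the call pattern introduced in ggml-opt.cpp below (ctx_copy and graph stand for an initialized ggml_context and the source ggml_cgraph): allocate a grads-capable destination graph first, then copy into it.

// Deep-copy the graph structure; tensor data pointers still alias the originals.
ggml_cgraph * copy = ggml_new_graph_custom(ctx_copy, graph->size, /*grads =*/ true);
dup_graph(ctx_copy, graph, copy, /*deep =*/ true);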
64 changes: 2 additions & 62 deletions ggml/src/ggml-opt.cpp
@@ -9,7 +9,6 @@
#include <cmath>
#include <cstdint>
#include <cinttypes>
#include <map>
#include <random>
#include <vector>

@@ -252,66 +251,6 @@ struct ggml_opt_params ggml_opt_default_params(
};
}

static ggml_tensor * map_tensor(std::map<ggml_tensor *, ggml_tensor *> & tensor_map, ggml_context * ctx, ggml_tensor * tensor) {
if (!tensor) {
return nullptr;
}

if (tensor_map.find(tensor) != tensor_map.end()) {
return tensor_map[tensor];
}

ggml_tensor * new_tensor = ggml_dup_tensor(ctx, tensor);
tensor_map[tensor] = new_tensor;

new_tensor->op = tensor->op;
for (int i = 0; i < GGML_MAX_DIMS; i++) {
new_tensor->nb[i] = tensor->nb[i];
}
new_tensor->flags = tensor->flags;
memcpy(new_tensor->op_params, tensor->op_params, sizeof(tensor->op_params));
strcpy(new_tensor->name, tensor->name);
new_tensor->data = tensor->data;
new_tensor->buffer = tensor->buffer;
new_tensor->extra = tensor->extra;
new_tensor->view_offs = tensor->view_offs;
new_tensor->view_src = map_tensor(tensor_map, ctx, tensor->view_src);
for (int i = 0; i < GGML_MAX_SRC; i++) {
new_tensor->src[i] = map_tensor(tensor_map, ctx, tensor->src[i]);
}

return new_tensor;
}

static ggml_cgraph * dup_graph(ggml_context * ctx, ggml_cgraph * src) {
std::map<ggml_tensor *, ggml_tensor *> tensor_map;

ggml_cgraph * dst = ggml_new_graph_custom(ctx, src->size, /*grads =*/ true);

for (int i = 0; i < src->n_leafs; i++) {
ggml_build_forward_expand(dst, map_tensor(tensor_map, ctx, src->leafs[i]));
}
GGML_ASSERT(dst->n_leafs == src->n_leafs);
for (int i = 0; i < src->n_nodes; i++) {
ggml_build_forward_expand(dst, map_tensor(tensor_map, ctx, src->nodes[i]));
}
GGML_ASSERT(dst->n_nodes == src->n_nodes);
for (int i = 0; i < src->n_nodes; ++i) {
const size_t igrad_src = ggml_hash_find(&src->visited_hash_set, src->nodes[i]);
const size_t igrad_dst = ggml_hash_find(&dst->visited_hash_set, dst->nodes[i]);

GGML_ASSERT(igrad_src != GGML_HASHSET_FULL);
GGML_ASSERT(ggml_bitset_get(src->visited_hash_set.used, igrad_src));
GGML_ASSERT(igrad_dst != GGML_HASHSET_FULL);
GGML_ASSERT(ggml_bitset_get(dst->visited_hash_set.used, igrad_dst));

dst->grads[igrad_dst] = src->grads[igrad_src];
dst->grad_accs[igrad_dst] = src->grad_accs[igrad_src];
}

return dst;
}

static void ggml_opt_build(ggml_opt_context_t opt_ctx) {
GGML_ASSERT(opt_ctx->ctx_compute && "no compute context set, either use static graphs or set one with ggml_opt_prepare_alloc");
GGML_ASSERT((!opt_ctx->static_graphs || opt_ctx->inputs->data) && "when using static graphs the inputs must be allocated statically");
@@ -738,7 +677,8 @@ void ggml_opt_alloc(ggml_opt_context_t opt_ctx, bool backward) {
ggml_free(opt_ctx->ctx_copy);
opt_ctx->ctx_copy = ggml_init(params);

opt_ctx->allocated_graph_copy = dup_graph(opt_ctx->ctx_copy, graph);
opt_ctx->allocated_graph_copy = ggml_new_graph_custom(opt_ctx->ctx_copy, graph->size, /*grads =*/ true);
dup_graph(opt_ctx->ctx_copy, graph, opt_ctx->allocated_graph_copy, /*deep =*/ true);
} else {
opt_ctx->allocated_graph_copy = graph;
}
1 change: 1 addition & 0 deletions ggml/src/ggml.c
@@ -1714,6 +1714,7 @@ struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * nam
struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...) {
va_list args;
va_start(args, fmt);
assert(tensor->name != fmt);
vsnprintf(tensor->name, sizeof(tensor->name), fmt, args);
va_end(args);
return tensor;
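The new assert guards against the aliasing case where the format string is the tensor's own name buffer: vsnprintf would then read from the buffer it is overwriting, which is undefined behavior. For illustration (t, il, and suffix are hypothetical):

ggml_format_name(t, "blk.%d.%s", il, suffix); // fine: fmt does not alias t->name
// ggml_format_name(t, t->name);              // would now trip the assert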
2 changes: 1 addition & 1 deletion src/llama-context.cpp
@@ -221,7 +221,7 @@ llama_context::llama_context(
bool pipeline_parallel =
model.n_devices() > 1 &&
model.params.n_gpu_layers > (int) model.hparams.n_layer &&
model.params.split_mode == LLAMA_SPLIT_MODE_LAYER &&
(model.params.split_mode == LLAMA_SPLIT_MODE_LAYER || model.params.split_mode == LLAMA_SPLIT_MODE_ROW) &&
cparams.offload_kqv &&
!model.has_tensor_overrides();

Expand Down
9 changes: 9 additions & 0 deletions src/llama-model.cpp
@@ -4566,6 +4566,14 @@ struct llm_build_llama : public llm_graph_context {
cb(Vcur, "Vcur", il);
}

// FIXME
Qcur = ggml_scale(ctx0, Qcur, 1.0f);
Kcur = ggml_scale(ctx0, Kcur, 1.0f);
Vcur = ggml_scale(ctx0, Vcur, 1.0f);
cb(Qcur, "QcurFIXME", il);
cb(Kcur, "KcurFIXME", il);
cb(Vcur, "VcurFIXME", il);

Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
@@ -4705,6 +4713,7 @@ struct llm_build_llama : public llm_graph_context {
cur = build_lora_mm(model.output, cur);

cb(cur, "result_output", -1);
cur = ggml_scale(ctx0, cur, 1.0f); // FIXME
res->t_logits = cur;

ggml_build_forward_expand(gf, cur);
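A hedged reading of the FIXME hooks above: ggml_scale(ctx0, t, 1.0f) is numerically an identity, but it materializes a separate graph node, so the cb(...) callback can attach a distinct debug name ("QcurFIXME") to a tensor that would otherwise only be reachable through its producer; the identity scale before res->t_logits appears to serve the same purpose for the output. As a standalone sketch:

// Identity probe (numerical no-op) that creates a fresh node for naming/inspection.
struct ggml_tensor * probe = ggml_scale(ctx0, Qcur, 1.0f);
cb(probe, "Qcur_probe", il); // hypothetical name; the PR uses "QcurFIXME"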