diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h index 778927f68217a..77d8cf364ed11 100644 --- a/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h @@ -41,6 +41,7 @@ extern "C" { GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft); GGML_API ggml_backend_dev_t ggml_backend_buft_get_device (ggml_backend_buffer_type_t buft); + GGML_API bool ggml_backend_buft_is_split (ggml_backend_buffer_type_t buft); // // Backend buffer diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h index c36c12d6579ac..db8b4ba64e86b 100644 --- a/ggml/src/ggml-backend-impl.h +++ b/ggml/src/ggml-backend-impl.h @@ -26,6 +26,7 @@ extern "C" { size_t (*get_alloc_size)(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // (optional) check if tensor data is in host memory and uses standard ggml tensor layout (defaults to false) bool (*is_host) (ggml_backend_buffer_type_t buft); + bool (*is_split) (ggml_backend_buffer_type_t buft); }; struct ggml_backend_buffer_type { diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index b30b4cb386f9f..614749acd9206 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -77,6 +77,10 @@ ggml_backend_dev_t ggml_backend_buft_get_device(ggml_backend_buffer_type_t buft) return buft->device; } +bool ggml_backend_buft_is_split(ggml_backend_buffer_type_t buft) { + return buft->iface.is_split && buft->iface.is_split(buft); +} + // backend buffer ggml_backend_buffer_t ggml_backend_buffer_init( @@ -620,10 +624,12 @@ static bool ggml_is_view_op(enum ggml_op op) { #define GGML_SCHED_MAX_COPIES 4 #endif +// slice of a graph for nodes [i_start, i_end) running on backend sched->backends[backend_id] struct ggml_backend_sched_split { int backend_id; int i_start; int i_end; + bool tensor_parallel; // whether the nodes in this split are shared with splits for other backends struct ggml_tensor * inputs[GGML_SCHED_MAX_SPLIT_INPUTS]; int n_inputs; // graph view of this split @@ -644,6 +650,7 @@ struct ggml_backend_sched { struct ggml_hash_set hash_set; int * hv_tensor_backend_ids; // [hash_set.size] struct ggml_tensor ** hv_tensor_copies; // [hash_set.size][n_backends][n_copies] + struct ggml_tensor ** hv_tensor_parallel; // [hash_set.size][n_backends] int * node_backend_ids; // [graph_size] int * leaf_backend_ids; // [graph_size] @@ -683,6 +690,7 @@ struct ggml_backend_sched { #define tensor_backend_id(tensor) sched->hv_tensor_backend_ids[hash_id(tensor)] #define tensor_id_copy(id, backend_id, copy_id) sched->hv_tensor_copies[(id) * sched->n_backends * sched->n_copies + (backend_id) * sched->n_copies + (copy_id)] #define tensor_copy(tensor, backend_id, copy_id) tensor_id_copy(hash_id(tensor), backend_id, copy_id) +#define tensor_id_tp(id, backend_id) sched->hv_tensor_parallel[(id) * sched->n_backends + (backend_id)] // returns the priority of the backend, lower id is higher priority static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backend_t backend) { @@ -794,41 +802,39 @@ static char * fmt_size(size_t size) { return buffer; } -static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { - int cur_split = 0; - for (int i = 0; i < graph->n_nodes; i++) { - if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) { - ggml_backend_t split_backend = 
sched->backends[sched->splits[cur_split].backend_id]; - GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs", cur_split, ggml_backend_name(split_backend), - sched->splits[cur_split].n_inputs); - for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) { - if (j == 0) { - GGML_LOG_DEBUG(": "); - } - GGML_LOG_DEBUG("[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name, - fmt_size(ggml_nbytes(sched->splits[cur_split].inputs[j]))); +static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgraph * /*graph*/) { // FIXME + for (int i_split = 0; i_split < sched->n_splits; i_split++) { + const ggml_backend_sched_split & split = sched->splits[i_split]; + ggml_backend_t split_backend = sched->backends[split.backend_id]; + GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs", i_split, ggml_backend_name(split_backend), split.n_inputs); + for (int j = 0; j < split.n_inputs; j++) { + if (j == 0) { + GGML_LOG_DEBUG(": "); } - GGML_LOG_DEBUG("\n"); - cur_split++; + GGML_LOG_DEBUG("[%s (%5.5s)] ", split.inputs[j]->name, fmt_size(ggml_nbytes(split.inputs[j]))); } - struct ggml_tensor * node = graph->nodes[i]; - if (ggml_is_view_op(node->op)) { - continue; - } - if (sched->debug > 1) { - ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node); - GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name, - fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node)); - for (int j = 0; j < GGML_MAX_SRC; j++) { - struct ggml_tensor * src = node->src[j]; - if (src == NULL) { - continue; + GGML_LOG_DEBUG("\n"); + for (int i = 0; i < split.graph.n_nodes; i++) { + // const ggml_tensor * node = split.graph.nodes[i]; // FIXME + ggml_tensor * node = split.graph.nodes[i]; + if (ggml_is_view_op(node->op)) { + continue; + } + if (sched->debug > 1) { + ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node); + GGML_LOG_DEBUG("node #%3d#%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, split.i_start + i, ggml_op_name(node->op), node->name, + fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node)); + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * src = node->src[j]; + if (src == NULL) { + continue; + } + ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src); + GGML_LOG_DEBUG(" %20.20s (%5.5s) [%5.5s %8.8s]", src->name, + fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src)); } - ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src); - GGML_LOG_DEBUG(" %20.20s (%5.5s) [%5.5s %8.8s]", src->name, - fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src)); + GGML_LOG_DEBUG("\n"); } - GGML_LOG_DEBUG("\n"); } } } @@ -923,6 +929,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg // expand gpu backends (i.e. 
non last prio) up and down, ignoring cpu (the lowest priority backend) // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops // ops unsupported by the backend being expanded will be left unassigned so that they can be assigned later when the locations of its inputs are known + // expand gpu down { int cur_backend_id = -1; @@ -1087,25 +1094,28 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } // pass 5: split graph, find tensors that need to be copied + std::vector splits_no_tp; { - int i_split = 0; - struct ggml_backend_sched_split * split = &sched->splits[0]; + ggml_backend_sched_split split; // find the backend of the first split, skipping view ops - int i = 0; + int i = 0; // graph node index for (; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; if (!ggml_is_view_op(node->op)) { - split->backend_id = tensor_backend_id(node); + split.backend_id = tensor_backend_id(node); + split.tensor_parallel = node->src[0] && ggml_backend_buft_is_split( + ggml_backend_buffer_get_type(node->src[0]->buffer)); break; } } - split->i_start = 0; - split->n_inputs = 0; - int cur_backend_id = split->backend_id; + split.i_start = 0; + split.n_inputs = 0; + GGML_ASSERT(!split.tensor_parallel); for (; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; if (ggml_is_view_op(node->op)) { + GGML_ASSERT(!split.tensor_parallel); continue; } @@ -1115,28 +1125,29 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg // check if we should start a new split based on the sources of the current node bool need_new_split = false; - if (node_backend_id == cur_backend_id && split->n_inputs > 0) { + if (node_backend_id == split.backend_id && split.n_inputs > 0) { for (int j = 0; j < GGML_MAX_SRC; j++) { struct ggml_tensor * src = node->src[j]; - if (src == NULL) { + if (src == nullptr) { continue; } + // check if a weight is on a different and incompatible backend // by starting a new split, the memory of the previously offloaded weights can be reused if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { - int src_backend_id = tensor_backend_id(src); - if (src_backend_id != cur_backend_id && !ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) { + const int src_backend_id = tensor_backend_id(src); + if (src_backend_id != split.backend_id && !ggml_backend_sched_buffer_supported(sched, src, split.backend_id)) { need_new_split = true; break; } } // check if the split has too many inputs // FIXME: count the number of inputs instead of only checking when full - if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) { + if (split.n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) { const size_t id = hash_id(src); - int src_backend_id = sched->hv_tensor_backend_ids[id]; - bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id); - if (src_backend_id != cur_backend_id && tensor_id_copy(id, cur_backend_id, 0) == NULL && !supported) { + const int src_backend_id = sched->hv_tensor_backend_ids[id]; + const bool supported = ggml_backend_sched_buffer_supported(sched, src, split.backend_id); + if (src_backend_id != split.backend_id && tensor_id_copy(id, split.backend_id, 0) == nullptr && !supported) { need_new_split = true; break; } @@ -1144,35 +1155,185 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } } - if (node_backend_id != cur_backend_id || need_new_split) { - split->i_end = i; - i_split++; - 
if (i_split >= sched->splits_capacity) { - sched->splits_capacity *= 2; - sched->splits = (ggml_backend_sched_split *) - realloc(sched->splits, sched->splits_capacity * sizeof(struct ggml_backend_sched_split)); - GGML_ASSERT(sched->splits != NULL); + const bool src0_on_split_buffer = node->src[0] && node->src[0]->buffer && + ggml_backend_buft_is_split(ggml_backend_buffer_get_type(node->src[0]->buffer)); + GGML_ASSERT(!src0_on_split_buffer || node->op == GGML_OP_MUL_MAT); + if (src0_on_split_buffer != split.tensor_parallel) { + need_new_split = true; + } + + if (node_backend_id != split.backend_id || need_new_split) { + split.i_end = i; + split.graph = ggml_graph_view(graph, split.i_start, split.i_end); + splits_no_tp.push_back(split); + + split.backend_id = node_backend_id; + split.i_start = i; + split.n_inputs = 0; + split.tensor_parallel = src0_on_split_buffer; + } + } + split.i_end = graph->n_nodes; + GGML_ASSERT(!split.tensor_parallel); + split.graph = ggml_graph_view(graph, split.i_start, split.i_end); + splits_no_tp.push_back(split); + } + +#ifndef NDEBUG + // assert that the splits are in the expected order and contain the correct nodes + // splits_no_tp should contain all graph nodes in sequential order + assert(!splits_no_tp.empty()); + assert(splits_no_tp.front().i_start == 0); + assert(splits_no_tp.front().graph.n_nodes == splits_no_tp.front().i_end - splits_no_tp.front().i_start); + for (size_t i = 1; i < splits_no_tp.size(); i++) { + const ggml_backend_sched_split & split_now = splits_no_tp[i]; + const ggml_backend_sched_split & split_prev = splits_no_tp[i - 1]; + const bool splits_sequential = split_now.i_start == split_prev.i_end; + assert(splits_sequential); + assert(split_now.graph.n_nodes == split_now.i_end - split_now.i_start); + for (int j = 0; j < split_now.graph.n_nodes; j++) { + assert(split_now.graph.nodes[j] == graph->nodes[split_now.i_start + j]); + } + } + assert(splits_no_tp.back().i_end == graph->n_nodes); +#endif // NDEBUG + + std::vector splits_tp; + const int n_gpus = sched->n_backends - 1; // FIXME + splits_tp.reserve(n_gpus * splits_no_tp.size()); + + { + for (const ggml_backend_sched_split & split_main : splits_no_tp) { + if (split_main.tensor_parallel) { + for (int i_gpu = 0; i_gpu < n_gpus; i_gpu++) { + if (i_gpu == split_main.backend_id) { + continue; + } + + ggml_backend_sched_split split = split_main; + split.backend_id = i_gpu; + split.graph = *ggml_new_graph_custom(sched->ctx, split_main.graph.size, /*grads =*/ false); + dup_graph(sched->ctx, &split_main.graph, &split.graph, /*deep =*/ false); + + for (int n = 0; n < split.graph.n_nodes; n++) { + std::string name = split.graph.nodes[n]->name; + ggml_format_name(split.graph.nodes[n], "%s#tp%d", name.c_str(), i_gpu); + } + + // fprintf(stderr, "%s: 10 index=%d backend_id=%d\n", __func__, int(splits_tp.size()), split.backend_id); + splits_tp.push_back(split); } - split = &sched->splits[i_split]; - split->backend_id = node_backend_id; - split->i_start = i; - split->n_inputs = 0; - cur_backend_id = node_backend_id; } + // add split on same backend last so that splits on other backends don't have to wait for it + // fprintf(stderr, "%s: 10 index=%d backend_id=%d\n", __func__, int(splits_tp.size()), split_main.backend_id); + splits_tp.push_back(split_main); + } + + for (ggml_backend_sched_split & split : splits_tp) { + if (!split.tensor_parallel) { + continue; + } + + const int i_gpu = split.backend_id; + for (int n = 0; n < split.graph.n_nodes; n++) { + ggml_tensor * dst = split.graph.nodes[n]; + 
GGML_ASSERT(dst->op == GGML_OP_MUL_MAT); + ggml_tensor * src0 = dst->src[0]; + ggml_tensor * src1 = dst->src[1]; + // fprintf(stderr, "%s: 025 src0={%ld, %ld, %ld, %ld} src1={%ld, %ld, %ld, %ld} dst={%ld, %ld, %ld, %ld}\n", + // __func__, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], + // dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]); + + GGML_ASSERT(src0->buffer); + GGML_ASSERT(ggml_backend_buft_is_split(ggml_backend_buffer_get_type(src0->buffer))); + + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(src1)); + GGML_ASSERT(ggml_is_contiguous(dst)); + + GGML_TENSOR_BINARY_OP_LOCALS; + + GGML_ASSERT(ne01 == ne0); + GGML_ASSERT(ne01 % n_gpus == 0); + + GGML_ASSERT(src0->op == GGML_OP_NONE); + src0 = ggml_dup_tensor_layout(sched->ctx, src0); + src0->data = ((void **) dst->src[0]->extra)[i_gpu]; // FIXME + src0->buffer = dst->src[0]->buffer; + ggml_format_name(src0, "%s#tp%d", dst->src[0]->name, i_gpu); + dst->src[0] = src0; + tensor_backend_id(src0) = split.backend_id; + + const int64_t row_low = ne01 * i_gpu /n_gpus; + const int64_t row_high = ne01 * (i_gpu+1)/n_gpus; + const int64_t row_diff = row_high - row_low; + + src0->ne[1] = row_diff; + src0->nb[2] = src0->ne[1]*src0->nb[1]; + src0->nb[3] = src0->ne[2]*src0->nb[2]; + + dst->ne[0] = row_diff; + dst->nb[1] = ggml_row_size(dst->type, dst->ne[0]); + dst->nb[2] = dst->ne[1]*dst->nb[1]; + dst->nb[3] = dst->ne[2]*dst->nb[2]; + + // fprintf(stderr, "%s: 050 src0={%ld, %ld, %ld, %ld} src1={%ld, %ld, %ld, %ld} dst={%ld, %ld, %ld, %ld}\n", + // __func__, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], + // dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]); + } + + for (int n = 0; n < split.graph.n_nodes; n++) { + ggml_tensor * node_orig = graph->nodes[split.i_start + n]; + ggml_tensor * node_sg = split.graph.nodes[n]; + tensor_id_tp(hash_id(node_orig), split.backend_id) = node_sg; + tensor_backend_id(node_sg) = split.backend_id; + } + } + } + +#ifndef NDEBUG + // assert that the splits are in the expected order and that parallel splits have different tensors with same ops + assert(!splits_tp.empty()); + assert(splits_tp.front().i_start == 0); + assert(splits_tp.front().graph.n_nodes == splits_tp.front().i_end - splits_tp.front().i_start); + for (size_t i = 1; i < splits_tp.size(); i++) { + const ggml_backend_sched_split & split_now = splits_tp[i]; + const ggml_backend_sched_split & split_prev = splits_tp[i - 1]; + const bool splits_sequential = split_now.i_start == split_prev.i_end; + const bool splits_parallel = split_now.tensor_parallel && split_prev.tensor_parallel && + split_now.backend_id != split_prev.backend_id && + split_now.i_start == split_prev.i_start && split_now.i_end == split_prev.i_end; + assert(splits_sequential || splits_parallel); + assert(split_now.graph.n_nodes == split_now.i_end - split_now.i_start); + if (splits_parallel) { + assert(split_now.graph.n_nodes == split_prev.graph.n_nodes); + for (int j = 0; j < split_now.graph.n_nodes; j++) { + assert(split_now.graph.nodes[j] != split_prev.graph.nodes[j]); + assert(split_now.graph.nodes[j]->op == split_prev.graph.nodes[j]->op); + } + } + } + assert(splits_tp.back().i_end == graph->n_nodes); +#endif // NDEBUG + + for (ggml_backend_sched_split & split : splits_tp) { + for (int i = 0; i < split.graph.n_nodes; i++) { + ggml_tensor * node = split.graph.nodes[i]; + // find inputs that are not on the same backend for (int j = 0; j < GGML_MAX_SRC; 
j++) { struct ggml_tensor * src = node->src[j]; - if (src == NULL) { + if (src == nullptr) { continue; } - size_t src_id = hash_id(src); + const size_t src_id = hash_id(src); const int src_backend_id = sched->hv_tensor_backend_ids[src_id]; assert(src_backend_id != -1); // all inputs should be assigned by now if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) { - if (tensor_id_copy(src_id, src_backend_id, 0) == NULL) { + if (tensor_id_copy(src_id, src_backend_id, 0) == nullptr) { ggml_backend_t backend = sched->backends[src_backend_id]; for (int c = 0; c < sched->n_copies; c++) { struct ggml_tensor * tensor_copy; @@ -1195,10 +1356,64 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } } - if (src_backend_id != cur_backend_id && !ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) { + if (tensor_id_tp(src_id, 0) != nullptr) { + if (tensor_id_copy(src_id, split.backend_id, sched->cur_copy) == nullptr) { + GGML_ASSERT(n_gpus == 2); + ggml_backend_t backend = sched->backends[split.backend_id]; + + for (int c = 0; c < sched->n_copies; c++) { + ggml_tensor * a = tensor_id_tp(src_id, 0); + ggml_tensor * b = tensor_id_tp(src_id, 1); + if (split.backend_id == 0) { + ggml_tensor * b_copy = ggml_dup_tensor_layout(sched->ctx, b); + ggml_format_name(b_copy, "%s#%s part1#%d", ggml_backend_name(backend), src->name, c); + SET_CAUSE(b_copy, "5.cpy for gather"); + if (sched->n_copies > 1) { + ggml_set_input(b_copy); + ggml_set_output(b_copy); // prevent ggml-alloc from overwriting the tensor + } + tensor_id_copy(hash_id(b), split.backend_id, c) = b_copy; + b = b_copy; + } else { + GGML_ASSERT(split.backend_id == 1); + ggml_tensor * a_copy = ggml_dup_tensor_layout(sched->ctx, a); + ggml_format_name(a_copy, "%s#%s part0#%d", ggml_backend_name(backend), src->name, c); + SET_CAUSE(a_copy, "5.cpy for gather"); + if (sched->n_copies > 1) { + ggml_set_input(a_copy); + ggml_set_output(a_copy); // prevent ggml-alloc from overwriting the tensor + } + tensor_id_copy(hash_id(a), split.backend_id, c) = a_copy; + a = a_copy; + } + ggml_tensor * concat = ggml_concat(sched->ctx, a, b, /*dim =*/ 0); + ggml_format_name(concat, "%s#%s cc#%d", ggml_backend_name(backend), src->name, c); + tensor_id_copy(src_id, split.backend_id, c) = concat; + } + { + const int n_inputs = split.n_inputs++; + GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS); + if (split.backend_id == 0) { + ggml_tensor * b = tensor_id_tp(src_id, 1); + split.inputs[n_inputs] = b; + } else { + GGML_ASSERT(split.backend_id == 1); + ggml_tensor * a = tensor_id_tp(src_id, 0); + split.inputs[n_inputs] = a; + } + } + { + const int n_inputs = split.n_inputs++; + GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS); + split.inputs[n_inputs] = src; + } + } + // fprintf(stderr, "%s: 100 replacing src%d=%s of %s\n", __func__, j, node->src[j]->name, node->name); + node->src[j] = tensor_id_copy(src_id, split.backend_id, sched->cur_copy); + } else if (src_backend_id != split.backend_id && !ggml_backend_sched_buffer_supported(sched, src, split.backend_id)) { // create a copy of the input in the split's backend - if (tensor_id_copy(src_id, cur_backend_id, 0) == NULL) { - ggml_backend_t backend = sched->backends[cur_backend_id]; + if (tensor_id_copy(src_id, split.backend_id, 0) == nullptr) { + ggml_backend_t backend = sched->backends[split.backend_id]; for (int c = 0; c < sched->n_copies; c++) { struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src); ggml_format_name(tensor_copy, "%s#%s#%d", 
ggml_backend_name(backend), src->name, c); @@ -1206,24 +1421,38 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg ggml_set_input(tensor_copy); ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor } - tensor_id_copy(src_id, cur_backend_id, c) = tensor_copy; + tensor_id_copy(src_id, split.backend_id, c) = tensor_copy; SET_CAUSE(tensor_copy, "4.cpy"); } - int n_inputs = split->n_inputs++; + const int n_inputs = split.n_inputs++; GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS); - split->inputs[n_inputs] = src; + split.inputs[n_inputs] = src; } - node->src[j] = tensor_id_copy(src_id, cur_backend_id, sched->cur_copy); + // fprintf(stderr, "%s: 200 replacing src%d=%s of %s\n", __func__, j, node->src[j]->name, node->name); + node->src[j] = tensor_id_copy(src_id, split.backend_id, sched->cur_copy); } } } - split->i_end = graph->n_nodes; - sched->n_splits = i_split + 1; } - if (sched->debug) { - ggml_backend_sched_print_assignments(sched, graph); - } + for (size_t i_split = 0; i_split < splits_tp.size(); i_split++) { + if (int(i_split) >= sched->splits_capacity) { + sched->splits_capacity *= 2; + sched->splits = (ggml_backend_sched_split *) + realloc(sched->splits, sched->splits_capacity * sizeof(struct ggml_backend_sched_split)); + GGML_ASSERT(sched->splits != NULL); + } + sched->splits[i_split] = splits_tp[i_split]; + // fprintf(stderr, "%s: split%d backend_id=%d i_start=%d i_end=%d\n", + // __func__, int(i_split), sched->splits[i_split].backend_id, sched->splits[i_split].i_start, sched->splits[i_split].i_end); + // for (int n = 0; n < sched->splits[i_split].n_inputs; n++) { + // fprintf(stderr, "%s: - input %d: %s\n", __func__, n, sched->splits[i_split].inputs[n]->name); + // } + // for (int n = 0; n < sched->splits[i_split].graph.n_nodes; n++) { + // fprintf(stderr, "%s: - node %d: %s\n", __func__, n, sched->splits[i_split].graph.nodes[n]->name); + // } + } + sched->n_splits = splits_tp.size(); // swap node_backend_ids and leaf _backend_ids with prevs { @@ -1236,7 +1465,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg sched->prev_leaf_backend_ids = tmp; } - int graph_size = std::max(graph->n_nodes, graph->n_leafs) + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sched->n_copies; + const int graph_size = std::max(graph->n_nodes, graph->n_leafs) + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sched->n_copies; if (sched->graph.size < graph_size) { sched->graph.size = graph_size; sched->graph.nodes = (ggml_tensor **) realloc(sched->graph.nodes, graph_size * sizeof(struct ggml_tensor *)); @@ -1251,15 +1480,16 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg for (int i = 0; i < sched->n_splits; i++) { struct ggml_backend_sched_split * split = &sched->splits[i]; - split->graph = ggml_graph_view(graph, split->i_start, split->i_end); // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split for (int j = 0; j < split->n_inputs; j++) { assert(graph_copy->size > (graph_copy->n_nodes + 1)); struct ggml_tensor * input = split->inputs[j]; + assert(input); const size_t input_id = hash_id(input); struct ggml_tensor * input_cpy = tensor_id_copy(input_id, split->backend_id, sched->cur_copy); + assert(input_cpy); // add a dependency to the input source so that it is not freed before the copy is done struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input); @@ -1272,19 +1502,23 @@ static void 
ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg graph_copy->nodes[graph_copy->n_nodes++] = input_cpy; } - for (int j = split->i_start; j < split->i_end; j++) { + for (int j = 0; j < split->graph.n_nodes; j++) { assert(graph_copy->size > graph_copy->n_nodes); - sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(graph->nodes[j]); - graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j]; + sched->node_backend_ids[graph_copy->n_nodes] = split->backend_id; + graph_copy->nodes[graph_copy->n_nodes++] = split->graph.nodes[j]; } } + if (sched->debug) { + ggml_backend_sched_print_assignments(sched, &sched->graph); + } + if (sched->n_copies > 1) { // add input copies as leafs so that they are allocated first for (int i = 0; i < sched->n_graph_inputs; i++) { struct ggml_tensor * input = sched->graph_inputs[i]; - size_t id = hash_id(input); - int backend_id = tensor_backend_id(input); + const size_t id = hash_id(input); + const int backend_id = tensor_backend_id(input); for (int c = 0; c < sched->n_copies; c++) { struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c); sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id; @@ -1295,10 +1529,10 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg for (int i = 0; i < sched->n_splits; i++) { struct ggml_backend_sched_split * split = &sched->splits[i]; - int backend_id = split->backend_id; + const int backend_id = split->backend_id; for (int j = 0; j < split->n_inputs; j++) { struct ggml_tensor * input = split->inputs[j]; - size_t id = hash_id(input); + const size_t id = hash_id(input); for (int c = 0; c < sched->n_copies; c++) { struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c); sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id; @@ -1359,14 +1593,19 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s for (int i = 0; i < sched->n_splits; i++) { struct ggml_backend_sched_split * split = &splits[i]; - int split_backend_id = split->backend_id; + const int split_backend_id = split->backend_id; ggml_backend_t split_backend = sched->backends[split_backend_id]; + std::vector active_inputs; // copy the input tensors to the split backend for (int j = 0; j < split->n_inputs; j++) { ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]); struct ggml_tensor * input = split->inputs[j]; struct ggml_tensor * input_cpy = tensor_copy(input, split_backend_id, sched->cur_copy); + if (input_cpy->op != GGML_OP_NONE) { + active_inputs.push_back(input_cpy); + continue; + } if (input->flags & GGML_TENSOR_FLAG_INPUT) { // inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done @@ -1396,6 +1635,23 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s } } } + if (!active_inputs.empty()) { + ggml_cgraph graph_inputs = { + /*.size =*/ 0, + /*.n_nodes =*/ int(active_inputs.size()), + /*.n_leafs =*/ 0, + /*.nodes =*/ active_inputs.data(), + /*.grads =*/ NULL, // gradients would need visited_hash_set + /*.grad_accs =*/ NULL, + /*.leafs =*/ NULL, + /*.visited_hash_set =*/ { 0, NULL, NULL }, + /*.order =*/ split->graph.order, + }; + enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &graph_inputs); + if (ec != GGML_STATUS_SUCCESS) { + return ec; + } + } if (!sched->callback_eval) { enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph); @@ -1472,6 +1728,7 @@ ggml_backend_sched_t 
ggml_backend_sched_new( sched->hash_set = ggml_hash_set_new(graph_size); sched->hv_tensor_backend_ids = (int *) malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0])); sched->hv_tensor_copies = (ggml_tensor **) malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *)); + sched->hv_tensor_parallel = (ggml_tensor **) calloc(sched->hash_set.size * sched->n_backends, sizeof(struct ggml_tensor *)); const size_t ggml_sched_max_splits = graph_size; // at most there is one split for each node in the graph const size_t nodes_size = graph_size + ggml_sched_max_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2; @@ -1522,6 +1779,7 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) { free(sched->splits); free(sched->hv_tensor_backend_ids); free(sched->hv_tensor_copies); + free(sched->hv_tensor_parallel); free(sched->node_backend_ids); free(sched->leaf_backend_ids); free(sched->prev_node_backend_ids); @@ -1538,6 +1796,7 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) { ggml_hash_set_reset(&sched->hash_set); memset(sched->hv_tensor_backend_ids, -1, sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0])); memset(sched->hv_tensor_copies, 0, sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *)); + memset(sched->hv_tensor_parallel, 0, sched->hash_set.size * sched->n_backends * sizeof(struct ggml_tensor *)); sched->is_reset = true; } sched->is_alloc = false; @@ -1971,6 +2230,7 @@ ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) { /* .get_max_size = */ NULL, // defaults to SIZE_MAX /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes /* .is_host = */ ggml_backend_cpu_buffer_type_is_host, + /* .is_split = */ NULL, }, /* .device = */ NULL, // FIXME ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0), /* .context = */ NULL, @@ -1994,6 +2254,7 @@ static ggml_backend_buffer_type_t ggml_backend_cpu_buffer_from_ptr_type(void) { /* .get_max_size = */ NULL, // defaults to SIZE_MAX /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes /* .is_host = */ ggml_backend_cpu_buffer_type_is_host, + /* .is_split = */ NULL, }, /* .device = */ NULL, // FIXME ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0), /* .context = */ NULL, diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp b/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp index 8ff6d64a4d0d1..68f864342ac45 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp @@ -6422,6 +6422,7 @@ ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void) { /* .get_max_size = */ nullptr, // defaults to SIZE_MAX /* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes /* .is_host = */ nullptr, + /* .is_split = */ nullptr, }, /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0), /* .context = */ new ggml::cpu::aarch64::extra_buffer_type(), diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 02dc8c12dbd8c..45d41f5645355 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -689,6 +689,8 @@ static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_t GGML_UNUSED(buft); } +static bool ggml_backend_buft_is_cuda_split(ggml_backend_buffer_type_t buft); + static const ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = { /* .get_name = */ ggml_backend_cuda_buffer_type_get_name, /* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer, @@ -696,6 +698,7 @@ static const ggml_backend_buffer_type_i 
ggml_backend_cuda_buffer_type_interface /* .get_max_size = */ NULL, // defaults to SIZE_MAX /* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size, /* .is_host = */ NULL, + /* .is_split = */ ggml_backend_buft_is_cuda_split, }; ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) { @@ -1013,6 +1016,7 @@ static const ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_inte /* .get_max_size = */ NULL, // defaults to SIZE_MAX /* .get_alloc_size = */ ggml_backend_cuda_split_buffer_type_get_alloc_size, /* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host, + /* .is_split = */ ggml_backend_buft_is_cuda_split, }; ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split) { @@ -1111,6 +1115,7 @@ ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() { /* .get_max_size = */ NULL, // defaults to SIZE_MAX /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, + /* .is_split = */ NULL, }, /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), 0), /* .context = */ nullptr, @@ -1907,7 +1912,7 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co } static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft); + const bool split = false && ggml_backend_buft_is_cuda_split(src0->buffer->buft); // If src0 is a temporary compute buffer it may have some padding that needs to be cleared for mul_mat_vec_q or mul_mat_q. // But if src0 is also a view of another tensor then this cannot be done safely because it may overwrite valid tensor data. @@ -2124,7 +2129,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst) { // why is this here instead of mul_mat? 
- if (dst->src[0] != nullptr && ggml_backend_buft_is_cuda_split(dst->src[0]->buffer->buft)) { + if (false && dst->src[0] != nullptr && ggml_backend_buft_is_cuda_split(dst->src[0]->buffer->buft)) { ggml_cuda_set_peer_access(dst->src[1]->ne[1], ctx.device); } @@ -2992,7 +2997,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g struct ggml_tensor * b = op->src[1]; // for small weight matrices the active device can end up without any rows, don't use row split in those cases // this avoids some edge cases (and the performance would not be good anyways) - if (a->buffer && ggml_backend_buft_is_cuda_split(a->buffer->buft)) { + if (false && a->buffer && ggml_backend_buft_is_cuda_split(a->buffer->buft)) { ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) a->buffer->buft->context; int64_t row_low; int64_t row_high; diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h index a19cfb14e0f9f..e7917bd97779e 100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -593,9 +593,87 @@ static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) { #ifdef __cplusplus #include +#include // expose GGUF internals for test code GGML_API size_t gguf_type_size(enum gguf_type type); GGML_API struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params); GGML_API void gguf_write_to_buf(const struct gguf_context * ctx, std::vector & buf, bool only_meta); + +static ggml_tensor * map_tensor(std::map & tensor_map, ggml_context * ctx, ggml_tensor * tensor, bool deep) { + if (!tensor) { + return nullptr; + } + + if (tensor_map.find(tensor) != tensor_map.end()) { + return tensor_map[tensor]; + } + + ggml_tensor * new_tensor = ggml_dup_tensor(ctx, tensor); + tensor_map[tensor] = new_tensor; + + new_tensor->op = tensor->op; + for (int i = 0; i < GGML_MAX_DIMS; i++) { + new_tensor->nb[i] = tensor->nb[i]; + } + new_tensor->flags = tensor->flags; + memcpy(new_tensor->op_params, tensor->op_params, sizeof(tensor->op_params)); + strcpy(new_tensor->name, tensor->name); + new_tensor->data = tensor->data; + new_tensor->buffer = tensor->buffer; + new_tensor->extra = tensor->extra; + new_tensor->view_offs = tensor->view_offs; + + if (deep) { + new_tensor->view_src = map_tensor(tensor_map, ctx, tensor->view_src, deep); + for (int i = 0; i < GGML_MAX_SRC; i++) { + new_tensor->src[i] = map_tensor(tensor_map, ctx, tensor->src[i], deep); + } + } else { + new_tensor->view_src = tensor->view_src; + for (int i = 0; i < GGML_MAX_SRC; i++) { + new_tensor->src[i] = tensor->src[i]; + } + } + + return new_tensor; +} + +static void dup_graph(ggml_context * ctx, const ggml_cgraph * src, ggml_cgraph * dst, bool deep) { + std::map tensor_map; + + if (deep) { + for (int i = 0; i < src->n_leafs; i++) { + ggml_build_forward_expand(dst, map_tensor(tensor_map, ctx, src->leafs[i], deep)); + } + for (int i = 0; i < src->n_nodes; i++) { + ggml_build_forward_expand(dst, map_tensor(tensor_map, ctx, src->nodes[i], deep)); + } + } else { + for (int i = 0; i < src->n_leafs; i++) { + dst->leafs[dst->n_leafs++] = map_tensor(tensor_map, ctx, src->leafs[i], deep); + } + for (int i = 0; i < src->n_nodes; i++) { + dst->nodes[dst->n_nodes++] = map_tensor(tensor_map, ctx, src->nodes[i], deep); + } + } + GGML_ASSERT(dst->n_leafs == src->n_leafs); + GGML_ASSERT(dst->n_nodes == src->n_nodes); + + if (src->grads) { + GGML_ASSERT(dst->grads); + for (int i = 0; i < src->n_nodes; ++i) { + const size_t igrad_src = ggml_hash_find(&src->visited_hash_set, 
src->nodes[i]); + const size_t igrad_dst = ggml_hash_find(&dst->visited_hash_set, dst->nodes[i]); + + GGML_ASSERT(igrad_src != GGML_HASHSET_FULL); + GGML_ASSERT(ggml_bitset_get(src->visited_hash_set.used, igrad_src)); + GGML_ASSERT(igrad_dst != GGML_HASHSET_FULL); + GGML_ASSERT(ggml_bitset_get(dst->visited_hash_set.used, igrad_dst)); + + dst->grads[igrad_dst] = src->grads[igrad_src]; + dst->grad_accs[igrad_dst] = src->grad_accs[igrad_src]; + } + } +} #endif // __cplusplus diff --git a/ggml/src/ggml-opt.cpp b/ggml/src/ggml-opt.cpp index 58d77578f458d..fc0324acd5cd7 100644 --- a/ggml/src/ggml-opt.cpp +++ b/ggml/src/ggml-opt.cpp @@ -9,7 +9,6 @@ #include #include #include -#include #include #include @@ -252,66 +251,6 @@ struct ggml_opt_params ggml_opt_default_params( }; } -static ggml_tensor * map_tensor(std::map & tensor_map, ggml_context * ctx, ggml_tensor * tensor) { - if (!tensor) { - return nullptr; - } - - if (tensor_map.find(tensor) != tensor_map.end()) { - return tensor_map[tensor]; - } - - ggml_tensor * new_tensor = ggml_dup_tensor(ctx, tensor); - tensor_map[tensor] = new_tensor; - - new_tensor->op = tensor->op; - for (int i = 0; i < GGML_MAX_DIMS; i++) { - new_tensor->nb[i] = tensor->nb[i]; - } - new_tensor->flags = tensor->flags; - memcpy(new_tensor->op_params, tensor->op_params, sizeof(tensor->op_params)); - strcpy(new_tensor->name, tensor->name); - new_tensor->data = tensor->data; - new_tensor->buffer = tensor->buffer; - new_tensor->extra = tensor->extra; - new_tensor->view_offs = tensor->view_offs; - new_tensor->view_src = map_tensor(tensor_map, ctx, tensor->view_src); - for (int i = 0; i < GGML_MAX_SRC; i++) { - new_tensor->src[i] = map_tensor(tensor_map, ctx, tensor->src[i]); - } - - return new_tensor; -} - -static ggml_cgraph * dup_graph(ggml_context * ctx, ggml_cgraph * src) { - std::map tensor_map; - - ggml_cgraph * dst = ggml_new_graph_custom(ctx, src->size, /*grads =*/ true); - - for (int i = 0; i < src->n_leafs; i++) { - ggml_build_forward_expand(dst, map_tensor(tensor_map, ctx, src->leafs[i])); - } - GGML_ASSERT(dst->n_leafs == src->n_leafs); - for (int i = 0; i < src->n_nodes; i++) { - ggml_build_forward_expand(dst, map_tensor(tensor_map, ctx, src->nodes[i])); - } - GGML_ASSERT(dst->n_nodes == src->n_nodes); - for (int i = 0; i < src->n_nodes; ++i) { - const size_t igrad_src = ggml_hash_find(&src->visited_hash_set, src->nodes[i]); - const size_t igrad_dst = ggml_hash_find(&dst->visited_hash_set, dst->nodes[i]); - - GGML_ASSERT(igrad_src != GGML_HASHSET_FULL); - GGML_ASSERT(ggml_bitset_get(src->visited_hash_set.used, igrad_src)); - GGML_ASSERT(igrad_dst != GGML_HASHSET_FULL); - GGML_ASSERT(ggml_bitset_get(dst->visited_hash_set.used, igrad_dst)); - - dst->grads[igrad_dst] = src->grads[igrad_src]; - dst->grad_accs[igrad_dst] = src->grad_accs[igrad_src]; - } - - return dst; -} - static void ggml_opt_build(ggml_opt_context_t opt_ctx) { GGML_ASSERT(opt_ctx->ctx_compute && "no compute context set, either use static graphs or set one with ggml_opt_prepare_alloc"); GGML_ASSERT((!opt_ctx->static_graphs || opt_ctx->inputs->data) && "when using static graphs the inputs must be allocated statically"); @@ -738,7 +677,8 @@ void ggml_opt_alloc(ggml_opt_context_t opt_ctx, bool backward) { ggml_free(opt_ctx->ctx_copy); opt_ctx->ctx_copy = ggml_init(params); - opt_ctx->allocated_graph_copy = dup_graph(opt_ctx->ctx_copy, graph); + opt_ctx->allocated_graph_copy = ggml_new_graph_custom(opt_ctx->ctx_copy, graph->size, /*grads =*/ true); + dup_graph(opt_ctx->ctx_copy, graph, 
opt_ctx->allocated_graph_copy, /*deep =*/ true); } else { opt_ctx->allocated_graph_copy = graph; } diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 8a6546240f46f..087d7c31da67d 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -1714,6 +1714,7 @@ struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * nam struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...) { va_list args; va_start(args, fmt); + assert(tensor->name != fmt); vsnprintf(tensor->name, sizeof(tensor->name), fmt, args); va_end(args); return tensor; diff --git a/src/llama-context.cpp b/src/llama-context.cpp index a3b84a6a82e74..b0de3a73d49fa 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -221,7 +221,7 @@ llama_context::llama_context( bool pipeline_parallel = model.n_devices() > 1 && model.params.n_gpu_layers > (int) model.hparams.n_layer && - model.params.split_mode == LLAMA_SPLIT_MODE_LAYER && + (model.params.split_mode == LLAMA_SPLIT_MODE_LAYER || model.params.split_mode == LLAMA_SPLIT_MODE_ROW) && cparams.offload_kqv && !model.has_tensor_overrides(); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 7fd094b63f269..6ef279c33e817 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -4566,6 +4566,14 @@ struct llm_build_llama : public llm_graph_context { cb(Vcur, "Vcur", il); } + // FIXME + Qcur = ggml_scale(ctx0, Qcur, 1.0f); + Kcur = ggml_scale(ctx0, Kcur, 1.0f); + Vcur = ggml_scale(ctx0, Vcur, 1.0f); + cb(Qcur, "QcurFIXME", il); + cb(Kcur, "KcurFIXME", il); + cb(Vcur, "VcurFIXME", il); + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); @@ -4705,6 +4713,7 @@ struct llm_build_llama : public llm_graph_context { cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); + cur = ggml_scale(ctx0, cur, 1.0f); // FIXME res->t_logits = cur; ggml_build_forward_expand(gf, cur);
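Notes on the approach (sketches with illustrative names and assumed values; not part of the patch itself):

The new is_split callback in ggml_backend_buffer_type_i is optional and, like is_host, defaults to false: ggml_backend_buft_is_split() returns true only when the callback is present and reports true, so backends that merely add the NULL initializer are unaffected. In pass 5 the scheduler decides whether a node belongs to a tensor-parallel split purely from the buffer type of its first source; a minimal sketch of that predicate (the helper name is illustrative):

    #include "ggml-backend.h"

    // Does this node read its weights from a split buffer type?
    // Mirrors the src0 check used when opening and closing splits in pass 5.
    static bool node_reads_split_weights(const struct ggml_tensor * node) {
        return node->src[0] && node->src[0]->buffer &&
               ggml_backend_buft_is_split(ggml_backend_buffer_get_type(node->src[0]->buffer));
    }

The patch asserts that any such node is a GGML_OP_MUL_MAT, and the CUDA backend's internal row-split matmul path is disabled with `false &&` in three places, so this scheduler-level path is the only split mechanism exercised.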
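For the per-GPU slices, GPU i owns rows [ne01*i/n_gpus, ne01*(i+1)/n_gpus) of src0 and the matching dst slice along dim 0; the patch asserts ne01 % n_gpus == 0 and contiguous src0/src1/dst. A worked example with assumed sizes (2 GPUs, a 4096-row weight):

    // Worked example of the slice arithmetic applied when rewriting src0/dst for GPU i_gpu.
    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        const int64_t ne01   = 4096; // rows of src0, equal to dst->ne[0] before slicing
        const int     n_gpus = 2;    // the patch asserts ne01 % n_gpus == 0
        for (int i_gpu = 0; i_gpu < n_gpus; i_gpu++) {
            const int64_t row_low  = ne01* i_gpu   /n_gpus;
            const int64_t row_high = ne01*(i_gpu+1)/n_gpus;
            const int64_t row_diff = row_high - row_low;
            // GPU 0: rows [0, 2048), GPU 1: rows [2048, 4096); each computes 2048 output rows.
            printf("gpu %d: rows [%lld, %lld), row_diff = %lld\n",
                   i_gpu, (long long) row_low, (long long) row_high, (long long) row_diff);
        }
        return 0;
    }

After src0->ne[1] and dst->ne[0] are shrunk to row_diff and the nb strides recomputed, each backend runs an ordinary MUL_MAT on its slice; the full activation only exists again after the gather.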
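When a later, non-parallel split consumes the output of a tensor-parallel split, the scheduler records the per-backend partial results in hv_tensor_parallel (tensor_id_tp), copies the remote slice over as a "5.cpy for gather" input, and rebuilds the full tensor with ggml_concat along dim 0; because these gathered inputs are ops rather than plain tensors, compute_splits collects them in active_inputs and runs them through the temporary graph_inputs graph before the split's own graph. The current code hard-codes n_gpus == 2; a sketch of what a general N-GPU gather might look like (gather_tp_parts and parts are hypothetical names, not in the patch):

    #include "ggml.h"

    // Sketch only: stitch the n_gpus partial dst slices back together on the consuming
    // backend. parts[g] is the slice computed by GPU g, already copied to this backend.
    static ggml_tensor * gather_tp_parts(ggml_context * ctx, ggml_tensor ** parts, int n_gpus) {
        ggml_tensor * gathered = parts[0];
        for (int g = 1; g < n_gpus; g++) {
            gathered = ggml_concat(ctx, gathered, parts[g], /*dim =*/ 0); // concatenate along ne[0]
        }
        return gathered;
    }

Concatenating along dim 0 matches how dst->ne[0] was sliced, so the gathered tensor has the original shape.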
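dup_graph()/map_tensor() move from ggml-opt.cpp into the C++ part of ggml-impl.h and gain a deep flag: ggml-opt keeps the old deep behaviour (view_src and src[] recursively duplicated), while the scheduler takes a shallow copy so the duplicated nodes still point at the original sources, which are then selectively replaced (the sliced src0, the gathered inputs). Usage as in the scheduler, assuming a ggml_context with enough tensor metadata space:

    // Shallow duplicate of a split's graph view before its nodes are renamed and re-sliced.
    ggml_cgraph dst_graph = *ggml_new_graph_custom(ctx, src_graph.size, /*grads =*/ false);
    dup_graph(ctx, &src_graph, &dst_graph, /*deep =*/ false);

Since both helpers are plain static functions in a header, every C++ translation unit that includes ggml-impl.h gets its own copy (and possibly unused-function warnings); presumably they would be marked inline or moved to a source file before this lands.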