Commit 15506dc

Consolidate multiple tensor copies to reduce API overhead
Fixes #15749
1 parent 0a2a384 commit 15506dc
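
The idea of the change: instead of issuing one backend copy call per input tensor of a split, the scheduler packs all host-resident inputs into a single host staging buffer and uploads them to the split backend with one transfer into a contiguous I8 tensor. The snippet below is a minimal standalone sketch of that pattern, not code from this commit; `ALIGN` and `upload_inputs_bulk` are illustrative names, and `ALIGN` is assumed to match ggml's internal TENSOR_ALIGNMENT.

// Sketch only: pack several host tensors into one staging buffer and upload
// them with a single ggml_backend_tensor_set() call instead of one call each.
#include <stdlib.h>
#include <string.h>

#include "ggml.h"
#include "ggml-backend.h"

#define ALIGN 32  // assumed to match ggml's internal TENSOR_ALIGNMENT

static size_t align_up(size_t n) {
    return (n + ALIGN - 1) & ~((size_t) (ALIGN - 1));
}

// dst_buffer: a GGML_TYPE_I8 tensor allocated on the target backend, large
//             enough to hold all inputs at their aligned offsets
// inputs:     host-resident tensors whose data should end up on the backend
static void upload_inputs_bulk(struct ggml_tensor * dst_buffer,
                               struct ggml_tensor ** inputs, int n_inputs) {
    // compute the total packed size
    size_t total = 0;
    for (int i = 0; i < n_inputs; i++) {
        total  = align_up(total);
        total += ggml_nbytes(inputs[i]);
    }

    // pack all inputs into one host staging buffer
    char * staging = (char *) malloc(total);
    size_t offset = 0;
    for (int i = 0; i < n_inputs; i++) {
        offset = align_up(offset);
        memcpy(staging + offset, inputs[i]->data, ggml_nbytes(inputs[i]));
        offset += ggml_nbytes(inputs[i]);
    }

    // one host -> backend transfer instead of n_inputs transfers
    ggml_backend_tensor_set(dst_buffer, staging, 0, offset);
    free(staging);
}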

File tree

1 file changed: +88 -12 lines changed


ggml/src/ggml-backend.cpp

Lines changed: 88 additions & 12 deletions
@@ -664,6 +664,9 @@ struct ggml_backend_sched_split {
     int i_end;
     struct ggml_tensor * inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
     int n_inputs;
+    // contiguous buffer for multiple input tensors
+    struct ggml_tensor * inputs_contiguous_buffer;
+    size_t inputs_contiguous_buffer_size;
     // graph view of this split
     struct ggml_cgraph graph;
 };
@@ -713,6 +716,10 @@ struct ggml_backend_sched {
     char * context_buffer;
     size_t context_buffer_size;
 
+    // host staging buffer for bulk contiguous copies of input tensors
+    void * host_staging_buffer;
+    size_t host_staging_buffer_size;
+
     bool op_offload;
 
     int debug;
@@ -1202,6 +1209,8 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
             split->backend_id = node_backend_id;
             split->i_start = i;
             split->n_inputs = 0;
+            split->inputs_contiguous_buffer = NULL;
+            split->inputs_contiguous_buffer_size = 0;
             cur_backend_id = node_backend_id;
         }
 
@@ -1317,6 +1326,29 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
             graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
         }
 
+        // Create and add contiguous buffer to the graph for this split
+        if (split->n_inputs > 0) {
+            // Calculate total size needed for contiguous allocation of all input tensors in a split
+            size_t total_size = 0;
+            for (int i = 0; i < split->n_inputs; i++) {
+                total_size = (total_size + TENSOR_ALIGNMENT - 1) & ~(TENSOR_ALIGNMENT - 1);
+                total_size += ggml_nbytes(split->inputs[i]);
+            }
+            split->inputs_contiguous_buffer_size = total_size;
+
+            // Create a single buffer tensor to hold all input data
+            split->inputs_contiguous_buffer = ggml_new_tensor_1d(sched->ctx, GGML_TYPE_I8, total_size);
+            ggml_format_name(split->inputs_contiguous_buffer, "%s#inputs_contiguous_buffer#%d", ggml_backend_name(sched->backends[split->backend_id]), 0);
+            ggml_set_input(split->inputs_contiguous_buffer);
+            ggml_set_output(split->inputs_contiguous_buffer);
+
+            if (split->inputs_contiguous_buffer != NULL) {
+                assert(graph_copy->size > graph_copy->n_nodes);
+                sched->node_backend_ids[graph_copy->n_nodes] = split->backend_id;
+                graph_copy->nodes[graph_copy->n_nodes++] = split->inputs_contiguous_buffer;
+            }
+        }
+
         for (int j = split->i_start; j < split->i_end; j++) {
             assert(graph_copy->size > graph_copy->n_nodes);
             sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(graph->nodes[j]);
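
For a concrete sense of the size calculation in the hunk above: assuming TENSOR_ALIGNMENT is 32 bytes (its usual value in ggml's internal headers), two inputs of 100 and 40 bytes land at offsets 0 and 128, because the running total of 100 is rounded up by `(total_size + TENSOR_ALIGNMENT - 1) & ~(TENSOR_ALIGNMENT - 1)` before the second tensor is added, so total_size ends up at 168 bytes for the GGML_TYPE_I8 buffer.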
@@ -1416,19 +1448,37 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
         ggml_backend_t split_backend = sched->backends[split_backend_id];
 
         // copy the input tensors to the split backend
+
+        bool bulk_copy_used = false;
+        size_t bulk_offset = 0;
+        // Ensure host staging buffer is large enough, reallocate if needed
+        if (sched->host_staging_buffer_size < split->inputs_contiguous_buffer_size) {
+            free(sched->host_staging_buffer);
+            sched->host_staging_buffer = malloc(split->inputs_contiguous_buffer_size);
+            sched->host_staging_buffer_size = split->inputs_contiguous_buffer_size;
+        }
+
         for (int input_id = 0; input_id < split->n_inputs; input_id++) {
             ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[input_id]);
             struct ggml_tensor * input = split->inputs[input_id];
             struct ggml_tensor * input_cpy = tensor_copy(input, split_backend_id, sched->cur_copy);
 
             if (input->flags & GGML_TENSOR_FLAG_INPUT) {
                 // inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
-                if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
-                    ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
-                } else {
-                    ggml_backend_synchronize(split_backend);
-                }
-                ggml_backend_tensor_copy(input, input_cpy);
+                // Bulk copy: accumulate data in host buffer and setup tensor views
+                GGML_ASSERT(split->inputs_contiguous_buffer != NULL && split->inputs_contiguous_buffer->data != NULL);
+                bulk_offset = (bulk_offset + TENSOR_ALIGNMENT - 1) & ~(TENSOR_ALIGNMENT - 1);
+
+                memcpy((char*)sched->host_staging_buffer + bulk_offset, input->data, ggml_nbytes(input));
+
+                // Update tensor_copy to point into contiguous GPU buffer
+                input_cpy->data = (char*)split->inputs_contiguous_buffer->data + bulk_offset;
+                input_cpy->buffer = split->inputs_contiguous_buffer->buffer;
+                input_cpy->view_src = split->inputs_contiguous_buffer;
+                input_cpy->view_offs = bulk_offset;
+
+                bulk_offset += ggml_nbytes(input);
+                bulk_copy_used = true;
             } else {
                 // wait for the split backend to finish using the input before overwriting it
                 if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
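
Note that in this bulk path `input_cpy` receives no data directly: its `data`, `buffer`, `view_src` and `view_offs` fields are repointed so it acts as a view at `bulk_offset` inside the contiguous buffer, and the actual bytes travel later in the single `ggml_backend_tensor_set()` of the finalization step below.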
@@ -1527,17 +1577,38 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
                     // TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
                     if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
                         ggml_backend_synchronize(input_backend);
-                        if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
-                            ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
-                        } else {
-                            ggml_backend_synchronize(split_backend);
-                        }
-                        ggml_backend_tensor_copy(input, input_cpy);
+
+                        // Bulk copy: accumulate data in host buffer and setup tensor views
+                        GGML_ASSERT(split->inputs_contiguous_buffer != NULL && split->inputs_contiguous_buffer->data != NULL);
+                        bulk_offset = (bulk_offset + TENSOR_ALIGNMENT - 1) & ~(TENSOR_ALIGNMENT - 1);
+
+                        // Copy tensor data to host buffer
+                        ggml_backend_tensor_get(input, (char*)sched->host_staging_buffer + bulk_offset, 0, ggml_nbytes(input));
+
+                        // Update tensor_copy to point into contiguous GPU buffer
+                        input_cpy->data = (char*)split->inputs_contiguous_buffer->data + bulk_offset;
+                        input_cpy->buffer = split->inputs_contiguous_buffer->buffer;
+                        input_cpy->view_src = split->inputs_contiguous_buffer;
+                        input_cpy->view_offs = bulk_offset;
+
+                        bulk_offset += ggml_nbytes(input);
+                        bulk_copy_used = true;
                     }
                 }
             }
         }
 
+        // Finalize bulk copy if it was actually used
+        if (bulk_copy_used) {
+            // Synchronize and perform single bulk copy to GPU
+            if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
+                ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
+            } else {
+                ggml_backend_synchronize(split_backend);
+            }
+            ggml_backend_tensor_set(split->inputs_contiguous_buffer, sched->host_staging_buffer, 0, bulk_offset);
+        }
+
         if (!sched->callback_eval) {
             enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
             if (ec != GGML_STATUS_SUCCESS) {
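
With the staging path, each split performs one synchronization (via the event or a full `ggml_backend_synchronize`) and one `ggml_backend_tensor_set()` for all staged inputs, instead of repeating the synchronize-and-copy sequence once per input tensor; that per-call reduction is the API overhead the commit title refers to.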
@@ -1622,6 +1693,10 @@ ggml_backend_sched_t ggml_backend_sched_new(
     sched->context_buffer_size = ggml_sched_max_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + ggml_graph_overhead_custom(graph_size, false);
     sched->context_buffer = (char *) malloc(sched->context_buffer_size);
 
+    // initialize reusable host buffer for bulk copies
+    sched->host_staging_buffer = NULL;
+    sched->host_staging_buffer_size = 0;
+
     const int initial_splits_capacity = 16;
     sched->splits = (ggml_backend_sched_split *) calloc(initial_splits_capacity, sizeof(sched->splits[0]));
     sched->splits_capacity = initial_splits_capacity;
@@ -1666,6 +1741,7 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
     free(sched->prev_node_backend_ids);
     free(sched->prev_leaf_backend_ids);
     free(sched->context_buffer);
+    free(sched->host_staging_buffer);
    free(sched->graph.nodes);
    free(sched->graph.leafs);
    free(sched);
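
The staging buffer is owned by the scheduler: it starts as NULL with size 0 in `ggml_backend_sched_new`, is grown on demand in `ggml_backend_sched_compute_splits` with a plain `free`/`malloc` pair (its previous contents never need to be preserved, so `realloc` is not needed), and is released in `ggml_backend_sched_free`.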
