@@ -664,6 +664,9 @@ struct ggml_backend_sched_split {
     int i_end;
     struct ggml_tensor * inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
     int n_inputs;
+    // contiguous buffer for multiple input tensors
+    struct ggml_tensor * inputs_contiguous_buffer;
+    size_t inputs_contiguous_buffer_size;
     // graph view of this split
     struct ggml_cgraph graph;
 };
@@ -713,6 +716,10 @@ struct ggml_backend_sched {
     char * context_buffer;
     size_t context_buffer_size;
 
+    // host staging buffer for bulk contiguous copies of input tensors
+    void * host_staging_buffer;
+    size_t host_staging_buffer_size;
+
     bool op_offload;
 
     int debug;
@@ -1202,6 +1209,8 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra
             split->backend_id = node_backend_id;
             split->i_start = i;
             split->n_inputs = 0;
+            split->inputs_contiguous_buffer = NULL;
+            split->inputs_contiguous_buffer_size = 0;
             cur_backend_id = node_backend_id;
         }
 
@@ -1317,6 +1326,29 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra
             graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
         }
 
+        // Create and add contiguous buffer to the graph for this split
+        if (split->n_inputs > 0) {
+            // Calculate total size needed for contiguous allocation of all input tensors in a split
+            size_t total_size = 0;
+            for (int i = 0; i < split->n_inputs; i++) {
+                total_size = (total_size + TENSOR_ALIGNMENT - 1) & ~(TENSOR_ALIGNMENT - 1);
+                total_size += ggml_nbytes(split->inputs[i]);
+            }
+            split->inputs_contiguous_buffer_size = total_size;
+
+            // Create a single buffer tensor to hold all input data
+            split->inputs_contiguous_buffer = ggml_new_tensor_1d(sched->ctx, GGML_TYPE_I8, total_size);
+            ggml_format_name(split->inputs_contiguous_buffer, "%s#inputs_contiguous_buffer#%d", ggml_backend_name(sched->backends[split->backend_id]), 0);
+            ggml_set_input(split->inputs_contiguous_buffer);
+            ggml_set_output(split->inputs_contiguous_buffer);
+
+            if (split->inputs_contiguous_buffer != NULL) {
+                assert(graph_copy->size > graph_copy->n_nodes);
+                sched->node_backend_ids[graph_copy->n_nodes] = split->backend_id;
+                graph_copy->nodes[graph_copy->n_nodes++] = split->inputs_contiguous_buffer;
+            }
+        }
+
         for (int j = split->i_start; j < split->i_end; j++) {
             assert(graph_copy->size > graph_copy->n_nodes);
             sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(graph->nodes[j]);
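The `(x + TENSOR_ALIGNMENT - 1) & ~(TENSOR_ALIGNMENT - 1)` expression in the hunk above rounds the running offset up to the next alignment boundary before each input is packed, and the same arithmetic later decides where each input lands inside the contiguous buffer. A minimal standalone sketch of that packing, using made-up input sizes of 100, 60 and 256 bytes and assuming `TENSOR_ALIGNMENT` is 32 as in current ggml, prints offsets 0, 128 and 192 and a total of 448 bytes:

```c
#include <stdio.h>
#include <stddef.h>

#define TENSOR_ALIGNMENT 32  // assumption: matches ggml's TENSOR_ALIGNMENT

// round offset up to the next multiple of alignment (alignment must be a power of two)
static size_t align_up(size_t offset, size_t alignment) {
    return (offset + alignment - 1) & ~(alignment - 1);
}

int main(void) {
    // stand-ins for ggml_nbytes() of three split inputs
    const size_t input_sizes[] = { 100, 60, 256 };
    size_t total_size = 0;

    for (int i = 0; i < 3; i++) {
        size_t offset = align_up(total_size, TENSOR_ALIGNMENT); // where this input starts
        printf("input %d: offset %zu, size %zu\n", i, offset, input_sizes[i]);
        total_size = offset + input_sizes[i];
    }
    printf("contiguous buffer size: %zu bytes\n", total_size); // size given to the buffer tensor
    return 0;
}
```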
@@ -1416,19 +1448,37 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
         ggml_backend_t split_backend = sched->backends[split_backend_id];
 
         // copy the input tensors to the split backend
+
+        bool bulk_copy_used = false;
+        size_t bulk_offset = 0;
+        // Ensure host staging buffer is large enough, reallocate if needed
+        if (sched->host_staging_buffer_size < split->inputs_contiguous_buffer_size) {
+            free(sched->host_staging_buffer);
+            sched->host_staging_buffer = malloc(split->inputs_contiguous_buffer_size);
+            sched->host_staging_buffer_size = split->inputs_contiguous_buffer_size;
+        }
+
         for (int input_id = 0; input_id < split->n_inputs; input_id++) {
             ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[input_id]);
             struct ggml_tensor * input = split->inputs[input_id];
             struct ggml_tensor * input_cpy = tensor_copy(input, split_backend_id, sched->cur_copy);
 
             if (input->flags & GGML_TENSOR_FLAG_INPUT) {
                 // inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
-                if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
-                    ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
-                } else {
-                    ggml_backend_synchronize(split_backend);
-                }
-                ggml_backend_tensor_copy(input, input_cpy);
+                // Bulk copy: accumulate data in host buffer and setup tensor views
+                GGML_ASSERT(split->inputs_contiguous_buffer != NULL && split->inputs_contiguous_buffer->data != NULL);
+                bulk_offset = (bulk_offset + TENSOR_ALIGNMENT - 1) & ~(TENSOR_ALIGNMENT - 1);
+
+                memcpy((char *)sched->host_staging_buffer + bulk_offset, input->data, ggml_nbytes(input));
+
+                // Update tensor_copy to point into contiguous GPU buffer
+                input_cpy->data = (char *)split->inputs_contiguous_buffer->data + bulk_offset;
+                input_cpy->buffer = split->inputs_contiguous_buffer->buffer;
+                input_cpy->view_src = split->inputs_contiguous_buffer;
+                input_cpy->view_offs = bulk_offset;
+
+                bulk_offset += ggml_nbytes(input);
+                bulk_copy_used = true;
             } else {
                 // wait for the split backend to finish using the input before overwriting it
                 if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
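The staging buffer above only ever grows and is reused across splits, so its old contents never need to be preserved. A hedged sketch of that grow-only pattern as a standalone helper (the name `ensure_host_staging_buffer` and the `staging` struct are illustrative, not part of ggml or of this patch); unlike the hunk above, it also checks for allocation failure before committing the new size:

```c
#include <stdbool.h>
#include <stddef.h>
#include <stdlib.h>

// illustrative stand-in for the host_staging_buffer fields added to ggml_backend_sched
struct staging {
    void * buffer;
    size_t size;
};

// grow-only reallocation: keep the buffer if it is already large enough, otherwise
// replace it; old contents are never reused across splits, so nothing is copied over
static bool ensure_host_staging_buffer(struct staging * st, size_t required) {
    if (st->size >= required) {
        return true;
    }
    void * new_buffer = malloc(required);
    if (new_buffer == NULL) {
        return false; // keep the previous buffer and size on failure
    }
    free(st->buffer);
    st->buffer = new_buffer;
    st->size   = required;
    return true;
}
```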
@@ -1527,17 +1577,38 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
                     // TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
                     if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
                         ggml_backend_synchronize(input_backend);
-                        if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
-                            ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
-                        } else {
-                            ggml_backend_synchronize(split_backend);
-                        }
-                        ggml_backend_tensor_copy(input, input_cpy);
+
+                        // Bulk copy: accumulate data in host buffer and setup tensor views
+                        GGML_ASSERT(split->inputs_contiguous_buffer != NULL && split->inputs_contiguous_buffer->data != NULL);
+                        bulk_offset = (bulk_offset + TENSOR_ALIGNMENT - 1) & ~(TENSOR_ALIGNMENT - 1);
+
+                        // Copy tensor data to host buffer
+                        ggml_backend_tensor_get(input, (char *)sched->host_staging_buffer + bulk_offset, 0, ggml_nbytes(input));
+
+                        // Update tensor_copy to point into contiguous GPU buffer
+                        input_cpy->data = (char *)split->inputs_contiguous_buffer->data + bulk_offset;
+                        input_cpy->buffer = split->inputs_contiguous_buffer->buffer;
+                        input_cpy->view_src = split->inputs_contiguous_buffer;
+                        input_cpy->view_offs = bulk_offset;
+
+                        bulk_offset += ggml_nbytes(input);
+                        bulk_copy_used = true;
                     }
                 }
             }
         }
 
+
+        // Finalize bulk copy if it was actually used
+        if (bulk_copy_used) {
+            // Synchronize and perform single bulk copy to GPU
+            if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
+                ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
+            } else {
+                ggml_backend_synchronize(split_backend);
+            }
+            ggml_backend_tensor_set(split->inputs_contiguous_buffer, sched->host_staging_buffer, 0, bulk_offset);
+        }
+
         if (!sched->callback_eval) {
             enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
             if (ec != GGML_STATUS_SUCCESS) {
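Taken together, the two hunks above replace one synchronize-and-copy per input with a gather-into-host-staging loop followed by a single `ggml_backend_tensor_set` on the contiguous buffer. A condensed, hedged sketch of that flow as a free function (the name `stage_and_upload_split_inputs` is illustrative; the real code is inlined in `ggml_backend_sched_compute_splits`, uses `memcpy` for host-resident user inputs, and keeps the event/synchronize handling shown above):

```c
#include "ggml.h"
#include "ggml-backend.h"

#define TENSOR_ALIGNMENT 32  // assumption: matches ggml's internal alignment

// Stage every input of a split into one host buffer at aligned offsets, redirect the
// per-backend copies into the contiguous device tensor, then upload everything at once.
static void stage_and_upload_split_inputs(
        struct ggml_tensor *  contiguous_buffer, // device tensor sized for all inputs
        void               *  host_staging,      // host buffer at least as large
        struct ggml_tensor ** inputs,            // source tensors of the split
        struct ggml_tensor ** input_copies,      // copies that the split's graph actually reads
        int                   n_inputs) {
    size_t offset = 0;
    for (int i = 0; i < n_inputs; i++) {
        offset = (offset + TENSOR_ALIGNMENT - 1) & ~(TENSOR_ALIGNMENT - 1);
        // gather the input into the host staging buffer
        ggml_backend_tensor_get(inputs[i], (char *) host_staging + offset, 0, ggml_nbytes(inputs[i]));
        // turn the copy into a view of the contiguous device buffer
        input_copies[i]->data      = (char *) contiguous_buffer->data + offset;
        input_copies[i]->buffer    = contiguous_buffer->buffer;
        input_copies[i]->view_src  = contiguous_buffer;
        input_copies[i]->view_offs = offset;
        offset += ggml_nbytes(inputs[i]);
    }
    // one host-to-device transfer instead of n_inputs separate copies
    ggml_backend_tensor_set(contiguous_buffer, host_staging, 0, offset);
}
```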
@@ -1622,6 +1693,10 @@ ggml_backend_sched_t ggml_backend_sched_new(
     sched->context_buffer_size = ggml_sched_max_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + ggml_graph_overhead_custom(graph_size, false);
     sched->context_buffer = (char *) malloc(sched->context_buffer_size);
 
+    // initialize reusable host buffer for bulk copies
+    sched->host_staging_buffer = NULL;
+    sched->host_staging_buffer_size = 0;
+
     const int initial_splits_capacity = 16;
     sched->splits = (ggml_backend_sched_split *) calloc(initial_splits_capacity, sizeof(sched->splits[0]));
     sched->splits_capacity = initial_splits_capacity;
@@ -1666,6 +1741,7 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
     free(sched->prev_node_backend_ids);
     free(sched->prev_leaf_backend_ids);
     free(sched->context_buffer);
+    free(sched->host_staging_buffer);
     free(sched->graph.nodes);
     free(sched->graph.leafs);
     free(sched);