@@ -507,7 +507,7 @@ static void ensure_reduce_split_views(const ggml_tensor *tensor) {
         reduce_split_view->buffer = wrapped->buffer;
         reduce_split_view->view_src = wrapped;
         reduce_split_view->view_offs = col_offset * wrapped->nb[0];
-        reduce_split_view->data = wrapped->data + reduce_split_view->view_offs;
+        reduce_split_view->data = (char *)wrapped->data + reduce_split_view->view_offs;
         reduce_split_view->ne[0] = splits.split[j];
 
         col_offset += splits.split[j];
@@ -644,7 +644,7 @@ static void ensure_rejoined(const ggml_tensor *reason, const ggml_tensor * src)
 
 static int memdiff_index(const void *a, const void *b, size_t length) {
     for (size_t i = 0; i < length; ++i) {
-        if (((char *)a)[i] != ((char *)b)[i]) {
+        if (((const char *)a)[i] != ((const char *)b)[i]) {
             return (int)i; // return index of first difference
         }
     }
@@ -757,7 +757,6 @@ static ggml_status reduce_gathered_tensors(ggml_cgraph * backend_graph, int devi
         return GGML_STATUS_SUCCESS;
     }
 
-    auto be = ggml_parallel_backends[device_index];
     ggml_tensor * wrapped = extra->tensors[device_index];
 
     // when reducing a tensor, the actual op (sub or add) is contained in reduce_op_tensors
@@ -781,17 +780,16 @@ static ggml_status reduce_gathered_tensors(ggml_cgraph * backend_graph, int devi
     return GGML_STATUS_SUCCESS;
 }
 
-void set_tensor(ggml_backend_t be, ggml_tensor * tensor, float value) {
+static void set_tensor(ggml_backend_t be, ggml_tensor * tensor, float value) {
     std::unique_ptr<float, decltype(&std::free)> data(static_cast<float *>(std::malloc(ggml_nbytes(tensor))), &std::free);
 
-    for (size_t i = 0; i < ggml_nelements(tensor); i++) {
+    for (int64_t i = 0; i < ggml_nelements(tensor); i++) {
         data.get()[i] = value;
     }
     be->iface.set_tensor_async(be, tensor, data.get(), 0, ggml_nbytes(tensor));
 }
 
 static ggml_tensor* ggml_backend_tp_node_compute_split(int device_index, ggml_tensor * tensor) {
-    auto be = ggml_parallel_backends[device_index];
     auto extra = (ggml_tensor_parallel_extra *)tensor->extra;
 
     auto wrapped = extra->tensors[device_index];
@@ -842,6 +840,7 @@ static void ggml_backend_tp_buffer_compute_graph(ggml_cgraph * cgraph, std::func
 
 static void ggml_backend_tp_buffer_graph_compute_one(struct compute_thread * thread) {
     auto startTime = std::chrono::high_resolution_clock::now();
+    GGML_UNUSED(startTime);
     auto cgraph = thread->cgraph;
 
     struct ggml_init_params params = {
@@ -903,7 +902,7 @@ static void ggml_backend_tp_buffer_graph_compute_one(struct compute_thread * thr
             view_src = view_src->view_src;
         }
         if (!be->iface.cpy_tensor2d_async(be, other_be, view_src, rejoined_tensor_view)) {
-            GGML_ABORT("Failed to copy tensor %s from device %d to device %d\n", tensor->name, device_index, other_device_index);
+            GGML_ABORT("Failed to copy tensor %s from device %d to device %ld\n", tensor->name, device_index, other_device_index);
             // TODO, this is recoverable if something like this is implemented:
             // ggml_backend_tensor2d_copy(view_src, rejoined_tensor_view);
         }
@@ -929,19 +928,18 @@ static void ggml_backend_tp_buffer_graph_compute_one(struct compute_thread * thr
             pending_extra->rejoined[device_index] = true;
         }
         return true;
+        GGML_UNUSED(node_index);
     };
 
     auto compute = [&](int node_index, ggml_tensor * tensor, ggml_tensor_parallel_extra * extra) {
         auto wrapped = ggml_backend_tp_node_compute_split(device_index, tensor);
         if (extra->split_tensors != GGML_TP_SPLIT_VIEW) {
            backend_graph->nodes[backend_graph->n_nodes++] = wrapped;
         }
-        else {
-            int i = 0;
-        }
         extra->computed[device_index] = true;
 
         return true;
+        GGML_UNUSED(node_index);
     };
 
     ggml_backend_tp_buffer_compute_graph(cgraph, gather_pending, compute, flush_compute);
@@ -986,7 +984,6 @@ static void do_init(size_t node_index, ggml_tensor * tensor, ggml_tensor_paralle
     auto create_default_tensors_for = [](ggml_tensor * tensor, ggml_tensor_parallel_extra * extra) {
         extra->split_tensors = GGML_TP_SPLIT_NONE;
         for (size_t j = 0; j < ggml_parallel_devices.size(); j++) {
-            auto dev = ggml_parallel_devices[j];
             auto wrapped = ggml_backend_tp_clone_tensor(tensor);
             extra->tensors[j] = wrapped;
         }
@@ -999,7 +996,6 @@ static void do_init(size_t node_index, ggml_tensor * tensor, ggml_tensor_paralle
     auto create_reduce_tensors = [&]() {
         extra->split_tensors = GGML_TP_SPLIT_REDUCE;
         for (size_t j = 0; j < ggml_parallel_devices.size(); j++) {
-            auto dev = ggml_parallel_devices[j];
             auto wrapped = ggml_backend_tp_clone_tensor(tensor);
             extra->tensors[j] = wrapped;
         }
@@ -1021,7 +1017,6 @@ static void do_init(size_t node_index, ggml_tensor * tensor, ggml_tensor_paralle
         extra->split_tensors = GGML_TP_SPLIT_ROWS;
         auto splits = get_row_splits(dims);
         for (size_t j = 0; j < ggml_parallel_devices.size(); j++) {
-            auto dev = ggml_parallel_devices[j];
             auto wrapped = prepare_wrapped(tensor, dims);
             extra->tensors[j] = wrapped;
 
@@ -1042,7 +1037,6 @@ static void do_init(size_t node_index, ggml_tensor * tensor, ggml_tensor_paralle
         extra->split_tensors = GGML_TP_SPLIT_COLUMNS;
         auto splits = get_col_splits(dims);
         for (size_t j = 0; j < ggml_parallel_devices.size(); j++) {
-            auto dev = ggml_parallel_devices[j];
             auto wrapped = prepare_wrapped(tensor, dims);
             extra->tensors[j] = wrapped;
 
@@ -1064,7 +1058,6 @@ static void do_init(size_t node_index, ggml_tensor * tensor, ggml_tensor_paralle
         extra->split_tensors = GGML_TP_SPLIT_DIM2;
         auto splits = get_dim_splits(dims->ne[2]);
         for (size_t j = 0; j < ggml_parallel_devices.size(); j++) {
-            auto dev = ggml_parallel_devices[j];
             auto wrapped = prepare_wrapped(tensor, dims);
             extra->tensors[j] = wrapped;
 
@@ -1117,15 +1110,9 @@ static void do_init(size_t node_index, ggml_tensor * tensor, ggml_tensor_paralle
             reduce_op->buffer = wrapped->buffer;
             reduce_op->view_src = wrapped;
             reduce_op->view_offs = col_offset * wrapped->nb[0];
-            reduce_op->data = wrapped->data + reduce_op->view_offs;
+            reduce_op->data = (char *)wrapped->data + reduce_op->view_offs;
             reduce_op->ne[0] = splits.split[j];
 
-            // the reduce was rejoined, and the
-            auto reduce = reduce_extra->tensors[j];
-            if (reduce_extra->has_rejoin) {
-                reduce = reduce_extra->rejoined_tensor_views[j][j];
-            }
-
             // create a col split view of the reduced tensor
             ensure_reduce_split_views(reduce_tensor);
 
@@ -1596,9 +1583,6 @@ static void do_init(size_t node_index, ggml_tensor * tensor, ggml_tensor_paralle
         // one split, one not split
         auto split_tensors = src0_split_tensors ? src0_split_tensors : src1_split_tensors;
         if (split_tensors == GGML_TP_SPLIT_COLUMNS) {
-            if (src0_extra->has_rejoin || src1_extra->has_rejoin) {
-                int i = 0;
-            }
             ensure_column_split(src0);
             ensure_column_split(src1);
             create_column_split_tensors();
@@ -1886,7 +1870,7 @@ static enum ggml_status ggml_backend_tp_graph_compute(ggml_backend_t backend, gg
                 continue;
             }
 
-            wrapped->data = wrapped->src[0]->data + wrapped->view_offs;
+            wrapped->data = (char *)wrapped->src[0]->data + wrapped->view_offs;
             wrapped->buffer = wrapped->src[0]->buffer;
         }
     }
@@ -2100,6 +2084,9 @@ static ggml_backend_i ggml_backend_tp_interface = {
     /* .graph_compute      = */ ggml_backend_tp_graph_compute,
     /* .event_record       = */ NULL,
     /* .event_wait         = */ NULL,
+    /* .set_tensor2d_async = */ NULL,
+    /* .get_tensor2d_async = */ NULL,
+    /* .cpy_tensor2d_async = */ NULL,
 };
 
 static ggml_backend_dev_t ggml_backend_tp_reg_get_device(ggml_backend_reg_t reg, size_t index);
@@ -2223,7 +2210,7 @@ static void ensure_weight_column_split(ggml_tensor * weight) {
     std::unique_ptr<char, decltype(&std::free)> data(
         static_cast<char *>(std::malloc(size)), &std::free);
     size_t offset = 0;
-    for (int j = 0; j < ggml_parallel_devices.size(); j++) {
+    for (size_t j = 0; j < ggml_parallel_devices.size(); j++) {
         auto wrapped = extra->tensors[j];
         auto buft = wrapped->buffer;
         auto wrapped_size = ggml_nbytes(wrapped);
@@ -2251,7 +2238,7 @@ static void ensure_weight_column_split(ggml_tensor * weight) {
     auto splits = get_dim_splits(blocks_per_row);
 
     offset = 0;
-    for (int j = 0; j < ggml_parallel_devices.size(); j++) {
+    for (size_t j = 0; j < ggml_parallel_devices.size(); j++) {
         auto wrapped = extra->tensors[j];
         wrapped->ne[0] = splits.split[j] * elements_per_block;
         wrapped->ne[1] = weight->ne[1];
@@ -2352,6 +2339,7 @@ static enum ggml_status ggml_backend_tp_finish_init_tensor(ggml_tensor *tensor)
             GGML_ABORT("ggml_backend_tp_buffer_init_tensor: init_tensor failed for tensor %s\n", tensor->name);
         }
     }
+
     return GGML_STATUS_SUCCESS;
 }
 
@@ -2506,7 +2494,6 @@ static size_t ggml_backend_tp_buffer_type_get_alloc_size(ggml_backend_buffer_typ
     // to get cleanly divisible splits, make sure the allocation alignment is a multiple of the number of devices
     max_alloc_size = ggml_align_size(max_alloc_size, ggml_backend_tp_buffer_type_get_alignment(buft) * ggml_parallel_devices.size());
     return max_alloc_size;
-    // return ggml_nbytes(tensor);
 }
 
 static ggml_backend_buffer_type_i ggml_backend_tp_buffer_type_interface = {
@@ -2549,10 +2536,6 @@ static bool ggml_backend_tp_device_supports_op(ggml_backend_dev_t dev, const str
     GGML_UNUSED(dev);
     GGML_UNUSED(op);
 
-    if (op->op == GGML_OP_MUL_MAT_ID) {
-        return false;
-    }
-
     auto buft = op->buffer ? op->buffer->buft : nullptr;
     if (buft && (!ggml_backend_buft_is_tp_split(buft) && !ggml_backend_buft_is_tp(buft))) {
         return false;