@@ -234,7 +234,7 @@ static size_t ggml_align_size(size_t size, size_t alignment) {
     return (size + alignment - 1) & ~(alignment - 1);
 }

-static ggml_tensor* ggml_backend_tp_clone_tensor(const ggml_tensor * tensor) {
+static ggml_tensor* ggml_backend_tp_clone_tensor(const ggml_tensor * tensor, bool offset_aware = false) {
     ggml_tensor * wrapped = new ggml_tensor();
     ggml_set_name(wrapped, tensor->name);
     wrapped->type = (ggml_type) tensor->type;
@@ -252,6 +252,14 @@ static ggml_tensor* ggml_backend_tp_clone_tensor(const ggml_tensor * tensor) {
     for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) {
         wrapped->op_params[i] = tensor->op_params[i];
     }
+
+    if (tensor->view_offs) {
+        if (!offset_aware) {
+            GGML_ABORT("Tensor %s is a view, cannot clone it.\n", tensor->name);
+        }
+        wrapped->view_offs = tensor->view_offs;
+    }
+
     return wrapped;
 }

@@ -305,6 +313,11 @@ static ggml_status ensure_dim2_split(const ggml_tensor *src) {
     // no actual conversion needs to take place, the split tensors can be
     // created by using offsets within the original tensor.
     auto splits = get_dim_splits(src->ne[2]);
+
+    if (splits.split[0] != splits.split[1]) {
+        GGML_ABORT("Tensor %s is not evenly split across devices, dim2 split requires equal splits.\n", src->name);
+    }
+
     size_t offset = 0;
     for (size_t j = 0; j < ggml_parallel_devices.size(); j++) {
         auto split = ggml_backend_tp_clone_tensor(src);
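The guard added above (and the matching ones in the row/column split and rejoin paths below) requires the per-device splits to be equal. A minimal standalone sketch of that requirement, assuming a two-device ceil/floor split; get_dim_splits_sketch and its split[] layout are illustrative assumptions, not the ggml-tp API:

// Illustrative only: shows why an odd element count trips the equal-split guard.
#include <cstdint>
#include <cstdio>

struct dim_splits_sketch {
    int64_t split[2]; // elements assigned to device 0 and device 1
};

static dim_splits_sketch get_dim_splits_sketch(int64_t n) {
    return { { (n + 1) / 2, n / 2 } }; // ceil/floor split across two devices
}

int main() {
    auto even = get_dim_splits_sketch(64); // {32, 32} -> passes the new check
    auto odd  = get_dim_splits_sketch(65); // {33, 32} -> would hit GGML_ABORT
    std::printf("even %lld/%lld, odd %lld/%lld\n",
                (long long) even.split[0], (long long) even.split[1],
                (long long) odd.split[0],  (long long) odd.split[1]);
    return 0;
}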
@@ -339,6 +352,10 @@ static ggml_status ensure_row_split(const ggml_tensor *src) {
     // no actual conversion needs to take place, the split tensors can be
     // created by using offsets within the original tensor.
     auto splits = get_row_splits(src);
+    if (splits.split[0] != splits.split[1]) {
+        GGML_ABORT("Tensor %s is not evenly split across devices, row split requires equal splits.\n", src->name);
+    }
+
     size_t offset = 0;
     for (size_t j = 0; j < ggml_parallel_devices.size(); j++) {
         auto split = ggml_backend_tp_clone_tensor(src);
@@ -376,6 +393,9 @@ static ggml_status ensure_column_split(const ggml_tensor *src) {
     // unlike the matmult weight tensors which are rejoined column wise, when
     // splitting tensors for unary or arithmetic operations, split them row wise.
     auto splits = get_col_splits(src);
+    if (splits.split[0] != splits.split[1]) {
+        GGML_ABORT("Tensor %s is not evenly split across devices, column split requires equal splits.\n", src->name);
+    }

     size_t offset = 0;
     for (size_t j = 0; j < ggml_parallel_devices.size(); j++) {
@@ -588,6 +608,9 @@ static void ensure_rejoined(const ggml_tensor *reason, const ggml_tensor * src)
     }
     else if (src_extra->split_tensors == GGML_TP_SPLIT_ROWS) {
         auto splits = get_row_splits(src);
+        if (splits.split[0] != splits.split[1]) {
+            GGML_ABORT("Tensor %s is not evenly split across devices, row rejoin requires equal splits.\n", src->name);
+        }
         for (size_t j = 0; j < ggml_parallel_devices.size(); j++) {
             auto rejoined = src_extra->converted_tensors[j];

@@ -617,6 +640,9 @@ static void ensure_rejoined(const ggml_tensor *reason, const ggml_tensor * src)
         // A A B B C C D D
         // A A B B C C D D
         auto splits = get_col_splits(src);
+        if (splits.split[0] != splits.split[1]) {
+            GGML_ABORT("Tensor %s is not evenly split across devices, column rejoin requires equal splits.\n", src->name);
+        }
         for (size_t j = 0; j < ggml_parallel_devices.size(); j++) {
             auto rejoined = src_extra->converted_tensors[j];

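The A A B B C C D D comment above describes rejoining column-split pieces as contiguous column blocks, one block per device. A self-contained sketch of that layout in plain C++ (row-major float buffers; the names and copy loop are illustrative, not the backend's actual rejoin path):

#include <cstddef>
#include <vector>

// Copy each device's column block back into the full matrix at its column offset:
// device 0 fills columns [0, c0), device 1 fills [c0, c0 + c1), and so on.
static void rejoin_columns_sketch(std::vector<float> & dst, size_t rows, size_t cols,
                                  const std::vector<std::vector<float>> & parts,
                                  const std::vector<size_t> & part_cols) {
    size_t col_offset = 0;
    for (size_t d = 0; d < parts.size(); d++) {
        for (size_t r = 0; r < rows; r++) {
            for (size_t c = 0; c < part_cols[d]; c++) {
                dst[r * cols + col_offset + c] = parts[d][r * part_cols[d] + c];
            }
        }
        col_offset += part_cols[d];
    }
}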
@@ -982,7 +1008,7 @@ static void do_init(size_t node_index, ggml_tensor * tensor, ggml_tensor_paralle
     auto create_default_tensors_for = [](ggml_tensor * tensor, ggml_tensor_parallel_extra * extra) {
         extra->split_tensors = GGML_TP_SPLIT_NONE;
         for (size_t j = 0; j < ggml_parallel_devices.size(); j++) {
-            auto wrapped = ggml_backend_tp_clone_tensor(tensor);
+            auto wrapped = ggml_backend_tp_clone_tensor(tensor, true);
             extra->tensors[j] = wrapped;
         }
     };
@@ -999,8 +1025,8 @@ static void do_init(size_t node_index, ggml_tensor * tensor, ggml_tensor_paralle
         }
     };

-    auto prepare_wrapped = [](ggml_tensor * tensor, ggml_tensor * dims) {
-        auto wrapped = ggml_backend_tp_clone_tensor(dims);
+    auto prepare_wrapped = [](ggml_tensor * tensor, ggml_tensor * dims, bool offset_aware = false) {
+        auto wrapped = ggml_backend_tp_clone_tensor(dims, offset_aware);
         if (dims != tensor) {
             wrapped->op = tensor->op;
             for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) {
@@ -1030,12 +1056,12 @@ static void do_init(size_t node_index, ggml_tensor * tensor, ggml_tensor_paralle
         create_row_split_tensors_for(tensor, extra);
     };

-    auto create_column_split_tensors_for = [prepare_wrapped](ggml_tensor * tensor, ggml_tensor_parallel_extra * extra, ggml_tensor * dims = nullptr) {
+    auto create_column_split_tensors_for = [prepare_wrapped](ggml_tensor * tensor, ggml_tensor_parallel_extra * extra, ggml_tensor * dims = nullptr, bool offset_aware = false) {
         dims = dims ? dims : tensor;
         extra->split_tensors = GGML_TP_SPLIT_COLUMNS;
         auto splits = get_col_splits(dims);
         for (size_t j = 0; j < ggml_parallel_devices.size(); j++) {
-            auto wrapped = prepare_wrapped(tensor, dims);
+            auto wrapped = prepare_wrapped(tensor, dims, offset_aware);
             extra->tensors[j] = wrapped;

             // update col count
@@ -1047,8 +1073,8 @@ static void do_init(size_t node_index, ggml_tensor * tensor, ggml_tensor_paralle
         }
     };

-    auto create_column_split_tensors = [&]() {
-        create_column_split_tensors_for(tensor, extra);
+    auto create_column_split_tensors = [&](bool offset_aware = false) {
+        create_column_split_tensors_for(tensor, extra, nullptr, offset_aware);
     };

     auto create_dim2_split_tensors_for = [prepare_wrapped](ggml_tensor * tensor, ggml_tensor_parallel_extra * extra, ggml_tensor * dims = nullptr) {
@@ -1577,19 +1603,44 @@ static void do_init(size_t node_index, ggml_tensor * tensor, ggml_tensor_paralle
             // one split, one not split
             auto split_tensors = src0_split_tensors ? src0_split_tensors : src1_split_tensors;
             if (split_tensors == GGML_TP_SPLIT_COLUMNS) {
-                ensure_column_split(src0);
-                ensure_column_split(src1);
                 create_column_split_tensors();
-                set_src_tensor(0, GGML_TP_SPLIT_COLUMNS);
-                set_src_tensor(1, GGML_TP_SPLIT_COLUMNS);
-                // ensure_rejoined(nullptr, tensor);
+                // this may be a broadcast tensor.
+                if (src0->ne[0] != 1) {
+                    ensure_column_split(src0);
+                    set_src_tensor(0, GGML_TP_SPLIT_COLUMNS);
+                }
+                else {
+                    ensure_rejoined(tensor, src0);
+                    set_src_tensor(0, GGML_TP_SPLIT_NONE);
+                }
+                if (src1->ne[0] != 1) {
+                    ensure_column_split(src1);
+                    set_src_tensor(1, GGML_TP_SPLIT_COLUMNS);
+                }
+                else {
+                    ensure_rejoined(tensor, src1);
+                    set_src_tensor(1, GGML_TP_SPLIT_NONE);
+                }
             }
             else if (split_tensors == GGML_TP_SPLIT_ROWS) {
-                ensure_row_split(src0);
-                ensure_row_split(src1);
                 create_row_split_tensors();
-                set_src_tensor(0, GGML_TP_SPLIT_ROWS);
-                set_src_tensor(1, GGML_TP_SPLIT_ROWS);
+                // this may be a broadcast tensor.
+                if (src0->ne[0] != 1) {
+                    ensure_row_split(src0);
+                    set_src_tensor(0, GGML_TP_SPLIT_ROWS);
+                }
+                else {
+                    ensure_rejoined(tensor, src0);
+                    set_src_tensor(0, GGML_TP_SPLIT_NONE);
+                }
+                if (src1->ne[0] != 1) {
+                    ensure_row_split(src1);
+                    set_src_tensor(1, GGML_TP_SPLIT_ROWS);
+                }
+                else {
+                    ensure_rejoined(tensor, src1);
+                    set_src_tensor(1, GGML_TP_SPLIT_NONE);
+                }
             }
             else {
                 GGML_ABORT("Tensor %s has unsupported op %s for tensor parallelism, src0 is split but src1 is not.\n", tensor->name, ggml_op_name(tensor->op));
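The new ne[0] != 1 checks treat a size-1 source as a broadcast operand: splitting it would leave every device without the data it broadcasts, so it is rejoined and marked GGML_TP_SPLIT_NONE instead. A rough standalone illustration of that per-source decision (the enum and helper below are hypothetical, not the backend's types):

// Hypothetical sketch of the decision made in the new branches above.
enum tp_split_sketch { TP_SPLIT_NONE_SKETCH, TP_SPLIT_COLUMNS_SKETCH, TP_SPLIT_ROWS_SKETCH };

static tp_split_sketch pick_src_split_sketch(long long ne0, tp_split_sketch op_split) {
    // a broadcast operand (leading dimension of 1) stays whole on every device
    return ne0 == 1 ? TP_SPLIT_NONE_SKETCH : op_split;
}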
@@ -1664,10 +1715,11 @@ static void do_init(size_t node_index, ggml_tensor * tensor, ggml_tensor_paralle
             else {
                 // a weight matrix is multiplied by a column split tensor (prior to ROPE), it can be massaged to a column split.
                 // this results in a reduce split.
-                ensure_weight_column_split(src0);
-                create_reduce_tensors();
-                set_src_tensor(0, GGML_TP_SPLIT_COLUMNS);
-                set_src_tensor(1, GGML_TP_SPLIT_COLUMNS);
+                ensure_row_split(src0);
+                ensure_rejoined(tensor, src1);
+                create_column_split_tensors();
+                set_src_tensor(0, GGML_TP_SPLIT_ROWS);
+                set_src_tensor(1, GGML_TP_SPLIT_NONE);
             }
         }
         else if (src0_split_tensors == GGML_TP_SPLIT_COLUMNS && src1_split_tensors == GGML_TP_SPLIT_COLUMNS) {
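The replacement path above splits the weight along its output rows and keeps the activation whole, so each device computes a disjoint slice of the result and no cross-device reduction is required. A self-contained numeric sketch of that partitioning in plain C++ (not ggml calls):

#include <cassert>
#include <vector>

// out[r] = sum_k W[r][k] * x[k]; splitting W by output rows r lets each
// device compute a disjoint slice of `out` from the full input `x`.
static std::vector<float> matvec(const std::vector<std::vector<float>> & W,
                                 const std::vector<float> & x) {
    std::vector<float> out(W.size(), 0.0f);
    for (size_t r = 0; r < W.size(); r++) {
        for (size_t k = 0; k < x.size(); k++) {
            out[r] += W[r][k] * x[k];
        }
    }
    return out;
}

int main() {
    std::vector<std::vector<float>> W = {{1, 2}, {3, 4}, {5, 6}, {7, 8}};
    std::vector<float> x = {1, 1};
    auto full = matvec(W, x);
    // "device 0" takes rows 0-1, "device 1" takes rows 2-3
    auto dev0 = matvec({W[0], W[1]}, x);
    auto dev1 = matvec({W[2], W[3]}, x);
    assert(dev0[0] == full[0] && dev0[1] == full[1]);
    assert(dev1[0] == full[2] && dev1[1] == full[3]);
    return 0;
}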
@@ -1697,15 +1749,16 @@ static void do_init(size_t node_index, ggml_tensor * tensor, ggml_tensor_paralle
         }

         GGML_ASSERT(extra->tensors[0]->src[0]->ne[0] == extra->tensors[0]->src[0]->ne[0] && "Tensor parallel tensors must have the same inner dimension (ne0).");
-        GGML_ASSERT(extra->tensors[0]->ne[0] == extra->tensors[0]->src[0]->ne[1] && "Tensor parallel has incorrect outer dimension (ne0).");

         if (tensor->op == GGML_OP_MUL_MAT_ID) {
             // all experts are split so all GPUs will run a portion of each expert.
             set_src_tensor(2, GGML_TP_SPLIT_NONE);

+            GGML_ASSERT(extra->tensors[0]->ne[0] == extra->tensors[0]->src[0]->ne[1] && "Tensor parallel has incorrect outer dimension (ne0).");
             GGML_ASSERT(extra->tensors[0]->ne[2] == extra->tensors[0]->src[1]->ne[2] && "Tensor parallel has incorrect outer dimension (ne1).");
         }
         else {
+            GGML_ASSERT(extra->tensors[0]->ne[0] == extra->tensors[0]->src[0]->ne[1] && "Tensor parallel has incorrect outer dimension (ne0).");
             GGML_ASSERT(extra->tensors[0]->ne[1] == extra->tensors[0]->src[1]->ne[1] && "Tensor parallel has incorrect outer dimension (ne1).");
         }
         break;
@@ -1794,9 +1847,9 @@ static void do_init(size_t node_index, ggml_tensor * tensor, ggml_tensor_paralle
         }
         else {
             if (src0_split_tensors == GGML_TP_SPLIT_COLUMNS && src0->ne[0] == tensor->ne[0]) {
-                GGML_LOG_WARN("UNUSED CODE PATH VIEW SPLIT COL\n");
+                // GGML_LOG_WARN("UNUSED CODE PATH VIEW SPLIT COL\n");
                 // column split tensor with no change to columns
-                create_column_split_tensors();
+                create_column_split_tensors(true);
                 set_src_tensor(0, GGML_TP_SPLIT_COLUMNS);
             }
             else if (src0_split_tensors == GGML_TP_SPLIT_ROWS && src0->ne[1] == tensor->ne[1]) {
@@ -2265,6 +2318,9 @@ static void ensure_weight_column_split(ggml_tensor * weight) {
     auto blocks_per_row = weight->ne[0] / block_size;
     auto elements_per_block = weight->ne[0] / blocks_per_row;
     auto splits = get_dim_splits(blocks_per_row);
+    if (splits.split[0] != splits.split[1]) {
+        GGML_ABORT("Tensor %s has uneven splits for columns, expected equal splits for all devices.\n", weight->name);
+    }

     offset = 0;
     for (size_t j = 0; j < ggml_parallel_devices.size(); j++) {
@@ -2606,7 +2662,14 @@ static bool ggml_backend_tp_device_supports_op(ggml_backend_dev_t dev, const str

     // using something too small reduces performance due to additional rejoins.
     // return src0->ne[1] >= 2048;
-    return src0->ne[1] >= 4096;
+    if (src0->ne[1] >= 4096)
+        return true;
+    if (src0->ne[1] * src0->ne[2] >= 4096) {
+        if (src0->ne[1] >= 2048)
+            return true;
+        return false;
+    }
+    return false;
     return src0->ne[1] >= 8192;
 }

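For readability, the size gate added above collapses to a single predicate; the sketch below restates it without changing the chosen 4096/2048 thresholds (the helper name is illustrative, not part of the patch):

// Equivalent restatement of the added check: accept large matrices outright,
// or moderately sized ones whose ne[1] * ne[2] volume still amortizes the
// extra rejoins that tensor parallelism introduces.
static bool tp_worth_splitting_sketch(long long ne1, long long ne2) {
    return ne1 >= 4096 || (ne1 >= 2048 && ne1 * ne2 >= 4096);
}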