@@ -1244,7 +1244,7 @@ static void do_init(size_t node_index, ggml_tensor * tensor, ggml_tensor_paralle
12441244 ggml_backend_tp_finish_init_tensor (tensor);
12451245 };
12461246
1247- bool force_rejoin = true ;
1247+ bool force_rejoin = false ;
12481248 if (force_rejoin) {
12491249 for (int i = 0 ; i < GGML_MAX_SRC; i++) {
12501250 auto src = tensor->src [i];
@@ -1710,9 +1710,10 @@ static void do_init(size_t node_index, ggml_tensor * tensor, ggml_tensor_paralle
17101710
17111711 case GGML_OP_CPY: {
17121712 // the src1 is the destination, and has already been created.
1713- // it maybe op NONE or op VIEW. without graph introspection .
1713+ // it may be op NONE or op VIEW. without graph inspection .
17141714 // it is possible to use this cpy op to make the src1 tensor tree
17151715 // split, but this is simpler for now.
1716+ // the min split amount in supports_opt also affects this.
17161717 ensure_init_from_viewsrc (src0, src0_extra);
17171718 ensure_init_from_viewsrc (src1, src1_extra);
17181719 ensure_rejoined (tensor, src0);
@@ -2575,7 +2576,9 @@ static bool ggml_backend_tp_device_supports_op(ggml_backend_dev_t dev, const str
25752576 return true ;
25762577 }
25772578
2578- return src0->ne [1 ] >= 1024 ;
2579+ // using something too small reduces performance due to additional rejoins.
2580+ // return src0->ne[1] >= 2048;
2581+ return src0->ne [1 ] >= 4096 ;
25792582 return src0->ne [1 ] >= 8192 ;
25802583}
25812584
0 commit comments