@@ -911,7 +911,7 @@ static void ggml_backend_tp_buffer_graph_compute_one(struct compute_thread * thr
 
     for (auto & tensor : pending_gathers) {
         // why does this happen?
-        if (tensor->ne[1] == 0) {
+        if (tensor->ne[0] == 0 || tensor->ne[1] == 0 || tensor->ne[2] == 0 || tensor->ne[3] == 0) {
            continue;
         }
 
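A note on the new guard: a ggml tensor with any zero extent holds no data, so there is nothing to gather and the node can be skipped. ggml's ggml_is_empty() helper performs essentially this check; a minimal standalone equivalent (illustrative only, not part of the patch) would be:

#include "ggml.h"

// Illustrative helper (not in the patch): true when any dimension is zero, i.e. the
// tensor contains no elements and can safely be skipped during the gather pass.
static bool tp_tensor_is_empty(const struct ggml_tensor * tensor) {
    for (int i = 0; i < GGML_MAX_DIMS; i++) {
        if (tensor->ne[i] == 0) {
            return true;
        }
    }
    return false;
}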
@@ -1723,12 +1723,19 @@ static void do_init(size_t node_index, ggml_tensor * tensor, ggml_tensor_paralle
             set_src_tensor(1, GGML_TP_SPLIT_NONE);
         }
         else {
+            // TODO: this path is disabled because column splitting quantized weights in MoE models often results in busted splits.
+
             // a weight matrix is multiplied by a column split tensor (prior to ROPE), it can be massaged to a column split.
             // this results in a reduce split.
-            ensure_weight_column_split(src0);
-            create_reduce_tensors();
-            set_src_tensor(0, GGML_TP_SPLIT_COLUMNS);
-            set_src_tensor(1, GGML_TP_SPLIT_COLUMNS);
+            // ensure_weight_column_split(src0);
+            // create_reduce_tensors();
+            // set_src_tensor(0, GGML_TP_SPLIT_COLUMNS);
+            // set_src_tensor(1, GGML_TP_SPLIT_COLUMNS);
+
+            ensure_rejoined(tensor, src1);
+            create_column_split_tensors();
+            set_src_tensor(0, GGML_TP_SPLIT_ROWS);
+            set_src_tensor(1, GGML_TP_SPLIT_NONE);
         }
     }
     else if (src0_split_tensors == GGML_TP_SPLIT_COLUMNS && src1_split_tensors == GGML_TP_SPLIT_COLUMNS) {
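One plausible reason the disabled path produces "busted splits" (an assumption on my part, not stated in the patch): ggml quantized types pack elements into fixed-size blocks along ne[0], so a column split of a quantized weight is only well-formed when every device's slice is a multiple of the type's block size. A hedged sketch of such a validity check, using ggml's ggml_blck_size() and a hypothetical helper name:

#include "ggml.h"

// Hypothetical check (not in the patch): each device's slice of ne[0] must land on a
// quantization block boundary (e.g. 32 elements for Q4_0, 256 for the K-quants),
// otherwise the split would cut through the middle of a quant block.
static bool tp_column_split_is_valid(const struct ggml_tensor * w, const int64_t * cols_per_device, int n_devices) {
    const int64_t blck = ggml_blck_size(w->type);
    for (int i = 0; i < n_devices; i++) {
        if (cols_per_device[i] % blck != 0) {
            return false;
        }
    }
    return true;
}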
@@ -1855,7 +1862,13 @@ static void do_init(size_t node_index, ggml_tensor * tensor, ggml_tensor_paralle
         set_src_tensor(0, GGML_TP_SPLIT_NONE);
     }
     else {
-        if (src0_split_tensors == GGML_TP_SPLIT_COLUMNS && src0->ne[0] == tensor->ne[0]) {
+        // I'm not sure why the following code does not work; will need to investigate.
+        if (true) {
+            ensure_rejoined(tensor, src0);
+            create_default_tensors();
+            set_src_tensor(0, GGML_TP_SPLIT_NONE);
+        }
+        else if (src0_split_tensors == GGML_TP_SPLIT_COLUMNS && src0->ne[0] == tensor->ne[0]) {
             // GGML_LOG_WARN("UNUSED CODE PATH VIEW SPLIT COL\n");
             // column split tensor with no change to columns
             create_column_split_tensors(true);
@@ -2037,29 +2050,39 @@ static void ggml_backend_set_tensor_async_common(ggml_backend_buffer_t buffer, g
 
     // weight matrices used for mul mat are transposed, so split on row
     ggml_split splits = get_row_splits(tensor);
-    size_t cur_row = 0;
-    for (size_t j = 0; j < ggml_parallel_devices.size(); j++) {
-        auto wrapped = extra->tensors[j];
-        auto be = ggml_parallel_backends[j];
+    // MoE weights store one matrix per expert along ne[2]; copy each expert's slice separately
+    for (int n_expert = 0; n_expert < tensor->ne[2]; n_expert++) {
+        size_t data_expert_offset = n_expert * tensor->nb[2];
+        size_t cur_row = 0;
+        for (size_t j = 0; j < ggml_parallel_devices.size(); j++) {
+            auto wrapped = extra->tensors[j];
+            auto be = ggml_parallel_backends[j];
 
-        // the split tensors should have the same alignment as the wrapping tensor, and thus the same stride.
-        if (wrapped->nb[1] != tensor->nb[1]) {
-            GGML_LOG_ERROR("ggml_backend_tp_buffer_set_tensor: wrapped->nb[1] %zu != tensor->nb[1] %zu\n", wrapped->nb[1], tensor->nb[1]);
-            return;
-        }
+            // the split tensors should have the same alignment as the wrapping tensor, and thus the same stride.
+            if (wrapped->nb[1] != tensor->nb[1]) {
+                GGML_LOG_ERROR("ggml_backend_tp_buffer_set_tensor: wrapped->nb[1] %zu != tensor->nb[1] %zu\n", wrapped->nb[1], tensor->nb[1]);
+                return;
+            }
 
-        auto split_offset = cur_row * tensor->nb[1];
-        auto split_size = (size_t) splits.split[j] * tensor->nb[1];
-
-        if (be->iface.set_tensor_async) {
-            be->iface.set_tensor_async(be, wrapped, (const char *) data + split_offset, 0, split_size);
-        }
-        else {
-            auto backend_buffer = ctx->backend_buffers[j];
-            backend_buffer->iface.set_tensor(backend_buffer, wrapped, (const char *) data + split_offset, 0, split_size);
-        }
+            auto data_row_offset = cur_row * tensor->nb[1];
+            auto split_size = (size_t) splits.split[j] * tensor->nb[1];
+            auto split_expert_offset = n_expert * split_size;
+
+            GGML_ASSERT(split_expert_offset == n_expert * wrapped->nb[2]);
+            GGML_ASSERT(wrapped->nb[2] == split_size);
+            GGML_ASSERT(wrapped->ne[1] == splits.split[j]); // confirm split row count
+            GGML_ASSERT(wrapped->ne[2] == tensor->ne[2]); // unless you're splitting experts too
+
+            if (be->iface.set_tensor_async) {
+                be->iface.set_tensor_async(be, wrapped, (const char *) data + data_expert_offset + data_row_offset, split_expert_offset, split_size);
+            }
+            else {
+                auto backend_buffer = ctx->backend_buffers[j];
+                backend_buffer->iface.set_tensor(backend_buffer, wrapped, (const char *) data + data_expert_offset + data_row_offset, split_expert_offset, split_size);
+            }
 
-        cur_row += splits.split[j];
+            cur_row += splits.split[j];
+        }
     }
 }
 
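To make the indexing in the hunk above easier to follow: for a row-split 3D weight with one matrix per expert, device j's slice of expert e starts at e * nb[2] + row_start_j * nb[1] bytes into the source data and lands at e * slice_bytes_j in device j's split tensor. A standalone model of that arithmetic (hypothetical names, not the backend code):

#include <cstdint>
#include <cstdio>

// Hypothetical standalone model of the offset math in the loop above.
// rows_per_dev[j] is the number of rows assigned to device j; nb1/nb2 are the
// byte strides of the full (unsplit) tensor for dims 1 and 2.
struct tp_slice {
    uint64_t src_offset; // data_expert_offset + data_row_offset
    uint64_t dst_offset; // split_expert_offset in the per-device tensor
    uint64_t size;       // split_size
};

static tp_slice tp_row_split_slice(int expert, int dev, const int64_t * rows_per_dev,
                                   uint64_t nb1, uint64_t nb2) {
    int64_t row_start = 0;
    for (int j = 0; j < dev; j++) {
        row_start += rows_per_dev[j]; // same role as cur_row
    }
    const uint64_t size = (uint64_t) rows_per_dev[dev] * nb1;
    return { (uint64_t) expert * nb2 + (uint64_t) row_start * nb1,
             (uint64_t) expert * size,
             size };
}

int main() {
    // example: 2 experts, 8192 rows split 4096/4096 across two devices, 4 KiB row stride
    const int64_t rows[2] = { 4096, 4096 };
    const uint64_t nb1 = 4096, nb2 = 8192ull * nb1;
    for (int e = 0; e < 2; e++) {
        for (int d = 0; d < 2; d++) {
            const tp_slice s = tp_row_split_slice(e, d, rows, nb1, nb2);
            std::printf("expert %d dev %d: src=%llu dst=%llu size=%llu\n", e, d,
                        (unsigned long long) s.src_offset,
                        (unsigned long long) s.dst_offset,
                        (unsigned long long) s.size);
        }
    }
    return 0;
}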
@@ -2420,7 +2443,7 @@ static enum ggml_status ggml_backend_tp_finish_init_tensor(ggml_tensor *tensor)
         auto view_src = view_src_extra->tensors[j];
         auto rem = tensor->view_offs % alignment;
         auto view_offs = tensor->view_offs / alignment * device_alignment + rem;
-        wrapped->data = (char *) view_src->data + view_offs;
+        wrapped->data = (char *) view_src->data + wrapped->view_offs;
         wrapped->view_src = view_src;
         wrapped->view_offs = view_offs;
         if (wrapped->view_src == NULL) {
@@ -2490,6 +2513,13 @@ static enum ggml_status ggml_backend_tp_buffer_init_tensor(ggml_backend_buffer_t
 
 static void ggml_backend_tp_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     ggml_backend_set_tensor_async_common(buffer, tensor, data, offset, size);
+
+    for (size_t j = 0; j < ggml_parallel_devices.size(); j++) {
+        auto be = ggml_parallel_backends[j];
+        if (be->iface.set_tensor_async) {
+            ggml_backend_synchronize(be);
+        }
+    }
 }
 
 static void ggml_backend_tp_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
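The added synchronize loop matches the ggml-backend convention that the buffer's set_tensor entry point is blocking: the caller may reuse or free the data pointer as soon as it returns, while set_tensor_async only guarantees completion after a later ggml_backend_synchronize. An illustrative caller pattern (hypothetical helper, not from the patch) that would break without the loop:

#include "ggml.h"
#include "ggml-backend.h"
#include <cstdint>
#include <functional>
#include <vector>

// Illustrative caller (hypothetical): weights are staged in a temporary buffer that is
// destroyed right after the upload, so the copy performed by ggml_backend_tensor_set()
// must be complete when it returns. If the tensor-parallel buffer only queued async
// copies, the backends could still be reading 'staging' after it is freed -- hence the
// per-backend synchronize in the hunk above.
static void upload_from_temporary(ggml_tensor * tensor,
                                  const std::function<void(void * dst, size_t size)> & fill) {
    std::vector<uint8_t> staging(ggml_nbytes(tensor));
    fill(staging.data(), staging.size());                                // e.g. read from disk
    ggml_backend_tensor_set(tensor, staging.data(), 0, staging.size());  // blocking copy
}   // 'staging' is freed here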
@@ -2673,10 +2703,9 @@ static bool ggml_backend_tp_device_supports_op(ggml_backend_dev_t dev, const str
         // return src0->ne[1] >= 2048;
         if (src0->ne[1] >= 4096)
             return true;
+        // MoE: count rows across all experts (ne[2]) toward the threshold
         if (src0->ne[1] * src0->ne[2] >= 4096) {
-            if (src0->ne[1] >= 1024)
-                return true;
-            return false;
+            return true;
         }
         return false;
         return src0->ne[1] >= 8192;
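Restating the adjusted threshold as a standalone predicate (hypothetical helper, not the backend code): a weight is routed to the tensor-parallel path when it has at least 4096 rows, or, for MoE weights, when the rows summed across all experts (ne[1] * ne[2]) reach 4096. For example, an MoE weight with 768 rows per expert and 8 experts now qualifies (768 * 8 = 6144), whereas the removed ne[1] >= 1024 check rejected it.

#include <cstdint>

// Hypothetical restatement of the size heuristic above: count every expert's rows
// toward the threshold so MoE weights with small per-expert matrices still qualify.
static bool tp_weight_is_large_enough(int64_t ne1, int64_t ne2) {
    if (ne1 >= 4096) {
        return true;           // dense weight with many rows
    }
    return ne1 * ne2 >= 4096;  // MoE: ne2 experts, ne1 rows each
}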