
Commit 83d172a

committed: moe fixes
1 parent 9ec98cd commit 83d172a

File tree

1 file changed: +59 −30 lines


ggml/src/ggml-tp/ggml-tp.cpp

Lines changed: 59 additions & 30 deletions
@@ -911,7 +911,7 @@ static void ggml_backend_tp_buffer_graph_compute_one(struct compute_thread * thr
 
     for (auto & tensor : pending_gathers) {
         // why does this happen?
-        if (tensor->ne[1] == 0) {
+        if (tensor->ne[0] == 0 || tensor->ne[1] == 0 || tensor->ne[2] == 0 || tensor->ne[3] == 0) {
             continue;
         }
 
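Note on this hunk: the old guard only skipped tensors with ne[1] == 0, but a tensor is empty whenever any of its four dimensions is zero. A minimal standalone sketch of that invariant (hypothetical helper, not part of this commit):

#include <cstdint>
#include <cstdio>

// ggml tensors carry four dimensions in ne[]; the element count is their
// product, so a single zero dimension means there is nothing to gather.
static bool tensor_is_empty(const int64_t ne[4]) {
    for (int i = 0; i < 4; i++) {
        if (ne[i] == 0) {
            return true;
        }
    }
    return false;
}

int main() {
    const int64_t ne[4] = {4096, 0, 1, 1}; // e.g. a gather that received no rows
    printf("empty: %d\n", tensor_is_empty(ne)); // prints "empty: 1"
    return 0;
}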
@@ -1723,12 +1723,19 @@ static void do_init(size_t node_index, ggml_tensor * tensor, ggml_tensor_paralle
             set_src_tensor(1, GGML_TP_SPLIT_NONE);
         }
         else {
+            // TODO: this path is disabled because column splitting quantized weights in MoE models often results in busted splits.
+
             // a weight matrix is multiplied by a column split tensor (prior to ROPE), it can be massaged to a column split.
             // this results in a reduce split.
-            ensure_weight_column_split(src0);
-            create_reduce_tensors();
-            set_src_tensor(0, GGML_TP_SPLIT_COLUMNS);
-            set_src_tensor(1, GGML_TP_SPLIT_COLUMNS);
+            // ensure_weight_column_split(src0);
+            // create_reduce_tensors();
+            // set_src_tensor(0, GGML_TP_SPLIT_COLUMNS);
+            // set_src_tensor(1, GGML_TP_SPLIT_COLUMNS);
+
+            ensure_rejoined(tensor, src1);
+            create_column_split_tensors();
+            set_src_tensor(0, GGML_TP_SPLIT_ROWS);
+            set_src_tensor(1, GGML_TP_SPLIT_NONE);
         }
     }
     else if (src0_split_tensors == GGML_TP_SPLIT_COLUMNS && src1_split_tensors == GGML_TP_SPLIT_COLUMNS) {
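Why column splitting can bust quantized weights (background for the TODO above; this is an assumption, not something the commit states): ggml quantized types pack each row into fixed-size blocks, e.g. 32 values per block for Q4_0, so a column split is only representable when every device's share of the columns is a whole number of blocks. A standalone sketch with illustrative numbers:

#include <cstdint>
#include <cstdio>

// A column split of a quantized weight is valid only if each device receives
// an exact multiple of the quantization block size.
static bool column_split_is_aligned(int64_t n_cols, int64_t n_devices, int64_t block_size) {
    if (n_cols % n_devices != 0) {
        return false; // uneven split: some device would cut through a block
    }
    return (n_cols / n_devices) % block_size == 0;
}

int main() {
    printf("%d\n", column_split_is_aligned(2880, 3, 32)); // 960 columns each -> 1 (ok)
    printf("%d\n", column_split_is_aligned(2880, 7, 32)); // uneven split     -> 0 (busted)
    return 0;
}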
@@ -1855,7 +1862,13 @@ static void do_init(size_t node_index, ggml_tensor * tensor, ggml_tensor_paralle
             set_src_tensor(0, GGML_TP_SPLIT_NONE);
         }
         else {
-            if (src0_split_tensors == GGML_TP_SPLIT_COLUMNS && src0->ne[0] == tensor->ne[0]) {
+            // i'm not sure why the following code does not work, will need to investigate.
+            if (true) {
+                ensure_rejoined(tensor, src0);
+                create_default_tensors();
+                set_src_tensor(0, GGML_TP_SPLIT_NONE);
+            }
+            else if (src0_split_tensors == GGML_TP_SPLIT_COLUMNS && src0->ne[0] == tensor->ne[0]) {
                 // GGML_LOG_WARN("UNUSED CODE PATH VIEW SPLIT COL\n");
                 // column split tensor with no change to columns
                 create_column_split_tensors(true);
@@ -2037,29 +2050,39 @@ static void ggml_backend_set_tensor_async_common(ggml_backend_buffer_t buffer, g
 
     // weight matrices used for mul mat are transposed, so split on row
     ggml_split splits = get_row_splits(tensor);
-    size_t cur_row = 0;
-    for (size_t j = 0; j < ggml_parallel_devices.size(); j++) {
-        auto wrapped = extra->tensors[j];
-        auto be = ggml_parallel_backends[j];
+    for (int n_expert = 0; n_expert < tensor->ne[2]; n_expert++) {
+        size_t data_expert_offset = n_expert * tensor->nb[2];
+        // weight matrix in moe models will have multiple sequences ne[2], one per expert
+        size_t cur_row = 0;
+        for (size_t j = 0; j < ggml_parallel_devices.size(); j++) {
+            auto wrapped = extra->tensors[j];
+            auto be = ggml_parallel_backends[j];
 
-        // the split tensors should have the same alignment as the wrapping tensor, and thus the same stride.
-        if (wrapped->nb[1] != tensor->nb[1]) {
-            GGML_LOG_ERROR("ggml_backend_tp_buffer_set_tensor: wrapped->nb[1] %zu != tensor->nb[1] %zu\n", wrapped->nb[1], tensor->nb[1]);
-            return;
-        }
+            // the split tensors should have the same alignment as the wrapping tensor, and thus the same stride.
+            if (wrapped->nb[1] != tensor->nb[1]) {
+                GGML_LOG_ERROR("ggml_backend_tp_buffer_set_tensor: wrapped->nb[1] %zu != tensor->nb[1] %zu\n", wrapped->nb[1], tensor->nb[1]);
+                return;
+            }
 
-        auto split_offset = cur_row * tensor->nb[1];
-        auto split_size = (size_t) splits.split[j] * tensor->nb[1];
-
-        if (be->iface.set_tensor_async) {
-            be->iface.set_tensor_async(be, wrapped, (const char *) data + split_offset, 0, split_size);
-        }
-        else {
-            auto backend_buffer = ctx->backend_buffers[j];
-            backend_buffer->iface.set_tensor(backend_buffer, wrapped, (const char *) data + split_offset, 0, split_size);
-        }
+            auto data_row_offset = cur_row * tensor->nb[1];
+            auto split_size = (size_t) splits.split[j] * tensor->nb[1];
+            auto split_expert_offset = n_expert * split_size;
+
+            GGML_ASSERT(split_expert_offset == n_expert * wrapped->nb[2]);
+            GGML_ASSERT(wrapped->nb[2] == split_size);
+            GGML_ASSERT(wrapped->ne[1] == splits.split[j]); // confirm split row count
+            GGML_ASSERT(wrapped->ne[2] == tensor->ne[2]); // unless you're splitting experts too
+
+            if (be->iface.set_tensor_async) {
+                be->iface.set_tensor_async(be, wrapped, (const char *) data + data_expert_offset + data_row_offset, split_expert_offset, split_size);
+            }
+            else {
+                auto backend_buffer = ctx->backend_buffers[j];
+                backend_buffer->iface.set_tensor(backend_buffer, wrapped, (const char *) data + data_expert_offset + data_row_offset, split_expert_offset, split_size);
+            }
 
-        cur_row += splits.split[j];
+            cur_row += splits.split[j];
+        }
     }
 }
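The rewritten loop walks experts along ne[2]: the read position advances by tensor->nb[2] per expert plus cur_row rows, while each device shard packs its expert slices back to back, so the write offset is simply n_expert * split_size. A standalone sketch of the arithmetic with made-up sizes:

#include <cstddef>
#include <cstdio>

int main() {
    const size_t row_bytes  = 4096;                   // tensor->nb[1]: bytes per row
    const size_t n_rows     = 1024;                   // tensor->ne[1]: rows per expert
    const size_t expert_nb2 = n_rows * row_bytes;     // tensor->nb[2]: bytes per expert
    const size_t cur_row    = 512;                    // first row owned by this device
    const size_t split_rows = 256;                    // rows owned by this device
    const size_t split_size = split_rows * row_bytes; // == wrapped->nb[2] above

    for (int n_expert = 0; n_expert < 2; n_expert++) {
        size_t src = n_expert * expert_nb2 + cur_row * row_bytes; // read: skip experts, then rows
        size_t dst = n_expert * split_size;                       // write: shard packs experts densely
        printf("expert %d: src=%zu dst=%zu len=%zu\n", n_expert, src, dst, split_size);
    }
    return 0;
}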

@@ -2420,7 +2443,7 @@ static enum ggml_status ggml_backend_tp_finish_init_tensor(ggml_tensor *tensor)
             auto view_src = view_src_extra->tensors[j];
             auto rem = tensor->view_offs % alignment;
             auto view_offs = tensor->view_offs / alignment * device_alignment + rem;
-            wrapped->data = (char *) view_src->data + view_offs;
+            wrapped->data = (char *) view_src->data + wrapped->view_offs;
             wrapped->view_src = view_src;
             wrapped->view_offs = view_offs;
             if (wrapped->view_src == NULL) {
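For the offset remapping in this hunk: the wrapper buffer and the device buffers may use different alignments, so the view offset is decomposed into whole alignment units (rescaled to the device alignment) plus the sub-unit remainder. A worked example with assumed alignment values:

#include <cstddef>
#include <cstdio>

int main() {
    const size_t alignment        = 32;   // assumed TP wrapper alignment
    const size_t device_alignment = 128;  // assumed device buffer alignment
    const size_t host_view_offs   = 100;  // tensor->view_offs

    size_t rem       = host_view_offs % alignment;                          // 100 % 32 = 4
    size_t view_offs = host_view_offs / alignment * device_alignment + rem; // 3 * 128 + 4 = 388
    printf("device view_offs = %zu\n", view_offs);                          // prints 388
    return 0;
}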
@@ -2490,6 +2513,13 @@ static enum ggml_status ggml_backend_tp_buffer_init_tensor(ggml_backend_buffer_t
 
 static void ggml_backend_tp_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     ggml_backend_set_tensor_async_common(buffer, tensor, data, offset, size);
+
+    for (size_t j = 0; j < ggml_parallel_devices.size(); j++) {
+        auto be = ggml_parallel_backends[j];
+        if (be->iface.set_tensor_async) {
+            ggml_backend_synchronize(be);
+        }
+    }
 }
 
 static void ggml_backend_tp_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
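The added loop enforces the synchronous contract of buffer set_tensor: the caller may reuse or free `data` as soon as it returns, so any copies that ggml_backend_set_tensor_async_common dispatched asynchronously must be drained first. A minimal sketch of that contract (hypothetical helper, relying on ggml's backend-impl interface):

#include "ggml-backend-impl.h"

// A synchronous upload built from an async backend copy plus a barrier.
static void set_tensor_blocking(ggml_backend_t be, struct ggml_tensor * dst, const void * data, size_t size) {
    be->iface.set_tensor_async(be, dst, data, 0, size); // copy may still be in flight here
    ggml_backend_synchronize(be);                       // after this, `data` is safe to free
}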
@@ -2673,10 +2703,9 @@ static bool ggml_backend_tp_device_supports_op(ggml_backend_dev_t dev, const str
     // return src0->ne[1] >= 2048;
     if (src0->ne[1] >= 4096)
         return true;
+    // moe
     if (src0->ne[1] * src0->ne[2] >= 4096) {
-        if (src0->ne[1] >= 1024)
-            return true;
-        return false;
+        return true;
     }
     return false;
     return src0->ne[1] >= 8192;
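The heuristic change in this last hunk: previously a MoE weight also needed at least 1024 rows per expert; now the aggregate row count across experts is sufficient. A quick standalone check with illustrative shapes:

#include <cstdint>
#include <cstdio>

static bool tp_supports_mul_mat(int64_t ne1, int64_t ne2) {
    if (ne1 >= 4096) return true;       // dense weight, large enough on its own
    if (ne1 * ne2 >= 4096) return true; // moe: rows per expert times expert count
    return false;
}

int main() {
    printf("%d\n", tp_supports_mul_mat(768, 8)); // 6144 aggregate rows -> 1 (now accepted)
    printf("%d\n", tp_supports_mul_mat(768, 1)); // dense, 768 rows     -> 0
    return 0;
}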
