Commit 842e510

asserts
1 parent 7d76980 commit 842e510

2 files changed, +89 -26 lines changed

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 1 addition & 1 deletion
@@ -560,7 +560,7 @@ static enum ggml_status ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer
 
         if (padded_size > original_size) {
             ggml_cuda_set_device(ctx->device);
-            CUDA_CHECK(cudaMemset((char *)tensor->data + original_size, 0, padded_size - original_size));
+            // CUDA_CHECK(cudaMemset((char *)tensor->data + original_size, 0, padded_size - original_size));
         }
     }
     return GGML_STATUS_SUCCESS;
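
For context on what the disabled line did: this path zeroes the slack between a tensor's logical byte size and its padded allocation, so reads into the padding see zeros rather than uninitialized memory. A minimal standalone sketch of the same cudaMemset pattern, with made-up sizes (not ggml's values):

    #include <cuda_runtime.h>
    #include <cstdio>

    int main() {
        const size_t original_size = 1000; // bytes the tensor actually occupies
        const size_t padded_size   = 1024; // bytes allocated after padding
        char * data = nullptr;
        if (cudaMalloc((void **) &data, padded_size) != cudaSuccess) {
            return 1;
        }
        // zero only the padding tail, the same call the commented-out line made
        cudaMemset(data + original_size, 0, padded_size - original_size);
        cudaFree(data);
        printf("zeroed %zu padding bytes\n", padded_size - original_size);
        return 0;
    }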

ggml/src/ggml-tp/ggml-tp.cpp

Lines changed: 88 additions & 25 deletions
@@ -234,7 +234,7 @@ static size_t ggml_align_size(size_t size, size_t alignment) {
     return (size + alignment - 1) & ~(alignment - 1);
 }
 
-static ggml_tensor* ggml_backend_tp_clone_tensor(const ggml_tensor * tensor) {
+static ggml_tensor* ggml_backend_tp_clone_tensor(const ggml_tensor * tensor, bool offset_aware = false) {
     ggml_tensor * wrapped = new ggml_tensor();
     ggml_set_name(wrapped, tensor->name);
     wrapped->type = (ggml_type) tensor->type;
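
The ggml_align_size context line above uses the standard round-up bit trick, which is valid only when alignment is a power of two. A quick self-contained check of that behavior:

    #include <cassert>
    #include <cstddef>

    // same formula as ggml_align_size; power-of-two alignments only
    static size_t align_size(size_t size, size_t alignment) {
        return (size + alignment - 1) & ~(alignment - 1);
    }

    int main() {
        assert(align_size(1000, 256) == 1024); // rounds up to the next multiple
        assert(align_size(1024, 256) == 1024); // already aligned, unchanged
        return 0;
    }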
@@ -252,6 +252,14 @@ static ggml_tensor* ggml_backend_tp_clone_tensor(const ggml_tensor * tensor) {
     for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) {
         wrapped->op_params[i] = tensor->op_params[i];
     }
+
+    if (tensor->view_offs) {
+        if (!offset_aware) {
+            GGML_ABORT("Tensor %s is a view, cannot clone it.\n", tensor->name);
+        }
+        wrapped->view_offs = tensor->view_offs;
+    }
+
     return wrapped;
 }
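The new offset_aware flag makes the clone either carry a view's byte offset along or abort loudly. Without it, a clone that dropped view_offs would alias the start of the parent buffer instead of the viewed region. A hedged sketch of the guard on a simplified stand-in struct (not ggml's real types):

    #include <cstdio>
    #include <cstdlib>

    struct toy_tensor {
        const char * name;
        size_t view_offs; // byte offset into the parent buffer; 0 if not a view
    };

    static toy_tensor clone_tensor(const toy_tensor & t, bool offset_aware = false) {
        toy_tensor wrapped = { t.name, 0 };
        if (t.view_offs) {
            if (!offset_aware) {
                fprintf(stderr, "Tensor %s is a view, cannot clone it.\n", t.name);
                abort();
            }
            wrapped.view_offs = t.view_offs; // keep pointing at the viewed region
        }
        return wrapped;
    }

    int main() {
        toy_tensor view = { "kv_view", 4096 };
        toy_tensor ok = clone_tensor(view, /*offset_aware=*/true);
        printf("%s keeps offset %zu\n", ok.name, ok.view_offs);
        return 0;
    }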

@@ -305,6 +313,11 @@ static ggml_status ensure_dim2_split(const ggml_tensor *src) {
     // no actual conversion needs to take place, the split tensors can be
     // created by using offsets within the original tensor.
     auto splits = get_dim_splits(src->ne[2]);
+
+    if (splits.split[0] != splits.split[1]) {
+        GGML_ABORT("Tensor %s is not evenly split across devices, dim2 split requires equal splits.\n", src->name);
+    }
+
     size_t offset = 0;
     for (size_t j = 0; j < ggml_parallel_devices.size(); j++) {
         auto split = ggml_backend_tp_clone_tensor(src);
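
All of the asserts added in this file share one shape: compute the per-device splits, then require the first two shares to be equal, because the offset-based sharding that follows assumes uniform shard sizes. get_dim_splits is internal to ggml-tp.cpp; a minimal sketch of the pattern these asserts guard, assuming a hypothetical divide-with-remainder split over two devices:

    #include <cstdint>
    #include <cstdio>
    #include <cstdlib>
    #include <vector>

    // hypothetical stand-in for get_dim_splits: divide n elements over devices
    static std::vector<int64_t> dim_splits(int64_t n, size_t n_devices) {
        std::vector<int64_t> split(n_devices, n / n_devices);
        split.back() += n % n_devices; // any remainder lands on the last device
        return split;
    }

    int main() {
        auto splits = dim_splits(7, 2); // 7 over 2 devices -> {3, 4}: uneven
        if (splits[0] != splits[1]) {
            fprintf(stderr, "uneven split (%lld vs %lld), offset sharding unsafe\n",
                    (long long) splits[0], (long long) splits[1]);
            exit(1);
        }
        return 0;
    }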
@@ -339,6 +352,10 @@ static ggml_status ensure_row_split(const ggml_tensor *src) {
     // no actual conversion needs to take place, the split tensors can be
     // created by using offsets within the original tensor.
     auto splits = get_row_splits(src);
+    if (splits.split[0] != splits.split[1]) {
+        GGML_ABORT("Tensor %s is not evenly split across devices, row split requires equal splits.\n", src->name);
+    }
+
     size_t offset = 0;
     for (size_t j = 0; j < ggml_parallel_devices.size(); j++) {
         auto split = ggml_backend_tp_clone_tensor(src);
@@ -376,6 +393,9 @@ static ggml_status ensure_column_split(const ggml_tensor *src) {
     // unlike the matmult weight tensors which are rejoined column wise, when
     // splitting tensors for unary or arithmetic operations, split them row wise.
     auto splits = get_col_splits(src);
+    if (splits.split[0] != splits.split[1]) {
+        GGML_ABORT("Tensor %s is not evenly split across devices, column split requires equal splits.\n", src->name);
+    }
 
     size_t offset = 0;
     for (size_t j = 0; j < ggml_parallel_devices.size(); j++) {
@@ -588,6 +608,9 @@ static void ensure_rejoined(const ggml_tensor *reason, const ggml_tensor * src)
     }
     else if (src_extra->split_tensors == GGML_TP_SPLIT_ROWS) {
         auto splits = get_row_splits(src);
+        if (splits.split[0] != splits.split[1]) {
+            GGML_ABORT("Tensor %s is not evenly split across devices, row rejoin requires equal splits.\n", src->name);
+        }
         for (size_t j = 0; j < ggml_parallel_devices.size(); j++) {
             auto rejoined = src_extra->converted_tensors[j];
 
@@ -617,6 +640,9 @@ static void ensure_rejoined(const ggml_tensor *reason, const ggml_tensor * src)
     // A A B B C C D D
     // A A B B C C D D
     auto splits = get_col_splits(src);
+    if (splits.split[0] != splits.split[1]) {
+        GGML_ABORT("Tensor %s is not evenly split across devices, column rejoin requires equal splits.\n", src->name);
+    }
     for (size_t j = 0; j < ggml_parallel_devices.size(); j++) {
         auto rejoined = src_extra->converted_tensors[j];
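The A A B B C C D D diagram above shows why a column rejoin is strided: each device's shard contributes one contiguous run of columns to every row, so the rejoin needs one copy per device per row rather than one copy per device. A sketch under that reading, assuming equal shard widths and row-major storage:

    #include <cstdio>
    #include <cstring>
    #include <vector>

    int main() {
        const size_t n_devices = 2, rows = 2, cols_per_dev = 4;
        const size_t total_cols = n_devices * cols_per_dev;

        // shard[j] holds only its own columns, row-major
        std::vector<std::vector<float>> shard(n_devices,
                std::vector<float>(rows * cols_per_dev));
        for (size_t j = 0; j < n_devices; j++) {
            for (float & v : shard[j]) v = (float) j; // device 0 -> A, 1 -> B
        }

        std::vector<float> joined(rows * total_cols);
        for (size_t r = 0; r < rows; r++) {
            for (size_t j = 0; j < n_devices; j++) {
                // shard j owns columns [j*cols_per_dev, (j+1)*cols_per_dev)
                memcpy(&joined[r * total_cols + j * cols_per_dev],
                       &shard[j][r * cols_per_dev],
                       cols_per_dev * sizeof(float));
            }
        }
        for (float v : joined) printf("%.0f ", v); // 0 0 0 0 1 1 1 1, twice
        printf("\n");
        return 0;
    }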

@@ -982,7 +1008,7 @@ static void do_init(size_t node_index, ggml_tensor * tensor, ggml_tensor_paralle
     auto create_default_tensors_for = [](ggml_tensor * tensor, ggml_tensor_parallel_extra * extra) {
         extra->split_tensors = GGML_TP_SPLIT_NONE;
         for (size_t j = 0; j < ggml_parallel_devices.size(); j++) {
-            auto wrapped = ggml_backend_tp_clone_tensor(tensor);
+            auto wrapped = ggml_backend_tp_clone_tensor(tensor, true);
             extra->tensors[j] = wrapped;
         }
     };
@@ -999,8 +1025,8 @@ static void do_init(size_t node_index, ggml_tensor * tensor, ggml_tensor_paralle
         }
     };
 
-    auto prepare_wrapped = [](ggml_tensor * tensor, ggml_tensor * dims) {
-        auto wrapped = ggml_backend_tp_clone_tensor(dims);
+    auto prepare_wrapped = [](ggml_tensor * tensor, ggml_tensor * dims, bool offset_aware = false) {
+        auto wrapped = ggml_backend_tp_clone_tensor(dims, offset_aware);
         if (dims != tensor) {
             wrapped->op = tensor->op;
             for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) {
@@ -1030,12 +1056,12 @@ static void do_init(size_t node_index, ggml_tensor * tensor, ggml_tensor_paralle
         create_row_split_tensors_for(tensor, extra);
     };
 
-    auto create_column_split_tensors_for = [prepare_wrapped](ggml_tensor * tensor, ggml_tensor_parallel_extra * extra, ggml_tensor * dims = nullptr) {
+    auto create_column_split_tensors_for = [prepare_wrapped](ggml_tensor * tensor, ggml_tensor_parallel_extra * extra, ggml_tensor * dims = nullptr, bool offset_aware = false) {
         dims = dims ? dims : tensor;
         extra->split_tensors = GGML_TP_SPLIT_COLUMNS;
         auto splits = get_col_splits(dims);
         for (size_t j = 0; j < ggml_parallel_devices.size(); j++) {
-            auto wrapped = prepare_wrapped(tensor, dims);
+            auto wrapped = prepare_wrapped(tensor, dims, offset_aware);
             extra->tensors[j] = wrapped;
 
             // update col count
@@ -1047,8 +1073,8 @@ static void do_init(size_t node_index, ggml_tensor * tensor, ggml_tensor_paralle
         }
     };
 
-    auto create_column_split_tensors = [&]() {
-        create_column_split_tensors_for(tensor, extra);
+    auto create_column_split_tensors = [&](bool offset_aware = false) {
+        create_column_split_tensors_for(tensor, extra, nullptr, offset_aware);
     };
 
     auto create_dim2_split_tensors_for = [prepare_wrapped](ggml_tensor * tensor, ggml_tensor_parallel_extra * extra, ggml_tensor * dims = nullptr) {
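
These lambdas thread the new offset_aware flag through with defaulted parameters, which C++ permits in lambda parameter lists just as in ordinary functions; every existing call site compiles unchanged while new call sites can opt in. A minimal illustration of that forwarding pattern, with hypothetical names:

    #include <cstdio>

    int main() {
        auto inner = [](const char * name, bool offset_aware = false) {
            printf("%s offset_aware=%d\n", name, offset_aware);
        };
        // the outer lambda forwards its own defaulted flag to the inner one
        auto outer = [&](bool offset_aware = false) {
            inner("tensor", offset_aware);
        };
        outer();     // old call sites keep the old behavior
        outer(true); // new call sites opt in
        return 0;
    }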
@@ -1577,19 +1603,44 @@ static void do_init(size_t node_index, ggml_tensor * tensor, ggml_tensor_paralle
     // one split, one not split
     auto split_tensors = src0_split_tensors ? src0_split_tensors : src1_split_tensors;
     if (split_tensors == GGML_TP_SPLIT_COLUMNS) {
-        ensure_column_split(src0);
-        ensure_column_split(src1);
         create_column_split_tensors();
-        set_src_tensor(0, GGML_TP_SPLIT_COLUMNS);
-        set_src_tensor(1, GGML_TP_SPLIT_COLUMNS);
-        // ensure_rejoined(nullptr, tensor);
+        // this may be a broadcast tensor.
+        if (src0->ne[0] != 1) {
+            ensure_column_split(src0);
+            set_src_tensor(0, GGML_TP_SPLIT_COLUMNS);
+        }
+        else {
+            ensure_rejoined(tensor, src0);
+            set_src_tensor(0, GGML_TP_SPLIT_NONE);
+        }
+        if (src1->ne[0] != 1) {
+            ensure_column_split(src1);
+            set_src_tensor(1, GGML_TP_SPLIT_COLUMNS);
+        }
+        else {
+            ensure_rejoined(tensor, src1);
+            set_src_tensor(1, GGML_TP_SPLIT_NONE);
+        }
     }
     else if (split_tensors == GGML_TP_SPLIT_ROWS) {
-        ensure_row_split(src0);
-        ensure_row_split(src1);
         create_row_split_tensors();
-        set_src_tensor(0, GGML_TP_SPLIT_ROWS);
-        set_src_tensor(1, GGML_TP_SPLIT_ROWS);
+        // this may be a broadcast tensor.
+        if (src0->ne[0] != 1) {
+            ensure_row_split(src0);
+            set_src_tensor(0, GGML_TP_SPLIT_ROWS);
+        }
+        else {
+            ensure_rejoined(tensor, src0);
+            set_src_tensor(0, GGML_TP_SPLIT_NONE);
+        }
+        if (src1->ne[0] != 1) {
+            ensure_row_split(src1);
+            set_src_tensor(1, GGML_TP_SPLIT_ROWS);
+        }
+        else {
+            ensure_rejoined(tensor, src1);
+            set_src_tensor(1, GGML_TP_SPLIT_NONE);
+        }
     }
     else {
         GGML_ABORT("Tensor %s has unsupported op %s for tensor parallelism, src0 is split but src1 is not.\n", tensor->name, ggml_op_name(tensor->op));
@@ -1664,10 +1715,11 @@ static void do_init(size_t node_index, ggml_tensor * tensor, ggml_tensor_paralle
         else {
             // a weight matrix is multiplied by a column split tensor (prior to ROPE), it can be massaged to a column split.
             // this results in a reduce split.
-            ensure_weight_column_split(src0);
-            create_reduce_tensors();
-            set_src_tensor(0, GGML_TP_SPLIT_COLUMNS);
-            set_src_tensor(1, GGML_TP_SPLIT_COLUMNS);
+            ensure_row_split(src0);
+            ensure_rejoined(tensor, src1);
+            create_column_split_tensors();
+            set_src_tensor(0, GGML_TP_SPLIT_ROWS);
+            set_src_tensor(1, GGML_TP_SPLIT_NONE);
         }
     }
     else if (src0_split_tensors == GGML_TP_SPLIT_COLUMNS && src1_split_tensors == GGML_TP_SPLIT_COLUMNS) {
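
The replacement path leans on ggml's matmul shape rule: mul_mat(a, b) takes a as [k, m] and b as [k, n] and produces [m, n], so the output's ne[0] is a's ne[1]. Row-splitting the weight therefore splits the output's ne[0] directly, a column split, with src1 kept whole on every device and no reduce step. A worked shape check under that rule, with hypothetical LLaMA-like dimensions:

    #include <cassert>

    int main() {
        // mul_mat(a, b): a is [k, m], b is [k, n], result is [m, n]
        long long k = 4096, m = 11008, n = 32, n_devices = 2;
        (void) k; (void) n; // only m participates in the split

        long long m_dev = m / n_devices; // rows of a held per device

        // each device's result is [m_dev, n]: a clean column split of [m, n]
        assert(m_dev * n_devices == m);
        assert(m_dev == 5504);
        return 0;
    }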
@@ -1697,15 +1749,16 @@ static void do_init(size_t node_index, ggml_tensor * tensor, ggml_tensor_paralle
     }
 
     GGML_ASSERT(extra->tensors[0]->src[0]->ne[0] == extra->tensors[0]->src[0]->ne[0] && "Tensor parallel tensors must have the same inner dimension (ne0).");
-    GGML_ASSERT(extra->tensors[0]->ne[0] == extra->tensors[0]->src[0]->ne[1] && "Tensor parallel has incorrect outer dimension (ne0).");
 
     if (tensor->op == GGML_OP_MUL_MAT_ID) {
         // all experts are split so all GPUs will run a portion of each expert.
         set_src_tensor(2, GGML_TP_SPLIT_NONE);
 
+        GGML_ASSERT(extra->tensors[0]->ne[0] == extra->tensors[0]->src[0]->ne[1] && "Tensor parallel has incorrect outer dimension (ne0).");
         GGML_ASSERT(extra->tensors[0]->ne[2] == extra->tensors[0]->src[1]->ne[2] && "Tensor parallel has incorrect outer dimension (ne1).");
     }
     else {
+        GGML_ASSERT(extra->tensors[0]->ne[0] == extra->tensors[0]->src[0]->ne[1] && "Tensor parallel has incorrect outer dimension (ne0).");
         GGML_ASSERT(extra->tensors[0]->ne[1] == extra->tensors[0]->src[1]->ne[1] && "Tensor parallel has incorrect outer dimension (ne1).");
     }
     break;
@@ -1794,9 +1847,9 @@ static void do_init(size_t node_index, ggml_tensor * tensor, ggml_tensor_paralle
     }
     else {
         if (src0_split_tensors == GGML_TP_SPLIT_COLUMNS && src0->ne[0] == tensor->ne[0]) {
-            GGML_LOG_WARN("UNUSED CODE PATH VIEW SPLIT COL\n");
+            // GGML_LOG_WARN("UNUSED CODE PATH VIEW SPLIT COL\n");
             // column split tensor with no change to columns
-            create_column_split_tensors();
+            create_column_split_tensors(true);
             set_src_tensor(0, GGML_TP_SPLIT_COLUMNS);
         }
         else if (src0_split_tensors == GGML_TP_SPLIT_ROWS && src0->ne[1] == tensor->ne[1]) {
@@ -2265,6 +2318,9 @@ static void ensure_weight_column_split(ggml_tensor * weight) {
     auto blocks_per_row = weight->ne[0] / block_size;
     auto elements_per_block = weight->ne[0] / blocks_per_row;
     auto splits = get_dim_splits(blocks_per_row);
+    if (splits.split[0] != splits.split[1]) {
+        GGML_ABORT("Tensor %s has uneven splits for columns, expected equal splits for all devices.\n", weight->name);
+    }
 
     offset = 0;
     for (size_t j = 0; j < ggml_parallel_devices.size(); j++) {
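
For quantized weights the column split has to land on quantization block boundaries, which is why the split is computed in blocks rather than elements (and why elements_per_block folds back to the block size). A worked example, assuming a Q4_0-style block of 32 elements:

    #include <cassert>

    int main() {
        long long ne0        = 4096; // elements per weight row
        long long block_size = 32;   // elements per quantization block (Q4_0)

        long long blocks_per_row     = ne0 / block_size;     // 128 blocks
        long long elements_per_block = ne0 / blocks_per_row; // 32, block_size again

        // an even 2-way split: 64 blocks, i.e. 2048 elements, per device
        long long n_devices = 2;
        assert(blocks_per_row % n_devices == 0);
        assert((blocks_per_row / n_devices) * elements_per_block == 2048);
        return 0;
    }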
@@ -2606,7 +2662,14 @@ static bool ggml_backend_tp_device_supports_op(ggml_backend_dev_t dev, const str
 
     // using something too small reduces performance due to additional rejoins.
     // return src0->ne[1] >= 2048;
-    return src0->ne[1] >= 4096;
+    if (src0->ne[1] >= 4096)
+        return true;
+    if (src0->ne[1] * src0->ne[2] >= 4096) {
+        if (src0->ne[1] >= 2048)
+            return true;
+        return false;
+    }
+    return false;
     return src0->ne[1] >= 8192;
 }
 
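The new gate accepts large matrices outright and gives mid-sized ones a second chance when there is enough in the third dimension: ne[1] >= 4096 always passes, and ne[1] >= 2048 passes only when ne[1] * ne[2] >= 4096. (The trailing return src0->ne[1] >= 8192; is pre-existing unreachable code left in place.) The whole chain collapses to a single predicate; a sketch:

    #include <cassert>

    // equivalent form of the new check in ggml_backend_tp_device_supports_op
    static bool big_enough(long long ne1, long long ne2) {
        return ne1 >= 4096 || (ne1 >= 2048 && ne1 * ne2 >= 4096);
    }

    int main() {
        assert( big_enough(4096, 1)); // large: always worth parallelizing
        assert( big_enough(2048, 2)); // mid-sized but deep: accept
        assert(!big_enough(2048, 1)); // mid-sized and flat: rejoins dominate
        assert(!big_enough(1024, 8)); // ne[1] too small regardless of depth
        return 0;
    }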
