Commit 2522c97

ikawrakow and Iwan Kawrakow authored
Faster tensor name formatting (#860)
* Adding fused mul+multi_add + CPU implementation
* fused mul+multi_add: command line argument to disable it
* Faster tensor name formatting

We gain ~1% for Ling-mini-2.0 when running on CUDA.

Co-authored-by: Iwan Kawrakow <[email protected]>
1 parent db3ba49 commit 2522c97
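
For context on where the ~1% comes from: ggml_format_name routes every tensor name through vsnprintf, which must parse the format string on each call, while the new helper only copies the existing name and appends a fixed suffix. Below is a minimal standalone sketch of the two paths with an illustrative timing loop; the format_slow/format_fast names and the benchmark itself are hypothetical and not part of the commit, and GGML_MAX_NAME is redefined locally only so the snippet compiles on its own.

// Hypothetical micro-benchmark, not part of the commit.
#include <chrono>
#include <cstdio>

#define GGML_MAX_NAME 64   // assumption: matches the value used by ggml.h

// Old path: printf-style formatting (ggml_format_name ends up in vsnprintf).
static void format_slow(char * dst, const char * name) {
    snprintf(dst, GGML_MAX_NAME, "%s (view)", name);
}

// New path: copy the name, append a known suffix, no format-string parsing.
static void format_fast(char * dst, const char * name, const char * suffix, int suffix_len) {
    int j = 0;
    for (; j < GGML_MAX_NAME - 1; ++j) {
        dst[j] = name[j];
        if (!name[j]) break;
    }
    for (int k = 0; k < suffix_len && j < GGML_MAX_NAME - 1; ++k) {
        dst[j++] = suffix[k];
    }
    dst[j] = 0;
}

int main() {
    char out[GGML_MAX_NAME];
    const char * name = "blk.17.attn_q.weight";
    const int n = 1000000;

    auto t0 = std::chrono::steady_clock::now();
    for (int i = 0; i < n; ++i) format_slow(out, name);
    auto t1 = std::chrono::steady_clock::now();
    for (int i = 0; i < n; ++i) format_fast(out, name, " (view)", 7);
    auto t2 = std::chrono::steady_clock::now();

    auto us = [](auto a, auto b) {
        return std::chrono::duration_cast<std::chrono::microseconds>(b - a).count();
    };
    printf("slow: %lld us  fast: %lld us  result: '%s'\n",
           (long long) us(t0, t1), (long long) us(t1, t2), out);
    return 0;
}

These naming calls sit on the graph-build path, which runs on every evaluation, so presumably removing the format-string parsing from a few thousand calls per graph is what yields the ~1% on Ling-mini-2.0.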

File tree

2 files changed: +59 -15 lines changed

ggml/src/ggml.c

Lines changed: 41 additions & 14 deletions
@@ -5941,11 +5941,24 @@ struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char *
     return tensor;
 }

+static inline void ggml_format_name_fast(const char * name, const char * suffix, int suffix_len, char * new_name) {
+    int j = 0;
+    for (; j < GGML_MAX_NAME-1; ++j) {
+        new_name[j] = name[j];
+        if (!name[j]) break;
+    }
+    for (int k = 0; k < suffix_len && j < GGML_MAX_NAME-1; ++k) {
+        new_name[j++] = suffix[k];
+    }
+    new_name[j] = 0;
+}
+
 struct ggml_tensor * ggml_view_tensor(
         struct ggml_context * ctx,
         struct ggml_tensor * src) {
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, GGML_MAX_DIMS, src->ne, src, 0);
-    ggml_format_name(result, "%s (view)", src->name);
+    //ggml_format_name(result, "%s (view)", src->name);
+    ggml_format_name_fast(src->name, " (view)", 7, result->name);

     for (int i = 0; i < GGML_MAX_DIMS; i++) {
         result->nb[i] = src->nb[i];
@@ -7894,7 +7907,8 @@ static struct ggml_tensor * ggml_cpy_impl(
     if (strlen(b->name) > 0) {
         ggml_format_name(result, "%s (copy of %s)", b->name, a->name);
     } else {
-        ggml_format_name(result, "%s (copy)", a->name);
+        //ggml_format_name(result, "%s (copy)", a->name);
+        ggml_format_name_fast(a->name, " (copy)", 7, result->name);
     }

     result->op = GGML_OP_CPY;
@@ -7919,7 +7933,8 @@ struct ggml_tensor * ggml_cast(
     bool is_node = false;

     struct ggml_tensor * result = ggml_new_tensor(ctx, type, GGML_MAX_DIMS, a->ne);
-    ggml_format_name(result, "%s (copy)", a->name);
+    //ggml_format_name(result, "%s (copy)", a->name);
+    ggml_format_name_fast(a->name, " (copy)", 7, result->name);

     result->op = GGML_OP_CPY;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -7941,7 +7956,8 @@ static struct ggml_tensor * ggml_cont_impl(
     }

     struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
-    ggml_format_name(result, "%s (cont)", a->name);
+    //ggml_format_name(result, "%s (cont)", a->name);
+    ggml_format_name_fast(a->name, " (cont)", 7, result->name);

     result->op = GGML_OP_CONT;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -7993,7 +8009,8 @@ struct ggml_tensor * ggml_cont_4d(
     bool is_node = false;

     struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, ne0, ne1, ne2, ne3);
-    ggml_format_name(result, "%s (cont)", a->name);
+    //ggml_format_name(result, "%s (cont)", a->name);
+    ggml_format_name_fast(a->name, " (cont)", 7, result->name);

     result->op = GGML_OP_CONT;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -8024,7 +8041,8 @@ struct ggml_tensor * ggml_reshape(
     }

     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, GGML_MAX_DIMS, b->ne, a, 0);
-    ggml_format_name(result, "%s (reshaped)", a->name);
+    //ggml_format_name(result, "%s (reshaped)", a->name);
+    ggml_format_name_fast(a->name, " (reshaped)", 11, result->name);

     result->op = GGML_OP_RESHAPE;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -8048,7 +8066,8 @@ struct ggml_tensor * ggml_reshape_1d(

     const int64_t ne[1] = { ne0 };
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a, 0);
-    ggml_format_name(result, "%s (reshaped)", a->name);
+    //ggml_format_name(result, "%s (reshaped)", a->name);
+    ggml_format_name_fast(a->name, " (reshaped)", 11, result->name);

     result->op = GGML_OP_RESHAPE;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -8073,7 +8092,8 @@ struct ggml_tensor * ggml_reshape_2d(

     const int64_t ne[2] = { ne0, ne1 };
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a, 0);
-    ggml_format_name(result, "%s (reshaped)", a->name);
+    //ggml_format_name(result, "%s (reshaped)", a->name);
+    ggml_format_name_fast(a->name, " (reshaped)", 11, result->name);

     result->op = GGML_OP_RESHAPE;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -8099,7 +8119,8 @@ struct ggml_tensor * ggml_reshape_3d(

     const int64_t ne[3] = { ne0, ne1, ne2 };
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a, 0);
-    ggml_format_name(result, "%s (reshaped)", a->name);
+    //ggml_format_name(result, "%s (reshaped)", a->name);
+    ggml_format_name_fast(a->name, " (reshaped)", 11, result->name);

     result->op = GGML_OP_RESHAPE;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -8126,7 +8147,8 @@ struct ggml_tensor * ggml_reshape_4d(

     const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a, 0);
-    ggml_format_name(result, "%s (reshaped)", a->name);
+    //ggml_format_name(result, "%s (reshaped)", a->name);
+    ggml_format_name_fast(a->name, " (reshaped)", 11, result->name);

     result->op = GGML_OP_RESHAPE;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -8149,7 +8171,8 @@ static struct ggml_tensor * ggml_view_impl(
     }

     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, n_dims, ne, a, offset);
-    ggml_format_name(result, "%s (view)", a->name);
+    //ggml_format_name(result, "%s (view)", a->name);
+    ggml_format_name_fast(a->name, " (view)", 7, result->name);

     ggml_set_op_params(result, &offset, sizeof(offset));

@@ -8270,7 +8293,8 @@ struct ggml_tensor * ggml_permute(
     }

     struct ggml_tensor * result = ggml_view_tensor(ctx, a);
-    ggml_format_name(result, "%s (permuted)", a->name);
+    //ggml_format_name(result, "%s (permuted)", a->name);
+    ggml_format_name_fast(a->name, " (permuted)", 11, result->name);

     int ne[GGML_MAX_DIMS];
     int nb[GGML_MAX_DIMS];
@@ -8317,7 +8341,8 @@ struct ggml_tensor * ggml_transpose(
     }

     struct ggml_tensor * result = ggml_view_tensor(ctx, a);
-    ggml_format_name(result, "%s (transposed)", a->name);
+    //ggml_format_name(result, "%s (transposed)", a->name);
+    ggml_format_name_fast(a->name, " (transposed)", 13, result->name);

     result->ne[0] = a->ne[1];
     result->ne[1] = a->ne[0];
@@ -9510,6 +9535,7 @@ struct ggml_tensor * ggml_top_k(
     GGML_ASSERT(a->ne[0] >= k);

     struct ggml_tensor * result = ggml_argsort(ctx, a, GGML_SORT_ORDER_DESC);
+    ggml_format_name_fast(a->name, " (sort)", 7, result->name);
     ggml_set_op_params_i32(result, 1, k);

     result = ggml_view_4d(ctx, result,
@@ -10439,7 +10465,8 @@ void ggml_set_param(

     GGML_ASSERT(tensor->grad == NULL);
     tensor->grad = ggml_dup_tensor(ctx, tensor);
-    ggml_format_name(tensor->grad, "%s (grad)", tensor->name);
+    //ggml_format_name(tensor->grad, "%s (grad)", tensor->name);
+    ggml_format_name_fast(tensor->name, " (grad)", 7, tensor->grad->name);
 }

 // ggml_compute_forward_dup
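
Two observations on the helper's contract, read off the diff above rather than stated in the commit: the caller passes suffix_len explicitly (always the strlen of the literal, e.g. 7 for " (view)", 11 for " (reshaped)"), so nothing is measured or parsed at run time, and both loops stop at GGML_MAX_NAME-1, so an over-long source name is truncated and the suffix dropped, much as the vsnprintf path would truncate. A hypothetical standalone check of that behavior follows; it assumes the helper has been made visible to the test (it is a static inline in ggml.c) and that GGML_MAX_NAME is 64.

// Hypothetical test harness, not part of the commit.
#include <cassert>
#include <cstring>
#include "ggml.h"   // for GGML_MAX_NAME; ggml_format_name_fast itself is static in ggml.c

int main() {
    char out[GGML_MAX_NAME];

    // Name plus suffix fits: same result as snprintf(out, n, "%s (reshaped)", name).
    ggml_format_name_fast("blk.0.attn_norm.weight", " (reshaped)", 11, out);
    assert(strcmp(out, "blk.0.attn_norm.weight (reshaped)") == 0);

    // Name alone already fills the buffer: the copy stops at GGML_MAX_NAME - 1
    // characters and the suffix is dropped entirely.
    char longname[2 * GGML_MAX_NAME];
    memset(longname, 'x', sizeof(longname) - 1);
    longname[sizeof(longname) - 1] = 0;
    ggml_format_name_fast(longname, " (view)", 7, out);
    assert(strlen(out) == GGML_MAX_NAME - 1);
    return 0;
}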

src/llama-build-context.cpp

Lines changed: 18 additions & 1 deletion
@@ -1342,6 +1342,8 @@ ggml_cgraph * llm_build_context::build_llama() {
             n_tokens = n_outputs;
             cur = ggml_get_rows(ctx0, cur, inp_out_ids);
             inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            cb(cur, "last_attn", il);
+            cb(inpSA, "last_ffn_inp", il);
         }

         // For Granite architecture
@@ -5942,6 +5944,8 @@ ggml_cgraph * llm_build_context::build_deepseek2() {
             n_tokens = n_outputs;
             cur = ggml_get_rows(ctx0, cur, inp_out_ids);
             inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            cb(cur, "last_attn", il);
+            cb(inpSA, "last_ffn_inp", il);
         }

         struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
@@ -8040,7 +8044,20 @@ ggml_cgraph * llm_build_context::llama_build_graph(
     // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
     llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
         if (il >= 0) {
-            ggml_format_name(cur, "%s-%d", name, il);
+            int j = 0;
+            for (; j < GGML_MAX_NAME - 1; ++j) {
+                cur->name[j] = name[j];
+                if (!name[j]) break;
+            }
+            if (j < GGML_MAX_NAME - 3) {
+                cur->name[j++] = '-';
+                auto sil = std::to_string(il);
+                for (int k = 0; k < (int)sil.size() && j < GGML_MAX_NAME - 1; ++k) {
+                    cur->name[j++] = sil[k];
+                }
+            }
+            cur->name[j] = 0;
+            //ggml_format_name(cur, "%s-%d", name, il);
         } else {
             ggml_set_name(cur, name);
         }
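
The two new cb(cur, "last_attn", il) and cb(inpSA, "last_ffn_inp", il) calls run the row-gathered tensors of the final layer through this same callback, so they get names like every other node. The callback change itself open-codes what ggml_format_name(cur, "%s-%d", name, il) used to do: copy the base name, then append '-' and the decimal layer index. The j < GGML_MAX_NAME - 3 guard keeps room for the dash, at least one digit, and the terminating null, so an index that does not fit is cut off rather than overflowing the fixed-size name buffer. Below is a sketch of the same logic pulled out into a hypothetical helper; build_cb_name is not a function in the codebase.

#include <string>

// Hypothetical helper mirroring the lambda body above; dst must point to a
// char[GGML_MAX_NAME] buffer such as ggml_tensor::name.
static void build_cb_name(char * dst, const char * name, int il) {
    int j = 0;
    for (; j < GGML_MAX_NAME - 1; ++j) {   // copy the base name
        dst[j] = name[j];
        if (!name[j]) break;
    }
    if (j < GGML_MAX_NAME - 3) {           // room for '-', a digit, and '\0'
        dst[j++] = '-';
        std::string sil = std::to_string(il);
        for (int k = 0; k < (int) sil.size() && j < GGML_MAX_NAME - 1; ++k) {
            dst[j++] = sil[k];             // append the decimal layer index
        }
    }
    dst[j] = 0;
}

// build_cb_name(buf, "ffn_out", 12) produces "ffn_out-12", the same string the
// old ggml_format_name(cur, "%s-%d", name, il) call produced for names that fit.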
