Skip to content

Commit b7ae2d1

Browse files
profiler: initial support for profiling graph ops
1 parent ea9c32b commit b7ae2d1

File tree

4 files changed

+172
-0
lines changed

4 files changed

+172
-0
lines changed

ggml/src/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1313,6 +1313,7 @@ add_library(ggml
13131313
ggml-backend.c
13141314
ggml-quants.c
13151315
ggml-quants.h
1316+
ggml-profile.cpp
13161317
${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
13171318
${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
13181319
${GGML_SOURCES_RPC} ${GGML_HEADERS_RPC}

ggml/src/ggml-impl.h

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,17 @@ static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct g
157157
GGML_ABORT("fatal error");
158158
}
159159

160+
// op profile data (per op / per thread)

// Timestamps recorded for each graph node, per thread:
//   GGML_PROF_OP_START : taken just before ggml_compute_forward for the node
//   GGML_PROF_OP_SYNC  : taken after compute, just before the thread barrier
//   GGML_PROF_OP_END   : taken right after the barrier completes
enum ggml_profile_event {
    GGML_PROF_OP_START,
    GGML_PROF_OP_SYNC,
    GGML_PROF_OP_END
};

// One record per (node, thread); indexed by ggml_profile_event.
struct ggml_profile_data {
    uint64_t nsec[GGML_PROF_OP_END + 1]; // event times in nsec
};
170+
160171
// computation graph
161172

162173
enum ggml_cgraph_eval_order {
@@ -174,13 +185,21 @@ struct ggml_cgraph {
174185
struct ggml_tensor ** grads;
175186
struct ggml_tensor ** leafs;
176187

188+
struct ggml_profile_data ** prof;
189+
177190
struct ggml_hash_set visited_hash_set;
178191

179192
enum ggml_cgraph_eval_order order;
180193
};
181194

182195
struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);
183196

197+
// allocates cg->prof when the GGML_GRAPH_PROFILE env var is set; no-op otherwise
void ggml_profile_graph_init(struct ggml_cgraph *cg, int n_threads);
// called at the start of graph compute; lazily runs init on first use
void ggml_profile_graph_start(struct ggml_cgraph *cg, int n_threads);
// called at the end of graph compute; prints per-op timings to stderr and resets counters
void ggml_profile_graph_finish(struct ggml_cgraph *cg, int n_threads);
// releases cg->prof and clears the pointer; safe to call when profiling is disabled
void ggml_profile_graph_free(struct ggml_cgraph *cg);
// records a timestamp for event e on node node_n from thread ith; no-op when disabled
void ggml_profile_op_event(const struct ggml_cgraph *cg, enum ggml_profile_event e, int node_n, int ith);
202+
184203
#ifdef __cplusplus
185204
}
186205
#endif

ggml/src/ggml-profile.cpp

Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
#include "ggml-impl.h"
2+
#include <stdint.h>
3+
#include <stdlib.h>
4+
5+
#include <chrono>
6+
7+
extern "C" void ggml_profile_graph_init(struct ggml_cgraph *cg, int n_threads)
8+
{
9+
if (!getenv("GGML_GRAPH_PROFILE")) { return; }
10+
11+
// The number of threads may change between passes (pp vs tg).
12+
// Allocate for max_n_threads for simplicity for now.
13+
// TODO: use aligned allocator
14+
15+
size_t node_size = sizeof(struct ggml_profile_data) * GGML_MAX_N_THREADS;
16+
size_t pvec_size = sizeof(std::intptr_t) * cg->n_nodes;
17+
size_t data_size = node_size * cg->n_nodes;
18+
size_t t_size = pvec_size + data_size;
19+
20+
cg->prof = (struct ggml_profile_data **) malloc(t_size);
21+
if (!cg->prof) {
22+
fprintf(stderr, "ggml-profile: failed to allocate profiling data : n_threads %d n_nodes %d\n", n_threads, cg->n_nodes);
23+
return;
24+
}
25+
26+
memset(cg->prof, 0, t_size);
27+
28+
// init pre-thread pointers
29+
uint8_t * data = (uint8_t *) cg->prof + pvec_size;
30+
for (int i=0; i < cg->n_nodes; i++) {
31+
cg->prof[i] = (struct ggml_profile_data *) data; data += node_size;
32+
}
33+
}
34+
35+
extern "C" void ggml_profile_graph_start(struct ggml_cgraph *cg, int n_threads)
36+
{
37+
if (!cg->prof) { ggml_profile_graph_init(cg, n_threads); }
38+
if (!cg->prof) { return; }
39+
}
40+
41+
static inline int ggml_profile_format_tensor_dims(char *str, struct ggml_tensor *t)
42+
{
43+
return sprintf(str, "%d:%d:%d:%d",
44+
(int) t->ne[0], (int) t->ne[1], (int) t->ne[3], (int) t->ne[3]);
45+
}
46+
47+
static inline void ggml_profile_format_op_dims(char *str, struct ggml_tensor *t)
48+
{
49+
char *p = str;
50+
51+
// append src0 and src1 (if any)
52+
if (t->src[0]) {
53+
p += ggml_profile_format_tensor_dims(p, t->src[0]);
54+
55+
for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
56+
p += sprintf(p, " x ");
57+
p += ggml_profile_format_tensor_dims(p, t->src[i]);
58+
}
59+
60+
p += sprintf(p, " -> ");
61+
}
62+
63+
// format self dims separately for better visual alignment
64+
char self[64];
65+
ggml_profile_format_tensor_dims(self, t);
66+
67+
p += sprintf(p, "%12s", self);
68+
}
69+
70+
static inline void ggml_profile_format_op_types(char *str, struct ggml_tensor *t)
71+
{
72+
char *p = str;
73+
74+
// append src0 and src1 (if any)
75+
if (t->src[0]) {
76+
p += sprintf(p, "%s", ggml_type_name(t->src[0]->type));
77+
78+
for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
79+
p += sprintf(p, " x ");
80+
p += sprintf(p, "%s", ggml_type_name(t->src[i]->type));
81+
}
82+
83+
p += sprintf(p, " -> ");
84+
}
85+
86+
p += sprintf(p, "%3s", ggml_type_name(t->type));
87+
}
88+
89+
90+
extern "C" void ggml_profile_graph_finish(struct ggml_cgraph *cg, int n_threads)
91+
{
92+
if (!cg->prof) { return; }
93+
94+
fprintf(stderr, "ggml-profile: | node idx | op name | proc (nsec) | sync (nsec) | total (nsec) | op dims | op types | tensor name |\n");
95+
fprintf(stderr, "ggml-profile: | -------: | :------ | ----------: | ----------: | -----------: | ------: | -------: | ----------: |\n");
96+
97+
char dims[64 * GGML_MAX_SRC];
98+
char types[16 * GGML_MAX_SRC];
99+
100+
for (int i = 0; i < cg->n_nodes; i++) {
101+
uint64_t p_nsec = 0;
102+
uint64_t s_nsec = 0;
103+
uint64_t t_nsec = 0;
104+
105+
// add up per thread counters and reset them
106+
for (int t=0; t < n_threads; t++) {
107+
p_nsec += cg->prof[i][t].nsec[GGML_PROF_OP_SYNC] - cg->prof[i][t].nsec[GGML_PROF_OP_START];
108+
s_nsec += cg->prof[i][t].nsec[GGML_PROF_OP_END] - cg->prof[i][t].nsec[GGML_PROF_OP_SYNC];
109+
t_nsec += cg->prof[i][t].nsec[GGML_PROF_OP_END] - cg->prof[i][t].nsec[GGML_PROF_OP_START];
110+
111+
cg->prof[i][t].nsec[GGML_PROF_OP_START] = 0;
112+
cg->prof[i][t].nsec[GGML_PROF_OP_SYNC] = 0;
113+
cg->prof[i][t].nsec[GGML_PROF_OP_END] = 0;
114+
}
115+
116+
ggml_profile_format_op_dims(dims, cg->nodes[i]);
117+
ggml_profile_format_op_types(types, cg->nodes[i]);
118+
119+
fprintf(stderr, "ggml-profile: | %04d | %10s | %10lu | %10lu | %10lu | %46s | %22s | %20s |\n",
120+
i, ggml_op_name(cg->nodes[i]->op),
121+
(unsigned long) p_nsec, (unsigned long) s_nsec, (unsigned long) t_nsec,
122+
dims, types, cg->nodes[i]->name);
123+
}
124+
fprintf(stderr, "ggml-profile: \n"); // empty line to split tables
125+
}
126+
127+
extern "C" void ggml_profile_graph_free(struct ggml_cgraph *cg)
128+
{
129+
if (!cg->prof) { return; }
130+
131+
free(cg->prof); cg->prof = nullptr;
132+
}
133+
134+
extern "C" void ggml_profile_op_event(const struct ggml_cgraph *cg, enum ggml_profile_event e, int node_n, int ith)
135+
{
136+
if (!cg->prof) { return; }
137+
138+
using clock = std::chrono::high_resolution_clock;
139+
cg->prof[node_n][ith].nsec[e] = std::chrono::nanoseconds(clock::now().time_since_epoch()).count();
140+
}

ggml/src/ggml.c

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19355,6 +19355,7 @@ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t siz
1935519355
/*.nodes =*/ nodes_ptr,
1935619356
/*.grads =*/ grads_ptr,
1935719357
/*.leafs =*/ leafs_ptr,
19358+
/*.prof =*/ NULL,
1935819359
/*.hash_table =*/ { hash_size, hash_used, hash_keys_ptr },
1935919360
/*.order =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
1936019361
};
@@ -19376,6 +19377,7 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1)
1937619377
/*.nodes =*/ cgraph0->nodes + i0,
1937719378
/*.grads =*/ cgraph0->grads ? cgraph0->grads + i0 : NULL,
1937819379
/*.leafs =*/ NULL,
19380+
/*.prof =*/ NULL,
1937919381
/*.hash_table =*/ { 0, NULL, NULL },
1938019382
/*.order =*/ cgraph0->order,
1938119383
};
@@ -20229,6 +20231,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
2022920231
for (int node_n = 0; node_n < cgraph->n_nodes && !tp->abort; node_n++) {
2023020232
struct ggml_tensor * node = cgraph->nodes[node_n];
2023120233

20234+
ggml_profile_op_event(cgraph, GGML_PROF_OP_START, node_n, state->ith);
20235+
2023220236
ggml_compute_forward(&params, node);
2023320237

2023420238
if (state->ith == 0 && cplan->abort_callback &&
@@ -20237,7 +20241,11 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
2023720241
tp->ec = GGML_STATUS_ABORTED;
2023820242
}
2023920243

20244+
ggml_profile_op_event(cgraph, GGML_PROF_OP_SYNC, node_n, state->ith);
20245+
2024020246
ggml_barrier(state->threadpool);
20247+
20248+
ggml_profile_op_event(cgraph, GGML_PROF_OP_END, node_n, state->ith);
2024120249
}
2024220250

2024320251
return 0;
@@ -20510,6 +20518,8 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
2051020518
threadpool->ec = GGML_STATUS_SUCCESS;
2051120519
}
2051220520

20521+
ggml_profile_graph_start(cgraph, n_threads);
20522+
2051320523
#ifdef GGML_USE_OPENMP
2051420524
if (n_threads > 1) {
2051520525
#pragma omp parallel num_threads(n_threads)
@@ -20549,6 +20559,8 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
2054920559
ggml_threadpool_free(threadpool);
2055020560
}
2055120561

20562+
ggml_profile_graph_finish(cgraph, n_threads);
20563+
2055220564
return ret;
2055320565
}
2055420566

0 commit comments

Comments
 (0)