Skip to content

Commit a362c74

Browse files
profiler: initial support for profiling graph ops
1 parent d79d8f3 commit a362c74

File tree

7 files changed

+298
-1
lines changed

7 files changed

+298
-1
lines changed

ggml/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ option(GGML_CCACHE "ggml: use ccache if available" ON)
8383
option(GGML_ALL_WARNINGS "ggml: enable all compiler warnings" ON)
8484
option(GGML_ALL_WARNINGS_3RD_PARTY "ggml: enable all compiler warnings in 3rd party libs" OFF)
8585
option(GGML_GPROF "ggml: enable gprof" OFF)
86+
option(GGML_GRAPH_PROFILER "ggml: enable internal Graph and Op profiler" OFF)
8687

8788
# build
8889
option(GGML_FATAL_WARNINGS "ggml: enable -Werror flag" OFF)

ggml/src/CMakeLists.txt

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,10 @@ if (CMAKE_SYSTEM_NAME MATCHES "Linux")
77
add_compile_definitions($<$<CONFIG:Debug>:_GLIBCXX_ASSERTIONS>)
88
endif()
99

10+
if (GGML_GRAPH_PROFILER)
11+
add_compile_definitions(GGML_GRAPH_PROFILER)
12+
endif()
13+
1014
if (NOT MSVC)
1115
if (GGML_SANITIZE_THREAD)
1216
add_compile_options(-fsanitize=thread)
@@ -215,7 +219,9 @@ add_library(ggml-base
215219
ggml-threading.cpp
216220
ggml-threading.h
217221
ggml-quants.c
218-
ggml-quants.h)
222+
ggml-quants.h
223+
ggml-profile.h
224+
ggml-profile.cpp)
219225

220226
target_include_directories(ggml-base PRIVATE .)
221227

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#include "ggml-quants.h"
1111
#include "ggml-cpu-quants.h"
1212
#include "ggml-threading.h"
13+
#include "ggml-profile.h"
1314
#include "amx/amx.h"
1415
#include "ggml.h"
1516

@@ -13585,6 +13586,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
1358513586
for (int node_n = 0; node_n < cgraph->n_nodes && !tp->abort; node_n++) {
1358613587
struct ggml_tensor * node = cgraph->nodes[node_n];
1358713588

13589+
ggml_graph_profile_event(cgraph, GGML_PROF_OP_START, node_n, state->ith);
13590+
1358813591
ggml_compute_forward(&params, node);
1358913592

1359013593
if (state->ith == 0 && cplan->abort_callback &&
@@ -13593,6 +13596,15 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
1359313596
tp->ec = GGML_STATUS_ABORTED;
1359413597
}
1359513598

13599+
ggml_graph_profile_event(cgraph, GGML_PROF_OP_SYNC, node_n, state->ith);
13600+
13601+
ggml_barrier(state->threadpool);
13602+
13603+
ggml_graph_profile_event(cgraph, GGML_PROF_OP_END, node_n, state->ith);
13604+
}
13605+
13606+
if (ggml_graph_profile_enabled(cgraph)) {
13607+
// need another barrier to flush the last timing update
1359613608
ggml_barrier(state->threadpool);
1359713609
}
1359813610

@@ -13827,6 +13839,8 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
1382713839
int n_threads = cplan->n_threads;
1382813840
struct ggml_threadpool * threadpool = cplan->threadpool;
1382913841

13842+
ggml_graph_profile_start(cgraph, n_threads);
13843+
1383013844
bool disposable_threadpool = false;
1383113845

1383213846
if (threadpool == NULL) {
@@ -13878,6 +13892,8 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
1387813892
// don't leave affinity set on the main thread
1387913893
clear_numa_thread_affinity();
1388013894

13895+
ggml_graph_profile_finish(cgraph, n_threads);
13896+
1388113897
enum ggml_status ret = threadpool->ec;
1388213898

1388313899
if (disposable_threadpool) {

ggml/src/ggml-impl.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -282,6 +282,8 @@ enum ggml_cgraph_eval_order {
282282
GGML_CGRAPH_EVAL_ORDER_COUNT
283283
};
284284

285+
struct ggml_profile_data;
286+
285287
struct ggml_cgraph {
286288
int size; // maximum number of nodes/leafs/grads/grad_accs
287289
int n_nodes; // number of nodes currently in use
@@ -292,6 +294,8 @@ struct ggml_cgraph {
292294
struct ggml_tensor ** grad_accs; // accumulators for node gradients
293295
struct ggml_tensor ** leafs; // tensors with constant data
294296

297+
struct ggml_profile_data * prof;
298+
295299
struct ggml_hash_set visited_hash_set;
296300

297301
enum ggml_cgraph_eval_order order;

ggml/src/ggml-profile.cpp

Lines changed: 177 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,177 @@
1+
#include "ggml-profile.h"
2+
3+
#include <stdint.h>
4+
#include <stdlib.h>
5+
#include <stdio.h>
6+
#include <string.h>
7+
8+
#include <string>
9+
#include <chrono>
10+
11+
#ifdef GGML_GRAPH_PROFILER
12+
13+
struct ggml_profile_output {
14+
const char * prefix;
15+
FILE * stream;
16+
};
17+
18+
extern "C" void ggml_graph_profile_init(struct ggml_cgraph *cg, int n_threads)
19+
{
20+
// TODO: make this a param
21+
const char *env = getenv("GGML_GRAPH_PROFILE");
22+
if (!env) { return; }
23+
24+
// The number of threads may change between passes (pp vs tg).
25+
// Allocate for max_n_threads for simplicity for now.
26+
// TODO: use aligned allocator
27+
28+
size_t node_size = sizeof(struct ggml_profile_timing) * GGML_MAX_N_THREADS;
29+
size_t pvec_size = sizeof(std::intptr_t) * cg->n_nodes;
30+
size_t time_size = node_size * cg->n_nodes;
31+
size_t t_size = pvec_size + time_size + sizeof(ggml_profile_output) + sizeof(ggml_profile_data);
32+
33+
uint8_t * ptr = (uint8_t *) malloc(t_size);
34+
if (!ptr) {
35+
fprintf(stderr, "ggml-profile: failed to allocate profiling data : n_threads %d n_nodes %d\n", n_threads, cg->n_nodes);
36+
return;
37+
}
38+
memset(ptr, 0, t_size);
39+
40+
// init all pointers
41+
cg->prof = (ggml_profile_data *) ptr; ptr += sizeof(ggml_profile_data);
42+
cg->prof->output = (ggml_profile_output *) ptr; ptr += sizeof(ggml_profile_output);
43+
cg->prof->timing = (ggml_profile_timing **) ptr; ptr += pvec_size;
44+
for (int i=0; i < cg->n_nodes; i++) {
45+
cg->prof->timing[i] = (struct ggml_profile_timing *) ptr; ptr += node_size;
46+
}
47+
48+
// init the output
49+
ggml_profile_output *out = cg->prof->output;
50+
if (!strcmp("stderr", env) || !strcmp("1", env)) {
51+
out->prefix = "ggml-profile:";
52+
out->stream = stderr;
53+
} else {
54+
out->prefix = "";
55+
out->stream = fopen(env, "w");
56+
}
57+
58+
}
59+
60+
extern "C" void ggml_graph_profile_start(struct ggml_cgraph *cg, int n_threads)
61+
{
62+
if (!cg->prof) { ggml_graph_profile_init(cg, n_threads); }
63+
if (!cg->prof) { return; }
64+
}
65+
66+
static inline int ggml_profile_format_tensor_dims(char *str, struct ggml_tensor *t)
67+
{
68+
return sprintf(str, "%d:%d:%d:%d",
69+
(int) t->ne[0], (int) t->ne[1], (int) t->ne[3], (int) t->ne[3]);
70+
}
71+
72+
static inline void ggml_profile_format_op_dims(char *str, struct ggml_tensor *t)
73+
{
74+
char *p = str;
75+
76+
// append src0 and src1 (if any)
77+
if (t->src[0]) {
78+
p += ggml_profile_format_tensor_dims(p, t->src[0]);
79+
80+
for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
81+
p += sprintf(p, " x ");
82+
p += ggml_profile_format_tensor_dims(p, t->src[i]);
83+
}
84+
85+
p += sprintf(p, " -> ");
86+
}
87+
88+
// format self dims separately for better visual alignment
89+
char self[64];
90+
ggml_profile_format_tensor_dims(self, t);
91+
92+
p += sprintf(p, "%12s", self);
93+
}
94+
95+
static inline void ggml_profile_format_op_types(char *str, struct ggml_tensor *t)
96+
{
97+
char *p = str;
98+
99+
// append src0 and src1 (if any)
100+
if (t->src[0]) {
101+
p += sprintf(p, "%s", ggml_type_name(t->src[0]->type));
102+
103+
for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
104+
p += sprintf(p, " x ");
105+
p += sprintf(p, "%s", ggml_type_name(t->src[i]->type));
106+
}
107+
108+
p += sprintf(p, " -> ");
109+
}
110+
111+
p += sprintf(p, "%3s", ggml_type_name(t->type));
112+
}
113+
114+
extern "C" void ggml_graph_profile_finish(struct ggml_cgraph *cg, int n_threads)
115+
{
116+
if (!cg->prof) { return; }
117+
118+
ggml_profile_output *out = cg->prof->output;
119+
120+
fprintf(out->stream, "%s| node idx | op name | proc (nsec) | sync (nsec) | total (nsec) | op dims | op types | tensor name |\n", out->prefix);
121+
fprintf(out->stream, "%s| -------: | :------ | ----------: | ----------: | -----------: | ------: | -------: | ----------: |\n", out->prefix);
122+
123+
char dims[64 * GGML_MAX_SRC];
124+
char types[16 * GGML_MAX_SRC];
125+
126+
for (int i = 0; i < cg->n_nodes; i++) {
127+
uint64_t p_nsec = 0;
128+
uint64_t s_nsec = 0;
129+
uint64_t t_nsec = 0;
130+
131+
// add up per thread counters and reset them
132+
for (int t=0; t < n_threads; t++) {
133+
ggml_profile_timing &timing = cg->prof->timing[i][t];
134+
135+
p_nsec += timing.nsec[GGML_PROF_OP_SYNC] - timing.nsec[GGML_PROF_OP_START];
136+
s_nsec += timing.nsec[GGML_PROF_OP_END] - timing.nsec[GGML_PROF_OP_SYNC];
137+
t_nsec += timing.nsec[GGML_PROF_OP_END] - timing.nsec[GGML_PROF_OP_START];
138+
139+
timing.nsec[GGML_PROF_OP_START] = 0;
140+
timing.nsec[GGML_PROF_OP_SYNC] = 0;
141+
timing.nsec[GGML_PROF_OP_END] = 0;
142+
}
143+
144+
ggml_profile_format_op_dims(dims, cg->nodes[i]);
145+
ggml_profile_format_op_types(types, cg->nodes[i]);
146+
147+
fprintf(out->stream, "%s| %04d | %10s | %10lu | %10lu | %10lu | %46s | %22s | %20s |\n", out->prefix,
148+
i, ggml_op_name(cg->nodes[i]->op),
149+
(unsigned long) p_nsec, (unsigned long) s_nsec, (unsigned long) t_nsec,
150+
dims, types, cg->nodes[i]->name);
151+
}
152+
fprintf(out->stream, "%s \n", out->prefix); // empty line to split tables
153+
}
154+
155+
extern "C" void ggml_graph_profile_free(struct ggml_cgraph *cg)
156+
{
157+
if (!cg->prof) { return; }
158+
159+
ggml_profile_output *out = cg->prof->output;
160+
if (out->stream != stderr) {
161+
fclose(out->stream);
162+
}
163+
164+
free(cg->prof); cg->prof = nullptr;
165+
}
166+
167+
extern "C" void ggml_graph_profile_event(const struct ggml_cgraph *cg, enum ggml_profile_event e, int node_n, int ith)
168+
{
169+
if (!cg->prof) { return; }
170+
171+
using clock = std::chrono::high_resolution_clock;
172+
173+
ggml_profile_timing &timing = cg->prof->timing[node_n][ith];
174+
timing.nsec[e] = std::chrono::nanoseconds(clock::now().time_since_epoch()).count();
175+
}
176+
177+
#endif // GGML_GRAPH_PROFILER

ggml/src/ggml-profile.h

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
#pragma once
2+
3+
#include "ggml-impl.h"
4+
5+
// GGML internal header
6+
7+
#ifdef __cplusplus
8+
extern "C" {
9+
#endif
10+
11+
// op profile events & timing (per op / per thread)
12+
enum ggml_profile_event {
13+
GGML_PROF_OP_START,
14+
GGML_PROF_OP_SYNC,
15+
GGML_PROF_OP_END
16+
};
17+
18+
struct ggml_profile_timing {
19+
uint64_t nsec[GGML_PROF_OP_END + 1]; // event times in nsec
20+
};
21+
22+
struct ggml_profile_output;
23+
24+
struct ggml_profile_data {
25+
struct ggml_profile_output *output;
26+
struct ggml_profile_timing ** timing; // per op / per thread timing
27+
};
28+
29+
// check if profiling is enabled for this graph
30+
static inline bool ggml_graph_profile_enabled(const struct ggml_cgraph *cg)
31+
{
32+
return cg->prof != NULL;
33+
}
34+
35+
// get pointer to the timing data for specific node / thread
36+
// can be used by the backends to populate data collected internally
37+
static inline struct ggml_profile_timing * ggml_graph_profile_timing(const struct ggml_cgraph *cg, int node_n, int ith)
38+
{
39+
if (!cg->prof) { return NULL; }
40+
return &cg->prof->timing[node_n][ith];
41+
}
42+
43+
#ifndef GGML_GRAPH_PROFILER
44+
45+
// Stub out all profiler functions
46+
47+
static inline void ggml_graph_profile_init(struct ggml_cgraph *cg, int n_threads)
48+
{
49+
GGML_UNUSED(cg);
50+
GGML_UNUSED(n_threads);
51+
}
52+
53+
static inline void ggml_graph_profile_start(struct ggml_cgraph *cg, int n_threads)
54+
{
55+
GGML_UNUSED(cg);
56+
GGML_UNUSED(n_threads);
57+
}
58+
59+
static inline void ggml_graph_profile_finish(struct ggml_cgraph *cg, int n_threads)
60+
{
61+
GGML_UNUSED(cg);
62+
GGML_UNUSED(n_threads);
63+
}
64+
65+
static inline void ggml_graph_profile_free(struct ggml_cgraph *cg)
66+
{
67+
GGML_UNUSED(cg);
68+
}
69+
70+
static inline void ggml_graph_profile_event(const struct ggml_cgraph *cg, enum ggml_profile_event e, int node_n, int ith)
71+
{
72+
GGML_UNUSED(cg);
73+
GGML_UNUSED(e);
74+
GGML_UNUSED(node_n);
75+
GGML_UNUSED(ith);
76+
}
77+
78+
#else
79+
80+
void ggml_graph_profile_init(struct ggml_cgraph *cg, int n_threads);
81+
void ggml_graph_profile_start(struct ggml_cgraph *cg, int n_threads);
82+
void ggml_graph_profile_finish(struct ggml_cgraph *cg, int n_threads);
83+
void ggml_graph_profile_free(struct ggml_cgraph *cg);
84+
void ggml_graph_profile_event(const struct ggml_cgraph *cg, enum ggml_profile_event e, int node_n, int ith);
85+
86+
#endif // GGML_GRAPH_PROFILER
87+
88+
#ifdef __cplusplus
89+
}
90+
#endif

ggml/src/ggml.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
// FIXME: required here for quantization functions
1010
#include "ggml-quants.h"
11+
#include "ggml-profile.h"
1112

1213
#ifdef GGML_USE_CPU_HBM
1314
#include <hbwmalloc.h>
@@ -5884,6 +5885,7 @@ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t siz
58845885
/*.grads =*/ grads_ptr,
58855886
/*.grad_accs =*/ grad_accs_ptr,
58865887
/*.leafs =*/ leafs_ptr,
5888+
/*.prof =*/ NULL,
58875889
/*.hash_table =*/ { hash_size, hash_used, hash_keys_ptr },
58885890
/*.order =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
58895891
};
@@ -5910,6 +5912,7 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1)
59105912
/*.grads =*/ NULL, // gradients would need visited_hash_set
59115913
/*.grad_accs =*/ NULL,
59125914
/*.leafs =*/ NULL,
5915+
/*.prof =*/ NULL,
59135916
/*.visited_hash_set =*/ { 0, NULL, NULL },
59145917
/*.order =*/ cgraph0->order,
59155918
};

0 commit comments

Comments
 (0)