Skip to content

Commit ca40774

Browse files
profiler: initial support for profiling graph ops
1 parent 7675c55 commit ca40774

File tree

7 files changed

+292
-0
lines changed

7 files changed

+292
-0
lines changed

ggml/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,7 @@ option(GGML_CCACHE "ggml: use ccache if available" ON)
8484
option(GGML_ALL_WARNINGS "ggml: enable all compiler warnings" ON)
8585
option(GGML_ALL_WARNINGS_3RD_PARTY "ggml: enable all compiler warnings in 3rd party libs" OFF)
8686
option(GGML_GPROF "ggml: enable gprof" OFF)
87+
option(GGML_GRAPH_PROFILER "ggml: enable internal Graph and Op profiler" OFF)
8788

8889
# build
8990
option(GGML_FATAL_WARNINGS "ggml: enable -Werror flag" OFF)

ggml/src/CMakeLists.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,10 @@ if (CMAKE_SYSTEM_NAME MATCHES "Linux")
88
add_compile_definitions($<$<CONFIG:Debug>:_GLIBCXX_ASSERTIONS>)
99
endif()
1010

11+
if (GGML_GRAPH_PROFILER)
12+
add_compile_definitions(GGML_GRAPH_PROFILER)
13+
endif()
14+
1115
if (NOT MSVC)
1216
if (GGML_SANITIZE_THREAD)
1317
add_compile_options(-fsanitize=thread)
@@ -204,6 +208,8 @@ add_library(ggml-base
204208
ggml-threading.h
205209
ggml-quants.c
206210
ggml-quants.h
211+
ggml-profile.h
212+
ggml-profile.cpp
207213
gguf.cpp)
208214

209215
target_include_directories(ggml-base PRIVATE .)

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#include "binary-ops.h"
1414
#include "vec.h"
1515
#include "ops.h"
16+
#include "ggml-profile.h"
1617
#include "ggml.h"
1718

1819
#if defined(_MSC_VER) || defined(__MINGW32__)
@@ -2871,6 +2872,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
28712872
for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) {
28722873
struct ggml_tensor * node = cgraph->nodes[node_n];
28732874

2875+
ggml_graph_profile_event(cgraph, GGML_PROF_OP_START, node_n, state->ith);
2876+
28742877
ggml_compute_forward(&params, node);
28752878

28762879
if (state->ith == 0 && cplan->abort_callback &&
@@ -2879,9 +2882,13 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
28792882
tp->ec = GGML_STATUS_ABORTED;
28802883
}
28812884

2885+
ggml_graph_profile_event(cgraph, GGML_PROF_OP_SYNC, node_n, state->ith);
2886+
28822887
if (node_n + 1 < cgraph->n_nodes) {
28832888
ggml_barrier(state->threadpool);
28842889
}
2890+
2891+
ggml_graph_profile_event(cgraph, GGML_PROF_OP_END, node_n, state->ith);
28852892
}
28862893

28872894
ggml_barrier(state->threadpool);
@@ -3117,6 +3124,8 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
31173124
int n_threads = cplan->n_threads;
31183125
struct ggml_threadpool * threadpool = cplan->threadpool;
31193126

3127+
ggml_graph_profile_start(cgraph, n_threads);
3128+
31203129
bool disposable_threadpool = false;
31213130

31223131
if (threadpool == NULL) {
@@ -3168,6 +3177,8 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
31683177
// don't leave affinity set on the main thread
31693178
clear_numa_thread_affinity();
31703179

3180+
ggml_graph_profile_finish(cgraph, n_threads);
3181+
31713182
enum ggml_status ret = threadpool->ec;
31723183

31733184
if (disposable_threadpool) {

ggml/src/ggml-impl.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -292,6 +292,8 @@ enum ggml_cgraph_eval_order {
292292
GGML_CGRAPH_EVAL_ORDER_COUNT
293293
};
294294

295+
struct ggml_profile_data;
296+
295297
struct ggml_cgraph {
296298
int size; // maximum number of nodes/leafs/grads/grad_accs
297299
int n_nodes; // number of nodes currently in use
@@ -302,6 +304,8 @@ struct ggml_cgraph {
302304
struct ggml_tensor ** grad_accs; // accumulators for node gradients
303305
struct ggml_tensor ** leafs; // tensors with constant data
304306

307+
struct ggml_profile_data * prof;
308+
305309
struct ggml_hash_set visited_hash_set;
306310

307311
enum ggml_cgraph_eval_order order;

ggml/src/ggml-profile.cpp

Lines changed: 177 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,177 @@
1+
#include "ggml-profile.h"
2+
3+
#include <stdint.h>
4+
#include <stdlib.h>
5+
#include <stdio.h>
6+
#include <string.h>
7+
8+
#include <string>
9+
#include <chrono>
10+
11+
#ifdef GGML_GRAPH_PROFILER
12+
13+
struct ggml_profile_output {
14+
const char * prefix;
15+
FILE * stream;
16+
};
17+
18+
extern "C" void ggml_graph_profile_init(struct ggml_cgraph *cg, int n_threads)
19+
{
20+
// TODO: make this a param
21+
const char *env = getenv("GGML_GRAPH_PROFILE");
22+
if (!env) { return; }
23+
24+
// The number of threads may change between passes (pp vs tg).
25+
// Allocate for max_n_threads for simplicity for now.
26+
// TODO: use aligned allocator
27+
28+
size_t node_size = sizeof(struct ggml_profile_timing) * GGML_MAX_N_THREADS;
29+
size_t pvec_size = sizeof(std::intptr_t) * cg->n_nodes;
30+
size_t time_size = node_size * cg->n_nodes;
31+
size_t t_size = pvec_size + time_size + sizeof(ggml_profile_output) + sizeof(ggml_profile_data);
32+
33+
uint8_t * ptr = (uint8_t *) malloc(t_size);
34+
if (!ptr) {
35+
fprintf(stderr, "ggml-profile: failed to allocate profiling data : n_threads %d n_nodes %d\n", n_threads, cg->n_nodes);
36+
return;
37+
}
38+
memset(ptr, 0, t_size);
39+
40+
// init all pointers
41+
cg->prof = (ggml_profile_data *) ptr; ptr += sizeof(ggml_profile_data);
42+
cg->prof->output = (ggml_profile_output *) ptr; ptr += sizeof(ggml_profile_output);
43+
cg->prof->timing = (ggml_profile_timing **) ptr; ptr += pvec_size;
44+
for (int i=0; i < cg->n_nodes; i++) {
45+
cg->prof->timing[i] = (struct ggml_profile_timing *) ptr; ptr += node_size;
46+
}
47+
48+
// init the output
49+
ggml_profile_output *out = cg->prof->output;
50+
if (!strcmp("stderr", env) || !strcmp("1", env)) {
51+
out->prefix = "ggml-profile:";
52+
out->stream = stderr;
53+
} else {
54+
out->prefix = "";
55+
out->stream = fopen(env, "w");
56+
}
57+
58+
}
59+
60+
extern "C" void ggml_graph_profile_start(struct ggml_cgraph *cg, int n_threads)
61+
{
62+
if (!cg->prof) { ggml_graph_profile_init(cg, n_threads); }
63+
if (!cg->prof) { return; }
64+
}
65+
66+
static inline int ggml_profile_format_tensor_dims(char *str, struct ggml_tensor *t)
67+
{
68+
return sprintf(str, "%d:%d:%d:%d",
69+
(int) t->ne[0], (int) t->ne[1], (int) t->ne[3], (int) t->ne[3]);
70+
}
71+
72+
static inline void ggml_profile_format_op_dims(char *str, struct ggml_tensor *t)
73+
{
74+
char *p = str;
75+
76+
// append src0 and src1 (if any)
77+
if (t->src[0]) {
78+
p += ggml_profile_format_tensor_dims(p, t->src[0]);
79+
80+
for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
81+
p += sprintf(p, " x ");
82+
p += ggml_profile_format_tensor_dims(p, t->src[i]);
83+
}
84+
85+
p += sprintf(p, " -> ");
86+
}
87+
88+
// format self dims separately for better visual alignment
89+
char self[64];
90+
ggml_profile_format_tensor_dims(self, t);
91+
92+
p += sprintf(p, "%12s", self);
93+
}
94+
95+
static inline void ggml_profile_format_op_types(char *str, struct ggml_tensor *t)
96+
{
97+
char *p = str;
98+
99+
// append src0 and src1 (if any)
100+
if (t->src[0]) {
101+
p += sprintf(p, "%s", ggml_type_name(t->src[0]->type));
102+
103+
for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
104+
p += sprintf(p, " x ");
105+
p += sprintf(p, "%s", ggml_type_name(t->src[i]->type));
106+
}
107+
108+
p += sprintf(p, " -> ");
109+
}
110+
111+
p += sprintf(p, "%3s", ggml_type_name(t->type));
112+
}
113+
114+
extern "C" void ggml_graph_profile_finish(struct ggml_cgraph *cg, int n_threads)
115+
{
116+
if (!cg->prof) { return; }
117+
118+
ggml_profile_output *out = cg->prof->output;
119+
120+
fprintf(out->stream, "%s| node idx | op name | proc (nsec) | sync (nsec) | total (nsec) | op dims | op types | tensor name |\n", out->prefix);
121+
fprintf(out->stream, "%s| -------: | :------ | ----------: | ----------: | -----------: | ------: | -------: | ----------: |\n", out->prefix);
122+
123+
char dims[64 * GGML_MAX_SRC];
124+
char types[16 * GGML_MAX_SRC];
125+
126+
for (int i = 0; i < cg->n_nodes; i++) {
127+
uint64_t p_nsec = 0;
128+
uint64_t s_nsec = 0;
129+
uint64_t t_nsec = 0;
130+
131+
// add up per thread counters and reset them
132+
for (int t=0; t < n_threads; t++) {
133+
ggml_profile_timing &timing = cg->prof->timing[i][t];
134+
135+
p_nsec += timing.nsec[GGML_PROF_OP_SYNC] - timing.nsec[GGML_PROF_OP_START];
136+
s_nsec += timing.nsec[GGML_PROF_OP_END] - timing.nsec[GGML_PROF_OP_SYNC];
137+
t_nsec += timing.nsec[GGML_PROF_OP_END] - timing.nsec[GGML_PROF_OP_START];
138+
139+
timing.nsec[GGML_PROF_OP_START] = 0;
140+
timing.nsec[GGML_PROF_OP_SYNC] = 0;
141+
timing.nsec[GGML_PROF_OP_END] = 0;
142+
}
143+
144+
ggml_profile_format_op_dims(dims, cg->nodes[i]);
145+
ggml_profile_format_op_types(types, cg->nodes[i]);
146+
147+
fprintf(out->stream, "%s| %04d | %10s | %10lu | %10lu | %10lu | %46s | %22s | %20s |\n", out->prefix,
148+
i, ggml_op_name(cg->nodes[i]->op),
149+
(unsigned long) p_nsec, (unsigned long) s_nsec, (unsigned long) t_nsec,
150+
dims, types, cg->nodes[i]->name);
151+
}
152+
fprintf(out->stream, "%s \n", out->prefix); // empty line to split tables
153+
}
154+
155+
extern "C" void ggml_graph_profile_free(struct ggml_cgraph *cg)
156+
{
157+
if (!cg->prof) { return; }
158+
159+
ggml_profile_output *out = cg->prof->output;
160+
if (out->stream != stderr) {
161+
fclose(out->stream);
162+
}
163+
164+
free(cg->prof); cg->prof = nullptr;
165+
}
166+
167+
extern "C" void ggml_graph_profile_event(const struct ggml_cgraph *cg, enum ggml_profile_event e, int node_n, int ith)
168+
{
169+
if (!cg->prof) { return; }
170+
171+
using clock = std::chrono::high_resolution_clock;
172+
173+
ggml_profile_timing &timing = cg->prof->timing[node_n][ith];
174+
timing.nsec[e] = std::chrono::nanoseconds(clock::now().time_since_epoch()).count();
175+
}
176+
177+
#endif // GGML_GRAPH_PROFILER

ggml/src/ggml-profile.h

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
#pragma once
2+
3+
#include "ggml-impl.h"
4+
5+
// GGML internal header
6+
7+
#ifdef __cplusplus
8+
extern "C" {
9+
#endif
10+
11+
// op profile events & timing (per op / per thread)
12+
enum ggml_profile_event {
13+
GGML_PROF_OP_START,
14+
GGML_PROF_OP_SYNC,
15+
GGML_PROF_OP_END
16+
};
17+
18+
struct ggml_profile_timing {
19+
uint64_t nsec[GGML_PROF_OP_END + 1]; // event times in nsec
20+
};
21+
22+
struct ggml_profile_output;
23+
24+
struct ggml_profile_data {
25+
struct ggml_profile_output *output;
26+
struct ggml_profile_timing ** timing; // per op / per thread timing
27+
};
28+
29+
// check if profiling is enabled for this graph
30+
static inline bool ggml_graph_profile_enabled(const struct ggml_cgraph *cg)
31+
{
32+
return cg->prof != NULL;
33+
}
34+
35+
// get pointer to the timing data for specific node / thread
36+
// can be used by the backends to populate data collected internally
37+
static inline struct ggml_profile_timing * ggml_graph_profile_timing(const struct ggml_cgraph *cg, int node_n, int ith)
38+
{
39+
if (!cg->prof) { return NULL; }
40+
return &cg->prof->timing[node_n][ith];
41+
}
42+
43+
#ifndef GGML_GRAPH_PROFILER
44+
45+
// Stub out all profiler functions
46+
47+
static inline void ggml_graph_profile_init(struct ggml_cgraph *cg, int n_threads)
48+
{
49+
GGML_UNUSED(cg);
50+
GGML_UNUSED(n_threads);
51+
}
52+
53+
static inline void ggml_graph_profile_start(struct ggml_cgraph *cg, int n_threads)
54+
{
55+
GGML_UNUSED(cg);
56+
GGML_UNUSED(n_threads);
57+
}
58+
59+
static inline void ggml_graph_profile_finish(struct ggml_cgraph *cg, int n_threads)
60+
{
61+
GGML_UNUSED(cg);
62+
GGML_UNUSED(n_threads);
63+
}
64+
65+
static inline void ggml_graph_profile_free(struct ggml_cgraph *cg)
66+
{
67+
GGML_UNUSED(cg);
68+
}
69+
70+
static inline void ggml_graph_profile_event(const struct ggml_cgraph *cg, enum ggml_profile_event e, int node_n, int ith)
71+
{
72+
GGML_UNUSED(cg);
73+
GGML_UNUSED(e);
74+
GGML_UNUSED(node_n);
75+
GGML_UNUSED(ith);
76+
}
77+
78+
#else
79+
80+
void ggml_graph_profile_init(struct ggml_cgraph *cg, int n_threads);
81+
void ggml_graph_profile_start(struct ggml_cgraph *cg, int n_threads);
82+
void ggml_graph_profile_finish(struct ggml_cgraph *cg, int n_threads);
83+
void ggml_graph_profile_free(struct ggml_cgraph *cg);
84+
void ggml_graph_profile_event(const struct ggml_cgraph *cg, enum ggml_profile_event e, int node_n, int ith);
85+
86+
#endif // GGML_GRAPH_PROFILER
87+
88+
#ifdef __cplusplus
89+
}
90+
#endif

ggml/src/ggml.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
// FIXME: required here for quantization functions
1111
#include "ggml-quants.h"
12+
#include "ggml-profile.h"
1213

1314
#ifdef GGML_USE_CPU_HBM
1415
#include <hbwmalloc.h>
@@ -6012,6 +6013,7 @@ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t siz
60126013
/*.grads =*/ grads_ptr,
60136014
/*.grad_accs =*/ grad_accs_ptr,
60146015
/*.leafs =*/ leafs_ptr,
6016+
/*.prof =*/ NULL,
60156017
/*.hash_table =*/ { hash_size, hash_used, hash_keys_ptr },
60166018
/*.order =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
60176019
};
@@ -6038,6 +6040,7 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1)
60386040
/*.grads =*/ NULL, // gradients would need visited_hash_set
60396041
/*.grad_accs =*/ NULL,
60406042
/*.leafs =*/ NULL,
6043+
/*.prof =*/ NULL,
60416044
/*.visited_hash_set =*/ { 0, NULL, NULL },
60426045
/*.order =*/ cgraph0->order,
60436046
};

0 commit comments

Comments
 (0)