Skip to content

Commit 8d76b0c

Browse files
profiler: add support for different outputs
1 parent 4578c37 commit 8d76b0c

File tree

4 files changed

+105
-48
lines changed

4 files changed

+105
-48
lines changed

ggml/src/ggml-impl.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -176,7 +176,7 @@ struct ggml_cgraph {
176176
struct ggml_tensor ** grads;
177177
struct ggml_tensor ** leafs;
178178

179-
struct ggml_profile_data ** prof;
179+
struct ggml_profile_data * prof;
180180

181181
struct ggml_hash_set visited_hash_set;
182182

ggml/src/ggml-profile.cpp

Lines changed: 60 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -2,42 +2,63 @@
22

33
#include <stdint.h>
44
#include <stdlib.h>
5+
#include <stdio.h>
56

7+
#include <string>
68
#include <chrono>
79

810
#ifdef GGML_GRAPH_PROFILER
911

10-
extern "C" void ggml_profile_graph_init(struct ggml_cgraph *cg, int n_threads)
12+
struct ggml_profile_output {
13+
const char * prefix;
14+
FILE * stream;
15+
};
16+
17+
extern "C" void ggml_graph_profile_init(struct ggml_cgraph *cg, int n_threads)
1118
{
12-
if (!getenv("GGML_GRAPH_PROFILE")) { return; }
19+
// TODO: make this a param
20+
const char *env = getenv("GGML_GRAPH_PROFILE");
21+
if (!env) { return; }
1322

1423
// The number of threads may change between passes (pp vs tg).
1524
// Allocate for max_n_threads for simplicity for now.
1625
// TODO: use aligned allocator
1726

18-
size_t node_size = sizeof(struct ggml_profile_data) * GGML_MAX_N_THREADS;
27+
size_t node_size = sizeof(struct ggml_profile_timing) * GGML_MAX_N_THREADS;
1928
size_t pvec_size = sizeof(std::intptr_t) * cg->n_nodes;
20-
size_t data_size = node_size * cg->n_nodes;
21-
size_t t_size = pvec_size + data_size;
29+
size_t time_size = node_size * cg->n_nodes;
30+
size_t t_size = pvec_size + time_size + sizeof(ggml_profile_output) + sizeof(ggml_profile_data);
2231

23-
cg->prof = (struct ggml_profile_data **) malloc(t_size);
24-
if (!cg->prof) {
32+
uint8_t * ptr = (uint8_t *) malloc(t_size);
33+
if (!ptr) {
2534
fprintf(stderr, "ggml-profile: failed to allocate profiling data : n_threads %d n_nodes %d\n", n_threads, cg->n_nodes);
2635
return;
2736
}
37+
memset(ptr, 0, t_size);
2838

29-
memset(cg->prof, 0, t_size);
30-
31-
// init pre-thread pointers
32-
uint8_t * data = (uint8_t *) cg->prof + pvec_size;
39+
// init all pointers
40+
cg->prof = (ggml_profile_data *) ptr; ptr += sizeof(ggml_profile_data);
41+
cg->prof->output = (ggml_profile_output *) ptr; ptr += sizeof(ggml_profile_output);
42+
cg->prof->timing = (ggml_profile_timing **) ptr; ptr += pvec_size;
3343
for (int i=0; i < cg->n_nodes; i++) {
34-
cg->prof[i] = (struct ggml_profile_data *) data; data += node_size;
44+
cg->prof->timing[i] = (struct ggml_profile_timing *) ptr; ptr += node_size;
45+
}
46+
47+
// init the output
48+
ggml_profile_output *out = cg->prof->output;
49+
if (!strcmp("stderr", env) || !strcmp("1", env)) {
50+
out->prefix = "ggml-profile:";
51+
out->stream = stderr;
52+
} else {
53+
out->prefix = "";
54+
out->stream = fopen(env, "w");
3555
}
56+
3657
}
3758

38-
extern "C" void ggml_profile_graph_start(struct ggml_cgraph *cg, int n_threads)
59+
extern "C" void ggml_graph_profile_start(struct ggml_cgraph *cg, int n_threads)
3960
{
40-
if (!cg->prof) { ggml_profile_graph_init(cg, n_threads); }
61+
if (!cg->prof) { ggml_graph_profile_init(cg, n_threads); }
4162
if (!cg->prof) { return; }
4263
}
4364

@@ -89,13 +110,14 @@ static inline void ggml_profile_format_op_types(char *str, struct ggml_tensor *t
89110
p += sprintf(p, "%3s", ggml_type_name(t->type));
90111
}
91112

92-
93-
extern "C" void ggml_profile_graph_finish(struct ggml_cgraph *cg, int n_threads)
113+
extern "C" void ggml_graph_profile_finish(struct ggml_cgraph *cg, int n_threads)
94114
{
95115
if (!cg->prof) { return; }
96116

97-
fprintf(stderr, "ggml-profile: | node idx | op name | proc (nsec) | sync (nsec) | total (nsec) | op dims | op types | tensor name |\n");
98-
fprintf(stderr, "ggml-profile: | -------: | :------ | ----------: | ----------: | -----------: | ------: | -------: | ----------: |\n");
117+
ggml_profile_output *out = cg->prof->output;
118+
119+
fprintf(out->stream, "%s| node idx | op name | proc (nsec) | sync (nsec) | total (nsec) | op dims | op types | tensor name |\n", out->prefix);
120+
fprintf(out->stream, "%s| -------: | :------ | ----------: | ----------: | -----------: | ------: | -------: | ----------: |\n", out->prefix);
99121

100122
char dims[64 * GGML_MAX_SRC];
101123
char types[16 * GGML_MAX_SRC];
@@ -107,39 +129,48 @@ extern "C" void ggml_profile_graph_finish(struct ggml_cgraph *cg, int n_threads)
107129

108130
// add up per thread counters and reset them
109131
for (int t=0; t < n_threads; t++) {
110-
p_nsec += cg->prof[i][t].nsec[GGML_PROF_OP_SYNC] - cg->prof[i][t].nsec[GGML_PROF_OP_START];
111-
s_nsec += cg->prof[i][t].nsec[GGML_PROF_OP_END] - cg->prof[i][t].nsec[GGML_PROF_OP_SYNC];
112-
t_nsec += cg->prof[i][t].nsec[GGML_PROF_OP_END] - cg->prof[i][t].nsec[GGML_PROF_OP_START];
132+
ggml_profile_timing &timing = cg->prof->timing[i][t];
133+
134+
p_nsec += timing.nsec[GGML_PROF_OP_SYNC] - timing.nsec[GGML_PROF_OP_START];
135+
s_nsec += timing.nsec[GGML_PROF_OP_END] - timing.nsec[GGML_PROF_OP_SYNC];
136+
t_nsec += timing.nsec[GGML_PROF_OP_END] - timing.nsec[GGML_PROF_OP_START];
113137

114-
cg->prof[i][t].nsec[GGML_PROF_OP_START] = 0;
115-
cg->prof[i][t].nsec[GGML_PROF_OP_SYNC] = 0;
116-
cg->prof[i][t].nsec[GGML_PROF_OP_END] = 0;
138+
timing.nsec[GGML_PROF_OP_START] = 0;
139+
timing.nsec[GGML_PROF_OP_SYNC] = 0;
140+
timing.nsec[GGML_PROF_OP_END] = 0;
117141
}
118142

119143
ggml_profile_format_op_dims(dims, cg->nodes[i]);
120144
ggml_profile_format_op_types(types, cg->nodes[i]);
121145

122-
fprintf(stderr, "ggml-profile: | %04d | %10s | %10lu | %10lu | %10lu | %46s | %22s | %20s |\n",
146+
fprintf(out->stream, "%s| %04d | %10s | %10lu | %10lu | %10lu | %46s | %22s | %20s |\n", out->prefix,
123147
i, ggml_op_name(cg->nodes[i]->op),
124148
(unsigned long) p_nsec, (unsigned long) s_nsec, (unsigned long) t_nsec,
125149
dims, types, cg->nodes[i]->name);
126150
}
127-
fprintf(stderr, "ggml-profile: \n"); // empty line to split tables
151+
fprintf(out->stream, "%s \n", out->prefix); // empty line to split tables
128152
}
129153

130-
extern "C" void ggml_profile_graph_free(struct ggml_cgraph *cg)
154+
extern "C" void ggml_graph_profile_free(struct ggml_cgraph *cg)
131155
{
132156
if (!cg->prof) { return; }
133157

158+
ggml_profile_output *out = cg->prof->output;
159+
if (out->stream != stderr) {
160+
fclose(out->stream);
161+
}
162+
134163
free(cg->prof); cg->prof = nullptr;
135164
}
136165

137-
extern "C" void ggml_profile_op_event(const struct ggml_cgraph *cg, enum ggml_profile_event e, int node_n, int ith)
166+
extern "C" void ggml_graph_profile_event(const struct ggml_cgraph *cg, enum ggml_profile_event e, int node_n, int ith)
138167
{
139168
if (!cg->prof) { return; }
140169

141170
using clock = std::chrono::high_resolution_clock;
142-
cg->prof[node_n][ith].nsec[e] = std::chrono::nanoseconds(clock::now().time_since_epoch()).count();
171+
172+
ggml_profile_timing &timing = cg->prof->timing[node_n][ith];
173+
timing.nsec[e] = std::chrono::nanoseconds(clock::now().time_since_epoch()).count();
143174
}
144175

145176
#endif // GGML_GRAPH_PROFILER

ggml/src/ggml-profile.h

Lines changed: 33 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -8,45 +8,66 @@
88
extern "C" {
99
#endif
1010

11-
// op profile data (per op / per thread)
11+
// op profile events & timing (per op / per thread)
1212
enum ggml_profile_event {
1313
GGML_PROF_OP_START,
1414
GGML_PROF_OP_SYNC,
1515
GGML_PROF_OP_END
1616
};
1717

18-
struct ggml_profile_data {
18+
struct ggml_profile_timing {
1919
uint64_t nsec[GGML_PROF_OP_END + 1]; // event times in nsec
2020
};
2121

22+
struct ggml_profile_output;
23+
24+
struct ggml_profile_data {
25+
struct ggml_profile_output *output;
26+
struct ggml_profile_timing ** timing; // per op / per thread timing
27+
};
28+
29+
// check if profiling is enabled for this graph
30+
static inline bool ggml_graph_profile_enabled(const struct ggml_cgraph *cg)
31+
{
32+
return cg->prof != NULL;
33+
}
34+
35+
// get pointer to the timing data for specific node / thread
36+
// can be used by the backends to populate data collected internally
37+
static inline struct ggml_profile_timing * ggml_graph_profile_timing(const struct ggml_cgraph *cg, int node_n, int ith)
38+
{
39+
if (!cg->prof) { return NULL; }
40+
return &cg->prof->timing[node_n][ith];
41+
}
42+
2243
#ifndef GGML_GRAPH_PROFILER
2344

2445
// Stub out all profiler functions
2546

26-
static inline void ggml_profile_graph_init(struct ggml_cgraph *cg, int n_threads)
47+
static inline void ggml_graph_profile_init(struct ggml_cgraph *cg, int n_threads)
2748
{
2849
GGML_UNUSED(cg);
2950
GGML_UNUSED(n_threads);
3051
}
3152

32-
static inline void ggml_profile_graph_start(struct ggml_cgraph *cg, int n_threads)
53+
static inline void ggml_graph_profile_start(struct ggml_cgraph *cg, int n_threads)
3354
{
3455
GGML_UNUSED(cg);
3556
GGML_UNUSED(n_threads);
3657
}
3758

38-
static inline void ggml_profile_graph_finish(struct ggml_cgraph *cg, int n_threads)
59+
static inline void ggml_graph_profile_finish(struct ggml_cgraph *cg, int n_threads)
3960
{
4061
GGML_UNUSED(cg);
4162
GGML_UNUSED(n_threads);
4263
}
4364

44-
static inline void ggml_profile_graph_free(struct ggml_cgraph *cg)
65+
static inline void ggml_graph_profile_free(struct ggml_cgraph *cg)
4566
{
4667
GGML_UNUSED(cg);
4768
}
4869

49-
static inline void ggml_profile_op_event(const struct ggml_cgraph *cg, enum ggml_profile_event e, int node_n, int ith)
70+
static inline void ggml_graph_profile_event(const struct ggml_cgraph *cg, enum ggml_profile_event e, int node_n, int ith)
5071
{
5172
GGML_UNUSED(cg);
5273
GGML_UNUSED(e);
@@ -56,11 +77,11 @@ static inline void ggml_profile_op_event(const struct ggml_cgraph *cg, enum ggml
5677

5778
#else
5879

59-
void ggml_profile_graph_init(struct ggml_cgraph *cg, int n_threads);
60-
void ggml_profile_graph_start(struct ggml_cgraph *cg, int n_threads);
61-
void ggml_profile_graph_finish(struct ggml_cgraph *cg, int n_threads);
62-
void ggml_profile_graph_free(struct ggml_cgraph *cg);
63-
void ggml_profile_op_event(const struct ggml_cgraph *cg, enum ggml_profile_event e, int node_n, int ith);
80+
void ggml_graph_profile_init(struct ggml_cgraph *cg, int n_threads);
81+
void ggml_graph_profile_start(struct ggml_cgraph *cg, int n_threads);
82+
void ggml_graph_profile_finish(struct ggml_cgraph *cg, int n_threads);
83+
void ggml_graph_profile_free(struct ggml_cgraph *cg);
84+
void ggml_graph_profile_event(const struct ggml_cgraph *cg, enum ggml_profile_event e, int node_n, int ith);
6485

6586
#endif // GGML_GRAPH_PROFILER
6687

ggml/src/ggml.c

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20232,7 +20232,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
2023220232
for (int node_n = 0; node_n < cgraph->n_nodes && !tp->abort; node_n++) {
2023320233
struct ggml_tensor * node = cgraph->nodes[node_n];
2023420234

20235-
ggml_profile_op_event(cgraph, GGML_PROF_OP_START, node_n, state->ith);
20235+
ggml_graph_profile_event(cgraph, GGML_PROF_OP_START, node_n, state->ith);
2023620236

2023720237
ggml_compute_forward(&params, node);
2023820238

@@ -20242,11 +20242,16 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
2024220242
tp->ec = GGML_STATUS_ABORTED;
2024320243
}
2024420244

20245-
ggml_profile_op_event(cgraph, GGML_PROF_OP_SYNC, node_n, state->ith);
20245+
ggml_graph_profile_event(cgraph, GGML_PROF_OP_SYNC, node_n, state->ith);
2024620246

2024720247
ggml_barrier(state->threadpool);
2024820248

20249-
ggml_profile_op_event(cgraph, GGML_PROF_OP_END, node_n, state->ith);
20249+
ggml_graph_profile_event(cgraph, GGML_PROF_OP_END, node_n, state->ith);
20250+
}
20251+
20252+
if (ggml_graph_profile_enabled(cgraph)) {
20253+
// need another barrier to flush the last timing update
20254+
ggml_barrier(state->threadpool);
2025020255
}
2025120256

2025220257
return 0;
@@ -20519,7 +20524,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
2051920524
threadpool->ec = GGML_STATUS_SUCCESS;
2052020525
}
2052120526

20522-
ggml_profile_graph_start(cgraph, n_threads);
20527+
ggml_graph_profile_start(cgraph, n_threads);
2052320528

2052420529
#ifdef GGML_USE_OPENMP
2052520530
if (n_threads > 1) {
@@ -20551,6 +20556,8 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
2055120556
ggml_graph_compute_thread(&threadpool->workers[0]);
2055220557
#endif
2055320558

20559+
ggml_graph_profile_finish(cgraph, n_threads);
20560+
2055420561
// don't leave affinity set on the main thread
2055520562
clear_numa_thread_affinity();
2055620563

@@ -20560,8 +20567,6 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
2056020567
ggml_threadpool_free(threadpool);
2056120568
}
2056220569

20563-
ggml_profile_graph_finish(cgraph, n_threads);
20564-
2056520570
return ret;
2056620571
}
2056720572

0 commit comments

Comments
 (0)