22
33#include < stdint.h>
44#include < stdlib.h>
5+ #include < stdio.h>
56
7+ #include < string>
68#include < chrono>
79
810#ifdef GGML_GRAPH_PROFILER
911
10- extern " C" void ggml_profile_graph_init (struct ggml_cgraph *cg, int n_threads)
// Describes where profiling tables are written and how each line is tagged.
struct ggml_profile_output {
    // Text printed at the start of every emitted line (passed to the leading "%s").
    const char * prefix;

    // Destination stream for the profiling tables (stderr or an opened file).
    FILE * stream;
};
16+
17+ extern " C" void ggml_graph_profile_init (struct ggml_cgraph *cg, int n_threads)
1118{
12- if (!getenv (" GGML_GRAPH_PROFILE" )) { return ; }
19+ // TODO: make this a param
20+ const char *env = getenv (" GGML_GRAPH_PROFILE" );
21+ if (!env) { return ; }
1322
1423 // The number of threads may change between passes (pp vs tg).
1524 // Allocate for max_n_threads for simplicity for now.
1625 // TODO: use aligned allocator
1726
18- size_t node_size = sizeof (struct ggml_profile_data ) * GGML_MAX_N_THREADS;
27+ size_t node_size = sizeof (struct ggml_profile_timing ) * GGML_MAX_N_THREADS;
1928 size_t pvec_size = sizeof (std::intptr_t ) * cg->n_nodes ;
20- size_t data_size = node_size * cg->n_nodes ;
21- size_t t_size = pvec_size + data_size ;
29+ size_t time_size = node_size * cg->n_nodes ;
30+ size_t t_size = pvec_size + time_size + sizeof (ggml_profile_output) + sizeof (ggml_profile_data) ;
2231
23- cg-> prof = (struct ggml_profile_data * *) malloc (t_size);
24- if (!cg-> prof ) {
32+ uint8_t * ptr = (uint8_t *) malloc (t_size);
33+ if (!ptr ) {
2534 fprintf (stderr, " ggml-profile: failed to allocate profiling data : n_threads %d n_nodes %d\n " , n_threads, cg->n_nodes );
2635 return ;
2736 }
37+ memset (ptr, 0 , t_size);
2838
29- memset (cg-> prof , 0 , t_size);
30-
31- // init pre-thread pointers
32- uint8_t * data = (uint8_t *) cg-> prof + pvec_size;
39+ // init all pointers
40+ cg-> prof = (ggml_profile_data *) ptr; ptr += sizeof (ggml_profile_data);
41+ cg-> prof -> output = (ggml_profile_output *) ptr; ptr += sizeof (ggml_profile_output);
42+ cg-> prof -> timing = (ggml_profile_timing **) ptr; ptr += pvec_size;
3343 for (int i=0 ; i < cg->n_nodes ; i++) {
34- cg->prof [i] = (struct ggml_profile_data *) data; data += node_size;
44+ cg->prof ->timing [i] = (struct ggml_profile_timing *) ptr; ptr += node_size;
45+ }
46+
47+ // init the output
48+ ggml_profile_output *out = cg->prof ->output ;
49+ if (!strcmp (" stderr" , env) || !strcmp (" 1" , env)) {
50+ out->prefix = " ggml-profile:" ;
51+ out->stream = stderr;
52+ } else {
53+ out->prefix = " " ;
54+ out->stream = fopen (env, " w" );
3555 }
56+
3657}
3758
38- extern " C" void ggml_profile_graph_start (struct ggml_cgraph *cg, int n_threads)
59+ extern " C" void ggml_graph_profile_start (struct ggml_cgraph *cg, int n_threads)
3960{
40- if (!cg->prof ) { ggml_profile_graph_init (cg, n_threads); }
61+ if (!cg->prof ) { ggml_graph_profile_init (cg, n_threads); }
4162 if (!cg->prof ) { return ; }
4263}
4364
@@ -89,13 +110,14 @@ static inline void ggml_profile_format_op_types(char *str, struct ggml_tensor *t
89110 p += sprintf (p, " %3s" , ggml_type_name (t->type ));
90111}
91112
92-
93- extern " C" void ggml_profile_graph_finish (struct ggml_cgraph *cg, int n_threads)
113+ extern " C" void ggml_graph_profile_finish (struct ggml_cgraph *cg, int n_threads)
94114{
95115 if (!cg->prof ) { return ; }
96116
97- fprintf (stderr, " ggml-profile: | node idx | op name | proc (nsec) | sync (nsec) | total (nsec) | op dims | op types | tensor name |\n " );
98- fprintf (stderr, " ggml-profile: | -------: | :------ | ----------: | ----------: | -----------: | ------: | -------: | ----------: |\n " );
117+ ggml_profile_output *out = cg->prof ->output ;
118+
119+ fprintf (out->stream , " %s| node idx | op name | proc (nsec) | sync (nsec) | total (nsec) | op dims | op types | tensor name |\n " , out->prefix );
120+ fprintf (out->stream , " %s| -------: | :------ | ----------: | ----------: | -----------: | ------: | -------: | ----------: |\n " , out->prefix );
99121
100122 char dims[64 * GGML_MAX_SRC];
101123 char types[16 * GGML_MAX_SRC];
@@ -107,39 +129,48 @@ extern "C" void ggml_profile_graph_finish(struct ggml_cgraph *cg, int n_threads)
107129
108130 // add up per thread counters and reset them
109131 for (int t=0 ; t < n_threads; t++) {
110- p_nsec += cg->prof [i][t].nsec [GGML_PROF_OP_SYNC] - cg->prof [i][t].nsec [GGML_PROF_OP_START];
111- s_nsec += cg->prof [i][t].nsec [GGML_PROF_OP_END] - cg->prof [i][t].nsec [GGML_PROF_OP_SYNC];
112- t_nsec += cg->prof [i][t].nsec [GGML_PROF_OP_END] - cg->prof [i][t].nsec [GGML_PROF_OP_START];
132+ ggml_profile_timing &timing = cg->prof ->timing [i][t];
133+
134+ p_nsec += timing.nsec [GGML_PROF_OP_SYNC] - timing.nsec [GGML_PROF_OP_START];
135+ s_nsec += timing.nsec [GGML_PROF_OP_END] - timing.nsec [GGML_PROF_OP_SYNC];
136+ t_nsec += timing.nsec [GGML_PROF_OP_END] - timing.nsec [GGML_PROF_OP_START];
113137
114- cg-> prof [i][t] .nsec [GGML_PROF_OP_START] = 0 ;
115- cg-> prof [i][t] .nsec [GGML_PROF_OP_SYNC] = 0 ;
116- cg-> prof [i][t] .nsec [GGML_PROF_OP_END] = 0 ;
138+ timing .nsec [GGML_PROF_OP_START] = 0 ;
139+ timing .nsec [GGML_PROF_OP_SYNC] = 0 ;
140+ timing .nsec [GGML_PROF_OP_END] = 0 ;
117141 }
118142
119143 ggml_profile_format_op_dims (dims, cg->nodes [i]);
120144 ggml_profile_format_op_types (types, cg->nodes [i]);
121145
122- fprintf (stderr , " ggml-profile: | %04d | %10s | %10lu | %10lu | %10lu | %46s | %22s | %20s |\n " ,
146+ fprintf (out-> stream , " %s | %04d | %10s | %10lu | %10lu | %10lu | %46s | %22s | %20s |\n " , out-> prefix ,
123147 i, ggml_op_name (cg->nodes [i]->op ),
124148 (unsigned long ) p_nsec, (unsigned long ) s_nsec, (unsigned long ) t_nsec,
125149 dims, types, cg->nodes [i]->name );
126150 }
127- fprintf (stderr , " ggml-profile: \n " ); // empty line to split tables
151+ fprintf (out-> stream , " %s \n " , out-> prefix ); // empty line to split tables
128152}
129153
130- extern " C" void ggml_profile_graph_free (struct ggml_cgraph *cg)
154+ extern " C" void ggml_graph_profile_free (struct ggml_cgraph *cg)
131155{
132156 if (!cg->prof ) { return ; }
133157
158+ ggml_profile_output *out = cg->prof ->output ;
159+ if (out->stream != stderr) {
160+ fclose (out->stream );
161+ }
162+
134163 free (cg->prof ); cg->prof = nullptr ;
135164}
136165
137- extern " C" void ggml_profile_op_event (const struct ggml_cgraph *cg, enum ggml_profile_event e, int node_n, int ith)
166+ extern " C" void ggml_graph_profile_event (const struct ggml_cgraph *cg, enum ggml_profile_event e, int node_n, int ith)
138167{
139168 if (!cg->prof ) { return ; }
140169
141170 using clock = std::chrono::high_resolution_clock;
142- cg->prof [node_n][ith].nsec [e] = std::chrono::nanoseconds (clock::now ().time_since_epoch ()).count ();
171+
172+ ggml_profile_timing &timing = cg->prof ->timing [node_n][ith];
173+ timing.nsec [e] = std::chrono::nanoseconds (clock::now ().time_since_epoch ()).count ();
143174}
144175
145176#endif // GGML_GRAPH_PROFILER
0 commit comments