1010#include < stdlib.h>
1111#include < stdio.h>
1212#include < string.h>
13+ #include < cstring>
1314#include < thread>
1415
// Abort the process with a file:line diagnostic when the profiled invariant
// `x` does not hold.
// Wrapped in do { } while (0) so the macro expands to exactly one statement:
// the original bare `if (!(x)) { ... }` form would swallow a following `else`
// (dangling-else) and misbehave inside unbraced if/else bodies.
#define PROFILE_ASSERT(x)                                                     \
    do {                                                                      \
        if (!(x)) {                                                           \
            fprintf(stderr, "Assert failed %s:%d\n", __FILE__, __LINE__);     \
            exit(1);                                                          \
        }                                                                     \
    } while (0)
1617
1718static thread_local uint64_t thread_id_hash;
1819static thread_local bool thread_id_initialized;
1920static cputrace_profiler g_profiler;
20- static HW_ctx* active_contexts[CPUTRACE_MAX_ANCHORS][CPUTRACE_MAX_THREADS] = {{nullptr }};
2121
2222struct read_format {
2323 uint64_t nr;
@@ -62,7 +62,7 @@ static void open_perf_fd(int& fd, uint64_t& id, struct perf_event_attr* pe, cons
6262 if (fd != -1 ) {
6363 ioctl (fd, PERF_EVENT_IOC_ID, &id);
6464 ioctl (fd, PERF_EVENT_IOC_RESET, 0 );
65- } else {
65+ } else {
6666 fprintf (stderr, " Failed to open perf event for %s: %s\n " , name, strerror (errno));
6767 id = 0 ;
6868 }
@@ -164,31 +164,22 @@ void HW_read(HW_ctx* ctx, sample_t* measure) {
164164 }
165165}
166166
167- static void read_perf_event (HW_ctx* ctx, cputrace_anchor* anchor) {
168- pthread_mutex_lock (&anchor->lock );
169- if (ctx->parent_fd != -1 ) {
170- sample_t measure;
171- HW_read (ctx, &measure);
172- if (measure.swi ) {
173- anchor->global_results .swi += measure.swi ;
174- }
175- if (measure.cyc ) {
176- anchor->global_results .cyc += measure.cyc ;
177- }
178- if (measure.cmiss ) {
179- anchor->global_results .cmiss += measure.cmiss ;
180- }
181- if (measure.bmiss ) {
182- anchor->global_results .bmiss += measure.bmiss ;
183- }
184- if (measure.ins ) {
185- anchor->global_results .ins += measure.ins ;
186- }
187- if (ioctl (ctx->parent_fd , PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP) != 0 ) {
188- fprintf (stderr, " Failed to reset perf counters: %s\n " , strerror (errno));
189- }
167+ static void collect_samples (sample_t * start, sample_t * end, cputrace_anchor* anchor) {
168+ if (end->swi ) {
169+ anchor->global_results .swi += end->swi - start->swi ;
170+ }
171+ if (end->cyc ) {
172+ anchor->global_results .cyc += end->cyc - start->cyc ;
173+ }
174+ if (end->cmiss ) {
175+ anchor->global_results .cmiss += end->cmiss - start->cmiss ;
176+ }
177+ if (end->bmiss ) {
178+ anchor->global_results .bmiss += end->bmiss - start->bmiss ;
179+ }
180+ if (end->ins ) {
181+ anchor->global_results .ins += end->ins - start->ins ;
190182 }
191- pthread_mutex_unlock (&anchor->lock );
192183}
193184
194185HW_profile::HW_profile (const char * function, uint64_t index, uint64_t flags)
@@ -199,31 +190,44 @@ HW_profile::HW_profile(const char* function, uint64_t index, uint64_t flags)
199190 uint64_t tid = get_thread_id ();
200191
201192 cputrace_anchor& anchor = g_profiler.anchors [index];
193+ pthread_mutex_lock (&anchor.lock );
202194 anchor.name = function;
203195 anchor.flags = flags;
204196
205- pthread_mutex_lock (&anchor.lock );
206197 anchor.global_results .call_count += 1 ;
207- pthread_mutex_unlock (&anchor.lock );
208-
209- ctx = HW_ctx_empty;
210- HW_init (&ctx, flags);
211198
212- pthread_mutex_lock (&g_profiler.global_lock );
213- active_contexts[index][tid] = &ctx;
214- pthread_mutex_unlock (&g_profiler.global_lock );
199+ if (anchor.active_contexts [tid] == nullptr ) {
200+ ctx = &anchor.per_thread_ctx [tid];
201+ *ctx = HW_ctx_empty;
202+ HW_init (ctx, flags);
203+ anchor.active_contexts [tid] = ctx;
204+ } else {
205+ ctx = anchor.active_contexts [tid];
206+ }
207+ anchor.nest_level [tid]++;
208+ if (anchor.nest_level [tid] == 1 ) {
209+ HW_read (ctx, &anchor.start [tid]);
210+ }
211+ anchor.is_capturing [tid] = true ;
212+ pthread_mutex_unlock (&anchor.lock );
215213}
216214
217215HW_profile::~HW_profile () {
218216 if (!g_profiler.profiling || index >= CPUTRACE_MAX_ANCHORS)
219217 return ;
220218
219+ cputrace_anchor& anchor = g_profiler.anchors [index];
221220 uint64_t tid = get_thread_id ();
222- pthread_mutex_lock (&g_profiler.global_lock );
223- read_perf_event (&ctx, &g_profiler.anchors [index]);
224- HW_clean (&ctx);
225- active_contexts[index][tid] = nullptr ;
226- pthread_mutex_unlock (&g_profiler.global_lock );
221+
222+ pthread_mutex_lock (&anchor.lock );
223+ anchor.nest_level [tid]--;
224+ if (anchor.nest_level [tid] == 0 ) {
225+ HW_read (ctx, &anchor.end [tid]);
226+ collect_samples (&anchor.start [tid], &anchor.end [tid], &anchor);
227+ std::memcpy (&anchor.start [tid], &anchor.end [tid], sizeof (anchor.start [tid]));
228+ anchor.is_capturing [tid] = false ;
229+ }
230+ pthread_mutex_unlock (&anchor.lock );
227231}
228232
229233void cputrace_start () {
@@ -309,16 +313,20 @@ void cputrace_dump(ceph::Formatter* f, const std::string& logger, const std::str
309313 bool dumped = false ;
310314
311315 for (int i = 0 ; i < CPUTRACE_MAX_ANCHORS; ++i) {
312- const auto & anchor = g_profiler.anchors [i];
316+ cputrace_anchor & anchor = g_profiler.anchors [i];
313317 if (!anchor.name || (!logger.empty () && anchor.name != logger)) {
314318 continue ;
315319 }
316320
321+ pthread_mutex_lock (&anchor.lock );
317322 for (int j = 0 ; j < CPUTRACE_MAX_THREADS; ++j) {
318- if (active_contexts[i][j]) {
319- read_perf_event (active_contexts[i][j], &g_profiler.anchors [i]);
323+ if (anchor.is_capturing [j]) {
324+ HW_read (anchor.active_contexts [j], &anchor.end [j]);
325+ collect_samples (&anchor.start [j], &anchor.end [j], &anchor);
326+ std::memcpy (&anchor.start [j], &anchor.end [j], sizeof (anchor.start [j]));
320327 }
321328 }
329+ pthread_mutex_unlock (&anchor.lock );
322330
323331 f->open_object_section (anchor.name );
324332 f->dump_unsigned (" call_count" , anchor.global_results .call_count );
@@ -363,16 +371,20 @@ void cputrace_print_to_stringstream(std::stringstream& ss) {
363371 bool dumped = false ;
364372
365373 for (int i = 0 ; i < CPUTRACE_MAX_ANCHORS; ++i) {
366- const auto & anchor = g_profiler.anchors [i];
374+ cputrace_anchor & anchor = g_profiler.anchors [i];
367375 if (!anchor.name ) {
368376 continue ;
369377 }
370378
379+ pthread_mutex_lock (&anchor.lock );
371380 for (int j = 0 ; j < CPUTRACE_MAX_THREADS; ++j) {
372- if (active_contexts[i][j]) {
373- read_perf_event (active_contexts[i][j], &g_profiler.anchors [i]);
381+ if (anchor.is_capturing [j]) {
382+ HW_read (anchor.active_contexts [j], &anchor.end [j]);
383+ collect_samples (&anchor.start [j], &anchor.end [j], &anchor);
384+ std::memcpy (&anchor.start [j], &anchor.end [j], sizeof (anchor.start [j]));
374385 }
375386 }
387+ pthread_mutex_unlock (&anchor.lock );
376388
377389 ss << " " << anchor.name << " :\n " ;
378390 ss << " call_count: " << anchor.global_results .call_count << " \n " ;
@@ -430,6 +442,7 @@ __attribute__((constructor)) static void cputrace_init() {
430442 fprintf (stderr, " Failed to initialize mutex for anchor %d: %s\n " , i, strerror (errno));
431443 exit (1 );
432444 }
445+
433446 }
434447 if (pthread_mutex_init (&g_profiler.global_lock , nullptr ) != 0 ) {
435448 fprintf (stderr, " Failed to initialize global mutex: %s\n " , strerror (errno));
@@ -439,6 +452,16 @@ __attribute__((constructor)) static void cputrace_init() {
439452
440453__attribute__ ((destructor)) static void cputrace_fini() {
441454 for (int i = 0 ; i < CPUTRACE_MAX_ANCHORS; ++i) {
455+ cputrace_anchor& anchor = g_profiler.anchors [i];
456+ pthread_mutex_lock (&anchor.lock );
457+ for (int j = 0 ; j < CPUTRACE_MAX_THREADS; ++j) {
458+ if (anchor.active_contexts [j] != nullptr ) {
459+ HW_clean (&anchor.per_thread_ctx [j]);
460+ anchor.active_contexts [j] = nullptr ;
461+ anchor.is_capturing [j] = false ;
462+ }
463+ }
464+ pthread_mutex_unlock (&anchor.lock );
442465 if (pthread_mutex_destroy (&g_profiler.anchors [i].lock ) != 0 ) {
443466 fprintf (stderr, " Failed to destroy mutex for anchor %d: %s\n " , i, strerror (errno));
444467 }
0 commit comments