Skip to content

Commit cb0537f

Browse files
committed
timing: Allocate all timing events dynamically
This paves the way for a Julia-side API that can create new events on-the-fly without having to modify timing.h The core of the change is to introduce two different structs: - An "event" stores all of the statically-determined attributes of a profiler event (typically, zone name and source location info) - A "timing block" stores the dynamic information relevant to a particular span/measurement in the timing run Events and timing blocks have a one-to-many relationship. The intended pattern for a Julia-side API is to construct an event once at parse-time using `jl_timing_event_create` (since this is relatively expensive due to profiler traffic and allocations) and then to create its own timing block on-the-fly for each block entry/exit. This also re-factors the API a bit to hopefully be more consistently named
1 parent f61bbfb commit cb0537f

File tree

5 files changed

+297
-216
lines changed

5 files changed

+297
-216
lines changed

src/julia.expmap

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
_Z24jl_coverage_data_pointerN4llvm9StringRefEi;
3131
_Z22jl_coverage_alloc_lineN4llvm9StringRefEi;
3232
_Z22jl_malloc_data_pointerN4llvm9StringRefEi;
33+
_jl_timing_*;
3334
LLVMExtra*;
3435
JLJIT*;
3536
llvmGetPassPluginInfo;

src/julia.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1982,6 +1982,7 @@ JL_DLLEXPORT void jl_sigatomic_end(void);
19821982
// tasks and exceptions -------------------------------------------------------
19831983

19841984
typedef struct _jl_timing_block_t jl_timing_block_t;
1985+
typedef struct _jl_timing_event_t jl_timing_event_t;
19851986
typedef struct _jl_excstack_t jl_excstack_t;
19861987

19871988
// info describing an exception handler

src/task.c

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -646,7 +646,7 @@ JL_DLLEXPORT void jl_switch(void) JL_NOTSAFEPOINT_LEAVE JL_NOTSAFEPOINT_ENTER
646646
int finalizers_inhibited = ptls->finalizers_inhibited;
647647
ptls->finalizers_inhibited = 0;
648648

649-
jl_timing_block_t *blk = jl_timing_block_exit_task(ct, ptls);
649+
jl_timing_block_t *blk = jl_timing_block_task_exit(ct, ptls);
650650
ctx_switch(ct);
651651

652652
#ifdef MIGRATE_TASKS
@@ -666,7 +666,7 @@ JL_DLLEXPORT void jl_switch(void) JL_NOTSAFEPOINT_LEAVE JL_NOTSAFEPOINT_ENTER
666666
0 != ct->ptls &&
667667
0 == ptls->finalizers_inhibited);
668668
ptls->finalizers_inhibited = finalizers_inhibited;
669-
jl_timing_block_enter_task(ct, ptls, blk); (void)blk;
669+
jl_timing_block_task_enter(ct, ptls, blk); (void)blk;
670670

671671
sig_atomic_t other_defer_signal = ptls->defer_signal;
672672
ptls->defer_signal = defer_signal;
@@ -705,7 +705,7 @@ JL_DLLEXPORT JL_NORETURN void jl_no_exc_handler(jl_value_t *e, jl_task_t *ct)
705705
#define pop_timings_stack() \
706706
jl_timing_block_t *cur_block = ptls->timing_stack; \
707707
while (cur_block && eh->timing_stack != cur_block) { \
708-
cur_block = jl_pop_timing_block(cur_block); \
708+
cur_block = jl_timing_block_pop(cur_block); \
709709
} \
710710
assert(cur_block == eh->timing_stack);
711711
#else
@@ -1084,7 +1084,7 @@ JL_DLLEXPORT jl_task_t *jl_new_task(jl_function_t *start, jl_value_t *completion
10841084
t->ptls = NULL;
10851085
t->world_age = ct->world_age;
10861086
t->reentrant_timing = 0;
1087-
jl_timing_init_task(t);
1087+
jl_timing_task_init(t);
10881088

10891089
#ifdef COPY_STACKS
10901090
if (!t->copy_stack) {
@@ -1221,7 +1221,7 @@ CFI_NORETURN
12211221

12221222
ct->started = 1;
12231223
JL_PROBE_RT_START_TASK(ct);
1224-
jl_timing_block_enter_task(ct, ptls, NULL);
1224+
jl_timing_block_task_enter(ct, ptls, NULL);
12251225
if (jl_atomic_load_relaxed(&ct->_isexception)) {
12261226
record_backtrace(ptls, 0);
12271227
jl_push_excstack(&ct->excstack, ct->result,
@@ -1693,7 +1693,7 @@ jl_task_t *jl_init_root_task(jl_ptls_t ptls, void *stack_lo, void *stack_hi)
16931693
ct->ctx.asan_fake_stack = NULL;
16941694
#endif
16951695

1696-
jl_timing_block_enter_task(ct, ptls, NULL);
1696+
jl_timing_block_task_enter(ct, ptls, NULL);
16971697

16981698
#ifdef COPY_STACKS
16991699
// initialize the base_ctx from which all future copy_stacks will be copies

src/timing.c

Lines changed: 193 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -24,44 +24,47 @@ extern "C" {
2424

2525
static uint64_t t0;
2626

27-
JL_DLLEXPORT _Atomic(uint64_t) jl_timing_disable_mask[(JL_TIMING_LAST + sizeof(uint64_t) * CHAR_BIT - 1) / (sizeof(uint64_t) * CHAR_BIT)];
27+
JL_DLLEXPORT _Atomic(uint64_t) jl_timing_disable_mask[(JL_TIMING_SUBSYSTEM_LAST + sizeof(uint64_t) * CHAR_BIT - 1) / (sizeof(uint64_t) * CHAR_BIT)];
2828

29-
JL_DLLEXPORT _Atomic(uint64_t) jl_timing_self_counts[(int)JL_TIMING_EVENT_LAST];
30-
JL_DLLEXPORT _Atomic(uint64_t) jl_timing_full_counts[(int)JL_TIMING_EVENT_LAST];
29+
static arraylist_t jl_timing_counts_events;
3130

3231
// Used to as an item limit when several strings of metadata can
3332
// potentially be associated with a single timing zone.
3433
JL_DLLEXPORT uint32_t jl_timing_print_limit = 10;
3534

36-
static const char *jl_timing_names[(int)JL_TIMING_EVENT_LAST] =
35+
const char *jl_timing_subsystems[(int)JL_TIMING_SUBSYSTEM_LAST] =
3736
{
3837
#define X(name) #name,
39-
JL_TIMING_EVENTS
38+
JL_TIMING_SUBSYSTEMS
4039
#undef X
4140
};
4241

43-
static int jl_timing_names_sorted[(int)JL_TIMING_EVENT_LAST];
44-
4542
JL_DLLEXPORT jl_timing_counter_t jl_timing_counters[JL_TIMING_COUNTER_LAST];
4643

4744
void jl_print_timings(void)
4845
{
4946
#ifdef USE_TIMING_COUNTS
5047
uint64_t total_time = cycleclock() - t0;
5148
uint64_t root_time = total_time;
52-
for (int i = 0; i < JL_TIMING_EVENT_LAST; i++) {
53-
root_time -= jl_atomic_load_relaxed(jl_timing_self_counts + i);
49+
for (int i = 0; i < jl_timing_counts_events.len; i++) {
50+
jl_timing_counts_event_t *other_event = (jl_timing_counts_event_t *)jl_timing_counts_events.items[i];
51+
root_time -= jl_atomic_load_relaxed(&other_event->self);
5452
}
55-
jl_atomic_store_relaxed(jl_timing_self_counts + JL_TIMING_ROOT, root_time);
56-
jl_atomic_store_relaxed(jl_timing_full_counts + JL_TIMING_ROOT, total_time);
53+
jl_timing_counts_event_t *root_event = (jl_timing_counts_event_t *)jl_timing_counts_events.items[0];
54+
jl_atomic_store_relaxed(&root_event->self, root_time);
55+
jl_atomic_store_relaxed(&root_event->total, total_time);
56+
5757
fprintf(stderr, "\nJULIA TIMINGS\n");
5858
fprintf(stderr, "%-25s, %-30s, %-30s\n", "Event", "Self Cycles (% of Total)", "Total Cycles (% of Total)");
59-
for (int i = 0; i < JL_TIMING_EVENT_LAST; i++) {
60-
int j = jl_timing_names_sorted[i];
61-
uint64_t self = jl_atomic_load_relaxed(jl_timing_self_counts + j);
62-
uint64_t total = jl_atomic_load_relaxed(jl_timing_full_counts + j);
59+
for (int i = 0; i < jl_timing_counts_events.len; i++) {
60+
jl_timing_counts_event_t *event = (jl_timing_counts_event_t *)jl_timing_counts_events.items[i];
61+
uint64_t self = jl_atomic_load_relaxed(&event->self);
62+
uint64_t total = jl_atomic_load_relaxed(&event->total);
6363
if (total != 0)
64-
fprintf(stderr, "%-25s, %20" PRIu64 " (%5.2f %%), %20" PRIu64 " (%5.2f %%)\n", jl_timing_names[j], self, 100 * (((double)self) / total_time), total, 100 * (((double)total) / total_time));
64+
fprintf(stderr, "%-25s, %20" PRIu64 " (%5.2f %%), %20" PRIu64 " (%5.2f %%)\n",
65+
event->name,
66+
self, 100 * (((double)self) / total_time),
67+
total, 100 * (((double)total) / total_time));
6568
}
6669

6770
fprintf(stderr, "\nJULIA COUNTERS\n");
@@ -77,22 +80,32 @@ void jl_print_timings(void)
7780
#endif
7881
}
7982

80-
int cmp_names(const void *a, const void *b) {
81-
int ia = *(const int*)a;
82-
int ib = *(const int*)b;
83-
return strcmp(jl_timing_names[ia], jl_timing_names[ib]);
83+
static const int indirect_strcmp(const void *a, const void *b) {
84+
return strcmp(*(const char **)a, *(const char **)b);
8485
}
8586

8687
void jl_init_timing(void)
8788
{
8889
t0 = cycleclock();
8990

90-
_Static_assert((int)JL_TIMING_LAST <= (int)JL_TIMING_EVENT_LAST, "More owners than events!");
91+
_Static_assert(JL_TIMING_SUBSYSTEM_LAST < sizeof(uint64_t) * CHAR_BIT, "Too many timing subsystems!");
9192

92-
for (int i = 0; i < JL_TIMING_EVENT_LAST; i++) {
93-
jl_timing_names_sorted[i] = i;
94-
}
95-
qsort(jl_timing_names_sorted, JL_TIMING_EVENT_LAST, sizeof(int), cmp_names);
93+
#ifdef USE_TIMING_COUNTS
94+
// Create events list for counts backend
95+
arraylist_new(&jl_timing_counts_events, 1);
96+
97+
jl_timing_counts_event_t *new_event = (jl_timing_counts_event_t *)malloc(sizeof(jl_timing_counts_event_t));
98+
arraylist_push(&jl_timing_counts_events, (void *)new_event);
99+
100+
new_event->name = "ROOT";
101+
jl_atomic_store_relaxed(&new_event->total, 0);
102+
#endif
103+
104+
// Sort the subsystem names for quick enable/disable lookups
105+
qsort(
106+
jl_timing_subsystems, JL_TIMING_SUBSYSTEM_LAST,
107+
sizeof(const char *), indirect_strcmp
108+
);
96109

97110
int i __attribute__((unused)) = 0;
98111
#ifdef USE_ITTAPI
@@ -142,18 +155,157 @@ void jl_destroy_timing(void)
142155
jl_ptls_t ptls = jl_current_task->ptls;
143156
jl_timing_block_t *stack = ptls->timing_stack;
144157
while (stack) {
145-
_jl_timing_block_destroy(stack);
158+
jl_timing_block_end(stack);
146159
stack = stack->prev;
147160
}
148161
}
149162

150-
jl_timing_block_t *jl_pop_timing_block(jl_timing_block_t *cur_block)
163+
static const int get_timing_subsystem(const char *subsystem) {
164+
const char **match = (const char **)bsearch(
165+
&subsystem, jl_timing_subsystems, JL_TIMING_SUBSYSTEM_LAST,
166+
sizeof(const char *), indirect_strcmp
167+
);
168+
if (!match)
169+
return JL_TIMING_SUBSYSTEM_LAST;
170+
171+
return (int)(match - &jl_timing_subsystems[0]);
172+
}
173+
174+
#ifdef USE_TIMING_COUNTS
175+
176+
// This function is analogous to __itt_event_create but for the counts backend
177+
//
178+
// `event` is required to live forever
179+
static jl_timing_counts_event_t *_jl_timing_counts_event_create(const char *event) {
180+
const size_t n = jl_timing_counts_events.len;
181+
for (size_t i = 0; i < n; i++) {
182+
jl_timing_counts_event_t *other_event = (jl_timing_counts_event_t *)jl_timing_counts_events.items[i];
183+
if (strcmp(event, other_event->name) == 0)
184+
return other_event;
185+
}
186+
187+
// No matching event found - create a new one
188+
jl_timing_counts_event_t *new_event = (jl_timing_counts_event_t *)malloc(sizeof(jl_timing_counts_event_t));
189+
arraylist_push(&jl_timing_counts_events, (void *)new_event);
190+
new_event->name = event;
191+
jl_atomic_store_relaxed(&new_event->total, 0);
192+
return new_event;
193+
}
194+
195+
STATIC_INLINE void _jl_timing_counts_pause(jl_timing_counts_t *block, uint64_t t) JL_NOTSAFEPOINT {
196+
#ifdef JL_DEBUG_BUILD
197+
assert(block->running);
198+
block->running = 0;
199+
#endif
200+
block->total += t - block->start;
201+
}
202+
203+
STATIC_INLINE void _jl_timing_counts_resume(jl_timing_counts_t *block, uint64_t t) JL_NOTSAFEPOINT {
204+
#ifdef JL_DEBUG_BUILD
205+
assert(!block->running);
206+
block->running = 1;
207+
#endif
208+
block->start = t;
209+
}
210+
211+
STATIC_INLINE void _jl_timing_counts_start(jl_timing_counts_t *block, uint64_t t) JL_NOTSAFEPOINT {
212+
block->total = 0;
213+
block->start = t;
214+
block->t0 = t;
215+
#ifdef JL_DEBUG_BUILD
216+
block->running = 1;
217+
#endif
218+
}
219+
220+
STATIC_INLINE void _jl_timing_counts_stop(jl_timing_block_t *block, uint64_t t) JL_NOTSAFEPOINT {
221+
#ifdef JL_DEBUG_BUILD
222+
assert(block->counts_ctx.running);
223+
block->counts_ctx.running = 0;
224+
#endif
225+
jl_timing_counts_event_t *event = block->event->counts_event;
226+
block->counts_ctx.total += t - block->counts_ctx.start;
227+
jl_atomic_fetch_add_relaxed(&event->self, block->counts_ctx.total);
228+
jl_atomic_fetch_add_relaxed(&event->total, t - block->counts_ctx.t0);
229+
}
230+
231+
#endif // USE_TIMING_COUNTS
232+
233+
JL_DLLEXPORT jl_timing_event_t *_jl_timing_event_create(const char *subsystem, const char *name, const char *function, const char *file, int line, int color) {
234+
int maybe_subsystem = get_timing_subsystem(subsystem);
235+
if (maybe_subsystem >= JL_TIMING_SUBSYSTEM_LAST) {
236+
jl_errorf("invalid timing subsystem name: %s", subsystem);
237+
return NULL;
238+
}
239+
240+
jl_timing_event_t *event = (jl_timing_event_t *) malloc(sizeof(jl_timing_event_t));
241+
event->subsystem = maybe_subsystem;
242+
243+
#ifdef USE_TIMING_COUNTS
244+
event->counts_event = _jl_timing_counts_event_create(name);
245+
#endif // USE_TIMING_COUNTS
246+
247+
#ifdef USE_TRACY
248+
event->tracy_srcloc.name = name;
249+
event->tracy_srcloc.function = function;
250+
event->tracy_srcloc.file = file;
251+
event->tracy_srcloc.line = line;
252+
event->tracy_srcloc.color = color;
253+
#endif // USE_TRACY
254+
255+
#ifdef USE_ITTAPI
256+
event->ittapi_event = __itt_event_create(name, strlen(name));
257+
#endif // USE_ITTAPI
258+
259+
return event;
260+
}
261+
262+
JL_DLLEXPORT void _jl_timing_block_init(jl_timing_block_t *block, jl_timing_event_t *event) {
263+
memset(block, 0, sizeof(*block));
264+
block->event = event;
265+
}
266+
267+
JL_DLLEXPORT void _jl_timing_block_start(jl_timing_block_t *block) {
268+
assert(!block->is_running);
269+
if (!_jl_timing_enabled(block->event->subsystem)) return;
270+
271+
uint64_t t = cycleclock(); (void)t;
272+
_COUNTS_START(&block->counts_ctx, t);
273+
_ITTAPI_START(block);
274+
_TRACY_START(block);
275+
276+
jl_timing_block_t **prevp = &jl_current_task->ptls->timing_stack;
277+
block->prev = *prevp;
278+
block->is_running = 1;
279+
if (block->prev) {
280+
_COUNTS_PAUSE(&block->prev->counts_ctx, t);
281+
}
282+
*prevp = block;
283+
}
284+
285+
JL_DLLEXPORT void _jl_timing_block_end(jl_timing_block_t *block) {
286+
if (block->is_running) {
287+
uint64_t t = cycleclock(); (void)t;
288+
_ITTAPI_STOP(block);
289+
_TRACY_STOP(block->tracy_ctx);
290+
_COUNTS_STOP(block, t);
291+
292+
jl_task_t *ct = jl_current_task;
293+
jl_timing_block_t **pcur = &ct->ptls->timing_stack;
294+
assert(*pcur == block);
295+
*pcur = block->prev;
296+
if (block->prev) {
297+
_COUNTS_RESUME(&block->prev->counts_ctx, t);
298+
}
299+
}
300+
}
301+
302+
jl_timing_block_t *jl_timing_block_pop(jl_timing_block_t *cur_block)
151303
{
152-
_jl_timing_block_destroy(cur_block);
304+
jl_timing_block_end(cur_block);
153305
return cur_block->prev;
154306
}
155307

156-
void jl_timing_block_enter_task(jl_task_t *ct, jl_ptls_t ptls, jl_timing_block_t *prev_blk)
308+
void jl_timing_block_task_enter(jl_task_t *ct, jl_ptls_t ptls, jl_timing_block_t *prev_blk)
157309
{
158310
if (prev_blk != NULL) {
159311
assert(ptls->timing_stack == NULL);
@@ -171,7 +323,7 @@ void jl_timing_block_enter_task(jl_task_t *ct, jl_ptls_t ptls, jl_timing_block_t
171323
#endif
172324
}
173325

174-
jl_timing_block_t *jl_timing_block_exit_task(jl_task_t *ct, jl_ptls_t ptls)
326+
jl_timing_block_t *jl_timing_block_task_exit(jl_task_t *ct, jl_ptls_t ptls)
175327
{
176328
#ifdef USE_TRACY
177329
// Tracy is fairly strict about not leaving a fiber that hasn't
@@ -323,7 +475,7 @@ JL_DLLEXPORT void jl_timing_puts(jl_timing_block_t *cur_block, const char *str)
323475
#endif
324476
}
325477

326-
void jl_timing_init_task(jl_task_t *t)
478+
void jl_timing_task_init(jl_task_t *t)
327479
{
328480
#ifdef USE_TRACY
329481
jl_value_t *start_type = jl_typeof(t->start);
@@ -359,20 +511,12 @@ void jl_timing_init_task(jl_task_t *t)
359511
#endif
360512
}
361513

362-
int cmp_name_idx(const void *name, const void *idx) {
363-
return strcmp((const char *)name, jl_timing_names[*(const int *)idx]);
364-
}
365-
366514
JL_DLLEXPORT int jl_timing_set_enable(const char *subsystem, uint8_t enabled)
367515
{
368-
const int *idx = (const int *)bsearch(subsystem, jl_timing_names_sorted, JL_TIMING_EVENT_LAST, sizeof(int), cmp_name_idx);
369-
if (idx == NULL)
370-
return -1;
371-
int i = *idx;
372-
// sorted names include events, so skip if we're looking at an event instead of a subsystem
373-
// events are always at least JL_TIMING_LAST
374-
if (i >= JL_TIMING_LAST)
516+
int i = get_timing_subsystem(subsystem);
517+
if (i >= JL_TIMING_SUBSYSTEM_LAST)
375518
return -1;
519+
376520
uint64_t subsystem_bit = 1ul << (i % (sizeof(uint64_t) * CHAR_BIT));
377521
if (enabled) {
378522
jl_atomic_fetch_and_relaxed(jl_timing_disable_mask + (i / (sizeof(uint64_t) * CHAR_BIT)), ~subsystem_bit);
@@ -445,6 +589,13 @@ void jl_timing_apply_env(void)
445589

446590
void jl_init_timing(void) { }
447591
void jl_destroy_timing(void) { }
592+
593+
JL_DLLEXPORT jl_timing_event_t *_jl_timing_event_create(const char *subsystem, const char *name, const char *function, const char *file, int line, int color) { return NULL; }
594+
595+
JL_DLLEXPORT void _jl_timing_block_init(jl_timing_block_t *block, jl_timing_event_t *event) { }
596+
JL_DLLEXPORT void _jl_timing_block_start(jl_timing_block_t *block) { }
597+
JL_DLLEXPORT void _jl_timing_block_end(jl_timing_block_t *block) { }
598+
448599
JL_DLLEXPORT int jl_timing_set_enable(const char *subsystem, uint8_t enabled) { return -1; }
449600
JL_DLLEXPORT uint32_t jl_timing_print_limit = 0;
450601

0 commit comments

Comments
 (0)