@@ -24,44 +24,47 @@ extern "C" {
2424
2525static uint64_t t0 ;
2626
27- JL_DLLEXPORT _Atomic(uint64_t ) jl_timing_disable_mask [(JL_TIMING_LAST + sizeof (uint64_t ) * CHAR_BIT - 1 ) / (sizeof (uint64_t ) * CHAR_BIT )];
27+ JL_DLLEXPORT _Atomic(uint64_t ) jl_timing_disable_mask [(JL_TIMING_SUBSYSTEM_LAST + sizeof (uint64_t ) * CHAR_BIT - 1 ) / (sizeof (uint64_t ) * CHAR_BIT )];
2828
29- JL_DLLEXPORT _Atomic(uint64_t ) jl_timing_self_counts [(int )JL_TIMING_EVENT_LAST ];
30- JL_DLLEXPORT _Atomic(uint64_t ) jl_timing_full_counts [(int )JL_TIMING_EVENT_LAST ];
29+ static arraylist_t jl_timing_counts_events ;
3130
3231// Used to as an item limit when several strings of metadata can
3332// potentially be associated with a single timing zone.
3433JL_DLLEXPORT uint32_t jl_timing_print_limit = 10 ;
3534
36- static const char * jl_timing_names [(int )JL_TIMING_EVENT_LAST ] =
35+ const char * jl_timing_subsystems [(int )JL_TIMING_SUBSYSTEM_LAST ] =
3736 {
3837#define X (name ) #name ,
39- JL_TIMING_EVENTS
38+ JL_TIMING_SUBSYSTEMS
4039#undef X
4140 };
4241
43- static int jl_timing_names_sorted [(int )JL_TIMING_EVENT_LAST ];
44-
4542JL_DLLEXPORT jl_timing_counter_t jl_timing_counters [JL_TIMING_COUNTER_LAST ];
4643
4744void jl_print_timings (void )
4845{
4946#ifdef USE_TIMING_COUNTS
5047 uint64_t total_time = cycleclock () - t0 ;
5148 uint64_t root_time = total_time ;
52- for (int i = 0 ; i < JL_TIMING_EVENT_LAST ; i ++ ) {
53- root_time -= jl_atomic_load_relaxed (jl_timing_self_counts + i );
49+ for (int i = 0 ; i < jl_timing_counts_events .len ; i ++ ) {
50+ jl_timing_counts_event_t * other_event = (jl_timing_counts_event_t * )jl_timing_counts_events .items [i ];
51+ root_time -= jl_atomic_load_relaxed (& other_event -> self );
5452 }
55- jl_atomic_store_relaxed (jl_timing_self_counts + JL_TIMING_ROOT , root_time );
56- jl_atomic_store_relaxed (jl_timing_full_counts + JL_TIMING_ROOT , total_time );
53+ jl_timing_counts_event_t * root_event = (jl_timing_counts_event_t * )jl_timing_counts_events .items [0 ];
54+ jl_atomic_store_relaxed (& root_event -> self , root_time );
55+ jl_atomic_store_relaxed (& root_event -> total , total_time );
56+
5757 fprintf (stderr , "\nJULIA TIMINGS\n" );
5858 fprintf (stderr , "%-25s, %-30s, %-30s\n" , "Event" , "Self Cycles (% of Total)" , "Total Cycles (% of Total)" );
59- for (int i = 0 ; i < JL_TIMING_EVENT_LAST ; i ++ ) {
60- int j = jl_timing_names_sorted [i ];
61- uint64_t self = jl_atomic_load_relaxed (jl_timing_self_counts + j );
62- uint64_t total = jl_atomic_load_relaxed (jl_timing_full_counts + j );
59+ for (int i = 0 ; i < jl_timing_counts_events . len ; i ++ ) {
60+ jl_timing_counts_event_t * event = ( jl_timing_counts_event_t * ) jl_timing_counts_events . items [i ];
61+ uint64_t self = jl_atomic_load_relaxed (& event -> self );
62+ uint64_t total = jl_atomic_load_relaxed (& event -> total );
6363 if (total != 0 )
64- fprintf (stderr , "%-25s, %20" PRIu64 " (%5.2f %%), %20" PRIu64 " (%5.2f %%)\n" , jl_timing_names [j ], self , 100 * (((double )self ) / total_time ), total , 100 * (((double )total ) / total_time ));
64+ fprintf (stderr , "%-25s, %20" PRIu64 " (%5.2f %%), %20" PRIu64 " (%5.2f %%)\n" ,
65+ event -> name ,
66+ self , 100 * (((double )self ) / total_time ),
67+ total , 100 * (((double )total ) / total_time ));
6568 }
6669
6770 fprintf (stderr , "\nJULIA COUNTERS\n" );
@@ -77,22 +80,32 @@ void jl_print_timings(void)
7780#endif
7881}
7982
80- int cmp_names (const void * a , const void * b ) {
81- int ia = * (const int * )a ;
82- int ib = * (const int * )b ;
83- return strcmp (jl_timing_names [ia ], jl_timing_names [ib ]);
83+ static const int indirect_strcmp (const void * a , const void * b ) {
84+ return strcmp (* (const char * * )a , * (const char * * )b );
8485}
8586
8687void jl_init_timing (void )
8788{
8889 t0 = cycleclock ();
8990
90- _Static_assert (( int ) JL_TIMING_LAST <= ( int ) JL_TIMING_EVENT_LAST , "More owners than events !" );
91+ _Static_assert (JL_TIMING_SUBSYSTEM_LAST < sizeof ( uint64_t ) * CHAR_BIT , "Too many timing subsystems !" );
9192
92- for (int i = 0 ; i < JL_TIMING_EVENT_LAST ; i ++ ) {
93- jl_timing_names_sorted [i ] = i ;
94- }
95- qsort (jl_timing_names_sorted , JL_TIMING_EVENT_LAST , sizeof (int ), cmp_names );
93+ #ifdef USE_TIMING_COUNTS
94+ // Create events list for counts backend
95+ arraylist_new (& jl_timing_counts_events , 1 );
96+
97+ jl_timing_counts_event_t * new_event = (jl_timing_counts_event_t * )malloc (sizeof (jl_timing_counts_event_t ));
98+ arraylist_push (& jl_timing_counts_events , (void * )new_event );
99+
100+ new_event -> name = "ROOT" ;
101+ jl_atomic_store_relaxed (& new_event -> total , 0 );
102+ #endif
103+
104+ // Sort the subsystem names for quick enable/disable lookups
105+ qsort (
106+ jl_timing_subsystems , JL_TIMING_SUBSYSTEM_LAST ,
107+ sizeof (const char * ), indirect_strcmp
108+ );
96109
97110 int i __attribute__((unused )) = 0 ;
98111#ifdef USE_ITTAPI
@@ -142,18 +155,157 @@ void jl_destroy_timing(void)
142155 jl_ptls_t ptls = jl_current_task -> ptls ;
143156 jl_timing_block_t * stack = ptls -> timing_stack ;
144157 while (stack ) {
145- _jl_timing_block_destroy (stack );
158+ jl_timing_block_end (stack );
146159 stack = stack -> prev ;
147160 }
148161}
149162
150- jl_timing_block_t * jl_pop_timing_block (jl_timing_block_t * cur_block )
163+ static const int get_timing_subsystem (const char * subsystem ) {
164+ const char * * match = (const char * * )bsearch (
165+ & subsystem , jl_timing_subsystems , JL_TIMING_SUBSYSTEM_LAST ,
166+ sizeof (const char * ), indirect_strcmp
167+ );
168+ if (!match )
169+ return JL_TIMING_SUBSYSTEM_LAST ;
170+
171+ return (int )(match - & jl_timing_subsystems [0 ]);
172+ }
173+
174+ #ifdef USE_TIMING_COUNTS
175+
176+ // This function is analogous to __itt_event_create but for the counts backend
177+ //
178+ // `event` is required to live forever
179+ static jl_timing_counts_event_t * _jl_timing_counts_event_create (const char * event ) {
180+ const size_t n = jl_timing_counts_events .len ;
181+ for (size_t i = 0 ; i < n ; i ++ ) {
182+ jl_timing_counts_event_t * other_event = (jl_timing_counts_event_t * )jl_timing_counts_events .items [i ];
183+ if (strcmp (event , other_event -> name ) == 0 )
184+ return other_event ;
185+ }
186+
187+ // No matching event found - create a new one
188+ jl_timing_counts_event_t * new_event = (jl_timing_counts_event_t * )malloc (sizeof (jl_timing_counts_event_t ));
189+ arraylist_push (& jl_timing_counts_events , (void * )new_event );
190+ new_event -> name = event ;
191+ jl_atomic_store_relaxed (& new_event -> total , 0 );
192+ return new_event ;
193+ }
194+
195+ STATIC_INLINE void _jl_timing_counts_pause (jl_timing_counts_t * block , uint64_t t ) JL_NOTSAFEPOINT {
196+ #ifdef JL_DEBUG_BUILD
197+ assert (block -> running );
198+ block -> running = 0 ;
199+ #endif
200+ block -> total += t - block -> start ;
201+ }
202+
203+ STATIC_INLINE void _jl_timing_counts_resume (jl_timing_counts_t * block , uint64_t t ) JL_NOTSAFEPOINT {
204+ #ifdef JL_DEBUG_BUILD
205+ assert (!block -> running );
206+ block -> running = 1 ;
207+ #endif
208+ block -> start = t ;
209+ }
210+
211+ STATIC_INLINE void _jl_timing_counts_start (jl_timing_counts_t * block , uint64_t t ) JL_NOTSAFEPOINT {
212+ block -> total = 0 ;
213+ block -> start = t ;
214+ block -> t0 = t ;
215+ #ifdef JL_DEBUG_BUILD
216+ block -> running = 1 ;
217+ #endif
218+ }
219+
220+ STATIC_INLINE void _jl_timing_counts_stop (jl_timing_block_t * block , uint64_t t ) JL_NOTSAFEPOINT {
221+ #ifdef JL_DEBUG_BUILD
222+ assert (block -> counts_ctx .running );
223+ block -> counts_ctx .running = 0 ;
224+ #endif
225+ jl_timing_counts_event_t * event = block -> event -> counts_event ;
226+ block -> counts_ctx .total += t - block -> counts_ctx .start ;
227+ jl_atomic_fetch_add_relaxed (& event -> self , block -> counts_ctx .total );
228+ jl_atomic_fetch_add_relaxed (& event -> total , t - block -> counts_ctx .t0 );
229+ }
230+
231+ #endif // USE_TIMING_COUNTS
232+
233+ JL_DLLEXPORT jl_timing_event_t * _jl_timing_event_create (const char * subsystem , const char * name , const char * function , const char * file , int line , int color ) {
234+ int maybe_subsystem = get_timing_subsystem (subsystem );
235+ if (maybe_subsystem >= JL_TIMING_SUBSYSTEM_LAST ) {
236+ jl_errorf ("invalid timing subsystem name: %s" , subsystem );
237+ return NULL ;
238+ }
239+
240+ jl_timing_event_t * event = (jl_timing_event_t * ) malloc (sizeof (jl_timing_event_t ));
241+ event -> subsystem = maybe_subsystem ;
242+
243+ #ifdef USE_TIMING_COUNTS
244+ event -> counts_event = _jl_timing_counts_event_create (name );
245+ #endif // USE_TIMING_COUNTS
246+
247+ #ifdef USE_TRACY
248+ event -> tracy_srcloc .name = name ;
249+ event -> tracy_srcloc .function = function ;
250+ event -> tracy_srcloc .file = file ;
251+ event -> tracy_srcloc .line = line ;
252+ event -> tracy_srcloc .color = color ;
253+ #endif // USE_TRACY
254+
255+ #ifdef USE_ITTAPI
256+ event -> ittapi_event = __itt_event_create (name , strlen (name ));
257+ #endif // USE_ITTAPI
258+
259+ return event ;
260+ }
261+
262+ JL_DLLEXPORT void _jl_timing_block_init (jl_timing_block_t * block , jl_timing_event_t * event ) {
263+ memset (block , 0 , sizeof (* block ));
264+ block -> event = event ;
265+ }
266+
267+ JL_DLLEXPORT void _jl_timing_block_start (jl_timing_block_t * block ) {
268+ assert (!block -> is_running );
269+ if (!_jl_timing_enabled (block -> event -> subsystem )) return ;
270+
271+ uint64_t t = cycleclock (); (void )t ;
272+ _COUNTS_START (& block -> counts_ctx , t );
273+ _ITTAPI_START (block );
274+ _TRACY_START (block );
275+
276+ jl_timing_block_t * * prevp = & jl_current_task -> ptls -> timing_stack ;
277+ block -> prev = * prevp ;
278+ block -> is_running = 1 ;
279+ if (block -> prev ) {
280+ _COUNTS_PAUSE (& block -> prev -> counts_ctx , t );
281+ }
282+ * prevp = block ;
283+ }
284+
285+ JL_DLLEXPORT void _jl_timing_block_end (jl_timing_block_t * block ) {
286+ if (block -> is_running ) {
287+ uint64_t t = cycleclock (); (void )t ;
288+ _ITTAPI_STOP (block );
289+ _TRACY_STOP (block -> tracy_ctx );
290+ _COUNTS_STOP (block , t );
291+
292+ jl_task_t * ct = jl_current_task ;
293+ jl_timing_block_t * * pcur = & ct -> ptls -> timing_stack ;
294+ assert (* pcur == block );
295+ * pcur = block -> prev ;
296+ if (block -> prev ) {
297+ _COUNTS_RESUME (& block -> prev -> counts_ctx , t );
298+ }
299+ }
300+ }
301+
302+ jl_timing_block_t * jl_timing_block_pop (jl_timing_block_t * cur_block )
151303{
152- _jl_timing_block_destroy (cur_block );
304+ jl_timing_block_end (cur_block );
153305 return cur_block -> prev ;
154306}
155307
156- void jl_timing_block_enter_task (jl_task_t * ct , jl_ptls_t ptls , jl_timing_block_t * prev_blk )
308+ void jl_timing_block_task_enter (jl_task_t * ct , jl_ptls_t ptls , jl_timing_block_t * prev_blk )
157309{
158310 if (prev_blk != NULL ) {
159311 assert (ptls -> timing_stack == NULL );
@@ -171,7 +323,7 @@ void jl_timing_block_enter_task(jl_task_t *ct, jl_ptls_t ptls, jl_timing_block_t
171323#endif
172324}
173325
174- jl_timing_block_t * jl_timing_block_exit_task (jl_task_t * ct , jl_ptls_t ptls )
326+ jl_timing_block_t * jl_timing_block_task_exit (jl_task_t * ct , jl_ptls_t ptls )
175327{
176328#ifdef USE_TRACY
177329 // Tracy is fairly strict about not leaving a fiber that hasn't
@@ -323,7 +475,7 @@ JL_DLLEXPORT void jl_timing_puts(jl_timing_block_t *cur_block, const char *str)
323475#endif
324476}
325477
326- void jl_timing_init_task (jl_task_t * t )
478+ void jl_timing_task_init (jl_task_t * t )
327479{
328480#ifdef USE_TRACY
329481 jl_value_t * start_type = jl_typeof (t -> start );
@@ -359,20 +511,12 @@ void jl_timing_init_task(jl_task_t *t)
359511#endif
360512}
361513
362- int cmp_name_idx (const void * name , const void * idx ) {
363- return strcmp ((const char * )name , jl_timing_names [* (const int * )idx ]);
364- }
365-
366514JL_DLLEXPORT int jl_timing_set_enable (const char * subsystem , uint8_t enabled )
367515{
368- const int * idx = (const int * )bsearch (subsystem , jl_timing_names_sorted , JL_TIMING_EVENT_LAST , sizeof (int ), cmp_name_idx );
369- if (idx == NULL )
370- return -1 ;
371- int i = * idx ;
372- // sorted names include events, so skip if we're looking at an event instead of a subsystem
373- // events are always at least JL_TIMING_LAST
374- if (i >= JL_TIMING_LAST )
516+ int i = get_timing_subsystem (subsystem );
517+ if (i >= JL_TIMING_SUBSYSTEM_LAST )
375518 return -1 ;
519+
376520 uint64_t subsystem_bit = 1ul << (i % (sizeof (uint64_t ) * CHAR_BIT ));
377521 if (enabled ) {
378522 jl_atomic_fetch_and_relaxed (jl_timing_disable_mask + (i / (sizeof (uint64_t ) * CHAR_BIT )), ~subsystem_bit );
@@ -445,6 +589,13 @@ void jl_timing_apply_env(void)
445589
446590void jl_init_timing (void ) { }
447591void jl_destroy_timing (void ) { }
592+
593+ JL_DLLEXPORT jl_timing_event_t * _jl_timing_event_create (const char * subsystem , const char * name , const char * function , const char * file , int line , int color ) { return NULL ; }
594+
595+ JL_DLLEXPORT void _jl_timing_block_init (jl_timing_block_t * block , jl_timing_event_t * event ) { }
596+ JL_DLLEXPORT void _jl_timing_block_start (jl_timing_block_t * block ) { }
597+ JL_DLLEXPORT void _jl_timing_block_end (jl_timing_block_t * block ) { }
598+
448599JL_DLLEXPORT int jl_timing_set_enable (const char * subsystem , uint8_t enabled ) { return -1 ; }
449600JL_DLLEXPORT uint32_t jl_timing_print_limit = 0 ;
450601
0 commit comments