@@ -45,11 +45,24 @@ flame_cmd_args( int * pargc,
45
45
char * * * pargv ,
46
46
args_t * args ) {
47
47
48
- if ( FD_UNLIKELY ( !* pargc ) ) FD_LOG_ERR (( "usage: flame [all|tile|tile:idx|agave]" ));
48
+ if ( FD_UNLIKELY ( !* pargc ) ) FD_LOG_ERR (( "usage: flame [all|tile|tile:idx|agave] [sample_rate] " ));
49
49
strncpy ( args -> flame .name , * * pargv , sizeof ( args -> flame .name ) - 1 );
50
50
51
51
(* pargc )-- ;
52
52
(* pargv )++ ;
53
+
54
+ args -> flame .sample_rate = 99UL ; /* default 99hz */
55
+
56
+ if ( FD_LIKELY ( * pargc > 0 ) ) { /* optional */
57
+ char * endptr ;
58
+ ulong sample_rate = strtoul ( * * pargv , & endptr , 10 );
59
+ if ( FD_UNLIKELY ( * endptr != '\0' || sample_rate == 0UL || sample_rate > 50000UL ) ) {
60
+ FD_LOG_ERR (( "invalid sample rate `%s` - must be between 1 and 50000 Hz" , * * pargv ));
61
+ }
62
+ args -> flame .sample_rate = sample_rate ;
63
+ (* pargc )-- ;
64
+ (* pargv )++ ;
65
+ }
53
66
}
54
67
55
68
void
@@ -58,7 +71,7 @@ flame_cmd_fn( args_t * args,
58
71
install_parent_signals ();
59
72
60
73
fd_topo_join_workspaces ( & config -> topo , FD_SHMEM_JOIN_MODE_READ_ONLY );
61
- fd_topo_fill ( & config -> topo );
74
+ fd_topo_fill_resilient ( & config -> topo );
62
75
63
76
ulong tile_cnt = 0UL ;
64
77
ulong tile_idxs [ 128UL ];
@@ -97,29 +110,47 @@ flame_cmd_fn( args_t * args,
97
110
}
98
111
99
112
char threads [ 4096 ] = {0 };
113
+ char sample_rate_str [ 64 ] = {0 };
114
+ snprintf ( sample_rate_str , sizeof ( sample_rate_str ), "%lu" , args -> flame .sample_rate ); // TODO: not sure if there is a FD version of this or something similar
100
115
ulong len = 0UL ;
116
+ ulong valid_tiles = 0UL ;
101
117
for ( ulong i = 0UL ; i < tile_cnt ; i ++ ) {
102
- if ( FD_LIKELY ( i != 0UL ) ) {
103
- FD_TEST ( fd_cstr_printf_check ( threads + len , sizeof (threads )- len , NULL , "," ) );
104
- len += 1UL ;
105
- }
118
+ fd_topo_tile_t * tile = & config -> topo .tiles [ tile_idxs [ i ] ];
106
119
107
- ulong tid = fd_metrics_tile ( config -> topo .tiles [ tile_idxs [ i ] ].metrics )[ FD_METRICS_GAUGE_TILE_TID_OFF ];
108
- ulong pid = fd_metrics_tile ( config -> topo .tiles [ tile_idxs [ i ] ].metrics )[ FD_METRICS_GAUGE_TILE_PID_OFF ];
120
+ ulong tid = fd_metrics_tile ( tile -> metrics )[ FD_METRICS_GAUGE_TILE_TID_OFF ];
121
+ ulong pid = fd_metrics_tile ( tile -> metrics )[ FD_METRICS_GAUGE_TILE_PID_OFF ];
122
+
123
+ /* Skip tiles that don't have valid PID/TID */
124
+ if ( FD_UNLIKELY ( !pid || !tid || pid > INT_MAX || tid > INT_MAX ) ) {
125
+ FD_LOG_WARNING (( "skipping tile %s:%lu - invalid PID/TID (pid=%lu, tid=%lu)" , tile -> name , tile -> kind_id , pid , tid ));
126
+ continue ;
127
+ }
109
128
110
- FD_TEST ( pid <=INT_MAX );
111
129
if ( FD_UNLIKELY ( -1 == kill ( (int )pid , 0 ) ) ) {
112
- if ( FD_UNLIKELY ( errno == ESRCH ) ) FD_LOG_ERR (( "tile %s:%lu is not running" , config -> topo .tiles [ i ].name , config -> topo .tiles [ i ].kind_id ));
113
- else FD_LOG_ERR (( "kill() failed (%i-%s)" , errno , fd_io_strerror ( errno ) ));
130
+ if ( FD_UNLIKELY ( errno == ESRCH ) ) {
131
+ FD_LOG_WARNING (( "skipping tile %s:%lu - process not running (pid=%lu)" , tile -> name , tile -> kind_id , pid ));
132
+ continue ;
133
+ } else {
134
+ FD_LOG_ERR (( "kill() failed (%i-%s)" , errno , fd_io_strerror ( errno ) ));
135
+ }
136
+ }
137
+
138
+ if ( FD_LIKELY ( valid_tiles > 0UL ) ) {
139
+ FD_TEST ( fd_cstr_printf_check ( threads + len , sizeof (threads )- len , NULL , "," ) );
140
+ len += 1UL ;
114
141
}
115
142
116
143
ulong arg_len ;
117
144
FD_TEST ( fd_cstr_printf_check ( threads + len , sizeof (threads )- len , & arg_len , "%lu" , fd_ulong_if ( whole_process , pid , tid ) ) );
118
145
len += arg_len ;
146
+ valid_tiles ++ ;
147
+ }
148
+
149
+ if ( FD_UNLIKELY ( !valid_tiles ) ) {
150
+ FD_LOG_ERR (( "No valid running tiles found to profile" ));
119
151
}
120
- FD_TEST ( len < sizeof (threads ) );
121
152
122
- FD_LOG_NOTICE (( "/usr/bin/perf script record flamegraph -F 99 -%c %s && /usr/bin/perf script report flamegraph" , fd_char_if ( whole_process , 'p' , 't' ), threads ));
153
+ FD_LOG_NOTICE (( "/usr/bin/perf script record flamegraph -F %lu -%c %s && /usr/bin/perf script report flamegraph" , args -> flame . sample_rate , fd_char_if ( whole_process , 'p' , 't' ), threads ));
123
154
124
155
record_pid = fork ();
125
156
if ( FD_UNLIKELY ( -1 == record_pid ) ) FD_LOG_ERR (( "fork() failed (%i-%s)" , errno , fd_io_strerror ( errno ) ));
@@ -130,7 +161,7 @@ flame_cmd_fn( args_t * args,
130
161
"record" ,
131
162
"flamegraph" ,
132
163
"-F" ,
133
- "99" ,
164
+ sample_rate_str ,
134
165
whole_process ? "-p" : "-t" ,
135
166
threads ,
136
167
NULL ,
0 commit comments