Skip to content

Commit 67c934e

Browse files
committed
flame: fixed to handle terminated processes
1 parent dd70fa6 commit 67c934e

File tree

4 files changed

+107
-14
lines changed

4 files changed

+107
-14
lines changed

src/app/shared/fd_action.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ union fdctl_args {
6565

6666
struct {
6767
char name[ 13UL ];
68+
ulong sample_rate;
6869
} flame;
6970

7071
struct {

src/app/shared_dev/commands/flame.c

Lines changed: 45 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -45,11 +45,24 @@ flame_cmd_args( int * pargc,
4545
char *** pargv,
4646
args_t * args ) {
4747

48-
if( FD_UNLIKELY( !*pargc ) ) FD_LOG_ERR(( "usage: flame [all|tile|tile:idx|agave]" ));
48+
if( FD_UNLIKELY( !*pargc ) ) FD_LOG_ERR(( "usage: flame [all|tile|tile:idx|agave] [sample_rate]" ));
4949
strncpy( args->flame.name, **pargv, sizeof( args->flame.name ) - 1 );
5050

5151
(*pargc)--;
5252
(*pargv)++;
53+
54+
args->flame.sample_rate = 99UL; /* default 99hz */
55+
56+
if( FD_LIKELY( *pargc > 0 ) ) { /* optional */
57+
char * endptr;
58+
ulong sample_rate = strtoul( **pargv, &endptr, 10 );
59+
if( FD_UNLIKELY( *endptr != '\0' || sample_rate == 0UL || sample_rate > 50000UL ) ) {
60+
FD_LOG_ERR(( "invalid sample rate `%s` - must be between 1 and 50000 Hz", **pargv ));
61+
}
62+
args->flame.sample_rate = sample_rate;
63+
(*pargc)--;
64+
(*pargv)++;
65+
}
5366
}
5467

5568
void
@@ -58,7 +71,7 @@ flame_cmd_fn( args_t * args,
5871
install_parent_signals();
5972

6073
fd_topo_join_workspaces( &config->topo, FD_SHMEM_JOIN_MODE_READ_ONLY );
61-
fd_topo_fill( &config->topo );
74+
fd_topo_fill_resilient( &config->topo );
6275

6376
ulong tile_cnt = 0UL;
6477
ulong tile_idxs[ 128UL ];
@@ -97,29 +110,47 @@ flame_cmd_fn( args_t * args,
97110
}
98111

99112
char threads[ 4096 ] = {0};
113+
char sample_rate_str[ 64 ] = {0};
114+
snprintf( sample_rate_str, sizeof( sample_rate_str ), "%lu", args->flame.sample_rate ); // TODO: not sure if there is a FD version of this or something similar
100115
ulong len = 0UL;
116+
ulong valid_tiles = 0UL;
101117
for( ulong i=0UL; i<tile_cnt; i++ ) {
102-
if( FD_LIKELY( i!=0UL ) ) {
103-
FD_TEST( fd_cstr_printf_check( threads+len, sizeof(threads)-len, NULL, "," ) );
104-
len += 1UL;
105-
}
118+
fd_topo_tile_t * tile = &config->topo.tiles[ tile_idxs[ i ] ];
106119

107-
ulong tid = fd_metrics_tile( config->topo.tiles[ tile_idxs[ i ] ].metrics )[ FD_METRICS_GAUGE_TILE_TID_OFF ];
108-
ulong pid = fd_metrics_tile( config->topo.tiles[ tile_idxs[ i ] ].metrics )[ FD_METRICS_GAUGE_TILE_PID_OFF ];
120+
ulong tid = fd_metrics_tile( tile->metrics )[ FD_METRICS_GAUGE_TILE_TID_OFF ];
121+
ulong pid = fd_metrics_tile( tile->metrics )[ FD_METRICS_GAUGE_TILE_PID_OFF ];
122+
123+
/* Skip tiles that don't have valid PID/TID */
124+
if( FD_UNLIKELY( !pid || !tid || pid > INT_MAX || tid > INT_MAX ) ) {
125+
FD_LOG_WARNING(( "skipping tile %s:%lu - invalid PID/TID (pid=%lu, tid=%lu)", tile->name, tile->kind_id, pid, tid ));
126+
continue;
127+
}
109128

110-
FD_TEST( pid<=INT_MAX );
111129
if( FD_UNLIKELY( -1==kill( (int)pid, 0 ) ) ) {
112-
if( FD_UNLIKELY( errno==ESRCH ) ) FD_LOG_ERR(( "tile %s:%lu is not running", config->topo.tiles[ i ].name, config->topo.tiles[ i ].kind_id ));
113-
else FD_LOG_ERR(( "kill() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
130+
if( FD_UNLIKELY( errno==ESRCH ) ) {
131+
FD_LOG_WARNING(( "skipping tile %s:%lu - process not running (pid=%lu)", tile->name, tile->kind_id, pid ));
132+
continue;
133+
} else {
134+
FD_LOG_ERR(( "kill() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
135+
}
136+
}
137+
138+
if( FD_LIKELY( valid_tiles>0UL ) ) {
139+
FD_TEST( fd_cstr_printf_check( threads+len, sizeof(threads)-len, NULL, "," ) );
140+
len += 1UL;
114141
}
115142

116143
ulong arg_len;
117144
FD_TEST( fd_cstr_printf_check( threads+len, sizeof(threads)-len, &arg_len, "%lu", fd_ulong_if( whole_process, pid, tid ) ) );
118145
len += arg_len;
146+
valid_tiles++;
147+
}
148+
149+
if( FD_UNLIKELY( !valid_tiles ) ) {
150+
FD_LOG_ERR(( "No valid running tiles found to profile" ));
119151
}
120-
FD_TEST( len<sizeof(threads) );
121152

122-
FD_LOG_NOTICE(( "/usr/bin/perf script record flamegraph -F 99 -%c %s && /usr/bin/perf script report flamegraph", fd_char_if( whole_process, 'p', 't' ), threads ));
153+
FD_LOG_NOTICE(( "/usr/bin/perf script record flamegraph -F %lu -%c %s && /usr/bin/perf script report flamegraph", args->flame.sample_rate, fd_char_if( whole_process, 'p', 't' ), threads ));
123154

124155
record_pid = fork();
125156
if( FD_UNLIKELY( -1==record_pid ) ) FD_LOG_ERR(( "fork() failed (%i-%s)", errno, fd_io_strerror( errno ) ));
@@ -130,7 +161,7 @@ flame_cmd_fn( args_t * args,
130161
"record",
131162
"flamegraph",
132163
"-F",
133-
"99",
164+
sample_rate_str,
134165
whole_process ? "-p" : "-t",
135166
threads,
136167
NULL,

src/disco/topo/fd_topo.c

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,47 @@ fd_topo_workspace_fill( fd_topo_t * topo,
180180
}
181181
}
182182

183+
void
184+
fd_topo_workspace_fill_resilient( fd_topo_t * topo,
185+
fd_topo_wksp_t * wksp ) {
186+
for( ulong i=0UL; i<topo->link_cnt; i++ ) {
187+
fd_topo_link_t * link = &topo->links[ i ];
188+
189+
if( FD_UNLIKELY( topo->objs[ link->mcache_obj_id ].wksp_id!=wksp->id ) ) continue;
190+
link->mcache = fd_mcache_join( fd_topo_obj_laddr( topo, link->mcache_obj_id ) );
191+
if( FD_UNLIKELY( !link->mcache ) ) {
192+
FD_LOG_WARNING(( "failed to join mcache for link %lu (obj_id=%lu), setting to NULL", i, link->mcache_obj_id ));
193+
}
194+
195+
if( link->mtu ) {
196+
if( FD_UNLIKELY( topo->objs[ link->dcache_obj_id ].wksp_id!=wksp->id ) ) continue;
197+
link->dcache = fd_dcache_join( fd_topo_obj_laddr( topo, link->dcache_obj_id ) );
198+
if( FD_UNLIKELY( !link->dcache ) ) {
199+
FD_LOG_WARNING(( "failed to join dcache for link %lu (obj_id=%lu), setting to NULL", i, link->dcache_obj_id ));
200+
}
201+
}
202+
}
203+
204+
for( ulong i=0UL; i<topo->tile_cnt; i++ ) {
205+
fd_topo_tile_t * tile = &topo->tiles[ i ];
206+
207+
if( FD_LIKELY( topo->objs[ tile->metrics_obj_id ].wksp_id==wksp->id ) ) {
208+
tile->metrics = fd_metrics_join( fd_topo_obj_laddr( topo, tile->metrics_obj_id ) );
209+
if( FD_UNLIKELY( !tile->metrics ) ) {
210+
FD_LOG_WARNING(( "failed to join metrics for tile %s:%lu (obj_id=%lu), setting to NULL", tile->name, tile->kind_id, tile->metrics_obj_id ));
211+
}
212+
}
213+
214+
for( ulong j=0UL; j<tile->in_cnt; j++ ) {
215+
if( FD_UNLIKELY( topo->objs[ tile->in_link_fseq_obj_id[ j ] ].wksp_id!=wksp->id ) ) continue;
216+
tile->in_link_fseq[ j ] = fd_fseq_join( fd_topo_obj_laddr( topo, tile->in_link_fseq_obj_id[ j ] ) );
217+
if( FD_UNLIKELY( !tile->in_link_fseq[ j ] ) ) {
218+
FD_LOG_WARNING(( "failed to join fseq for tile %s:%lu input %lu (obj_id=%lu), setting to NULL", tile->name, tile->kind_id, j, tile->in_link_fseq_obj_id[ j ] ));
219+
}
220+
}
221+
}
222+
}
223+
183224
void
184225
fd_topo_fill_tile( fd_topo_t * topo,
185226
fd_topo_tile_t * tile ) {
@@ -196,6 +237,13 @@ fd_topo_fill( fd_topo_t * topo ) {
196237
}
197238
}
198239

240+
void
241+
fd_topo_fill_resilient( fd_topo_t * topo ) {
242+
for( ulong i=0UL; i<topo->wksp_cnt; i++ ) {
243+
fd_topo_workspace_fill_resilient( topo, &topo->workspaces[ i ] );
244+
}
245+
}
246+
199247
FD_FN_CONST static ulong
200248
fd_topo_tile_extra_huge_pages( fd_topo_tile_t const * tile ) {
201249
/* Every tile maps an additional set of pages for the stack. */

src/disco/topo/fd_topo.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -849,6 +849,19 @@ fd_topo_wksp_new( fd_topo_t const * topo,
849849
void
850850
fd_topo_fill( fd_topo_t * topo );
851851

852+
/* Resilient versions of fd_topo_workspace_fill and fd_topo_fill.  On a
853+
failed join they log a warning and leave the pointer NULL instead of
854+
using FD_TEST, so callers must NULL-check joined pointers before use.
855+
This is useful for diagnostic tools like flame that need to inspect
856+
the topology even when some tiles have been killed or crashed. */
857+
858+
void
859+
fd_topo_workspace_fill_resilient( fd_topo_t * topo,
860+
fd_topo_wksp_t * wksp );
861+
862+
void
863+
fd_topo_fill_resilient( fd_topo_t * topo );
864+
852865
/* fd_topo_tile_stack_join joins a huge page optimized stack for the
853866
provided tile. The stack is assumed to already exist at a known
854867
path in the hugetlbfs mount. */

0 commit comments

Comments
 (0)