From 67c934e3b634b38b79dc6c923dd78dd946652f6a Mon Sep 17 00:00:00 2001 From: npatel-jump Date: Sat, 9 Aug 2025 06:04:38 +0000 Subject: [PATCH] flame: fixed to handle terminated processes --- src/app/shared/fd_action.h | 1 + src/app/shared_dev/commands/flame.c | 59 ++++++++++++++++++++++------- src/disco/topo/fd_topo.c | 48 +++++++++++++++++++++++ src/disco/topo/fd_topo.h | 13 +++++++ 4 files changed, 107 insertions(+), 14 deletions(-) diff --git a/src/app/shared/fd_action.h b/src/app/shared/fd_action.h index 6ebe80a330..5c64e10264 100644 --- a/src/app/shared/fd_action.h +++ b/src/app/shared/fd_action.h @@ -65,6 +65,7 @@ union fdctl_args { struct { char name[ 13UL ]; + ulong sample_rate; } flame; struct { diff --git a/src/app/shared_dev/commands/flame.c b/src/app/shared_dev/commands/flame.c index efc373ac12..ef9170105c 100644 --- a/src/app/shared_dev/commands/flame.c +++ b/src/app/shared_dev/commands/flame.c @@ -45,11 +45,24 @@ flame_cmd_args( int * pargc, char *** pargv, args_t * args ) { - if( FD_UNLIKELY( !*pargc ) ) FD_LOG_ERR(( "usage: flame [all|tile|tile:idx|agave]" )); + if( FD_UNLIKELY( !*pargc ) ) FD_LOG_ERR(( "usage: flame [all|tile|tile:idx|agave] [sample_rate]" )); strncpy( args->flame.name, **pargv, sizeof( args->flame.name ) - 1 ); (*pargc)--; (*pargv)++; + + args->flame.sample_rate = 99UL; /* default 99hz */ + + if( FD_LIKELY( *pargc > 0 ) ) { /* optional */ + char * endptr; + ulong sample_rate = strtoul( **pargv, &endptr, 10 ); + if( FD_UNLIKELY( *endptr != '\0' || sample_rate == 0UL || sample_rate > 50000UL ) ) { + FD_LOG_ERR(( "invalid sample rate `%s` - must be between 1 and 50000 Hz", **pargv )); + } + args->flame.sample_rate = sample_rate; + (*pargc)--; + (*pargv)++; + } } void @@ -58,7 +71,7 @@ flame_cmd_fn( args_t * args, install_parent_signals(); fd_topo_join_workspaces( &config->topo, FD_SHMEM_JOIN_MODE_READ_ONLY ); - fd_topo_fill( &config->topo ); + fd_topo_fill_resilient( &config->topo ); ulong tile_cnt = 0UL; ulong tile_idxs[ 128UL ]; @@ 
-97,29 +110,47 @@ flame_cmd_fn( args_t * args, } char threads[ 4096 ] = {0}; + char sample_rate_str[ 64 ] = {0}; + snprintf( sample_rate_str, sizeof( sample_rate_str ), "%lu", args->flame.sample_rate ); // TODO: not sure if there is a FD version of this or something similar ulong len = 0UL; + ulong valid_tiles = 0UL; for( ulong i=0UL; itopo.tiles[ tile_idxs[ i ] ]; - ulong tid = fd_metrics_tile( config->topo.tiles[ tile_idxs[ i ] ].metrics )[ FD_METRICS_GAUGE_TILE_TID_OFF ]; - ulong pid = fd_metrics_tile( config->topo.tiles[ tile_idxs[ i ] ].metrics )[ FD_METRICS_GAUGE_TILE_PID_OFF ]; + ulong tid = fd_metrics_tile( tile->metrics )[ FD_METRICS_GAUGE_TILE_TID_OFF ]; + ulong pid = fd_metrics_tile( tile->metrics )[ FD_METRICS_GAUGE_TILE_PID_OFF ]; + + /* Skip tiles that don't have valid PID/TID */ + if( FD_UNLIKELY( !pid || !tid || pid > INT_MAX || tid > INT_MAX ) ) { + FD_LOG_WARNING(( "skipping tile %s:%lu - invalid PID/TID (pid=%lu, tid=%lu)", tile->name, tile->kind_id, pid, tid )); + continue; + } - FD_TEST( pid<=INT_MAX ); if( FD_UNLIKELY( -1==kill( (int)pid, 0 ) ) ) { - if( FD_UNLIKELY( errno==ESRCH ) ) FD_LOG_ERR(( "tile %s:%lu is not running", config->topo.tiles[ i ].name, config->topo.tiles[ i ].kind_id )); - else FD_LOG_ERR(( "kill() failed (%i-%s)", errno, fd_io_strerror( errno ) )); + if( FD_UNLIKELY( errno==ESRCH ) ) { + FD_LOG_WARNING(( "skipping tile %s:%lu - process not running (pid=%lu)", tile->name, tile->kind_id, pid )); + continue; + } else { + FD_LOG_ERR(( "kill() failed (%i-%s)", errno, fd_io_strerror( errno ) )); + } + } + + if( FD_LIKELY( valid_tiles>0UL ) ) { + FD_TEST( fd_cstr_printf_check( threads+len, sizeof(threads)-len, NULL, "," ) ); + len += 1UL; } ulong arg_len; FD_TEST( fd_cstr_printf_check( threads+len, sizeof(threads)-len, &arg_len, "%lu", fd_ulong_if( whole_process, pid, tid ) ) ); len += arg_len; + valid_tiles++; + } + + if( FD_UNLIKELY( !valid_tiles ) ) { + FD_LOG_ERR(( "No valid running tiles found to profile" )); } - FD_TEST( 
lenflame.sample_rate, fd_char_if( whole_process, 'p', 't' ), threads )); record_pid = fork(); if( FD_UNLIKELY( -1==record_pid ) ) FD_LOG_ERR(( "fork() failed (%i-%s)", errno, fd_io_strerror( errno ) )); @@ -130,7 +161,7 @@ flame_cmd_fn( args_t * args, "record", "flamegraph", "-F", - "99", + sample_rate_str, whole_process ? "-p" : "-t", threads, NULL, diff --git a/src/disco/topo/fd_topo.c b/src/disco/topo/fd_topo.c index 7bcebc047a..ae629868e5 100644 --- a/src/disco/topo/fd_topo.c +++ b/src/disco/topo/fd_topo.c @@ -180,6 +180,47 @@ fd_topo_workspace_fill( fd_topo_t * topo, } } +void +fd_topo_workspace_fill_resilient( fd_topo_t * topo, + fd_topo_wksp_t * wksp ) { + for( ulong i=0UL; i<topo->link_cnt; i++ ) { + fd_topo_link_t * link = &topo->links[ i ]; + + if( FD_UNLIKELY( topo->objs[ link->mcache_obj_id ].wksp_id!=wksp->id ) ) continue; + link->mcache = fd_mcache_join( fd_topo_obj_laddr( topo, link->mcache_obj_id ) ); + if( FD_UNLIKELY( !link->mcache ) ) { + FD_LOG_WARNING(( "failed to join mcache for link %lu (obj_id=%lu), setting to NULL", i, link->mcache_obj_id )); + } + + if( link->mtu ) { + if( FD_UNLIKELY( topo->objs[ link->dcache_obj_id ].wksp_id!=wksp->id ) ) continue; + link->dcache = fd_dcache_join( fd_topo_obj_laddr( topo, link->dcache_obj_id ) ); + if( FD_UNLIKELY( !link->dcache ) ) { + FD_LOG_WARNING(( "failed to join dcache for link %lu (obj_id=%lu), setting to NULL", i, link->dcache_obj_id )); + } + } + } + + for( ulong i=0UL; i<topo->tile_cnt; i++ ) { + fd_topo_tile_t * tile = &topo->tiles[ i ]; + + if( FD_LIKELY( topo->objs[ tile->metrics_obj_id ].wksp_id==wksp->id ) ) { + tile->metrics = fd_metrics_join( fd_topo_obj_laddr( topo, tile->metrics_obj_id ) ); + if( FD_UNLIKELY( !tile->metrics ) ) { + FD_LOG_WARNING(( "failed to join metrics for tile %s:%lu (obj_id=%lu), setting to NULL", tile->name, tile->kind_id, tile->metrics_obj_id )); + } + } + + for( ulong j=0UL; j<tile->in_cnt; j++ ) { + if( FD_UNLIKELY( topo->objs[ tile->in_link_fseq_obj_id[ j ] ].wksp_id!=wksp->id ) 
) continue; + tile->in_link_fseq[ j ] = fd_fseq_join( fd_topo_obj_laddr( topo, tile->in_link_fseq_obj_id[ j ] ) ); + if( FD_UNLIKELY( !tile->in_link_fseq[ j ] ) ) { + FD_LOG_WARNING(( "failed to join fseq for tile %s:%lu input %lu (obj_id=%lu), setting to NULL", tile->name, tile->kind_id, j, tile->in_link_fseq_obj_id[ j ] )); + } + } + } +} + void fd_topo_fill_tile( fd_topo_t * topo, fd_topo_tile_t * tile ) { @@ -196,6 +237,13 @@ fd_topo_fill( fd_topo_t * topo ) { } } +void +fd_topo_fill_resilient( fd_topo_t * topo ) { + for( ulong i=0UL; i<topo->wksp_cnt; i++ ) { + fd_topo_workspace_fill_resilient( topo, &topo->workspaces[ i ] ); + } +} + FD_FN_CONST static ulong fd_topo_tile_extra_huge_pages( fd_topo_tile_t const * tile ) { /* Every tile maps an additional set of pages for the stack. */ diff --git a/src/disco/topo/fd_topo.h b/src/disco/topo/fd_topo.h index d6c446f83e..cc5445fe46 100644 --- a/src/disco/topo/fd_topo.h +++ b/src/disco/topo/fd_topo.h @@ -849,6 +849,19 @@ fd_topo_wksp_new( fd_topo_t const * topo, void fd_topo_fill( fd_topo_t * topo ); +/* Resilient versions of fd_topo_workspace_fill and fd_topo_fill that + handle failed joins gracefully by setting pointers to NULL and + logging warnings instead of using FD_TEST. This is useful for + diagnostic tools like flame that need to inspect topology even when + some tiles have been killed or crashed. */ + +void +fd_topo_workspace_fill_resilient( fd_topo_t * topo, + fd_topo_wksp_t * wksp ); + +void +fd_topo_fill_resilient( fd_topo_t * topo ); + /* fd_topo_tile_stack_join joins a huge page optimized stack for the provided tile. The stack is assumed to already exist at a known path in the hugetlbfs mount. */