Skip to content

Commit f86cd78

Browse files
committed
repair: end slot tool allow flexible parent offset
1 parent 90b70b6 commit f86cd78

File tree

11 files changed

+77
-33
lines changed

11 files changed

+77
-33
lines changed

book/api/metrics-generated.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -450,8 +450,8 @@
450450
| <span class="metrics-name">shred_&#8203;force_&#8203;complete_&#8203;request</span> | counter | The number of times we received a FEC force complete message |
451451
| <span class="metrics-name">shred_&#8203;force_&#8203;complete_&#8203;failure</span> | counter | The number of times we failed to force complete a FEC set on request |
452452
| <span class="metrics-name">shred_&#8203;force_&#8203;complete_&#8203;success</span> | counter | The number of times we successfully force completed a FEC set on request |
453-
| <span class="metrics-name">shred_&#8203;shred_&#8203;out_&#8203;rcv</span> | counter | The number of times we received a repair shred |
454-
| <span class="metrics-name">shred_&#8203;shred_&#8203;out_&#8203;rcv_&#8203;bytes</span> | counter | The number bytes received from network packets with repair shreds. Bytes include network headers. |
453+
| <span class="metrics-name">shred_&#8203;shred_&#8203;repair_&#8203;rcv</span> | counter | The number of times we received a repair shred |
454+
| <span class="metrics-name">shred_&#8203;shred_&#8203;repair_&#8203;rcv_&#8203;bytes</span> | counter | The number of bytes received from network packets with repair shreds. Bytes include network headers. |
455455
| <span class="metrics-name">shred_&#8203;shred_&#8203;turbine_&#8203;rcv</span> | counter | The number of times we received a turbine shred |
456456
| <span class="metrics-name">shred_&#8203;shred_&#8203;turbine_&#8203;rcv_&#8203;bytes</span> | counter | The number of bytes received from network packets with turbine shreds. Bytes include network headers. |
457457
| <span class="metrics-name">shred_&#8203;store_&#8203;insert_&#8203;wait</span> | histogram | Time in seconds spent waiting for the store to insert a new FEC set |

src/app/firedancer-dev/commands/repair.c

Lines changed: 34 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include "../../../disco/topo/fd_topob.h"
2020
#include "../../../util/pod/fd_pod_format.h"
2121
#include "../../../waltz/resolv/fd_io_readline.h"
22+
#include "../../platform/fd_sys_util.h"
2223
#include "../../shared/commands/monitor/helper.h"
2324
#include "../../../disco/metrics/fd_metrics.h"
2425
#include "../../../discof/repair/fd_repair_tile.c"
@@ -623,7 +624,9 @@ repair_cmd_fn_catchup( args_t * args,
623624
/* Collect all net tiles and their repair_net link metrics */
624625
ulong net_tile_cnt = config->layout.net_tile_count;
625626
volatile ulong ** repair_net_links = aligned_alloc( 8UL, net_tile_cnt * sizeof(volatile ulong*) );
627+
volatile ulong ** net_shred_links = aligned_alloc( 8UL, net_tile_cnt * sizeof(volatile ulong*) );
626628
FD_TEST( repair_net_links );
629+
FD_TEST( net_shred_links );
627630

628631
for( ulong i = 0UL; i < net_tile_cnt; i++ ) {
629632
ulong tile_idx = fd_topo_find_tile( &config->topo, "net", i );
@@ -636,6 +639,15 @@ repair_cmd_fn_catchup( args_t * args,
636639
}
637640
repair_net_links[i] = fd_metrics_link_in( tile->metrics, repair_net_in_idx );
638641
FD_TEST( repair_net_links[i] );
642+
643+
ulong shred_tile_idx = fd_topo_find_tile( &config->topo, "shred", 0 );
644+
if( FD_UNLIKELY( shred_tile_idx == ULONG_MAX ) ) FD_LOG_ERR(( "shred tile 0 not found" ));
645+
fd_topo_tile_t * shred_tile = &config->topo.tiles[ shred_tile_idx ];
646+
647+
ulong shred_out_in_idx = fd_topo_find_tile_in_link( &config->topo, shred_tile, "net_shred", i );
648+
if( FD_UNLIKELY( shred_out_in_idx == ULONG_MAX ) ) FD_LOG_ERR(( "net_shred link not found for shred tile 0" ));
649+
net_shred_links[i] = fd_metrics_link_in( shred_tile->metrics, shred_out_in_idx );
650+
FD_TEST( net_shred_links[i] );
639651
}
640652

641653
FD_LOG_NOTICE(( "Repair catchup run" ));
@@ -668,7 +680,7 @@ repair_cmd_fn_catchup( args_t * args,
668680
int catchup_finished = 0;
669681
if( FD_UNLIKELY( now - last_print > 1e9L ) ) {
670682
char buf2[ 64 ];
671-
ulong rcvd = shred_metrics [ MIDX( COUNTER, SHRED, SHRED_OUT_RCV ) ];
683+
ulong rcvd = shred_metrics [ MIDX( COUNTER, SHRED, SHRED_REPAIR_RCV ) ];
672684
ulong sent = repair_metrics[ MIDX( COUNTER, REPAIR, SENT_PKT_TYPES_NEEDED_WINDOW ) ] +
673685
repair_metrics[ MIDX( COUNTER, REPAIR, SENT_PKT_TYPES_NEEDED_HIGHEST_WINDOW ) ] +
674686
repair_metrics[ MIDX( COUNTER, REPAIR, SENT_PKT_TYPES_NEEDED_ORPHAN ) ];
@@ -690,10 +702,21 @@ repair_cmd_fn_catchup( args_t * args,
690702
for( ulong i = 0UL; i < net_tile_cnt; i++ ) {
691703
volatile ulong * ovar_net_metrics = repair_net_links[i];
692704
total_overrun += ovar_net_metrics[ MIDX( COUNTER, LINK, OVERRUN_READING_FRAG_COUNT ) ];
693-
total_consumed += ovar_net_metrics[ MIDX( COUNTER, LINK, CONSUMED_COUNT ) ];
705+
total_consumed += ovar_net_metrics[ MIDX( COUNTER, LINK, CONSUMED_COUNT ) ]; /* consumed is incremented after after_frag is called */
706+
}
707+
printf( " Outgoing requests overrun: %s\n", fmt_count( buf2, total_overrun ) );
708+
printf( " Outgoing requests consumed: %s\n", fmt_count( buf2, total_consumed ) );
709+
710+
total_overrun = net_shred_links[0][ MIDX( COUNTER, LINK, OVERRUN_READING_FRAG_COUNT ) ];
711+
total_consumed = 0UL;
712+
for( ulong i = 0UL; i < net_tile_cnt; i++ ) {
713+
volatile ulong * ovar_net_metrics = net_shred_links[i];
714+
total_overrun += ovar_net_metrics[ MIDX( COUNTER, LINK, OVERRUN_READING_FRAG_COUNT ) ];
715+
total_consumed += ovar_net_metrics[ MIDX( COUNTER, LINK, CONSUMED_COUNT ) ]; /* shred frag filtering happens manually in after_frag, so no need to index every shred_tile. */
694716
}
695-
printf( " Total overrun: %s\n", fmt_count( buf2, total_overrun ) );
696-
printf( " Net consumed: %s\n", fmt_count( buf2, total_consumed ) );
717+
718+
printf( " Incoming shreds overrun: %s\n", fmt_count( buf2, total_overrun ) );
719+
printf( " Incoming shreds consumed: %s\n", fmt_count( buf2, total_consumed ) );
697720

698721
print_histogram_buckets( repair_metrics,
699722
MIDX( HISTOGRAM, REPAIR, RESPONSE_LATENCY ),
@@ -728,7 +751,8 @@ repair_cmd_fn_catchup( args_t * args,
728751
read_iptable( args->repair.iptable_path, location_table );
729752
print_peer_location_latency( repair_wksp->wksp, repair_ctx );
730753
print_catchup_slots( repair_wksp->wksp, repair_ctx, 0, 1 );
731-
FD_LOG_ERR(("Catchup to slot %lu completed successfully", turbine_slot0));
754+
FD_LOG_NOTICE(("Catchup to slot %lu completed successfully", turbine_slot0));
755+
fd_sys_util_exit_group( 0 );
732756
}
733757
}
734758
}
@@ -892,6 +916,7 @@ static const char * CATCHUP_HELP =
892916
"\n"
893917
"optional arguments:\n"
894918
" -h, --help show this help message and exit\n"
919+
" --end-slot END_SLOT slot to catchup to (generally should be a rooted slot)\n"
895920
" --iptable-path IPTABLE_PATH\n"
896921
" path to iptable file\n"
897922
" --sort-by-slot sort results by slot\n";
@@ -920,11 +945,11 @@ static const char * REQUESTS_HELP =
920945

921946
static const char * WATERFALL_HELP =
922947
"\n\n"
923-
"usage: repair waterfall [-h] [--iptable-path IPTABLE_PATH] [--sort-by-slot]\n"
948+
"usage: repair waterfall [-h] [--iptable IPTABLE_PATH] [--sort-by-slot]\n"
924949
"\n"
925950
"optional arguments:\n"
926951
" -h, --help show this help message and exit\n"
927-
" --iptable-path IPTABLE_PATH\n"
952+
" --iptable IPTABLE_PATH\n"
928953
" path to iptable file\n"
929954
" --sort-by-slot sort results by slot\n";
930955

@@ -966,6 +991,7 @@ repair_cmd_args( int * pargc,
966991
char const * iptable_path = fd_env_strip_cmdline_cstr ( pargc, pargv, "--iptable", NULL, NULL );
967992
ulong slot = fd_env_strip_cmdline_ulong ( pargc, pargv, "--slot", NULL, ULONG_MAX );
968993
int sort_by_slot = fd_env_strip_cmdline_contains( pargc, pargv, "--sort-by-slot" );
994+
ulong end_slot = fd_env_strip_cmdline_ulong ( pargc, pargv, "--end-slot", NULL, 0 );
969995

970996
if( FD_UNLIKELY( !strcmp( args->repair.pos_arg, "catchup" ) && !manifest_path ) ) {
971997
args->repair.help = 1;
@@ -978,6 +1004,7 @@ repair_cmd_args( int * pargc,
9781004
fd_cstr_fini( fd_cstr_append_cstr_safe( fd_cstr_init( args->repair.iptable_path ), iptable_path, sizeof(args->repair.iptable_path )-1UL ) );
9791005
args->repair.slot = slot;
9801006
args->repair.sort_by_slot = sort_by_slot;
1007+
args->repair.end_slot = end_slot;
9811008
}
9821009

9831010
static void

src/disco/gui/fd_gui.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -376,7 +376,7 @@ fd_gui_network_stats_snap( fd_gui_t * gui,
376376
}
377377
}
378378

379-
cur->in.repair = fd_gui_metrics_sum_tiles_counter( topo, "shred", shred_tile_cnt, MIDX( COUNTER, SHRED, SHRED_OUT_RCV_BYTES ) );
379+
cur->in.repair = fd_gui_metrics_sum_tiles_counter( topo, "shred", shred_tile_cnt, MIDX( COUNTER, SHRED, SHRED_REPAIR_RCV_BYTES ) );
380380
ulong repair_tile_idx = fd_topo_find_tile( topo, "repair", 0UL );
381381
if( FD_LIKELY( repair_tile_idx!=ULONG_MAX ) ) {
382382
fd_topo_tile_t const * repair = &topo->tiles[ repair_tile_idx ];

src/disco/metrics/generated/fd_metrics_shred.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,8 @@ const fd_metrics_meta_t FD_METRICS_SHRED[FD_METRICS_SHRED_TOTAL] = {
2222
DECLARE_METRIC( SHRED_FORCE_COMPLETE_REQUEST, COUNTER ),
2323
DECLARE_METRIC( SHRED_FORCE_COMPLETE_FAILURE, COUNTER ),
2424
DECLARE_METRIC( SHRED_FORCE_COMPLETE_SUCCESS, COUNTER ),
25-
DECLARE_METRIC( SHRED_SHRED_OUT_RCV, COUNTER ),
26-
DECLARE_METRIC( SHRED_SHRED_OUT_RCV_BYTES, COUNTER ),
25+
DECLARE_METRIC( SHRED_SHRED_REPAIR_RCV, COUNTER ),
26+
DECLARE_METRIC( SHRED_SHRED_REPAIR_RCV_BYTES, COUNTER ),
2727
DECLARE_METRIC( SHRED_SHRED_TURBINE_RCV, COUNTER ),
2828
DECLARE_METRIC( SHRED_SHRED_TURBINE_RCV_BYTES, COUNTER ),
2929
DECLARE_METRIC_HISTOGRAM_SECONDS( SHRED_STORE_INSERT_WAIT ),

src/disco/metrics/generated/fd_metrics_shred.h

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -114,17 +114,17 @@
114114
#define FD_METRICS_COUNTER_SHRED_FORCE_COMPLETE_SUCCESS_DESC "The number of times we successfully forced completed a FEC set on request"
115115
#define FD_METRICS_COUNTER_SHRED_FORCE_COMPLETE_SUCCESS_CVT (FD_METRICS_CONVERTER_NONE)
116116

117-
#define FD_METRICS_COUNTER_SHRED_SHRED_OUT_RCV_OFF (116UL)
118-
#define FD_METRICS_COUNTER_SHRED_SHRED_OUT_RCV_NAME "shred_shred_out_rcv"
119-
#define FD_METRICS_COUNTER_SHRED_SHRED_OUT_RCV_TYPE (FD_METRICS_TYPE_COUNTER)
120-
#define FD_METRICS_COUNTER_SHRED_SHRED_OUT_RCV_DESC "The number of times we received a repair shred"
121-
#define FD_METRICS_COUNTER_SHRED_SHRED_OUT_RCV_CVT (FD_METRICS_CONVERTER_NONE)
122-
123-
#define FD_METRICS_COUNTER_SHRED_SHRED_OUT_RCV_BYTES_OFF (117UL)
124-
#define FD_METRICS_COUNTER_SHRED_SHRED_OUT_RCV_BYTES_NAME "shred_shred_out_rcv_bytes"
125-
#define FD_METRICS_COUNTER_SHRED_SHRED_OUT_RCV_BYTES_TYPE (FD_METRICS_TYPE_COUNTER)
126-
#define FD_METRICS_COUNTER_SHRED_SHRED_OUT_RCV_BYTES_DESC "The number bytes received from network packets with repair shreds. Bytes include network headers."
127-
#define FD_METRICS_COUNTER_SHRED_SHRED_OUT_RCV_BYTES_CVT (FD_METRICS_CONVERTER_NONE)
117+
#define FD_METRICS_COUNTER_SHRED_SHRED_REPAIR_RCV_OFF (116UL)
118+
#define FD_METRICS_COUNTER_SHRED_SHRED_REPAIR_RCV_NAME "shred_shred_repair_rcv"
119+
#define FD_METRICS_COUNTER_SHRED_SHRED_REPAIR_RCV_TYPE (FD_METRICS_TYPE_COUNTER)
120+
#define FD_METRICS_COUNTER_SHRED_SHRED_REPAIR_RCV_DESC "The number of times we received a repair shred"
121+
#define FD_METRICS_COUNTER_SHRED_SHRED_REPAIR_RCV_CVT (FD_METRICS_CONVERTER_NONE)
122+
123+
#define FD_METRICS_COUNTER_SHRED_SHRED_REPAIR_RCV_BYTES_OFF (117UL)
124+
#define FD_METRICS_COUNTER_SHRED_SHRED_REPAIR_RCV_BYTES_NAME "shred_shred_repair_rcv_bytes"
125+
#define FD_METRICS_COUNTER_SHRED_SHRED_REPAIR_RCV_BYTES_TYPE (FD_METRICS_TYPE_COUNTER)
126+
#define FD_METRICS_COUNTER_SHRED_SHRED_REPAIR_RCV_BYTES_DESC "The number bytes received from network packets with repair shreds. Bytes include network headers."
127+
#define FD_METRICS_COUNTER_SHRED_SHRED_REPAIR_RCV_BYTES_CVT (FD_METRICS_CONVERTER_NONE)
128128

129129
#define FD_METRICS_COUNTER_SHRED_SHRED_TURBINE_RCV_OFF (118UL)
130130
#define FD_METRICS_COUNTER_SHRED_SHRED_TURBINE_RCV_NAME "shred_shred_turbine_rcv"

src/disco/metrics/metrics.xml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -750,8 +750,8 @@ metric introduced.
750750
<counter name="ForceCompleteRequest" summary="The number of times we received a FEC force complete message" />
751751
<counter name="ForceCompleteFailure" summary="The number of times we failed to force complete a FEC set on request" />
752752
<counter name="ForceCompleteSuccess" summary="The number of times we successfully forced completed a FEC set on request" />
753-
<counter name="ShredOutRcv" summary="The number of times we received a repair shred" />
754-
<counter name="ShredOutRcvBytes" summary="The number bytes received from network packets with repair shreds. Bytes include network headers." />
753+
<counter name="ShredRepairRcv" summary="The number of times we received a repair shred" />
754+
<counter name="ShredRepairRcvBytes" summary="The number bytes received from network packets with repair shreds. Bytes include network headers." />
755755
<counter name="ShredTurbineRcv" summary="The number of times we received a turbine shred" />
756756
<counter name="ShredTurbineRcvBytes" summary="The number bytes received from network packets with turbine shreds. Bytes include network headers." />
757757
<histogram name="StoreInsertWait" min="0.00000001" max="0.0005" converter="seconds">

src/disco/shred/fd_shred_tile.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -306,8 +306,8 @@ metrics_write( fd_shred_ctx_t * ctx ) {
306306
FD_MHIST_COPY( SHRED, BATCH_MICROBLOCK_CNT, ctx->metrics->batch_microblock_cnt );
307307
FD_MHIST_COPY( SHRED, SHREDDING_DURATION_SECONDS, ctx->metrics->shredding_timing );
308308
FD_MHIST_COPY( SHRED, ADD_SHRED_DURATION_SECONDS, ctx->metrics->add_shred_timing );
309-
FD_MCNT_SET ( SHRED, SHRED_OUT_RCV, ctx->metrics->repair_rcv_cnt );
310-
FD_MCNT_SET ( SHRED, SHRED_OUT_RCV_BYTES, ctx->metrics->repair_rcv_bytes );
309+
FD_MCNT_SET ( SHRED, SHRED_REPAIR_RCV, ctx->metrics->repair_rcv_cnt );
310+
FD_MCNT_SET ( SHRED, SHRED_REPAIR_RCV_BYTES, ctx->metrics->repair_rcv_bytes );
311311
FD_MCNT_SET ( SHRED, SHRED_TURBINE_RCV, ctx->metrics->turbine_rcv_cnt );
312312
FD_MCNT_SET ( SHRED, SHRED_TURBINE_RCV_BYTES, ctx->metrics->turbine_rcv_bytes );
313313

src/discof/forest/fd_forest.c

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -696,6 +696,15 @@ fd_forest_data_shred_insert( fd_forest_t * forest, ulong slot, ulong parent_slot
696696
return ele;
697697
}
698698

699+
/* fd_forest_blk_parent_update overwrites the recorded parent slot of
   the block keyed by slot.  The block is looked up with query(); if
   no such block is found, returns NULL before any block field is
   assigned.  Otherwise sets ele->parent_slot to parent_slot and
   returns the block.  parent_slot is the only block field assigned in
   this function.  VER_INC runs first and therefore also runs on the
   not-found path.
   NOTE(review): VER_INC is defined outside this view -- presumably it
   bumps the forest version around the mutation; confirm. */
fd_forest_blk_t *
700+
fd_forest_blk_parent_update( fd_forest_t * forest, ulong slot, ulong parent_slot ) {
701+
VER_INC;
702+
fd_forest_blk_t * ele = query( forest, slot );
703+
if( FD_UNLIKELY( !ele ) ) return NULL; /* unknown slot: nothing to update */
704+
ele->parent_slot = parent_slot;
705+
return ele;
706+
}
707+
699708
fd_forest_blk_t *
700709
fd_forest_fec_insert( fd_forest_t * forest, ulong slot, ulong parent_slot, uint last_shred_idx, uint fec_set_idx, int slot_complete, int ref_tick ) {
701710
VER_INC;

src/discof/forest/fd_forest.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -609,6 +609,12 @@ fd_forest_query( fd_forest_t * forest, ulong slot );
609609
fd_forest_blk_t *
610610
fd_forest_blk_insert( fd_forest_t * forest, ulong slot, ulong parent_slot );
611611

612+
/* fd_forest_blk_parent_update updates the parent of a block in the forest.
613+
Needed for profiler mode. */
614+
615+
fd_forest_blk_t *
616+
fd_forest_blk_parent_update( fd_forest_t * forest, ulong slot, ulong parent_slot );
617+
612618
#define SHRED_SRC_TURBINE 0
613619
#define SHRED_SRC_REPAIR 1
614620
#define SHRED_SRC_RECOVERED 2

src/discof/repair/fd_repair_metrics.c

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -141,22 +141,23 @@ fd_repair_metrics_print_sorted( fd_repair_metrics_t * repair_metrics, int verbos
141141
double pipelined_time = (double)(max_ts - min_ts);
142142
double non_pipelined_time = (double)slot_durations_sum;
143143
FD_LOG_NOTICE(( "\n"
144-
"Over past %u completed slots: \n"
144+
"Completed %u slots in %.2f seconds total. \n"
145145
"Average slot duration (time from first shred/rq to all shreds received): %.2f ms\n"
146146
"Average time between slot completions: %.2f ms\n"
147147
"Average slots per second: %.2f\n"
148148
"Pipeline factor (sum duration of all slots / total time): %.2f\n",
149149
total_slots,
150+
(double)fd_metrics_convert_ticks_to_nanoseconds((ulong)pipelined_time) / 1e9,
150151
(double)fd_metrics_convert_ticks_to_nanoseconds((ulong)slot_durations_sum) / (double)total_slots / 1e6,
151152
(double)fd_metrics_convert_ticks_to_nanoseconds((ulong)incremental_cmpl_sum) / (double)total_slots / 1e6,
152153
(double)total_slots / (double)fd_metrics_convert_ticks_to_nanoseconds((ulong)pipelined_time) * 1e9,
153154
non_pipelined_time / pipelined_time ));
154155

155156
FD_LOG_NOTICE(( "\n"
156-
"Total time to finish catchup over %d slots: %.2f ms \n"
157-
"Time to repair orphans: %.2f ms \n"
158-
"Total time from connected orphan to done: %.2f ms \n"
159-
"Slots completed by orphans connected: %d\n",
157+
"Caught up %d slots in %.2f ms total. \n"
158+
"Time to repair orphans: %.2f ms \n"
159+
"Total time from connected orphan to done: %.2f ms \n"
160+
"Slots completed by orphans connected: %d\n",
160161
num_catchup_slots,
161162
(double)fd_metrics_convert_ticks_to_nanoseconds((ulong)(finish_catchup_ts - repair_kickoff_ts)) / 1e6,
162163
(double)fd_metrics_convert_ticks_to_nanoseconds((ulong)(orphan_cmpl_ts - repair_kickoff_ts)) / 1e6,

0 commit comments

Comments
 (0)