Skip to content

Commit 5be3959

Browse files
committed
metrics_record: add new dev command for streaming metrics into CSV
1 parent 6130b8b commit 5be3959

File tree

5 files changed

+261
-1
lines changed

5 files changed

+261
-1
lines changed

src/app/firedancer-dev/main.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,7 @@ extern action_t fd_action_dump;
190190
extern action_t fd_action_flame;
191191
extern action_t fd_action_help;
192192
extern action_t fd_action_metrics;
193+
extern action_t fd_action_metrics_record;
193194
extern action_t fd_action_load;
194195
extern action_t fd_action_pktgen;
195196
extern action_t fd_action_quic_trace;
@@ -219,6 +220,7 @@ action_t * ACTIONS[] = {
219220
&fd_action_set_identity,
220221
&fd_action_help,
221222
&fd_action_metrics,
223+
&fd_action_metrics_record,
222224
&fd_action_version,
223225
&fd_action_bench,
224226
&fd_action_bundle_client,

src/app/shared/commands/help.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ help_cmd_fn( args_t * args FD_PARAM_UNUSED,
2121
FD_LOG_STDOUT(( " --help Print this help message\n\n" ));
2222
FD_LOG_STDOUT(( "SUBCOMMANDS:\n" ));
2323
for( ulong i=0UL; ACTIONS[ i ]; i++ ) {
24-
FD_LOG_STDOUT(( " %13s %s\n", ACTIONS[ i ]->name, ACTIONS[ i ]->description ));
24+
FD_LOG_STDOUT(( " %14s %s\n", ACTIONS[ i ]->name, ACTIONS[ i ]->description ));
2525
}
2626
}
2727

src/app/shared/fd_action.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,18 @@ union fdctl_args {
121121
char topo[ 64 ];
122122
} metrics;
123123

124+
/* Arguments of the metrics-record command.  Zeroed and then populated
   by metrics_record_cmd_args. */
struct {
125+
char topo[ 64 ]; /* value of the --topo flag; an empty string keeps the default topology */
126+
ulong interval_ns; /* sampling period in nanoseconds, derived from --interval (seconds), at least 1 */
127+
128+
ulong selectors_cnt; /* number of populated entries in selectors */
129+
struct fd_action_metrics_record_selector {
130+
char name[ 32 ]; /* metric name; compared for exact equality with strcmp */
131+
char kind[ 16 ]; /* tile kind filter; an empty string matches any tile kind */
132+
ulong kind_id; /* tile instance filter; ULONG_MAX matches any instance */
133+
} selectors[ 128 ]; /* one entry per positional selector argument */
134+
} metrics_record;
135+
124136
struct {
125137
uint fsck : 1;
126138
uint fsck_lthash : 1;

src/app/shared_dev/Local.mk

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ $(call add-objs,commands/dev,fddev_shared)
1313
$(call add-objs,commands/dump,fddev_shared)
1414
$(call add-objs,commands/flame,fddev_shared)
1515
$(call add-objs,commands/load,fddev_shared)
16+
$(call add-objs,commands/metrics_record,fddev_shared)
1617
$(call add-objs,commands/pktgen/pktgen,fddev_shared)
1718
$(call add-objs,commands/txn,fddev_shared)
1819
$(call add-objs,commands/udpecho/udpecho,fddev_shared)
src/app/shared_dev/commands/metrics_record.c

Lines changed: 245 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,245 @@
1+
#include "../../shared/fd_config.h"
2+
#include "../../shared/fd_action.h"
3+
#include "../../../disco/metrics/fd_metrics.h"
4+
5+
#include <errno.h>
6+
#include <signal.h>
7+
#include <stdio.h>
8+
#include <unistd.h>
9+
10+
extern action_t * ACTIONS[];
11+
12+
static int running = 1;
13+
14+
static void
15+
exit_signal( int sig FD_PARAM_UNUSED ) {
16+
running = 0;
17+
}
18+
19+
static void
20+
metrics_record_cmd_args( int * pargc,
21+
char *** pargv,
22+
args_t * args ) {
23+
24+
if( fd_env_strip_cmdline_contains( pargc, pargv, "--help" ) ||
25+
fd_env_strip_cmdline_contains( pargc, pargv, "-h" ) ||
26+
fd_env_strip_cmdline_contains( pargc, pargv, "help" ) ) {
27+
fputs(
28+
"\nUsage: firedancer-dev metrics-record [GLOBAL FLAGS] [FLAGS] metric0 metric1 ... metricN\n"
29+
"\n"
30+
"Flags:\n"
31+
" --topo TOPO Attach to metrics of non-standard topo, such as snapshot-load\n"
32+
" --interval SECONDS How frequently to print a row. Defaults to 1.0 seconds.\n"
33+
"\n"
34+
"Metrics:\n"
35+
" Selector format: `metric_name[,tile_kind[,tile_kind_id]]`\n"
36+
"\n"
37+
" Metrics are primarily identified by their name string. A tile kind string can also\n"
38+
" be given to limit the given metric to only one specific tile type. Similarly, a\n"
39+
" tile kind id can be given (only if tile_kind is also given) to limit to a particular\n"
40+
" tile instance. If these tile kind filters are not given, all matching metrics will\n"
41+
" be recorded.\n"
42+
"\n"
43+
" Examples:\n"
44+
" tile_pid\n"
45+
" tile_backpressure_count,gossip\n"
46+
" tile_status,net,1\n"
47+
"\n",
48+
stderr );
49+
exit( EXIT_SUCCESS );
50+
}
51+
52+
fd_memset( &args->metrics_record, 0, sizeof(args->metrics_record) );
53+
fd_cstr_ncpy( args->metrics_record.topo, fd_env_strip_cmdline_cstr( pargc, pargv, "--topo", NULL, "" ), sizeof(args->metrics_record.topo) );
54+
55+
float _interval = fd_env_strip_cmdline_float( pargc, pargv, "--interval", NULL, 1.0f );
56+
args->metrics_record.interval_ns = fd_ulong_max( 1UL, (ulong)(_interval*1.0e9f) );
57+
58+
ulong const selectors_cnt_max = sizeof(args->metrics_record.selectors)/sizeof(args->metrics_record.selectors[0]);
59+
while( *pargc ) {
60+
if( FD_UNLIKELY( args->metrics_record.selectors_cnt>=selectors_cnt_max ) ) FD_LOG_ERR(( "too many metric selectors given %lu", selectors_cnt_max ));
61+
struct fd_action_metrics_record_selector * selector = &args->metrics_record.selectors[ args->metrics_record.selectors_cnt++ ];
62+
63+
char * name = *pargv[ 0 ];
64+
char * kind = strchr( name, ',' );
65+
char * kind_id = NULL;
66+
if( kind!=NULL ) {
67+
fd_cstr_fini( kind );
68+
kind += 1;
69+
kind_id = strchr( kind, ',' );
70+
if( kind_id!=NULL ) {
71+
fd_cstr_fini( kind_id );
72+
kind_id += 1;
73+
if( FD_UNLIKELY( NULL!=strchr( kind_id, ',' ) ) ) FD_LOG_ERR(( "invalid metric selector %s %s %s", name, kind, kind_id ));
74+
}
75+
}
76+
*pargc -= 1;
77+
*pargv += 1;
78+
79+
if( FD_UNLIKELY( NULL==name || strlen( name )>=sizeof(selector->name)) ) FD_LOG_ERR(( "invalid metric selector name %s", name ));
80+
fd_cstr_ncpy( selector->name, name, sizeof(selector->name) );
81+
if( FD_UNLIKELY( NULL!=kind && strlen( kind )>=sizeof(selector->kind)) ) FD_LOG_ERR(( "invalid metric selector kind %s", kind ));
82+
fd_cstr_ncpy( selector->kind, kind, sizeof(selector->kind) );
83+
selector->kind_id = NULL==kind_id ? ULONG_MAX : fd_cstr_to_ulong( kind_id );
84+
}
85+
}
86+
87+
static int
88+
selector_matches( struct fd_action_metrics_record_selector const * selector,
89+
char const * metric_name,
90+
char const * tile_name,
91+
ulong tile_id ) {
92+
if( 0!=strcmp( metric_name, selector->name ) ) return 0;
93+
if( selector->kind[ 0 ] && 0!=strcmp( tile_name, selector->kind ) ) return 0;
94+
if( ULONG_MAX!=selector->kind_id && tile_id!=selector->kind_id ) return 0;
95+
return 1;
96+
}
97+
98+
static void
99+
reconstruct_topo( fd_config_t * config,
100+
char const * topo_name ) {
101+
if( !topo_name[0] ) return; /* keep default action topo */
102+
103+
action_t const * selected = NULL;
104+
for( action_t ** a=ACTIONS; a!=NULL; a++ ) {
105+
action_t const * action = *a;
106+
if( 0==strcmp( action->name, topo_name ) ) {
107+
selected = action;
108+
break;
109+
}
110+
}
111+
112+
if( !selected ) FD_LOG_ERR(( "Unknown --topo %s", topo_name ));
113+
if( !selected->topo ) FD_LOG_ERR(( "Cannot recover topology for --topo %s", topo_name ));
114+
115+
selected->topo( config );
116+
}
117+
118+
static void
119+
metrics_record_cmd_fn( args_t * args,
120+
fd_config_t * config ) {
121+
122+
struct sigaction sa = { .sa_handler = exit_signal };
123+
if( FD_UNLIKELY( sigaction( SIGTERM, &sa, NULL ) ) ) FD_LOG_ERR(( "sigaction(SIGTERM) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
124+
if( FD_UNLIKELY( sigaction( SIGINT, &sa, NULL ) ) ) FD_LOG_ERR(( "sigaction(SIGINT) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
125+
126+
reconstruct_topo( config, args->metrics_record.topo );
127+
128+
fd_topo_join_workspaces( &config->topo, FD_SHMEM_JOIN_MODE_READ_ONLY );
129+
fd_topo_fill( &config->topo );
130+
131+
uchar write_buf[ 4096 ];
132+
fd_io_buffered_ostream_t out[1];
133+
FD_TEST( out==fd_io_buffered_ostream_init( out, STDOUT_FILENO, write_buf, sizeof(write_buf) ) );
134+
135+
fd_io_buffered_ostream_write( out, "timestamp", 9 );
136+
137+
ulong metrics_cnt = 0UL;
138+
struct {
139+
fd_metrics_meta_t const * meta;
140+
volatile ulong const * value;
141+
} metrics[ 4096 ];
142+
143+
for( ulong i=0UL; i<FD_METRICS_ALL_TOTAL; i++ ) {
144+
fd_metrics_meta_t const * metric = &FD_METRICS_ALL[ i ];
145+
if( metric->type!=FD_METRICS_TYPE_GAUGE && metric->type!=FD_METRICS_TYPE_COUNTER ) continue;
146+
for( ulong j=0UL; j<config->topo.tile_cnt; j++ ) {
147+
fd_topo_tile_t const * tile = &config->topo.tiles[ j ];
148+
char const * tile_name = tile->metrics_name[ 0 ] ? tile->metrics_name : tile->name;
149+
for( ulong s=0UL; s<args->metrics_record.selectors_cnt; s++ ) {
150+
if( FD_LIKELY( !selector_matches( &args->metrics_record.selectors[ s ], metric->name, tile_name, tile->kind_id ) ) ) continue;
151+
if( FD_UNLIKELY( metrics_cnt>=(sizeof(metrics)/sizeof(metrics[0])) ) ) FD_LOG_ERR(( "too many metrics %lu", metrics_cnt ));
152+
metrics[ metrics_cnt ].meta = metric;
153+
metrics[ metrics_cnt ].value = fd_metrics_tile( tile->metrics ) + metric->offset;
154+
++metrics_cnt;
155+
156+
char buf[ 1024 ];
157+
char * p = fd_cstr_append_printf( fd_cstr_init( buf ), ",%s{kind=%s,kind_id=%lu", metric->name, tile->name, tile->kind_id );
158+
if( metric->enum_name ) p = fd_cstr_append_printf( p, ",%s=%s", metric->enum_name, metric->enum_variant );
159+
p = fd_cstr_append_char( p, '}' );
160+
fd_io_buffered_ostream_write( out, buf, (ulong)(p-buf) );
161+
break;
162+
}
163+
}
164+
}
165+
166+
/* TODO: Add support for in/out link metrics */
167+
168+
for( ulong i=0UL; i<FD_METRICS_TILE_KIND_CNT; i++ ) {
169+
for( ulong j=0UL; j<FD_METRICS_TILE_KIND_SIZES[ i ]; j++ ) {
170+
fd_metrics_meta_t const * metric = &FD_METRICS_TILE_KIND_METRICS[ i ][ j ];
171+
if( metric->type!=FD_METRICS_TYPE_GAUGE && metric->type!=FD_METRICS_TYPE_COUNTER ) continue;
172+
for( ulong k=0UL; k<config->topo.tile_cnt; k++ ) {
173+
fd_topo_tile_t const * tile = &config->topo.tiles[ k ];
174+
char const * tile_name = tile->metrics_name[ 0 ] ? tile->metrics_name : tile->name;
175+
if( 0!=strcmp( tile_name, FD_METRICS_TILE_KIND_NAMES[ i ] ) ) continue;
176+
for( ulong s=0UL; s<args->metrics_record.selectors_cnt; s++ ) {
177+
if( FD_LIKELY( !selector_matches( &args->metrics_record.selectors[ s ], metric->name, tile_name, tile->kind_id ) ) ) continue;
178+
if( FD_UNLIKELY( metrics_cnt>=(sizeof(metrics)/sizeof(metrics[0])) ) ) FD_LOG_ERR(( "too many metrics %lu", metrics_cnt ));
179+
metrics[ metrics_cnt ].meta = metric;
180+
metrics[ metrics_cnt ].value = fd_metrics_tile( tile->metrics ) + metric->offset;
181+
++metrics_cnt;
182+
183+
char buf[ 1024 ];
184+
char * p = fd_cstr_append_printf( fd_cstr_init( buf ), ",%s{kind=%s,kind_id=%lu", metric->name, tile->name, tile->kind_id );
185+
if( metric->enum_name ) p = fd_cstr_append_printf( p, ",%s=%s", metric->enum_name, metric->enum_variant );
186+
p = fd_cstr_append_char( p, '}' );
187+
fd_io_buffered_ostream_write( out, buf, (ulong)(p-buf) );
188+
break;
189+
}
190+
}
191+
}
192+
}
193+
194+
if( FD_UNLIKELY( metrics_cnt==0UL ) ) FD_LOG_ERR(( "no matching metrics found" ));
195+
fd_io_buffered_ostream_write( out, "\n", 1 );
196+
fd_io_buffered_ostream_flush( out );
197+
198+
ulong count = 0UL, skip = 0UL;
199+
long const start = fd_log_wallclock();
200+
long const interval = (long)args->metrics_record.interval_ns;
201+
long next = ((start/interval)*interval)+interval;
202+
while( running ) {
203+
long now = fd_log_wait_until( next );
204+
for( next+=interval; next<=now; next+=interval ) skip++;
205+
206+
char * const b = fd_io_buffered_ostream_peek( out );
207+
char * const e = b + fd_io_buffered_ostream_peek_sz( out );
208+
char * p = b;
209+
if( FD_UNLIKELY( e-p<=20L ) ) FD_LOG_ERR(( "increase write buffer size" ));
210+
p = fd_cstr_append_ulong_as_text( p, ' ', '\0', (ulong)now, fd_ulong_base10_dig_cnt( (ulong)now ) );
211+
212+
for( ulong i=0UL; i<metrics_cnt; i++ ) {
213+
ulong value = *metrics[ i ].value;
214+
switch( metrics[ i ].meta->converter ) {
215+
case FD_METRICS_CONVERTER_NANOSECONDS: value = fd_metrics_convert_ticks_to_nanoseconds( value ); break;
216+
case FD_METRICS_CONVERTER_SECONDS: value = (ulong)(fd_metrics_convert_ticks_to_seconds( value ) + 0.5); /* round, not truncate */ break;
217+
case FD_METRICS_CONVERTER_NONE: break;
218+
default: FD_LOG_ERR(( "unknown converter %i", metrics[ i ].meta->converter ));
219+
}
220+
if( FD_UNLIKELY( e-p<=22L ) ) FD_LOG_ERR(( "increase write buffer size" ));
221+
p = fd_cstr_append_char( p, ',' );
222+
p = fd_cstr_append_ulong_as_text( p, ' ', '\0', value, fd_ulong_base10_dig_cnt( value ) );
223+
}
224+
p = fd_cstr_append_char( p, '\n' );
225+
fd_io_buffered_ostream_seek( out, (ulong)(p-b) );
226+
fd_io_buffered_ostream_flush( out );
227+
count++;
228+
}
229+
230+
FD_LOG_NOTICE(( "recorded %lu samples in %f seconds", count, (double)(fd_log_wallclock()-start)/1.0e9 ));
231+
if( skip ) FD_LOG_WARNING(( "skipped %lu samples, try reducing metric count or increasing interval", skip ));
232+
233+
fd_io_buffered_ostream_flush( out );
234+
fd_io_buffered_ostream_fini( out );
235+
236+
fd_topo_leave_workspaces( &config->topo );
237+
}
238+
239+
action_t fd_action_metrics_record = {
240+
.name = "metrics-record",
241+
.description = "Continuously print a select subset of metrics to STDOUT in CSV format",
242+
.is_diagnostic = 1,
243+
.args = metrics_record_cmd_args,
244+
.fn = metrics_record_cmd_fn,
245+
};

0 commit comments

Comments
 (0)