Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 84 additions & 0 deletions book/api/websocket.md
Original file line number Diff line number Diff line change
Expand Up @@ -545,6 +545,13 @@ Some interesting transitions are,
"loading_incremental_snapshot_decompress_bytes_compressed": null,
"loading_incremental_snapshot_insert_bytes_decompressed": null,
"loading_incremental_snapshot_insert_accounts": null,
"wait_for_supermajority_bank_hash": "2CeCyRoYmcctDmbXWrSUfTT4aQkGVCnArAmbdmQ5dGFi",
"wait_for_supermajority_shred_version": "37500",
"wait_for_supermajority_attempt": 1,
"wait_for_supermajority_total_stake": 1,
"wait_for_supermajority_connected_stake": 1,
"wait_for_supermajority_total_peers": 1,
"wait_for_supermajority_connected_peers": 1,
"catching_up_elapsed": null,
"catching_up_first_replay_slot": null,
}
Expand All @@ -568,10 +575,31 @@ Some interesting transitions are,
| loading_{full\|incremental}_snapshot_decompress_bytes_compressed | `number\|null` | If the phase is at least `loading_{full\|incremental}_snapshot`, this is the (compressed) number of bytes processed by decompress from the snapshot so far. Otherwise, `null` |
| loading_{full\|incremental}_snapshot_insert_bytes_decompressed | `number\|null` | If the phase is at least `loading_{full\|incremental}_snapshot`, this is the (decompressed) number of bytes processed from the snapshot by the snapshot insert time so far. Otherwise, `null` |
| loading_{full\|incremental}_snapshot_insert_accounts | `number\|null` | If the phase is at least `loading_{full\|incremental}_snapshot`, this is the current number of inserted accounts from the snapshot into the validator's accounts database. Otherwise, `null` |
| wait_for_supermajority_bank_hash | `string\|null` | If the client was configured to include the `waiting_for_supermajority` phase at startup, this is the expected bank hash of the snapshot bank. This ensures all validators join the cluster with the same starting state. `null` if wait for supermajority is not configured |
| wait_for_supermajority_shred_version | `string\|null` | If the client was configured to include the `waiting_for_supermajority` phase at startup, this is the expected shred version it was configured with. Shred version is functionally a hash of (genesis_hash, cluster_restart_history) which ensures only nodes which explicitly agree on the restart slot and restart attempt count can communicate with each other. `null` if wait for supermajority is not configured |
| wait_for_supermajority_attempt | `number\|null` | If the client was configured to include the `waiting_for_supermajority` phase at startup, this is the number of times this cluster has been restarted onto the snapshot slot, including the current attempt. `null` if wait for supermajority is not configured |
| wait_for_supermajority_total_stake | `number\|null` | If the phase is at least `waiting_for_supermajority`, this is the total network stake in lamports used to determine the 80% restart threshold |
| wait_for_supermajority_connected_stake | `number\|null` | If the phase is at least `waiting_for_supermajority`, this is the network stake in lamports that is currently active on gossip and waiting for the restart threshold |
| wait_for_supermajority_total_peers | `number\|null` | If the phase is at least `waiting_for_supermajority`, this is the total number of peers with an active stake |
| wait_for_supermajority_connected_peers | `number\|null` | If the phase is at least `waiting_for_supermajority`, this is the number of peers with active stake currently active on gossip and waiting for the restart threshold |
Comment on lines +583 to +584
| catching_up_elapsed_seconds | `number` | If the phase is `catching_up`, this is the duration, in seconds, the validator has spent catching up to the current slot |
| catching_up_first_replay_slot | `number` | If the phase is `catching_up`, this is the first slot that exited the replay pipeline after booting |


The `wait_for_supermajority_*` fields will be `null` if the
client is not configured to wait for a cluster restart, which is the
case for typical client usage.

The `wait_for_supermajority_*_stake` fields are derived
differently from the `gossip.network_stats.health` activated stake
(which is from the start of the epoch). These fields account for any
stake that is activating/deactivating in the current epoch and any stake
that was explicitly undelegated prior to restart (e.g. inactive testnet
participants or bad actors).

During the `waiting_for_supermajority` phase, per-peer offline status
is available via the [`wfs_offline_peers`](#wfs_offline_peers) topic.

#### `summary.schedule_strategy`
| frequency | type | example |
|------------|----------|---------|
Expand Down Expand Up @@ -1749,6 +1777,7 @@ identity is no longer in these three data sources, it will be removed.
"vote": [
{
"vote_pubkey": "8ri9HeWZv4Dcf4BD46pVPjmefzJLpbtfdAtyxyeG4enL",
"prev_stake": "0",
"activated_stake": "5812",
"last_vote": 281795801,
"root_slot": 281795770,
Expand Down Expand Up @@ -1791,6 +1820,7 @@ identity is no longer in these three data sources, it will be removed.
|-----------------|----------------|-------------|
| vote_pubkey | `string` | The public key of vote account, encoded in base58 |
| activated_stake | `string` | The amount of stake in lamports that is activated on this vote account for the current epoch. Warming up or cooling down stake that was delegating during this epoch is not included |
| prev_stake | `string\|null` | The amount of stake in lamports that is activated on this vote account at the start of the previous epoch. Will be `null` on Frankendancer (unsupported) and when no previous epoch exists (e.g. genesis or first epoch) |
| last_vote | `number\|null` | The last vote by the vote account that was landed on chain, as seen by this validator. If the vote account has not yet landed any votes on the chain this will be `null` |
| root_slot | `number\|null` | The last slot that was rooted by the vote account, based on the vote history. If the vote account has not yet rooted any slots this will be `null` |
| epoch_credits | `number` | The number of credits earned by the vote account during the current epoch |
Expand Down Expand Up @@ -1831,6 +1861,60 @@ full and includes this node itself, nodes with a different
`shred_version`, nodes publishing corrupt or bad information, and so
on.

### wfs_offline_peers
The `wfs_offline_peers` topic publishes edge-triggered add/remove
messages for staked peers that are **not** currently active on gossip.

The message is only published if the client is configured to boot with
the `waiting_for_supermajority` phase enabled.

#### `wfs_offline_peers.update`
| frequency | type | example |
|------------------|-----------------------------------|---------|
| *Once* + *Live*  | `ClusterRestartOfflinePeersUpdate` | below |

::: details Example

**Initial state (all peers offline):**
```json
{
"topic": "wfs_offline_peers",
"key": "update",
"value": {
"add": [
{
"identity": "Fe4StcZSQ228dKK2hni7aCP7ZprNhj8QKWzFe5usGFYF",
"stake": "5812",
"info": null
}
],
"remove": [
{ "identity": "9aE6Bp1hbDpMFKqnWGUMbfxfMPXswPbkNwNrSjhpFiSN" }
]
}
}
```

:::

**`ClusterRestartOfflinePeerUpdate`**
| Field | Type | Description |
|-----------------|------------------------|-------------|
| identity | `string` | Identity public key of the validator, encoded in base58 |
| stake | `string` | Total stake in lamports for this identity (aggregated across all vote accounts), derived from the snapshot manifest |
| info | `PeerUpdateInfo\|null` | Self-reported validator information from the ConfigProgram, if available, and `null` otherwise |

**`ClusterRestartOfflinePeerRemove`**
| Field | Type | Description |
|----------|----------|-------------|
| identity | `string` | Identity public key of the validator, encoded in base58 |

**`ClusterRestartOfflinePeersUpdate`**
| Field | Type | Description |
|--------|-------------------------------------|-------------|
| add | `ClusterRestartOfflinePeerUpdate[]` | List of peers that became offline (or all offline peers on initial connect) |
| remove | `ClusterRestartOfflinePeerRemove[]` | List of peers that came back online and are no longer offline |

### slot
Slots are opportunities for a leader to produce a block. A slot can be
in one of five levels, and in typical operation a slot moves through
Expand Down
3 changes: 3 additions & 0 deletions src/app/firedancer/topology.c
Original file line number Diff line number Diff line change
Expand Up @@ -1322,6 +1322,7 @@ fd_topo_initialize( config_t * config ) {
if( FD_LIKELY( snapshots_enabled ) ) {
/**/ fd_topob_tile_in( topo, "gui", 0UL, "metric_in", "snapct_gui", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED );
/**/ fd_topob_tile_in( topo, "gui", 0UL, "metric_in", "snapin_gui", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED );
/**/ fd_topob_tile_in( topo, "gui", 0UL, "metric_in", "snapin_manif", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED );
}
if( FD_UNLIKELY( config->tiles.bundle.enabled ) ) {
/**/ fd_topob_tile_in( topo, "gui", 0UL, "metric_in", "bundle_status", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED );
Expand Down Expand Up @@ -1831,6 +1832,8 @@ fd_topo_configure_tile( fd_topo_tile_t * tile,
tile->gui.schedule_strategy = config->tiles.pack.schedule_strategy_enum;
tile->gui.websocket_compression = 1;
tile->gui.frontend_release_channel = config->development.gui.frontend_release_channel_enum;
fd_cstr_ncpy( tile->gui.wfs_bank_hash, config->firedancer.consensus.wait_for_supermajority_with_bank_hash, sizeof(tile->gui.wfs_bank_hash) );
tile->gui.wfs_expected_shred_version = config->consensus.expected_shred_version;

} else if( FD_UNLIKELY( !strcmp( tile->name, "rpc" ) ) ) {

Expand Down
34 changes: 33 additions & 1 deletion src/disco/gui/fd_gui.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#include "fd_gui_metrics.h"

#include "../metrics/fd_metrics.h"
#include "../../discof/gossip/fd_gossip_tile.h"
#include "../plugin/fd_plugin.h"

#include "../../ballet/base58/fd_base58.h"
Expand Down Expand Up @@ -113,6 +114,11 @@ fd_gui_new( void * shmem,
}
gui->summary.boot_progress.catching_up_time_nanos = 0L;
gui->summary.boot_progress.catching_up_first_replay_slot = ULONG_MAX;
gui->summary.boot_progress.wfs_total_stake = 0UL;
gui->summary.boot_progress.wfs_connected_stake = 0UL;
gui->summary.boot_progress.wfs_total_peers = 0UL;
gui->summary.boot_progress.wfs_connected_peers = 0UL;
gui->summary.boot_progress.wfs_attempt = 0UL;
} else {
fd_memset( &gui->summary.boot_progress, 0, sizeof(gui->summary.boot_progress) );
gui->summary.boot_progress.phase = FD_GUI_BOOT_PROGRESS_TYPE_RUNNING;
Expand Down Expand Up @@ -786,13 +792,19 @@ fd_gui_run_boot_progress( fd_gui_t * gui, long now ) {
fd_topo_tile_t const * snapin = &gui->topo->tiles[ fd_topo_find_tile( gui->topo, "snapin", 0UL ) ];
volatile ulong * snapin_metrics = fd_metrics_tile( snapin->metrics );

fd_topo_tile_t const * gossip = &gui->topo->tiles[ fd_topo_find_tile( gui->topo, "gossip", 0UL ) ];
volatile ulong * gossip_metrics = fd_metrics_tile( gossip->metrics );

ulong snapshot_phase = snapct_metrics[ MIDX( GAUGE, SNAPCT, STATE ) ];
ulong wfs_state = gossip_metrics[ MIDX( GAUGE, GOSSIP, WFS_STATE ) ];

/* state transitions */
if( FD_UNLIKELY( gui->summary.slot_caught_up!=ULONG_MAX ) ) {
gui->summary.boot_progress.phase = FD_GUI_BOOT_PROGRESS_TYPE_RUNNING;
} else if( FD_LIKELY( snapshot_phase == FD_SNAPCT_STATE_SHUTDOWN && gui->summary.slots_max_turbine[ 0 ].slot!=ULONG_MAX && gui->summary.slot_completed!=ULONG_MAX ) ) {
} else if( FD_LIKELY( snapshot_phase == FD_SNAPCT_STATE_SHUTDOWN && wfs_state==FD_GOSSIP_WFS_STATE_DONE && gui->summary.slots_max_turbine[ 0 ].slot!=ULONG_MAX && gui->summary.slot_completed!=ULONG_MAX ) ) {
gui->summary.boot_progress.phase = FD_GUI_BOOT_PROGRESS_TYPE_CATCHING_UP;
} else if( FD_UNLIKELY( snapshot_phase == FD_SNAPCT_STATE_SHUTDOWN && wfs_state==FD_GOSSIP_WFS_STATE_WAIT ) ) {
gui->summary.boot_progress.phase = FD_GUI_BOOT_PROGRESS_TYPE_WAITING_FOR_SUPERMAJORITY;
} else if( FD_LIKELY( snapshot_phase==FD_SNAPCT_STATE_READING_FULL_FILE
|| snapshot_phase==FD_SNAPCT_STATE_FLUSHING_FULL_FILE_FINI
|| snapshot_phase==FD_SNAPCT_STATE_FLUSHING_FULL_FILE_DONE
Expand Down Expand Up @@ -859,6 +871,13 @@ fd_gui_run_boot_progress( fd_gui_t * gui, long now ) {

break;
}
case FD_GUI_BOOT_PROGRESS_TYPE_WAITING_FOR_SUPERMAJORITY: {
gui->summary.boot_progress.wfs_total_stake = gossip_metrics[ MIDX( GAUGE, GOSSIP, WFS_STAKE_TOTAL ) ];
gui->summary.boot_progress.wfs_connected_stake = gossip_metrics[ MIDX( GAUGE, GOSSIP, WFS_STAKE_ONLINE ) ];
gui->summary.boot_progress.wfs_total_peers = gossip_metrics[ MIDX( GAUGE, GOSSIP, WFS_STAKED_PEERS_TOTAL ) ];
gui->summary.boot_progress.wfs_connected_peers = gossip_metrics[ MIDX( GAUGE, GOSSIP, WFS_STAKED_PEERS_ONLINE ) ];
break;
}
case FD_GUI_BOOT_PROGRESS_TYPE_CATCHING_UP: {
gui->summary.boot_progress.catching_up_time_nanos = now;
break;
Expand Down Expand Up @@ -2472,6 +2491,19 @@ fd_gui_handle_snapshot_update( fd_gui_t * gui,
fd_cstr_printf_check( gui->summary.boot_progress.loading_snapshot[ snapshot_idx ].read_path, sizeof(gui->summary.boot_progress.loading_snapshot[ snapshot_idx ].read_path), NULL, "%s", msg->read_path );
}

void
fd_gui_handle_snapshot_manifest( fd_gui_t * gui,
                                 fd_snapshot_manifest_t const * manifest ) {
  /* Derive the cluster restart attempt count shown during the
     waiting_for_supermajority phase.  If the snapshot slot itself is
     recorded as a hard fork in the manifest, the attempt count is that
     hard fork's counter; otherwise it stays zero. */
  gui->summary.boot_progress.wfs_attempt = 0UL;
  for( ulong idx=0UL; idx<manifest->hard_forks_len; idx++ ) {
    if( FD_LIKELY( manifest->hard_forks[ idx ]!=manifest->slot ) ) continue;
    gui->summary.boot_progress.wfs_attempt = manifest->hard_forks_cnts[ idx ];
    break;
  }
}

static void
fd_gui_handle_reset_slot( fd_gui_t * gui, ulong reset_slot, long now ) {
FD_TEST( reset_slot!=ULONG_MAX );
Expand Down
20 changes: 18 additions & 2 deletions src/disco/gui/fd_gui.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include "../../disco/fd_txn_p.h"
#include "../../disco/bundle/fd_bundle_tile.h"
#include "../../discof/restore/fd_snapct_tile.h"
#include "../../discof/restore/utils/fd_ssmsg.h"
#include "../../discof/tower/fd_tower_tile.h"
#include "../../discof/replay/fd_replay_tile.h"
#include "../../choreo/tower/fd_tower.h"
Expand Down Expand Up @@ -108,8 +109,9 @@ struct fd_gui_validator_info {
#define FD_GUI_BOOT_PROGRESS_TYPE_JOINING_GOSSIP (1)
#define FD_GUI_BOOT_PROGRESS_TYPE_LOADING_FULL_SNAPSHOT (2)
#define FD_GUI_BOOT_PROGRESS_TYPE_LOADING_INCREMENTAL_SNAPSHOT (3)
#define FD_GUI_BOOT_PROGRESS_TYPE_CATCHING_UP (4)
#define FD_GUI_BOOT_PROGRESS_TYPE_RUNNING (5)
#define FD_GUI_BOOT_PROGRESS_TYPE_WAITING_FOR_SUPERMAJORITY (4)
#define FD_GUI_BOOT_PROGRESS_TYPE_CATCHING_UP (5)
#define FD_GUI_BOOT_PROGRESS_TYPE_RUNNING (6)

#define FD_GUI_BOOT_PROGRESS_FULL_SNAPSHOT_IDX (0UL)
#define FD_GUI_BOOT_PROGRESS_INCREMENTAL_SNAPSHOT_IDX (1UL)
Expand Down Expand Up @@ -559,6 +561,10 @@ struct fd_gui {
char const * version;
char const * cluster;

char wfs_bank_hash[ FD_BASE58_ENCODED_32_SZ ];
ushort wfs_expected_shred_version;
int wfs_enabled;

ulong vote_distance;
int vote_state;

Expand Down Expand Up @@ -616,6 +622,12 @@ struct fd_gui {
ulong insert_accounts_current;
} loading_snapshot[ FD_GUI_BOOT_PROGRESS_SNAPSHOT_CNT ];

ulong wfs_total_stake;
ulong wfs_connected_stake;
ulong wfs_total_peers;
ulong wfs_connected_peers;
ulong wfs_attempt;

long catching_up_time_nanos;
ulong catching_up_first_replay_slot;
} boot_progress;
Expand Down Expand Up @@ -897,6 +909,10 @@ void
fd_gui_handle_snapshot_update( fd_gui_t * gui,
fd_snapct_update_t const * msg );

void
fd_gui_handle_snapshot_manifest( fd_gui_t * gui,
fd_snapshot_manifest_t const * manifest );

void
fd_gui_handle_leader_schedule( fd_gui_t * gui,
fd_stake_weight_msg_t const * leader_schedule,
Expand Down
Loading
Loading