Skip to content

Commit d517f11

Browse files
author
Stanisław Drozd
authored
[wormhole-attester] Add a healthcheck on the metrics port (#429)
* wormhole attester: Add a healthcheck on the metrics port * pyth2wormhole healthcheck: apply review advice - Move metrics/healthcheck counter updates to be next to each other - Change "0-sized window disables healthcheck" into an explicit config value - move healthcheck updates past the atomic counter updates
1 parent 5175445 commit d517f11

File tree

8 files changed

+293
-102
lines changed

8 files changed

+293
-102
lines changed

devnet/p2w-attest.yaml

Lines changed: 29 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -45,11 +45,35 @@ spec:
4545
- name: P2W_EXIT_ON_ERROR
4646
value: "true"
4747
tty: true
48-
readinessProbe:
49-
tcpSocket:
50-
port: 2000
51-
periodSeconds: 1
52-
failureThreshold: 300
48+
# Probes, in order of appearance https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/
49+
#
50+
# Startup probe - delays other probes until it gets its first success
51+
startupProbe:
52+
httpGet:
53+
path: /healthcheck
54+
port: 3000
55+
failureThreshold: 100 # up to 100 * 10 seconds to report initial healthy status
56+
periodSeconds: 10
57+
# Readiness probe - Used to tell load balancers to
58+
# start/stop sending traffic to the container, *without*
59+
# restarting it. The attester does not accept any traffic as
60+
# part of its workflow, which means this isn't very useful.
61+
# readinessProbe:
62+
# httpGet:
63+
# path: /healthcheck
64+
# port: 3000
65+
# failureThreshold: 1
66+
# periodSeconds: 10
67+
#
68+
# Liveness probe - decides restarts for misbehaving
69+
# containers
70+
livenessProbe:
71+
httpGet:
72+
path: /healthcheck
73+
port: 3000
74+
failureThreshold: 1 # If the attester healthcheck fails once, the container is restarted
75+
periodSeconds: 10
76+
5377
ports:
5478
- containerPort: 4343
5579
name: p2w-attest

solana/pyth2wormhole/Cargo.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

solana/pyth2wormhole/client/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "pyth2wormhole-client"
3-
version = "1.2.0"
3+
version = "1.3.0"
44
edition = "2018"
55

66
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

solana/pyth2wormhole/client/src/attestation_cfg.rs

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,14 @@ pub struct AttestationConfig {
3535
#[serde(default = "default_max_msg_accounts")]
3636
pub max_msg_accounts: u64,
3737

38+
/// How many consecutive attestation failures cause the service to
39+
/// report as unhealthy.
40+
#[serde(default = "default_healthcheck_window_size")]
41+
pub healthcheck_window_size: u64,
42+
43+
#[serde(default = "default_enable_healthcheck")]
44+
pub enable_healthcheck: bool,
45+
3846
/// Optionally, we take a mapping account to add remaining symbols from a Pyth deployments.
3947
/// These symbols are processed under `default_attestation_conditions`.
4048
#[serde(
@@ -85,7 +93,7 @@ impl AttestationConfig {
8593

8694
name_to_symbols
8795
.entry(name.clone())
88-
.or_insert(vec![])
96+
.or_default()
8997
.push(symbol);
9098
}
9199
}
@@ -289,6 +297,14 @@ pub const fn default_min_msg_reuse_interval_ms() -> u64 {
289297
10_000 // 10s
290298
}
291299

300+
/// Default for `AttestationConfig::healthcheck_window_size` (referenced by its
/// `#[serde(default = ...)]` attribute): a window of 100 attestation results.
pub const fn default_healthcheck_window_size() -> u64 {
301+
100
302+
}
303+
304+
/// Default for `AttestationConfig::enable_healthcheck` (referenced by its
/// `#[serde(default = ...)]` attribute): the healthcheck is enabled.
pub const fn default_enable_healthcheck() -> bool {
305+
true
306+
}
307+
292308
/// Returns 15. Presumably the serde default for
/// `AttestationConfig::mapping_reload_interval_mins`, in minutes as the name
/// suggests -- the attribute using it is not visible here, confirm.
pub const fn default_mapping_reload_interval_mins() -> u64 {
293309
15
294310
}
@@ -473,6 +489,8 @@ mod tests {
473489
let cfg = AttestationConfig {
474490
min_msg_reuse_interval_ms: 1000,
475491
max_msg_accounts: 100_000,
492+
enable_healthcheck: true,
493+
healthcheck_window_size: 100,
476494
min_rpc_interval_ms: 2123,
477495
mapping_addr: None,
478496
mapping_reload_interval_mins: 42,
@@ -555,6 +573,8 @@ mod tests {
555573
let cfg = AttestationConfig {
556574
min_msg_reuse_interval_ms: 1000,
557575
max_msg_accounts: 100_000,
576+
healthcheck_window_size: 100,
577+
enable_healthcheck: true,
558578
min_rpc_interval_ms: 2123,
559579
mapping_addr: None,
560580
mapping_reload_interval_mins: 42,
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
use {
2+
std::{
3+
collections::VecDeque,
4+
sync::Arc,
5+
},
6+
tokio::sync::Mutex,
7+
};
8+
9+
// Global healthcheck state, shared behind an `Arc` and an async (tokio) `Mutex`.
// It is created disabled with a window size of 1 (`HealthCheckState::new(1, false)`).
// NOTE(review): presumably the configured window size / enable flag are
// installed over this placeholder at startup -- confirm against the caller.
lazy_static::lazy_static! {
10+
pub static ref HEALTHCHECK_STATE: Arc<Mutex<HealthCheckState>> = Arc::new(Mutex::new(HealthCheckState::new(1, false)));
11+
}
12+
13+
/// Helper structure for deciding service health
14+
pub struct HealthCheckState {
15+
/// Whether to report the healthy/unhealthy status
16+
pub enable: bool,
17+
/// Sliding LIFO window over last `max_window_size` attestation results (true = ok, false = error)
18+
pub window: VecDeque<bool>,
19+
/// Window size
20+
pub max_window_size: usize,
21+
}
22+
23+
24+
impl HealthCheckState {
25+
pub fn new(max_window_size: usize, enable: bool) -> Self {
26+
Self {
27+
enable,
28+
window: VecDeque::with_capacity(max_window_size),
29+
max_window_size,
30+
}
31+
}
32+
/// Check service health, return None if not enough data is present
33+
pub fn is_healthy(&self) -> Option<bool> {
34+
if self.window.len() >= self.max_window_size && self.enable {
35+
// If all results are false, return false (unhealthy).
36+
Some(self.window.iter().any(|entry| *entry))
37+
} else {
38+
// The window isn't big enough yet or the healthcheck is disabled
39+
None
40+
}
41+
}
42+
43+
/// Rotate the window
44+
pub fn add_result(&mut self, res: bool) {
45+
self.window.push_front(res);
46+
47+
// Trim window back to size if needed. truncate() deletes from
48+
// the back and has no effect if new size is greater than
49+
// current size.
50+
self.window.truncate(self.max_window_size);
51+
}
52+
}

solana/pyth2wormhole/client/src/lib.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
pub mod attestation_cfg;
22
pub mod batch_state;
3+
pub mod healthcheck;
34
pub mod message;
45
pub mod util;
56

@@ -10,6 +11,10 @@ pub use {
1011
P2WSymbol,
1112
},
1213
batch_state::BatchState,
14+
healthcheck::{
15+
HealthCheckState,
16+
HEALTHCHECK_STATE,
17+
},
1318
message::P2WMessageQueue,
1419
pyth2wormhole::Pyth2WormholeConfig,
1520
util::{

0 commit comments

Comments
 (0)