Skip to content

Commit 24297c2

Browse files
apollo_dashboard: split alerts to different modules (#8348)
1 parent 00e7cab commit 24297c2

File tree

14 files changed

+1531
-1324
lines changed

14 files changed

+1531
-1324
lines changed

crates/apollo_dashboard/src/alert_definitions.rs

Lines changed: 111 additions & 1324 deletions
Large diffs are not rendered by default.
Lines changed: 170 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,170 @@
1+
use apollo_consensus::metrics::CONSENSUS_ROUND_ABOVE_ZERO;
2+
use apollo_consensus_manager::metrics::CONSENSUS_NUM_CONNECTED_PEERS;
3+
use apollo_consensus_orchestrator::metrics::CENDE_WRITE_BLOB_FAILURE;
4+
5+
use crate::alerts::{
6+
Alert,
7+
AlertComparisonOp,
8+
AlertCondition,
9+
AlertEnvFiltering,
10+
AlertGroup,
11+
AlertLogicalOp,
12+
AlertSeverity,
13+
};
14+
15+
/// Pending duration passed to `Alert::new` by the consensus alerts defined in this file.
const PENDING_DURATION_DEFAULT: &str = "30s";
16+
/// Evaluation interval (in seconds, going by the name) passed to `Alert::new` by the consensus
/// alerts defined in this file.
const EVALUATION_INTERVAL_SEC_DEFAULT: u64 = 30;
17+
18+
/// The was a round larger than zero in the last hour.
19+
fn get_consensus_round_above_zero(
20+
alert_env_filtering: AlertEnvFiltering,
21+
alert_severity: AlertSeverity,
22+
) -> Alert {
23+
Alert::new(
24+
"consensus_round_above_zero",
25+
"Consensus round above zero",
26+
AlertGroup::Consensus,
27+
format!("increase({}[1h])", CONSENSUS_ROUND_ABOVE_ZERO.get_name_with_filter()),
28+
vec![AlertCondition {
29+
comparison_op: AlertComparisonOp::GreaterThan,
30+
comparison_value: 0.0,
31+
logical_op: AlertLogicalOp::And,
32+
}],
33+
PENDING_DURATION_DEFAULT,
34+
EVALUATION_INTERVAL_SEC_DEFAULT,
35+
alert_severity,
36+
alert_env_filtering,
37+
)
38+
}
39+
40+
pub(crate) fn get_consensus_round_above_zero_vec() -> Vec<Alert> {
41+
vec![
42+
get_consensus_round_above_zero(
43+
AlertEnvFiltering::MainnetStyleAlerts,
44+
AlertSeverity::DayOnly,
45+
),
46+
get_consensus_round_above_zero(
47+
AlertEnvFiltering::TestnetStyleAlerts,
48+
AlertSeverity::WorkingHours,
49+
),
50+
]
51+
}
52+
53+
/// There were 5 times in the last 30 minutes that the round was larger than zero.
54+
fn get_consensus_round_above_zero_multiple_times(
55+
alert_env_filtering: AlertEnvFiltering,
56+
alert_severity: AlertSeverity,
57+
) -> Alert {
58+
Alert::new(
59+
"consensus_round_above_zero_multiple_times",
60+
"Consensus round above zero multiple times",
61+
AlertGroup::Consensus,
62+
format!("increase({}[30m])", CONSENSUS_ROUND_ABOVE_ZERO.get_name_with_filter()),
63+
vec![AlertCondition {
64+
comparison_op: AlertComparisonOp::GreaterThan,
65+
comparison_value: 5.0,
66+
logical_op: AlertLogicalOp::And,
67+
}],
68+
PENDING_DURATION_DEFAULT,
69+
EVALUATION_INTERVAL_SEC_DEFAULT,
70+
alert_severity,
71+
alert_env_filtering,
72+
)
73+
}
74+
75+
pub(crate) fn get_consensus_round_above_zero_multiple_times_vec() -> Vec<Alert> {
76+
vec![
77+
get_consensus_round_above_zero_multiple_times(
78+
AlertEnvFiltering::MainnetStyleAlerts,
79+
AlertSeverity::Sos,
80+
),
81+
get_consensus_round_above_zero_multiple_times(
82+
AlertEnvFiltering::TestnetStyleAlerts,
83+
AlertSeverity::WorkingHours,
84+
),
85+
]
86+
}
87+
88+
fn get_cende_write_blob_failure_alert(
89+
alert_env_filtering: AlertEnvFiltering,
90+
alert_severity: AlertSeverity,
91+
) -> Alert {
92+
Alert::new(
93+
"cende_write_blob_failure",
94+
"Cende write blob failure",
95+
AlertGroup::Consensus,
96+
format!("increase({}[1h])", CENDE_WRITE_BLOB_FAILURE.get_name_with_filter()),
97+
vec![AlertCondition {
98+
comparison_op: AlertComparisonOp::GreaterThan,
99+
comparison_value: 10.0,
100+
logical_op: AlertLogicalOp::And,
101+
}],
102+
PENDING_DURATION_DEFAULT,
103+
EVALUATION_INTERVAL_SEC_DEFAULT,
104+
alert_severity,
105+
alert_env_filtering,
106+
)
107+
}
108+
109+
pub(crate) fn get_cende_write_blob_failure_alert_vec() -> Vec<Alert> {
110+
vec![
111+
get_cende_write_blob_failure_alert(
112+
AlertEnvFiltering::MainnetStyleAlerts,
113+
AlertSeverity::DayOnly,
114+
),
115+
get_cende_write_blob_failure_alert(
116+
AlertEnvFiltering::TestnetStyleAlerts,
117+
AlertSeverity::WorkingHours,
118+
),
119+
]
120+
}
121+
122+
fn get_consensus_p2p_peer_down(
123+
alert_env_filtering: AlertEnvFiltering,
124+
alert_severity: AlertSeverity,
125+
) -> Alert {
126+
Alert::new(
127+
"consensus_p2p_peer_down",
128+
"Consensus p2p peer down",
129+
AlertGroup::Consensus,
130+
format!("max_over_time({}[2m])", CONSENSUS_NUM_CONNECTED_PEERS.get_name_with_filter()),
131+
vec![AlertCondition {
132+
comparison_op: AlertComparisonOp::LessThan,
133+
// TODO(shahak): find a way to make this depend on num_validators
134+
comparison_value: 2.0,
135+
logical_op: AlertLogicalOp::And,
136+
}],
137+
PENDING_DURATION_DEFAULT,
138+
EVALUATION_INTERVAL_SEC_DEFAULT,
139+
alert_severity,
140+
alert_env_filtering,
141+
)
142+
}
143+
144+
pub(crate) fn get_consensus_p2p_peer_down_vec() -> Vec<Alert> {
145+
vec![
146+
get_consensus_p2p_peer_down(AlertEnvFiltering::MainnetStyleAlerts, AlertSeverity::Sos),
147+
get_consensus_p2p_peer_down(
148+
AlertEnvFiltering::TestnetStyleAlerts,
149+
AlertSeverity::WorkingHours,
150+
),
151+
]
152+
}
153+
154+
pub(crate) fn get_cende_write_blob_failure_once_alert() -> Alert {
155+
Alert::new(
156+
"cende_write_blob_failure_once",
157+
"Cende write blob failure once",
158+
AlertGroup::Consensus,
159+
format!("increase({}[1h])", CENDE_WRITE_BLOB_FAILURE.get_name_with_filter()),
160+
vec![AlertCondition {
161+
comparison_op: AlertComparisonOp::GreaterThan,
162+
comparison_value: 0.0,
163+
logical_op: AlertLogicalOp::And,
164+
}],
165+
PENDING_DURATION_DEFAULT,
166+
EVALUATION_INTERVAL_SEC_DEFAULT,
167+
AlertSeverity::Informational,
168+
AlertEnvFiltering::All,
169+
)
170+
}

0 commit comments

Comments
 (0)