|
| 1 | +use apollo_consensus::metrics::CONSENSUS_ROUND_ABOVE_ZERO; |
| 2 | +use apollo_consensus_manager::metrics::CONSENSUS_NUM_CONNECTED_PEERS; |
| 3 | +use apollo_consensus_orchestrator::metrics::CENDE_WRITE_BLOB_FAILURE; |
| 4 | + |
| 5 | +use crate::alerts::{ |
| 6 | + Alert, |
| 7 | + AlertComparisonOp, |
| 8 | + AlertCondition, |
| 9 | + AlertEnvFiltering, |
| 10 | + AlertGroup, |
| 11 | + AlertLogicalOp, |
| 12 | + AlertSeverity, |
| 13 | +}; |
| 14 | + |
/// Default pending-duration string passed to `Alert::new` by every alert defined in this file.
// NOTE(review): presumably the time a condition must hold before the alert fires — confirm
// against `Alert::new`.
const PENDING_DURATION_DEFAULT: &str = "30s";
/// Default evaluation interval, in seconds (per the `_SEC` suffix), passed to `Alert::new` by
/// every alert defined in this file.
const EVALUATION_INTERVAL_SEC_DEFAULT: u64 = 30;
| 17 | + |
| 18 | +/// The was a round larger than zero in the last hour. |
| 19 | +fn get_consensus_round_above_zero( |
| 20 | + alert_env_filtering: AlertEnvFiltering, |
| 21 | + alert_severity: AlertSeverity, |
| 22 | +) -> Alert { |
| 23 | + Alert::new( |
| 24 | + "consensus_round_above_zero", |
| 25 | + "Consensus round above zero", |
| 26 | + AlertGroup::Consensus, |
| 27 | + format!("increase({}[1h])", CONSENSUS_ROUND_ABOVE_ZERO.get_name_with_filter()), |
| 28 | + vec![AlertCondition { |
| 29 | + comparison_op: AlertComparisonOp::GreaterThan, |
| 30 | + comparison_value: 0.0, |
| 31 | + logical_op: AlertLogicalOp::And, |
| 32 | + }], |
| 33 | + PENDING_DURATION_DEFAULT, |
| 34 | + EVALUATION_INTERVAL_SEC_DEFAULT, |
| 35 | + alert_severity, |
| 36 | + alert_env_filtering, |
| 37 | + ) |
| 38 | +} |
| 39 | + |
| 40 | +pub(crate) fn get_consensus_round_above_zero_vec() -> Vec<Alert> { |
| 41 | + vec![ |
| 42 | + get_consensus_round_above_zero( |
| 43 | + AlertEnvFiltering::MainnetStyleAlerts, |
| 44 | + AlertSeverity::DayOnly, |
| 45 | + ), |
| 46 | + get_consensus_round_above_zero( |
| 47 | + AlertEnvFiltering::TestnetStyleAlerts, |
| 48 | + AlertSeverity::WorkingHours, |
| 49 | + ), |
| 50 | + ] |
| 51 | +} |
| 52 | + |
| 53 | +/// There were 5 times in the last 30 minutes that the round was larger than zero. |
| 54 | +fn get_consensus_round_above_zero_multiple_times( |
| 55 | + alert_env_filtering: AlertEnvFiltering, |
| 56 | + alert_severity: AlertSeverity, |
| 57 | +) -> Alert { |
| 58 | + Alert::new( |
| 59 | + "consensus_round_above_zero_multiple_times", |
| 60 | + "Consensus round above zero multiple times", |
| 61 | + AlertGroup::Consensus, |
| 62 | + format!("increase({}[30m])", CONSENSUS_ROUND_ABOVE_ZERO.get_name_with_filter()), |
| 63 | + vec![AlertCondition { |
| 64 | + comparison_op: AlertComparisonOp::GreaterThan, |
| 65 | + comparison_value: 5.0, |
| 66 | + logical_op: AlertLogicalOp::And, |
| 67 | + }], |
| 68 | + PENDING_DURATION_DEFAULT, |
| 69 | + EVALUATION_INTERVAL_SEC_DEFAULT, |
| 70 | + alert_severity, |
| 71 | + alert_env_filtering, |
| 72 | + ) |
| 73 | +} |
| 74 | + |
| 75 | +pub(crate) fn get_consensus_round_above_zero_multiple_times_vec() -> Vec<Alert> { |
| 76 | + vec![ |
| 77 | + get_consensus_round_above_zero_multiple_times( |
| 78 | + AlertEnvFiltering::MainnetStyleAlerts, |
| 79 | + AlertSeverity::Sos, |
| 80 | + ), |
| 81 | + get_consensus_round_above_zero_multiple_times( |
| 82 | + AlertEnvFiltering::TestnetStyleAlerts, |
| 83 | + AlertSeverity::WorkingHours, |
| 84 | + ), |
| 85 | + ] |
| 86 | +} |
| 87 | + |
| 88 | +fn get_cende_write_blob_failure_alert( |
| 89 | + alert_env_filtering: AlertEnvFiltering, |
| 90 | + alert_severity: AlertSeverity, |
| 91 | +) -> Alert { |
| 92 | + Alert::new( |
| 93 | + "cende_write_blob_failure", |
| 94 | + "Cende write blob failure", |
| 95 | + AlertGroup::Consensus, |
| 96 | + format!("increase({}[1h])", CENDE_WRITE_BLOB_FAILURE.get_name_with_filter()), |
| 97 | + vec![AlertCondition { |
| 98 | + comparison_op: AlertComparisonOp::GreaterThan, |
| 99 | + comparison_value: 10.0, |
| 100 | + logical_op: AlertLogicalOp::And, |
| 101 | + }], |
| 102 | + PENDING_DURATION_DEFAULT, |
| 103 | + EVALUATION_INTERVAL_SEC_DEFAULT, |
| 104 | + alert_severity, |
| 105 | + alert_env_filtering, |
| 106 | + ) |
| 107 | +} |
| 108 | + |
| 109 | +pub(crate) fn get_cende_write_blob_failure_alert_vec() -> Vec<Alert> { |
| 110 | + vec![ |
| 111 | + get_cende_write_blob_failure_alert( |
| 112 | + AlertEnvFiltering::MainnetStyleAlerts, |
| 113 | + AlertSeverity::DayOnly, |
| 114 | + ), |
| 115 | + get_cende_write_blob_failure_alert( |
| 116 | + AlertEnvFiltering::TestnetStyleAlerts, |
| 117 | + AlertSeverity::WorkingHours, |
| 118 | + ), |
| 119 | + ] |
| 120 | +} |
| 121 | + |
| 122 | +fn get_consensus_p2p_peer_down( |
| 123 | + alert_env_filtering: AlertEnvFiltering, |
| 124 | + alert_severity: AlertSeverity, |
| 125 | +) -> Alert { |
| 126 | + Alert::new( |
| 127 | + "consensus_p2p_peer_down", |
| 128 | + "Consensus p2p peer down", |
| 129 | + AlertGroup::Consensus, |
| 130 | + format!("max_over_time({}[2m])", CONSENSUS_NUM_CONNECTED_PEERS.get_name_with_filter()), |
| 131 | + vec![AlertCondition { |
| 132 | + comparison_op: AlertComparisonOp::LessThan, |
| 133 | + // TODO(shahak): find a way to make this depend on num_validators |
| 134 | + comparison_value: 2.0, |
| 135 | + logical_op: AlertLogicalOp::And, |
| 136 | + }], |
| 137 | + PENDING_DURATION_DEFAULT, |
| 138 | + EVALUATION_INTERVAL_SEC_DEFAULT, |
| 139 | + alert_severity, |
| 140 | + alert_env_filtering, |
| 141 | + ) |
| 142 | +} |
| 143 | + |
| 144 | +pub(crate) fn get_consensus_p2p_peer_down_vec() -> Vec<Alert> { |
| 145 | + vec![ |
| 146 | + get_consensus_p2p_peer_down(AlertEnvFiltering::MainnetStyleAlerts, AlertSeverity::Sos), |
| 147 | + get_consensus_p2p_peer_down( |
| 148 | + AlertEnvFiltering::TestnetStyleAlerts, |
| 149 | + AlertSeverity::WorkingHours, |
| 150 | + ), |
| 151 | + ] |
| 152 | +} |
| 153 | + |
| 154 | +pub(crate) fn get_cende_write_blob_failure_once_alert() -> Alert { |
| 155 | + Alert::new( |
| 156 | + "cende_write_blob_failure_once", |
| 157 | + "Cende write blob failure once", |
| 158 | + AlertGroup::Consensus, |
| 159 | + format!("increase({}[1h])", CENDE_WRITE_BLOB_FAILURE.get_name_with_filter()), |
| 160 | + vec![AlertCondition { |
| 161 | + comparison_op: AlertComparisonOp::GreaterThan, |
| 162 | + comparison_value: 0.0, |
| 163 | + logical_op: AlertLogicalOp::And, |
| 164 | + }], |
| 165 | + PENDING_DURATION_DEFAULT, |
| 166 | + EVALUATION_INTERVAL_SEC_DEFAULT, |
| 167 | + AlertSeverity::Informational, |
| 168 | + AlertEnvFiltering::All, |
| 169 | + ) |
| 170 | +} |
0 commit comments