17
17
*/
18
18
19
19
use std:: {
20
+ ops:: RangeTo ,
20
21
sync:: { Arc , Mutex } ,
21
22
time:: Duration ,
22
23
} ;
@@ -26,6 +27,11 @@ use serde::{Deserialize, Serialize};
26
27
27
28
use super :: { AlertState , CallableTarget , Context } ;
28
29
30
/// Retry policy for the background task that re-sends an alert while it keeps firing.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Retry {
    /// Keep re-sending with no upper bound on the number of attempts.
    Infinity,
    /// Re-send at most `end` times, where `end` is the upper bound of the stored range.
    Range(RangeTo<usize>),
}
34
+
29
35
#[ derive( Debug , Serialize , Deserialize ) ]
30
36
#[ serde( rename_all = "camelCase" ) ]
31
37
#[ serde( try_from = "TargetVerifier" ) ]
@@ -45,14 +51,18 @@ impl Target {
45
51
match resolves {
46
52
AlertState :: SetToFiring => {
47
53
state. alert_state = AlertState :: Firing ;
48
-
49
54
if !state. timed_out {
50
55
// set state
51
56
state. timed_out = true ;
52
57
state. awaiting_resolve = true ;
53
58
drop ( state) ;
54
59
55
- self . spawn_timeout_task ( timeout, context. clone ( ) ) ;
60
+ let retry = match self . target {
61
+ TargetType :: AlertManager ( _) => Retry :: Infinity ,
62
+ _ => Retry :: Range ( ..10 ) ,
63
+ } ;
64
+
65
+ self . spawn_timeout_task ( timeout, context. clone ( ) , retry) ;
56
66
call_target ( self . target . clone ( ) , context)
57
67
}
58
68
}
@@ -78,38 +88,53 @@ impl Target {
78
88
}
79
89
}
80
90
81
- fn spawn_timeout_task ( & self , timeout : & Timeout , alert_context : Context ) {
91
+ fn spawn_timeout_task ( & self , timeout : & Timeout , alert_context : Context , retry : Retry ) {
82
92
let state = Arc :: clone ( & timeout. state ) ;
83
93
let timeout = timeout. timeout ;
84
94
let target = self . target . clone ( ) ;
85
95
86
- actix_web:: rt:: spawn ( async move {
87
- const RETRIES : usize = 10 ;
88
- // sleep for timeout period
89
- for _ in 0 ..RETRIES {
96
+ let sleep_and_check_if_call = move |timeout_state : Arc < Mutex < TimeoutState > > | {
97
+ async move {
90
98
tokio:: time:: sleep ( timeout) . await ;
91
- let mut state = state . lock ( ) . unwrap ( ) ;
99
+ let mut state = timeout_state . lock ( ) . unwrap ( ) ;
92
100
if state. alert_state == AlertState :: Firing {
93
101
// it is still firing .. sleep more and come back
94
102
state. awaiting_resolve = true ;
95
-
96
- call_target ( target. clone ( ) , alert_context. clone ( ) )
103
+ true
97
104
} else {
98
105
state. timed_out = false ;
99
- return ;
106
+ false
100
107
}
101
108
}
109
+ } ;
102
110
103
- // fallback for if this task only observed FIRING on all RETRIES
104
- // Stream might be dead and sending too many alerts is not great
105
- // Send and alert stating that this alert will only work once it has seen a RESOLVE
106
- state. lock ( ) . unwrap ( ) . timed_out = false ;
107
- let mut context = alert_context;
108
- context. message = format ! (
109
- "Triggering alert did not resolve itself after {RETRIES} retries, This alert is paused until it resolves" ,
110
- ) ;
111
- // Send and exit this task.
112
- call_target ( target, context) ;
111
+ actix_web:: rt:: spawn ( async move {
112
+ match retry {
113
+ Retry :: Infinity => loop {
114
+ let should_call = sleep_and_check_if_call ( Arc :: clone ( & state) ) . await ;
115
+ if should_call {
116
+ call_target ( target. clone ( ) , alert_context. clone ( ) )
117
+ }
118
+ } ,
119
+ Retry :: Range ( range) => {
120
+ for _ in 0 ..range. end {
121
+ let should_call = sleep_and_check_if_call ( Arc :: clone ( & state) ) . await ;
122
+ if should_call {
123
+ call_target ( target. clone ( ) , alert_context. clone ( ) )
124
+ }
125
+ }
126
+ // fallback for if this task only observed FIRING on all RETRIES
127
+ // Stream might be dead and sending too many alerts is not great
128
+ // Send and alert stating that this alert will only work once it has seen a RESOLVE
129
+ state. lock ( ) . unwrap ( ) . timed_out = false ;
130
+ let mut context = alert_context;
131
+ context. message = format ! (
132
+ "Triggering alert did not resolve itself after {} retries, This alert is paused until it resolves" ,
133
+ range. end) ;
134
+ // Send and exit this task.
135
+ call_target ( target, context) ;
136
+ }
137
+ }
113
138
} ) ;
114
139
}
115
140
}
@@ -156,13 +181,15 @@ pub enum TargetType {
156
181
Slack ( SlackWebHook ) ,
157
182
#[ serde( rename = "webhook" ) ]
158
183
Other ( OtherWebHook ) ,
184
+ AlertManager ( AlertManager ) ,
159
185
}
160
186
161
187
impl TargetType {
162
188
pub fn call ( & self , payload : & Context ) {
163
189
match self {
164
190
TargetType :: Slack ( target) => target. call ( payload) ,
165
191
TargetType :: Other ( target) => target. call ( payload) ,
192
+ TargetType :: AlertManager ( target) => target. call ( payload) ,
166
193
}
167
194
}
168
195
}
@@ -226,6 +253,48 @@ impl CallableTarget for OtherWebHook {
226
253
}
227
254
}
228
255
256
+ #[ derive( Debug , Clone , PartialEq , Eq , Serialize , Deserialize ) ]
257
+ pub struct AlertManager {
258
+ url : String ,
259
+ }
260
+
261
+ impl CallableTarget for AlertManager {
262
+ fn call ( & self , payload : & Context ) {
263
+ let alert = match payload. alert_state {
264
+ AlertState :: SetToFiring => ureq:: json!( [ {
265
+ "labels" : {
266
+ "status" : "firing" ,
267
+ "alertname" : payload. alert_name,
268
+ "streamname" : payload. stream
269
+ } ,
270
+ "annotations" : {
271
+ "message" : payload. message,
272
+ "reason" : payload. reason
273
+ }
274
+ } ] ) ,
275
+ AlertState :: Resolved => ureq:: json!( [ {
276
+ "labels" : {
277
+ "status" : "resolved" ,
278
+ "alertname" : payload. alert_name,
279
+ "streamname" : payload. stream
280
+ } ,
281
+ "annotations" : {
282
+ "message" : payload. message,
283
+ "reason" : payload. reason
284
+ }
285
+ } ] ) ,
286
+ _ => unreachable ! ( ) ,
287
+ } ;
288
+
289
+ if let Err ( e) = ureq:: post ( & self . url )
290
+ . set ( "Content-Type" , "application/json" )
291
+ . send_json ( alert)
292
+ {
293
+ log:: error!( "Couldn't make call to alertmanager, error: {}" , e)
294
+ }
295
+ }
296
+ }
297
+
229
298
#[ derive( Debug , Serialize , Deserialize ) ]
230
299
pub struct Timeout {
231
300
#[ serde( with = "humantime_serde" ) ]
0 commit comments