@@ -94,12 +94,13 @@ function(VARS_2I2C_AWS_ACCOUNT_ID=null)
9494 };
9595
9696 local makePodRestartAlert = function (
97+ pod_name,
9798 summary,
9899 pod_name_regex,
99100 severity,
100101 labels={}
101102 ) {
102- alert: pod_name_substring + ' pod has restarted' ,
103+ alert: pod_name + ' pod has restarted' ,
103104 expr: |||
104105 # Count total container restarts with pod name containing 'pod_name_substring'.
105106 # We sum by pod name (which resets after restart) and namespace, so we don't get all
@@ -290,26 +291,31 @@ function(VARS_2I2C_AWS_ACCOUNT_ID=null)
290291 name: 'Important Pod Restart' ,
291292 rules: [
292293 makePodRestartAlert(
294+ 'jupyterhub-cost-monitoring' ,
293295 'jupyterhub-cost-monitoring pod has restarted on %s:{{ $labels.namespace }}' % [cluster_name],
294296 '.*cost-monitoring.*' ,
295297 'action needed this week'
296298 ),
297299 makePodRestartAlert(
300+ 'jupyterhub-groups-exporter' ,
298301 'jupyterhub-groups-exporter pod has restarted on %s:{{ $labels.namespace }}' % [cluster_name],
299302 '.*groups-exporter.*' ,
300303 'action needed this week'
301304 ),
302305 makePodRestartAlert(
306+ 'jupyterhub-home-nfs' ,
303307 'jupyterhub-home-nfs pod has restarted on %s:{{ $labels.namespace }}' % [cluster_name],
304308 '^storage-quota-home-nfs.*' ,
305309 'same day action needed'
306310 ),
307311 makePodRestartAlert(
312+ 'support-grafana' ,
308313 'support-grafana pod has restarted on %s:{{ $labels.namespace }}' % [cluster_name],
309314 '^support-grafana.*' ,
310315 'action needed this week'
311316 ),
312317 makePodRestartAlert(
318+ 'proxy' ,
313319 'proxy pod has restarted on %s:{{ $labels.namespace }}' % [cluster_name],
314320 '^proxy.*' ,
315321 'immediate action needed'
0 commit comments