diff --git a/charms/argo-controller/src/grafana_dashboards/basic.json.tmpl b/charms/argo-controller/src/grafana_dashboards/basic.json.tmpl index 8694d04..36a1492 100644 --- a/charms/argo-controller/src/grafana_dashboards/basic.json.tmpl +++ b/charms/argo-controller/src/grafana_dashboards/basic.json.tmpl @@ -469,7 +469,7 @@ "datasource": "${prometheusds}", "editorMode": "builder", "exemplar": false, - "expr": "argo_pod_missing{juju_application=~\"$juju_application\",juju_model=~\"$juju_model\",juju_model_uuid=~\"$juju_model_uuid\",juju_unit=~\"$juju_unit\"}", + "expr": "sum(increase(argo_workflows_pod_missing_total{juju_application=~\"$juju_application\",juju_model=~\"$juju_model\",juju_model_uuid=~\"$juju_model_uuid\",juju_unit=~\"$juju_unit\"}[10m])) by (node_phase)", "format": "time_series", "instant": false, "interval": "", @@ -478,7 +478,7 @@ "refId": "A" } ], - "title": "Workflows missing Pods", + "title": "Workflows missing Pods in the past 10 minutes", "transparent": true, "type": "gauge" }, diff --git a/charms/argo-controller/src/prometheus_alert_rules/missing_pods.rule b/charms/argo-controller/src/prometheus_alert_rules/missing_pods.rule index 61a142d..32ed35d 100644 --- a/charms/argo-controller/src/prometheus_alert_rules/missing_pods.rule +++ b/charms/argo-controller/src/prometheus_alert_rules/missing_pods.rule @@ -1,11 +1,11 @@ alert: ArgoWorkflowPodsMissing -expr: max_over_time(argo_pod_missing[5m]) > 0 +expr: sum without (node_phase, recently_started) (increase(argo_workflows_pod_missing_total[10m])) > 0 for: 5m labels: severity: critical annotations: summary: "Missing workflow pods detected" description: > - Detected missing workflow pods in the last 5 minutes. - Missing pods are expected pods that never appeared or were deleted. - See https://argo-workflows.readthedocs.io/en/release-3.7/metrics/#argo_pod_missing for details. + The number of missing workflow pods has increased in the last 10 minutes. 
+ This indicates that expected pods never appeared or were deleted unexpectedly. + See https://argo-workflows.readthedocs.io/en/release-3.7/metrics/#pod_missing for details.