
Commit 9f9fe74

metrics: Add sample alert rules (#912)
1 parent c300d26 commit 9f9fe74

File tree: site-src/guides/metrics.md, tools/alerts/alert.yaml

2 files changed: +104, -1 lines


site-src/guides/metrics.md

Lines changed: 66 additions & 1 deletion
@@ -93,4 +93,69 @@ TOKEN=$(kubectl -n default get secret inference-gateway-sa-metrics-reader-secret
kubectl -n default port-forward inference-gateway-ext-proc-pod-name 9090
curl -H "Authorization: Bearer $TOKEN" localhost:9090/metrics
```

## Prometheus Alerts

This section describes how to configure Prometheus alerts using the collected metrics.

### Configure alerts

You can follow this [blog post](https://grafana.com/blog/2020/02/25/step-by-step-guide-to-setting-up-prometheus-alertmanager-with-slack-pagerduty-and-gmail/) for instructions on setting up alerts in your monitoring stack with Prometheus.

A template alert rule file is available at [alert.yaml](../../tools/alerts/alert.yaml). You can modify these rules and append them to your existing Prometheus deployment.
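
For example, here is a minimal sketch of a Prometheus configuration fragment that loads these rules and points at an Alertmanager instance. The rule file path and the Alertmanager target below are assumptions; adjust them to match your deployment:

```
# Load the sample alert rules; the path is an assumed install location.
rule_files:
  - /etc/prometheus/alert.yaml

# Send firing alerts to an Alertmanager; the target address is an assumption.
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['alertmanager:9093']
```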

#### High Inference Request Latency P99

```
alert: HighInferenceRequestLatencyP99
expr: histogram_quantile(0.99, rate(inference_model_request_duration_seconds_bucket[5m])) > 10.0 # Adjust threshold as needed (e.g., 10.0 seconds)
for: 5m
annotations:
  title: 'High latency (P99) for model {{ $labels.model_name }}'
  description: 'The 99th percentile request duration for model {{ $labels.model_name }} and target model {{ $labels.target_model_name }} has been consistently above 10.0 seconds for 5 minutes.'
labels:
  severity: 'warning'
```

#### High Inference Error Rate

```
alert: HighInferenceErrorRate
expr: sum by (model_name) (rate(inference_model_request_error_total[5m])) / sum by (model_name) (rate(inference_model_request_total[5m])) > 0.05 # Adjust threshold as needed (e.g., 5% error rate)
for: 5m
annotations:
  title: 'High error rate for model {{ $labels.model_name }}'
  description: 'The error rate for model {{ $labels.model_name }} and target model {{ $labels.target_model_name }} has been consistently above 5% for 5 minutes.'
labels:
  severity: 'critical'
  impact: 'availability'
```

#### High Inference Pool Average Queue Size

```
alert: HighInferencePoolAvgQueueSize
expr: inference_pool_average_queue_size > 50 # Adjust threshold based on expected queue size
for: 5m
annotations:
  title: 'High average queue size for inference pool {{ $labels.name }}'
  description: 'The average number of requests pending in the queue for inference pool {{ $labels.name }} has been consistently above 50 for 5 minutes.'
labels:
  severity: 'critical'
  impact: 'performance'
```

#### High Inference Pool Average KV Cache Utilization

```
alert: HighInferencePoolAvgKVCacheUtilization
expr: inference_pool_average_kv_cache_utilization > 0.9 # 90% utilization
for: 5m
annotations:
  title: 'High KV cache utilization for inference pool {{ $labels.name }}'
  description: 'The average KV cache utilization for inference pool {{ $labels.name }} has been consistently above 90% for 5 minutes, indicating potential resource exhaustion.'
labels:
  severity: 'critical'
  impact: 'resource_exhaustion'
```
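
Because each rule attaches a `severity` label (and, where relevant, an `impact` label), you can route notifications on them in Alertmanager. Below is a minimal routing sketch; the receiver names are hypothetical placeholders for whatever Slack, PagerDuty, or email integrations you configure:

```
# Route alerts by the severity label set in the rules above.
# Receiver names here are hypothetical placeholders.
route:
  receiver: 'default-notifications'
  routes:
    - matchers:
        - severity="critical"
      receiver: 'oncall-pager'

receivers:
  - name: 'default-notifications'
  - name: 'oncall-pager'
```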

tools/alerts/alert.yaml

Lines changed: 38 additions & 0 deletions
@@ -0,0 +1,38 @@
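# Sample Prometheus alert rules for the gateway-api-inference-extension metrics.
# Thresholds below are illustrative defaults; adjust them for your workload.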
groups:
- name: gateway-api-inference-extension
  rules:
  - alert: HighInferenceRequestLatencyP99
    annotations:
      title: 'High latency (P99) for model {{ $labels.model_name }}'
      description: 'The 99th percentile request duration for model {{ $labels.model_name }} and target model {{ $labels.target_model_name }} has been consistently above 10.0 seconds for 5 minutes.'
    expr: histogram_quantile(0.99, rate(inference_model_request_duration_seconds_bucket[5m])) > 10.0
    for: 5m
    labels:
      severity: 'warning'
  - alert: HighInferenceErrorRate
    annotations:
      title: 'High error rate for model {{ $labels.model_name }}'
      description: 'The error rate for model {{ $labels.model_name }} and target model {{ $labels.target_model_name }} has been consistently above 5% for 5 minutes.'
    expr: sum by (model_name) (rate(inference_model_request_error_total[5m])) / sum by (model_name) (rate(inference_model_request_total[5m])) > 0.05
    for: 5m
    labels:
      severity: 'critical'
      impact: 'availability'
  - alert: HighInferencePoolAvgQueueSize
    annotations:
      title: 'High average queue size for inference pool {{ $labels.name }}'
      description: 'The average number of requests pending in the queue for inference pool {{ $labels.name }} has been consistently above 50 for 5 minutes.'
    expr: inference_pool_average_queue_size > 50
    for: 5m
    labels:
      severity: 'critical'
      impact: 'performance'
  - alert: HighInferencePoolAvgKVCacheUtilization
    annotations:
      title: 'High KV cache utilization for inference pool {{ $labels.name }}'
      description: 'The average KV cache utilization for inference pool {{ $labels.name }} has been consistently above 90% for 5 minutes, indicating potential resource exhaustion.'
    expr: inference_pool_average_kv_cache_utilization > 0.9
    for: 5m
    labels:
      severity: 'critical'
      impact: 'resource_exhaustion'
