diff --git a/Makefile b/Makefile index 4498b285..2b1efa57 100644 --- a/Makefile +++ b/Makefile @@ -16,6 +16,7 @@ _run: -f tools/make/pre-commit.mk \ -f tools/make/docker.mk \ -f tools/make/kube.mk \ + -f tools/make/observability.mk \ $(MAKECMDGOALS) .PHONY: _run diff --git a/config/grafana/dashboards.yaml b/config/grafana/dashboards.yaml deleted file mode 100644 index f34ddeef..00000000 --- a/config/grafana/dashboards.yaml +++ /dev/null @@ -1,10 +0,0 @@ -apiVersion: 1 -providers: - - name: LLM Router Dashboards - orgId: 1 - folder: "LLM Router" - type: file - disableDeletion: false - allowUiUpdates: true - options: - path: /etc/grafana/provisioning/dashboards \ No newline at end of file diff --git a/config/grafana/datasource.yaml b/config/grafana/datasource.yaml deleted file mode 100644 index 8d9f9d8f..00000000 --- a/config/grafana/datasource.yaml +++ /dev/null @@ -1,7 +0,0 @@ -apiVersion: 1 -datasources: - - name: Prometheus - type: prometheus - access: proxy - url: http://prometheus:9090 - isDefault: true \ No newline at end of file diff --git a/config/prometheus.yaml b/config/prometheus.yaml deleted file mode 100644 index f9a7ac37..00000000 --- a/config/prometheus.yaml +++ /dev/null @@ -1,22 +0,0 @@ -global: - scrape_interval: 10s - evaluation_interval: 10s - -scrape_configs: - # Semantic Router - - job_name: semantic-router - metrics_path: /metrics - static_configs: - - targets: ["semantic-router:9190"] - labels: - service: semantic-router - env: dev - - # Optional: Envoy - - job_name: envoy - metrics_path: /stats/prometheus - static_configs: - - targets: ["envoy-proxy:19000"] - labels: - service: envoy - env: dev \ No newline at end of file diff --git a/docker-compose.obs.yml b/docker-compose.obs.yml new file mode 100644 index 00000000..35bde379 --- /dev/null +++ b/docker-compose.obs.yml @@ -0,0 +1,42 @@ +# Local observability stack for monitoring semantic-router running on host +# +# Usage: make obs-local +# Or: docker compose -f docker-compose.obs.yml up +# +# This provides Prometheus and Grafana in Docker with network_mode: host +# to scrape metrics from router running natively on localhost:9190 + +version: '3.8' + +services: + prometheus: + image: prom/prometheus:v2.53.0 + container_name: prometheus-local + network_mode: host + volumes: + - ./tools/observability/prometheus.yaml:/etc/prometheus/prometheus.yaml:ro + - prometheus-local-data:/prometheus + command: + - '--config.file=/etc/prometheus/prometheus.yaml' + - '--storage.tsdb.path=/prometheus' + - '--storage.tsdb.retention.time=15d' + environment: + - ROUTER_TARGET=localhost:9190 + + grafana: + image: grafana/grafana:11.5.1 + container_name: grafana-local + network_mode: host + volumes: + - ./tools/observability/grafana-datasource.yaml:/etc/grafana/provisioning/datasources/datasource.yaml:ro + - ./tools/observability/grafana-dashboard.yaml:/etc/grafana/provisioning/dashboards/dashboard.yaml:ro + - ./tools/observability/llm-router-dashboard.json:/etc/grafana/provisioning/dashboards/llm-router-dashboard.json:ro + - grafana-local-data:/var/lib/grafana + environment: + - PROMETHEUS_URL=localhost:9090 + - GF_SECURITY_ADMIN_PASSWORD=admin + - GF_USERS_ALLOW_SIGN_UP=false + +volumes: + prometheus-local-data: + grafana-local-data: diff --git a/docker-compose.yml b/docker-compose.yml index 21f193ed..8b6990cc 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -69,10 +69,13 @@ services: image: prom/prometheus:v2.53.0 container_name: prometheus volumes: - - ./config/prometheus.yaml:/etc/prometheus/prometheus.yaml:ro + - 
./tools/observability/prometheus.yaml:/etc/prometheus/prometheus.yaml:ro + - prometheus-data:/prometheus command: - --config.file=/etc/prometheus/prometheus.yaml - --storage.tsdb.retention.time=15d + environment: + - ROUTER_TARGET=semantic-router:9190 ports: - "9090:9090" networks: @@ -84,14 +87,18 @@ services: environment: - GF_SECURITY_ADMIN_USER=admin - GF_SECURITY_ADMIN_PASSWORD=admin + - PROMETHEUS_URL=prometheus:9090 ports: - "3000:3000" volumes: - - ./config/grafana/datasource.yaml:/etc/grafana/provisioning/datasources/datasource.yaml:ro - - ./config/grafana/dashboards.yaml:/etc/grafana/provisioning/dashboards/dashboards.yaml:ro - - ./deploy/llm-router-dashboard.json:/etc/grafana/provisioning/dashboards/llm-router-dashboard.json:ro + - ./tools/observability/grafana-datasource.yaml:/etc/grafana/provisioning/datasources/datasource.yaml:ro + - ./tools/observability/grafana-dashboard.yaml:/etc/grafana/provisioning/dashboards/dashboard.yaml:ro + - ./tools/observability/llm-router-dashboard.json:/etc/grafana/provisioning/dashboards/llm-router-dashboard.json:ro + - grafana-data:/var/lib/grafana networks: - semantic-network + depends_on: + - prometheus # LLM Katan service for testing llm-katan: @@ -118,3 +125,5 @@ networks: volumes: models-cache: driver: local + prometheus-data: + grafana-data: diff --git a/tools/make/common.mk b/tools/make/common.mk index d34f2dbc..08cf2340 100644 --- a/tools/make/common.mk +++ b/tools/make/common.mk @@ -85,6 +85,16 @@ help: @echo " docs-serve - Serve built documentation" @echo " docs-clean - Clean documentation artifacts" @echo "" + @echo " Observability targets:" + @echo " run-observability - Start observability (alias for obs-local)" + @echo " obs-local - Start observability in LOCAL mode" + @echo " obs-compose - Start observability in COMPOSE mode" + @echo " stop-observability - Stop observability stack" + @echo " open-observability - Open Prometheus and Grafana in browser" + @echo " obs-status - Check observability stack status" + @echo " obs-logs - Show observability logs" + @echo " obs-clean - Remove observability data volumes" + @echo "" @echo " Environment variables:" @echo " CONTAINER_RUNTIME - Container runtime (docker|podman, default: docker)" @echo " CONFIG_FILE - Config file path (default: config/config.yaml)" diff --git a/tools/make/observability.mk b/tools/make/observability.mk new file mode 100644 index 00000000..46ee90eb --- /dev/null +++ b/tools/make/observability.mk @@ -0,0 +1,51 @@ +# ====================== observability.mk ====================== +# = Observability targets for semantic-router monitoring = +# ====================== observability.mk ====================== + +# Observability directories +OBS_CONFIG_DIR = tools/observability +OBS_SCRIPTS_DIR = tools/observability/scripts + +.PHONY: run-observability stop-observability obs-local obs-compose open-observability obs-logs obs-status obs-clean + +## run-observability: Start observability stack (alias for obs-local) +run-observability: obs-local + +## obs-local: Start observability in LOCAL mode (router on host, obs in Docker) +obs-local: + @$(call log, Starting observability in LOCAL mode...) + @$(OBS_SCRIPTS_DIR)/start-observability.sh local + +## obs-compose: Start observability in COMPOSE mode (all services in Docker) +obs-compose: + @$(call log, Starting observability in COMPOSE mode...) + @$(OBS_SCRIPTS_DIR)/start-observability.sh compose + +## stop-observability: Stop and remove observability containers +stop-observability: + @$(call log, Stopping observability stack...) 
+ @$(OBS_SCRIPTS_DIR)/stop-observability.sh + +## open-observability: Open Prometheus and Grafana in browser +open-observability: + @echo "Opening Prometheus and Grafana..." + @open http://localhost:9090 2>/dev/null || xdg-open http://localhost:9090 2>/dev/null || echo "Please open http://localhost:9090" + @open http://localhost:3000 2>/dev/null || xdg-open http://localhost:3000 2>/dev/null || echo "Please open http://localhost:3000" + +## obs-logs: Show logs from observability containers +obs-logs: + @docker compose -f docker-compose.obs.yml logs -f 2>/dev/null || docker compose logs prometheus grafana -f + +## obs-status: Check status of observability containers +obs-status: + @echo "==> Local mode:" + @docker compose -f docker-compose.obs.yml ps 2>/dev/null || echo " Not running" + @echo "" + @echo "==> Compose mode:" + @docker compose ps prometheus grafana 2>/dev/null || echo " Not running" + +## obs-clean: Remove observability data volumes +obs-clean: + @echo "⚠️ Removing all observability data volumes..." + @docker volume rm prometheus-local-data grafana-local-data prometheus-data grafana-data 2>/dev/null || true + @echo "✓ Done" diff --git a/tools/observability/README.md b/tools/observability/README.md new file mode 100644 index 00000000..528f7803 --- /dev/null +++ b/tools/observability/README.md @@ -0,0 +1,30 @@ +# Observability Configuration + +Prometheus and Grafana configuration files for monitoring semantic-router. + +## Files + +- `prometheus.yaml` - Prometheus scrape config (uses `$ROUTER_TARGET` env var) +- `grafana-datasource.yaml` - Grafana datasource (uses `$PROMETHEUS_URL` env var) +- `grafana-dashboard.yaml` - Dashboard provisioning config +- `llm-router-dashboard.json` - LLM Router dashboard + +## Usage + +**Local mode** (router on host, observability in Docker): + +```bash +make obs-local +``` + +**Compose mode** (all services in Docker): + +```bash +make obs-compose +# or: docker compose up +``` + +**Access:** + +- Prometheus: http://localhost:9090 +- Grafana: http://localhost:3000 (admin/admin) diff --git a/tools/observability/grafana-dashboard.yaml b/tools/observability/grafana-dashboard.yaml new file mode 100644 index 00000000..9e162ee1 --- /dev/null +++ b/tools/observability/grafana-dashboard.yaml @@ -0,0 +1,15 @@ +# Grafana dashboard provisioning configuration +# This file tells Grafana where to find dashboard JSON files + +apiVersion: 1 + +providers: + - name: "Semantic Router" + orgId: 1 + folder: "" + type: file + disableDeletion: false + updateIntervalSeconds: 10 + allowUiUpdates: true + options: + path: /etc/grafana/provisioning/dashboards diff --git a/tools/observability/grafana-datasource.yaml b/tools/observability/grafana-datasource.yaml new file mode 100644 index 00000000..a96c705e --- /dev/null +++ b/tools/observability/grafana-datasource.yaml @@ -0,0 +1,14 @@ +# Grafana datasource configuration for Prometheus +# This file is provisioned automatically when Grafana starts + +apiVersion: 1 + +datasources: + - name: Prometheus + type: prometheus + access: proxy + url: http://${PROMETHEUS_URL:-localhost:9090} + isDefault: true + editable: true + jsonData: + timeInterval: 15s diff --git a/tools/observability/llm-router-dashboard.json b/tools/observability/llm-router-dashboard.json new file mode 100644 index 00000000..ff136b6e --- /dev/null +++ b/tools/observability/llm-router-dashboard.json @@ -0,0 +1,1238 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": 
true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 18, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 4, + "options": { + "displayMode": "gradient", + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "maxVizHeight": 300, + "minVizHeight": 16, + "minVizWidth": 8, + "namePlacement": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "sizing": "auto", + "valueMode": "color", + "text": { + "valueSize": 24 + } + }, + "pluginVersion": "11.5.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "sum by(category) (llm_category_classifications_count)", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": true, + "legendFormat": "{{category}}", + "range": false, + "refId": "A", + "useBackend": false + } + ], + "title": "Prompt Category", + "type": "bargauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Tokens/sec", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "tps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 2, + "options": { + "legend": { + "calcs": [ + "mean", + "max", + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.5.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum(rate(llm_model_completion_tokens_total[5m])) by (model)", + "legendFormat": "Completion Tokens {{model}}", + "range": true, + "refId": "A" + } + ], + "title": "Token Usage Rate by Model", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + 
"axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Routes/sec", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 3, + "options": { + "legend": { + "calcs": [ + "mean", + "max", + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.5.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum(rate(llm_model_routing_modifications_total[5m])) by (source_model, target_model)", + "format": "time_series", + "legendFormat": "{{source_model}} -> {{target_model}}", + "range": true, + "refId": "A" + } + ], + "title": "Model Routing Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Seconds", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 1, + "options": { + "legend": { + "calcs": [ + "mean", + "max", + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.5.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum(rate(llm_model_completion_latency_seconds_bucket[5m])) by (le, model))", + "legendFormat": "p95 {{model}}", + "range": true, + "refId": "A" + } + ], + "title": "Model Completion Latency (p95)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Seconds", + "axisPlacement": "auto", + "barAlignment": 0, 
+ "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 5, + "options": { + "legend": { + "calcs": [ + "mean", + "max", + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.5.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum(rate(llm_model_ttft_seconds_bucket[5m])) by (le, model))", + "legendFormat": "TTFT p95 {{model}}", + "range": true, + "refId": "A" + } + ], + "title": "TTFT (p95) by Model", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Seconds per token", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 6, + "options": { + "legend": { + "calcs": [ + "mean", + "max", + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.5.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum(rate(llm_model_tpot_seconds_bucket[5m])) by (le, model))", + "legendFormat": "TPOT p95 {{model}}", + "range": true, + "refId": "A" + } + ], + "title": "TPOT (p95) by Model (sec/token)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Requests/sec", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + 
}, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 24 + }, + "id": 7, + "options": { + "legend": { + "calcs": [ + "mean", + "max", + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.5.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum(rate(llm_reasoning_decisions_total{enabled=\"true\"}[5m])) by (model, effort)", + "legendFormat": "Reasoning Enabled: {{model}} ({{effort}})", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum(rate(llm_reasoning_decisions_total{enabled=\"false\"}[5m])) by (model)", + "legendFormat": "Reasoning Disabled: {{model}}", + "range": true, + "refId": "B" + } + ], + "title": "Reasoning Rate by Model", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Cost", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "currencyUSD" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 24 + }, + "id": 8, + "options": { + "legend": { + "calcs": [ + "mean", + "max", + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.5.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum(rate(llm_model_cost_total{currency=\"USD\"}[5m])) by (model)", + "legendFormat": "Cost/sec: {{model}}", + "range": true, + "refId": "A" + } + ], + "title": "Model Cost Rate (USD/sec)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Errors/sec", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + 
"hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 32 + }, + "id": 9, + "options": { + "legend": { + "calcs": [ + "mean", + "max", + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.5.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum(rate(llm_request_errors_total{reason=\"pii_policy_denied\"}[5m])) by (model)", + "legendFormat": "PII Policy Denied: {{model}}", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum(rate(llm_request_errors_total{reason=\"jailbreak_block\"}[5m])) by (model)", + "legendFormat": "Jailbreak Block: {{model}}", + "range": true, + "refId": "B" + } + ], + "title": "Refusal Rates by Model", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 0.01 + }, + { + "color": "red", + "value": 0.05 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 32 + }, + "id": 10, + "options": { + "displayMode": "gradient", + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "maxVizHeight": 300, + "minVizHeight": 16, + "minVizWidth": 8, + "namePlacement": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "sizing": "auto", + "valueMode": "color" + }, + "pluginVersion": "11.5.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum(rate(llm_request_errors_total{reason=~\"pii_policy_denied|jailbreak_block\"}[5m])) by (model) / sum(rate(llm_model_requests_total[5m])) by (model)", + "legendFormat": "{{model}}", + "range": true, + "refId": "A" + } + ], + "title": "Refusal Rate Percentage by Model", + "type": "bargauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "currencyUSD" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 40 + }, + "id": 11, + "options": { + "displayMode": "gradient", + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "maxVizHeight": 300, + "minVizHeight": 16, + "minVizWidth": 
8, + "namePlacement": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "sizing": "auto", + "valueMode": "color" + }, + "pluginVersion": "11.5.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum(llm_model_cost_total{currency=\"USD\"}) by (model)", + "legendFormat": "{{model}}", + "range": true, + "refId": "A" + } + ], + "title": "Total Cost by Model (USD)", + "type": "bargauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Seconds", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 40 + }, + "id": 12, + "options": { + "legend": { + "calcs": [ + "mean", + "max", + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.5.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.50, sum(rate(llm_model_completion_latency_seconds_bucket[5m])) by (le, model))", + "legendFormat": "p50 {{model}}", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.90, sum(rate(llm_model_completion_latency_seconds_bucket[5m])) by (le, model))", + "legendFormat": "p90 {{model}}", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum(rate(llm_model_completion_latency_seconds_bucket[5m])) by (le, model))", + "legendFormat": "p99 {{model}}", + "range": true, + "refId": "C" + } + ], + "title": "Model Completion Latency (p50/p90/p99)", + "type": "timeseries" + } + ], + "preload": false, + "refresh": "10s", + "schemaVersion": 40, + "tags": [ + "llm-router" + ], + "templating": { + "list": [ + { + "current": { + "text": "prometheus", + "value": "prometheus" + }, + "includeAll": false, + "name": "DS_PROMETHEUS", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + } + ] + }, + "time": { + "from": "now-5m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "LLM Router Metrics", + "uid": "llm-router-metrics", + "version": 14, + "weekStart": "" +} \ No newline at end of file diff --git a/tools/observability/prometheus.yaml b/tools/observability/prometheus.yaml new file mode 100644 index 00000000..74172124 --- 
/dev/null +++ b/tools/observability/prometheus.yaml @@ -0,0 +1,33 @@ +# Prometheus configuration for semantic-router observability +# +# This configuration works for both: +# - Local development (router running natively, observability in Docker) +# - Docker Compose (all services in containers) +# +# The target address is controlled by environment variable: +# - Local mode: ROUTER_TARGET=localhost:9190 +# - Compose mode: ROUTER_TARGET=semantic-router:9190 + +global: + scrape_interval: 15s + evaluation_interval: 15s + +scrape_configs: + # Semantic Router metrics + - job_name: semantic-router + metrics_path: /metrics + static_configs: + - targets: ["${ROUTER_TARGET:-localhost:9190}"] + labels: + service: semantic-router + environment: docker + + # Optional: Envoy proxy metrics + # Uncomment if Envoy is running + # - job_name: envoy + # metrics_path: /stats/prometheus + # static_configs: + # - targets: ["${ENVOY_TARGET:-envoy-proxy:19000}"] + # labels: + # service: envoy + # environment: docker diff --git a/tools/observability/scripts/start-observability.sh b/tools/observability/scripts/start-observability.sh new file mode 100755 index 00000000..b017507b --- /dev/null +++ b/tools/observability/scripts/start-observability.sh @@ -0,0 +1,136 @@ +#!/usr/bin/env bash +# start-observability.sh +# +# Starts Prometheus and Grafana using Docker Compose +# +# This script starts observability stack to monitor semantic-router. +# It supports two modes: +# - Local mode: Router running natively, observability in Docker (network_mode: host) +# - Compose mode: All services in Docker (uses semantic-network) +# +# Prerequisites: +# - Docker and Docker Compose installed and running +# +# Usage: +# ./scripts/start-observability.sh [local|compose] +# +# To stop: +# ./scripts/stop-observability.sh + +set -euo pipefail + +# Color output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# Directories +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" + +log_info() { echo -e "${GREEN}[INFO]${NC} $1"; } +log_warn() { echo -e "${YELLOW}[WARN]${NC} $1"; } +log_error() { echo -e "${RED}[ERROR]${NC} $1"; } +log_debug() { echo -e "${BLUE}[DEBUG]${NC} $1"; } + +# Parse mode argument +MODE="${1:-local}" + +case "${MODE}" in + local) + log_info "Starting observability in LOCAL mode (router on host, observability in Docker)" + COMPOSE_CMD="docker compose -f ${PROJECT_ROOT}/docker-compose.obs.yml" + ;; + compose) + log_info "Starting observability in COMPOSE mode (all services in Docker)" + COMPOSE_CMD="docker compose -f ${PROJECT_ROOT}/docker-compose.yml" + ;; + *) + log_error "Invalid mode: ${MODE}" + log_info "Usage: $0 [local|compose]" + log_info " local - Router on host, observability in Docker (default)" + log_info " compose - All services in Docker (uses main docker-compose.yml)" + exit 1 + ;; +esac + +# Check if Docker is available +if ! command -v docker &> /dev/null; then + log_error "Docker is not installed or not in PATH" + log_info "Please install Docker Desktop: https://www.docker.com/products/docker-desktop" + exit 1 +fi + +# Check if Docker daemon is running +if ! docker info &> /dev/null; then + log_error "Docker daemon is not running" + log_info "Please start Docker Desktop" + exit 1 +fi + +log_info "Starting services..." +log_debug "Command: ${COMPOSE_CMD} up -d" + +${COMPOSE_CMD} up -d + +# Wait for services to become healthy +log_info "Waiting for services to become healthy..." 
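+# A fixed sleep keeps this script simple. If startup timing ever becomes flaky,
+# polling Prometheus's readiness endpoint is a stricter alternative, e.g.:
+#   until curl -sf http://localhost:9090/-/ready >/dev/null; do sleep 1; done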
+sleep 10 + +# Check service status +if [[ "${MODE}" == "local" ]]; then + PROM_CONTAINER="prometheus-local" + GRAF_CONTAINER="grafana-local" +else + PROM_CONTAINER="prometheus" + GRAF_CONTAINER="grafana" +fi + +if docker ps --format '{{.Names}}' | grep -q "^${PROM_CONTAINER}$"; then + log_info "✓ Prometheus is running at http://localhost:9090" +else + log_warn "⚠ Prometheus not running" + log_info " Check logs: docker logs ${PROM_CONTAINER}" +fi + +if docker ps --format '{{.Names}}' | grep -q "^${GRAF_CONTAINER}$"; then + log_info "✓ Grafana is running at http://localhost:3000" + log_info " Default credentials: admin / admin" +else + log_warn "⚠ Grafana not running" + log_info " Check logs: docker logs ${GRAF_CONTAINER}" +fi + +echo "" +log_info "===================================================================" +log_info "Observability stack started successfully in ${MODE^^} mode!" +log_info "===================================================================" +echo "" + +if [[ "${MODE}" == "local" ]]; then + log_info "Next steps:" + log_info " 1. Start semantic-router on localhost:9190" + log_info " 2. Open Prometheus: http://localhost:9090/targets" + log_info " 3. Open Grafana: http://localhost:3000" + log_info " 4. View dashboard: LLM Router Metrics" +else + log_info "Next steps:" + log_info " 1. Ensure semantic-router is running in Docker" + log_info " 2. Open Prometheus: http://localhost:9090/targets" + log_info " 3. Open Grafana: http://localhost:3000" + log_info " 4. View dashboard: LLM Router Metrics" +fi + +echo "" +log_info "Useful commands:" +if [[ "${MODE}" == "local" ]]; then + log_info " - Check status: docker compose -f docker-compose.obs.yml ps" + log_info " - View logs: docker compose -f docker-compose.obs.yml logs -f" +else + log_info " - Check status: docker compose ps" + log_info " - View logs: docker compose logs prometheus grafana -f" +fi +log_info " - Stop services: make stop-observability" +echo "" diff --git a/tools/observability/scripts/stop-observability.sh b/tools/observability/scripts/stop-observability.sh new file mode 100755 index 00000000..60d42fad --- /dev/null +++ b/tools/observability/scripts/stop-observability.sh @@ -0,0 +1,47 @@ +#!/usr/bin/env bash +# stop-observability.sh +# +# Stops and removes observability Docker containers using Docker Compose. +# +# Usage: +# ./scripts/stop-observability.sh + +set -euo pipefail + +# Color output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# Directories +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" + +log_info() { echo -e "${GREEN}[INFO]${NC} $1"; } +log_warn() { echo -e "${YELLOW}[WARN]${NC} $1"; } + +echo -e "${BLUE}==================================================================${NC}" +echo -e "${BLUE} Stopping Observability Stack${NC}" +echo -e "${BLUE}==================================================================${NC}" +echo "" + +# Stop services +log_info "Stopping observability services..." + +# Try stopping local mode containers first +if docker ps -a --format '{{.Names}}' | grep -qE '^(prometheus-local|grafana-local)$'; then + log_info "Stopping local mode containers..." + docker compose -f "${PROJECT_ROOT}/docker-compose.obs.yml" down +fi + +# Also stop compose mode if running as part of main stack +if docker ps -a --format '{{.Names}}' | grep -qE '^(prometheus|grafana)$' && ! 
docker ps -a --format '{{.Names}}' | grep -q 'semantic-router'; then + log_warn "Observability containers from main stack are running" + log_info "Use 'docker compose down' to stop the full stack" +fi + +echo "" +log_info "Observability stopped" +echo "" diff --git a/website/docs/tutorials/observability/observability.md b/website/docs/tutorials/observability/observability.md index e8b5168c..bba9757a 100644 --- a/website/docs/tutorials/observability/observability.md +++ b/website/docs/tutorials/observability/observability.md @@ -1,265 +1,190 @@ # Observability -This page focuses solely on collecting and visualizing metrics for Semantic Router using Prometheus and Grafana—deployment method (Docker Compose vs Kubernetes) is covered in `docker-quickstart.md`. +Metrics collection and visualization for Semantic Router using Prometheus and Grafana. --- -## 1. Metrics & Endpoints Summary +## 1. Metrics & Endpoints -| Component | Endpoint | Notes | -| ---------------------------- | ------------------------- | ------------------------------------------ | -| Router metrics | `:9190/metrics` | Prometheus format (flag: `--metrics-port`) | -| Router health (future probe) | `:8080/health` | HTTP readiness/liveness candidate | -| Envoy metrics (optional) | `:19000/stats/prometheus` | If you enable Envoy | +| Component | Endpoint | Notes | +| ------------------------ | ------------------------- | ------------------------------------------ | +| Router metrics | `:9190/metrics` | Prometheus format (flag: `--metrics-port`) | +| Router health | `:8080/health` | HTTP readiness/liveness | +| Envoy metrics (optional) | `:19000/stats/prometheus` | If Envoy is enabled | -Dashboard JSON: `deploy/llm-router-dashboard.json`. - -Primary source file exposing metrics: `src/semantic-router/cmd/main.go` (uses `promhttp`). +**Configuration location**: `tools/observability/` +**Dashboard**: `tools/observability/llm-router-dashboard.json` --- -## 2. Docker Compose Observability - -Compose bundles: `prometheus`, `grafana`, `semantic-router`, (optional) `envoy`, `mock-vllm`. +## 2. Local Mode (Router on Host) -Key files: +Run router natively on host, observability in Docker. -- `config/prometheus.yaml` -- `config/grafana/datasource.yaml` -- `config/grafana/dashboards.yaml` -- `deploy/llm-router-dashboard.json` - -Start (with testing profile example): +### Quick Start ```bash -CONFIG_FILE=/app/config/config.testing.yaml docker compose --profile testing up --build +# Start router +make run-router + +# Start observability +make obs-local ``` -Access: +**Access:** - Prometheus: http://localhost:9090 - Grafana: http://localhost:3000 (admin/admin) -Expected Prometheus targets: - -- `semantic-router:9190` -- `envoy-proxy:19000` (optional) - ---- - -## 3. Kubernetes Observability - -This guide adds a production-ready Prometheus + Grafana stack to the existing Semantic Router Kubernetes deployment. It includes manifests for collectors, dashboards, data sources, RBAC, and ingress so you can monitor routing performance in any cluster. - -> **Namespace** – All manifests default to the `vllm-semantic-router-system` namespace to match the core deployment. Override it with Kustomize if you use a different namespace. 
- -## What Gets Installed - -| Component | Purpose | Key Files | -|--------------|---------|-----------| -| Prometheus | Scrapes Semantic Router metrics and stores them with persistent retention | `prometheus/` (`rbac.yaml`, `configmap.yaml`, `deployment.yaml`, `pvc.yaml`, `service.yaml`)| -| Grafana | Visualizes metrics using the bundled LLM Router dashboard and a pre-configured Prometheus datasource | `grafana/` (`secret.yaml`, `configmap-*.yaml`, `deployment.yaml`, `pvc.yaml`, `service.yaml`)| -| Ingress (optional) | Exposes the UIs outside the cluster | `ingress.yaml`| -| Dashboard provisioning | Automatically loads `deploy/llm-router-dashboard.json` into Grafana | `grafana/configmap-dashboard.yaml`| - -Prometheus is configured to discover the `semantic-router-metrics` service (port `9190`) automatically. Grafana provisions the same LLM Router dashboard that ships with the Docker Compose stack. - -### 1. Prerequisites +**Verify targets:** -- Deployed Semantic Router workload via `deploy/kubernetes/` -- A Kubernetes cluster (managed, on-prem, or kind) -- `kubectl` v1.23+ -- Optional: an ingress controller (NGINX, ALB, etc.) if you want external access - -### 2. Directory Layout - -``` -deploy/kubernetes/observability/ -├── README.md -├── kustomization.yaml # (created in the next step) -├── ingress.yaml # optional HTTPS ingress examples -├── prometheus/ -│ ├── configmap.yaml # Scrape config (Kubernetes SD) -│ ├── deployment.yaml -│ ├── pvc.yaml -│ ├── rbac.yaml # SA + ClusterRole + binding -│ └── service.yaml -└── grafana/ - ├── configmap-dashboard.yaml # Bundled LLM router dashboard - ├── configmap-provisioning.yaml # Datasource + provider config - ├── deployment.yaml - ├── pvc.yaml - ├── secret.yaml # Admin credentials (override in prod) - └── service.yaml +```bash +# Check Prometheus scrapes localhost:9190 +open http://localhost:9090/targets ``` -### 3. Prometheus Configuration Highlights +**Stop:** -- Uses `kubernetes_sd_configs` to enumerate endpoints in `vllm-semantic-router-system` -- Keeps 15 days of metrics by default (`--storage.tsdb.retention.time=15d`) -- Stores metrics in a `PersistentVolumeClaim` named `prometheus-data` -- RBAC rules grant read-only access to Services, Endpoints, Pods, Nodes, and EndpointSlices - -#### Scrape configuration snippet - -```yaml -scrape_configs: - - job_name: semantic-router - kubernetes_sd_configs: - - role: endpoints - namespaces: - names: - - vllm-semantic-router-system - relabel_configs: - - source_labels: [__meta_kubernetes_service_name] - regex: semantic-router-metrics - action: keep - - source_labels: [__meta_kubernetes_endpoint_port_name] - regex: metrics - action: keep +```bash +make stop-observability ``` -Modify the namespace or service name if you changed them in your primary deployment. +### Configuration -### 4. 
Grafana Configuration Highlights +All configs in `tools/observability/`: -- Stateful deployment backed by the `grafana-storage` PVC -- Datasource provisioned automatically pointing to `http://prometheus:9090` -- Dashboard provider watches `/var/lib/grafana-dashboards` -- Bundled `llm-router-dashboard.json` is identical to `deploy/llm-router-dashboard.json` -- Admin credentials pulled from the `grafana-admin` secret (default `admin/admin` – **change this!)** +- `prometheus.yaml` - Scrapes `localhost:9190` when `ROUTER_TARGET=localhost:9190` +- `grafana-datasource.yaml` - Points to `localhost:9090` +- `grafana-dashboard.yaml` - Dashboard provisioning +- `llm-router-dashboard.json` - Dashboard definition -#### Updating credentials +### Troubleshooting -```bash -kubectl create secret generic grafana-admin \ - --namespace vllm-semantic-router-system \ - --from-literal=admin-user=monitor \ - --from-literal=admin-password='pick-a-strong-password' \ - --dry-run=client -o yaml | kubectl apply -f - -``` - -Remove or overwrite the committed `secret.yaml` when you adopt a different secret management approach. +| Issue | Fix | +| ------------- | --------------------------------------- | +| Target DOWN | Start router: `make run-router` | +| No metrics | Generate traffic, check `:9190/metrics` | +| Port conflict | Change port or stop conflicting service | -### 5. Deployment Steps +--- -#### 5.1. Create the Kustomization +## 3. Docker Compose Mode -Create `deploy/kubernetes/observability/kustomization.yaml` (see below) to assemble all manifests. This guide assumes you keep Prometheus & Grafana in the same namespace as the router. +All services in Docker containers. -#### 5.2. Apply manifests +### Quick Start ```bash -kubectl apply -k deploy/kubernetes/observability/ +# Start full stack (includes observability) +docker compose up --build + +# Or with testing profile +docker compose --profile testing up --build ``` -Verify pods: +**Access:** -```bash -kubectl get pods -n vllm-semantic-router-system -``` +- Prometheus: http://localhost:9090 +- Grafana: http://localhost:3000 (admin/admin) -You should see `prometheus-...` and `grafana-...` pods in `Running` state. +**Expected targets:** -#### 5.3. Integration with the core deployment +- `semantic-router:9190` +- `envoy-proxy:19000` (optional) -1. Deploy or update Semantic Router (`kubectl apply -k deploy/kubernetes/`). -2. Deploy observability stack (`kubectl apply -k deploy/kubernetes/observability/`). -3. Confirm the metrics service (`semantic-router-metrics`) has endpoints: +### Configuration - ```bash - kubectl get endpoints semantic-router-metrics -n vllm-semantic-router-system - ``` +Same configs as local mode (`tools/observability/`), but: -4. Prometheus target should transition to **UP** within ~15 seconds. +- `ROUTER_TARGET=semantic-router:9190` +- `PROMETHEUS_URL=prometheus:9090` +- Uses `semantic-network` bridge network -#### 5.4. Accessing the UIs +--- -> **Optional Ingress** – If you prefer to keep the stack private, delete `ingress.yaml` from `kustomization.yaml` before applying. +## 4. Kubernetes Mode -- **Port-forward (quick check)** +Production-ready Prometheus + Grafana for K8s clusters. 
- ```bash - kubectl port-forward svc/prometheus 9090:9090 -n vllm-semantic-router-system - kubectl port-forward svc/grafana 3000:3000 -n vllm-semantic-router-system - ``` +> **Namespace**: `vllm-semantic-router-system` - Prometheus → http://localhost:9090, Grafana → http://localhost:3000 +### Components -- **Ingress (production)** – Customize `ingress.yaml` with real domains, TLS secrets, and your ingress class before applying. Replace `*.example.com` and configure HTTPS certificates via cert-manager or your provider. +| Component | Purpose | Location | +| ---------- | ------------------------------------- | ---------------------------------------------- | +| Prometheus | Scrapes router metrics, 15d retention | `deploy/kubernetes/observability/prometheus/` | +| Grafana | Dashboard visualization | `deploy/kubernetes/observability/grafana/` | +| Ingress | Optional external access | `deploy/kubernetes/observability/ingress.yaml` | -### 6. Verifying Metrics Collection +### Deploy -1. Open Prometheus (port-forward or ingress) → **Status ▸ Targets** → ensure `semantic-router` job is green. -2. Query `rate(llm_model_completion_tokens_total[5m])` – should return data after traffic. -3. Open Grafana, log in with the admin credentials, and confirm the **LLM Router Metrics** dashboard exists under the *Semantic Router* folder. -4. Generate traffic to Semantic Router (classification or routing requests). Key panels should start populating: - - Prompt Category counts - - Token usage rate per model - - Routing modifications between models - - Latency histograms (TTFT, completion p95) +```bash +# Apply manifests +kubectl apply -k deploy/kubernetes/observability/ -### 7. Dashboard Customization +# Verify +kubectl get pods -n vllm-semantic-router-system +``` -- Duplicate the provisioned dashboard inside Grafana to make changes while keeping the original as a template. -- Update Grafana provisioning (`grafana/configmap-provisioning.yaml`) to point to alternate folders or add new providers. -- Add additional dashboards by extending `grafana/configmap-dashboard.yaml` or mounting a different ConfigMap. -- Incorporate Kubernetes cluster metrics (CPU/memory) by adding another datasource or deploying kube-state-metrics + node exporters. +### Access -### 8. Best Practices +**Port-forward:** -#### Resource Sizing +```bash +kubectl port-forward svc/prometheus 9090:9090 -n vllm-semantic-router-system +kubectl port-forward svc/grafana 3000:3000 -n vllm-semantic-router-system +``` -- Prometheus: increase CPU/memory with higher scrape cardinality or retention > 15 days. -- Grafana: start with `500m` CPU / `1Gi` RAM; scale replicas horizontally when concurrent viewers exceed a few dozen. +**Ingress:** Customize `ingress.yaml` with your domain and TLS -#### Storage +### Key Configuration -- Use SSD-backed storage classes for Prometheus when retention/window is large. -- Increase `prometheus/pvc.yaml` (default 20Gi) and `grafana/pvc.yaml` (default 10Gi) to match retention requirements. -- Enable volume snapshots or backups for dashboards and alert history. +**Prometheus** uses Kubernetes service discovery: -#### Security +```yaml +scrape_configs: + - job_name: semantic-router + kubernetes_sd_configs: + - role: endpoints + namespaces: + names: [vllm-semantic-router-system] +``` -- Replace the demo `grafana-admin` secret with credentials stored in your preferred secret manager. -- Restrict ingress access with network policies, OAuth proxies, or SSO integrations. 
-- Enable Grafana role-based access control and API keys for automation. -- Scope Prometheus RBAC to only the namespaces you need. If metrics run in multiple namespaces, list them in the scrape config. +**Grafana** credentials (change in production): -#### Maintenance +```bash +kubectl create secret generic grafana-admin \ + --namespace vllm-semantic-router-system \ + --from-literal=admin-user=admin \ + --from-literal=admin-password='your-password' +``` -- Monitor Prometheus disk usage; prune retention or scale PVC before it fills up. -- Back up Grafana dashboards or store them in Git (already done through this ConfigMap). -- Roll upgrades separately: update Prometheus and Grafana images via `kustomization.yaml` patches. -- Consider adopting the Prometheus Operator (`ServiceMonitor` + `PodMonitor`) if you already run kube-prometheus-stack. A sample `ServiceMonitor` is in `website/docs/tutorials/observability/observability.md`. +--- -## 4. Key Metrics (Sample) +## 5. Key Metrics -| Metric | Type | Description | -| ------------------------------------------------------------- | --------- | -------------------------------------------- | -| `llm_category_classifications_count` | counter | Number of category classification operations | -| `llm_model_completion_tokens_total` | counter | Tokens emitted per model | -| `llm_model_routing_modifications_total` | counter | Model switch / routing adjustments | -| `llm_model_completion_latency_seconds` | histogram | Completion latency distribution | -| `process_cpu_seconds_total` / `process_resident_memory_bytes` | standard | Runtime resource usage | +| Metric | Type | Description | +| --------------------------------------- | --------- | ------------------------ | +| `llm_category_classifications_count` | counter | Category classifications | +| `llm_model_completion_tokens_total` | counter | Tokens per model | +| `llm_model_routing_modifications_total` | counter | Model routing changes | +| `llm_model_completion_latency_seconds` | histogram | Completion latency | -Use typical PromQL patterns: +**Example queries:** ```promql rate(llm_model_completion_tokens_total[5m]) -histogram_quantile(0.95, sum by (le) (rate(llm_model_completion_latency_seconds_bucket[5m]))) +histogram_quantile(0.95, rate(llm_model_completion_latency_seconds_bucket[5m])) ``` --- -## 5. Troubleshooting +## 6. Troubleshooting -| Symptom | Likely Cause | Check | Fix | -| --------------------- | ------------------------- | ---------------------------------------- | ---------------------------------------------------------------- | -| Target DOWN (Docker) | Service name mismatch | Prometheus /targets | Ensure `semantic-router` container running | -| Target DOWN (K8s) | Label/selectors mismatch | `kubectl get ep semantic-router-metrics` | Align labels or ServiceMonitor selector | -| No new tokens metrics | No traffic | Generate chat/completions via Envoy | Send test requests | -| Dashboard empty | Datasource URL wrong | Grafana datasource settings | Point to `http://prometheus:9090` (Docker) or cluster Prometheus | -| Large 5xx spikes | Backend model unreachable | Router logs | Verify vLLM endpoints configuration | +| Issue | Check | Fix | +| --------------- | ------------------- | ----------------------------------------------------- | +| Target DOWN | Prometheus /targets | Verify router is running and exposing `:9190/metrics` | +| No metrics | Generate traffic | Send requests through router | +| Dashboard empty | Grafana datasource | Check Prometheus URL configuration | ---
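+
+A quick end-to-end check of the scrape path (a sketch, assuming the default ports listed above):
+
+```bash
+# Router side: the metrics endpoint should return Prometheus-format text
+curl -s http://localhost:9190/metrics | grep llm_category_classifications_count
+
+# Prometheus side: the semantic-router target should report "up"
+curl -s http://localhost:9090/api/v1/targets | grep -o '"health":"[^"]*"'
+```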