From 0665cbf773a64b2a70df88673da4eb2b2dc0a37b Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Tue, 30 Sep 2025 12:26:02 +0800 Subject: [PATCH 1/6] feat: add Grafava+Prometheus in k8s Signed-off-by: JaredforReal --- deploy/kubernetes/observability/README.md | 203 ++++++ .../grafana/configmap-dashboard.yaml | 652 ++++++++++++++++++ .../grafana/configmap-provisioning.yaml | 30 + .../observability/grafana/deployment.yaml | 85 +++ .../kubernetes/observability/grafana/pvc.yaml | 12 + .../observability/grafana/secret.yaml | 10 + .../observability/grafana/service.yaml | 14 + deploy/kubernetes/observability/ingress.yaml | 53 ++ .../observability/kustomization.yaml | 22 + .../observability/prometheus/configmap.yaml | 35 + .../observability/prometheus/deployment.yaml | 54 ++ .../observability/prometheus/pvc.yaml | 12 + .../observability/prometheus/rbac.yaml | 43 ++ .../observability/prometheus/service.yaml | 14 + 14 files changed, 1239 insertions(+) create mode 100644 deploy/kubernetes/observability/README.md create mode 100644 deploy/kubernetes/observability/grafana/configmap-dashboard.yaml create mode 100644 deploy/kubernetes/observability/grafana/configmap-provisioning.yaml create mode 100644 deploy/kubernetes/observability/grafana/deployment.yaml create mode 100644 deploy/kubernetes/observability/grafana/pvc.yaml create mode 100644 deploy/kubernetes/observability/grafana/secret.yaml create mode 100644 deploy/kubernetes/observability/grafana/service.yaml create mode 100644 deploy/kubernetes/observability/ingress.yaml create mode 100644 deploy/kubernetes/observability/kustomization.yaml create mode 100644 deploy/kubernetes/observability/prometheus/configmap.yaml create mode 100644 deploy/kubernetes/observability/prometheus/deployment.yaml create mode 100644 deploy/kubernetes/observability/prometheus/pvc.yaml create mode 100644 deploy/kubernetes/observability/prometheus/rbac.yaml create mode 100644 deploy/kubernetes/observability/prometheus/service.yaml diff --git a/deploy/kubernetes/observability/README.md b/deploy/kubernetes/observability/README.md new file mode 100644 index 00000000..640621ce --- /dev/null +++ b/deploy/kubernetes/observability/README.md @@ -0,0 +1,203 @@ +# Semantic Router Observability on Kubernetes + +This guide adds a production-ready Prometheus + Grafana stack to the existing Semantic Router Kubernetes deployment. It includes manifests for collectors, dashboards, data sources, RBAC, and ingress so you can monitor routing performance in any cluster. + +> **Namespace** – All manifests default to the `vllm-semantic-router-system` namespace to match the core deployment. Override it with Kustomize if you use a different namespace. + +## What Gets Installed + +| Component | Purpose | Key Files | +|--------------|---------|-----------| +| Prometheus | Scrapes Semantic Router metrics and stores them with persistent retention | `prometheus/` (`rbac.yaml`, `configmap.yaml`, `deployment.yaml`, `pvc.yaml`, `service.yaml`)| +| Grafana | Visualizes metrics using the bundled LLM Router dashboard and a pre-configured Prometheus datasource | `grafana/` (`secret.yaml`, `configmap-*.yaml`, `deployment.yaml`, `pvc.yaml`, `service.yaml`)| +| Ingress (optional) | Exposes the UIs outside the cluster | `ingress.yaml`| +| Dashboard provisioning | Automatically loads `deploy/llm-router-dashboard.json` into Grafana | `grafana/configmap-dashboard.yaml`| + +Prometheus is configured to discover the `semantic-router-metrics` service (port `9190`) automatically. 
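+
+A quick way to confirm that this Service exists in your cluster before installing the stack (the names below are the defaults used throughout this guide):
+
+```bash
+kubectl get service semantic-router-metrics -n vllm-semantic-router-system
+kubectl get endpoints semantic-router-metrics -n vllm-semantic-router-system
+```
+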
Grafana provisions the same LLM Router dashboard that ships with the Docker Compose stack. + +## 1. Prerequisites + +- Deployed Semantic Router workload via `deploy/kubernetes/` +- A Kubernetes cluster (managed, on-prem, or kind) +- `kubectl` v1.23+ +- Optional: an ingress controller (NGINX, ALB, etc.) if you want external access + +## 2. Directory Layout + +``` +deploy/kubernetes/observability/ +├── README.md +├── kustomization.yaml # (created in the next step) +├── ingress.yaml # optional HTTPS ingress examples +├── prometheus/ +│ ├── configmap.yaml # Scrape config (Kubernetes SD) +│ ├── deployment.yaml +│ ├── pvc.yaml +│ ├── rbac.yaml # SA + ClusterRole + binding +│ └── service.yaml +└── grafana/ + ├── configmap-dashboard.yaml # Bundled LLM router dashboard + ├── configmap-provisioning.yaml # Datasource + provider config + ├── deployment.yaml + ├── pvc.yaml + ├── secret.yaml # Admin credentials (override in prod) + └── service.yaml +``` + +## 3. Prometheus Configuration Highlights + +- Uses `kubernetes_sd_configs` to enumerate endpoints in `vllm-semantic-router-system` +- Keeps 15 days of metrics by default (`--storage.tsdb.retention.time=15d`) +- Stores metrics in a `PersistentVolumeClaim` named `prometheus-data` +- RBAC rules grant read-only access to Services, Endpoints, Pods, Nodes, and EndpointSlices + +### Scrape configuration snippet + +```yaml +scrape_configs: + - job_name: semantic-router + kubernetes_sd_configs: + - role: endpoints + namespaces: + names: + - vllm-semantic-router-system + relabel_configs: + - source_labels: [__meta_kubernetes_service_name] + regex: semantic-router-metrics + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + regex: metrics + action: keep +``` + +Modify the namespace or service name if you changed them in your primary deployment. + +## 4. Grafana Configuration Highlights + +- Stateful deployment backed by the `grafana-storage` PVC +- Datasource provisioned automatically pointing to `http://prometheus:9090` +- Dashboard provider watches `/var/lib/grafana-dashboards` +- Bundled `llm-router-dashboard.json` is identical to `deploy/llm-router-dashboard.json` +- Admin credentials pulled from the `grafana-admin` secret (default `admin/admin` – **change this!)** + +### Updating credentials + +```bash +kubectl create secret generic grafana-admin \ + --namespace vllm-semantic-router-system \ + --from-literal=admin-user=monitor \ + --from-literal=admin-password='pick-a-strong-password' \ + --dry-run=client -o yaml | kubectl apply -f - +``` + +Remove or overwrite the committed `secret.yaml` when you adopt a different secret management approach. + +## 5. Deployment Steps + +### 5.1. Create the Kustomization + +Create `deploy/kubernetes/observability/kustomization.yaml` (see below) to assemble all manifests. This guide assumes you keep Prometheus & Grafana in the same namespace as the router. + +### 5.2. Apply manifests + +```bash +kubectl apply -k deploy/kubernetes/observability/ +``` + +Verify pods: + +```bash +kubectl get pods -n vllm-semantic-router-system +``` + +You should see `prometheus-...` and `grafana-...` pods in `Running` state. + +### 5.3. Integration with the core deployment + +1. Deploy or update Semantic Router (`kubectl apply -k deploy/kubernetes/`). +2. Deploy observability stack (`kubectl apply -k deploy/kubernetes/observability/`). +3. Confirm the metrics service (`semantic-router-metrics`) has endpoints: + + ```bash + kubectl get endpoints semantic-router-metrics -n vllm-semantic-router-system + ``` + +4. 
Prometheus target should transition to **UP** within ~15 seconds. + +### 5.4. Accessing the UIs + +> **Optional Ingress** – If you prefer to keep the stack private, delete `ingress.yaml` from `kustomization.yaml` before applying. + +- **Port-forward (quick check)** + + ```bash + kubectl port-forward svc/prometheus 9090:9090 -n vllm-semantic-router-system + kubectl port-forward svc/grafana 3000:3000 -n vllm-semantic-router-system + ``` + + Prometheus → http://localhost:9090, Grafana → http://localhost:3000 + +- **Ingress (production)** – Customize `ingress.yaml` with real domains, TLS secrets, and your ingress class before applying. Replace `*.example.com` and configure HTTPS certificates via cert-manager or your provider. + +## 6. Verifying Metrics Collection + +1. Open Prometheus (port-forward or ingress) → **Status ▸ Targets** → ensure `semantic-router` job is green. +2. Query `rate(llm_model_completion_tokens_total[5m])` – should return data after traffic. +3. Open Grafana, log in with the admin credentials, and confirm the **LLM Router Metrics** dashboard exists under the *Semantic Router* folder. +4. Generate traffic to Semantic Router (classification or routing requests). Key panels should start populating: + - Prompt Category counts + - Token usage rate per model + - Routing modifications between models + - Latency histograms (TTFT, completion p95) + +## 7. Dashboard Customization + +- Duplicate the provisioned dashboard inside Grafana to make changes while keeping the original as a template. +- Update Grafana provisioning (`grafana/configmap-provisioning.yaml`) to point to alternate folders or add new providers. +- Add additional dashboards by extending `grafana/configmap-dashboard.yaml` or mounting a different ConfigMap. +- Incorporate Kubernetes cluster metrics (CPU/memory) by adding another datasource or deploying kube-state-metrics + node exporters. + +## 8. Best Practices + +### Resource Sizing + +- Prometheus: increase CPU/memory with higher scrape cardinality or retention > 15 days. +- Grafana: start with `500m` CPU / `1Gi` RAM; scale replicas horizontally when concurrent viewers exceed a few dozen. + +### Storage + +- Use SSD-backed storage classes for Prometheus when retention/window is large. +- Increase `prometheus/pvc.yaml` (default 20Gi) and `grafana/pvc.yaml` (default 10Gi) to match retention requirements. +- Enable volume snapshots or backups for dashboards and alert history. + +### Security + +- Replace the demo `grafana-admin` secret with credentials stored in your preferred secret manager. +- Restrict ingress access with network policies, OAuth proxies, or SSO integrations. +- Enable Grafana role-based access control and API keys for automation. +- Scope Prometheus RBAC to only the namespaces you need. If metrics run in multiple namespaces, list them in the scrape config. + +### Maintenance + +- Monitor Prometheus disk usage; prune retention or scale PVC before it fills up. +- Back up Grafana dashboards or store them in Git (already done through this ConfigMap). +- Roll upgrades separately: update Prometheus and Grafana images via `kustomization.yaml` patches. +- Consider adopting the Prometheus Operator (`ServiceMonitor` + `PodMonitor`) if you already run kube-prometheus-stack. A sample `ServiceMonitor` is in `website/docs/tutorials/observability/observability.md`. + +## 9. 
Troubleshooting + +| Symptom | Checks | Fix | +|---------|--------|-----| +| Prometheus target **DOWN** | `kubectl get endpoints semantic-router-metrics -n vllm-semantic-router-system` | Ensure the Semantic Router deployment is running and the service labels match `app=semantic-router`, `service=metrics` | +| Grafana dashboard empty | **Configuration → Data Sources** | Confirm Prometheus datasource URL resolves and the Prometheus service is reachable | +| Login fails | `kubectl get secret grafana-admin -o yaml` | Update the secret to match the credentials you expect | +| PVC Pending | `kubectl describe pvc prometheus-data` | Provide a storage class via `storageClassName`, or provision storage manually | +| Ingress 404 | `kubectl describe ingress grafana` | Update hostnames, TLS secrets, and ensure ingress controller is installed | + +## 10. Next Steps + +- Configure alerts for critical metrics (Prometheus alerting rules + Alertmanager) +- Add log aggregation (Loki, Elasticsearch, or Cloud-native logging) +- Automate stack deployment through CI/CD pipelines using `kubectl apply -k` + +With this observability stack in place, you can track Semantic Router health, routing accuracy, latency distributions, and usage trends across any Kubernetes environment. diff --git a/deploy/kubernetes/observability/grafana/configmap-dashboard.yaml b/deploy/kubernetes/observability/grafana/configmap-dashboard.yaml new file mode 100644 index 00000000..eeccafb4 --- /dev/null +++ b/deploy/kubernetes/observability/grafana/configmap-dashboard.yaml @@ -0,0 +1,652 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-dashboards + labels: + app: grafana + grafana_dashboard: "1" +data: + llm-router-dashboard.json: | + { + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 18, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 4, + "options": { + "displayMode": "gradient", + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "maxVizHeight": 300, + "minVizHeight": 16, + "minVizWidth": 8, + "namePlacement": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "sizing": "auto", + "valueMode": "color" + }, + "pluginVersion": "11.5.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "sum by(category) (llm_category_classifications_count)", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Prompt Category", + "type": "bargauge" + }, + { + "datasource": { + "type": "prometheus", + 
"uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Tokens/sec", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "tps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 2, + "options": { + "legend": { + "calcs": [ + "mean", + "max", + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.5.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(llm_model_completion_tokens_total[5m])) by (model)", + "legendFormat": "Completion Tokens {{model}}", + "range": true, + "refId": "A" + } + ], + "title": "Token Usage Rate by Model", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Routes/sec", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 3, + "options": { + "legend": { + "calcs": [ + "mean", + "max", + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.5.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(llm_model_routing_modifications_total[5m])) by (source_model, target_model)", + "format": "time_series", + "legendFormat": "{{source_model}} -> {{target_model}}", + "range": true, + "refId": "A" + } + ], + "title": "Model Routing Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": 
"Seconds", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 1, + "options": { + "legend": { + "calcs": [ + "mean", + "max", + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.5.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum(rate(llm_model_completion_latency_seconds_bucket[5m])) by (le, model))", + "legendFormat": "p95 {{model}}", + "range": true, + "refId": "A" + } + ], + "title": "Model Completion Latency (p95)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Seconds", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 5, + "options": { + "legend": { + "calcs": [ + "mean", + "max", + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.5.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum(rate(llm_model_ttft_seconds_bucket[5m])) by (le, model))", + "legendFormat": "TTFT p95 {{model}}", + "range": true, + "refId": "A" + } + ], + "title": "TTFT (p95) by Model", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Seconds per token", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": 
"none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 6, + "options": { + "legend": { + "calcs": [ + "mean", + "max", + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.5.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum(rate(llm_model_tpot_seconds_bucket[5m])) by (le, model))", + "legendFormat": "TPOT p95 {{model}}", + "range": true, + "refId": "A" + } + ], + "title": "TPOT (p95) by Model (sec/token)", + "type": "timeseries" + } + ], + "preload": false, + "refresh": "10s", + "schemaVersion": 40, + "tags": [ + "llm-router" + ], + "templating": { + "list": [ + { + "current": { + "text": "prometheus", + "value": "prometheus" + }, + "includeAll": false, + "name": "DS_PROMETHEUS", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + } + ] + }, + "time": { + "from": "now-5m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "LLM Router Metrics", + "uid": "llm-router-metrics", + "version": 14, + "weekStart": "" + } diff --git a/deploy/kubernetes/observability/grafana/configmap-provisioning.yaml b/deploy/kubernetes/observability/grafana/configmap-provisioning.yaml new file mode 100644 index 00000000..32086fe3 --- /dev/null +++ b/deploy/kubernetes/observability/grafana/configmap-provisioning.yaml @@ -0,0 +1,30 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-provisioning + labels: + app: grafana +data: + datasources.yaml: | + apiVersion: 1 + datasources: + - name: Prometheus + uid: prometheus + type: prometheus + access: proxy + url: http://prometheus:9090 + isDefault: true + editable: false + jsonData: + timeInterval: 15s + dashboards.yaml: | + apiVersion: 1 + providers: + - name: semantic-router-dashboards + orgId: 1 + folder: Semantic Router + type: file + disableDeletion: false + editable: true + options: + path: /var/lib/grafana-dashboards diff --git a/deploy/kubernetes/observability/grafana/deployment.yaml b/deploy/kubernetes/observability/grafana/deployment.yaml new file mode 100644 index 00000000..0b3bfdcc --- /dev/null +++ b/deploy/kubernetes/observability/grafana/deployment.yaml @@ -0,0 +1,85 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: grafana + labels: + app: grafana +spec: + replicas: 1 + selector: + matchLabels: + app: grafana + template: + metadata: + labels: + app: grafana + spec: + securityContext: + runAsNonRoot: true + runAsUser: 472 + fsGroup: 472 + containers: + - name: grafana + image: grafana/grafana:11.5.1 + imagePullPolicy: IfNotPresent + env: + - name: GF_SECURITY_ADMIN_USER + valueFrom: + secretKeyRef: + name: grafana-admin + key: admin-user + - name: GF_SECURITY_ADMIN_PASSWORD + valueFrom: + secretKeyRef: + name: grafana-admin + key: admin-password + - name: 
GF_AUTH_ANONYMOUS_ENABLED + value: "false" + - name: GF_PATHS_PROVISIONING + value: /etc/grafana/provisioning + ports: + - name: http + containerPort: 3000 + readinessProbe: + httpGet: + path: /api/health + port: http + initialDelaySeconds: 30 + periodSeconds: 10 + livenessProbe: + httpGet: + path: /api/health + port: http + initialDelaySeconds: 60 + periodSeconds: 30 + resources: + requests: + cpu: "100m" + memory: "256Mi" + limits: + cpu: "500m" + memory: "1Gi" + volumeMounts: + - name: provisioning + mountPath: /etc/grafana/provisioning + readOnly: true + - name: dashboards + mountPath: /var/lib/grafana-dashboards + readOnly: true + - name: storage + mountPath: /var/lib/grafana + volumes: + - name: provisioning + configMap: + name: grafana-provisioning + items: + - key: datasources.yaml + path: datasources/datasource.yaml + - key: dashboards.yaml + path: dashboards/provider.yaml + - name: dashboards + configMap: + name: grafana-dashboards + - name: storage + persistentVolumeClaim: + claimName: grafana-storage diff --git a/deploy/kubernetes/observability/grafana/pvc.yaml b/deploy/kubernetes/observability/grafana/pvc.yaml new file mode 100644 index 00000000..e11b2d94 --- /dev/null +++ b/deploy/kubernetes/observability/grafana/pvc.yaml @@ -0,0 +1,12 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: grafana-storage + labels: + app: grafana +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 10Gi diff --git a/deploy/kubernetes/observability/grafana/secret.yaml b/deploy/kubernetes/observability/grafana/secret.yaml new file mode 100644 index 00000000..f831a4a8 --- /dev/null +++ b/deploy/kubernetes/observability/grafana/secret.yaml @@ -0,0 +1,10 @@ +apiVersion: v1 +kind: Secret +metadata: + name: grafana-admin + labels: + app: grafana +type: Opaque +stringData: + admin-user: admin + admin-password: admin diff --git a/deploy/kubernetes/observability/grafana/service.yaml b/deploy/kubernetes/observability/grafana/service.yaml new file mode 100644 index 00000000..c394a31c --- /dev/null +++ b/deploy/kubernetes/observability/grafana/service.yaml @@ -0,0 +1,14 @@ +apiVersion: v1 +kind: Service +metadata: + name: grafana + labels: + app: grafana +spec: + type: ClusterIP + selector: + app: grafana + ports: + - name: http + port: 3000 + targetPort: http diff --git a/deploy/kubernetes/observability/ingress.yaml b/deploy/kubernetes/observability/ingress.yaml new file mode 100644 index 00000000..7ef2cdf4 --- /dev/null +++ b/deploy/kubernetes/observability/ingress.yaml @@ -0,0 +1,53 @@ +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: grafana + labels: + app: grafana + annotations: + kubernetes.io/ingress.class: nginx + nginx.ingress.kubernetes.io/backend-protocol: HTTP + nginx.ingress.kubernetes.io/ssl-redirect: "true" +spec: + tls: + - hosts: + - grafana.example.com + secretName: grafana-tls + rules: + - host: grafana.example.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: grafana + port: + name: http +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: prometheus + labels: + app: prometheus + annotations: + kubernetes.io/ingress.class: nginx + nginx.ingress.kubernetes.io/backend-protocol: HTTP + nginx.ingress.kubernetes.io/ssl-redirect: "true" +spec: + tls: + - hosts: + - prometheus.example.com + secretName: prometheus-tls + rules: + - host: prometheus.example.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: prometheus + port: + name: http diff --git 
a/deploy/kubernetes/observability/kustomization.yaml b/deploy/kubernetes/observability/kustomization.yaml new file mode 100644 index 00000000..d3ec5569 --- /dev/null +++ b/deploy/kubernetes/observability/kustomization.yaml @@ -0,0 +1,22 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: vllm-semantic-router-system + +commonLabels: + app.kubernetes.io/part-of: semantic-router + app.kubernetes.io/component: observability + +resources: + - prometheus/rbac.yaml + - prometheus/pvc.yaml + - prometheus/configmap.yaml + - prometheus/deployment.yaml + - prometheus/service.yaml + - grafana/secret.yaml + - grafana/pvc.yaml + - grafana/configmap-provisioning.yaml + - grafana/configmap-dashboard.yaml + - grafana/deployment.yaml + - grafana/service.yaml + - ingress.yaml diff --git a/deploy/kubernetes/observability/prometheus/configmap.yaml b/deploy/kubernetes/observability/prometheus/configmap.yaml new file mode 100644 index 00000000..8c600621 --- /dev/null +++ b/deploy/kubernetes/observability/prometheus/configmap.yaml @@ -0,0 +1,35 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: prometheus-config + labels: + app: prometheus +data: + prometheus.yml: | + global: + scrape_interval: 15s + evaluation_interval: 15s + + scrape_configs: + - job_name: prometheus + static_configs: + - targets: + - localhost:9090 + + - job_name: semantic-router + kubernetes_sd_configs: + - role: endpoints + namespaces: + names: + - vllm-semantic-router-system + relabel_configs: + - source_labels: [__meta_kubernetes_service_name] + regex: semantic-router-metrics + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + regex: metrics + action: keep + - source_labels: [__meta_kubernetes_namespace] + target_label: namespace + - source_labels: [__address__] + target_label: instance diff --git a/deploy/kubernetes/observability/prometheus/deployment.yaml b/deploy/kubernetes/observability/prometheus/deployment.yaml new file mode 100644 index 00000000..a9815d01 --- /dev/null +++ b/deploy/kubernetes/observability/prometheus/deployment.yaml @@ -0,0 +1,54 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: prometheus + labels: + app: prometheus +spec: + replicas: 1 + selector: + matchLabels: + app: prometheus + template: + metadata: + labels: + app: prometheus + spec: + serviceAccountName: prometheus + securityContext: + runAsNonRoot: true + runAsUser: 65534 + fsGroup: 65534 + containers: + - name: prometheus + image: prom/prometheus:v2.53.0 + imagePullPolicy: IfNotPresent + args: + - "--config.file=/etc/prometheus/prometheus.yml" + - "--storage.tsdb.path=/prometheus" + - "--web.enable-lifecycle" + - "--storage.tsdb.retention.time=15d" + - "--storage.tsdb.max-block-duration=2h" + - "--storage.tsdb.no-lockfile" + ports: + - name: http + containerPort: 9090 + resources: + requests: + cpu: "250m" + memory: "1Gi" + limits: + cpu: "500m" + memory: "2Gi" + volumeMounts: + - name: config + mountPath: /etc/prometheus + - name: data + mountPath: /prometheus + volumes: + - name: config + configMap: + name: prometheus-config + - name: data + persistentVolumeClaim: + claimName: prometheus-data diff --git a/deploy/kubernetes/observability/prometheus/pvc.yaml b/deploy/kubernetes/observability/prometheus/pvc.yaml new file mode 100644 index 00000000..d2dd216e --- /dev/null +++ b/deploy/kubernetes/observability/prometheus/pvc.yaml @@ -0,0 +1,12 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: prometheus-data + labels: + app: prometheus +spec: + accessModes: + - ReadWriteOnce 
+ resources: + requests: + storage: 20Gi diff --git a/deploy/kubernetes/observability/prometheus/rbac.yaml b/deploy/kubernetes/observability/prometheus/rbac.yaml new file mode 100644 index 00000000..c0954750 --- /dev/null +++ b/deploy/kubernetes/observability/prometheus/rbac.yaml @@ -0,0 +1,43 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: prometheus + labels: + app: prometheus +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: prometheus + labels: + app: prometheus +rules: + - apiGroups: [""] + resources: + - nodes + - nodes/proxy + - services + - endpoints + - pods + verbs: ["get", "list", "watch"] + - apiGroups: ["discovery.k8s.io"] + resources: + - endpointslices + verbs: ["get", "list", "watch"] + - nonResourceURLs: ["/metrics"] + verbs: ["get"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: prometheus + labels: + app: prometheus +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: prometheus +subjects: + - kind: ServiceAccount + name: prometheus + namespace: vllm-semantic-router-system diff --git a/deploy/kubernetes/observability/prometheus/service.yaml b/deploy/kubernetes/observability/prometheus/service.yaml new file mode 100644 index 00000000..1d86bde7 --- /dev/null +++ b/deploy/kubernetes/observability/prometheus/service.yaml @@ -0,0 +1,14 @@ +apiVersion: v1 +kind: Service +metadata: + name: prometheus + labels: + app: prometheus +spec: + selector: + app: prometheus + ports: + - name: http + port: 9090 + targetPort: http + type: ClusterIP From cedd13e642ccbf168e1b28e6bc8d49b97d779bcb Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Tue, 30 Sep 2025 19:28:41 +0800 Subject: [PATCH 2/6] Update docs of observability k8s part Signed-off-by: JaredforReal --- .../tutorials/observability/observability.md | 215 ++++++++++++++---- 1 file changed, 174 insertions(+), 41 deletions(-) diff --git a/website/docs/tutorials/observability/observability.md b/website/docs/tutorials/observability/observability.md index 66411319..82756cca 100644 --- a/website/docs/tutorials/observability/observability.md +++ b/website/docs/tutorials/observability/observability.md @@ -49,74 +49,207 @@ Expected Prometheus targets: ## 3. Kubernetes Observability -After applying `deploy/kubernetes/`, you get services: +This guide adds a production-ready Prometheus + Grafana stack to the existing Semantic Router Kubernetes deployment. It includes manifests for collectors, dashboards, data sources, RBAC, and ingress so you can monitor routing performance in any cluster. -- `semantic-router` (gRPC) -- `semantic-router-metrics` (metrics 9190) +> **Namespace** – All manifests default to the `vllm-semantic-router-system` namespace to match the core deployment. Override it with Kustomize if you use a different namespace. 
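+
+For example, a minimal overlay such as the sketch below re-homes every manifest into another namespace (the overlay location and the `my-observability` namespace are placeholders; adjust the relative path to match where you keep your overlays):
+
+```yaml
+# kustomization.yaml in your own overlay directory (illustrative)
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+namespace: my-observability
+resources:
+  - ../../deploy/kubernetes/observability
+```
+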
-### 3.1 Prometheus Operator (ServiceMonitor) +## What Gets Installed -```yaml -apiVersion: monitoring.coreos.com/v1 -kind: ServiceMonitor -metadata: - name: semantic-router - namespace: semantic-router -spec: - selector: - matchLabels: - app: semantic-router - service: metrics - namespaceSelector: - matchNames: ["semantic-router"] - endpoints: - - port: metrics - interval: 15s - path: /metrics +| Component | Purpose | Key Files | +|--------------|---------|-----------| +| Prometheus | Scrapes Semantic Router metrics and stores them with persistent retention | `prometheus/` (`rbac.yaml`, `configmap.yaml`, `deployment.yaml`, `pvc.yaml`, `service.yaml`)| +| Grafana | Visualizes metrics using the bundled LLM Router dashboard and a pre-configured Prometheus datasource | `grafana/` (`secret.yaml`, `configmap-*.yaml`, `deployment.yaml`, `pvc.yaml`, `service.yaml`)| +| Ingress (optional) | Exposes the UIs outside the cluster | `ingress.yaml`| +| Dashboard provisioning | Automatically loads `deploy/llm-router-dashboard.json` into Grafana | `grafana/configmap-dashboard.yaml`| + +Prometheus is configured to discover the `semantic-router-metrics` service (port `9190`) automatically. Grafana provisions the same LLM Router dashboard that ships with the Docker Compose stack. + +### 1. Prerequisites + +- Deployed Semantic Router workload via `deploy/kubernetes/` +- A Kubernetes cluster (managed, on-prem, or kind) +- `kubectl` v1.23+ +- Optional: an ingress controller (NGINX, ALB, etc.) if you want external access + +### 2. Directory Layout + +``` +deploy/kubernetes/observability/ +├── README.md +├── kustomization.yaml # (created in the next step) +├── ingress.yaml # optional HTTPS ingress examples +├── prometheus/ +│ ├── configmap.yaml # Scrape config (Kubernetes SD) +│ ├── deployment.yaml +│ ├── pvc.yaml +│ ├── rbac.yaml # SA + ClusterRole + binding +│ └── service.yaml +└── grafana/ + ├── configmap-dashboard.yaml # Bundled LLM router dashboard + ├── configmap-provisioning.yaml # Datasource + provider config + ├── deployment.yaml + ├── pvc.yaml + ├── secret.yaml # Admin credentials (override in prod) + └── service.yaml ``` -Ensure the metrics Service carries a label like `service: metrics`. (It does in the provided manifests.) +### 3. Prometheus Configuration Highlights -### 3.2 Plain Prometheus Static Scrape +- Uses `kubernetes_sd_configs` to enumerate endpoints in `vllm-semantic-router-system` +- Keeps 15 days of metrics by default (`--storage.tsdb.retention.time=15d`) +- Stores metrics in a `PersistentVolumeClaim` named `prometheus-data` +- RBAC rules grant read-only access to Services, Endpoints, Pods, Nodes, and EndpointSlices + +#### Scrape configuration snippet ```yaml scrape_configs: - job_name: semantic-router kubernetes_sd_configs: - role: endpoints + namespaces: + names: + - vllm-semantic-router-system relabel_configs: - source_labels: [__meta_kubernetes_service_name] regex: semantic-router-metrics action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + regex: metrics + action: keep ``` -### 3.3 Port Forward for Spot Checks +Modify the namespace or service name if you changed them in your primary deployment. + +### 4. 
Grafana Configuration Highlights + +- Stateful deployment backed by the `grafana-storage` PVC +- Datasource provisioned automatically pointing to `http://prometheus:9090` +- Dashboard provider watches `/var/lib/grafana-dashboards` +- Bundled `llm-router-dashboard.json` is identical to `deploy/llm-router-dashboard.json` +- Admin credentials pulled from the `grafana-admin` secret (default `admin/admin` – **change this!)** + +#### Updating credentials ```bash -kubectl -n semantic-router port-forward svc/semantic-router-metrics 9190:9190 -curl -s localhost:9190/metrics | head +kubectl create secret generic grafana-admin \ + --namespace vllm-semantic-router-system \ + --from-literal=admin-user=monitor \ + --from-literal=admin-password='pick-a-strong-password' \ + --dry-run=client -o yaml | kubectl apply -f - ``` -### 3.4 Grafana Dashboard Provision +Remove or overwrite the committed `secret.yaml` when you adopt a different secret management approach. -If using kube-prometheus-stack or a Grafana sidecar: +### 5. Deployment Steps -```yaml -apiVersion: v1 -kind: ConfigMap -metadata: - name: semantic-router-dashboard - namespace: semantic-router - labels: - grafana_dashboard: "1" -data: - llm-router-dashboard.json: | - # Paste JSON from deploy/llm-router-dashboard.json +#### 5.1. Create the Kustomization + +Create `deploy/kubernetes/observability/kustomization.yaml` (see below) to assemble all manifests. This guide assumes you keep Prometheus & Grafana in the same namespace as the router. + +#### 5.2. Apply manifests + +```bash +kubectl apply -k deploy/kubernetes/observability/ ``` -Otherwise import the JSON manually in Grafana UI. +Verify pods: ---- +```bash +kubectl get pods -n vllm-semantic-router-system +``` + +You should see `prometheus-...` and `grafana-...` pods in `Running` state. + +#### 5.3. Integration with the core deployment + +1. Deploy or update Semantic Router (`kubectl apply -k deploy/kubernetes/`). +2. Deploy observability stack (`kubectl apply -k deploy/kubernetes/observability/`). +3. Confirm the metrics service (`semantic-router-metrics`) has endpoints: + + ```bash + kubectl get endpoints semantic-router-metrics -n vllm-semantic-router-system + ``` + +4. Prometheus target should transition to **UP** within ~15 seconds. + +#### 5.4. Accessing the UIs + +> **Optional Ingress** – If you prefer to keep the stack private, delete `ingress.yaml` from `kustomization.yaml` before applying. + +- **Port-forward (quick check)** + + ```bash + kubectl port-forward svc/prometheus 9090:9090 -n vllm-semantic-router-system + kubectl port-forward svc/grafana 3000:3000 -n vllm-semantic-router-system + ``` + + Prometheus → http://localhost:9090, Grafana → http://localhost:3000 + +- **Ingress (production)** – Customize `ingress.yaml` with real domains, TLS secrets, and your ingress class before applying. Replace `*.example.com` and configure HTTPS certificates via cert-manager or your provider. + +### 6. Verifying Metrics Collection + +1. Open Prometheus (port-forward or ingress) → **Status ▸ Targets** → ensure `semantic-router` job is green. +2. Query `rate(llm_model_completion_tokens_total[5m])` – should return data after traffic. +3. Open Grafana, log in with the admin credentials, and confirm the **LLM Router Metrics** dashboard exists under the *Semantic Router* folder. +4. Generate traffic to Semantic Router (classification or routing requests). 
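+
+   For example, assuming an OpenAI-compatible entry point for the router has been port-forwarded to `localhost:8801` (the port and the `auto` model name are illustrative placeholders — substitute your own gateway address and model names):
+
+   ```bash
+   curl -s http://localhost:8801/v1/chat/completions \
+     -H "Content-Type: application/json" \
+     -d '{"model": "auto", "messages": [{"role": "user", "content": "What is the derivative of x^2?"}]}'
+   ```
+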
Key panels should start populating: + - Prompt Category counts + - Token usage rate per model + - Routing modifications between models + - Latency histograms (TTFT, completion p95) + +### 7. Dashboard Customization + +- Duplicate the provisioned dashboard inside Grafana to make changes while keeping the original as a template. +- Update Grafana provisioning (`grafana/configmap-provisioning.yaml`) to point to alternate folders or add new providers. +- Add additional dashboards by extending `grafana/configmap-dashboard.yaml` or mounting a different ConfigMap. +- Incorporate Kubernetes cluster metrics (CPU/memory) by adding another datasource or deploying kube-state-metrics + node exporters. + +### 8. Best Practices + +#### Resource Sizing + +- Prometheus: increase CPU/memory with higher scrape cardinality or retention > 15 days. +- Grafana: start with `500m` CPU / `1Gi` RAM; scale replicas horizontally when concurrent viewers exceed a few dozen. + +#### Storage + +- Use SSD-backed storage classes for Prometheus when retention/window is large. +- Increase `prometheus/pvc.yaml` (default 20Gi) and `grafana/pvc.yaml` (default 10Gi) to match retention requirements. +- Enable volume snapshots or backups for dashboards and alert history. + +#### Security + +- Replace the demo `grafana-admin` secret with credentials stored in your preferred secret manager. +- Restrict ingress access with network policies, OAuth proxies, or SSO integrations. +- Enable Grafana role-based access control and API keys for automation. +- Scope Prometheus RBAC to only the namespaces you need. If metrics run in multiple namespaces, list them in the scrape config. + +#### Maintenance + +- Monitor Prometheus disk usage; prune retention or scale PVC before it fills up. +- Back up Grafana dashboards or store them in Git (already done through this ConfigMap). +- Roll upgrades separately: update Prometheus and Grafana images via `kustomization.yaml` patches. +- Consider adopting the Prometheus Operator (`ServiceMonitor` + `PodMonitor`) if you already run kube-prometheus-stack. A sample `ServiceMonitor` is in `website/docs/tutorials/observability/observability.md`. + +### 9. Troubleshooting + +| Symptom | Checks | Fix | +|---------|--------|-----| +| Prometheus target **DOWN** | `kubectl get endpoints semantic-router-metrics -n vllm-semantic-router-system` | Ensure the Semantic Router deployment is running and the service labels match `app=semantic-router`, `service=metrics` | +| Grafana dashboard empty | **Configuration → Data Sources** | Confirm Prometheus datasource URL resolves and the Prometheus service is reachable | +| Login fails | `kubectl get secret grafana-admin -o yaml` | Update the secret to match the credentials you expect | +| PVC Pending | `kubectl describe pvc prometheus-data` | Provide a storage class via `storageClassName`, or provision storage manually | +| Ingress 404 | `kubectl describe ingress grafana` | Update hostnames, TLS secrets, and ensure ingress controller is installed | + +### 10. Next Steps + +- Configure alerts for critical metrics (Prometheus alerting rules + Alertmanager) +- Add log aggregation (Loki, Elasticsearch, or Cloud-native logging) +- Automate stack deployment through CI/CD pipelines using `kubectl apply -k` + +With this observability stack in place, you can track Semantic Router health, routing accuracy, latency distributions, and usage trends across any Kubernetes environment. ## 4. 
Key Metrics (Sample) From 9405f476851406bc2d0a52bb5999f01a8e951447 Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Tue, 30 Sep 2025 19:35:50 +0800 Subject: [PATCH 3/6] get rig of redudent part in doc Signed-off-by: JaredforReal --- .../tutorials/observability/observability.md | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/website/docs/tutorials/observability/observability.md b/website/docs/tutorials/observability/observability.md index 82756cca..e8b5168c 100644 --- a/website/docs/tutorials/observability/observability.md +++ b/website/docs/tutorials/observability/observability.md @@ -233,24 +233,6 @@ You should see `prometheus-...` and `grafana-...` pods in `Running` state. - Roll upgrades separately: update Prometheus and Grafana images via `kustomization.yaml` patches. - Consider adopting the Prometheus Operator (`ServiceMonitor` + `PodMonitor`) if you already run kube-prometheus-stack. A sample `ServiceMonitor` is in `website/docs/tutorials/observability/observability.md`. -### 9. Troubleshooting - -| Symptom | Checks | Fix | -|---------|--------|-----| -| Prometheus target **DOWN** | `kubectl get endpoints semantic-router-metrics -n vllm-semantic-router-system` | Ensure the Semantic Router deployment is running and the service labels match `app=semantic-router`, `service=metrics` | -| Grafana dashboard empty | **Configuration → Data Sources** | Confirm Prometheus datasource URL resolves and the Prometheus service is reachable | -| Login fails | `kubectl get secret grafana-admin -o yaml` | Update the secret to match the credentials you expect | -| PVC Pending | `kubectl describe pvc prometheus-data` | Provide a storage class via `storageClassName`, or provision storage manually | -| Ingress 404 | `kubectl describe ingress grafana` | Update hostnames, TLS secrets, and ensure ingress controller is installed | - -### 10. Next Steps - -- Configure alerts for critical metrics (Prometheus alerting rules + Alertmanager) -- Add log aggregation (Loki, Elasticsearch, or Cloud-native logging) -- Automate stack deployment through CI/CD pipelines using `kubectl apply -k` - -With this observability stack in place, you can track Semantic Router health, routing accuracy, latency distributions, and usage trends across any Kubernetes environment. - ## 4. Key Metrics (Sample) | Metric | Type | Description | From 5249c10c9e701c40d697a012b2104468dd987e36 Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Wed, 1 Oct 2025 11:41:52 +0800 Subject: [PATCH 4/6] add comments of 472 and 65534 Signed-off-by: JaredforReal --- deploy/kubernetes/observability/grafana/deployment.yaml | 2 +- deploy/kubernetes/observability/prometheus/deployment.yaml | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/deploy/kubernetes/observability/grafana/deployment.yaml b/deploy/kubernetes/observability/grafana/deployment.yaml index 0b3bfdcc..e69f111c 100644 --- a/deploy/kubernetes/observability/grafana/deployment.yaml +++ b/deploy/kubernetes/observability/grafana/deployment.yaml @@ -15,7 +15,7 @@ spec: app: grafana spec: securityContext: - runAsNonRoot: true + # Run as non-root user 472 (grafana) and set fsGroup for volume permissions. 
runAsUser: 472 fsGroup: 472 containers: diff --git a/deploy/kubernetes/observability/prometheus/deployment.yaml b/deploy/kubernetes/observability/prometheus/deployment.yaml index a9815d01..ef5e1653 100644 --- a/deploy/kubernetes/observability/prometheus/deployment.yaml +++ b/deploy/kubernetes/observability/prometheus/deployment.yaml @@ -17,6 +17,7 @@ spec: serviceAccountName: prometheus securityContext: runAsNonRoot: true + # Run as user 'nobody' and group 'nobody' for enhanced security runAsUser: 65534 fsGroup: 65534 containers: From 496a8bd2dc818170f691dfc792892dda9ca38ef6 Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Wed, 1 Oct 2025 11:49:17 +0800 Subject: [PATCH 5/6] add network tips of k8s Signed-off-by: JaredforReal --- website/docs/troubleshooting/network-tips.md | 31 +++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/website/docs/troubleshooting/network-tips.md b/website/docs/troubleshooting/network-tips.md index 88610311..4820cc31 100644 --- a/website/docs/troubleshooting/network-tips.md +++ b/website/docs/troubleshooting/network-tips.md @@ -174,7 +174,36 @@ docker compose -f docker-compose.yml -f docker-compose.override.yml up -d docker compose -f docker-compose.yml -f docker-compose.override.yml --profile testing up -d ``` -## 5. Troubleshooting +## 5. Kubernetes clusters with limited egress + +Container runtimes on Kubernetes nodes do not automatically reuse the host Docker daemon settings. When registries are slow or blocked, pods can sit in `ImagePullBackOff`. Pick one or combine several of these mitigations: + +### 5.1 Configure containerd or CRI mirrors + +- For clusters backed by containerd (Kind, k3s, kubeadm), edit `/etc/containerd/config.toml` or use Kind’s `containerdConfigPatches` to add regional mirror endpoints for registries such as `docker.io`, `ghcr.io`, or `quay.io`. +- Restart containerd and kubelet after changes so the new mirrors take effect. +- Avoid pointing mirrors to loopback proxies unless every node can reach that proxy address. + +### 5.2 Preload or sideload images + +- Build required images locally, then push them into the cluster runtime. For Kind, run `kind load docker-image --name `; for other clusters, use `crictl pull` or `ctr -n k8s.io images import` on each node. +- Patch deployments to set `imagePullPolicy: IfNotPresent` when you know the image already exists on the node. + +### 5.3 Publish to an accessible registry + +- Tag and push images to a registry that is reachable from the cluster (cloud provider registry, privately hosted Harbor, etc.). +- Update your `kustomization.yaml` or Helm values with the new image name, and configure `imagePullSecrets` if the registry requires authentication. + +### 5.4 Run a local pull-through cache + +- Start a registry proxy (`registry:2` or vendor-specific cache) inside the same network, configure it as a mirror in containerd, and regularly warm it with the images you need. + +### 5.5 Verify after adjustments + +- Use `kubectl describe pod ` or `kubectl get events` to confirm pull errors disappear. +- Check that services such as `semantic-router-metrics` now expose endpoints and respond via port-forward (`kubectl port-forward svc/ :`). + +## 6. Troubleshooting - Go modules still time out: - Verify `GOPROXY` and `GOSUMDB` are present in the go-builder stage logs. 
From 790c919897b900b1695af33ed0588ff436bb3056 Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Wed, 1 Oct 2025 12:22:04 +0800 Subject: [PATCH 6/6] update uid in dashboard Signed-off-by: JaredforReal --- deploy/llm-router-dashboard.json | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/deploy/llm-router-dashboard.json b/deploy/llm-router-dashboard.json index 350ebf84..4abc5e51 100644 --- a/deploy/llm-router-dashboard.json +++ b/deploy/llm-router-dashboard.json @@ -30,7 +30,7 @@ { "datasource": { "type": "prometheus", - "uid": "febzoy4cplt6oe" + "uid": "prometheus" }, "fieldConfig": { "defaults": { @@ -90,7 +90,7 @@ { "datasource": { "type": "prometheus", - "uid": "febzoy4cplt6oe" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", @@ -110,7 +110,7 @@ { "datasource": { "type": "prometheus", - "uid": "febzoy4cplt6oe" + "uid": "prometheus" }, "fieldConfig": { "defaults": { @@ -193,7 +193,7 @@ { "datasource": { "type": "prometheus", - "uid": "febzoy4cplt6oe" + "uid": "prometheus" }, "editorMode": "code", "expr": "sum(rate(llm_model_completion_tokens_total[5m])) by (model)", @@ -208,7 +208,7 @@ { "datasource": { "type": "prometheus", - "uid": "febzoy4cplt6oe" + "uid": "prometheus" }, "fieldConfig": { "defaults": { @@ -291,7 +291,7 @@ { "datasource": { "type": "prometheus", - "uid": "febzoy4cplt6oe" + "uid": "prometheus" }, "editorMode": "code", "expr": "sum(rate(llm_model_routing_modifications_total[5m])) by (source_model, target_model)", @@ -307,7 +307,7 @@ { "datasource": { "type": "prometheus", - "uid": "febzoy4cplt6oe" + "uid": "prometheus" }, "fieldConfig": { "defaults": { @@ -394,7 +394,7 @@ { "datasource": { "type": "prometheus", - "uid": "febzoy4cplt6oe" + "uid": "prometheus" }, "editorMode": "code", "expr": "histogram_quantile(0.95, sum(rate(llm_model_completion_latency_seconds_bucket[5m])) by (le, model))", @@ -409,7 +409,7 @@ { "datasource": { "type": "prometheus", - "uid": "febzoy4cplt6oe" + "uid": "prometheus" }, "fieldConfig": { "defaults": { @@ -496,7 +496,7 @@ { "datasource": { "type": "prometheus", - "uid": "febzoy4cplt6oe" + "uid": "prometheus" }, "editorMode": "code", "expr": "histogram_quantile(0.95, sum(rate(llm_model_ttft_seconds_bucket[5m])) by (le, model))", @@ -511,7 +511,7 @@ { "datasource": { "type": "prometheus", - "uid": "febzoy4cplt6oe" + "uid": "prometheus" }, "fieldConfig": { "defaults": { @@ -594,7 +594,7 @@ { "datasource": { "type": "prometheus", - "uid": "febzoy4cplt6oe" + "uid": "prometheus" }, "editorMode": "code", "expr": "histogram_quantile(0.95, sum(rate(llm_model_tpot_seconds_bucket[5m])) by (le, model))", @@ -618,7 +618,7 @@ { "current": { "text": "prometheus", - "value": "febzoy4cplt6oe" + "value": "prometheus" }, "includeAll": false, "name": "DS_PROMETHEUS", @@ -640,4 +640,4 @@ "uid": "llm-router-metrics", "version": 14, "weekStart": "" -} +} \ No newline at end of file