diff --git a/README.md b/README.md index 5a0576355c..9b0cc366be 100644 --- a/README.md +++ b/README.md @@ -101,9 +101,8 @@ To coordinate across a data center, Dynamo relies on etcd and NATS. To run Dynam To quickly setup etcd & NATS, you can also run: -``` +```bash # At the root of the repository: -# Edit deploy/docker-compose.yml to comment out "runtime: nvidia" of the dcgm-exporter service if the nvidia container runtime isn't deployed or to be used. docker compose -f deploy/docker-compose.yml up -d ``` diff --git a/deploy/docker-compose.yml b/deploy/docker-compose.yml index 2b19741f7b..31ded423ae 100644 --- a/deploy/docker-compose.yml +++ b/deploy/docker-compose.yml @@ -1,26 +1,13 @@ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# IMPORT NOTE: Make sure this is in sync with lib/runtime/docker-compose.yml +# Bare minimum infrastructure services for Dynamo. +# For observability (metrics, tracing, dashboards), use docker-observability.yml + networks: server: driver: bridge - monitoring: - driver: bridge -# Note that the images are pinned to specific versions to avoid breaking changes. services: nats-server: image: nats:2.11.4 @@ -31,7 +18,6 @@ services: - 8222:8222 # the endpoints include /varz, /healthz, ... networks: - server - - monitoring etcd-server: image: bitnamilegacy/etcd:3.6.1 @@ -42,108 +28,3 @@ services: - 2380:2380 networks: - server - - monitoring - - # All the services below are part of the metrics profile and monitoring network. - - # The exporter translates from /varz and other stats to Prometheus metrics - nats-prometheus-exporter: - image: natsio/prometheus-nats-exporter:0.17.3 - command: ["-varz", "-connz", "-routez", "-subz", "-gatewayz", "-leafz", "-jsz=all", "http://nats-server:8222"] - ports: - - 7777:7777 - networks: - - monitoring - profiles: [metrics] - depends_on: - - nats-server - - # DCGM stands for Data Center GPU Manager: https://developer.nvidia.com/dcgm - # dcgm-exporter is a tool from NVIDIA that exposes DCGM metrics in Prometheus format. - dcgm-exporter: - image: nvidia/dcgm-exporter:4.2.3-4.1.3-ubi9 - ports: - # Expose dcgm-exporter on port 9401 both inside and outside the container - # to avoid conflicts with other dcgm-exporter instances in distributed environments. 
- # To access DCGM metrics: - # Outside the container: curl http://localhost:9401/metrics (or the host IP) - # Inside the container (container-to-container): curl http://dcgm-exporter:9401/metrics - - 9401:9401 - cap_add: - - SYS_ADMIN - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: all - capabilities: [gpu] - environment: - # dcgm uses NVIDIA_VISIBLE_DEVICES variable but normally it is CUDA_VISIBLE_DEVICES - - NVIDIA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-all} - - DCGM_EXPORTER_LISTEN=:9401 - runtime: nvidia # Specify the NVIDIA runtime - networks: - - monitoring - - # To access Prometheus from another machine, you may need to disable te firewall on your host. On Ubuntu: - # sudo ufw allow 9090/tcp - prometheus: - image: prom/prometheus:v3.4.1 - container_name: prometheus - volumes: - - ./metrics/prometheus.yml:/etc/prometheus/prometheus.yml - command: - - '--config.file=/etc/prometheus/prometheus.yml' - - '--storage.tsdb.path=/prometheus' - # These provide the web console functionality - - '--web.console.libraries=/etc/prometheus/console_libraries' - - '--web.console.templates=/etc/prometheus/consoles' - - '--web.enable-lifecycle' - restart: unless-stopped - # Example to pull from the /query endpoint: - # {__name__=~"DCGM.*", job="dcgm-exporter"} - networks: - - monitoring - ports: - - "9090:9090" - profiles: [metrics] - extra_hosts: - - "host.docker.internal:host-gateway" - depends_on: - - dcgm-exporter - - nats-prometheus-exporter - - etcd-server - - # grafana connects to prometheus via the /query endpoint. - # Default credentials are dynamo/dynamo. - # To access Grafana from another machine, you may need to disable te firewall on your host. On Ubuntu: - # sudo ufw allow 3001/tcp - grafana: - image: grafana/grafana-enterprise:12.0.1 - container_name: grafana - volumes: - - ./metrics/grafana_dashboards:/etc/grafana/provisioning/dashboards - - ./metrics/grafana-datasources.yml:/etc/grafana/provisioning/datasources/datasources.yml - environment: - - GF_SERVER_HTTP_PORT=3001 - # do not make it admin/admin, because you will be prompted to change the password every time - - GF_SECURITY_ADMIN_USER=dynamo - - GF_SECURITY_ADMIN_PASSWORD=dynamo - - GF_USERS_ALLOW_SIGN_UP=false - - GF_INSTALL_PLUGINS=grafana-piechart-panel - # Default min interval is 5s, but can be configured lower - - GF_DASHBOARDS_MIN_REFRESH_INTERVAL=2s - # Disable password change requirement - - GF_SECURITY_DISABLE_INITIAL_ADMIN_CREATION=false - - GF_SECURITY_ADMIN_PASSWORD_POLICY=false - - GF_AUTH_DISABLE_LOGIN_FORM=false - - GF_AUTH_DISABLE_SIGNOUT_MENU=false - restart: unless-stopped - ports: - - "3001:3001" - networks: - - monitoring - profiles: [metrics] - depends_on: - - prometheus diff --git a/deploy/docker-observability.yml b/deploy/docker-observability.yml new file mode 100644 index 0000000000..b8e57aa6c3 --- /dev/null +++ b/deploy/docker-observability.yml @@ -0,0 +1,137 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Observability stack for Dynamo: metrics, tracing, and visualization. +# Requires deploy/docker-compose.yml to be running for NATS and etcd connectivity. 
+# +# Usage: +# docker compose -f deploy/docker-observability.yml up -d + +version: '3.8' + +networks: + server: + external: true + name: deploy_server + +volumes: + grafana-data: + tempo-data: + +services: + # DCGM stands for Data Center GPU Manager: https://developer.nvidia.com/dcgm + # dcgm-exporter is a tool from NVIDIA that exposes DCGM metrics in Prometheus format. + dcgm-exporter: + image: nvidia/dcgm-exporter:4.2.3-4.1.3-ubi9 + ports: + # Expose dcgm-exporter on port 9401 both inside and outside the container + # to avoid conflicts with other dcgm-exporter instances in distributed environments. + # To access DCGM metrics: + # Outside the container: curl http://localhost:9401/metrics (or the host IP) + # Inside the container (container-to-container): curl http://dcgm-exporter:9401/metrics + - 9401:9401 + cap_add: + - SYS_ADMIN + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + environment: + # dcgm uses NVIDIA_VISIBLE_DEVICES variable but normally it is CUDA_VISIBLE_DEVICES + - NVIDIA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-all} + - DCGM_EXPORTER_LISTEN=:9401 + runtime: nvidia # Specify the NVIDIA runtime + networks: + - server + + # The exporter translates from /varz and other stats to Prometheus metrics + nats-prometheus-exporter: + image: natsio/prometheus-nats-exporter:0.17.3 + command: ["-varz", "-connz", "-routez", "-subz", "-gatewayz", "-leafz", "-jsz=all", "http://nats-server:8222"] + ports: + - 7777:7777 + networks: + - server + + # To access Prometheus from another machine, you may need to disable the firewall on your host. On Ubuntu: + # sudo ufw allow 9090/tcp + prometheus: + image: prom/prometheus:v3.4.1 + container_name: prometheus + volumes: + - ./observability/prometheus.yml:/etc/prometheus/prometheus.yml + command: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.path=/prometheus' + # These provide the web console functionality + - '--web.console.libraries=/etc/prometheus/console_libraries' + - '--web.console.templates=/etc/prometheus/consoles' + - '--web.enable-lifecycle' + restart: unless-stopped + # Example to pull from the /query endpoint: + # {__name__=~"DCGM.*", job="dcgm-exporter"} + ports: + - "9090:9090" + networks: + - server + extra_hosts: + - "host.docker.internal:host-gateway" + depends_on: + - dcgm-exporter + - nats-prometheus-exporter + + # Tempo - Distributed tracing backend + tempo: + image: grafana/tempo:2.8.2 + command: [ "-config.file=/etc/tempo.yaml" ] + user: root + volumes: + - ./observability/tempo.yaml:/etc/tempo.yaml + - tempo-data:/tmp/tempo + ports: + - "3200:3200" # Tempo HTTP + - "4317:4317" # OTLP gRPC receiver (accessible from host) + - "4318:4318" # OTLP HTTP receiver (accessible from host) + networks: + - server + + # Grafana - Visualization and dashboards + # Supports both Prometheus (metrics) and Tempo (tracing) datasources + # Default credentials: dynamo/dynamo + # To access Grafana from another machine, you may need to disable the firewall on your host. 
On Ubuntu: + # sudo ufw allow 3000/tcp + grafana: + image: grafana/grafana:12.2.0 + container_name: grafana + volumes: + - grafana-data:/var/lib/grafana + - ./observability/grafana_dashboards:/etc/grafana/provisioning/dashboards + - ./observability/grafana-datasources.yml:/etc/grafana/provisioning/datasources/prometheus.yml + - ./observability/tempo-datasource.yml:/etc/grafana/provisioning/datasources/tempo.yml + environment: + - GF_SERVER_HTTP_PORT=3000 + # do not make it admin/admin, because you will be prompted to change the password every time + - GF_SECURITY_ADMIN_USER=dynamo + - GF_SECURITY_ADMIN_PASSWORD=dynamo + - GF_USERS_ALLOW_SIGN_UP=false + - GF_FEATURE_TOGGLES_ENABLE=traceqlEditor + - GF_INSTALL_PLUGINS=grafana-piechart-panel + # Default min interval is 5s, but can be configured lower + - GF_DASHBOARDS_MIN_REFRESH_INTERVAL=2s + # Disable password change requirement + - GF_SECURITY_DISABLE_INITIAL_ADMIN_CREATION=false + - GF_SECURITY_ADMIN_PASSWORD_POLICY=false + - GF_AUTH_DISABLE_LOGIN_FORM=false + - GF_AUTH_DISABLE_SIGNOUT_MENU=false + restart: unless-stopped + ports: + - "3000:3000" + networks: + - server + depends_on: + - prometheus + - tempo + diff --git a/deploy/metrics/grafana-datasources.yml b/deploy/observability/grafana-datasources.yml similarity index 100% rename from deploy/metrics/grafana-datasources.yml rename to deploy/observability/grafana-datasources.yml diff --git a/deploy/metrics/grafana_dashboards/grafana-dashboard-providers.yml b/deploy/observability/grafana_dashboards/grafana-dashboard-providers.yml similarity index 100% rename from deploy/metrics/grafana_dashboards/grafana-dashboard-providers.yml rename to deploy/observability/grafana_dashboards/grafana-dashboard-providers.yml diff --git a/deploy/metrics/grafana_dashboards/grafana-dcgm-metrics.json b/deploy/observability/grafana_dashboards/grafana-dcgm-metrics.json similarity index 100% rename from deploy/metrics/grafana_dashboards/grafana-dcgm-metrics.json rename to deploy/observability/grafana_dashboards/grafana-dcgm-metrics.json diff --git a/deploy/metrics/grafana_dashboards/grafana-dynamo-dashboard.json b/deploy/observability/grafana_dashboards/grafana-dynamo-dashboard.json similarity index 99% rename from deploy/metrics/grafana_dashboards/grafana-dynamo-dashboard.json rename to deploy/observability/grafana_dashboards/grafana-dynamo-dashboard.json index 76b822c6f9..1ef1abc7c1 100644 --- a/deploy/metrics/grafana_dashboards/grafana-dynamo-dashboard.json +++ b/deploy/observability/grafana_dashboards/grafana-dynamo-dashboard.json @@ -1020,7 +1020,7 @@ }, "timepicker": {}, "timezone": "browser", - "title": "Dynamo Dashboard", + "title": "Dynamo Dashboard (generic)", "uid": "97ae8df9-138a-4f7a-9b0f-635b77d818fe", "version": 1 } \ No newline at end of file diff --git a/deploy/metrics/grafana_dashboards/grafana-kvbm-dashboard.json b/deploy/observability/grafana_dashboards/grafana-kvbm-dashboard.json similarity index 100% rename from deploy/metrics/grafana_dashboards/grafana-kvbm-dashboard.json rename to deploy/observability/grafana_dashboards/grafana-kvbm-dashboard.json diff --git a/deploy/metrics/k8s/README.md b/deploy/observability/k8s/README.md similarity index 100% rename from deploy/metrics/k8s/README.md rename to deploy/observability/k8s/README.md diff --git a/deploy/metrics/k8s/frontend-podmonitor.yaml b/deploy/observability/k8s/frontend-podmonitor.yaml similarity index 100% rename from deploy/metrics/k8s/frontend-podmonitor.yaml rename to deploy/observability/k8s/frontend-podmonitor.yaml diff --git 
a/deploy/metrics/k8s/grafana-dynamo-dashboard-configmap.yaml b/deploy/observability/k8s/grafana-dynamo-dashboard-configmap.yaml similarity index 99% rename from deploy/metrics/k8s/grafana-dynamo-dashboard-configmap.yaml rename to deploy/observability/k8s/grafana-dynamo-dashboard-configmap.yaml index 0c4ed0c011..ee1088556b 100644 --- a/deploy/metrics/k8s/grafana-dynamo-dashboard-configmap.yaml +++ b/deploy/observability/k8s/grafana-dynamo-dashboard-configmap.yaml @@ -1002,7 +1002,7 @@ data: }, "timepicker": {}, "timezone": "browser", - "title": "Dynamo Dashboard", + "title": "Dynamo Dashboard (generic)", "uid": "dynamo-dashboard", "version": 1 } diff --git a/deploy/logging/README.md b/deploy/observability/k8s/logging/README.md similarity index 75% rename from deploy/logging/README.md rename to deploy/observability/k8s/logging/README.md index 2423989d99..85634e5273 100644 --- a/deploy/logging/README.md +++ b/deploy/observability/k8s/logging/README.md @@ -1,3 +1,3 @@ # Dynamo Logging on Kubernetes -For detailed documentation on collecting and visualizing logs on Kubernetes, see [docs/kubernetes/observability/logging.md](../../docs/kubernetes/observability/logging.md). +For detailed documentation on collecting and visualizing logs on Kubernetes, see [docs/kubernetes/observability/logging.md](../../../../docs/kubernetes/observability/logging.md). diff --git a/deploy/logging/grafana/dashboard.json b/deploy/observability/k8s/logging/grafana/dashboard.json similarity index 100% rename from deploy/logging/grafana/dashboard.json rename to deploy/observability/k8s/logging/grafana/dashboard.json diff --git a/deploy/logging/grafana/logging-dashboard.yaml b/deploy/observability/k8s/logging/grafana/logging-dashboard.yaml similarity index 100% rename from deploy/logging/grafana/logging-dashboard.yaml rename to deploy/observability/k8s/logging/grafana/logging-dashboard.yaml diff --git a/deploy/logging/grafana/loki-datasource.yaml b/deploy/observability/k8s/logging/grafana/loki-datasource.yaml similarity index 100% rename from deploy/logging/grafana/loki-datasource.yaml rename to deploy/observability/k8s/logging/grafana/loki-datasource.yaml diff --git a/deploy/logging/values/alloy-values.yaml b/deploy/observability/k8s/logging/values/alloy-values.yaml similarity index 100% rename from deploy/logging/values/alloy-values.yaml rename to deploy/observability/k8s/logging/values/alloy-values.yaml diff --git a/deploy/logging/values/loki-values.yaml b/deploy/observability/k8s/logging/values/loki-values.yaml similarity index 100% rename from deploy/logging/values/loki-values.yaml rename to deploy/observability/k8s/logging/values/loki-values.yaml diff --git a/deploy/metrics/k8s/planner-podmonitor.yaml b/deploy/observability/k8s/planner-podmonitor.yaml similarity index 100% rename from deploy/metrics/k8s/planner-podmonitor.yaml rename to deploy/observability/k8s/planner-podmonitor.yaml diff --git a/deploy/metrics/k8s/worker-podmonitor.yaml b/deploy/observability/k8s/worker-podmonitor.yaml similarity index 100% rename from deploy/metrics/k8s/worker-podmonitor.yaml rename to deploy/observability/k8s/worker-podmonitor.yaml diff --git a/deploy/metrics/prometheus.yml b/deploy/observability/prometheus.yml similarity index 100% rename from deploy/metrics/prometheus.yml rename to deploy/observability/prometheus.yml diff --git a/deploy/tracing/grafana/provisioning/datasources/tempo.yaml b/deploy/observability/tempo-datasource.yml similarity index 96% rename from deploy/tracing/grafana/provisioning/datasources/tempo.yaml rename 
to deploy/observability/tempo-datasource.yml index 388c461371..14efa7c770 100644 --- a/deploy/tracing/grafana/provisioning/datasources/tempo.yaml +++ b/deploy/observability/tempo-datasource.yml @@ -9,7 +9,7 @@ datasources: access: proxy url: http://tempo:3200 uid: tempo - isDefault: true + isDefault: false editable: true jsonData: httpMethod: GET diff --git a/deploy/tracing/tempo.yaml b/deploy/observability/tempo.yaml similarity index 100% rename from deploy/tracing/tempo.yaml rename to deploy/observability/tempo.yaml diff --git a/deploy/tracing/docker-compose.yml b/deploy/tracing/docker-compose.yml deleted file mode 100644 index 16a5f0657d..0000000000 --- a/deploy/tracing/docker-compose.yml +++ /dev/null @@ -1,35 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -version: '3.8' - -services: - # Tempo - Distributed tracing backend - tempo: - image: grafana/tempo:2.8.2 - command: [ "-config.file=/etc/tempo.yaml" ] - volumes: - - ./tempo.yaml:/etc/tempo.yaml - - tempo-data:/tmp/tempo - ports: - - "3200:3200" # Tempo HTTP - - "4317:4317" # OTLP gRPC receiver (accessible from host) - - "4318:4318" # OTLP HTTP receiver (accessible from host) - - # Grafana - Visualization and dashboards - grafana: - image: grafana/grafana:12.2.0 - ports: - - "3000:3000" - environment: - - GF_SECURITY_ADMIN_PASSWORD=admin - - GF_FEATURE_TOGGLES_ENABLE=traceqlEditor - volumes: - - grafana-data:/var/lib/grafana - - ./grafana/provisioning:/etc/grafana/provisioning - depends_on: - - tempo - -volumes: - tempo-data: - grafana-data: diff --git a/docs/_sections/observability.rst b/docs/_sections/observability.rst index f91973e7d1..c1b108c975 100644 --- a/docs/_sections/observability.rst +++ b/docs/_sections/observability.rst @@ -4,6 +4,10 @@ Observability .. toctree:: :hidden: + Overview <../observability/README> + Prometheus + Grafana Setup <../observability/prometheus-grafana> Metrics <../observability/metrics> + Metrics Developer Guide <../observability/metrics-developer-guide> + Health Checks <../observability/health-checks> + Tracing <../observability/tracing> Logging <../observability/logging> - Health Checks <../observability/health-checks> \ No newline at end of file diff --git a/docs/kubernetes/observability/logging.md b/docs/kubernetes/observability/logging.md index 0784cf05c7..95c0785bef 100644 --- a/docs/kubernetes/observability/logging.md +++ b/docs/kubernetes/observability/logging.md @@ -46,7 +46,7 @@ helm repo add grafana https://grafana.github.io/helm-charts helm repo update # Install Loki -helm install --values deploy/logging/values/loki-values.yaml loki grafana/loki -n $MONITORING_NAMESPACE +helm install --values deploy/observability/k8s/logging/values/loki-values.yaml loki grafana/loki -n $MONITORING_NAMESPACE ``` Our configuration (`loki-values.yaml`) sets up Loki in a simple configuration that is suitable for testing and development. It uses a local MinIO for storage. 
The installation pods can be viewed with: @@ -60,7 +60,7 @@ Next, install the Grafana Alloy collector to gather logs from your Kubernetes cl ```bash # Generate a custom values file with the namespace information -envsubst < deploy/logging/values/alloy-values.yaml > alloy-custom-values.yaml +envsubst < deploy/observability/k8s/logging/values/alloy-values.yaml > alloy-custom-values.yaml # Install the collector helm install --values alloy-custom-values.yaml alloy grafana/k8s-monitoring -n $MONITORING_NAMESPACE @@ -110,10 +110,10 @@ Since we are using Grafana with the Prometheus Operator, we can simply apply the ```bash # Configure Grafana with the Loki datasource -envsubst < deploy/logging/grafana/loki-datasource.yaml | kubectl apply -n $MONITORING_NAMESPACE -f - +envsubst < deploy/observability/k8s/logging/grafana/loki-datasource.yaml | kubectl apply -n $MONITORING_NAMESPACE -f - # Configure Grafana with the Dynamo Logs dashboard -envsubst < deploy/logging/grafana/logging-dashboard.yaml | kubectl apply -n $MONITORING_NAMESPACE -f - +envsubst < deploy/observability/k8s/logging/grafana/logging-dashboard.yaml | kubectl apply -n $MONITORING_NAMESPACE -f - ``` > [!Note] @@ -141,4 +141,4 @@ kubectl port-forward svc/prometheus-grafana 3000:80 -n $MONITORING_NAMESPACE If everything is working, under Home > Dashboards > Dynamo Logs, you should see a dashboard that can be used to view the logs associated with our DynamoGraphDeployments -The dashboard enables filtering by DynamoGraphDeployment, namespace, and component type (e.g frontend, worker, etc). \ No newline at end of file +The dashboard enables filtering by DynamoGraphDeployment, namespace, and component type (e.g., frontend, worker, etc.). diff --git a/docs/kubernetes/observability/metrics.md b/docs/kubernetes/observability/metrics.md index e03ec3efeb..f8d6f8696b 100644 --- a/docs/kubernetes/observability/metrics.md +++ b/docs/kubernetes/observability/metrics.md @@ -128,9 +128,7 @@ spec: Apply the Dynamo dashboard configuration to populate Grafana with the Dynamo dashboard: ```bash -pushd deploy/metrics/k8s -kubectl apply -n monitoring -f grafana-dynamo-dashboard-configmap.yaml -popd +kubectl apply -n monitoring -f deploy/observability/k8s/grafana-dynamo-dashboard-configmap.yaml ``` The dashboard is embedded in the ConfigMap. Since it is labeled with `grafana_dashboard: "1"`, the Grafana will discover and populate it to its list of available dashboards. The dashboard includes panels for: diff --git a/docs/observability/README.md b/docs/observability/README.md new file mode 100644 index 0000000000..12c71c335e --- /dev/null +++ b/docs/observability/README.md @@ -0,0 +1,32 @@ + + +# Dynamo Observability + +## Quick Start + +For a quick start guide to get Prometheus and Grafana running with Dynamo on a single machine, see [Prometheus + Grafana Setup](prometheus-grafana.md). 
+ +## Observability Documentation + +| Guide | Description | Environment Variables to Control | +|-------|-------------|----------------------------------| +| [Metrics](metrics.md) | Available metrics reference | `DYN_SYSTEM_ENABLED`†, `DYN_SYSTEM_PORT`† | +| [Health Checks](health-checks.md) | Component health monitoring and readiness probes | `DYN_SYSTEM_ENABLED`†, `DYN_SYSTEM_PORT`†, `DYN_SYSTEM_STARTING_HEALTH_STATUS`, `DYN_SYSTEM_HEALTH_PATH`, `DYN_SYSTEM_LIVE_PATH`, `DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS` | +| [Tracing](tracing.md) | Distributed tracing with OpenTelemetry and Tempo | `DYN_LOGGING_JSONL`†, `OTEL_EXPORT_ENABLED`†, `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT`†, `OTEL_SERVICE_NAME`† | +| [Logging](logging.md) | Structured logging configuration | `DYN_LOGGING_JSONL`†, `DYN_LOG`, `DYN_LOG_USE_LOCAL_TZ`, `DYN_LOGGING_CONFIG_PATH`, `OTEL_SERVICE_NAME`†, `OTEL_EXPORT_ENABLED`†, `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT`† | + +**Variables marked with † are shared across multiple observability systems.** + +## Developer Guides + +| Guide | Description | Environment Variables to Control | +|-------|-------------|----------------------------------| +| [Metrics Developer Guide](metrics-developer-guide.md) | Creating custom metrics in Rust and Python | `DYN_SYSTEM_ENABLED`†, `DYN_SYSTEM_PORT`† | + +## Kubernetes + +For Kubernetes-specific setup and configuration, see [docs/kubernetes/observability/](../kubernetes/observability/). + diff --git a/docs/observability/health-checks.md b/docs/observability/health-checks.md index 9e77f3202b..2213b2bc10 100644 --- a/docs/observability/health-checks.md +++ b/docs/observability/health-checks.md @@ -11,6 +11,41 @@ Dynamo provides health check and liveness HTTP endpoints for each component whic can be used to configure startup, liveness and readiness probes in orchestration frameworks such as Kubernetes. +## Environment Variables + +| Variable | Description | Default | Example | +|----------|-------------|---------|---------| +| `DYN_SYSTEM_ENABLED` | Enable system status server | `false` | `true` | +| `DYN_SYSTEM_PORT` | System status server port | `8081` | `9090` | +| `DYN_SYSTEM_STARTING_HEALTH_STATUS` | Initial health status | `notready` | `ready`, `notready` | +| `DYN_SYSTEM_HEALTH_PATH` | Custom health endpoint path | `/health` | `/custom/health` | +| `DYN_SYSTEM_LIVE_PATH` | Custom liveness endpoint path | `/live` | `/custom/live` | +| `DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS` | Endpoints required for ready state | none | `["generate"]` | + +## Getting Started (Single GPU) + +Enable health checks and query endpoints: + +```bash +# Enable system status server +export DYN_SYSTEM_ENABLED=true +export DYN_SYSTEM_PORT=8081 + +# Start your Dynamo components +python -m dynamo.frontend --http-port 8000 & +python -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager & +``` + +Check health status: + +```bash +# Frontend health (port 8000) +curl -s localhost:8000/health | jq + +# Worker health (port 8081) +curl -s localhost:8081/health | jq +``` + ## Frontend Liveness Check The frontend liveness endpoint reports a status of `live` as long as @@ -124,17 +159,6 @@ when initializing and HTTP status code `HTTP/1.1 200 OK` once ready. 
> **Note**: Both /live and /ready return the same information -### Environment Variables for Enabling Health Checks - -| **Environment Variable** | **Description** | **Example Settings** | -| -------------------------| ------------------- | ------------------------------------------------ | -| `DYN_SYSTEM_ENABLED` | Enables the system status server. | `true`, `false` | -| `DYN_SYSTEM_PORT` | Specifies the port for the system status server. | `9090` | -| `DYN_SYSTEM_STARTING_HEALTH_STATUS` | Sets the initial health status of the system (ready/not ready). | `ready`, `notready` | -| `DYN_SYSTEM_HEALTH_PATH` | Custom path for the health endpoint. | `/custom/health` | -| `DYN_SYSTEM_LIVE_PATH` | Custom path for the liveness endpoint. | `/custom/live` | -| `DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS` | Specifies endpoints to check for determining overall system health status. | `["generate"]` | - ### Example Environment Setting ``` diff --git a/docs/observability/logging.md b/docs/observability/logging.md index fac8ff910d..8b811d0649 100644 --- a/docs/observability/logging.md +++ b/docs/observability/logging.md @@ -24,18 +24,32 @@ JSONL is enabled logs additionally contain `span` creation and exit events as well as support for `trace_id` and `span_id` fields for distributed tracing. -## Environment Variables for configuring Logging +## Environment Variables -| Environment Variable | Description | Example Settings | -| ----------------------------------- | --------------------------------------------| ---------------------------------------------------- | -| `DYN_LOGGING_JSONL` | Enable JSONL logging format (default: READABLE) | `DYN_LOGGING_JSONL=true` | -| `DYN_LOG_USE_LOCAL_TZ` | Use local timezone for logging timestamps (default: UTC) | `DYN_LOG_USE_LOCAL_TZ=1` | -| `DYN_LOG` | Log levels per target `,=,=` | `DYN_LOG=info,dynamo_runtime::system_status_server:trace` | -| `DYN_LOGGING_CONFIG_PATH` | Path to custom TOML logging configuration file | `DYN_LOGGING_CONFIG_PATH=/path/to/config.toml`| -| `OTEL_SERVICE_NAME` | Service name for OpenTelemetry traces (default: `dynamo`) | `OTEL_SERVICE_NAME=dynamo-frontend` | -| `OTEL_EXPORT_ENABLED` | Enable OTLP trace exporting (set to `1` to enable) | `OTEL_EXPORT_ENABLED=1` | -| `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT` | OTLP exporter endpoint (default: http://localhost:4317) | `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=http://tempo:4317` | +| Variable | Description | Default | Example | +|----------|-------------|---------|---------| +| `DYN_LOGGING_JSONL` | Enable JSONL logging format | `false` | `true` | +| `DYN_LOG` | Log level: `info` or `debug` | `info` | `debug` | +| `DYN_LOG_USE_LOCAL_TZ` | Use local timezone for timestamps | `false` | `true` | +| `DYN_LOGGING_CONFIG_PATH` | Path to custom TOML logging configuration | none | `/path/to/config.toml` | +| `OTEL_SERVICE_NAME` | Service name for OpenTelemetry traces | `dynamo` | `dynamo-frontend` | +| `OTEL_EXPORT_ENABLED` | Enable OTLP trace exporting | `false` | `true` | +| `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT` | OTLP exporter endpoint | `http://localhost:4317` | `http://tempo:4317` | +## Getting Started + +Enable structured JSONL logging: + +```bash +export DYN_LOGGING_JSONL=true +export DYN_LOG=debug + +# Start your Dynamo components +python -m dynamo.frontend --http-port 8000 & +python -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager & +``` + +Logs will be written to stderr in JSONL format with trace context. 
## Available Logging Levels @@ -85,68 +99,55 @@ Resulting Log format: {"time":"2025-09-02T15:53:31.943747Z","level":"INFO","target":"log","message":"Scheduler config values: {'max_num_seqs': 256, 'max_num_batched_tokens': 2048}","log.file":"/opt/dynamo/venv/lib/python3.12/site-packages/dynamo/vllm/main.py","log.line":268,"log.target":"main.get_engine_cache_info"} ``` -## OpenTelemetry Distributed Tracing - -When `DYN_LOGGING_JSONL` is enabled, Dynamo uses OpenTelemetry for distributed tracing. All logs include `trace_id` and `span_id` fields, and spans are automatically created for requests. By default, traces are **not exported**. To export traces to an observability backend (like Tempo, Jaeger, or Zipkin), set `OTEL_EXPORT_ENABLED=1`. +## Logging of OpenTelemetry Tracing -### Behavior +When `DYN_LOGGING_JSONL` is enabled, Dynamo uses OpenTelemetry for distributed tracing. All logs include `trace_id` and `span_id` fields, and spans are automatically created for requests. This is useful for short debugging sessions where you want to examine trace context in logs without setting up a full tracing backend. -- **With `DYN_LOGGING_JSONL=true` only**: OpenTelemetry layer is active, generating trace context and span IDs for all requests. Traces appear in logs but are not exported anywhere. -- **With `OTEL_EXPORT_ENABLED=1` and `DYN_LOGGING_JSONL=true`**: Same as above, plus traces are exported to an OTLP collector for visualization. +**Note:** This section has overlap with [Distributed Tracing with Tempo](tracing.md) since OpenTelemetry has aspects of both logging and tracing. For trace visualization in Grafana Tempo and persistent trace analysis, see [Distributed Tracing with Tempo](tracing.md). -### Configuration +### Configuration for Logging -To enable OTLP trace exporting: - -1. Set `OTEL_EXPORT_ENABLED=1` to enable trace export -2. Optionally configure the endpoint using `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT` (default: `http://localhost:4317`) -3. Optionally set `OTEL_SERVICE_NAME` to identify the service (useful in Kubernetes, default: `dynamo`) - -**Export Settings:** -- **Protocol**: gRPC (Tonic) -- **Service Name**: Value of `OTEL_SERVICE_NAME` env var, or `dynamo` if not set -- **Endpoint**: Value of `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT` env var, or `http://localhost:4317` if not set - -### Example: JSONL Logging Only (No Export) +To see OpenTelemetry trace information in logs: ```bash export DYN_LOGGING_JSONL=true -# OpenTelemetry is active, traces appear in logs, but nothing is exported +export DYN_LOG=debug # Set to debug to see detailed trace logs + +# Start your Dynamo components (e.g., frontend and worker) +python -m dynamo.frontend --http-port 8000 & +python -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager & ``` -### Example: JSONL Logging + Trace Export to Tempo +This enables JSONL logging with `trace_id` and `span_id` fields. Traces appear in logs but are not exported to any backend. 
+ +### Example Request + +Send a request to generate logs with trace context: ```bash -export DYN_LOGGING_JSONL=true -export OTEL_EXPORT_ENABLED=1 -export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=http://tempo:4317 -export OTEL_SERVICE_NAME=dynamo-frontend -# OpenTelemetry is active, traces appear in logs AND are exported to Tempo +curl -H 'Content-Type: application/json' \ +-H 'x-request-id: test-trace-001' \ +-d '{ + "model": "Qwen/Qwen3-0.6B", + "max_completion_tokens": 100, + "messages": [ + {"role": "user", "content": "What is the capital of France?"} + ] +}' \ +http://localhost:8000/v1/chat/completions ``` -## Trace and Span Information +Check the logs (stderr) for JSONL output containing `trace_id`, `span_id`, and `x_request_id` fields. -### Example Request +## Trace and Span Information in Logs -```sh -curl -X POST http://localhost:8000/v1/chat/completions \ - -H 'Content-Type: application/json' \ - -d '{ - "model": "Qwen/Qwen3-0.6B", - "messages": [ - { - "role": "user", - "content": "Explain why Roger Federer is considered one of the greatest tennis players of all time" - } - ], - "stream": true, - "max_tokens": 1000, - }' -``` +This section shows how OpenTelemetry trace and span information appears in JSONL logs. These logs can be used to understand request flows even without a trace visualization backend. + +### Example Disaggregated Trace in Grafana When viewing the corresponding trace in Grafana, you should be able to see something like the following: -![Trace Example](./grafana-disagg-trace.png) +![Disaggregated Trace Example](grafana-disagg-trace.png) ### Trace Overview @@ -208,7 +209,7 @@ When viewing the corresponding trace in Grafana, you should be able to see somet | **Busy Time** | 3,795,258 ns (3.79ms) | | **Idle Time** | 3,996,532,471 ns (3.99s) | -### Frontend Logs +### Frontend Logs with Trace Context The following shows the JSONL logs from the frontend service for the same request. Note the `trace_id` field (`b672ccf48683b392891c5cb4163d4b51`) that correlates all logs for this request, and the `span_id` field that identifies individual operations: @@ -220,7 +221,7 @@ The following shows the JSONL logs from the frontend service for the same reques {"time":"2025-10-31T20:52:10.745545Z","level":"DEBUG","file":"/opt/dynamo/lib/runtime/src/pipeline/network/tcp/server.rs","line":230,"target":"dynamo_runtime::pipeline::network::tcp::server","message":"Registering new TcpStream on 10.0.4.65:41959","method":"POST","span_id":"5c20cc08e6afb2b7","span_name":"http-request","trace_id":"b672ccf48683b392891c5cb4163d4b51","uri":"/v1/chat/completions","version":"HTTP/1.1"} ``` -## Custom Request IDs +## Custom Request IDs in Logs You can provide a custom request ID using the `x-request-id` header. This ID will be attached to all spans and logs for that request, making it easier to correlate traces with application-level request tracking. @@ -238,7 +239,7 @@ curl -X POST http://localhost:8000/v1/chat/completions \ "content": "Explain why Roger Federer is considered one of the greatest tennis players of all time" } ], - "stream": true, + "stream": false, "max_tokens": 1000 }' ``` diff --git a/docs/observability/metrics-developer-guide.md b/docs/observability/metrics-developer-guide.md new file mode 100644 index 0000000000..c07d235751 --- /dev/null +++ b/docs/observability/metrics-developer-guide.md @@ -0,0 +1,270 @@ + + +# Metrics Developer Guide + +This guide explains how to create and use custom metrics in Dynamo components using the Dynamo metrics API. 
+ +## Metrics Exposure + +All metrics created via the Dynamo metrics API are automatically exposed on the `/metrics` HTTP endpoint in Prometheus Exposition Format text when the following environment variables are set: + +- `DYN_SYSTEM_ENABLED=true` - Enable the system metrics server +- `DYN_SYSTEM_PORT=` - Port for the metrics endpoint (default: `8081`) + +Example: +```bash +DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8081 python -m dynamo.vllm --model +``` + +Prometheus Exposition Format text metrics will be available at: `http://localhost:8081/metrics` + +## Metric Name Constants + +The [prometheus_names.rs](../../lib/runtime/src/metrics/prometheus_names.rs) module provides centralized metric name constants and sanitization functions to ensure consistency across all Dynamo components. + +--- + +## Metrics API in Rust + +The metrics API is accessible through the `.metrics()` method on runtime, namespace, component, and endpoint objects. See [Runtime Hierarchy](metrics.md#runtime-hierarchy) for details on the hierarchical structure. + +### Available Methods + +- `.metrics().create_counter()`: Create a counter metric +- `.metrics().create_gauge()`: Create a gauge metric +- `.metrics().create_histogram()`: Create a histogram metric +- `.metrics().create_countervec()`: Create a counter with labels +- `.metrics().create_gaugevec()`: Create a gauge with labels +- `.metrics().create_histogramvec()`: Create a histogram with labels + +### Creating Metrics + +```rust +use dynamo_runtime::DistributedRuntime; + +let runtime = DistributedRuntime::new()?; +let endpoint = runtime.namespace("my_namespace").component("my_component").endpoint("my_endpoint"); + +// Simple metrics +let requests_total = endpoint.metrics().create_counter( + "requests_total", + "Total requests", + &[] +)?; + +let active_connections = endpoint.metrics().create_gauge( + "active_connections", + "Active connections", + &[] +)?; + +let latency = endpoint.metrics().create_histogram( + "latency_seconds", + "Request latency", + &[], + Some(vec![0.001, 0.01, 0.1, 1.0, 10.0]) +)?; +``` + +### Using Metrics + +```rust +// Counters +requests_total.inc(); + +// Gauges +active_connections.set(42.0); +active_connections.inc(); +active_connections.dec(); + +// Histograms +latency.observe(0.023); // 23ms +``` + +### Vector Metrics with Labels + +```rust +// Create vector metrics with label names +let requests_by_model = endpoint.metrics().create_countervec( + "requests_by_model", + "Requests by model", + &["model_type", "model_size"], + &[] +)?; + +let memory_by_gpu = endpoint.metrics().create_gaugevec( + "gpu_memory_bytes", + "GPU memory by device", + &["gpu_id", "memory_type"], + &[] +)?; + +// Use with specific label values +requests_by_model.with_label_values(&["llama", "7b"]).inc(); +memory_by_gpu.with_label_values(&["0", "allocated"]).set(8192.0); +``` + +### Advanced Features + +**Custom histogram buckets:** +```rust +let latency = endpoint.metrics().create_histogram( + "latency_seconds", + "Request latency", + &[], + Some(vec![0.001, 0.01, 0.1, 1.0, 10.0]) +)?; +``` + +**Constant labels:** +```rust +let counter = endpoint.metrics().create_counter( + "requests_total", + "Total requests", + &[("region", "us-west"), ("env", "prod")] +)?; +``` + +--- + +## Metrics API in Python + +Python components can create and manage Prometheus metrics using the same metrics API through Python bindings. 
+ +### Available Methods + +- `endpoint.metrics.create_counter()` / `create_intcounter()`: Create a counter metric +- `endpoint.metrics.create_gauge()` / `create_intgauge()`: Create a gauge metric +- `endpoint.metrics.create_histogram()`: Create a histogram metric +- `endpoint.metrics.create_countervec()` / `create_intcountervec()`: Create a counter with labels +- `endpoint.metrics.create_gaugevec()` / `create_intgaugevec()`: Create a gauge with labels +- `endpoint.metrics.create_histogramvec()`: Create a histogram with labels + +All metrics are imported from `dynamo.prometheus_metrics`. + +### Creating Metrics + +```python +from dynamo.runtime import DistributedRuntime + +drt = DistributedRuntime() +endpoint = drt.namespace("my_namespace").component("my_component").endpoint("my_endpoint") + +# Simple metrics +requests_total = endpoint.metrics.create_intcounter( + "requests_total", + "Total requests" +) + +active_connections = endpoint.metrics.create_intgauge( + "active_connections", + "Active connections" +) + +latency = endpoint.metrics.create_histogram( + "latency_seconds", + "Request latency", + buckets=[0.001, 0.01, 0.1, 1.0, 10.0] +) +``` + +### Using Metrics + +```python +# Counters +requests_total.inc() +requests_total.inc_by(5) + +# Gauges +active_connections.set(42) +active_connections.inc() +active_connections.dec() + +# Histograms +latency.observe(0.023) # 23ms +``` + +### Vector Metrics with Labels + +```python +# Create vector metrics with label names +requests_by_model = endpoint.metrics.create_intcountervec( + "requests_by_model", + "Requests by model", + ["model_type", "model_size"] +) + +memory_by_gpu = endpoint.metrics.create_intgaugevec( + "gpu_memory_bytes", + "GPU memory by device", + ["gpu_id", "memory_type"] +) + +# Use with specific label values +requests_by_model.inc({"model_type": "llama", "model_size": "7b"}) +memory_by_gpu.set(8192, {"gpu_id": "0", "memory_type": "allocated"}) +``` + +### Advanced Features + +**Constant labels:** +```python +counter = endpoint.metrics.create_intcounter( + "requests_total", + "Total requests", + [("region", "us-west"), ("env", "prod")] +) +``` + +**Metric introspection:** +```python +print(counter.name()) # "my_namespace_my_component_my_endpoint_requests_total" +print(counter.const_labels()) # {"dynamo_namespace": "my_namespace", ...} +print(gauge_vec.variable_labels()) # ["model_type", "model_size"] +``` + +**Update patterns:** + +Background thread updates: +```python +import threading +import time + +def update_loop(): + while True: + active_connections.set(compute_current_connections()) + time.sleep(2) + +threading.Thread(target=update_loop, daemon=True).start() +``` + +Callback-based updates (called before each `/metrics` scrape): +```python +def update_metrics(): + active_connections.set(compute_current_connections()) + +endpoint.metrics.register_callback(update_metrics) +``` + +### Examples + +Example scripts: [lib/bindings/python/examples/metrics/](../../lib/bindings/python/examples/metrics/) + +```bash +cd ~/dynamo/lib/bindings/python/examples/metrics +DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8081 ./server_with_loop.py +DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8081 ./server_with_callback.py +``` + +--- + +## Related Documentation + +- [Metrics Overview](metrics.md) +- [Prometheus and Grafana Setup](prometheus-grafana.md) +- [Distributed Runtime Architecture](../design_docs/distributed_runtime.md) +- [Python Metrics Examples](../../lib/bindings/python/examples/metrics/) + diff --git a/docs/observability/metrics.md 
b/docs/observability/metrics.md index 7e2beb34c5..e947285545 100644 --- a/docs/observability/metrics.md +++ b/docs/observability/metrics.md @@ -3,27 +3,88 @@ SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All SPDX-License-Identifier: Apache-2.0 --> -# Dynamo MetricsRegistry +# Dynamo Metrics ## Overview -Dynamo provides built-in metrics capabilities through the `MetricsRegistry` trait, which is automatically available whenever you use the `DistributedRuntime` framework. This guide explains how to use metrics for observability and monitoring across all Dynamo components. +Dynamo provides built-in metrics capabilities through the Dynamo metrics API, which is automatically available whenever you use the `DistributedRuntime` framework. This document serves as a reference for all available metrics in Dynamo. -## Automatic Metrics +**For visualization setup instructions**, see the [Prometheus and Grafana Setup Guide](prometheus-grafana.md). -Dynamo automatically exposes metrics with the `dynamo_` name prefixes. It also adds the following labels `dynamo_namespace`, `dynamo_component`, and `dynamo_endpoint` to indicate which component is providing the metric. +**For creating custom metrics**, see the [Metrics Developer Guide](metrics-developer-guide.md). -**Frontend Metrics**: When using Dynamo HTTP Frontend (`--framework VLLM` or `--framework TRTLLM`), these metrics are automatically exposed with the `dynamo_frontend_*` prefix and include `model` labels containing the model name. These cover request handling, token processing, and latency measurements. See [prometheus-grafana.md](prometheus-grafana.md#available-metrics) for the complete list of frontend metrics. +## Environment Variables -**Component Metrics**: The core Dynamo backend system automatically exposes metrics with the `dynamo_component_*` prefix for all components that use the `DistributedRuntime` framework. These include request counts, processing times, byte transfers, and system uptime metrics. See [prometheus-grafana.md](prometheus-grafana.md#available-metrics) for the complete list of component metrics. +| Variable | Description | Default | Example | +|----------|-------------|---------|---------| +| `DYN_SYSTEM_ENABLED` | Enable system metrics/health server | `false` | `true` | +| `DYN_SYSTEM_PORT` | System metrics/health port | `8081` | `9090` | -**Specialized Component Metrics**: Components can also expose additional metrics specific to their functionality. For example, a `preprocessor` component exposes metrics with the `dynamo_preprocessor_*` prefix. See [prometheus-grafana.md](prometheus-grafana.md#available-metrics) for details on specialized component metrics. +## Getting Started (Single GPU) -**Kubernetes Integration**: For comprehensive Kubernetes deployment and monitoring setup, see the [Kubernetes Metrics Guide](../kubernetes/observability/metrics.md). This includes Prometheus Operator setup, metrics collection configuration, and visualization in Grafana. +**Note:** This requires NATS and etcd running. For a complete setup with Prometheus and Grafana visualization, see the [Prometheus and Grafana Setup Guide](prometheus-grafana.md). 
-## Metrics Hierarchy +Launch a frontend and vLLM backend to test metrics: -The `MetricsRegistry` trait is implemented by `DistributedRuntime`, `Namespace`, `Component`, and `Endpoint`, providing a hierarchical approach to metric collection that matches Dynamo's distributed architecture: +```bash +$ python -m dynamo.frontend --http-port 8000 + +# Enable system metrics server +export DYN_SYSTEM_ENABLED=true +export DYN_SYSTEM_PORT=8081 + +$ python -m dynamo.vllm --model Qwen/Qwen3-0.6B \ + --enforce-eager --no-enable-prefix-caching --max-num-seqs 3 +``` + +Wait for the vLLM worker to start, then send requests and check metrics: + +```bash +# Send a request +curl -H 'Content-Type: application/json' \ +-d '{ + "model": "Qwen/Qwen3-0.6B", + "max_completion_tokens": 100, + "messages": [{"role": "user", "content": "Hello"}] +}' \ +http://localhost:8000/v1/chat/completions + +# Check metrics from the worker +curl -s localhost:8081/metrics | grep dynamo_component +``` + +## Exposed Metrics + +Dynamo exposes metrics in Prometheus Exposition Format text at the `/metrics` HTTP endpoint. All Dynamo-generated metrics use the `dynamo_*` prefix and include labels (`dynamo_namespace`, `dynamo_component`, `dynamo_endpoint`) to identify the source component. + +**Example Prometheus Exposition Format text:** + +``` +# HELP dynamo_component_requests_total Total requests processed +# TYPE dynamo_component_requests_total counter +dynamo_component_requests_total{dynamo_namespace="default",dynamo_component="worker",dynamo_endpoint="generate"} 42 + +# HELP dynamo_component_request_duration_seconds Request processing time +# TYPE dynamo_component_request_duration_seconds histogram +dynamo_component_request_duration_seconds_bucket{dynamo_namespace="default",dynamo_component="worker",dynamo_endpoint="generate",le="0.005"} 10 +dynamo_component_request_duration_seconds_bucket{dynamo_namespace="default",dynamo_component="worker",dynamo_endpoint="generate",le="0.01"} 15 +dynamo_component_request_duration_seconds_bucket{dynamo_namespace="default",dynamo_component="worker",dynamo_endpoint="generate",le="+Inf"} 42 +dynamo_component_request_duration_seconds_sum{dynamo_namespace="default",dynamo_component="worker",dynamo_endpoint="generate"} 2.5 +dynamo_component_request_duration_seconds_count{dynamo_namespace="default",dynamo_component="worker",dynamo_endpoint="generate"} 42 +``` + +### Metric Categories + +Dynamo exposes several categories of metrics: + +- **Frontend Metrics** (`dynamo_frontend_*`) - Request handling, token processing, and latency measurements +- **Component Metrics** (`dynamo_component_*`) - Request counts, processing times, byte transfers, and system uptime +- **Specialized Component Metrics** (e.g., `dynamo_preprocessor_*`) - Component-specific metrics +- **Engine Metrics** (Pass-through) - Backend engines expose their own metrics: [vLLM](../backends/vllm/prometheus.md) (`vllm:*`), [SGLang](../backends/sglang/prometheus.md) (`sglang:*`), [TensorRT-LLM](../backends/trtllm/prometheus.md) (`trtllm:*`) + +## Runtime Hierarchy + +The Dynamo metrics API is available on `DistributedRuntime`, `Namespace`, `Component`, and `Endpoint`, providing a hierarchical approach to metric collection that matches Dynamo's distributed architecture: - `DistributedRuntime`: Global metrics across the entire runtime - `Namespace`: Metrics scoped to a specific dynamo_namespace @@ -32,65 +93,116 @@ The `MetricsRegistry` trait is implemented by `DistributedRuntime`, `Namespace`, This hierarchical structure allows you to create 
metrics at the appropriate level of granularity for your monitoring needs. +## Available Metrics -## Getting Started +### Backend Component Metrics -For a complete setup guide including Docker Compose configuration, Prometheus setup, and Grafana dashboards, see the [Getting Started section](prometheus-grafana.md#getting-started) in the Prometheus and Grafana guide. +The core Dynamo backend system automatically exposes metrics with the `dynamo_component_*` prefix for all components that use the `DistributedRuntime` framework: -The quick start includes: -- Docker Compose setup for Prometheus and Grafana -- Pre-configured dashboards and datasources -- Access URLs for all monitoring endpoints -- GPU targeting configuration +- `dynamo_component_inflight_requests`: Requests currently being processed (gauge) +- `dynamo_component_request_bytes_total`: Total bytes received in requests (counter) +- `dynamo_component_request_duration_seconds`: Request processing time (histogram) +- `dynamo_component_requests_total`: Total requests processed (counter) +- `dynamo_component_response_bytes_total`: Total bytes sent in responses (counter) +- `dynamo_component_system_uptime_seconds`: DistributedRuntime uptime (gauge) -## Implementation Examples +### KV Router Statistics (kvstats) -Examples of creating metrics at different hierarchy levels and using dynamic labels are included in this document below. +KV router statistics are automatically exposed by LLM workers and KV router components with the `dynamo_component_kvstats_*` prefix. These metrics provide insights into GPU memory usage and cache efficiency: -### Grafana Dashboards +- `dynamo_component_kvstats_active_blocks`: Number of active KV cache blocks currently in use (gauge) +- `dynamo_component_kvstats_total_blocks`: Total number of KV cache blocks available (gauge) +- `dynamo_component_kvstats_gpu_cache_usage_percent`: GPU cache usage as a percentage (0.0-1.0) (gauge) +- `dynamo_component_kvstats_gpu_prefix_cache_hit_rate`: GPU prefix cache hit rate as a percentage (0.0-1.0) (gauge) -Use dashboards in `deploy/metrics/grafana_dashboards/`: -- `grafana-dynamo-dashboard.json`: General Dynamo dashboard -- `grafana-dcgm-metrics.json`: DCGM GPU metrics dashboard +These metrics are published by: +- **LLM Workers**: vLLM and TRT-LLM backends publish these metrics through their respective publishers +- **KV Router**: The KV router component aggregates and exposes these metrics for load balancing decisions -## Metrics Visualization Architecture +### Specialized Component Metrics -### Service Topology +Some components expose additional metrics specific to their functionality: -The metrics system follows this architecture for collecting and visualizing metrics: +- `dynamo_preprocessor_*`: Metrics specific to preprocessor components -```mermaid -graph TD - BROWSER[Browser] -->|:3001| GRAFANA[Grafana :3001] - subgraph DockerComposeNetwork [Network inside Docker Compose] - NATS_PROM_EXP[nats-prom-exp :7777 /metrics] -->|:8222/varz| NATS_SERVER[nats-server :4222, :6222, :8222] - PROMETHEUS[Prometheus server :9090] -->|:2379/metrics| ETCD_SERVER[etcd-server :2379, :2380] - PROMETHEUS -->|:9401/metrics| DCGM_EXPORTER[dcgm-exporter :9401] - PROMETHEUS -->|:7777/metrics| NATS_PROM_EXP - PROMETHEUS -->|:8000/metrics| DYNAMOFE[Dynamo HTTP FE :8000] - PROMETHEUS -->|:8081/metrics| DYNAMOBACKEND[Dynamo backend :8081] - DYNAMOFE --> DYNAMOBACKEND - GRAFANA -->|:9090/query API| PROMETHEUS - end -``` +### Frontend Metrics + +When using Dynamo HTTP Frontend (`--framework VLLM` 
or `--framework TRTLLM`), these metrics are automatically exposed with the `dynamo_frontend_*` prefix and include `model` labels containing the model name: + +- `dynamo_frontend_inflight_requests`: Inflight requests (gauge) +- `dynamo_frontend_queued_requests`: Number of requests in HTTP processing queue (gauge) +- `dynamo_frontend_input_sequence_tokens`: Input sequence length (histogram) +- `dynamo_frontend_inter_token_latency_seconds`: Inter-token latency (histogram) +- `dynamo_frontend_output_sequence_tokens`: Output sequence length (histogram) +- `dynamo_frontend_request_duration_seconds`: LLM request duration (histogram) +- `dynamo_frontend_requests_total`: Total LLM requests (counter) +- `dynamo_frontend_time_to_first_token_seconds`: Time to first token (histogram) + +**Note**: The `dynamo_frontend_inflight_requests` metric tracks requests from HTTP handler start until the complete response is finished, while `dynamo_frontend_queued_requests` tracks requests from HTTP handler start until first token generation begins (including prefill time). HTTP queue time is a subset of inflight time. + +#### Model Configuration Metrics + +The frontend also exposes model configuration metrics with the `dynamo_frontend_model_*` prefix. These metrics are populated from the worker backend registration service when workers register with the system: -### Grafana Dashboard +**Runtime Config Metrics (from ModelRuntimeConfig):** +These metrics come from the runtime configuration provided by worker backends during registration. -The metrics system includes a pre-configured Grafana dashboard for visualizing service metrics: +- `dynamo_frontend_model_total_kv_blocks`: Total KV blocks available for a worker serving the model (gauge) +- `dynamo_frontend_model_max_num_seqs`: Maximum number of sequences for a worker serving the model (gauge) +- `dynamo_frontend_model_max_num_batched_tokens`: Maximum number of batched tokens for a worker serving the model (gauge) -![Grafana Dynamo Dashboard](./grafana-dynamo-composite.png) +**MDC Metrics (from ModelDeploymentCard):** +These metrics come from the Model Deployment Card information provided by worker backends during registration. Note that when multiple worker instances register with the same model name, only the first instance's configuration metrics (runtime config and MDC metrics) will be populated. Subsequent instances with duplicate model names will be skipped for configuration metric updates, though the worker count metric will reflect all instances. -## Detailed Setup Guide +- `dynamo_frontend_model_context_length`: Maximum context length for a worker serving the model (gauge) +- `dynamo_frontend_model_kv_cache_block_size`: KV cache block size for a worker serving the model (gauge) +- `dynamo_frontend_model_migration_limit`: Request migration limit for a worker serving the model (gauge) -For complete setup instructions including Docker Compose, Prometheus configuration, and Grafana dashboards, see: +**Worker Management Metrics:** +- `dynamo_frontend_model_workers`: Number of worker instances currently serving the model (gauge) -```{toctree} -:hidden: +### Request Processing Flow -prometheus-grafana +This section explains the distinction between two key metrics used to track request processing: + +1. **Inflight**: Tracks requests from HTTP handler start until the complete response is finished +2. 
**HTTP Queue**: Tracks requests from HTTP handler start until first token generation begins (including prefill time) + +**Example Request Flow:** ``` +curl -s localhost:8000/v1/completions -H "Content-Type: application/json" -d '{ + "model": "Qwen/Qwen3-0.6B", + "prompt": "Hello let's talk about LLMs", + "stream": false, + "max_tokens": 1000 +}' +``` + +**Timeline:** +``` +Timeline: 0, 1, ... +Client ────> Frontend:8000 ────────────────────> Dynamo component/backend (vLLM, SGLang, TRT) + │request start │received │ + | | | + │ ├──> start prefill ──> first token ──> |last token + │ │ (not impl) | | + ├─────actual HTTP queue¹ ──────────┘ │ | + │ │ │ + ├─────implemented HTTP queue ─────────────────────────────┘ | + │ │ + └─────────────────────────────────── Inflight ────────────────────────────┘ +``` + +**Concurrency Example:** +Suppose the backend allows 3 concurrent requests and there are 10 clients continuously hitting the frontend: +- All 10 requests will be counted as inflight (from start until complete response) +- 7 requests will be in HTTP queue most of the time +- 3 requests will be actively processed (between first token and last token) -- [Prometheus and Grafana Setup Guide](prometheus-grafana.md) +**Key Differences:** +- **Inflight**: Measures total request lifetime including processing time +- **HTTP Queue**: Measures queuing time before processing begins (including prefill time) +- **HTTP Queue ≤ Inflight** (HTTP queue is a subset of inflight time) ## Related Documentation diff --git a/docs/observability/prometheus-grafana.md b/docs/observability/prometheus-grafana.md index 6c6bcec60c..8789f888e4 100644 --- a/docs/observability/prometheus-grafana.md +++ b/docs/observability/prometheus-grafana.md @@ -1,254 +1,134 @@ # Metrics Visualization with Prometheus and Grafana -This directory contains configuration for visualizing metrics from the metrics aggregation service using Prometheus and Grafana. - -> [!NOTE] -> For detailed information about Dynamo's metrics system, including hierarchical metrics, automatic labeling, and usage examples, see the [Metrics Guide](./metrics.md). - ## Overview -### Components - -- **Prometheus Server**: Collects and stores metrics from Dynamo services and other components. -- **Grafana**: Provides dashboards by querying the Prometheus Server. - -### Topology - -Default Service Relationship Diagram: -```mermaid -graph TD - BROWSER[Browser] -->|:3001| GRAFANA[Grafana :3001] - subgraph DockerComposeNetwork [Network inside Docker Compose] - NATS_PROM_EXP[nats-prom-exp :7777 /metrics] -->|:8222/varz| NATS_SERVER[nats-server :4222, :6222, :8222] - PROMETHEUS[Prometheus server :9090] -->|:2379/metrics| ETCD_SERVER[etcd-server :2379, :2380] - PROMETHEUS -->|:9401/metrics| DCGM_EXPORTER[dcgm-exporter :9401] - PROMETHEUS -->|:7777/metrics| NATS_PROM_EXP - PROMETHEUS -->|:8000/metrics| DYNAMOFE[Dynamo HTTP FE :8000] - PROMETHEUS -->|:8081/metrics| DYNAMOBACKEND[Dynamo backend :8081] - DYNAMOFE --> DYNAMOBACKEND - GRAFANA -->|:9090/query API| PROMETHEUS - end -``` - -The dcgm-exporter service in the Docker Compose network is configured to use port 9401 instead of the default port 9400. This adjustment is made to avoid port conflicts with other dcgm-exporter instances that may be running simultaneously. Such a configuration is typical in distributed systems like SLURM. - -As of Q2 2025, Dynamo HTTP Frontend metrics are exposed when you build containers with `--framework VLLM` or `--framework TRTLLM`. 
- -### Available Metrics - -#### Backend Component Metrics - -The core Dynamo backend system automatically exposes metrics with the `dynamo_component_*` prefix for all components that use the `DistributedRuntime` framework: - -- `dynamo_component_inflight_requests`: Requests currently being processed (gauge) -- `dynamo_component_request_bytes_total`: Total bytes received in requests (counter) -- `dynamo_component_request_duration_seconds`: Request processing time (histogram) -- `dynamo_component_requests_total`: Total requests processed (counter) -- `dynamo_component_response_bytes_total`: Total bytes sent in responses (counter) -- `dynamo_component_system_uptime_seconds`: DistributedRuntime uptime (gauge) - -#### KV Router Statistics (kvstats) - -KV router statistics are automatically exposed by LLM workers and KV router components with the `dynamo_component_kvstats_*` prefix. These metrics provide insights into GPU memory usage and cache efficiency: - -- `dynamo_component_kvstats_active_blocks`: Number of active KV cache blocks currently in use (gauge) -- `dynamo_component_kvstats_total_blocks`: Total number of KV cache blocks available (gauge) -- `dynamo_component_kvstats_gpu_cache_usage_percent`: GPU cache usage as a percentage (0.0-1.0) (gauge) -- `dynamo_component_kvstats_gpu_prefix_cache_hit_rate`: GPU prefix cache hit rate as a percentage (0.0-1.0) (gauge) - -These metrics are published by: -- **LLM Workers**: vLLM and TRT-LLM backends publish these metrics through their respective publishers -- **KV Router**: The KV router component aggregates and exposes these metrics for load balancing decisions - -#### Specialized Component Metrics - -Some components expose additional metrics specific to their functionality: - -- `dynamo_preprocessor_*`: Metrics specific to preprocessor components - -#### Frontend Metrics +This guide shows how to set up Prometheus and Grafana for visualizing Dynamo metrics on a single machine for demo purposes. -When using Dynamo HTTP Frontend (`--framework VLLM` or `--framework TRTLLM`), these metrics are automatically exposed with the `dynamo_frontend_*` prefix and include `model` labels containing the model name: +![Grafana Dynamo Dashboard](./grafana-dynamo-composite.png) -- `dynamo_frontend_inflight_requests`: Inflight requests (gauge) -- `dynamo_frontend_queued_requests`: Number of requests in HTTP processing queue (gauge) -- `dynamo_frontend_input_sequence_tokens`: Input sequence length (histogram) -- `dynamo_frontend_inter_token_latency_seconds`: Inter-token latency (histogram) -- `dynamo_frontend_output_sequence_tokens`: Output sequence length (histogram) -- `dynamo_frontend_request_duration_seconds`: LLM request duration (histogram) -- `dynamo_frontend_requests_total`: Total LLM requests (counter) -- `dynamo_frontend_time_to_first_token_seconds`: Time to first token (histogram) +**Components:** +- **Prometheus Server** - Collects and stores metrics from Dynamo services +- **Grafana** - Provides dashboards by querying the Prometheus Server -**Note**: The `dynamo_frontend_inflight_requests` metric tracks requests from HTTP handler start until the complete response is finished, while `dynamo_frontend_queued_requests` tracks requests from HTTP handler start until first token generation begins (including prefill time). HTTP queue time is a subset of inflight time. +**For metrics reference**, see [Metrics Documentation](metrics.md). 
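As a quick sanity check before wiring Prometheus to these endpoints, you can list which `dynamo_*` metric families a component currently exposes. The script below is a minimal sketch, not part of the Dynamo tooling: it assumes a frontend (or any component with the system metrics server enabled) is already serving Prometheus-format text at `http://localhost:8000/metrics`; point it at `:8081` for a backend worker.

```python
# Minimal sketch: list dynamo_* metric families exposed by a /metrics endpoint.
# Assumes a component is serving Prometheus-format text on localhost:8000
# (use :8081, or whatever DYN_SYSTEM_PORT is set to, for a backend worker).
import urllib.request


def list_dynamo_metrics(url: str = "http://localhost:8000/metrics") -> None:
    with urllib.request.urlopen(url, timeout=5) as resp:
        text = resp.read().decode("utf-8")
    for line in text.splitlines():
        # Each metric family is announced once by a "# TYPE <name> <type>" line.
        if line.startswith("# TYPE "):
            _, _, name, metric_type = line.split(maxsplit=3)
            if name.startswith("dynamo_"):
                print(f"{name} ({metric_type})")


if __name__ == "__main__":
    list_dynamo_metrics()
```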
-##### Model Configuration Metrics +## Environment Variables -The frontend also exposes model configuration metrics with the `dynamo_frontend_model_*` prefix. These metrics are populated from the worker backend registration service when workers register with the system: +| Variable | Description | Default | Example | +|----------|-------------|---------|---------| +| `DYN_SYSTEM_ENABLED` | Enable system metrics/health server | `false` | `true` | +| `DYN_SYSTEM_PORT` | System metrics/health port | `8081` | `9090` | -**Runtime Config Metrics (from ModelRuntimeConfig):** -These metrics come from the runtime configuration provided by worker backends during registration. +## Getting Started (Single GPU) -- `dynamo_frontend_model_total_kv_blocks`: Total KV blocks available for a worker serving the model (gauge) -- `dynamo_frontend_model_max_num_seqs`: Maximum number of sequences for a worker serving the model (gauge) -- `dynamo_frontend_model_max_num_batched_tokens`: Maximum number of batched tokens for a worker serving the model (gauge) - -**MDC Metrics (from ModelDeploymentCard):** -These metrics come from the Model Deployment Card information provided by worker backends during registration. Note that when multiple worker instances register with the same model name, only the first instance's configuration metrics (runtime config and MDC metrics) will be populated. Subsequent instances with duplicate model names will be skipped for configuration metric updates, though the worker count metric will reflect all instances. +### Prerequisites -- `dynamo_frontend_model_context_length`: Maximum context length for a worker serving the model (gauge) -- `dynamo_frontend_model_kv_cache_block_size`: KV cache block size for a worker serving the model (gauge) -- `dynamo_frontend_model_migration_limit`: Request migration limit for a worker serving the model (gauge) +Install these on your machine: -**Worker Management Metrics:** -- `dynamo_frontend_model_workers`: Number of worker instances currently serving the model (gauge) +- [Docker](https://docs.docker.com/get-docker/) +- [Docker Compose](https://docs.docker.com/compose/install/) -#### Request Processing Flow +### Start the Observability Stack -This section explains the distinction between two key metrics used to track request processing: +From the Dynamo root directory: -1. **Inflight**: Tracks requests from HTTP handler start until the complete response is finished -2. **HTTP Queue**: Tracks requests from HTTP handler start until first token generation begins (including prefill time) +```bash +# Start infrastructure (NATS, etcd) +docker compose -f deploy/docker-compose.yml up -d -**Example Request Flow:** -``` -curl -s localhost:8000/v1/completions -H "Content-Type: application/json" -d '{ - "model": "Qwen/Qwen3-0.6B", - "prompt": "Hello let's talk about LLMs", - "stream": false, - "max_tokens": 1000 -}' +# Then start observability stack (Prometheus, Grafana, Tempo, DCGM GPU exporter, NATS exporter) +docker compose -f deploy/docker-observability.yml up -d ``` -**Timeline:** -``` -Timeline: 0, 1, ... 
-Client ────> Frontend:8000 ────────────────────> Dynamo component/backend (vLLM, SGLang, TRT) - │request start │received │ - | | | - │ ├──> start prefill ──> first token ──> |last token - │ │ (not impl) | | - ├─────actual HTTP queue¹ ──────────┘ │ | - │ │ │ - ├─────implemented HTTP queue ─────────────────────────────┘ | - │ │ - └─────────────────────────────────── Inflight ────────────────────────────┘ -``` +### Start Dynamo Components -**Concurrency Example:** -Suppose the backend allows 3 concurrent requests and there are 10 clients continuously hitting the frontend: -- All 10 requests will be counted as inflight (from start until complete response) -- 7 requests will be in HTTP queue most of the time -- 3 requests will be actively processed (between first token and last token) +Start frontend and worker (a simple single GPU example): -**Testing Setup:** -Try launching a frontend and a Mocker backend that allows 3 concurrent requests: ```bash -$ python -m dynamo.frontend --http-port 8000 -$ python -m dynamo.mocker --model-path Qwen/Qwen3-0.6B --max-num-seqs 3 -# Launch your 10 concurrent clients here -# Then check the queued_requests and inflight_requests metrics from the frontend: -$ curl -s localhost:8000/metrics|grep -v '^#'|grep -E 'queue|inflight' -dynamo_frontend_queued_requests{model="qwen/qwen3-0.6b"} 7 -dynamo_frontend_inflight_requests{model="qwen/qwen3-0.6b"} 10 -``` +# Start frontend in one process +python -m dynamo.frontend --http-port 8000 & -**Real setup using vLLM (instead of Mocker):** -```bash -$ python -m dynamo.vllm --model Qwen/Qwen3-0.6B \ - --enforce-eager --no-enable-prefix-caching --max-num-seqs 3 +# Start vLLM worker with metrics enabled in another process +DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8081 \ + python -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager ``` -**Key Differences:** -- **Inflight**: Measures total request lifetime including processing time -- **HTTP Queue**: Measures queuing time before processing begins (including prefill time) -- **HTTP Queue ≤ Inflight** (HTTP queue is a subset of inflight time) - -### Required Files - -The following configuration files are located in the `deploy/metrics/` directory: -- [docker-compose.yml](../../deploy/docker-compose.yml): Defines the Prometheus and Grafana services -- [prometheus.yml](../../deploy/metrics/prometheus.yml): Contains Prometheus scraping configuration -- [grafana-datasources.yml](../../deploy/metrics/grafana-datasources.yml): Contains Grafana datasource configuration -- [grafana_dashboards/grafana-dashboard-providers.yml](../../deploy/metrics/grafana_dashboards/grafana-dashboard-providers.yml): Contains Grafana dashboard provider configuration -- [grafana_dashboards/grafana-dynamo-dashboard.json](../../deploy/metrics/grafana_dashboards/grafana-dynamo-dashboard.json): A general Dynamo Dashboard for both SW and HW metrics. -- [grafana_dashboards/grafana-dcgm-metrics.json](../../deploy/metrics/grafana_dashboards/grafana-dcgm-metrics.json): Contains Grafana dashboard configuration for DCGM GPU metrics -- [grafana_dashboards/grafana-kvbm-dashboard.json](../../deploy/metrics/grafana_dashboards/grafana-kvbm-dashboard.json): Contains Grafana dashboard configuration for KVBM metrics - -### Metric Name Constants - -The [prometheus_names.rs](../../lib/runtime/src/metrics/prometheus_names.rs) module provides centralized Prometheus metric name constants and sanitization utilities for the Dynamo metrics system. 
This module ensures consistency across all components and prevents metric name duplication. - -#### Key Features - -- **Centralized Constants**: All Prometheus metric names are defined as constants to avoid duplication and typos -- **Automatic Sanitization**: Functions to sanitize metric and label names according to Prometheus naming rules -- **Component Organization**: Metric names are organized by component (frontend, work_handler, nats_client, etc.) -- **Validation Arrays**: Arrays of metric names for iteration and validation purposes - -#### Metric Name Prefixes - -- `dynamo_component_*`: Core component metrics (requests, latency, bytes, etc.) -- `dynamo_frontend_*`: Frontend service metrics (LLM HTTP service) -- `nats_client_*`: NATS client connection and message metrics -- `nats_service_*`: NATS service statistics metrics -- `kvstats_*`: KV cache statistics from LLM workers +After the workers are running, send a few test requests to populate metrics in the system: -#### Sanitization Functions - -The module provides functions to ensure metric and label names comply with Prometheus naming conventions: - -- `sanitize_prometheus_name()`: Sanitizes metric names (allows colons and `__`) -- `sanitize_prometheus_label()`: Sanitizes label names (no colons, no `__` prefix) -- `build_component_metric_name()`: Builds full component metric names with proper prefixing +```bash +curl -X POST http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Qwen/Qwen3-0.6B", + "messages": [{"role": "user", "content": "Hello"}], + "max_completion_tokens": 100 + }' +``` -This centralized approach ensures all Dynamo components use consistent, valid Prometheus metric names without manual coordination. +After sending a few requests, the Prometheus Exposition Format text metrics are available at: +- Frontend: `http://localhost:8000/metrics` +- Backend worker: `http://localhost:8081/metrics` -## Getting Started +### Access Web Interfaces -### Prerequisites +Once Dynamo components are running: -1. Make sure Docker and Docker Compose are installed on your system +1. Open **Grafana** at `http://localhost:3000` (username: `dynamo`, password: `dynamo`) +2. Click on **Dashboards** in the left sidebar +3. Select **Dynamo Dashboard** to view metrics and traces -### Quick Start +Other interfaces: +- **Prometheus**: `http://localhost:9090` +- **Tempo** (tracing): Accessible through Grafana's Explore view. See [Tracing Guide](tracing.md) for details. -1. Start Dynamo dependencies. Assume you're at the root dynamo path: +**Note:** If accessing from another machine, replace `localhost` with the machine's hostname or IP address, and ensure firewall rules allow access to these ports (3000, 9090). 
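Before moving on to Grafana, it can help to confirm end to end that test requests actually show up in the frontend counters. The script below is a minimal sketch under the same assumptions as the steps above (frontend on `localhost:8000`, model `Qwen/Qwen3-0.6B`); it simply compares `dynamo_frontend_requests_total` before and after one request.

```python
# Minimal sketch: send one chat completion and confirm the frontend request
# counter increased. Assumes the frontend started above is on localhost:8000
# and serves Qwen/Qwen3-0.6B; adjust BASE and the model name as needed.
import json
import urllib.request

BASE = "http://localhost:8000"


def scrape_counter(name: str) -> float:
    """Return the first sample of a counter from the frontend /metrics text."""
    with urllib.request.urlopen(f"{BASE}/metrics", timeout=5) as resp:
        for line in resp.read().decode("utf-8").splitlines():
            if line.startswith(name):  # matches any label set of that metric
                return float(line.rsplit(" ", 1)[1])
    return 0.0


def send_chat() -> None:
    body = json.dumps({
        "model": "Qwen/Qwen3-0.6B",
        "messages": [{"role": "user", "content": "Hello"}],
        "max_completion_tokens": 16,
    }).encode("utf-8")
    req = urllib.request.Request(
        f"{BASE}/v1/chat/completions",
        data=body,
        headers={"Content-Type": "application/json"},
    )
    urllib.request.urlopen(req, timeout=120).read()


if __name__ == "__main__":
    before = scrape_counter("dynamo_frontend_requests_total")
    send_chat()
    after = scrape_counter("dynamo_frontend_requests_total")
    print(f"dynamo_frontend_requests_total: {before} -> {after}")
```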
- ```bash - # Start the basic services (etcd & natsd), along with Prometheus and Grafana - docker compose -f deploy/docker-compose.yml --profile metrics up -d +--- - # Minimum components for Dynamo (will not have Prometheus and Grafana): etcd/nats/dcgm-exporter - docker compose -f deploy/docker-compose.yml up -d - ``` +## Topology - Optional: To target specific GPU(s), export the variable below before running Docker Compose - ```bash - export CUDA_VISIBLE_DEVICES=0,2 - ``` +Default Service Relationship Diagram: +```mermaid +graph TD + BROWSER[Browser] -->|:3000| GRAFANA[Grafana :3000] + subgraph DockerComposeNetwork [Network inside Docker Compose] + NATS_PROM_EXP[nats-prom-exp :7777 /metrics] -->|:8222/varz| NATS_SERVER[nats-server :4222, :6222, :8222] + PROMETHEUS[Prometheus server :9090] -->|:2379/metrics| ETCD_SERVER[etcd-server :2379, :2380] + PROMETHEUS -->|:9401/metrics| DCGM_EXPORTER[dcgm-exporter :9401] + PROMETHEUS -->|:7777/metrics| NATS_PROM_EXP + PROMETHEUS -->|:8000/metrics| DYNAMOFE[Dynamo HTTP FE :8000] + PROMETHEUS -->|:8081/metrics| DYNAMOBACKEND[Dynamo backend :8081] + DYNAMOFE --> DYNAMOBACKEND + GRAFANA -->|:9090/query API| PROMETHEUS + end +``` -2. Web servers started. The ones that end in /metrics are in Prometheus format: - - Grafana: `http://localhost:3001` (default login: dynamo/dynamo) - - Prometheus Server: `http://localhost:9090` - - NATS Server: `http://localhost:8222` (monitoring endpoints: /varz, /healthz, etc.) - - NATS Prometheus Exporter: `http://localhost:7777/metrics` - - etcd Server: `http://localhost:2379/metrics` - - DCGM Exporter: `http://localhost:9401/metrics` +The dcgm-exporter service in the Docker Compose network is configured to use port 9401 instead of the default port 9400. This adjustment is made to avoid port conflicts with other dcgm-exporter instances that may be running simultaneously. Such a configuration is typical in distributed systems like SLURM. +### Required Files - - Start worker(s) that publishes KV Cache metrics: [lib/runtime/examples/service_metrics/README.md](../../lib/runtime/examples/service_metrics/README.md) can populate dummy KV Cache metrics. 
+The following configuration files are located in the `deploy/` and `deploy/observability/` directories:
+- [docker-compose.yml](../../deploy/docker-compose.yml): Defines NATS and etcd services
+- [docker-observability.yml](../../deploy/docker-observability.yml): Defines Prometheus, Grafana, Tempo, and exporters
+- [prometheus.yml](../../deploy/observability/prometheus.yml): Contains Prometheus scraping configuration
+- [grafana-datasources.yml](../../deploy/observability/grafana-datasources.yml): Contains Grafana datasource configuration
+- [grafana_dashboards/grafana-dashboard-providers.yml](../../deploy/observability/grafana_dashboards/grafana-dashboard-providers.yml): Contains Grafana dashboard provider configuration
+- [grafana_dashboards/grafana-dynamo-dashboard.json](../../deploy/observability/grafana_dashboards/grafana-dynamo-dashboard.json): A general Dynamo Dashboard for both SW and HW metrics
+- [grafana_dashboards/grafana-dcgm-metrics.json](../../deploy/observability/grafana_dashboards/grafana-dcgm-metrics.json): Contains Grafana dashboard configuration for DCGM GPU metrics
+- [grafana_dashboards/grafana-kvbm-dashboard.json](../../deploy/observability/grafana_dashboards/grafana-kvbm-dashboard.json): Contains Grafana dashboard configuration for KVBM metrics

 ### Configuration

 #### Prometheus

-The Prometheus configuration is specified in [prometheus.yml](../../deploy/metrics/prometheus.yml). This file is set up to collect metrics from the metrics aggregation service endpoint.
+The Prometheus configuration is specified in [prometheus.yml](../../deploy/observability/prometheus.yml). It defines the scrape targets (the Dynamo frontend and workers, plus the supporting exporters shown in the topology above).

 Please be aware that you might need to modify the target settings to align with your specific host configuration and network environment.

-After making changes to prometheus.yml, it is necessary to reload the configuration using the command below. Simply sending a kill -HUP signal will not suffice due to the caching of the volume that contains the prometheus.yml file.
+After making changes to prometheus.yml, restart the Prometheus service:

-```
-docker compose -f deploy/docker-compose.yml up prometheus -d --force-recreate
+```bash
+docker compose -f deploy/docker-observability.yml restart prometheus
 ```

 #### Grafana

@@ -256,237 +136,32 @@ docker compose -f deploy/docker-compose.yml up prometheus -d --force-recreate
 Grafana is pre-configured with:
 - Prometheus datasource
 - Sample dashboard for visualizing service metrics
-![grafana image](./grafana-dynamo-composite.png)

 ### Troubleshooting

 1. Verify services are running:
    ```bash
-   docker compose ps
+   docker compose -f deploy/docker-observability.yml ps
    ```

 2. Check logs:
    ```bash
-   docker compose logs prometheus
-   docker compose logs grafana
+   docker compose -f deploy/docker-observability.yml logs prometheus
+   docker compose -f deploy/docker-observability.yml logs grafana
    ```

 3. Check Prometheus targets at `http://localhost:9090/targets` to verify metric collection (a programmatic check is sketched below).
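The same target health information is available from Prometheus's HTTP API, which is often handier than the web UI when the stack runs on a remote host. A minimal sketch, assuming the Prometheus instance from `docker-observability.yml` is reachable on `localhost:9090`:

```python
# Minimal sketch: report scrape-target health via the Prometheus HTTP API.
# Assumes Prometheus from docker-observability.yml is on localhost:9090.
import json
import urllib.request


def check_targets(base: str = "http://localhost:9090") -> None:
    with urllib.request.urlopen(f"{base}/api/v1/targets", timeout=5) as resp:
        data = json.loads(resp.read().decode("utf-8"))
    for target in data["data"]["activeTargets"]:
        job = target["labels"].get("job", "?")
        print(f"{job:20s} {target['scrapeUrl']:45s} {target['health']}")
        if target["health"] != "up":
            # lastError explains why a target is down (e.g. connection refused).
            print(f"{'':20s} lastError: {target.get('lastError', '')}")


if __name__ == "__main__":
    check_targets()
```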
-## Developer Guide - -### Creating Metrics at Different Hierarchy Levels - -#### Runtime-Level Metrics - -```rust -use dynamo_runtime::DistributedRuntime; - -let runtime = DistributedRuntime::new()?; -let namespace = runtime.namespace("my_namespace")?; -let component = namespace.component("my_component")?; -let endpoint = component.endpoint("my_endpoint")?; - -// Create endpoint-level counters (this is a Prometheus Counter type) -let requests_total = endpoint.metrics().create_counter( - "requests_total", - "Total requests across all namespaces", - &[] -)?; - -let active_connections = endpoint.metrics().create_gauge( - "active_connections", - "Number of active client connections", - &[] -)?; -``` - -#### Namespace-Level Metrics - -```rust -let namespace = runtime.namespace("my_model")?; - -// Namespace-scoped metrics -let model_requests = namespace.metrics().create_counter( - "model_requests", - "Requests for this specific model", - &[] -)?; - -let model_latency = namespace.metrics().create_histogram( - "model_latency_seconds", - "Model inference latency", - &[], - Some(vec![0.001, 0.01, 0.1, 1.0, 10.0]) -)?; -``` - -#### Component-Level Metrics - -```rust -let component = namespace.component("backend")?; - -// Component-specific metrics -let backend_requests = component.metrics().create_counter( - "backend_requests", - "Requests handled by this backend component", - &[] -)?; - -let gpu_memory_usage = component.metrics().create_gauge( - "gpu_memory_bytes", - "GPU memory usage in bytes", - &[] -)?; -``` - -#### Endpoint-Level Metrics - -```rust -let endpoint = component.endpoint("generate")?; - -// Endpoint-specific metrics -let generate_requests = endpoint.metrics().create_counter( - "generate_requests", - "Generate endpoint requests", - &[] -)?; - -let generate_latency = endpoint.metrics().create_histogram( - "generate_latency_seconds", - "Generate endpoint latency", - &[], - Some(vec![0.001, 0.01, 0.1, 1.0, 10.0]) -)?; -``` - -### Creating Vector Metrics with Dynamic Labels - -Use vector metrics when you need to track metrics with different label values: - -```rust -// Counter with labels -let requests_by_model = endpoint.metrics().create_countervec( - "requests_by_model", - "Requests by model type", - &["model_type", "model_size"], - &[] // no constant labels -)?; - -// Increment with specific labels -requests_by_model.with_label_values(&["llama", "7b"]).inc(); -requests_by_model.with_label_values(&["gpt", "13b"]).inc(); - -// Gauge with labels -let memory_by_gpu = component.metrics().create_gaugevec( - "gpu_memory_bytes", - "GPU memory usage by device", - &["gpu_id", "memory_type"], - &[] // no constant labels -)?; - -memory_by_gpu.with_label_values(&["0", "allocated"]).set(8192.0); -memory_by_gpu.with_label_values(&["0", "cached"]).set(4096.0); -``` - -### Creating Histograms - -Histograms are useful for measuring distributions of values like latency: - -```rust -let latency_histogram = endpoint.metrics().create_histogram( - "request_latency_seconds", - "Request latency distribution", - &[], - Some(vec![0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0]) -)?; - -// Record latency values -latency_histogram.observe(0.023); // 23ms -latency_histogram.observe(0.156); // 156ms -``` - -### Transitioning from Plain Prometheus - -If you're currently using plain Prometheus metrics, transitioning to Dynamo's `MetricsRegistry` is straightforward: - -#### Before (Plain Prometheus) - -```rust -use prometheus::{Counter, Opts, Registry}; - -// Create a registry to hold metrics -let registry = 
Registry::new(); -let counter_opts = Opts::new("my_counter", "My custom counter"); -let counter = Counter::with_opts(counter_opts).unwrap(); -registry.register(Box::new(counter.clone())).unwrap(); - -// Use the counter -counter.inc(); - -// To expose metrics, you'd need to set up an HTTP server manually -// and implement the /metrics endpoint yourself -``` - -#### After (Dynamo MetricsRegistry) - -```rust -let counter = endpoint.metrics().create_counter( - "my_counter", - "My custom counter", - &[] -)?; - -counter.inc(); -``` - -**Note:** The metric is automatically registered when created via the endpoint's `metrics().create_counter()` factory method. - -**Benefits of Dynamo's approach:** -- **Automatic registration**: Metrics created via endpoint's `metrics().create_*()` factory methods are automatically registered with the system -- Automatic labeling with namespace, component, and endpoint information -- Consistent metric naming with `dynamo_` prefix -- Built-in HTTP metrics endpoint when enabled with `DYN_SYSTEM_ENABLED=true` -- Hierarchical metric organization - -### Advanced Features - -#### Custom Buckets for Histograms - -```rust -// Define custom buckets for your use case -let custom_buckets = vec![0.001, 0.01, 0.1, 1.0, 10.0]; -let latency = endpoint.metrics().create_histogram( - "api_latency_seconds", - "API latency in seconds", - &[], - Some(custom_buckets) -)?; -``` - -#### Metric Aggregation - -```rust -// Aggregate metrics across multiple endpoints -let requests_total = namespace.metrics().create_counter( - "requests_total", - "Total requests across all endpoints", - &[] -)?; -``` - - -## Troubleshooting - -1. Verify services are running: +4. If you encounter issues with stale data or configuration, stop services and wipe volumes: ```bash - docker compose ps + docker compose -f deploy/docker-observability.yml down -v + docker compose -f deploy/docker-observability.yml up -d ``` -2. Check logs: - ```bash - docker compose logs prometheus - docker compose logs grafana - ``` + **Note:** The `-v` flag removes named volumes (grafana-data, tempo-data), which will reset dashboards and stored metrics. -3. Check Prometheus targets at `http://localhost:9090/targets` to verify metric collection. +## Developer Guide + +For detailed information on creating custom metrics in Dynamo components, see: + +- [Metrics Developer Guide](metrics-developer-guide.md) diff --git a/deploy/tracing/trace.png b/docs/observability/trace.png similarity index 100% rename from deploy/tracing/trace.png rename to docs/observability/trace.png diff --git a/deploy/tracing/README.md b/docs/observability/tracing.md similarity index 59% rename from deploy/tracing/README.md rename to docs/observability/tracing.md index a2efa75bd5..feae4d3f29 100644 --- a/deploy/tracing/README.md +++ b/docs/observability/tracing.md @@ -5,70 +5,45 @@ SPDX-License-Identifier: Apache-2.0 # Distributed Tracing with Tempo -This guide explains how to set up and view distributed traces in Grafana Tempo for Dynamo workloads. - ## Overview -Dynamo supports OpenTelemetry-based distributed tracing, allowing you to visualize request flows across Frontend and Worker components. Traces are exported to Tempo via OTLP (OpenTelemetry Protocol) and visualized in Grafana. 
- -## Prerequisites - -- Docker and Docker Compose (for local deployment) -- Kubernetes cluster with kubectl access (for Kubernetes deployment) -- Dynamo runtime with tracing support - -## Environment Variables +Dynamo supports OpenTelemetry-based distributed tracing for visualizing request flows across Frontend and Worker components. Traces are exported to Tempo via OTLP (OpenTelemetry Protocol) and visualized in Grafana. -Dynamo's tracing is configured via environment variables. For complete logging documentation, see [docs/observability/logging.md](../../docs/observability/logging.md). +**Requirements:** Set `DYN_LOGGING_JSONL=true` and `OTEL_EXPORT_ENABLED=true` to export traces to Tempo. -### Required Environment Variables +This guide covers single GPU demo setup using Docker Compose. For Kubernetes deployments, see [Kubernetes Deployment](#kubernetes-deployment). -| Variable | Description | Example Value | -|----------|-------------|---------------| -| `DYN_LOGGING_JSONL` | Enable JSONL logging format (required for tracing) | `true` | -| `OTEL_EXPORT_ENABLED` | Enable OTLP trace export | `1` | -| `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT` | OTLP gRPC endpoint for Tempo | `http://localhost:4317` (local) or `http://tempo:4317` (docker) | -| `OTEL_SERVICE_NAME` | Service name for identifying components | `dynamo-frontend`, `dynamo-worker-prefill`, `dynamo-worker-decode` | +**Note:** This section has overlap with [Logging of OpenTelemetry Tracing](logging.md) since OpenTelemetry has aspects of both logging and tracing. The tracing approach documented here is for persistent trace visualization and analysis. For short debugging sessions examining trace context directly in logs, see the [Logging](logging.md) guide. -**Note:** When `OTEL_EXPORT_ENABLED=1`, logging initialization is deferred until the runtime is available (required by the OTEL exporter). This means some early logs will be dropped. This will be fixed in a future release. - -### Example Configuration - -```bash -# Enable JSONL logging and tracing -export DYN_LOGGING_JSONL=true - -# Enable trace export to Tempo -export OTEL_EXPORT_ENABLED=1 - -# Set the Tempo endpoint (docker-compose network) -export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=http://tempo:4317 - -# Set service name to identify this component -export OTEL_SERVICE_NAME=dynamo-frontend -``` - ---- +## Environment Variables -## Local Deployment with Docker Compose +| Variable | Description | Default | Example | +|----------|-------------|---------|---------| +| `DYN_LOGGING_JSONL` | Enable JSONL logging format (required for tracing) | `false` | `true` | +| `OTEL_EXPORT_ENABLED` | Enable OTLP trace export | `false` | `true` | +| `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT` | OTLP gRPC endpoint for Tempo | `http://localhost:4317` | `http://tempo:4317` | +| `OTEL_SERVICE_NAME` | Service name for identifying components | `dynamo` | `dynamo-frontend` | -### 1. Start Tempo and Grafana +## Getting Started (Single GPU) -From the `deploy/tracing` directory, start the observability stack: +### 1. 
Start Observability Stack ```bash -cd deploy/tracing -docker-compose up -d +cd deploy +docker compose -f docker-observability.yml up -d ``` This will start: - **Tempo** on `http://localhost:3200` (HTTP API) and `localhost:4317` (OTLP gRPC) -- **Grafana** on `http://localhost:3000` (username: `admin`, password: `admin`) +- **Prometheus** on `http://localhost:9090` +- **Grafana** on `http://localhost:3000` (username: `dynamo`, password: `dynamo`) +- **DCGM Exporter** on `http://localhost:9401/metrics` (GPU metrics) +- **NATS Exporter** on `http://localhost:7777/metrics` Verify services are running: ```bash -docker-compose ps +docker compose -f docker-observability.yml ps ``` ### 2. Set Environment Variables @@ -78,14 +53,29 @@ Configure Dynamo components to export traces: ```bash # Enable JSONL logging and tracing export DYN_LOGGING_JSONL=true -export OTEL_EXPORT_ENABLED=1 +export OTEL_EXPORT_ENABLED=true export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=http://localhost:4317 +``` + +### 3. Start Dynamo Components (Single GPU) + +For a simple single-GPU deployment, start the frontend and a single vLLM worker: -# Set service names for each component +```bash +# Start the frontend with tracing enabled export OTEL_SERVICE_NAME=dynamo-frontend +python -m dynamo.frontend --router-mode kv --http-port=8000 & + +# Start a single vLLM worker (aggregated prefill and decode) +export OTEL_SERVICE_NAME=dynamo-worker-vllm +python -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager & + +wait ``` -### 3. Run vLLM Disaggregated Deployment +This runs both prefill and decode on the same GPU, providing a simpler setup for testing tracing. + +### Alternative: Disaggregated Deployment (2 GPUs) Run the vLLM disaggregated script with tracing enabled: @@ -106,69 +96,70 @@ trap 'echo Cleaning up...; kill 0' EXIT # Enable tracing export DYN_LOGGING_JSONL=true -export OTEL_EXPORT_ENABLED=1 +export OTEL_EXPORT_ENABLED=true export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=http://localhost:4317 # Run frontend export OTEL_SERVICE_NAME=dynamo-frontend python -m dynamo.frontend --router-mode kv --http-port=8000 & -# Run decode worker +# Run decode worker, make sure to wait for start up export OTEL_SERVICE_NAME=dynamo-worker-decode CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager & -# Run prefill worker +# Run prefill worker, make sure to wait for start up export OTEL_SERVICE_NAME=dynamo-worker-prefill CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \ --model Qwen/Qwen3-0.6B \ --enforce-eager \ --is-prefill-worker & - -wait ``` +For disaggregated deployments, this separates prefill and decode onto different GPUs for better resource utilization. + ### 4. Generate Traces -Send requests to the frontend to generate traces: +Send requests to the frontend to generate traces (works for both aggregated and disaggregated deployments). **Note the `x-request-id` header**, which allows you to easily search for and correlate this specific trace in Grafana: ```bash -curl -d '{ +curl -H 'Content-Type: application/json' \ +-H 'x-request-id: test-trace-001' \ +-d '{ "model": "Qwen/Qwen3-0.6B", "max_completion_tokens": 100, "messages": [ {"role": "user", "content": "What is the capital of France?"} ] }' \ --H 'Content-Type: application/json' \ --H 'x-request-id: test-trace-001' \ http://localhost:8000/v1/chat/completions ``` ### 5. View Traces in Grafana Tempo 1. Open Grafana at `http://localhost:3000` -2. Login with username `admin` and password `admin` +2. Login with username `dynamo` and password `dynamo` 3. 
Navigate to **Explore** (compass icon in the left sidebar) 4. Select **Tempo** as the data source (should be selected by default) -5. Use the **Search** tab to find traces: +5. In the query type, select **"Search"** (not TraceQL, not Service Graph) +6. Use the **Search** tab to find traces: - Search by **Service Name** (e.g., `dynamo-frontend`) - Search by **Span Name** (e.g., `http-request`, `handle_payload`) - Search by **Tags** (e.g., `x_request_id=test-trace-001`) -6. Click on a trace to view the detailed flame graph +7. Click on a trace to view the detailed flame graph #### Example Trace View Below is an example of what a trace looks like in Grafana Tempo: -![Trace Example](./trace.png) +![Trace Example](trace.png) ### 6. Stop Services -When done, stop the Tempo and Grafana stack: +When done, stop the observability stack: ```bash -cd deploy/tracing -docker-compose down +cd deploy +docker compose -f docker-observability.yml down ``` --- @@ -192,7 +183,7 @@ spec: - name: DYN_LOGGING_JSONL value: "true" - name: OTEL_EXPORT_ENABLED - value: "1" + value: "true" - name: OTEL_EXPORTER_OTLP_TRACES_ENDPOINT value: "http://tempo.observability.svc.cluster.local:4317" diff --git a/lib/bindings/python/examples/metrics/README.md b/lib/bindings/python/examples/metrics/README.md index 9e0b810773..5f83fbe31a 100644 --- a/lib/bindings/python/examples/metrics/README.md +++ b/lib/bindings/python/examples/metrics/README.md @@ -1,425 +1,25 @@ -# Dynamo MetricsRegistry for Python +# Python Metrics Examples -Python MetricsRegistry allows you to create and manage Prometheus metrics from Python: +Example scripts demonstrating how to create and use Prometheus metrics in Python using the Dynamo metrics API. -- **Metric Types**: Counter, IntCounter, Gauge, IntGauge, Histogram, and their Vec variants (CounterVec, IntCounterVec, GaugeVec, IntGaugeVec) -- **Metric Introspection**: Access metric names, constant labels, and variable label names -- **Automatic Registration**: Metrics are automatically registered with the component hierarchy (namespace/component/endpoint) and available on the HTTP system status server -- **Optional Callback Support**: Register Python callbacks to update metrics before scraping +## Documentation -Example: -```python -from dynamo.runtime import DistributedRuntime +See the **[Metrics Developer Guide - Python Section](../../../../../docs/observability/metrics-developer-guide.md#metrics-api-in-python)** for complete documentation. -async def main(): - drt = DistributedRuntime() - endpoint = drt.namespace("ns").component("comp").endpoint("ep") - - # Create metrics - counter = endpoint.metrics.create_intcounter("requests_total", "Total requests") - gauge_vec = endpoint.metrics.create_intgaugevec( - "active_connections", - "Active connections by status", - ["status"], # variable labels - [("region", "us-west")] # constant labels - ) - - # Introspect metrics - print(counter.name()) # "ns_comp_ep_requests_total" - print(counter.const_labels()) # {"dynamo_namespace": "ns", ...} - print(gauge_vec.variable_labels()) # ["status"] - - # Use metrics - counter.inc() - gauge_vec.set(5, {"status": "active"}) -``` - -## Python-Rust Metrics Integration - -This directory demonstrates two methods for passing metrics between Python and Rust in the Dynamo runtime. - -### Method 1: ForwardPassMetrics Pub/Sub via NATS (Legacy method for passing metrics) - -Python maintains its own metrics dictionary, serializes it, and publishes to NATS. 
Rust subscribes to NATS, deserializes the metrics, and updates Prometheus gauges. - -**Communication pattern**: Unidirectional (Python → NATS → Rust). Python publishes metrics; no feedback from Rust to Python. - -**Example**: Used by `WorkerMetricsPublisher` in production code - -```python -from dynamo.llm import WorkerMetricsPublisher, ForwardPassMetrics - -# Create publisher -publisher = WorkerMetricsPublisher() -await publisher.create_endpoint(component, metrics_labels) - -# Python maintains its own metrics dict -metrics_dict = { - "num_running_reqs": 5, - "num_waiting_reqs": 10, - "gpu_cache_usage": 0.75, -} - -# Serialize and publish to NATS -metrics = ForwardPassMetrics(metrics_dict) -publisher.publish(metrics) - -# Rust subscribes to NATS, deserializes, and updates Prometheus -``` - -### Adding/Changing Metrics in Method 1 - -When you need to add or modify metrics in Method 1 (ForwardPassMetrics Pub/Sub via NATS), you must update **multiple files**: - -1. **`lib/llm/src/kv_router/protocols.rs`** - Add field to struct (WorkerStats is part of ForwardPassMetrics): - ```rust - pub struct WorkerStats { - pub request_active_slots: u64, - pub request_total_slots: u64, - pub num_requests_waiting: u64, - pub new_metric_field: u64, // ADD THIS - } - ``` - -2. **`lib/llm/src/kv_router/publisher.rs`** - Manually create Prometheus gauge using DRT: - ```rust - fn new(component: &Component) -> Result { - use dynamo_runtime::metrics::MetricsRegistry; - - // ... existing gauges ... - - // Manually create and register new Prometheus gauge - let new_metric_gauge = component.metrics().create_gauge( - "new_metric_name", - "Description of new metric", - &[], // labels - )?; - - // Store in struct - Ok(KvStatsPrometheusGauges { - kv_active_blocks_gauge, - kv_total_blocks_gauge, - gpu_cache_usage_gauge, - gpu_prefix_cache_hit_rate_gauge, - new_metric_gauge, // ADD THIS - }) - } - ``` - -3. **`lib/llm/src/kv_router/publisher.rs`** - Update gauge in `update_from_kvstats()`: - ```rust - fn update_from_kvstats(&self, kv_stats: &KvStats) { - // ... existing updates ... - self.new_metric_gauge.set(worker_stats.new_metric_field as f64); - } - ``` - -4. **`components/src/dynamo/sglang/publisher.py`** - Update Python code to compute new metric: - ```python - def collect_metrics(): - worker_stats = WorkerStats( - request_active_slots=..., - new_metric_field=compute_new_metric(), # ADD THIS - ) - ``` - -**Result**: Changes require touching 3-4 files across Rust and Python codebases. - -### Method 2: Dynamo MetricsRegistry in Python - -Python creates typed metric objects using `endpoint.metrics.create_*()` methods, which automatically register with the endpoint. Python updates values through these objects with methods that have type hints (via `.pyi` files). Rust creates the underlying Prometheus metrics and calls Python callbacks before scraping. - -**Communication pattern**: Currently unidirectional (Python → Rust for updates, Rust → Python for callback invocation). Could be extended to bidirectional communication in the future (e.g., Rust notifying Python of scrape events, configuration changes) without major architectural changes. - -**Key advantage:** No Rust code modifications needed - metrics are defined and updated entirely in Python. 
- -This method supports two update patterns: - -#### Example A: Background Thread Updates (server_with_loop.py) - -Update metrics continuously from a background thread, independent of scraping: - -```python -# Create metric objects (automatically registered) -# Note: Prometheus prefixes these with "dynamo_component_", so they appear as: -# - dynamo_component_request_total_slots -# - dynamo_component_gpu_cache_usage_percent -request_slots: IntGauge = endpoint.metrics.create_intgauge( - "request_total_slots", "Total request slots available" -) -gpu_usage: Gauge = endpoint.metrics.create_gauge( - "gpu_cache_usage_percent", "GPU cache usage percentage" -) - -# Background thread continuously updates metrics -def update_metrics_in_loop(): - count = 0 - while True: - count += 1 - request_slots.set(1024 + count) - gpu_usage.set(0.01 + (count * 0.01)) - time.sleep(2) - -updater = threading.Thread(target=update_metrics_in_loop, daemon=True) -updater.start() -``` - -#### Example B: Callback-based Updates (server_with_callback.py) - -Register a callback that updates metrics on-demand when Prometheus scrapes the `/metrics` endpoint: - -```python -# Create metric objects (automatically registered) -# Note: Prometheus prefixes these with "dynamo_component_", so they appear as: -# - dynamo_component_request_total_slots -# - dynamo_component_gpu_cache_usage_percent -request_slots: IntGauge = endpoint.metrics.create_intgauge( - "request_total_slots", "Total request slots available" -) -gpu_usage: Gauge = endpoint.metrics.create_gauge( - "gpu_cache_usage_percent", "GPU cache usage percentage" -) - -# Register callback for dynamic updates before scraping -def update_metrics(): - request_slots.set(compute_current_slots()) - gpu_usage.set(get_gpu_usage()) - -endpoint.metrics.register_callback(update_metrics) -``` - -Both examples support vector metrics with labels: - -```python -# Create vector metrics with labels -worker_requests: IntGaugeVec = endpoint.metrics.create_intgaugevec( - "worker_active_requests", - "Active requests per worker", - ["worker_id", "model"] -) - -# Update vector metrics with specific label values -worker_requests.set(5, {"worker_id": "worker_1", "model": "llama-3"}) -worker_requests.set(3, {"worker_id": "worker_2", "model": "llama-3"}) -``` - -#### Available Metric Types - -Method 2 supports all standard Prometheus metric types: - -- **Gauges**: `Gauge` (float), `IntGauge` (integer) -- **GaugeVec**: `GaugeVec` (float with labels), `IntGaugeVec` (integer with labels) -- **Counters**: `Counter` (float), `IntCounter` (integer) -- **CounterVec**: `CounterVec` (float with labels), `IntCounterVec` (integer with labels) -- **Histograms**: `Histogram` - -All metrics are imported from `dynamo.prometheus_metrics`. - -#### Adding/Changing Metrics in Method 2 - -When you need to add or modify metrics in Method 2 (Dynamic Registration), you only update **Python code**: - -1. **Create new metric** - Just add one line in Python (automatically registered): - ```python - new_metric: IntGauge = endpoint.metrics.create_intgauge( - "new_metric_name", "Description of the metric" - ) - ``` - -2. **Update in callback** - Add update logic: - ```python - def update_metrics(): - request_slots.set(compute_slots()) - gpu_usage.set(compute_gpu_usage()) - new_metric.set(compute_new_metric()) # ADD THIS - ``` - -3. 
**For vector metrics with labels** - Create with label names, update with label values: - ```python - # Create vector metric - new_vec: IntGaugeVec = endpoint.metrics.create_intgaugevec( - "new_metric_vec", "Description", ["label1", "label2"] - ) - - # Update with specific label values - new_vec.set(100, {"label1": "value1", "label2": "value2"}) - ``` - -**Result**: Changes only require modifying Python code. No Rust changes needed. Metrics are automatically created and registered with Prometheus by the Rust runtime when you call `create_*()`. - -#### Type-Hinted Methods - -Dynamic Registration provides type hints (via `.pyi` stub files) for typed metric classes: - -- **Gauges** use `.set()`, `.get()`, `.inc()`, `.dec()`, `.add()`, `.sub()` -- **Counters** use `.inc()`, `.inc_by()`, `.get()` (counters only increase) -- **Histograms** use `.observe()` -- **Vec metrics** take a `labels: Dict[str, str]` parameter for operations - -### Architecture Diagrams - -#### Component Architecture - -##### Method 1: ForwardPassMetrics Pub/Sub via NATS - Component View - -```mermaid -graph TB - subgraph "Python Layer" - PY[Python Application
components/src/dynamo/sglang/main.py] - style PY fill:#3776ab,color:#fff - end - - subgraph "Python/Rust Interface (PyO3)" - WMPB[WorkerMetricsPublisher Bindings
bindings/python/rust/llm/kv.rs] - FPM[ForwardPassMetrics Struct
bindings/python/rust/llm/kv.rs] - style WMPB fill:#f4a261,color:#000 - style FPM fill:#f4a261,color:#000 - end - - subgraph "Rust Core" - subgraph "Worker Process Components" - WMP[WorkerMetricsPublisher
llm/src/kv_router/publisher.rs] - WATCH[Watch Channel
tokio::sync::watch] - PROM1[Local Prometheus Gauges
prometheus::Gauge] - end - - subgraph "NATS Infrastructure" - NATS[NATS Server
KV_METRICS_SUBJECT] - end - - subgraph "Other Consumers (e.g., KvWorkerMonitor)" - SUB[NATS Subscriber
component/namespace.rs] - end - - subgraph "System Status Servers" - SS[System Status Server
runtime/src/system_status_server.rs
Started by DistributedRuntime] - end - - style WMP fill:#ce422b,color:#fff - style WATCH fill:#ce422b,color:#fff - style PROM1 fill:#ce422b,color:#fff - style NATS fill:#27aae1,color:#fff - style SUB fill:#ce422b,color:#fff - style SS fill:#6c757d,color:#fff - end - - PY -->|"WorkerMetricsPublisher()"| WMPB - PY -->|"ForwardPassMetrics(worker_stats, kv_stats, spec_decode_stats)"| FPM - PY -->|"publish(metrics)"| WMPB - WMPB -->|"FFI: publish(Arc ForwardPassMetrics)"| WMP - WMP -->|"update_from_kvstats(kv_stats)"| PROM1 - WMP -->|"tx.send(metrics)"| WATCH - WATCH -->|"publish(KV_METRICS_SUBJECT, LoadEvent)"| NATS - NATS -->|"subscribe_with_type LoadEvent"| SUB - SS -->|"Worker: gather() from PROM1"| PROM1 -``` - -##### Method 2: Dynamic Registration - Component View - -```mermaid -graph TD - subgraph Python["Python Layer"] - PY[Python Application
main.py] - style PY fill:#3776ab,color:#fff - end - - subgraph PyO3["Python/Rust Interface - PyO3"] - PM[PrometheusMetricsUtils
endpoint.metrics
prometheus_metrics.rs] - MT[Metric Type Objects
IntGauge/Gauge/Counter/etc.
prometheus_metrics.rs] - style PM fill:#f4a261,color:#000 - style MT fill:#f4a261,color:#000 - end - - subgraph Rust["Rust Core"] - EP[Endpoint
component/endpoint.rs] - DRT[DistributedRuntime
distributed.rs] - PROM["Prometheus Registry
prometheus::IntGauge/Gauge/etc."] - SS[System Status Server
system_status_server.rs] - style EP fill:#ce422b,color:#fff - style DRT fill:#ce422b,color:#fff - style PROM fill:#ce422b,color:#fff - style SS fill:#6c757d,color:#fff - end - - PY -->|endpoint.metrics.create_intgauge| PM - PM -->|endpoint.metrics.create_intgauge| EP - EP -->|create & register| PROM - PM -->|wrap & return| MT - MT -->|return to Python| PY - PY -->|metric.set/get| MT - MT -->|direct FFI call| PROM - PY -.->|endpoint.metrics.register_callback| PM - PM -.->|drt.register_metrics_callback| DRT - SS ==>|execute_metrics_callbacks| DRT - DRT -.->|invoke Python callback| PY - SS -->|gather| PROM - - linkStyle 7 stroke:#ff6b6b,stroke-width:2px - linkStyle 8 stroke:#ff6b6b,stroke-width:2px - linkStyle 9 stroke:#ff6b6b,stroke-width:2px - linkStyle 10 stroke:#ff6b6b,stroke-width:2px -``` - -### Running the Examples - -The examples demonstrate Method 2 (Dynamo MetricsRegistry in Python) with two different update patterns. - -#### Prerequisites - -Update Python bindings if needed: -```bash -cd ~/dynamo/lib/bindings/python -maturin develop -``` - -#### Run Example A: Background Thread Updates +## Running Examples ```bash cd ~/dynamo/lib/bindings/python/examples/metrics -DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8081 ./server_with_loop.py -``` -#### Run Example B: Callback-based Updates +# Background thread updates +DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8081 ./server_with_loop.py -```bash -cd ~/dynamo/lib/bindings/python/examples/metrics +# Callback-based updates DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8081 ./server_with_callback.py -``` - -**Note:** The environment variables are required: -- `DYN_SYSTEM_ENABLED=true` - Enables the system status server -- `DYN_SYSTEM_PORT=8081` - Sets the port for the metrics endpoint - -#### Check the Metrics - -The metrics are served via the system status server at: -```bash +# Check Prometheus Exposition Format text metrics curl http://localhost:8081/metrics ``` - -Expected output includes: - -``` -# HELP request_total_slots Total request slots available -# TYPE request_total_slots gauge -request_total_slots{dynamo_namespace="ns556",dynamo_component="cp556",dynamo_endpoint="ep556"} 1024 - -# HELP gpu_cache_usage_percent GPU cache usage percentage -# TYPE gpu_cache_usage_percent gauge -gpu_cache_usage_percent{dynamo_namespace="ns556",dynamo_component="cp556",dynamo_endpoint="ep556"} 0.00 - -# HELP worker_active_requests Active requests per worker -# TYPE worker_active_requests gauge -worker_active_requests{dynamo_namespace="ns556",dynamo_component="cp556",dynamo_endpoint="ep556",worker_id="worker_1",model="llama-3"} 5 -worker_active_requests{dynamo_namespace="ns556",dynamo_component="cp556",dynamo_endpoint="ep556",worker_id="worker_2",model="llama-3"} 3 - -# HELP internal_update_count Number of times metrics callback was invoked -# TYPE internal_update_count counter -internal_update_count{dynamo_namespace="ns556",dynamo_component="cp556",dynamo_endpoint="ep556",type="internal"} 1 -``` - -Each time you query the `/metrics` endpoint, the `update_metrics()` callback is invoked, updating the metric values with fresh data. 
diff --git a/lib/bindings/python/rust/lib.rs b/lib/bindings/python/rust/lib.rs index fc905b45d1..fb9a31108e 100644 --- a/lib/bindings/python/rust/lib.rs +++ b/lib/bindings/python/rust/lib.rs @@ -125,12 +125,9 @@ fn create_request_context( #[pymodule] fn _core(m: &Bound<'_, PyModule>) -> PyResult<()> { // Initialize logging early unless OTEL export is enabled (which requires tokio runtime) - if std::env::var("OTEL_EXPORT_ENABLED") - .map(|v| v == "1") - .unwrap_or(false) - { + if rs::config::env_is_truthy("OTEL_EXPORT_ENABLED") { eprintln!( - "Warning: OTEL_EXPORT_ENABLED=1 detected. Logging initialization deferred until runtime is available. Early logs may be dropped." + "Warning: OTEL_EXPORT_ENABLED detected. Logging initialization deferred until runtime is available. Early logs may be dropped." ); } else { rs::logging::init(); @@ -449,10 +446,7 @@ impl DistributedRuntime { // Initialize logging in context where tokio runtime is available // otel exporter requires it - if std::env::var("OTEL_EXPORT_ENABLED") - .map(|v| v == "1") - .unwrap_or(false) - { + if rs::config::env_is_truthy("OTEL_EXPORT_ENABLED") { runtime.secondary().block_on(async { rs::logging::init(); }); diff --git a/lib/runtime/src/logging.rs b/lib/runtime/src/logging.rs index 4250c82f17..a46b043eb8 100644 --- a/lib/runtime/src/logging.rs +++ b/lib/runtime/src/logging.rs @@ -144,11 +144,9 @@ impl Default for LoggingConfig { } } -/// Check if OTLP trace exporting is enabled (set OTEL_EXPORT_ENABLED=1 to enable) +/// Check if OTLP trace exporting is enabled (set OTEL_EXPORT_ENABLED to a truthy value: 1, true, on, yes) fn otlp_exporter_enabled() -> bool { - std::env::var(OTEL_EXPORT_ENABLED_ENV) - .map(|v| v == "1") - .unwrap_or(false) + crate::config::env_is_truthy(OTEL_EXPORT_ENABLED_ENV) } /// Get the service name from environment or use default
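For anyone driving these components from Python scripts, the practical effect of the `env_is_truthy` change above is that `OTEL_EXPORT_ENABLED` no longer has to be exactly `1`. The sketch below is illustrative only: the `TRUTHY` set mirrors the values named in the doc comment (`1`, `true`, `on`, `yes`), and the launcher helper is an example, not part of the Dynamo Python API.

```python
# Illustrative sketch: prepare an environment that enables OTLP trace export
# and start a frontend. TRUTHY mirrors the values listed in the Rust doc
# comment above; the helper and launcher are examples, not Dynamo APIs.
import os
import subprocess

TRUTHY = {"1", "true", "on", "yes"}


def otel_export_enabled(env=None) -> bool:
    env = os.environ if env is None else env
    return env.get("OTEL_EXPORT_ENABLED", "").strip().lower() in TRUTHY


def launch_frontend_with_tracing() -> subprocess.Popen:
    env = dict(os.environ)
    env.update({
        "DYN_LOGGING_JSONL": "true",    # JSONL logging is required for tracing
        "OTEL_EXPORT_ENABLED": "true",  # any truthy spelling now works
        "OTEL_EXPORTER_OTLP_TRACES_ENDPOINT": "http://localhost:4317",
        "OTEL_SERVICE_NAME": "dynamo-frontend",
    })
    assert otel_export_enabled(env)
    return subprocess.Popen(
        ["python", "-m", "dynamo.frontend", "--http-port", "8000"], env=env
    )


if __name__ == "__main__":
    launch_frontend_with_tracing().wait()
```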