Call out GKE-specific labels/namespace for service monitor

seans3 · seans3 · commit e9058ccb9889 · 2025-09-08T19:53:06.000Z
diff --git a/ai/vllm-deployment/hpa/gpu-dcgm-exporter-service.yaml b/ai/vllm-deployment/hpa/gpu-dcgm-exporter-service.yaml
@@ -2,21 +2,32 @@
 # pods. The Prometheus Operator's ServiceMonitor will target this Service
 # to discover and scrape the GPU metrics. This is especially important
 # because the exporter pods are part of a DaemonSet, and their IPs can change.
+#
+# NOTE: This configuration is specific to GKE, which automatically deploys the
+# DCGM exporter in the 'gke-managed-system' namespace. On other cloud
+# providers or on-premises clusters, you need to deploy your own DCGM exporter
+# (e.g., via a Helm chart) and update this Service's 'namespace', 'labels',
+# and 'selector' to match your deployment.
 
 apiVersion: v1
 kind: Service
 metadata:
   name: gke-managed-dcgm-exporter
+  # GKE-SPECIFIC: GKE deploys its managed DCGM exporter in this namespace.
+  # On other platforms, this would be the namespace where you deploy the exporter.
   namespace: gke-managed-system
   labels:
     # This label is critical. The ServiceMonitor uses this label to find this
     # specific Service. If the labels don't match, Prometheus will not be
     # able to discover the metrics endpoint.
+    # GKE-SPECIFIC: This value mirrors GKE's managed DCGM exporter name. For a
+    # custom deployment, use a generic value such as 'nvidia-dcgm-exporter'.
     app.kubernetes.io/name: gke-managed-dcgm-exporter
 spec:
   selector:
     # This selector tells the Service which pods to route traffic to.
     # It must match the labels on the DCGM exporter pods.
+    # GKE-SPECIFIC: This selector matches the labels on GKE's managed DCGM pods.
     app.kubernetes.io/name: gke-managed-dcgm-exporter
   ports:
     - # The 'name' of this port is important. The ServiceMonitor will specifically
diff --git a/ai/vllm-deployment/hpa/gpu-service-monitor.yaml b/ai/vllm-deployment/hpa/gpu-service-monitor.yaml
@@ -17,11 +17,17 @@ spec:
   # the labels on the 'gke-managed-dcgm-exporter' Service.
   selector:
     matchLabels:
+      # GKE-SPECIFIC: This label matches the Service for GKE's managed DCGM
+      # exporter. If you are using a different DCGM deployment, you must
+      # update this label to match the label of the corresponding Service.
       app.kubernetes.io/name: gke-managed-dcgm-exporter
   # This selector specifies which namespace to search for the target Service.
   # For GKE, the DCGM service is in 'gke-managed-system'.
   namespaceSelector:
     matchNames:
+    # GKE-SPECIFIC: This is the namespace for GKE's managed DCGM exporter.
+    # For other environments, this should be the namespace where you have
+    # deployed the DCGM exporter Service.
     - gke-managed-system
   endpoints:
   - port: metrics