# pods. The Prometheus Operator's ServiceMonitor will target this Service
# to discover and scrape the GPU metrics. This is especially important
# because the exporter pods are part of a DaemonSet, and their IPs can change.
#
# NOTE: This configuration is specific to GKE, which automatically deploys the
# DCGM exporter in the 'gke-managed-system' namespace. For other cloud
# providers or on-premise clusters, you would need to deploy your own DCGM
# exporter (e.g., via a Helm chart) and update this Service's 'namespace'
# and 'labels' to match your deployment.

apiVersion: v1
kind: Service
metadata:
  name: gke-managed-dcgm-exporter
  # GKE-SPECIFIC: GKE deploys its managed DCGM exporter in this namespace.
  # On other platforms, this would be the namespace where you deploy the exporter.
  namespace: gke-managed-system
  labels:
    # This label is critical. The ServiceMonitor uses this label to find this
    # specific Service. If the labels don't match, Prometheus will not be
    # able to discover the metrics endpoint.
    # GKE-SPECIFIC: This label is used by GKE's managed service. For a custom
    # deployment, you would use a more generic label like 'nvidia-dcgm-exporter'.
    app.kubernetes.io/name: gke-managed-dcgm-exporter
spec:
  selector:
    # This selector tells the Service which pods to route traffic to.
    # It must match the labels on the DCGM exporter pods.
    # GKE-SPECIFIC: This selector matches the labels on GKE's managed DCGM pods.
    app.kubernetes.io/name: gke-managed-dcgm-exporter
  ports:
    - # The 'name' of this port is important. The ServiceMonitor will specifically