Skip to content

Commit 0d36e2b

Browse files
authored
Add KWOK singular deployment for metrics instead of deployment template solution (#149)
* Add KWOK singular deployment for metrics instead of deployment template solution
* Simplify integration tests and setup scripts
* Add additional KWOK nodes to integration tests setup
1 parent b1e85a9 commit 0d36e2b

File tree

20 files changed

+698
-285
lines changed

20 files changed

+698
-285
lines changed

Dockerfile

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,9 @@ RUN --mount=type=cache,target=/root/.cache/go-build make build OS=$TARGETOS ARCH
2525

2626
FROM common-builder AS status-exporter-builder
2727
COPY ./cmd/status-exporter/ ./cmd/status-exporter/
28+
COPY ./cmd/status-exporter-kwok/ ./cmd/status-exporter-kwok/
2829
COPY ./internal/ ./internal/
29-
RUN --mount=type=cache,target=/root/.cache/go-build make build OS=$TARGETOS ARCH=$TARGETARCH COMPONENTS=status-exporter
30+
RUN --mount=type=cache,target=/root/.cache/go-build make build OS=$TARGETOS ARCH=$TARGETARCH COMPONENTS="status-exporter status-exporter-kwok"
3031

3132
FROM common-builder AS topology-server-builder
3233
COPY ./cmd/topology-server/ ./cmd/topology-server/
@@ -71,8 +72,14 @@ COPY --from=status-updater-builder /go/src/github.com/run-ai/fake-gpu-operator/b
7172
ENTRYPOINT ["/bin/status-updater"]
7273

7374
FROM ubuntu AS status-exporter
74-
COPY --from=status-exporter-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/status-exporter /bin/
75-
ENTRYPOINT ["/bin/status-exporter"]
75+
COPY --from=status-exporter-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/status-exporter /usr/local/bin/
76+
COPY --from=status-exporter-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/status-exporter-kwok /usr/local/bin/
77+
ENTRYPOINT ["/usr/local/bin/status-exporter"]
78+
79+
FROM ubuntu AS status-exporter-kwok
80+
COPY --from=status-exporter-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/status-exporter /usr/local/bin/
81+
COPY --from=status-exporter-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/status-exporter-kwok /usr/local/bin/
82+
ENTRYPOINT ["/usr/local/bin/status-exporter-kwok"]
7683

7784
FROM ubuntu AS topology-server
7885
COPY --from=topology-server-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/topology-server /bin/

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
BUILD_DIR=$(shell pwd)/bin
2-
COMPONENTS?=device-plugin dra-plugin-gpu status-updater kwok-gpu-device-plugin kwok-dra-plugin status-exporter topology-server mig-faker
2+
COMPONENTS?=device-plugin dra-plugin-gpu status-updater kwok-gpu-device-plugin kwok-dra-plugin status-exporter status-exporter-kwok topology-server mig-faker
33

44
DOCKER_REPO_BASE=ghcr.io/run-ai/fake-gpu-operator
55
DOCKER_TAG?=0.0.0-dev

cmd/status-exporter-kwok/main.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
package main
2+
3+
import (
4+
"github.com/run-ai/fake-gpu-operator/internal/common/app"
5+
status_exporter "github.com/run-ai/fake-gpu-operator/internal/status-exporter"
6+
)
7+
8+
func main() {
9+
appRunner := app.NewAppRunner(&status_exporter.KWOKStatusExporterApp{})
10+
appRunner.Run()
11+
}
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
{{- if .Values.statusExporter.enabled }}
# Single Deployment (not a per-node template) that runs the KWOK-specific
# status exporter binary; it serves metrics on port 9400 ("http").
# NOTE(review): labels reuse app=nvidia-dcgm-exporter; the extra
# component=status-exporter-kwok label in the selector keeps this Deployment
# from matching pods of the regular status exporter.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: nvidia-dcgm-exporter-kwok
  labels:
    app: nvidia-dcgm-exporter
    component: status-exporter-kwok
    app.kubernetes.io/name: nvidia-container-toolkit
spec:
  selector:
    matchLabels:
      app: nvidia-dcgm-exporter
      component: status-exporter-kwok
  replicas: 1
  template:
    metadata:
      labels:
        app: nvidia-dcgm-exporter
        component: status-exporter-kwok
        app.kubernetes.io/name: nvidia-container-toolkit
    spec:
      containers:
        - name: nvidia-dcgm-exporter-kwok
          # Same image as the regular status exporter; the KWOK behavior is
          # selected by invoking the status-exporter-kwok entrypoint below.
          image: "{{ .Values.statusExporter.image.repository }}:{{ .Values.statusExporter.image.tag | default .Chart.AppVersion }}"
          imagePullPolicy: "{{ .Values.statusExporter.image.pullPolicy }}"
          command: ["/usr/local/bin/status-exporter-kwok"]
          # Resources are configurable separately from the regular exporter
          # (see statusExporter.kwok.resources in values.yaml).
          resources:
            {{- toYaml .Values.statusExporter.kwok.resources | nindent 12 }}
          env:
            # Points the exporter at the topology ConfigMap in the release
            # namespace.
            - name: TOPOLOGY_CM_NAME
              value: topology
            - name: TOPOLOGY_CM_NAMESPACE
              value: "{{ .Release.Namespace }}"
          ports:
            - containerPort: 9400
              name: http
      restartPolicy: Always
      # Reuses the regular status-exporter ServiceAccount (node/label access).
      serviceAccountName: status-exporter
      imagePullSecrets:
        - name: gcr-secret
{{- end }}
43+

deploy/fake-gpu-operator/values.yaml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,15 @@ statusExporter:
5959
cpu: "200m"
6060
memory: "200Mi"
6161
topologyMaxExportInterval: 10s
62+
# If using many KWOK nodes, you may need to increase the resources for the KWOK status-exporter
63+
kwok:
64+
resources:
65+
requests:
66+
cpu: "50m"
67+
memory: "100Mi"
68+
limits:
69+
cpu: "500m"
70+
memory: "256Mi"
6271

6372
kwokGpuDevicePlugin:
6473
enabled: true

internal/status-exporter/export/labels/exporter.go

Lines changed: 1 addition & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@ package labels
33
import (
44
"fmt"
55
"log"
6-
"strconv"
76

87
"github.com/run-ai/fake-gpu-operator/internal/common/kubeclient"
98
"github.com/run-ai/fake-gpu-operator/internal/common/topology"
@@ -43,15 +42,7 @@ func (e *LabelsExporter) Run(stopCh <-chan struct{}) {
4342
}
4443

4544
func (e *LabelsExporter) export(nodeTopology *topology.NodeTopology) error {
46-
47-
labels := map[string]string{
48-
"nvidia.com/gpu.memory": strconv.Itoa(nodeTopology.GpuMemory),
49-
"nvidia.com/gpu.product": nodeTopology.GpuProduct,
50-
"nvidia.com/mig.strategy": nodeTopology.MigStrategy,
51-
"nvidia.com/gpu.count": strconv.Itoa(len(nodeTopology.Gpus)),
52-
"nvidia.com/gpu.present": "true",
53-
"run.ai/fake.gpu": "true",
54-
}
45+
labels := BuildNodeLabels(nodeTopology)
5546

5647
err := e.kubeclient.SetNodeLabels(labels)
5748
if err != nil {
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
package labels
2+
3+
import (
4+
"strconv"
5+
6+
"github.com/run-ai/fake-gpu-operator/internal/common/topology"
7+
)
8+
9+
// BuildNodeLabels creates the standard node labels from a topology
10+
func BuildNodeLabels(nodeTopology *topology.NodeTopology) map[string]string {
11+
return map[string]string{
12+
"nvidia.com/gpu.memory": strconv.Itoa(nodeTopology.GpuMemory),
13+
"nvidia.com/gpu.product": nodeTopology.GpuProduct,
14+
"nvidia.com/mig.strategy": nodeTopology.MigStrategy,
15+
"nvidia.com/gpu.count": strconv.Itoa(len(nodeTopology.Gpus)),
16+
"nvidia.com/gpu.present": "true",
17+
"run.ai/fake.gpu": "true",
18+
}
19+
}
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
package labels
2+
3+
import (
4+
"context"
5+
"fmt"
6+
"log"
7+
8+
"k8s.io/apimachinery/pkg/api/errors"
9+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
10+
"k8s.io/client-go/kubernetes"
11+
"k8s.io/client-go/util/retry"
12+
13+
"github.com/run-ai/fake-gpu-operator/internal/common/topology"
14+
"github.com/run-ai/fake-gpu-operator/internal/status-exporter/watch"
15+
)
16+
17+
// MultiNodeLabelsExporter exports labels for multiple KWOK nodes
18+
type MultiNodeLabelsExporter struct {
19+
kubeClient kubernetes.Interface
20+
}
21+
22+
var _ watch.LabelsExporter = &MultiNodeLabelsExporter{}
23+
24+
// NewMultiNodeLabelsExporter creates a new multi-node labels exporter
25+
func NewMultiNodeLabelsExporter(kubeClient kubernetes.Interface) *MultiNodeLabelsExporter {
26+
return &MultiNodeLabelsExporter{
27+
kubeClient: kubeClient,
28+
}
29+
}
30+
31+
// SetLabelsForNode exports labels for a specific node
32+
func (e *MultiNodeLabelsExporter) SetLabelsForNode(nodeName string, nodeTopology *topology.NodeTopology) error {
33+
labels := BuildNodeLabels(nodeTopology)
34+
35+
if err := e.setNodeLabels(nodeName, labels); err != nil {
36+
return fmt.Errorf("failed to set node labels for %s: %w", nodeName, err)
37+
}
38+
39+
log.Printf("Exported labels for KWOK node: %s\n", nodeName)
40+
return nil
41+
}
42+
43+
// setNodeLabels sets labels on a specific node with retry logic to handle conflicts
44+
func (e *MultiNodeLabelsExporter) setNodeLabels(nodeName string, labels map[string]string) error {
45+
log.Printf("Setting labels on KWOK node %s: %v\n", nodeName, labels)
46+
47+
// Retry on conflict errors (when node is being modified by KWOK stages)
48+
return retry.RetryOnConflict(retry.DefaultRetry, func() error {
49+
node, err := e.kubeClient.CoreV1().Nodes().Get(context.TODO(), nodeName, metav1.GetOptions{})
50+
if err != nil {
51+
if errors.IsNotFound(err) {
52+
log.Printf("Node %s not found (may have been deleted)\n", nodeName)
53+
return nil // Node deleted, don't retry
54+
}
55+
return err
56+
}
57+
58+
// Update labels
59+
for k, v := range labels {
60+
node.Labels[k] = v
61+
}
62+
63+
_, err = e.kubeClient.CoreV1().Nodes().Update(context.TODO(), node, metav1.UpdateOptions{})
64+
return err
65+
})
66+
}

internal/status-exporter/export/metrics/exporter.go

Lines changed: 1 addition & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,8 @@ import (
55
"fmt"
66
"log"
77
"net/http"
8-
"strconv"
98
"time"
109

11-
"github.com/prometheus/client_golang/prometheus"
1210
"github.com/prometheus/client_golang/prometheus/promhttp"
1311
"github.com/run-ai/fake-gpu-operator/internal/common/constants"
1412
"github.com/run-ai/fake-gpu-operator/internal/common/topology"
@@ -70,16 +68,7 @@ func (e *MetricsExporter) export(nodeTopology *topology.NodeTopology) error {
7068

7169
for gpuIdx, gpu := range nodeTopology.Gpus {
7270
log.Printf("Exporting metrics for node %v, gpu %v\n", nodeName, gpu.ID)
73-
labels := prometheus.Labels{
74-
"gpu": strconv.Itoa(gpuIdx),
75-
"UUID": gpu.ID,
76-
"device": "nvidia" + strconv.Itoa(gpuIdx),
77-
"modelName": nodeTopology.GpuProduct,
78-
"Hostname": generateFakeHostname(nodeName),
79-
"namespace": gpu.Status.AllocatedBy.Namespace,
80-
"pod": gpu.Status.AllocatedBy.Pod,
81-
"container": gpu.Status.AllocatedBy.Container,
82-
}
71+
labels := buildGpuMetricLabels(nodeName, gpuIdx, &gpu, nodeTopology)
8372

8473
utilization := gpu.Status.PodGpuUsageStatus.Utilization()
8574
fbUsed := gpu.Status.PodGpuUsageStatus.FbUsed(nodeTopology.GpuMemory)
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
package metrics
2+
3+
import (
4+
"strconv"
5+
6+
"github.com/prometheus/client_golang/prometheus"
7+
"github.com/run-ai/fake-gpu-operator/internal/common/topology"
8+
)
9+
10+
// buildGpuMetricLabels creates Prometheus labels for a GPU metric
11+
func buildGpuMetricLabels(nodeName string, gpuIdx int, gpu *topology.GpuDetails, nodeTopology *topology.NodeTopology) prometheus.Labels {
12+
return prometheus.Labels{
13+
"gpu": strconv.Itoa(gpuIdx),
14+
"UUID": gpu.ID,
15+
"device": "nvidia" + strconv.Itoa(gpuIdx),
16+
"modelName": nodeTopology.GpuProduct,
17+
"Hostname": generateFakeHostname(nodeName),
18+
"namespace": gpu.Status.AllocatedBy.Namespace,
19+
"pod": gpu.Status.AllocatedBy.Pod,
20+
"container": gpu.Status.AllocatedBy.Container,
21+
}
22+
}

0 commit comments

Comments
 (0)