Skip to content

Commit e2fee94

Browse files
committed
add node labels and integration test support for compute-domain-dra-plugin
- Add computeDomainDevicePluginLabelKey to status-updater node labeling
- Add compute-domain-dra-plugin to integration test setup
- Enable computeDomainDraPlugin in integration test values
1 parent c2e605b commit e2fee94

File tree

3 files changed

+20
-8
lines changed

3 files changed

+20
-8
lines changed

internal/status-updater/handlers/node/labels.go

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,10 @@ import (
1212
)
1313

1414
const (
15-
dcgmExporterLabelKey = "nvidia.com/gpu.deploy.dcgm-exporter"
16-
devicePluginLabelKey = "nvidia.com/gpu.deploy.device-plugin"
17-
draPluginGpuLabelKey = "nvidia.com/gpu.deploy.dra-plugin-gpu"
15+
dcgmExporterLabelKey = "nvidia.com/gpu.deploy.dcgm-exporter"
16+
devicePluginLabelKey = "nvidia.com/gpu.deploy.device-plugin"
17+
draPluginGpuLabelKey = "nvidia.com/gpu.deploy.dra-plugin-gpu"
18+
computeDomainDevicePluginLabelKey = "nvidia.com/gpu.deploy.compute-domain-dra-plugin"
1819
)
1920

2021
// labelNode labels the node with required labels for the fake-gpu-operator to function.
@@ -25,6 +26,7 @@ func (p *NodeHandler) labelNode(node *v1.Node) error {
2526
if !isFakeNode(node) {
2627
labels[devicePluginLabelKey] = "true"
2728
labels[draPluginGpuLabelKey] = "true"
29+
labels[computeDomainDevicePluginLabelKey] = "true"
2830
}
2931

3032
err := p.patchNodeLabels(node, labels)
@@ -38,9 +40,10 @@ func (p *NodeHandler) labelNode(node *v1.Node) error {
3840
// unlabelNode removes the labels from the node that were added by the fake-gpu-operator.
3941
func (p *NodeHandler) unlabelNode(node *v1.Node) error {
4042
err := p.patchNodeLabels(node, map[string]interface{}{
41-
dcgmExporterLabelKey: nil,
42-
devicePluginLabelKey: nil,
43-
draPluginGpuLabelKey: nil,
43+
dcgmExporterLabelKey: nil,
44+
devicePluginLabelKey: nil,
45+
draPluginGpuLabelKey: nil,
46+
computeDomainDevicePluginLabelKey: nil,
4447
})
4548
if err != nil && !errors.IsNotFound(err) {
4649
return fmt.Errorf("failed to unlabel node %s: %w", node.Name, err)

test/integration/setup.sh

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ if [[ "${SKIP_SETUP}" != "true" ]]; then
5353

5454
echo "Loading images into kind cluster..."
5555
DOCKER_REPO_BASE="${DOCKER_REPO_BASE:-ghcr.io/run-ai/fake-gpu-operator}"
56-
for component in dra-plugin-gpu status-updater status-exporter topology-server kwok-dra-plugin compute-domain-controller; do
56+
for component in dra-plugin-gpu status-updater status-exporter topology-server kwok-dra-plugin compute-domain-controller compute-domain-dra-plugin; do
5757
IMAGE="${DOCKER_REPO_BASE}/${component}:${DOCKER_TAG}"
5858
echo "Loading ${IMAGE}..."
5959
kind load docker-image \
@@ -82,7 +82,8 @@ if [[ "${SKIP_SETUP}" != "true" ]]; then
8282
--set statusExporter.image.tag="${DOCKER_TAG}" \
8383
--set topologyServer.image.tag="${DOCKER_TAG}" \
8484
--set kwokDraPlugin.image.tag="${DOCKER_TAG}" \
85-
--set computeDomainController.image.tag="${DOCKER_TAG}"
85+
--set computeDomainController.image.tag="${DOCKER_TAG}" \
86+
--set computeDomainDraPlugin.image.tag="${DOCKER_TAG}"
8687

8788
echo "Waiting for status-updater pod to be ready..."
8889
kubectl wait --for=condition=Ready pod -l app=status-updater -n gpu-operator --timeout=120s
@@ -99,6 +100,9 @@ if [[ "${SKIP_SETUP}" != "true" ]]; then
99100
echo "Waiting for kwok-dra-plugin pod to be ready..."
100101
kubectl wait --for=condition=Ready pod -l app=kwok-dra-plugin -n gpu-operator --timeout=120s
101102

103+
echo "Waiting for compute-domain-dra-plugin daemonset to be ready..."
104+
kubectl wait --for=condition=Ready pod -l app=compute-domain-dra-plugin -n gpu-operator --timeout=120s
105+
102106
# Install KWOK controller for simulated nodes
103107
echo "Installing KWOK controller..."
104108
KWOK_VERSION="${KWOK_VERSION:-v0.7.0}"

test/integration/values.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,11 @@ computeDomainController:
5757
image:
5858
pullPolicy: Never
5959

60+
computeDomainDraPlugin:
61+
enabled: true
62+
image:
63+
pullPolicy: Never
64+
6065
# GPU topology configuration
6166
# Status-updater will use this to create topology ConfigMaps
6267
topology:

0 commit comments

Comments
 (0)