Skip to content

Commit 22ae477

Browse files
authored
test: add manager logic to test controller's behavior (#113)
* test: add manager logic to test controller's behavior * test: add manager logic to test controller's behavior * test: add manager logic to test controller's behavior
1 parent 2bb9225 commit 22ae477

File tree

6 files changed

+343
-306
lines changed

6 files changed

+343
-306
lines changed

internal/controller/gpu_controller_test.go

Lines changed: 20 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -18,81 +18,40 @@ package controller
1818

1919
import (
2020
"context"
21-
"fmt"
2221

2322
tfv1 "github.com/NexusGPU/tensor-fusion/api/v1"
2423
"github.com/NexusGPU/tensor-fusion/internal/constants"
2524
. "github.com/onsi/ginkgo/v2"
2625
. "github.com/onsi/gomega"
27-
"k8s.io/apimachinery/pkg/api/errors"
2826
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
29-
"k8s.io/apimachinery/pkg/types"
3027
"k8s.io/client-go/kubernetes/scheme"
28+
"sigs.k8s.io/controller-runtime/pkg/client"
3129
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
32-
"sigs.k8s.io/controller-runtime/pkg/reconcile"
3330
)
3431

3532
var _ = Describe("GPU Controller", func() {
36-
Context("When reconciling a resource", func() {
37-
const resourceName = "test-resource"
38-
39-
ctx := context.Background()
40-
41-
typeNamespacedName := types.NamespacedName{
42-
Name: resourceName,
43-
Namespace: "default",
44-
}
45-
gpu := &tfv1.GPU{}
46-
gpunode := &tfv1.GPUNode{}
47-
48-
BeforeEach(func() {
49-
By("creating the custom resource for the Kind GPUNode")
50-
gpunode = &tfv1.GPUNode{
33+
Context("When reconciling a GPU resource", func() {
34+
It("Should add a specific label with pool name", func() {
35+
ctx := context.Background()
36+
pool := getMockGPUPool(ctx)
37+
gpunode := getMockGPUNode(ctx, "mock-node")
38+
By("creating the custom resource for the Kind GPU")
39+
key := client.ObjectKey{Name: "mock-gpu", Namespace: "default"}
40+
gpu := &tfv1.GPU{
5141
ObjectMeta: metav1.ObjectMeta{
52-
Name: resourceName + "-node",
53-
Labels: map[string]string{
54-
fmt.Sprintf(constants.GPUNodePoolIdentifierLabelFormat, "mock"): "true",
55-
},
42+
Name: key.Name,
43+
Namespace: key.Namespace,
5644
},
5745
}
58-
Expect(k8sClient.Create(ctx, gpunode)).To(Succeed())
59-
60-
By("creating the custom resource for the Kind GPU")
61-
err := k8sClient.Get(ctx, typeNamespacedName, gpu)
62-
if err != nil && errors.IsNotFound(err) {
63-
resource := &tfv1.GPU{
64-
ObjectMeta: metav1.ObjectMeta{
65-
Name: resourceName,
66-
Namespace: "default",
67-
},
68-
}
69-
Expect(controllerutil.SetControllerReference(gpunode, resource, scheme.Scheme)).To(Succeed())
70-
Expect(k8sClient.Create(ctx, resource)).To(Succeed())
71-
}
72-
})
73-
74-
AfterEach(func() {
75-
resource := &tfv1.GPU{}
76-
err := k8sClient.Get(ctx, typeNamespacedName, resource)
77-
Expect(err).NotTo(HaveOccurred())
78-
79-
By("Cleanup the specific resource instance GPU")
80-
Expect(k8sClient.Delete(ctx, resource)).To(Succeed())
81-
82-
By("Cleanup the specific resource instance GPUNode")
83-
Expect(k8sClient.Delete(ctx, gpunode)).To(Succeed())
84-
})
85-
It("should successfully reconcile the resource", func() {
86-
By("Reconciling the created resource")
87-
controllerReconciler := &GPUReconciler{
88-
Client: k8sClient,
89-
Scheme: k8sClient.Scheme(),
90-
}
91-
92-
_, err := controllerReconciler.Reconcile(ctx, reconcile.Request{
93-
NamespacedName: typeNamespacedName,
94-
})
95-
Expect(err).NotTo(HaveOccurred())
46+
Expect(controllerutil.SetControllerReference(gpunode, gpu, scheme.Scheme)).To(Succeed())
47+
Expect(k8sClient.Create(ctx, gpu)).To(Succeed())
48+
By("checking gpu lables")
49+
Eventually(func(g Gomega) {
50+
g.Expect(k8sClient.Get(ctx, key, gpu)).Should(Succeed())
51+
g.Expect(gpu.GetLabels()[constants.GpuPoolKey]).Should(Equal(pool.Name))
52+
}, timeout, interval).Should(Succeed())
53+
54+
Expect(k8sClient.Delete(ctx, gpu)).Should(Succeed())
9655
})
9756
})
9857
})

internal/controller/gpunode_controller_test.go

Lines changed: 50 additions & 111 deletions
Original file line numberDiff line numberDiff line change
@@ -18,133 +18,72 @@ package controller
1818

1919
import (
2020
"context"
21-
"encoding/json"
2221
"fmt"
23-
"time"
2422

25-
tfv1 "github.com/NexusGPU/tensor-fusion/api/v1"
26-
"github.com/NexusGPU/tensor-fusion/internal/constants"
2723
"github.com/NexusGPU/tensor-fusion/internal/utils"
2824
. "github.com/onsi/ginkgo/v2"
2925
. "github.com/onsi/gomega"
30-
"github.com/samber/lo"
3126
batchv1 "k8s.io/api/batch/v1"
3227
corev1 "k8s.io/api/core/v1"
33-
"k8s.io/apimachinery/pkg/api/errors"
34-
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3528
"k8s.io/apimachinery/pkg/types"
3629
"k8s.io/utils/ptr"
37-
"sigs.k8s.io/controller-runtime/pkg/reconcile"
3830
)
3931

4032
var _ = Describe("GPUNode Controller", func() {
41-
Context("When reconciling a resource", func() {
42-
const resourceName = "test-resource"
43-
44-
ctx := context.Background()
45-
46-
typeNamespacedName := types.NamespacedName{
47-
Name: resourceName,
48-
Namespace: "default",
49-
}
50-
gpunode := &tfv1.GPUNode{}
51-
BeforeEach(func() {
52-
By("creating the custom resource for the Kind GPUNode")
53-
err := k8sClient.Get(ctx, typeNamespacedName, gpunode)
54-
if err != nil && errors.IsNotFound(err) {
55-
resource := &tfv1.GPUNode{
56-
ObjectMeta: metav1.ObjectMeta{
57-
Name: resourceName,
58-
Namespace: "default",
59-
Labels: map[string]string{
60-
fmt.Sprintf(constants.GPUNodePoolIdentifierLabelFormat, "mock"): "true",
61-
},
62-
},
63-
}
64-
Expect(k8sClient.Create(ctx, resource)).To(Succeed())
65-
resource.Status.KubernetesNodeName = resource.Name
66-
resource.Status.Phase = tfv1.TensorFusionGPUNodePhaseRunning
67-
Expect(k8sClient.Status().Update(ctx, resource)).To(Succeed())
68-
}
69-
By("creating the core node")
70-
coreNode := &corev1.Node{
71-
ObjectMeta: metav1.ObjectMeta{
72-
Name: resourceName,
73-
},
74-
Spec: corev1.NodeSpec{},
75-
}
76-
Expect(k8sClient.Create(ctx, coreNode)).To(Succeed())
77-
})
78-
79-
AfterEach(func() {
80-
resource := &tfv1.GPUNode{}
81-
err := k8sClient.Get(ctx, typeNamespacedName, resource)
82-
Expect(err).NotTo(HaveOccurred())
83-
84-
By("Cleanup the specific resource instance GPUNode")
85-
Expect(k8sClient.Delete(ctx, resource)).To(Succeed())
86-
By("Cleanup the core node")
87-
coreNode := &corev1.Node{ObjectMeta: metav1.ObjectMeta{
88-
Name: resourceName,
89-
}}
90-
Expect(k8sClient.Delete(ctx, coreNode)).To(Succeed())
91-
})
92-
93-
It("should successfully reconcile the resource", func() {
94-
By("Reconciling the created resource")
95-
controllerReconciler := &GPUNodeReconciler{
96-
Client: k8sClient,
97-
Scheme: k8sClient.Scheme(),
98-
}
99-
100-
_, err := controllerReconciler.Reconcile(ctx, reconcile.Request{
101-
NamespacedName: typeNamespacedName,
102-
})
103-
Expect(err).NotTo(HaveOccurred())
104-
105-
By("Verify the finalizer is added")
106-
Expect(k8sClient.Get(ctx, typeNamespacedName, gpunode)).To(Succeed())
107-
Expect(gpunode.Finalizers).Should(ConsistOf(constants.Finalizer))
33+
Context("When reconciling a GPUNode", func() {
34+
It("should create the node discovery job and the hypervisor pod", func() {
35+
ctx := context.Background()
36+
gpuNode := getMockGPUNode(ctx, "mock-node")
37+
38+
By("checking that the node discovery job is created")
39+
Eventually(func(g Gomega) {
40+
job := &batchv1.Job{}
41+
g.Expect(k8sClient.Get(ctx, types.NamespacedName{
42+
Name: fmt.Sprintf("node-discovery-%s", gpuNode.Name),
43+
Namespace: utils.CurrentNamespace(),
44+
}, job)).Should(Succeed())
10845

109-
By("Verify the node discovery job is created")
110-
job := &batchv1.Job{}
111-
Expect(k8sClient.Get(ctx, types.NamespacedName{
112-
Name: fmt.Sprintf("node-discovery-%s", gpunode.Name),
113-
Namespace: utils.CurrentNamespace(),
114-
}, job)).To(Succeed())
115-
Expect(job.Spec.TTLSecondsAfterFinished).Should(Equal(ptr.To[int32](3600 * 10)))
46+
g.Expect(job.Spec.TTLSecondsAfterFinished).Should(Equal(ptr.To[int32](3600 * 10)))
47+
}, timeout, interval).Should(Succeed())
11648

117-
By("Verify the hypervisor pod is created")
49+
By("checking that the hypervisor pod is created")
11850
pod := &corev1.Pod{}
119-
Expect(k8sClient.Get(ctx, types.NamespacedName{
120-
Name: fmt.Sprintf("hypervisor-%s", gpunode.Name),
121-
Namespace: utils.CurrentNamespace(),
122-
}, pod)).To(Succeed())
123-
124-
By("Verify the hypervior pod recreated after hypervisor config change")
125-
pool := &tfv1.GPUPool{}
126-
Expect(k8sClient.Get(ctx, types.NamespacedName{
127-
Name: "mock",
128-
Namespace: "default",
129-
}, pool)).To(Succeed())
130-
podTmpl := &corev1.PodTemplate{}
131-
err = json.Unmarshal(pool.Spec.ComponentConfig.Hypervisor.PodTemplate.Raw, podTmpl)
132-
Expect(err).NotTo(HaveOccurred())
133-
podTmpl.Template.Spec.Containers[0].Name = "foo"
134-
pool.Spec.ComponentConfig.Hypervisor.PodTemplate.Raw = lo.Must(json.Marshal(podTmpl))
135-
Expect(k8sClient.Update(ctx, pool)).To(Succeed())
136-
Eventually(func() string {
137-
_, err = controllerReconciler.Reconcile(ctx, reconcile.Request{
138-
NamespacedName: typeNamespacedName,
139-
})
140-
if err = k8sClient.Get(ctx, types.NamespacedName{
141-
Name: fmt.Sprintf("hypervisor-%s", gpunode.Name),
51+
Eventually(func() error {
52+
return k8sClient.Get(ctx, types.NamespacedName{
53+
Name: fmt.Sprintf("hypervisor-%s", gpuNode.Name),
54+
Namespace: utils.CurrentNamespace(),
55+
}, pod)
56+
}, timeout, interval).Should(Succeed())
57+
58+
By("checking that it will recreate terminated hypervisor pod")
59+
Expect(k8sClient.Delete(ctx, pod)).Should(Succeed())
60+
Eventually(func() error {
61+
return k8sClient.Get(ctx, types.NamespacedName{
62+
Name: fmt.Sprintf("hypervisor-%s", gpuNode.Name),
14263
Namespace: utils.CurrentNamespace(),
143-
}, pod); err != nil {
144-
return ""
145-
}
146-
return pod.Spec.Containers[0].Name
147-
}, 5*time.Second, time.Second).Should(Equal("foo"))
64+
}, pod)
65+
}, timeout, interval).Should(Succeed())
66+
67+
// TODO: make this test pass once rolling update is implemented
68+
// By("checking that the hypervisor config changed")
69+
// tfc := getMockCluster(ctx)
70+
// hypervisor := tfc.Spec.GPUPools[0].SpecTemplate.ComponentConfig.Hypervisor
71+
// podTmpl := &corev1.PodTemplate{}
72+
// err := json.Unmarshal(hypervisor.PodTemplate.Raw, podTmpl)
73+
// Expect(err).NotTo(HaveOccurred())
74+
// podTmpl.Template.Spec.Containers[0].Name = "foo"
75+
// hypervisor.PodTemplate.Raw = lo.Must(json.Marshal(podTmpl))
76+
// Expect(k8sClient.Update(ctx, tfc)).To(Succeed())
77+
// Eventually(func() string {
78+
// pod := &corev1.Pod{}
79+
// if err = k8sClient.Get(ctx, types.NamespacedName{
80+
// Name: fmt.Sprintf("hypervisor-%s", gpuNode.Name),
81+
// Namespace: utils.CurrentNamespace(),
82+
// }, pod); err != nil {
83+
// return ""
84+
// }
85+
// return pod.Spec.Containers[0].Name
86+
// }, timeout, interval).Should(Equal("foo"))
14887
})
14988
})
15089
})

internal/controller/gpupool_controller_test.go

Lines changed: 6 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -21,86 +21,18 @@ import (
2121

2222
. "github.com/onsi/ginkgo/v2"
2323
. "github.com/onsi/gomega"
24-
corev1 "k8s.io/api/core/v1"
25-
"k8s.io/apimachinery/pkg/api/errors"
26-
"k8s.io/apimachinery/pkg/types"
27-
28-
"sigs.k8s.io/controller-runtime/pkg/reconcile"
2924

3025
tfv1 "github.com/NexusGPU/tensor-fusion/api/v1"
31-
"github.com/NexusGPU/tensor-fusion/internal/config"
32-
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3326
)
3427

3528
var _ = Describe("GPUPool Controller", func() {
3629
Context("When reconciling a resource", func() {
37-
const resourceName = "test-resource"
38-
39-
ctx := context.Background()
40-
41-
typeNamespacedName := types.NamespacedName{
42-
Name: resourceName,
43-
Namespace: "default",
44-
}
45-
nodeNamespacedName := types.NamespacedName{
46-
Name: "test-node",
47-
}
48-
gpupool := &tfv1.GPUPool{}
49-
50-
BeforeEach(func() {
51-
By("creating the custom resource for the Kind GPUPool")
52-
err := k8sClient.Get(ctx, typeNamespacedName, gpupool)
53-
if err != nil && errors.IsNotFound(err) {
54-
resource := &tfv1.GPUPool{
55-
ObjectMeta: metav1.ObjectMeta{
56-
Name: resourceName,
57-
Namespace: "default",
58-
},
59-
Spec: *config.MockGPUPoolSpec,
60-
}
61-
Expect(k8sClient.Create(ctx, resource)).To(Succeed())
62-
}
63-
64-
node := &corev1.Node{
65-
ObjectMeta: metav1.ObjectMeta{
66-
Name: nodeNamespacedName.Name,
67-
Labels: map[string]string{
68-
"mock-label": "true",
69-
},
70-
},
71-
Spec: corev1.NodeSpec{},
72-
}
73-
Expect(k8sClient.Create(ctx, node)).To(Succeed())
74-
})
75-
76-
AfterEach(func() {
77-
resource := &tfv1.GPUPool{}
78-
err := k8sClient.Get(ctx, typeNamespacedName, resource)
79-
Expect(err).NotTo(HaveOccurred())
80-
81-
By("Cleanup the specific resource instance GPUPool")
82-
Expect(k8sClient.Delete(ctx, resource)).To(Succeed())
83-
84-
Expect(k8sClient.Delete(ctx, &corev1.Node{
85-
ObjectMeta: metav1.ObjectMeta{
86-
Name: nodeNamespacedName.Name,
87-
},
88-
})).To(Succeed())
89-
90-
})
91-
92-
It("should successfully reconcile the resource", func() {
93-
By("Reconciling the created resource")
94-
controllerReconciler := &GPUPoolReconciler{
95-
Client: k8sClient,
96-
Scheme: k8sClient.Scheme(),
97-
}
98-
99-
_, err := controllerReconciler.Reconcile(ctx, reconcile.Request{
100-
NamespacedName: typeNamespacedName,
101-
})
102-
Expect(err).NotTo(HaveOccurred())
103-
30+
It("Should update status when nodes ready", func() {
31+
ctx := context.Background()
32+
Eventually(func(g Gomega) {
33+
pool := getMockGPUPool(ctx)
34+
g.Expect(pool.Status.Phase).Should(Equal(tfv1.TensorFusionPoolPhaseRunning))
35+
}, timeout, interval).Should(Succeed())
10436
})
10537
})
10638
})

0 commit comments

Comments
 (0)