Skip to content

Commit 60d28e9

Browse files
authored
fix: recreate hypervisor pod after eviction (#173)
* fix: recreate hypervisor pod after eviction
* chore: disable the default 10-minute timeout limit for test
1 parent 1c707d6 commit 60d28e9

File tree

4 files changed

+21
-33
lines changed

4 files changed

+21
-33
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ vet: ## Run go vet against code.
6262

6363
.PHONY: test
6464
test: manifests generate fmt vet envtest ## Run tests.
65-
KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" go test $$(go list ./... | grep -v /e2e) -coverprofile cover.out
65+
KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" go test $$(go list ./... | grep -v /e2e) -timeout 0 -coverprofile cover.out
6666

6767
# TODO(user): To use a different vendor for e2e tests, modify the setup under 'tests/e2e'.
6868
# The default setup assumes Kind is pre-installed and builds/loads the Manager Docker image locally.

internal/controller/gpunode_controller.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -393,7 +393,8 @@ func (r *GPUNodeReconciler) reconcileHypervisorPod(ctx context.Context, node *tf
393393
return key.Name, nil
394394
}
395395

396-
if currentPod.Labels[constants.LabelKeyPodTemplateHash] != utils.GetObjectHash(pool.Spec.ComponentConfig.Hypervisor) {
396+
if utils.IsPodTerminated(currentPod) ||
397+
currentPod.Labels[constants.LabelKeyPodTemplateHash] != utils.GetObjectHash(pool.Spec.ComponentConfig.Hypervisor) {
397398
if err := r.Delete(ctx, currentPod); err != nil {
398399
return "", fmt.Errorf("failed to delete old hypervisor pod: %w", err)
399400
}

internal/controller/gpunode_controller_test.go

Lines changed: 14 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -55,8 +55,8 @@ var _ = Describe("GPUNode Controller", func() {
5555
}, timeout, interval).Should(Succeed())
5656

5757
By("checking that the hypervisor pod is created")
58+
pod := &corev1.Pod{}
5859
Eventually(func(g Gomega) {
59-
pod := &corev1.Pod{}
6060
err := k8sClient.Get(ctx, types.NamespacedName{
6161
Name: fmt.Sprintf("hypervisor-%s", gpuNode.Name),
6262
Namespace: utils.CurrentNamespace(),
@@ -71,37 +71,20 @@ var _ = Describe("GPUNode Controller", func() {
7171
g.Expect(gpunode.Status.Phase).Should(Equal(tfv1.TensorFusionGPUNodePhaseRunning))
7272
}, timeout, interval).Should(Succeed())
7373

74-
tfEnv.Cleanup()
75-
76-
// By("checking that it will recreate terminated hypervisor pod")
77-
// Expect(k8sClient.Delete(ctx, pod)).Should(Succeed())
78-
// Eventually(func() error {
79-
// return k8sClient.Get(ctx, types.NamespacedName{
80-
// Name: fmt.Sprintf("hypervisor-%s", gpuNode.Name),
81-
// Namespace: utils.CurrentNamespace(),
82-
// }, pod)
83-
// }, timeout, interval).Should(Succeed())
74+
By("checking the hypervisor pod should be recreated when enters terminated status")
75+
pod.Status.Phase = corev1.PodFailed
76+
Expect(k8sClient.Status().Update(ctx, pod)).Should(Succeed())
77+
Eventually(func(g Gomega) {
78+
newPod := &corev1.Pod{}
79+
err := k8sClient.Get(ctx, types.NamespacedName{
80+
Name: fmt.Sprintf("hypervisor-%s", gpuNode.Name),
81+
Namespace: utils.CurrentNamespace(),
82+
}, newPod)
83+
g.Expect(err).ShouldNot(HaveOccurred())
84+
g.Expect(newPod.UID).ShouldNot(Equal(pod.UID))
85+
}, timeout, interval).Should(Succeed())
8486

85-
// TODO: make this test pass when implement rolling udpate
86-
// By("checking that the hypervisor config changed")
87-
// tfc := getMockCluster(ctx)
88-
// hypervisor := tfc.Spec.GPUPools[0].SpecTemplate.ComponentConfig.Hypervisor
89-
// podTmpl := &corev1.PodTemplate{}
90-
// err := json.Unmarshal(hypervisor.PodTemplate.Raw, podTmpl)
91-
// Expect(err).NotTo(HaveOccurred())
92-
// podTmpl.Template.Spec.Containers[0].Name = "foo"
93-
// hypervisor.PodTemplate.Raw = lo.Must(json.Marshal(podTmpl))
94-
// Expect(k8sClient.Update(ctx, tfc)).To(Succeed())
95-
// Eventually(func() string {
96-
// pod := &corev1.Pod{}
97-
// if err = k8sClient.Get(ctx, types.NamespacedName{
98-
// Name: fmt.Sprintf("hypervisor-%s", gpuNode.Name),
99-
// Namespace: utils.CurrentNamespace(),
100-
// }, pod); err != nil {
101-
// return ""
102-
// }
103-
// return pod.Spec.Containers[0].Name
104-
// }, timeout, interval).Should(Equal("foo"))
87+
tfEnv.Cleanup()
10588
})
10689
})
10790
})

internal/utils/reconcile.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -175,3 +175,7 @@ func IsPodConditionTrue(conditions []corev1.PodCondition, conditionType corev1.P
175175
}
176176
return false
177177
}
178+
179+
func IsPodTerminated(pod *corev1.Pod) bool {
180+
return pod.Status.Phase == corev1.PodFailed || pod.Status.Phase == corev1.PodSucceeded
181+
}

0 commit comments

Comments
 (0)