feat: add test for tensorfusion connection controller & fix deployment of controller (#29)

0x5457 · web-flow · commit 36451d4b1f66 · 2025-01-24T11:43:29.000+08:00
diff --git a/charts/tensor-fusion/templates/controller-deployment.yaml b/charts/tensor-fusion/templates/controller-deployment.yaml
@@ -45,8 +45,6 @@ spec:
             - name: cert
               readOnly: true
               mountPath: /tmp/k8s-webhook-server/serving-certs
-            - name: config
-              mountPath: /etc/tensor-fusion
         - name: vector
           image: docker.io/timberio/vector:nightly-2025-01-07-debian
           env:
@@ -74,10 +72,6 @@ spec:
                 path: tls.crt
               - key: key
                 path: tls.key
-        - name: config
-          configMap:
-            name: {{ include "tensor-fusion.fullname" . }}-config
-            defaultMode: 420
         - name: vector-config
           configMap:
             name: {{ include "tensor-fusion.fullname" . }}-vector-config
diff --git a/internal/controller/tensorfusionconnection_controller.go b/internal/controller/tensorfusionconnection_controller.go
@@ -173,7 +173,10 @@ func (r *TensorFusionConnectionReconciler) tryStartWorker(
 		if errors.IsNotFound(err) {
 			// Pod doesn't exist, create a new one
 			port := workerGenerator.AllocPort()
-			pod = workerGenerator.GenerateWorkerPod(gpu, connection, namespacedName, port)
+			pod, err = workerGenerator.GenerateWorkerPod(gpu, connection, namespacedName, port)
+			if err != nil {
+				return nil, fmt.Errorf("generate worker pod %w", err)
+			}
 			if err := ctrl.SetControllerReference(connection, pod, r.Scheme); err != nil {
 				return nil, fmt.Errorf("set owner reference %w", err)
 			}
diff --git a/internal/controller/tensorfusionconnection_controller_test.go b/internal/controller/tensorfusionconnection_controller_test.go
@@ -21,14 +21,18 @@ import (
 
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
+	corev1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/errors"
+	"k8s.io/apimachinery/pkg/api/resource"
 	"k8s.io/apimachinery/pkg/types"
 	"sigs.k8s.io/controller-runtime/pkg/reconcile"
 
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 
 	tfv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1"
 	"github.com/NexusGPU/tensor-fusion-operator/internal/config"
+	"github.com/NexusGPU/tensor-fusion-operator/internal/constants"
+	"github.com/NexusGPU/tensor-fusion-operator/internal/scheduler"
 )
 
 var _ = Describe("TensorFusionConnection Controller", func() {
@@ -39,27 +43,61 @@ var _ = Describe("TensorFusionConnection Controller", func() {
 
 		typeNamespacedName := types.NamespacedName{
 			Name:      resourceName,
-			Namespace: "default", // TODO(user):Modify as needed
+			Namespace: "default",
+		}
+		scheduler := scheduler.NewNaiveScheduler()
+		gpu := &tfv1.GPU{
+			ObjectMeta: metav1.ObjectMeta{
+				Name: "mock-gpu",
+			},
 		}
-		tensorfusionconnection := &tfv1.TensorFusionConnection{}
-
 		BeforeEach(func() {
+			connection := &tfv1.TensorFusionConnection{}
 			By("creating the custom resource for the Kind TensorFusionConnection")
-			err := k8sClient.Get(ctx, typeNamespacedName, tensorfusionconnection)
+			err := k8sClient.Get(ctx, typeNamespacedName, connection)
 			if err != nil && errors.IsNotFound(err) {
 				resource := &tfv1.TensorFusionConnection{
 					ObjectMeta: metav1.ObjectMeta{
 						Name:      resourceName,
 						Namespace: "default",
 					},
-					// TODO(user): Specify other spec details if needed.
+					Spec: tfv1.TensorFusionConnectionSpec{
+						PoolName: "mock",
+						Resources: tfv1.Resources{
+							Requests: tfv1.Resource{
+								Tflops: resource.MustParse("1"),
+								Vram:   resource.MustParse("1Gi"),
+							},
+							Limits: tfv1.Resource{
+								Tflops: resource.MustParse("1"),
+								Vram:   resource.MustParse("1Gi"),
+							},
+						},
+					},
 				}
 				Expect(k8sClient.Create(ctx, resource)).To(Succeed())
 			}
+
+			scheduler.OnAdd(gpu)
+			Expect(k8sClient.Create(ctx, gpu)).To(Succeed())
+			gpu.Status = tfv1.GPUStatus{
+				UUID: "mock-gpu",
+				NodeSelector: map[string]string{
+					"kubernetes.io/hostname": "mock-node",
+				},
+				Capacity: tfv1.Resource{
+					Tflops: resource.MustParse("2"),
+					Vram:   resource.MustParse("2Gi"),
+				},
+				Available: tfv1.Resource{
+					Tflops: resource.MustParse("2"),
+					Vram:   resource.MustParse("2Gi"),
+				},
+			}
+			Expect(k8sClient.Status().Update(ctx, gpu)).To(Succeed())
 		})
 
 		AfterEach(func() {
-			// TODO(user): Cleanup logic after each test, like removing the resource instance.
 			resource := &tfv1.TensorFusionConnection{}
 			err := k8sClient.Get(ctx, typeNamespacedName, resource)
 			Expect(err).NotTo(HaveOccurred())
@@ -74,13 +112,29 @@ var _ = Describe("TensorFusionConnection Controller", func() {
 				Client:       k8sClient,
 				Scheme:       k8sClient.Scheme(),
 				GpuPoolState: gpuPoolState,
+				Scheduler:    scheduler,
 			}
 			_, err := controllerReconciler.Reconcile(ctx, reconcile.Request{
 				NamespacedName: typeNamespacedName,
 			})
 			Expect(err).NotTo(HaveOccurred())
-			// TODO(user): Add more specific assertions depending on your controller's reconciliation logic.
-			// Example: If you expect a certain status condition after reconciliation, verify it here.
+			connection := &tfv1.TensorFusionConnection{}
+			Expect(k8sClient.Get(ctx, typeNamespacedName, connection)).NotTo(HaveOccurred())
+			Expect(connection.Finalizers).Should(ConsistOf(constants.Finalizer))
+			_, err = controllerReconciler.Reconcile(ctx, reconcile.Request{
+				NamespacedName: typeNamespacedName,
+			})
+			Expect(err).NotTo(HaveOccurred())
+			Expect(k8sClient.Get(ctx, typeNamespacedName, connection)).NotTo(HaveOccurred())
+			Expect(connection.Status.Phase).To(Equal(tfv1.TensorFusionConnectionStarting))
+
+			workerPod := &corev1.Pod{}
+			Expect(k8sClient.Get(ctx, typeNamespacedName, workerPod)).NotTo(HaveOccurred())
+			Expect(workerPod.Spec.NodeSelector).To(Equal(gpu.Status.NodeSelector))
+
+			Expect(k8sClient.Get(ctx, types.NamespacedName{Name: "mock-gpu"}, gpu)).NotTo(HaveOccurred())
+			Expect(gpu.Status.Available.Tflops).To(Equal(resource.MustParse("1")))
+			Expect(gpu.Status.Available.Vram).To(Equal(resource.MustParse("1Gi")))
 		})
 	})
 })
diff --git a/internal/webhook/v1/pod_webhook.go b/internal/webhook/v1/pod_webhook.go
@@ -192,7 +192,7 @@ func (m *TensorFusionPodMutator) patchTFClient(pod *corev1.Pod, clientConfig *tf
 	// Convert the current pod to JSON
 	currentBytes, err := json.Marshal(pod)
 	if err != nil {
-		return nil, fmt.Errorf("marshal current pod: %v", err)
+		return nil, fmt.Errorf("marshal current pod: %w", err)
 	}
 
 	// Patch to Container
@@ -203,19 +203,19 @@ func (m *TensorFusionPodMutator) patchTFClient(pod *corev1.Pod, clientConfig *tf
 				// patch from config
 				containerJSON, err := json.Marshal(container)
 				if err != nil {
-					return nil, fmt.Errorf("marshal container: %v", err)
+					return nil, fmt.Errorf("marshal container: %w", err)
 				}
 				patchJSON, err := json.Marshal(clientConfig.PatchToContainer)
 				if err != nil {
-					return nil, fmt.Errorf("marshal patchToContainer: %v", err)
+					return nil, fmt.Errorf("marshal patchToContainer: %w", err)
 				}
 
 				patchedJSON, err := strategicpatch.StrategicMergePatch(containerJSON, patchJSON, corev1.Container{})
 				if err != nil {
-					return nil, fmt.Errorf("apply strategic merge patch to container: %v", err)
+					return nil, fmt.Errorf("apply strategic merge patch to container: %w", err)
 				}
 				if err := json.Unmarshal(patchedJSON, container); err != nil {
-					return nil, fmt.Errorf("unmarshal patched container: %v", err)
+					return nil, fmt.Errorf("unmarshal patched container: %w", err)
 				}
 
 				// add connection env
@@ -240,35 +240,35 @@ func (m *TensorFusionPodMutator) patchTFClient(pod *corev1.Pod, clientConfig *tf
 
 	containerPatchedJSON, err := json.Marshal(pod)
 	if err != nil {
-		return nil, fmt.Errorf("marshal current pod: %v", err)
+		return nil, fmt.Errorf("marshal current pod: %w", err)
 	}
 	patches, err := jsonpatch.CreatePatch(currentBytes, containerPatchedJSON)
 	if err != nil {
-		return nil, fmt.Errorf("patch to container: %v", err)
+		return nil, fmt.Errorf("patch to container: %w", err)
 	}
 
 	// Convert the strategic merge patch to JSON
 	patchBytes, err := json.Marshal(clientConfig.PatchToPod)
 
 	if err != nil {
-		return nil, fmt.Errorf("marshal patch: %v", err)
+		return nil, fmt.Errorf("marshal patch: %w", err)
 	}
 
 	// Apply the strategic merge patch
 	resultBytes, err := strategicpatch.StrategicMergePatch(currentBytes, patchBytes, corev1.Pod{})
 	if err != nil {
-		return nil, fmt.Errorf("apply strategic merge patch: %v", err)
+		return nil, fmt.Errorf("apply strategic merge patch: %w", err)
 	}
 
 	// Generate JSON patch operations by comparing original and patched pod
 	strategicpatches, err := jsonpatch.CreatePatch(currentBytes, resultBytes)
 	if err != nil {
-		return nil, fmt.Errorf("create json patch: %v", err)
+		return nil, fmt.Errorf("create json patch: %w", err)
 	}
 
 	// Unmarshal the result back into the pod
 	if err := json.Unmarshal(resultBytes, pod); err != nil {
-		return nil, fmt.Errorf("unmarshal patched pod: %v", err)
+		return nil, fmt.Errorf("unmarshal patched pod: %w", err)
 	}
 
 	patches = append(patches, strategicpatches...)
diff --git a/internal/worker/worker.go b/internal/worker/worker.go
@@ -1,6 +1,7 @@
 package worker
 
 import (
+	"encoding/json"
 	"fmt"
 	"strconv"
 	"time"
@@ -44,8 +45,13 @@ func (wg *WorkerGenerator) GenerateWorkerPod(
 	connection *tfv1.TensorFusionConnection,
 	namespacedName types.NamespacedName,
 	port int,
-) *corev1.Pod {
-	spec := wg.WorkerConfig.PodTemplate.Object.(*corev1.PodTemplate).Template.Spec.DeepCopy()
+) (*corev1.Pod, error) {
+	podTmpl := &corev1.PodTemplate{}
+	err := json.Unmarshal(wg.WorkerConfig.PodTemplate.Raw, podTmpl)
+	if err != nil {
+		return nil, fmt.Errorf("failed to unmarshal pod template: %w", err)
+	}
+	spec := podTmpl.Template.Spec
 	if spec.NodeSelector == nil {
 		spec.NodeSelector = make(map[string]string)
 	}
@@ -64,6 +70,6 @@ func (wg *WorkerGenerator) GenerateWorkerPod(
 			Name:      namespacedName.Name,
 			Namespace: namespacedName.Namespace,
 		},
-		Spec: *spec,
-	}
+		Spec: spec,
+	}, nil
 }