
Commit f717d99

feat: Add pod template hash and Add logic to detect and replace pods … (#70)
* feat: Add pod template hash and Add logic to detect and replace pods when resource specifications change
* fix: worker pod with the infix `tf-worker`
1 parent abc3bc8 commit f717d99

File tree

8 files changed: +211 -21 lines changed

api/v1/tensorfusionworkload_types.go
charts/tensor-fusion/crds/tensor-fusion.ai_tensorfusionworkloads.yaml
config/crd/bases/tensor-fusion.ai_tensorfusionworkloads.yaml
internal/constants/constants.go
internal/controller/tensorfusionworkload_controller.go
internal/controller/tensorfusionworkload_controller_test.go
internal/utils/reconcile.go
internal/worker/worker.go

api/v1/tensorfusionworkload_types.go
Lines changed: 2 additions & 0 deletions

@@ -62,6 +62,8 @@ type TensorFusionWorkloadStatus struct {
 	ReadyReplicas int32 `json:"readyReplicas,omitempty"`

 	WorkerStatuses []WorkerStatus `json:"workerStatuses,omitempty"`
+
+	PodTemplateHash string `json:"podTemplateHash,omitempty"`
 }

 // +kubebuilder:object:root=true

charts/tensor-fusion/crds/tensor-fusion.ai_tensorfusionworkloads.yaml
Lines changed: 2 additions & 0 deletions

@@ -103,6 +103,8 @@ spec:
       description: TensorFusionWorkloadStatus defines the observed state of
         TensorFusionWorkload.
       properties:
+        podTemplateHash:
+          type: string
         readyReplicas:
           description: readyReplicas is the number of pods created for this
             Workload with a Ready Condition.

config/crd/bases/tensor-fusion.ai_tensorfusionworkloads.yaml
Lines changed: 2 additions & 0 deletions

@@ -103,6 +103,8 @@ spec:
       description: TensorFusionWorkloadStatus defines the observed state of
         TensorFusionWorkload.
       properties:
+        podTemplateHash:
+          type: string
         readyReplicas:
           description: readyReplicas is the number of pods created for this
             Workload with a Ready Condition.

internal/constants/constants.go
Lines changed: 7 additions & 6 deletions

@@ -10,9 +10,10 @@ const (
 	FinalizerSuffix = "finalizer"
 	Finalizer       = Domain + "/" + FinalizerSuffix

-	LabelKeyOwner        = Domain + "/managed-by"
-	LabelKeyClusterOwner = Domain + "/cluster"
-	LabelKeyNodeClass    = Domain + "/node-class"
+	LabelKeyOwner           = Domain + "/managed-by"
+	LabelKeyClusterOwner    = Domain + "/cluster"
+	LabelKeyNodeClass       = Domain + "/node-class"
+	LabelKeyPodTemplateHash = Domain + "/pod-template-hash"

 	GPUNodePoolIdentifierLabelPrefix = Domain + "/pool-"
 	GPUNodePoolIdentifierLabelFormat = Domain + "/pool-%s"
@@ -80,7 +81,7 @@ const (
 const (
 	// No disrupt label, similar to Karpenter, avoid TFConnection/Worker/GPUNode to be moved to another node or destroying node.
 	// Refer: https://karpenter.sh/docs/concepts/disruption/
-	SchedulingDoNotDisruptLabel = "tensor-fusion.ai/do-not-disrupt"
+	SchedulingDoNotDisruptLabel = Domain + "/do-not-disrupt"
 )

 const (
@@ -91,7 +92,7 @@ const (

 // To match GPUNode with K8S node, when creating from cloud vendor, must set a label from cloud-init userdata
 const (
-	ProvisionerLabelKey        = "tensor-fusion.ai/node-provisioner"
+	ProvisionerLabelKey        = Domain + "/node-provisioner"
 	ProvisionerNamePlaceholder = "__GPU_NODE_RESOURCE_NAME__"
 )
@@ -100,4 +101,4 @@ const (

 const TFDataPath = "/tmp/tensor-fusion/data"
 const DataVolumeName = "tf-data"
-const TensorFusionPoolManualCompaction = "tensor-fusion.ai/manual-compaction"
+const TensorFusionPoolManualCompaction = Domain + "/manual-compaction"
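
The constants above all derive label keys from the shared Domain constant, which the removed string literals imply is "tensor-fusion.ai". A minimal, self-contained sketch of how the refactored keys resolve (the Domain value here is an assumption inferred from those replaced literals, not taken from the file itself):

package main

import "fmt"

// Domain is assumed to be "tensor-fusion.ai", based on the string literals
// this commit replaces with Domain-based expressions.
const Domain = "tensor-fusion.ai"

const (
	LabelKeyPodTemplateHash     = Domain + "/pod-template-hash"
	SchedulingDoNotDisruptLabel = Domain + "/do-not-disrupt"
	ProvisionerLabelKey         = Domain + "/node-provisioner"
)

func main() {
	fmt.Println(LabelKeyPodTemplateHash)     // tensor-fusion.ai/pod-template-hash
	fmt.Println(SchedulingDoNotDisruptLabel) // tensor-fusion.ai/do-not-disrupt
	fmt.Println(ProvisionerLabelKey)         // tensor-fusion.ai/node-provisioner
}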

internal/controller/tensorfusionworkload_controller.go
Lines changed: 34 additions & 1 deletion

@@ -106,6 +106,38 @@ func (r *TensorFusionWorkloadReconciler) Reconcile(ctx context.Context, req ctrl
 	// Create worker generator
 	workerGenerator := &worker.WorkerGenerator{WorkerConfig: pool.Spec.ComponentConfig.Worker}

+	podTemplateHash, err := workerGenerator.PodTemplateHash(workload.Spec.Resources.Limits)
+	if err != nil {
+		return ctrl.Result{}, fmt.Errorf("get pod template hash: %w", err)
+	}
+
+	if workload.Status.PodTemplateHash != podTemplateHash {
+		workload.Status.PodTemplateHash = podTemplateHash
+		if err := r.Status().Update(ctx, workload); err != nil {
+			return ctrl.Result{}, fmt.Errorf("update status: %w", err)
+		}
+	}
+
+	// Check if there are any Pods using the old podTemplateHash and delete them if any
+	if len(podList.Items) > 0 {
+		var outdatedPods []corev1.Pod
+		for i := range podList.Items {
+			pod := &podList.Items[i]
+			if pod.Labels[constants.LabelKeyPodTemplateHash] != podTemplateHash {
+				outdatedPods = append(outdatedPods, *pod)
+			}
+		}
+
+		if len(outdatedPods) > 0 {
+			log.Info("Found outdated pods with different template hash", "count", len(outdatedPods))
+			if err := r.scaleDownWorkers(ctx, workload, outdatedPods); err != nil {
+				return ctrl.Result{}, err
+			}
+			// After deletion, requeue, and the next reconcile will create a new pod
+			return ctrl.Result{Requeue: true}, nil
+		}
+	}
+
 	// Determine the number of replicas
 	desiredReplicas := int32(1)
 	if workload.Spec.Replicas != nil {
@@ -162,7 +194,7 @@ func (r *TensorFusionWorkloadReconciler) tryStartWorker(
 	workload *tfv1.TensorFusionWorkload,
 ) (*corev1.Pod, error) {
 	port := workerGenerator.AllocPort()
-	pod, err := workerGenerator.GenerateWorkerPod(gpu, workload.Name+"-", workload.Namespace, port, workload.Spec.Resources.Limits)
+	pod, hash, err := workerGenerator.GenerateWorkerPod(gpu, fmt.Sprintf("%s-tf-worker-", workload.Name), workload.Namespace, port, workload.Spec.Resources.Limits)
 	if err != nil {
 		return nil, fmt.Errorf("generate worker pod %w", err)
 	}
@@ -173,6 +205,7 @@ func (r *TensorFusionWorkloadReconciler) tryStartWorker(
 	}
 	pod.Labels[constants.WorkloadKey] = workload.Name
 	pod.Labels[constants.GpuKey] = gpu.Name
+	pod.Labels[constants.LabelKeyPodTemplateHash] = hash

 	// Add finalizer for GPU resource cleanup
 	pod.Finalizers = append(pod.Finalizers, constants.Finalizer)
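
The heart of the controller change is a comparison between the template hash stored as a label on each worker pod and the hash freshly computed from the workload's current limits: pods carrying a stale hash are scaled down, the reconcile is requeued, and the next pass recreates them from the updated template. A standalone sketch of that filter, with a hypothetical fakePod type and plain label maps standing in for corev1.Pod:

package main

import "fmt"

const labelKeyPodTemplateHash = "tensor-fusion.ai/pod-template-hash"

type fakePod struct {
	Name   string
	Labels map[string]string
}

// outdatedPods returns the pods whose template-hash label no longer matches
// the hash computed from the current workload spec.
func outdatedPods(pods []fakePod, currentHash string) []fakePod {
	var out []fakePod
	for _, p := range pods {
		if p.Labels[labelKeyPodTemplateHash] != currentHash {
			out = append(out, p)
		}
	}
	return out
}

func main() {
	pods := []fakePod{
		{Name: "demo-tf-worker-a1b2c", Labels: map[string]string{labelKeyPodTemplateHash: "oldhash"}},
		{Name: "demo-tf-worker-d3e4f", Labels: map[string]string{labelKeyPodTemplateHash: "newhash"}},
	}
	// Only the pod carrying the stale hash is selected for deletion;
	// a later reconcile recreates it from the updated template.
	for _, p := range outdatedPods(pods, "newhash") {
		fmt.Println(p.Name) // demo-tf-worker-a1b2c
	}
}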

internal/controller/tensorfusionworkload_controller_test.go
Lines changed: 134 additions & 0 deletions

@@ -288,6 +288,140 @@ var _ = Describe("TensorFusionWorkload Controller", func() {
 		})
 	})

+	Context("When resource limits change in a workload", func() {
+		It("Should rebuild all worker pods", func() {
+			// Create a workload with 2 replicas
+			workload := &tfv1.TensorFusionWorkload{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      resourceName,
+					Namespace: resourceNamespace,
+				},
+				Spec: tfv1.TensorFusionWorkloadSpec{
+					Replicas: ptr.Int32(2),
+					PoolName: poolName,
+					Resources: tfv1.Resources{
+						Requests: tfv1.Resource{
+							Tflops: tflopsRequests,
+							Vram:   vramRequests,
+						},
+						Limits: tfv1.Resource{
+							Tflops: tflopsLimits,
+							Vram:   vramLimits,
+						},
+					},
+				},
+			}
+
+			Expect(k8sClient.Create(ctx, workload)).To(Succeed())
+
+			// First reconcile to create the initial pods
+			_, err := reconciler.Reconcile(ctx, reconcile.Request{
+				NamespacedName: typeNamespacedName,
+			})
+			Expect(err).NotTo(HaveOccurred())
+
+			// Check that pods are created
+			podList := &corev1.PodList{}
+			Eventually(func() int {
+				err := k8sClient.List(ctx, podList,
+					client.InNamespace(resourceNamespace),
+					client.MatchingLabels{constants.WorkloadKey: resourceName})
+				if err != nil {
+					return 0
+				}
+				return len(podList.Items)
+			}, 5*time.Second, 100*time.Millisecond).Should(Equal(2))
+
+			// Store the original pod template hash
+			var originalPodNames []string
+			var originalPodTemplateHash string
+			for _, pod := range podList.Items {
+				originalPodNames = append(originalPodNames, pod.Name)
+				originalPodTemplateHash = pod.Labels[constants.LabelKeyPodTemplateHash]
+			}
+			Expect(originalPodTemplateHash).NotTo(BeEmpty())
+
+			// Update workload with different resource limits
+			workload = &tfv1.TensorFusionWorkload{}
+			Expect(k8sClient.Get(ctx, typeNamespacedName, workload)).To(Succeed())
+			workload.Spec.Resources.Limits.Tflops = resource.MustParse("30")  // Increase TFLOPS limit
+			workload.Spec.Resources.Limits.Vram = resource.MustParse("24Gi") // Increase VRAM limit
+			Expect(k8sClient.Update(ctx, workload)).To(Succeed())
+
+			// Reconcile to handle the resource limits change
+			_, err = reconciler.Reconcile(ctx, reconcile.Request{
+				NamespacedName: typeNamespacedName,
+			})
+			Expect(err).NotTo(HaveOccurred())
+
+			// Reconcile again to handle the Finalizer
+			_, err = reconciler.Reconcile(ctx, reconcile.Request{
+				NamespacedName: typeNamespacedName,
+			})
+			Expect(err).NotTo(HaveOccurred())
+
+			// Verify old pods are deleted due to template hash change
+			Eventually(func() bool {
+				podList := &corev1.PodList{}
+				err := k8sClient.List(ctx, podList,
+					client.InNamespace(resourceNamespace),
+					client.MatchingLabels{constants.WorkloadKey: resourceName})
+				if err != nil || len(podList.Items) != 0 {
+					return false
+				}
+				return true // All pods should be deleted
+			}, 5*time.Second, 100*time.Millisecond).Should(BeTrue())
+
+			// Reconcile again to create new pods
+			_, err = reconciler.Reconcile(ctx, reconcile.Request{
+				NamespacedName: typeNamespacedName,
+			})
+			Expect(err).NotTo(HaveOccurred())
+
+			// Verify new pods are created
+			Eventually(func() int {
+				err := k8sClient.List(ctx, podList,
+					client.InNamespace(resourceNamespace),
+					client.MatchingLabels{constants.WorkloadKey: resourceName})
+				if err != nil {
+					return 0
+				}
+				return len(podList.Items)
+			}, 5*time.Second, 100*time.Millisecond).Should(Equal(2))
+
+			// Verify new pods have different names and pod template hash
+			var newPodNames []string
+			var newPodTemplateHash string
+			for _, pod := range podList.Items {
+				newPodNames = append(newPodNames, pod.Name)
+				newPodTemplateHash = pod.Labels[constants.LabelKeyPodTemplateHash]
+			}
+			Expect(newPodTemplateHash).NotTo(BeEmpty())
+			Expect(newPodTemplateHash).NotTo(Equal(originalPodTemplateHash))
+
+			// Verify that pod names have changed
+			for _, originalName := range originalPodNames {
+				Expect(newPodNames).NotTo(ContainElement(originalName))
+			}
+
+			// Reconcile again to handle status
+			_, err = reconciler.Reconcile(ctx, reconcile.Request{
+				NamespacedName: typeNamespacedName,
+			})
+			Expect(err).NotTo(HaveOccurred())
+
+			// Verify workload status was updated
+			Eventually(func() int32 {
+				workload := &tfv1.TensorFusionWorkload{}
+				err = k8sClient.Get(ctx, typeNamespacedName, workload)
+				if err != nil {
+					return -1
+				}
+				return workload.Status.Replicas
+			}, 5*time.Second, 100*time.Millisecond).Should(Equal(int32(2)))
+		})
+	})
+
 	Context("When scaling down a workload", func() {
 		It("Should delete excess worker pods", func() {
 			// Create a workload with 3 replicas

internal/utils/reconcile.go
Lines changed: 16 additions & 10 deletions

@@ -2,10 +2,10 @@ package utils

 import (
 	"context"
-	"crypto/sha256"
-	"encoding/hex"
 	"encoding/json"
 	"errors"
+	"fmt"
+	"hash/fnv"
 	"math"
 	"math/rand/v2"
 	"os"
@@ -95,15 +95,21 @@ func CurrentNamespace() string {
 	return namespace
 }

-func GetObjectHash(obj any) string {
-	hasher := sha256.New()
-	jsonBytes, err := json.Marshal(obj)
-	if err != nil {
-		panic(err)
+// GetObjectHash generates a shorter FNV-1a hash for one or more objects
+func GetObjectHash(objs ...any) string {
+	hasher := fnv.New64a()
+
+	for _, obj := range objs {
+		jsonBytes, err := json.Marshal(obj)
+		if err != nil {
+			panic(err)
+		}
+		// Add length prefix to prevent collisions when combining multiple objects
+		hasher.Write(fmt.Appendf(nil, "%d:", len(jsonBytes)))
+		hasher.Write(jsonBytes)
 	}
-	str := string(jsonBytes)
-	hasher.Write([]byte(str))
-	return hex.EncodeToString(hasher.Sum(nil))
+
+	return fmt.Sprintf("%x", hasher.Sum(nil))
 }

 const DebounceKeySuffix = ":in_queue"
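
Switching from SHA-256 to 64-bit FNV-1a keeps the hash short enough to serve as a label value, and the "%d:" length prefix keeps object boundaries distinct when several objects are fed into one hasher. A minimal sketch of that combination scheme, using a hypothetical combinedHash helper over strings instead of JSON-marshalled objects:

package main

import (
	"fmt"
	"hash/fnv"
)

// combinedHash mirrors the length-prefixed FNV-1a combination used by
// GetObjectHash: each part is preceded by "<len>:" so that different
// splits of the same bytes cannot collide.
func combinedHash(parts ...string) string {
	h := fnv.New64a()
	for _, p := range parts {
		h.Write(fmt.Appendf(nil, "%d:", len(p))) // length prefix marks the boundary
		h.Write([]byte(p))
	}
	return fmt.Sprintf("%x", h.Sum(nil))
}

func main() {
	// Without the prefix, ("ab", "c") and ("a", "bc") would feed the hasher
	// the identical byte stream "abc" and always collide.
	fmt.Println(combinedHash("ab", "c") == combinedHash("a", "bc")) // false
}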

internal/worker/worker.go
Lines changed: 14 additions & 4 deletions

@@ -9,6 +9,7 @@ import (

 	tfv1 "github.com/NexusGPU/tensor-fusion/api/v1"
 	"github.com/NexusGPU/tensor-fusion/internal/constants"
+	"github.com/NexusGPU/tensor-fusion/internal/utils"
 	"github.com/samber/lo"
 	"golang.org/x/exp/rand"
 	corev1 "k8s.io/api/core/v1"
@@ -42,18 +43,28 @@ func (wg *WorkerGenerator) AllocPort() int {
 	return rand.Intn(max-min+1) + min
 }

+func (wg *WorkerGenerator) PodTemplateHash(limits tfv1.Resource) (string, error) {
+	podTmpl := &corev1.PodTemplate{}
+	err := json.Unmarshal(wg.WorkerConfig.PodTemplate.Raw, podTmpl)
+	if err != nil {
+		return "", fmt.Errorf("failed to unmarshal pod template: %w", err)
+	}
+	return utils.GetObjectHash(podTmpl, limits), nil
+}
+
 func (wg *WorkerGenerator) GenerateWorkerPod(
 	gpu *tfv1.GPU,
 	generateName string,
 	namespace string,
 	port int,
 	limits tfv1.Resource,
-) (*corev1.Pod, error) {
+) (*corev1.Pod, string, error) {
 	podTmpl := &corev1.PodTemplate{}
 	err := json.Unmarshal(wg.WorkerConfig.PodTemplate.Raw, podTmpl)
 	if err != nil {
-		return nil, fmt.Errorf("failed to unmarshal pod template: %w", err)
+		return nil, "", fmt.Errorf("failed to unmarshal pod template: %w", err)
 	}
+	podTemplateHash := utils.GetObjectHash(podTmpl, limits)
 	spec := podTmpl.Template.Spec
 	if spec.NodeSelector == nil {
 		spec.NodeSelector = make(map[string]string)
@@ -95,14 +106,13 @@ func (wg *WorkerGenerator) GenerateWorkerPod(
 			},
 		},
 	})
-
 	return &corev1.Pod{
 		ObjectMeta: metav1.ObjectMeta{
 			GenerateName: generateName,
 			Namespace:    namespace,
 		},
 		Spec: spec,
-	}, nil
+	}, podTemplateHash, nil
 }

 func SelectWorker(
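
GenerateWorkerPod now returns the template hash alongside the pod so callers can apply it as a label without recomputing it, and the generateName argument gains a tf-worker infix. Since Kubernetes appends a random suffix to ObjectMeta.GenerateName, a workload named "demo" would yield worker pods such as demo-tf-worker-x7k2p; a tiny sketch of the naming, with a hypothetical helper and example workload name:

package main

import "fmt"

// workerGenerateName builds the GenerateName prefix the controller now passes
// to GenerateWorkerPod; the API server appends a random suffix on creation.
func workerGenerateName(workloadName string) string {
	return fmt.Sprintf("%s-tf-worker-", workloadName)
}

func main() {
	fmt.Println(workerGenerateName("demo")) // demo-tf-worker-
}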
