@@ -288,6 +288,140 @@ var _ = Describe("TensorFusionWorkload Controller", func() {
288288 })
289289 })
290290
291+ Context ("When resource limits change in a workload" , func () {
292+ It ("Should rebuild all worker pods" , func () {
293+ // Create a workload with 2 replicas
294+ workload := & tfv1.TensorFusionWorkload {
295+ ObjectMeta : metav1.ObjectMeta {
296+ Name : resourceName ,
297+ Namespace : resourceNamespace ,
298+ },
299+ Spec : tfv1.TensorFusionWorkloadSpec {
300+ Replicas : ptr .Int32 (2 ),
301+ PoolName : poolName ,
302+ Resources : tfv1.Resources {
303+ Requests : tfv1.Resource {
304+ Tflops : tflopsRequests ,
305+ Vram : vramRequests ,
306+ },
307+ Limits : tfv1.Resource {
308+ Tflops : tflopsLimits ,
309+ Vram : vramLimits ,
310+ },
311+ },
312+ },
313+ }
314+
315+ Expect (k8sClient .Create (ctx , workload )).To (Succeed ())
316+
317+ // First reconcile to create the initial pods
318+ _ , err := reconciler .Reconcile (ctx , reconcile.Request {
319+ NamespacedName : typeNamespacedName ,
320+ })
321+ Expect (err ).NotTo (HaveOccurred ())
322+
323+ // Check that pods are created
324+ podList := & corev1.PodList {}
325+ Eventually (func () int {
326+ err := k8sClient .List (ctx , podList ,
327+ client .InNamespace (resourceNamespace ),
328+ client.MatchingLabels {constants .WorkloadKey : resourceName })
329+ if err != nil {
330+ return 0
331+ }
332+ return len (podList .Items )
333+ }, 5 * time .Second , 100 * time .Millisecond ).Should (Equal (2 ))
334+
335+ // Store the original pod template hash
336+ var originalPodNames []string
337+ var originalPodTemplateHash string
338+ for _ , pod := range podList .Items {
339+ originalPodNames = append (originalPodNames , pod .Name )
340+ originalPodTemplateHash = pod .Labels [constants .LabelKeyPodTemplateHash ]
341+ }
342+ Expect (originalPodTemplateHash ).NotTo (BeEmpty ())
343+
344+ // Update workload with different resource limits
345+ workload = & tfv1.TensorFusionWorkload {}
346+ Expect (k8sClient .Get (ctx , typeNamespacedName , workload )).To (Succeed ())
347+ workload .Spec .Resources .Limits .Tflops = resource .MustParse ("30" ) // Increase TFLOPS limit
348+ workload .Spec .Resources .Limits .Vram = resource .MustParse ("24Gi" ) // Increase VRAM limit
349+ Expect (k8sClient .Update (ctx , workload )).To (Succeed ())
350+
351+ // Reconcile to handle the resource limits change
352+ _ , err = reconciler .Reconcile (ctx , reconcile.Request {
353+ NamespacedName : typeNamespacedName ,
354+ })
355+ Expect (err ).NotTo (HaveOccurred ())
356+
357+ // Reconcile again to handle the Finalizer
358+ _ , err = reconciler .Reconcile (ctx , reconcile.Request {
359+ NamespacedName : typeNamespacedName ,
360+ })
361+ Expect (err ).NotTo (HaveOccurred ())
362+
363+ // Verify old pods are deleted due to template hash change
364+ Eventually (func () bool {
365+ podList := & corev1.PodList {}
366+ err := k8sClient .List (ctx , podList ,
367+ client .InNamespace (resourceNamespace ),
368+ client.MatchingLabels {constants .WorkloadKey : resourceName })
369+ if err != nil || len (podList .Items ) != 0 {
370+ return false
371+ }
372+ return true // All pods should be deleted
373+ }, 5 * time .Second , 100 * time .Millisecond ).Should (BeTrue ())
374+
375+ // Reconcile again to create new pods
376+ _ , err = reconciler .Reconcile (ctx , reconcile.Request {
377+ NamespacedName : typeNamespacedName ,
378+ })
379+ Expect (err ).NotTo (HaveOccurred ())
380+
381+ // Verify new pods are created
382+ Eventually (func () int {
383+ err := k8sClient .List (ctx , podList ,
384+ client .InNamespace (resourceNamespace ),
385+ client.MatchingLabels {constants .WorkloadKey : resourceName })
386+ if err != nil {
387+ return 0
388+ }
389+ return len (podList .Items )
390+ }, 5 * time .Second , 100 * time .Millisecond ).Should (Equal (2 ))
391+
392+ // Verify new pods have different names and pod template hash
393+ var newPodNames []string
394+ var newPodTemplateHash string
395+ for _ , pod := range podList .Items {
396+ newPodNames = append (newPodNames , pod .Name )
397+ newPodTemplateHash = pod .Labels [constants .LabelKeyPodTemplateHash ]
398+ }
399+ Expect (newPodTemplateHash ).NotTo (BeEmpty ())
400+ Expect (newPodTemplateHash ).NotTo (Equal (originalPodTemplateHash ))
401+
402+ // Verify that pod names have changed
403+ for _ , originalName := range originalPodNames {
404+ Expect (newPodNames ).NotTo (ContainElement (originalName ))
405+ }
406+
407+ // Reconcile again to handle status
408+ _ , err = reconciler .Reconcile (ctx , reconcile.Request {
409+ NamespacedName : typeNamespacedName ,
410+ })
411+ Expect (err ).NotTo (HaveOccurred ())
412+
413+ // Verify workload status was updated
414+ Eventually (func () int32 {
415+ workload := & tfv1.TensorFusionWorkload {}
416+ err = k8sClient .Get (ctx , typeNamespacedName , workload )
417+ if err != nil {
418+ return - 1
419+ }
420+ return workload .Status .Replicas
421+ }, 5 * time .Second , 100 * time .Millisecond ).Should (Equal (int32 (2 )))
422+ })
423+ })
424+
291425 Context ("When scaling down a workload" , func () {
292426 It ("Should delete excess worker pods" , func () {
293427 // Create a workload with 3 replicas
0 commit comments