@@ -40,6 +40,7 @@ import (
4040 "sigs.k8s.io/controller-runtime/pkg/client"
4141 "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
4242 "sigs.k8s.io/controller-runtime/pkg/event"
43+ "sigs.k8s.io/controller-runtime/pkg/handler"
4344 "sigs.k8s.io/controller-runtime/pkg/log"
4445 "sigs.k8s.io/controller-runtime/pkg/predicate"
4546
@@ -96,10 +97,9 @@ type DynamoGraphDeploymentReconciler struct {
9697//
9798// For more details, check Reconcile and its Result here:
9899// - https://pkg.go.dev/sigs.k8s.io/[email protected] /pkg/reconcile 99- func (r * DynamoGraphDeploymentReconciler ) Reconcile (ctx context.Context , req ctrl.Request ) (ctrl.Result , error ) {
100+ func (r * DynamoGraphDeploymentReconciler ) Reconcile (ctx context.Context , req ctrl.Request ) (result ctrl.Result , err error ) {
100101 logger := log .FromContext (ctx )
101102
102- var err error
103103 reason := Reason ("undefined" )
104104 message := Message ("" )
105105 state := PendingState
@@ -110,6 +110,12 @@ func (r *DynamoGraphDeploymentReconciler) Reconcile(ctx context.Context, req ctr
110110 }
111111
112112 defer func () {
113+ // Skip status update if DGD is being deleted
114+ if ! dynamoDeployment .GetDeletionTimestamp ().IsZero () {
115+ logger .Info ("Reconciliation done - skipping status update for deleted resource" )
116+ return
117+ }
118+
113119 if err != nil {
114120 state = FailedState
115121 message = Message (err .Error ())
@@ -131,9 +137,13 @@ func (r *DynamoGraphDeploymentReconciler) Reconcile(ctx context.Context, req ctr
131137 LastTransitionTime : metav1 .Now (),
132138 })
133139
134- err = r .Status ().Update (ctx , dynamoDeployment )
135- if err != nil {
136- logger .Error (err , "Unable to update the CRD status" , "crd" , req .NamespacedName , "state" , state , "reason" , reason , "message" , message )
140+ updateErr := r .Status ().Update (ctx , dynamoDeployment )
141+ if updateErr != nil {
142+ logger .Error (updateErr , "Unable to update the CRD status" , "crd" , req .NamespacedName , "state" , state , "reason" , reason , "message" , message )
143+ // Set err to trigger requeue
144+ if err == nil {
145+ err = updateErr
146+ }
137147 }
138148 logger .Info ("Reconciliation done" )
139149 }()
@@ -539,11 +549,59 @@ func (r *DynamoGraphDeploymentReconciler) SetupWithManager(mgr ctrl.Manager) err
539549 DeleteFunc : func (de event.DeleteEvent ) bool { return true },
540550 UpdateFunc : func (de event.UpdateEvent ) bool { return true },
541551 GenericFunc : func (ge event.GenericEvent ) bool { return true },
542- }))
552+ })).
553+ // Watch PodClique resources - only on status changes
554+ // Note: We don't need to watch PodCliqueScalingGroup because it's just a container
555+ // for PodCliques. The actual status changes happen at the PodClique level.
556+ Watches (
557+ & grovev1alpha1.PodClique {},
558+ handler .EnqueueRequestsFromMapFunc (r .mapPodCliqueToRequests ),
559+ builder .WithPredicates (predicate.Funcs {
560+ CreateFunc : func (ce event.CreateEvent ) bool { return false },
561+ DeleteFunc : func (de event.DeleteEvent ) bool { return false },
562+ UpdateFunc : func (ue event.UpdateEvent ) bool {
563+ // Only trigger on status changes (readyReplicas or replicas)
564+ oldPC , okOld := ue .ObjectOld .(* grovev1alpha1.PodClique )
565+ newPC , okNew := ue .ObjectNew .(* grovev1alpha1.PodClique )
566+ if ! okOld || ! okNew {
567+ return false
568+ }
569+ // Trigger if readyReplicas or replicas changed
570+ return oldPC .Status .ReadyReplicas != newPC .Status .ReadyReplicas ||
571+ oldPC .Spec .Replicas != newPC .Spec .Replicas
572+ },
573+ GenericFunc : func (ge event.GenericEvent ) bool { return false },
574+ }),
575+ )
543576 }
544577 return ctrlBuilder .Complete (r )
545578}
546579
547580func (r * DynamoGraphDeploymentReconciler ) GetRecorder () record.EventRecorder {
548581 return r .Recorder
549582}
583+
584+ // mapPodCliqueToRequests maps a PodClique to reconcile requests for its owning DGD
585+ // Uses the nvidia.com/dynamo-graph-deployment-name label for direct lookup - no API calls needed!
586+ func (r * DynamoGraphDeploymentReconciler ) mapPodCliqueToRequests (ctx context.Context , obj client.Object ) []ctrl.Request {
587+ podClique , ok := obj .(* grovev1alpha1.PodClique )
588+ if ! ok {
589+ return nil
590+ }
591+
592+ // PodCliques are labeled with the DGD name and live in the same namespace
593+ dgdName , hasLabel := podClique .GetLabels ()[consts .KubeLabelDynamoGraphDeploymentName ]
594+ if ! hasLabel || dgdName == "" {
595+ log .FromContext (ctx ).V (1 ).Info ("PodClique missing DGD label" ,
596+ "podClique" , podClique .Name ,
597+ "namespace" , podClique .Namespace )
598+ return nil
599+ }
600+
601+ return []ctrl.Request {{
602+ NamespacedName : types.NamespacedName {
603+ Name : dgdName ,
604+ Namespace : podClique .Namespace ,
605+ },
606+ }}
607+ }
0 commit comments