Skip to content

Commit 816a4c6

Browse files
fix: add PodClique watch to trigger DGD reconciliation (#4076)
Signed-off-by: Julien Mancuso <[email protected]>
1 parent 3659c82 commit 816a4c6

File tree

1 file changed

+64
-6
lines changed

1 file changed

+64
-6
lines changed

deploy/cloud/operator/internal/controller/dynamographdeployment_controller.go

Lines changed: 64 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ import (
4040
"sigs.k8s.io/controller-runtime/pkg/client"
4141
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
4242
"sigs.k8s.io/controller-runtime/pkg/event"
43+
"sigs.k8s.io/controller-runtime/pkg/handler"
4344
"sigs.k8s.io/controller-runtime/pkg/log"
4445
"sigs.k8s.io/controller-runtime/pkg/predicate"
4546

@@ -96,10 +97,9 @@ type DynamoGraphDeploymentReconciler struct {
9697
//
9798
// For more details, check Reconcile and its Result here:
9899
// - https://pkg.go.dev/sigs.k8s.io/controller-runtime/pkg/reconcile
99-
func (r *DynamoGraphDeploymentReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
100+
func (r *DynamoGraphDeploymentReconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ctrl.Result, err error) {
100101
logger := log.FromContext(ctx)
101102

102-
var err error
103103
reason := Reason("undefined")
104104
message := Message("")
105105
state := PendingState
@@ -110,6 +110,12 @@ func (r *DynamoGraphDeploymentReconciler) Reconcile(ctx context.Context, req ctr
110110
}
111111

112112
defer func() {
113+
// Skip status update if DGD is being deleted
114+
if !dynamoDeployment.GetDeletionTimestamp().IsZero() {
115+
logger.Info("Reconciliation done - skipping status update for deleted resource")
116+
return
117+
}
118+
113119
if err != nil {
114120
state = FailedState
115121
message = Message(err.Error())
@@ -131,9 +137,13 @@ func (r *DynamoGraphDeploymentReconciler) Reconcile(ctx context.Context, req ctr
131137
LastTransitionTime: metav1.Now(),
132138
})
133139

134-
err = r.Status().Update(ctx, dynamoDeployment)
135-
if err != nil {
136-
logger.Error(err, "Unable to update the CRD status", "crd", req.NamespacedName, "state", state, "reason", reason, "message", message)
140+
updateErr := r.Status().Update(ctx, dynamoDeployment)
141+
if updateErr != nil {
142+
logger.Error(updateErr, "Unable to update the CRD status", "crd", req.NamespacedName, "state", state, "reason", reason, "message", message)
143+
// Set err to trigger requeue
144+
if err == nil {
145+
err = updateErr
146+
}
137147
}
138148
logger.Info("Reconciliation done")
139149
}()
@@ -539,11 +549,59 @@ func (r *DynamoGraphDeploymentReconciler) SetupWithManager(mgr ctrl.Manager) err
539549
DeleteFunc: func(de event.DeleteEvent) bool { return true },
540550
UpdateFunc: func(de event.UpdateEvent) bool { return true },
541551
GenericFunc: func(ge event.GenericEvent) bool { return true },
542-
}))
552+
})).
553+
// Watch PodClique resources - only on status changes
554+
// Note: We don't need to watch PodCliqueScalingGroup because it's just a container
555+
// for PodCliques. The actual status changes happen at the PodClique level.
556+
Watches(
557+
&grovev1alpha1.PodClique{},
558+
handler.EnqueueRequestsFromMapFunc(r.mapPodCliqueToRequests),
559+
builder.WithPredicates(predicate.Funcs{
560+
CreateFunc: func(ce event.CreateEvent) bool { return false },
561+
DeleteFunc: func(de event.DeleteEvent) bool { return false },
562+
UpdateFunc: func(ue event.UpdateEvent) bool {
563+
// Only trigger when status.readyReplicas or spec.replicas differs between the old and new object
564+
oldPC, okOld := ue.ObjectOld.(*grovev1alpha1.PodClique)
565+
newPC, okNew := ue.ObjectNew.(*grovev1alpha1.PodClique)
566+
if !okOld || !okNew {
567+
return false
568+
}
569+
// Trigger if readyReplicas or replicas changed
570+
return oldPC.Status.ReadyReplicas != newPC.Status.ReadyReplicas ||
571+
oldPC.Spec.Replicas != newPC.Spec.Replicas
572+
},
573+
GenericFunc: func(ge event.GenericEvent) bool { return false },
574+
}),
575+
)
543576
}
544577
return ctrlBuilder.Complete(r)
545578
}
546579

547580
// GetRecorder returns the event recorder stored in the reconciler's Recorder field.
func (r *DynamoGraphDeploymentReconciler) GetRecorder() record.EventRecorder {
	return r.Recorder
}
583+
584+
// mapPodCliqueToRequests maps a PodClique to reconcile requests for its owning DGD
585+
// Uses the nvidia.com/dynamo-graph-deployment-name label for direct lookup - no API calls needed!
586+
func (r *DynamoGraphDeploymentReconciler) mapPodCliqueToRequests(ctx context.Context, obj client.Object) []ctrl.Request {
587+
podClique, ok := obj.(*grovev1alpha1.PodClique)
588+
if !ok {
589+
return nil
590+
}
591+
592+
// PodCliques are labeled with the DGD name and live in the same namespace
593+
dgdName, hasLabel := podClique.GetLabels()[consts.KubeLabelDynamoGraphDeploymentName]
594+
if !hasLabel || dgdName == "" {
595+
log.FromContext(ctx).V(1).Info("PodClique missing DGD label",
596+
"podClique", podClique.Name,
597+
"namespace", podClique.Namespace)
598+
return nil
599+
}
600+
601+
return []ctrl.Request{{
602+
NamespacedName: types.NamespacedName{
603+
Name: dgdName,
604+
Namespace: podClique.Namespace,
605+
},
606+
}}
607+
}

0 commit comments

Comments
 (0)