Skip to content
Open
13 changes: 13 additions & 0 deletions extensions/api/v1alpha1/sandboxwarmpool_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,19 @@ type SandboxWarmPoolSpec struct {
// sandboxTemplateRef - name of the SandboxTemplate to be used for creating a Sandbox
// +kubebuilder:validation:Required
TemplateRef SandboxTemplateRef `json:"sandboxTemplateRef,omitempty" protobuf:"bytes,3,name=sandboxTemplateRef"`

// +optional
UpdateStrategy SandboxWarmPoolUpdateStrategy `json:"updateStrategy,omitempty"`
}

// SandboxWarmPoolUpdateStrategy defines the update strategy for the SandboxWarmPool.
type SandboxWarmPoolUpdateStrategy struct {
// Recreate: Stale pods are deleted immediately to ensure the pool only contains fresh pods.
// OnDelete: Stale pods are only replaced when they are manually deleted or claimed.
// +kubebuilder:validation:Enum=Recreate;OnDelete
// +kubebuilder:default=Recreate
// +optional
Type string `json:"type,omitempty"`
}

// SandboxWarmPoolStatus defines the observed state of SandboxWarmPool
Expand Down
16 changes: 16 additions & 0 deletions extensions/api/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

117 changes: 89 additions & 28 deletions extensions/controllers/sandboxwarmpool_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,21 +25,19 @@ import (
k8serrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/types"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/controller"
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
"sigs.k8s.io/controller-runtime/pkg/handler"
"sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/reconcile"

sandboxcontrollers "sigs.k8s.io/agent-sandbox/controllers"
extensionsv1alpha1 "sigs.k8s.io/agent-sandbox/extensions/api/v1alpha1"
)

const (
poolLabel = "agents.x-k8s.io/pool"
sandboxTemplateRefHash = "agents.x-k8s.io/sandbox-template-ref-hash"
)

// SandboxWarmPoolReconciler reconciles a SandboxWarmPool object
type SandboxWarmPoolReconciler struct {
client.Client
Expand Down Expand Up @@ -112,6 +110,21 @@ func (r *SandboxWarmPoolReconciler) reconcilePool(ctx context.Context, warmPool
var activePods []corev1.Pod
var allErrors error

// Fetch template and compute hash once to avoid repeated expensive operations
template, tmplErr := r.getTemplate(ctx, warmPool)
var currentTemplateHash string
if tmplErr == nil {
currentTemplateHash = computeTemplateHash(template)
Copy link
Member

@janetkuo janetkuo Mar 13, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's not safe to directly compute template hash for comparison, because pod schema changes in upstream will affect the value even if the template hasn't changed. This will end up deleting all warmed resources after a cluster upgrade. The value should only be computed once and added as a label, and later get the hash value from labels to compare. The same pattern is used in Deployment with pod-template-hash label.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Per discussion, if the Warmpool can create sandboxes directly (#390), we can use sandbox spec to compare directly.

} else {
log.Error(tmplErr, "Failed to get sandbox template", "templateRef", warmPool.Spec.TemplateRef.Name)
}

// Get the update strategy
updateStrategy := warmPool.Spec.UpdateStrategy.Type
if updateStrategy == "" {
updateStrategy = RecreateStrategy
}

for _, pod := range podList.Items {
// Skip pods that are being deleted
if !pod.DeletionTimestamp.IsZero() {
Expand All @@ -120,26 +133,35 @@ func (r *SandboxWarmPoolReconciler) reconcilePool(ctx context.Context, warmPool

// Get the controller owner reference
controllerRef := metav1.GetControllerOf(&pod)
isOrphan := controllerRef == nil
isControlledByPool := controllerRef != nil && controllerRef.UID == warmPool.UID

if controllerRef == nil {
// Ignore pods belonging to other controllers
if !isOrphan && !isControlledByPool {
log.Info("Ignoring pod with different controller", "pod", pod.Name, "controller", controllerRef.Name)
continue
}

// If the update strategy of the warmpool is recreate, check if it's stale
if updateStrategy == RecreateStrategy && tmplErr == nil && r.isPodStale(&pod, currentTemplateHash) {
log.Info("Deleting stale pod from pool", "pod", pod.Name)
if err := r.Delete(ctx, &pod); err != nil {
log.Error(err, "Failed to delete stale pod", "pod", pod.Name)
allErrors = errors.Join(allErrors, err)
}
continue
}

if isOrphan {
// Pod has no controller - adopt it
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This only happens if the strategy is OnDelete, correct? Could you update the comment to reflect this?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So, if the strategy is Recreate and if the orphaned pod is not stale, the controller will go ahead and adopt the pod.

log.Info("Adopting orphaned pod", "pod", pod.Name)
if err := r.adoptPod(ctx, warmPool, &pod); err != nil {
log.Error(err, "Failed to adopt pod", "pod", pod.Name)
allErrors = errors.Join(allErrors, err)
continue
}
activePods = append(activePods, pod)
} else if controllerRef.UID == warmPool.UID {
// Pod belongs to this warmpool - include it
activePods = append(activePods, pod)
} else {
// Pod has a different controller - ignore it
log.Info("Ignoring pod with different controller",
"pod", pod.Name,
"controller", controllerRef.Name,
"controllerKind", controllerRef.Kind)
}
activePods = append(activePods, pod)
}

desiredReplicas := warmPool.Spec.Replicas
Expand Down Expand Up @@ -168,11 +190,15 @@ func (r *SandboxWarmPoolReconciler) reconcilePool(ctx context.Context, warmPool

// Create new pods if we need more
if currentReplicas < desiredReplicas {
if tmplErr != nil {
return errors.Join(allErrors, tmplErr)
}

podsToCreate := desiredReplicas - currentReplicas
log.Info("Creating new pods", "count", podsToCreate)

for i := int32(0); i < podsToCreate; i++ {
if err := r.createPoolPod(ctx, warmPool, poolNameHash); err != nil {
if err := r.createPoolPod(ctx, warmPool, poolNameHash, template, currentTemplateHash); err != nil {
log.Error(err, "Failed to create pod")
allErrors = errors.Join(allErrors, err)
}
Expand Down Expand Up @@ -212,21 +238,14 @@ func (r *SandboxWarmPoolReconciler) adoptPod(ctx context.Context, warmPool *exte
}

// createPoolPod creates a new pod for the warm pool
func (r *SandboxWarmPoolReconciler) createPoolPod(ctx context.Context, warmPool *extensionsv1alpha1.SandboxWarmPool, poolNameHash string) error {
func (r *SandboxWarmPoolReconciler) createPoolPod(ctx context.Context, warmPool *extensionsv1alpha1.SandboxWarmPool, poolNameHash string, template *extensionsv1alpha1.SandboxTemplate, currentTemplateHash string) error {
log := log.FromContext(ctx)

// Create labels for the pod
podLabels := make(map[string]string)
podLabels[poolLabel] = poolNameHash
podLabels[sandboxTemplateRefHash] = sandboxcontrollers.NameHash(warmPool.Spec.TemplateRef.Name)

// Try getting template
var template *extensionsv1alpha1.SandboxTemplate
var err error
if template, err = r.getTemplate(ctx, warmPool); err != nil {
log.Error(err, "Failed to get sandbox template for warm pool", "warmPoolName", warmPool.Name)
return err
}
podLabels[sandboxTemplateHash] = currentTemplateHash

for k, v := range template.Spec.PodTemplate.ObjectMeta.Labels {
podLabels[k] = v
Expand All @@ -249,8 +268,6 @@ func (r *SandboxWarmPoolReconciler) createPoolPod(ctx context.Context, warmPool
Spec: template.Spec.PodTemplate.Spec,
}

// pod.Labels[podNameLabel] = sandboxcontrollers.NameHash(pod.Name)

// Set controller reference so the Pod is owned by the SandboxWarmPool
if err := ctrl.SetControllerReference(warmPool, pod, r.Scheme()); err != nil {
return fmt.Errorf("SetControllerReference for Pod failed: %w", err)
Expand Down Expand Up @@ -301,11 +318,55 @@ func (r *SandboxWarmPoolReconciler) getTemplate(ctx context.Context, warmPool *e
return template, nil
}

// isPodStale checks if the pod is stale based on the template hash
func (r *SandboxWarmPoolReconciler) isPodStale(pod *corev1.Pod, currentTemplateHash string) bool {
podHash := pod.Labels[sandboxTemplateHash]
return currentTemplateHash != podHash
}

// SetupWithManager sets up the controller with the Manager
func (r *SandboxWarmPoolReconciler) SetupWithManager(mgr ctrl.Manager, concurrentWorkers int) error {
if err := mgr.GetFieldIndexer().IndexField(context.Background(), &extensionsv1alpha1.SandboxWarmPool{}, templateRefField, func(rawObj client.Object) []string {
wp := rawObj.(*extensionsv1alpha1.SandboxWarmPool)
if wp.Spec.TemplateRef.Name == "" {
return nil
}
return []string{wp.Spec.TemplateRef.Name}
}); err != nil {
return err
}

return ctrl.NewControllerManagedBy(mgr).
For(&extensionsv1alpha1.SandboxWarmPool{}).
Owns(&corev1.Pod{}).
WithOptions(controller.Options{MaxConcurrentReconciles: concurrentWorkers}).
Watches(
&extensionsv1alpha1.SandboxTemplate{},
handler.EnqueueRequestsFromMapFunc(r.findWarmPoolsForTemplate),
).
Complete(r)
}

// findWarmPoolsForTemplate returns a list of reconcile.Requests for all SandboxWarmPools that reference the template.
func (r *SandboxWarmPoolReconciler) findWarmPoolsForTemplate(ctx context.Context, obj client.Object) []reconcile.Request {
template, ok := obj.(*extensionsv1alpha1.SandboxTemplate)
if !ok {
return nil
}

warmPools := &extensionsv1alpha1.SandboxWarmPoolList{}
if err := r.List(ctx, warmPools, client.InNamespace(template.Namespace), client.MatchingFields{templateRefField: template.Name}); err != nil {
return nil
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

consider pls logging the error since this could affect updates to warmpools

}

var requests []reconcile.Request
for _, wp := range warmPools.Items {
requests = append(requests, reconcile.Request{
NamespacedName: types.NamespacedName{
Name: wp.Name,
Namespace: wp.Namespace,
},
})
}
return requests
}
Loading