Skip to content

Commit 27274ee

Browse files
committed
fix: move sandbox rollback defer before timeout to prevent resource leak
Signed-off-by: Sanchit2662 <sanchit2662@gmail.com>
1 parent ace39ce commit 27274ee

File tree

1 file changed

+21
-14
lines changed

1 file changed

+21
-14
lines changed

pkg/workloadmanager/handlers.go

Lines changed: 21 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -160,16 +160,9 @@ func (s *Server) createSandbox(ctx context.Context, dynamicClient dynamic.Interf
160160
}
161161
}
162162

163-
var createdSandbox *sandboxv1alpha1.Sandbox
164-
select {
165-
case result := <-resultChan:
166-
createdSandbox = result.Sandbox
167-
klog.V(2).Infof("sandbox %s/%s running", createdSandbox.Namespace, createdSandbox.Name)
168-
case <-time.After(2 * time.Minute): // consistent with router settings
169-
klog.Warningf("sandbox %s/%s create timed out", sandbox.Namespace, sandbox.Name)
170-
return nil, fmt.Errorf("sandbox creation timed out")
171-
}
172-
163+
// Register rollback BEFORE waiting for the sandbox to become ready.
164+
// This ensures the K8s resource and store placeholder are cleaned up on
165+
// timeout, pod-IP failure, or store-update failure — not just on post-creation errors.
173166
needRollbackSandbox := true
174167
sandboxRollbackFunc := func() {
175168
ctxTimeout, cancel := context.WithTimeout(context.Background(), 30*time.Second)
@@ -180,17 +173,21 @@ func (s *Server) createSandbox(ctx context.Context, dynamicClient dynamic.Interf
180173
err = deleteSandboxClaim(ctxTimeout, dynamicClient, sandboxClaim.Namespace, sandboxClaim.Name)
181174
if err != nil {
182175
klog.Infof("sandbox claim %s/%s rollback failed: %v", sandboxClaim.Namespace, sandboxClaim.Name, err)
183-
return
176+
} else {
177+
klog.Infof("sandbox claim %s/%s rollback succeeded", sandboxClaim.Namespace, sandboxClaim.Name)
184178
}
185-
klog.Infof("sandbox claim %s/%s rollback succeeded", sandboxClaim.Namespace, sandboxClaim.Name)
186179
} else {
187180
// Rollback Sandbox
188181
err = deleteSandbox(ctxTimeout, dynamicClient, sandbox.Namespace, sandbox.Name)
189182
if err != nil {
190183
klog.Infof("sandbox %s/%s rollback failed: %v", sandbox.Namespace, sandbox.Name, err)
191-
return
184+
} else {
185+
klog.Infof("sandbox %s/%s rollback succeeded", sandbox.Namespace, sandbox.Name)
192186
}
193-
klog.Infof("sandbox %s/%s rollback succeeded", sandbox.Namespace, sandbox.Name)
187+
}
188+
// Clean up the store placeholder so it does not pollute GC queries
189+
if delErr := s.storeClient.DeleteSandboxBySessionID(ctxTimeout, sandboxEntry.SessionID); delErr != nil {
190+
klog.Infof("sandbox %s/%s store placeholder cleanup failed: %v", sandbox.Namespace, sandbox.Name, delErr)
194191
}
195192
}
196193
defer func() {
@@ -200,6 +197,16 @@ func (s *Server) createSandbox(ctx context.Context, dynamicClient dynamic.Interf
200197
sandboxRollbackFunc()
201198
}()
202199

200+
var createdSandbox *sandboxv1alpha1.Sandbox
201+
select {
202+
case result := <-resultChan:
203+
createdSandbox = result.Sandbox
204+
klog.V(2).Infof("sandbox %s/%s running", createdSandbox.Namespace, createdSandbox.Name)
205+
case <-time.After(2 * time.Minute): // consistent with router settings
206+
klog.Warningf("sandbox %s/%s create timed out", sandbox.Namespace, sandbox.Name)
207+
return nil, fmt.Errorf("sandbox creation timed out")
208+
}
209+
203210
// agent-sandbox create pod with same name as sandbox if no warmpool is used
204211
// so here we try to get pod IP by sandbox name first
205212
// if warmpool is used, the pod name is stored in sandbox's annotation `agents.x-k8s.io/sandbox-pod-name`

0 commit comments

Comments
 (0)