Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions packages/api/internal/orchestrator/autoresume_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ func TestHandleExistingSandboxAutoResume(t *testing.T) {
require.NoError(t, err)
assert.False(t, alreadyDone)
require.NotNil(t, finish)

finish(t.Context(), nil)

pausingSandbox, err := o.GetSandbox(t.Context(), sbx.TeamID, sbx.SandboxID)
Expand All @@ -125,7 +126,7 @@ func TestHandleExistingSandboxAutoResume(t *testing.T) {
assert.ErrorIs(t, err, ErrSandboxStillTransitioning)
})

t.Run("pausing sandbox wait failure returns internal error", func(t *testing.T) {
t.Run("concurrently pausing sandbox returns internal error", func(t *testing.T) {
t.Parallel()

o := newTestAutoResumeOrchestrator()
Expand All @@ -136,10 +137,15 @@ func TestHandleExistingSandboxAutoResume(t *testing.T) {
require.NoError(t, err)
assert.False(t, alreadyDone)
require.NotNil(t, finish)
finish(t.Context(), errors.New("boom"))

pausingSandbox, err := o.GetSandbox(t.Context(), sbx.TeamID, sbx.SandboxID)
require.NoError(t, err)
assert.Equal(t, sandbox.StatePausing, pausingSandbox.State)

go func() {
time.Sleep(50 * time.Millisecond)
finish(t.Context(), errors.New("boom"))
}()

_, handled, err := o.HandleExistingSandboxAutoResume(t.Context(), sbx.TeamID, sbx.SandboxID, pausingSandbox, time.Minute)
require.Error(t, err)
Expand Down
6 changes: 6 additions & 0 deletions packages/api/internal/orchestrator/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,12 @@ func (o *Orchestrator) discoverClusterNode(ctx context.Context, clusterID uuid.U
ctx, span := tracer.Start(ctx, "discover-cluster-node")
defer span.End()

if o.clusters == nil {
logger.L().Error(ctx, "Cluster pool not initialized during on-demand node discovery", logger.WithClusterID(clusterID))

return
}

cluster, found := o.clusters.GetClusterById(clusterID)
if !found {
logger.L().Error(ctx, "Cluster not found during on-demand node discovery", logger.WithClusterID(clusterID))
Expand Down
20 changes: 15 additions & 5 deletions packages/api/internal/orchestrator/delete_instance.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ import (

"github.com/google/uuid"
"go.uber.org/zap"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"

"github.com/e2b-dev/infra/packages/api/internal/orchestrator/nodemanager"
"github.com/e2b-dev/infra/packages/api/internal/sandbox"
Expand All @@ -17,7 +19,7 @@ import (
sbxlogger "github.com/e2b-dev/infra/packages/shared/pkg/logger/sandbox"
)

func (o *Orchestrator) RemoveSandbox(ctx context.Context, teamID uuid.UUID, sandboxID string, opts sandbox.RemoveOpts) error {
func (o *Orchestrator) RemoveSandbox(ctx context.Context, teamID uuid.UUID, sandboxID string, opts sandbox.RemoveOpts) (err error) {
ctx, span := tracer.Start(ctx, "remove-sandbox")
defer span.End()

Expand Down Expand Up @@ -83,15 +85,18 @@ func (o *Orchestrator) RemoveSandbox(ctx context.Context, teamID uuid.UUID, sand
return nil
}

defer func() { go o.analyticsRemove(context.WithoutCancel(ctx), sbx, opts.Action) }()
defer o.sandboxStore.Remove(ctx, teamID, sandboxID)
err = o.removeSandboxFromNode(ctx, sbx, opts.Action)
if err != nil {
logger.L().Error(ctx, "Error pausing sandbox", zap.Error(err), logger.WithSandboxID(sbx.SandboxID))
if errors.Is(err, ErrSandboxNotFound) {
logger.L().Warn(ctx, "Sandbox not found during removal, treating as not found", zap.Error(err), logger.WithSandboxID(sbx.SandboxID))
} else if err != nil {
logger.L().Error(ctx, "Error removing sandbox from node", zap.Error(err), logger.WithSandboxID(sbx.SandboxID))
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Routing record deleted before removal, not restored on failure

Medium Severity

In removeSandboxFromNode, the routing catalog entry is deleted (routingCatalog.DeleteSandbox) before the actual pause/kill gRPC call is made. If that gRPC call fails, RemoveSandbox now keeps the sandbox in the store with its state reverted to Running, but the routing record is already gone. The sandbox is then still visible to the API but unreachable on Nomad-managed nodes — a new inconsistency introduced by moving sandboxStore.Remove so that it runs only on success.

Additional Locations (1)
Fix in Cursor Fix in Web


return ErrSandboxOperationFailed
}

o.sandboxStore.Remove(context.WithoutCancel(ctx), teamID, sandboxID)
go o.analyticsRemove(context.WithoutCancel(ctx), sbx, opts.Action)

return nil
}

Expand Down Expand Up @@ -154,6 +159,11 @@ func (o *Orchestrator) killSandboxOnNode(ctx context.Context, node *nodemanager.
client, ctx := node.GetSandboxDeleteCtx(ctx, sbx.SandboxID, sbx.ExecutionID)
_, err := client.Sandbox.Delete(ctx, req)
if err != nil {
grpcErr, ok := status.FromError(err)
if ok && grpcErr.Code() == codes.NotFound {
return ErrSandboxNotFound
}

return fmt.Errorf("failed to delete sandbox '%s': %w", sbx.SandboxID, err)
}

Expand Down
10 changes: 10 additions & 0 deletions packages/api/internal/orchestrator/pause_instance.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,12 @@ func (o *Orchestrator) pauseSandbox(ctx context.Context, node *nodemanager.Node,
return PauseQueueExhaustedError{}
}

if errors.Is(err, ErrSandboxNotFound) {
telemetry.ReportCriticalError(ctx, "sandbox not found when pausing", err)

return ErrSandboxNotFound
}

if err != nil && !errors.Is(err, PauseQueueExhaustedError{}) {
telemetry.ReportCriticalError(ctx, "error pausing sandbox", err)

Expand Down Expand Up @@ -97,6 +103,10 @@ func snapshotInstance(ctx context.Context, node *nodemanager.Node, sbx sandbox.S
return PauseQueueExhaustedError{}
}

if st.Code() == codes.NotFound {
return ErrSandboxNotFound
}

return fmt.Errorf("failed to pause sandbox '%s': %w", sbx.SandboxID, err)
}

Expand Down
16 changes: 8 additions & 8 deletions packages/api/internal/sandbox/storage/memory/operations.go
Original file line number Diff line number Diff line change
Expand Up @@ -172,14 +172,14 @@ func startRemoving(ctx context.Context, sbx *memorySandbox, opts sandbox.RemoveO
}
}

originalState := sbx._data.State
newState := opts.Action.TargetState

if transition != nil {
currentState := sbx._data.State
sbx.mu.Unlock()

if currentState != newState && !sandbox.AllowedTransitions[currentState][newState] {
return false, nil, &sandbox.InvalidStateTransitionError{CurrentState: currentState, TargetState: newState}
if originalState != newState && !sandbox.AllowedTransitions[originalState][newState] {
return false, nil, &sandbox.InvalidStateTransitionError{CurrentState: originalState, TargetState: newState}
}

logger.L().Debug(ctx, "State transition already in progress to the same state, waiting", logger.WithSandboxID(sbx.SandboxID()), zap.String("state", string(newState)))
Expand All @@ -190,9 +190,9 @@ func startRemoving(ctx context.Context, sbx *memorySandbox, opts sandbox.RemoveO

// If the transition is to the same state just wait
switch {
case currentState == newState:
case originalState == newState:
return true, func(context.Context, error) {}, nil
case sandbox.AllowedTransitions[currentState][newState]:
case sandbox.AllowedTransitions[originalState][newState]:
return startRemoving(ctx, sbx, sandbox.RemoveOpts{Action: opts.Action})
default:
return false, nil, fmt.Errorf("unexpected state transition")
Expand Down Expand Up @@ -238,11 +238,11 @@ func startRemoving(ctx context.Context, sbx *memorySandbox, opts sandbox.RemoveO
}

if err != nil {
// Keep the transition in place so the error stays
return
// Revert the state change if the transition failed and it's not a transient transition
sbx._data.State = originalState
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Expired flag not reverted when removal transition fails

High Severity

When a TransitionExpires action (pause/kill) fails, the callback reverts sbx._data.State to originalState but does not undo the sbx.setExpired() call made earlier. The sandbox ends up back in its original state (e.g., Running) but permanently marked as expired. This causes the evictor to repeatedly find and attempt to remove the sandbox, and may break KeepAliveFor or other lifetime-extending operations because the sandbox still appears expired.

Additional Locations (1)
Fix in Cursor Fix in Web

}

// The transition is completed and the next transition can be started
// Remove the transition so the next transition can be started
sbx.transition = nil
}

Expand Down
18 changes: 10 additions & 8 deletions packages/api/internal/sandbox/storage/memory/operations_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -229,19 +229,21 @@ func TestStartRemoving_Error(t *testing.T) {
assert.False(t, alreadyDone2)
assert.Nil(t, finish2)

// From Failed state, no transitions are allowed
// Failed transition should be cleared so subsequent transitions can proceed.
alreadyDone3, finish3, err3 := startRemoving(ctx, sbx, sandbox.RemoveOpts{Action: sandbox.StateActionPause})
require.Error(t, err3)
require.ErrorIs(t, err3, failureErr)
require.NoError(t, err3)
assert.False(t, alreadyDone3)
assert.Nil(t, finish3)
require.NotNil(t, finish3)
finish3(ctx, nil)
assert.Equal(t, sandbox.StatePausing, sbx.State())

// Trying to transition to Killed should also fail
// Follow-up transition should also work.
alreadyDone4, finish4, err4 := startRemoving(ctx, sbx, sandbox.RemoveOpts{Action: sandbox.StateActionKill})
require.Error(t, err4)
require.ErrorIs(t, err4, failureErr)
require.NoError(t, err4)
assert.False(t, alreadyDone4)
assert.Nil(t, finish4)
require.NotNil(t, finish4)
finish4(ctx, nil)
assert.Equal(t, sandbox.StateKilling, sbx.State())
}

// Test context timeout during wait
Expand Down
Loading