Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 40 additions & 2 deletions packages/api/internal/handlers/template_delete.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (

"github.com/e2b-dev/infra/packages/api/internal/api"
"github.com/e2b-dev/infra/packages/api/internal/sandbox"
"github.com/e2b-dev/infra/packages/db/pkg/dberrors"
"github.com/e2b-dev/infra/packages/db/queries"
"github.com/e2b-dev/infra/packages/shared/pkg/id"
"github.com/e2b-dev/infra/packages/shared/pkg/logger"
Expand Down Expand Up @@ -64,8 +65,37 @@ func (a *APIStore) DeleteTemplatesTemplateID(c *gin.Context, aliasOrTemplateID a
}
}

// Use a transaction to atomically check for snapshots and delete the template.
// This prevents a TOCTOU race where a snapshot could be created between the
// ExistsTemplateSnapshots check and the DeleteTemplate call.
txClient, tx, err := a.sqlcDB.WithTx(ctx)
if err != nil {
telemetry.ReportCriticalError(ctx, "error when beginning transaction", err)
a.sendAPIStoreError(c, http.StatusInternalServerError, "Error when beginning transaction")

return
}
defer tx.Rollback(ctx)

// Lock the env row to prevent concurrent snapshot creation (UpsertSnapshot)
// from inserting a snapshot with base_env_id referencing this env while we
// are checking and deleting it.
_, err = txClient.LockEnvForUpdate(ctx, templateID)
if err != nil {
if dberrors.IsNotFoundError(err) {
a.sendAPIStoreError(c, http.StatusNotFound, fmt.Sprintf("Template '%s' not found", templateID))

return
}

telemetry.ReportCriticalError(ctx, "error when locking env for deletion", err)
a.sendAPIStoreError(c, http.StatusInternalServerError, "Error when deleting template")

return
}

// check if base template has snapshots
hasSnapshots, err := a.sqlcDB.ExistsTemplateSnapshots(ctx, templateID)
hasSnapshots, err := txClient.ExistsTemplateSnapshots(ctx, templateID)
if err != nil {
telemetry.ReportError(ctx, "error when checking if base template has snapshots", err)
a.sendAPIStoreError(c, http.StatusInternalServerError, "Error when checking if template has snapshots")
Expand All @@ -85,7 +115,7 @@ func (a *APIStore) DeleteTemplatesTemplateID(c *gin.Context, aliasOrTemplateID a
// Build artifacts are intentionally NOT deleted from storage here because builds are layered diffs
// that may be referenced by other builds' header mappings.
// [ENG-3477] a future GC mechanism will handle orphaned storage.
aliasKeys, err := a.sqlcDB.DeleteTemplate(ctx, queries.DeleteTemplateParams{
aliasKeys, err := txClient.DeleteTemplate(ctx, queries.DeleteTemplateParams{
TemplateID: templateID,
TeamID: team.ID,
})
Expand All @@ -96,6 +126,14 @@ func (a *APIStore) DeleteTemplatesTemplateID(c *gin.Context, aliasOrTemplateID a
return
}

err = tx.Commit(ctx)
if err != nil {
telemetry.ReportCriticalError(ctx, "error when committing template deletion", err)
a.sendAPIStoreError(c, http.StatusInternalServerError, "Error when deleting template")

return
}

a.templateCache.InvalidateAllTags(context.WithoutCancel(ctx), templateID)
a.templateCache.InvalidateAliasesByTemplateID(context.WithoutCancel(ctx), templateID, aliasKeys)

Expand Down
20 changes: 20 additions & 0 deletions packages/api/internal/orchestrator/pause_instance.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import (

"github.com/e2b-dev/infra/packages/api/internal/orchestrator/nodemanager"
"github.com/e2b-dev/infra/packages/api/internal/sandbox"
"github.com/e2b-dev/infra/packages/db/pkg/dberrors"
"github.com/e2b-dev/infra/packages/db/pkg/types"
"github.com/e2b-dev/infra/packages/db/queries"
"github.com/e2b-dev/infra/packages/shared/pkg/grpc/orchestrator"
Expand All @@ -27,12 +28,31 @@ func (PauseQueueExhaustedError) Error() string {
return "The pause queue is exhausted"
}

// BaseTemplateDeletedError reports that a snapshot could not be created
// because the base template was deleted between sandbox creation and pause.
type BaseTemplateDeletedError struct {
	// BaseTemplateID is the ID of the base template named in the error message.
	BaseTemplateID string
}

// Error implements the error interface; the message embeds the base template
// ID in single quotes.
func (e BaseTemplateDeletedError) Error() string {
	const format = "base template '%s' was deleted, cannot create snapshot"

	return fmt.Sprintf(format, e.BaseTemplateID)
}

func (o *Orchestrator) pauseSandbox(ctx context.Context, node *nodemanager.Node, sbx sandbox.Sandbox) error {
ctx, span := tracer.Start(ctx, "pause-sandbox")
defer span.End()

result, err := o.throttledUpsertSnapshot(ctx, buildUpsertSnapshotParams(sbx, node))
if err != nil {
// Check if the error is an FK violation on base_env_id, which means
// the base template was deleted between sandbox creation and pause.
if dberrors.IsForeignKeyViolation(err) {
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

dberrors.IsForeignKeyViolation(err) matches any FK violation from UpsertSnapshot, not just the base_env_id one. That query has several other FK constraints — snapshots.team_id, the env_build_assignments.env_id / build_id edges, and env_builds.cluster_node_id (if it has a FK). A violation on any of those would be silently reclassified as a non-critical BaseTemplateDeletedError instead of propagating as a critical error, masking real bugs.

Use pgconn.PgError.ConstraintName to narrow the check to the specific constraint, e.g.:

var pgErr *pgconn.PgError
if errors.As(err, &pgErr) && pgErr.Code == "23503" && pgErr.ConstraintName == "snapshots_base_env_id_fkey" {

(adjust the constraint name to match the actual schema)

telemetry.ReportError(ctx, "base template was deleted, cannot create snapshot",
err, telemetry.WithTemplateID(sbx.BaseTemplateID))

return BaseTemplateDeletedError{BaseTemplateID: sbx.BaseTemplateID}
Comment on lines +49 to +53
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Preserve original FK error for pause fallback

Returning BaseTemplateDeletedError here discards the original PostgreSQL error, but the only caller (removeSandboxFromNode) still relies on dberrors.IsForeignKeyViolation(err) to trigger the kill-sandbox fallback when pause cannot snapshot due to a deleted base template. In that race, this branch now makes the caller miss the FK case, skip cleanup, and return a generic auto-pause failure instead of executing the intended fallback path.

Useful? React with 👍 / 👎.

}

telemetry.ReportCriticalError(ctx, "error inserting snapshot for env", err)

return err
Expand Down
23 changes: 23 additions & 0 deletions packages/db/queries/lock_env_for_update.sql.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions packages/db/queries/templates/lock_env_for_update.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
-- name: LockEnvForUpdate :one
-- Selects the id of the env row whose id equals env_id and takes a row-level
-- lock on it (FOR UPDATE). Intended to be run inside a transaction so the lock
-- is held until that transaction commits or rolls back.
SELECT id FROM "public"."envs"
WHERE id = @env_id
FOR UPDATE;
Loading