diff --git a/packages/api/internal/handlers/template_delete.go b/packages/api/internal/handlers/template_delete.go index 3680c8db64..14ef1bfe4b 100644 --- a/packages/api/internal/handlers/template_delete.go +++ b/packages/api/internal/handlers/template_delete.go @@ -10,6 +10,7 @@ import ( "github.com/e2b-dev/infra/packages/api/internal/api" "github.com/e2b-dev/infra/packages/api/internal/sandbox" + "github.com/e2b-dev/infra/packages/db/pkg/dberrors" "github.com/e2b-dev/infra/packages/db/queries" "github.com/e2b-dev/infra/packages/shared/pkg/id" "github.com/e2b-dev/infra/packages/shared/pkg/logger" @@ -64,8 +65,37 @@ func (a *APIStore) DeleteTemplatesTemplateID(c *gin.Context, aliasOrTemplateID a } } + // Use a transaction to atomically check for snapshots and delete the template. + // This prevents a TOCTOU race where a snapshot could be created between the + // ExistsTemplateSnapshots check and the DeleteTemplate call. + txClient, tx, err := a.sqlcDB.WithTx(ctx) + if err != nil { + telemetry.ReportCriticalError(ctx, "error when beginning transaction", err) + a.sendAPIStoreError(c, http.StatusInternalServerError, "Error when beginning transaction") + + return + } + defer tx.Rollback(ctx) + + // Lock the env row to prevent concurrent snapshot creation (UpsertSnapshot) + // from inserting a snapshot with base_env_id referencing this env while we + // are checking and deleting it. + _, err = txClient.LockEnvForUpdate(ctx, templateID) + if err != nil { + if dberrors.IsNotFoundError(err) { + a.sendAPIStoreError(c, http.StatusNotFound, fmt.Sprintf("Template '%s' not found", templateID)) + + return + } + + telemetry.ReportCriticalError(ctx, "error when locking env for deletion", err) + a.sendAPIStoreError(c, http.StatusInternalServerError, "Error when deleting template") + + return + } + // check if base template has snapshots - hasSnapshots, err := a.sqlcDB.ExistsTemplateSnapshots(ctx, templateID) + hasSnapshots, err := txClient.ExistsTemplateSnapshots(ctx, templateID) if err != nil { telemetry.ReportError(ctx, "error when checking if base template has snapshots", err) a.sendAPIStoreError(c, http.StatusInternalServerError, "Error when checking if template has snapshots") @@ -85,7 +115,7 @@ func (a *APIStore) DeleteTemplatesTemplateID(c *gin.Context, aliasOrTemplateID a // Build artifacts are intentionally NOT deleted from storage here because builds are layered diffs // that may be referenced by other builds' header mappings. // [ENG-3477] a future GC mechanism will handle orphaned storage. - aliasKeys, err := a.sqlcDB.DeleteTemplate(ctx, queries.DeleteTemplateParams{ + aliasKeys, err := txClient.DeleteTemplate(ctx, queries.DeleteTemplateParams{ TemplateID: templateID, TeamID: team.ID, }) @@ -96,6 +126,14 @@ func (a *APIStore) DeleteTemplatesTemplateID(c *gin.Context, aliasOrTemplateID a return } + err = tx.Commit(ctx) + if err != nil { + telemetry.ReportCriticalError(ctx, "error when committing template deletion", err) + a.sendAPIStoreError(c, http.StatusInternalServerError, "Error when deleting template") + + return + } + a.templateCache.InvalidateAllTags(context.WithoutCancel(ctx), templateID) a.templateCache.InvalidateAliasesByTemplateID(context.WithoutCancel(ctx), templateID, aliasKeys) diff --git a/packages/api/internal/orchestrator/pause_instance.go b/packages/api/internal/orchestrator/pause_instance.go index 09b4e90f43..5f22ec4309 100644 --- a/packages/api/internal/orchestrator/pause_instance.go +++ b/packages/api/internal/orchestrator/pause_instance.go @@ -13,6 +13,7 @@ import ( "github.com/e2b-dev/infra/packages/api/internal/orchestrator/nodemanager" "github.com/e2b-dev/infra/packages/api/internal/sandbox" + "github.com/e2b-dev/infra/packages/db/pkg/dberrors" "github.com/e2b-dev/infra/packages/db/pkg/types" "github.com/e2b-dev/infra/packages/db/queries" "github.com/e2b-dev/infra/packages/shared/pkg/grpc/orchestrator" @@ -27,12 +28,31 @@ func (PauseQueueExhaustedError) Error() string { return "The pause queue is exhausted" } +// BaseTemplateDeletedError is returned when a snapshot cannot be created because +// the base template was deleted between sandbox creation and pause. +type BaseTemplateDeletedError struct { + BaseTemplateID string +} + +func (e BaseTemplateDeletedError) Error() string { + return fmt.Sprintf("base template '%s' was deleted, cannot create snapshot", e.BaseTemplateID) +} + func (o *Orchestrator) pauseSandbox(ctx context.Context, node *nodemanager.Node, sbx sandbox.Sandbox) error { ctx, span := tracer.Start(ctx, "pause-sandbox") defer span.End() result, err := o.throttledUpsertSnapshot(ctx, buildUpsertSnapshotParams(sbx, node)) if err != nil { + // Check if the error is an FK violation on base_env_id, which means + // the base template was deleted between sandbox creation and pause. + if dberrors.IsForeignKeyViolation(err) { + telemetry.ReportError(ctx, "base template was deleted, cannot create snapshot", + err, telemetry.WithTemplateID(sbx.BaseTemplateID)) + + return BaseTemplateDeletedError{BaseTemplateID: sbx.BaseTemplateID} + } + telemetry.ReportCriticalError(ctx, "error inserting snapshot for env", err) return err diff --git a/packages/db/queries/lock_env_for_update.sql.go b/packages/db/queries/lock_env_for_update.sql.go new file mode 100644 index 0000000000..02b8bb938f --- /dev/null +++ b/packages/db/queries/lock_env_for_update.sql.go @@ -0,0 +1,23 @@ +// Code generated by sqlc. DO NOT EDIT. +// versions: +// sqlc v1.29.0 +// source: lock_env_for_update.sql + +package queries + +import ( + "context" +) + +const lockEnvForUpdate = `-- name: LockEnvForUpdate :one +SELECT id FROM "public"."envs" +WHERE id = $1 +FOR UPDATE +` + +func (q *Queries) LockEnvForUpdate(ctx context.Context, envID string) (string, error) { + row := q.db.QueryRow(ctx, lockEnvForUpdate, envID) + var id string + err := row.Scan(&id) + return id, err +} diff --git a/packages/db/queries/templates/lock_env_for_update.sql b/packages/db/queries/templates/lock_env_for_update.sql new file mode 100644 index 0000000000..fabb78fecc --- /dev/null +++ b/packages/db/queries/templates/lock_env_for_update.sql @@ -0,0 +1,4 @@ +-- name: LockEnvForUpdate :one +SELECT id FROM "public"."envs" +WHERE id = @env_id +FOR UPDATE;