diff --git a/NEXT_CHANGELOG.md b/NEXT_CHANGELOG.md index 88f45aac96..8c34587667 100644 --- a/NEXT_CHANGELOG.md +++ b/NEXT_CHANGELOG.md @@ -10,5 +10,6 @@ ### Bundles * Add validation that served_models and served_entities are not used at the same time. Add client side translation logic. ([#3880](https://github.com/databricks/cli/pull/3880)) +* Gracefully handle interrupts (SIGINT, SIGTERM, SIGHUP, SIGQUIT) during bundle deployment and destruction by releasing locks before exiting ([#3758](https://github.com/databricks/cli/pull/3758)) ### API Changes diff --git a/acceptance/bundle/deploy/signal-cleanup/databricks.yml b/acceptance/bundle/deploy/signal-cleanup/databricks.yml new file mode 100644 index 0000000000..17a6fc84e9 --- /dev/null +++ b/acceptance/bundle/deploy/signal-cleanup/databricks.yml @@ -0,0 +1,10 @@ +bundle: + name: signal-test + +resources: + jobs: + job1: + name: job1 + + job2: + name: job2 (deploy after ${resources.jobs.job1.id}) diff --git a/acceptance/bundle/deploy/signal-cleanup/out.test.toml b/acceptance/bundle/deploy/signal-cleanup/out.test.toml new file mode 100644 index 0000000000..90061dedb1 --- /dev/null +++ b/acceptance/bundle/deploy/signal-cleanup/out.test.toml @@ -0,0 +1,5 @@ +Local = true +Cloud = false + +[EnvMatrix] + DATABRICKS_BUNDLE_ENGINE = ["terraform"] diff --git a/acceptance/bundle/deploy/signal-cleanup/output.txt b/acceptance/bundle/deploy/signal-cleanup/output.txt new file mode 100644 index 0000000000..b139caf23b --- /dev/null +++ b/acceptance/bundle/deploy/signal-cleanup/output.txt @@ -0,0 +1,83 @@ + +=== Wait until the deployment has started.Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/signal-test/default/files... +Deploying resources... +Deployment in progress, sending interrupt signal... + +>>> kill -INT [PID] + +>>> wait [PID] +Operation interrupted. Gracefully shutting down... +Error: terraform apply: exit status 1 + +Updating deployment state... + +Exit code: 130 + +=== A deletion request for deploy.lock should have been recorded in the requests file +>>> cat out.requests.txt +{ + "method": "POST", + "path": "/api/2.0/workspace/delete", + "body": { + "path": "/Workspace/Users/[USERNAME]/.bundle/signal-test/default/state/deploy.lock" + } +} + +=== An upload request for the state file should have been recorded in the requests file +>>> cat out.requests.txt +{ + "method": "POST", + "path": "/api/2.0/workspace-files/import-file/Workspace/Users/[USERNAME]/.bundle/signal-test/default/state/terraform.tfstate", + "q": { + "overwrite": "true" + }, + "body": { + "version": 4, + "terraform_version": "1.5.5", + "serial": 1, + "lineage": "[UUID]", + "outputs": {}, + "resources": [ + { + "mode": "managed", + "type": "databricks_job", + "name": "job2", + "provider": "provider[\"registry.terraform.io/databricks/databricks\"]", + "instances": [] + } + ], + "check_results": null + } +} + +=== A creation request for job1 should be recorded in the requests file. No request for job2 should exist since the process was terminated mid deployment. 
+>>> cat out.requests.txt +{ + "method": "POST", + "path": "/api/2.2/jobs/create", + "body": { + "deployment": { + "kind": "BUNDLE", + "metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/signal-test/default/state/metadata.json" + }, + "edit_mode": "UI_LOCKED", + "format": "MULTI_TASK", + "max_concurrent_runs": 1, + "name": "job1", + "queue": { + "enabled": true + } + } +} + +>>> [CLI] bundle debug plan +{ + "plan": { + "resources.jobs.job1": { + "action": "create" + }, + "resources.jobs.job2": { + "action": "create" + } + } +} diff --git a/acceptance/bundle/deploy/signal-cleanup/script b/acceptance/bundle/deploy/signal-cleanup/script new file mode 100644 index 0000000000..ec9d92383b --- /dev/null +++ b/acceptance/bundle/deploy/signal-cleanup/script @@ -0,0 +1,35 @@ +#!/bin/bash + +# Start deployment in background, redirecting stderr to capture when deployment starts +$CLI bundle deploy 2>&1 & +DEPLOY_PID=$! + +# Wait for deployment to start by monitoring the requests file +# Once we see the job creation request starting, we know deployment is in progress +title "Wait until the deployment has started." +for i in {1..30}; do + if [ -f out.requests.txt ] && jq -e 'select(.method == "POST" and (.path | contains("/api/2.2/jobs/create")))' out.requests.txt >/dev/null 2>&1; then + echo "Deployment in progress, sending interrupt signal..." + break + fi + sleep 0.1 +done + +# Send interrupt signal +trace kill -INT $DEPLOY_PID + +# Wait for process to complete +errcode trace wait $DEPLOY_PID + +title "A deletion request for deploy.lock should have been recorded in the requests file" +trace cat out.requests.txt | jq 'select(.method == "POST" and (.path | contains("workspace/delete")) and (.body.path | contains("deploy.lock")))' + +title "An upload request for the state file should have been recorded in the requests file" +trace cat out.requests.txt | jq 'select(.method == "POST" and (.path | contains("workspace-files/import-file")) and (.path | contains("terraform.tfstate")))' + +title "A creation request for job1 should be recorded in the requests file. No request for job2 should exist since the process was terminated mid deployment." +trace cat out.requests.txt | jq 'select(.method == "POST" and (.path | contains("/api/2.2/jobs/create")))' + +trace $CLI bundle debug plan + +rm out.requests.txt diff --git a/acceptance/bundle/deploy/signal-cleanup/test.toml b/acceptance/bundle/deploy/signal-cleanup/test.toml new file mode 100644 index 0000000000..0c55bb6aa9 --- /dev/null +++ b/acceptance/bundle/deploy/signal-cleanup/test.toml @@ -0,0 +1,20 @@ +Local = true +Cloud = false +RecordRequests = true + +# Test only terraform engine (signal handling is the same for direct) +EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["terraform"] + +# Add delay to first job creation to ensure we can interrupt during deployment +[[Server]] +Pattern = "POST /api/2.2/jobs/create" +Response.StatusCode = 200 +Response.Body = '{"job_id": 1111}' + +# Large time to ensure deployment gets stuck when trying to create the first job. 
+Delay = "300s" + +# Replace PID numbers in kill/wait commands +[[Repls]] +Old = "(kill -INT |wait )\\d+" +New = "$1[PID]" diff --git a/acceptance/bundle/destroy/signal-cleanup/databricks.yml b/acceptance/bundle/destroy/signal-cleanup/databricks.yml new file mode 100644 index 0000000000..64d09d3762 --- /dev/null +++ b/acceptance/bundle/destroy/signal-cleanup/databricks.yml @@ -0,0 +1,7 @@ +bundle: + name: signal-test + +resources: + jobs: + job1: + name: job1 diff --git a/acceptance/bundle/destroy/signal-cleanup/out.test.toml b/acceptance/bundle/destroy/signal-cleanup/out.test.toml new file mode 100644 index 0000000000..54146af564 --- /dev/null +++ b/acceptance/bundle/destroy/signal-cleanup/out.test.toml @@ -0,0 +1,5 @@ +Local = true +Cloud = false + +[EnvMatrix] + DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git a/acceptance/bundle/destroy/signal-cleanup/output.txt b/acceptance/bundle/destroy/signal-cleanup/output.txt new file mode 100644 index 0000000000..b3308aaa97 --- /dev/null +++ b/acceptance/bundle/destroy/signal-cleanup/output.txt @@ -0,0 +1,59 @@ +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/signal-test/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! + +=== Wait until the destroy has started.The following resources will be deleted: + delete job job1 + +All files and directories at the following location will be deleted: /Workspace/Users/[USERNAME]/.bundle/signal-test/default + +Destroy in progress, sending interrupt signal... + +>>> kill -INT [PID] + +>>> wait [PID] +Operation interrupted. Gracefully shutting down... +Error: cannot delete resources.jobs.job1: deleting id=[NUMID]: Post "[DATABRICKS_URL]/api/2.2/jobs/delete": context canceled + + +Exit code: 130 + +=== A deletion request for deploy.lock should have been recorded in the requests file +>>> cat out.requests.txt +{ + "method": "POST", + "path": "/api/2.0/workspace/delete", + "body": { + "path": "/Workspace/Users/[USERNAME]/.bundle/signal-test/default/state/deploy.lock" + } +} + +=== No deletion request for resources.json should be recorded. We still need state to complete the destroy. +>>> cat out.requests.txt + +>>> [CLI] bundle debug plan +{ + "plan": { + "resources.jobs.job1": { + "action": "skip", + "remote_state": { + "creator_user_name": "[USERNAME]", + "job_id": [NUMID], + "settings": { + "deployment": { + "kind": "BUNDLE", + "metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/signal-test/default/state/metadata.json" + }, + "edit_mode": "UI_LOCKED", + "format": "MULTI_TASK", + "max_concurrent_runs": 1, + "name": "job1", + "queue": { + "enabled": true + } + } + } + } + } +} diff --git a/acceptance/bundle/destroy/signal-cleanup/script b/acceptance/bundle/destroy/signal-cleanup/script new file mode 100755 index 0000000000..d2ae482cc2 --- /dev/null +++ b/acceptance/bundle/destroy/signal-cleanup/script @@ -0,0 +1,34 @@ +#!/bin/bash + +# First deploy the bundle so we have something to destroy +$CLI bundle deploy --auto-approve +rm out.requests.txt + +# Start destroy in background, redirecting stderr to capture when destroy starts +$CLI bundle destroy --auto-approve 2>&1 & +DESTROY_PID=$! + +# Wait for destroy to start by monitoring for job deletion request +title "Wait until the destroy has started." +for i in {1..30}; do + if [ -f out.requests.txt ] && jq -e 'select(.method == "POST" and (.path | contains("/api/2.2/jobs/delete")))' out.requests.txt >/dev/null 2>&1; then + echo "Destroy in progress, sending interrupt signal..." 
+ break + fi + sleep 0.1 +done + +# Send interrupt signal +trace kill -INT $DESTROY_PID + +# Wait for process to complete +errcode trace wait $DESTROY_PID + +title "A deletion request for deploy.lock should have been recorded in the requests file" +trace cat out.requests.txt | jq 'select(.method == "POST" and (.path | contains("workspace/delete")) and (.body.path | contains("deploy.lock")))' + +title "No deletion request for resources.json should be recorded. We still need state to complete the destroy." +trace cat out.requests.txt | jq 'select(.method == "POST" and (.path | contains("workspace/delete")) and (.body.path | contains("resources.json")))' + +trace $CLI bundle debug plan +rm out.requests.txt diff --git a/acceptance/bundle/destroy/signal-cleanup/test.toml b/acceptance/bundle/destroy/signal-cleanup/test.toml new file mode 100644 index 0000000000..5d6ce2f750 --- /dev/null +++ b/acceptance/bundle/destroy/signal-cleanup/test.toml @@ -0,0 +1,20 @@ +Local = true +Cloud = false +RecordRequests = true + +# Test only direct engine (signal handling is the same for terraform) +EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["direct"] + +# Add delay to first job deletion to ensure we can interrupt during destroy +[[Server]] +Pattern = "POST /api/2.2/jobs/delete" +Response.StatusCode = 200 +Response.Body = '{}' + +# Large time to ensure destroy gets stuck when deleting the first job. +Delay = "300s" + +# Replace PID numbers in kill/wait commands +[[Repls]] +Old = "(kill -INT |wait )\\d+" +New = "$1[PID]" diff --git a/bundle/deploy/lock/release.go b/bundle/deploy/lock/release.go index 26f95edfc9..71e42b82e5 100644 --- a/bundle/deploy/lock/release.go +++ b/bundle/deploy/lock/release.go @@ -5,25 +5,13 @@ import ( "github.com/databricks/cli/bundle" "github.com/databricks/cli/libs/diag" - "github.com/databricks/cli/libs/locker" "github.com/databricks/cli/libs/log" ) -type Goal string +type release struct{} -const ( - GoalBind = Goal("bind") - GoalUnbind = Goal("unbind") - GoalDeploy = Goal("deploy") - GoalDestroy = Goal("destroy") -) - -type release struct { - goal Goal -} - -func Release(goal Goal) bundle.Mutator { - return &release{goal} +func Release() bundle.Mutator { + return &release{} } func (m *release) Name() string { @@ -45,14 +33,6 @@ func (m *release) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnostics } log.Infof(ctx, "Releasing deployment lock") - switch m.goal { - case GoalDeploy: - return diag.FromErr(b.Locker.Unlock(ctx)) - case GoalBind, GoalUnbind: - return diag.FromErr(b.Locker.Unlock(ctx)) - case GoalDestroy: - return diag.FromErr(b.Locker.Unlock(ctx, locker.AllowLockFileNotExist)) - default: - return diag.Errorf("unknown goal for lock release: %s", m.goal) - } + err := b.Locker.Unlock(ctx) + return diag.FromErr(err) } diff --git a/bundle/deploy/lock/release_test.go b/bundle/deploy/lock/release_test.go new file mode 100644 index 0000000000..0cd68ef074 --- /dev/null +++ b/bundle/deploy/lock/release_test.go @@ -0,0 +1,181 @@ +package lock_test + +import ( + "context" + "reflect" + "testing" + + "github.com/databricks/cli/bundle" + "github.com/databricks/cli/bundle/config" + "github.com/databricks/cli/bundle/deploy/lock" + "github.com/databricks/cli/libs/filer" + lockpkg "github.com/databricks/cli/libs/locker" + "github.com/google/uuid" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// createTestLocker creates a locker with a filer for testing +func createTestLocker(f filer.Filer, targetDir string) *lockpkg.Locker { + l := &lockpkg.Locker{ + 
TargetDir: targetDir, + Active: false, + State: &lockpkg.LockState{ + ID: uuid.New(), + User: "test-user", + }, + } + // Use reflection to set the private filer field + v := reflect.ValueOf(l).Elem() + filerField := v.FieldByName("filer") + filerField = reflect.NewAt(filerField.Type(), filerField.Addr().UnsafePointer()).Elem() + filerField.Set(reflect.ValueOf(f)) + return l +} + +func TestReleaseIdempotent(t *testing.T) { + ctx := context.Background() + + // Create a temporary directory for the filer + tmpDir := t.TempDir() + f, err := filer.NewLocalClient(tmpDir) + require.NoError(t, err) + + // Create a bundle with a locker + enabled := true + b := &bundle.Bundle{ + Config: config.Root{ + Bundle: config.Bundle{ + Target: "test", + Deployment: config.Deployment{ + Lock: config.Lock{ + Enabled: &enabled, + }, + }, + }, + }, + } + + // Initialize locker with the filer + locker := createTestLocker(f, tmpDir) + b.Locker = locker + + // Acquire lock + err = locker.Lock(ctx, false) + require.NoError(t, err) + assert.True(t, locker.Active) + + // Verify lock file exists + _, err = f.Stat(ctx, "deploy.lock") + require.NoError(t, err) + + // First release - should succeed + mutator := lock.Release() + diags := bundle.Apply(ctx, b, mutator) + require.NoError(t, diags.Error()) + assert.False(t, locker.Active) + + // Verify lock file is deleted + _, err = f.Stat(ctx, "deploy.lock") + require.Error(t, err) + + // Second release - should be idempotent and succeed + diags = bundle.Apply(ctx, b, mutator) + require.NoError(t, diags.Error()) + assert.False(t, locker.Active) + + // Third release - should still be idempotent and succeed + diags = bundle.Apply(ctx, b, mutator) + require.NoError(t, diags.Error()) + assert.False(t, locker.Active) +} + +func TestReleaseFileAlreadyDeleted(t *testing.T) { + ctx := context.Background() + + // Create a temporary directory for the filer + tmpDir := t.TempDir() + f, err := filer.NewLocalClient(tmpDir) + require.NoError(t, err) + + // Create a bundle with a locker + enabled := true + b := &bundle.Bundle{ + Config: config.Root{ + Bundle: config.Bundle{ + Target: "test", + Deployment: config.Deployment{ + Lock: config.Lock{ + Enabled: &enabled, + }, + }, + }, + }, + } + + // Initialize locker with the filer + locker := createTestLocker(f, tmpDir) + b.Locker = locker + + // Acquire lock + err = locker.Lock(ctx, false) + require.NoError(t, err) + assert.True(t, locker.Active) + + // Verify lock file exists + _, err = f.Stat(ctx, "deploy.lock") + require.NoError(t, err) + + // Manually delete lock file + err = f.Delete(ctx, "deploy.lock") + require.NoError(t, err) + + // Release lock - should succeed even though lock file is already deleted + mutator := lock.Release() + diags := bundle.Apply(ctx, b, mutator) + require.NoError(t, diags.Error()) + assert.False(t, locker.Active) +} + +func TestReleaseWhenAnotherProcessHoldsLock(t *testing.T) { + ctx := context.Background() + + // Create a temporary directory for the filer + tmpDir := t.TempDir() + f, err := filer.NewLocalClient(tmpDir) + require.NoError(t, err) + + // Create two lockers - simulating two different processes + locker1 := createTestLocker(f, tmpDir) + locker2 := createTestLocker(f, tmpDir) + + // First locker acquires the lock + err = locker1.Lock(ctx, false) + require.NoError(t, err) + assert.True(t, locker1.Active) + + // Create bundle with second locker (different process) + enabled := true + b := &bundle.Bundle{ + Config: config.Root{ + Bundle: config.Bundle{ + Target: "test", + Deployment: config.Deployment{ 
+					Lock: config.Lock{
+						Enabled: &enabled,
+					},
+				},
+			},
+		},
+		Locker: locker2,
+	}
+
+	// Set locker2 as active (simulating it thinks it has the lock, but it doesn't)
+	locker2.Active = true
+
+	// Try to release with locker2 - should error because locker1 holds the lock
+	mutator := lock.Release()
+	diags := bundle.Apply(ctx, b, mutator)
+	require.Error(t, diags.Error())
+	assert.Contains(t, diags.Error().Error(), "deploy lock acquired by")
+}
diff --git a/bundle/phases/bind.go b/bundle/phases/bind.go
index c2a2721438..a3526a2250 100644
--- a/bundle/phases/bind.go
+++ b/bundle/phases/bind.go
@@ -20,7 +20,7 @@ func Bind(ctx context.Context, b *bundle.Bundle, opts *terraform.BindOptions) {
 	}
 
 	defer func() {
-		bundle.ApplyContext(ctx, b, lock.Release(lock.GoalBind))
+		bundle.ApplyContext(ctx, b, lock.Release())
 	}()
 
 	bundle.ApplySeqContext(ctx, b,
@@ -40,7 +40,7 @@ func Unbind(ctx context.Context, b *bundle.Bundle, bundleType, tfResourceType, r
 	}
 
 	defer func() {
-		bundle.ApplyContext(ctx, b, lock.Release(lock.GoalUnbind))
+		bundle.ApplyContext(ctx, b, lock.Release())
 	}()
 
 	bundle.ApplySeqContext(ctx, b,
diff --git a/bundle/phases/deploy.go b/bundle/phases/deploy.go
index 85d1f04eeb..765eb088ce 100644
--- a/bundle/phases/deploy.go
+++ b/bundle/phases/deploy.go
@@ -3,6 +3,9 @@ package phases
 import (
 	"context"
 	"errors"
+	"os"
+	"os/signal"
+	"syscall"
 
 	"github.com/databricks/cli/bundle"
 	"github.com/databricks/cli/bundle/artifacts"
@@ -104,7 +107,10 @@ func deployCore(ctx context.Context, b *bundle.Bundle, plan *deployplan.Plan, di
 	}
 
 	// Even if deployment failed, there might be updates in states that we need to upload
-	bundle.ApplyContext(ctx, b,
+	// Use context.WithoutCancel to ensure state push completes even if context was cancelled
+	// (e.g., due to signal interruption). We want to save the current state before exiting.
+	statePushCtx := context.WithoutCancel(ctx)
+	bundle.ApplyContext(statePushCtx, b,
 		statemgmt.StatePush(directDeployment),
 	)
 	if logdiag.HasError(ctx) {
@@ -132,6 +138,101 @@ func uploadLibraries(ctx context.Context, b *bundle.Bundle, libs map[string][]li
 	)
 }
 
+// registerGracefulCleanup sets up signal handlers to release the lock
+// before the process terminates. Returns a new context that will be cancelled
+// when a signal is received, and a cleanup function for the exit path.
+//
+// This follows idiomatic Go patterns for graceful shutdown:
+// 1. Use context cancellation to signal shutdown to the main routine
+// 2. Use a done channel to wait for the main routine to complete
+// 3. Only exit after confirming the main routine has terminated
+//
+// Catches SIGINT (Ctrl+C), SIGTERM, SIGHUP, and SIGQUIT.
+// Note: SIGKILL and SIGSTOP cannot be caught - the kernel handles them directly.
+func registerGracefulCleanup(ctx context.Context, b *bundle.Bundle) (context.Context, func()) {
+	// Create a cancellable context to propagate cancellation to the main routine
+	ctx, cancel := context.WithCancel(ctx)
+
+	// Channel to signal that the main + cleanup routine has completed
+	cleanupDone := make(chan struct{})
+
+	// Channel to signal that a signal was received and handled
+	signalReceived := make(chan struct{})
+
+	// Channel to receive OS signals
+	sigChan := make(chan os.Signal, 1)
+	signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM, syscall.SIGHUP, syscall.SIGQUIT)
+
+	signalHandler := func() {
+		// Wait for a signal to be received.
+		sig := <-sigChan
+
+		// Stop listening for more signals, so that a second interrupt
+		// falls through to the default handler and force-kills the process.
+		signal.Stop(sigChan)
+
+		// Signal that we received an interrupt
+		close(signalReceived)
+
+		cmdio.LogString(ctx, "Operation interrupted. Gracefully shutting down...")
+
+		// Cancel the context to signal the main routine to stop
+		cancel()
+
+		// Wait for the main routine to complete before releasing the lock
+		// This ensures we don't exit while operations are still in progress
+		<-cleanupDone
+
+		// Release the lock. We use context.WithoutCancel to preserve context
+		// values (like user agent) but drop the cancellation signal, so the
+		// lock release can complete even though ctx was just cancelled.
+		releaseCtx := context.WithoutCancel(ctx)
+		bundle.ApplyContext(releaseCtx, b, lock.Release())
+
+		// Calculate exit code (128 + signal number)
+		exitCode := 128
+		if s, ok := sig.(syscall.Signal); ok {
+			exitCode += int(s)
+		}
+
+		// Exit with the appropriate signal exit code
+		os.Exit(exitCode)
+	}
+
+	// Start goroutine to handle signals
+	go signalHandler()
+
+	// Return cleanup function for the exit path
+	// This should be called via defer to ensure it runs even if there's a panic
+	cleanup := func() {
+		// Stop listening for signals
+		signal.Stop(sigChan)
+
+		// Release the lock (idempotent)
+		// Use context.WithoutCancel to preserve context values but remove cancellation
+		releaseCtx := context.WithoutCancel(ctx)
+		bundle.ApplyContext(releaseCtx, b, lock.Release())
+
+		// Signal that the main routine has completed. This must be done
+		// AFTER all cleanup is complete: once this channel is closed, the
+		// signal handler resumes and exits the process.
+		close(cleanupDone)
+
+		// If a signal was received, wait indefinitely for the signal handler to exit
+		// This prevents the main function from returning and exiting with a different code
+		// If no signal was received, signalReceived will never be closed, so we just return
+		select {
+		case <-signalReceived:
+			// Signal was received, wait forever for os.Exit() in signal handler
+			select {}
+		default:
+			// No signal received, proceed with normal exit
+		}
+	}
+
+	return ctx, cleanup
+}
+
 // The deploy phase deploys artifacts and resources.
func Deploy(ctx context.Context, b *bundle.Bundle, outputHandler sync.OutputHandler, directDeployment bool) { log.Info(ctx, "Phase: deploy") @@ -148,10 +249,9 @@ func Deploy(ctx context.Context, b *bundle.Bundle, outputHandler sync.OutputHand return } - // lock is acquired here - defer func() { - bundle.ApplyContext(ctx, b, lock.Release(lock.GoalDeploy)) - }() + // lock is acquired here - set up signal handlers and defer cleanup + ctx, cleanup := registerGracefulCleanup(ctx, b) + defer cleanup() libs := deployPrepare(ctx, b, false, directDeployment) if logdiag.HasError(ctx) { diff --git a/bundle/phases/destroy.go b/bundle/phases/destroy.go index bea8d25c4e..be2493bc85 100644 --- a/bundle/phases/destroy.go +++ b/bundle/phases/destroy.go @@ -125,12 +125,13 @@ func Destroy(ctx context.Context, b *bundle.Bundle, directDeployment bool) { bundle.ApplyContext(ctx, b, lock.Acquire()) if logdiag.HasError(ctx) { + // lock is not acquired here return } - defer func() { - bundle.ApplyContext(ctx, b, lock.Release(lock.GoalDestroy)) - }() + // lock is acquired here - set up signal handlers and defer cleanup + ctx, cleanup := registerGracefulCleanup(ctx, b) + defer cleanup() if !directDeployment { bundle.ApplySeqContext(ctx, b, diff --git a/integration/libs/locker/locker_test.go b/integration/libs/locker/locker_test.go index 3ae80f8e71..2521dc5d43 100644 --- a/integration/libs/locker/locker_test.go +++ b/integration/libs/locker/locker_test.go @@ -186,8 +186,8 @@ func TestLockUnlockWithAllowsLockFileNotExist(t *testing.T) { err = f.Delete(ctx, "deploy.lock") assert.NoError(t, err) - // Assert error, because lock file does not exist - err = locker.Unlock(ctx, lockpkg.AllowLockFileNotExist) + // Unlock should succeed even though lock file does not exist + err = locker.Unlock(ctx) assert.NoError(t, err) assert.False(t, locker.Active) } diff --git a/libs/locker/locker.go b/libs/locker/locker.go index aadc50b587..3f348da7b8 100644 --- a/libs/locker/locker.go +++ b/libs/locker/locker.go @@ -8,7 +8,6 @@ import ( "fmt" "io" "io/fs" - "slices" "time" "github.com/databricks/cli/libs/filer" @@ -16,12 +15,6 @@ import ( "github.com/google/uuid" ) -type UnlockOption int - -const ( - AllowLockFileNotExist UnlockOption = iota -) - const LockFileName = "deploy.lock" // Locker object enables exclusive access to TargetDir's scope for a client. This @@ -171,15 +164,18 @@ func (locker *Locker) Lock(ctx context.Context, isForced bool) error { return nil } -func (locker *Locker) Unlock(ctx context.Context, opts ...UnlockOption) error { +func (locker *Locker) Unlock(ctx context.Context) error { + // If locker is already inactive, return without an error. + // This keeps the locker idempotent in the face of multiple unlock calls. if !locker.Active { - return errors.New("unlock called when lock is not held") + return nil } - // if allowLockFileNotExist is set, do not throw an error if the lock file does - // not exist. This is helpful when destroying a bundle in which case the lock - // file will be deleted before we have a chance to unlock - if _, err := locker.filer.Stat(ctx, LockFileName); errors.Is(err, fs.ErrNotExist) && slices.Contains(opts, AllowLockFileNotExist) { + // Check if the lock file exists. If it doesn't exist, this is not an error + // as it makes unlock idempotent. This is helpful in several scenarios: + // 1. When destroying a bundle, the lock file may be deleted before unlock + // 2. 
When unlock is called multiple times, which is possible when handling signals like SIGINT, SIGTERM, SIGHUP, SIGQUIT + if _, err := locker.filer.Stat(ctx, LockFileName); errors.Is(err, fs.ErrNotExist) { locker.Active = false return nil }
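
As a minimal illustration of the idempotent-unlock contract that release.go now relies on, consider the sketch below. fileLock is a hypothetical stand-in for the CLI's Locker, not its implementation; it only demonstrates the two cases the locker.go comments call out as non-errors: repeated unlock calls and a lock file that was already deleted.

package main

import (
	"errors"
	"fmt"
	"io/fs"
	"os"
)

// fileLock is a toy lock backed by a marker file. Unlock tolerates both
// repeated calls and a lock file that has already been deleted, mirroring
// the contract Locker.Unlock now provides.
type fileLock struct {
	path   string
	active bool
}

func (l *fileLock) Unlock() error {
	if !l.active {
		// Repeated unlock (e.g. both the signal handler and the deferred
		// cleanup running) is a no-op rather than an error.
		return nil
	}
	if err := os.Remove(l.path); err != nil && !errors.Is(err, fs.ErrNotExist) {
		return err
	}
	// A lock file that is already gone (e.g. deleted while destroying the
	// bundle) also counts as successfully released.
	l.active = false
	return nil
}

func main() {
	f, err := os.CreateTemp("", "deploy-*.lock")
	if err != nil {
		panic(err)
	}
	f.Close()

	l := &fileLock{path: f.Name(), active: true}
	fmt.Println(l.Unlock()) // <nil>: removes the lock file
	fmt.Println(l.Unlock()) // <nil>: second call is a no-op
}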
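
The shutdown choreography in registerGracefulCleanup is easier to follow in isolation. Below is a condensed, self-contained sketch of the same pattern under simplified assumptions; work and releaseLock are placeholder names, not CLI functions. The handler goroutine owns the exit path: it cancels the shared context, waits for cleanup to finish, and exits with the conventional 128 + signal number code (130 for SIGINT, matching the "Exit code: 130" in the recorded test output above).

package main

import (
	"context"
	"fmt"
	"os"
	"os/signal"
	"syscall"
	"time"
)

func work(ctx context.Context) {
	select {
	case <-ctx.Done():
		fmt.Println("interrupted, shutting down")
	case <-time.After(2 * time.Second):
		fmt.Println("work finished")
	}
}

func releaseLock(ctx context.Context) {
	// Placeholder for the lock release; must run even after interruption.
	fmt.Println("lock released")
}

func main() {
	ctx, cancel := context.WithCancel(context.Background())

	sigChan := make(chan os.Signal, 1)
	signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM, syscall.SIGHUP, syscall.SIGQUIT)

	cleanupDone := make(chan struct{})
	interrupted := make(chan struct{})

	go func() {
		sig := <-sigChan
		signal.Stop(sigChan) // a second signal now force-kills via the default handler
		close(interrupted)
		cancel()      // ask the main routine to stop
		<-cleanupDone // wait until cleanup (the lock release) has finished
		code := 128   // conventional "killed by signal" exit code: 128 + signo
		if s, ok := sig.(syscall.Signal); ok {
			code += int(s) // SIGINT(2) -> 130
		}
		os.Exit(code)
	}()

	work(ctx)

	// Strip cancellation but keep context values, as the PR does, so the
	// cleanup itself cannot be interrupted by the cancelled ctx.
	releaseLock(context.WithoutCancel(ctx))
	close(cleanupDone)

	// If we were interrupted, let the signal handler own the exit code.
	select {
	case <-interrupted:
		select {} // block until os.Exit in the handler
	default:
	}
	cancel()
}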
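
Finally, createTestLocker in release_test.go sets the Locker's unexported filer field via reflect.NewAt, since reflect refuses Set on a field obtained through an unexported name, even though the field is addressable. A stripped-down example of that technique, with a hypothetical widget type in place of the Locker:

package main

import (
	"fmt"
	"reflect"
)

type widget struct {
	name string // unexported: reflect.Value.Set on it normally panics
}

func main() {
	w := &widget{}

	f := reflect.ValueOf(w).Elem().FieldByName("name")
	fmt.Println(f.CanSet()) // false: unexported field

	// Re-derive an addressable, settable Value from the field's address.
	// This is the same trick createTestLocker uses for the filer field.
	f = reflect.NewAt(f.Type(), f.Addr().UnsafePointer()).Elem()
	f.Set(reflect.ValueOf("hello"))

	fmt.Println(w.name) // hello
}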