From 48bd034c456602f9b003d4582af63ea86ffc43c2 Mon Sep 17 00:00:00 2001 From: Shreyas Goenka Date: Tue, 14 Oct 2025 13:43:28 +0200 Subject: [PATCH 1/6] Gracefully exit when terminated mid deploy / destroy --- .../deploy/signal-cleanup/databricks.yml | 10 ++++ .../deploy/signal-cleanup/out.test.toml | 5 ++ .../bundle/deploy/signal-cleanup/output.txt | 40 ++++++++++++++++ .../bundle/deploy/signal-cleanup/script | 30 ++++++++++++ .../bundle/deploy/signal-cleanup/test.toml | 17 +++++++ .../destroy/signal-cleanup/databricks.yml | 10 ++++ .../destroy/signal-cleanup/out.test.toml | 5 ++ .../bundle/destroy/signal-cleanup/output.txt | 33 +++++++++++++ .../bundle/destroy/signal-cleanup/script | 32 +++++++++++++ .../bundle/destroy/signal-cleanup/test.toml | 20 ++++++++ bundle/phases/deploy.go | 46 +++++++++++++++++-- bundle/phases/destroy.go | 6 +-- 12 files changed, 247 insertions(+), 7 deletions(-) create mode 100644 acceptance/bundle/deploy/signal-cleanup/databricks.yml create mode 100644 acceptance/bundle/deploy/signal-cleanup/out.test.toml create mode 100644 acceptance/bundle/deploy/signal-cleanup/output.txt create mode 100644 acceptance/bundle/deploy/signal-cleanup/script create mode 100644 acceptance/bundle/deploy/signal-cleanup/test.toml create mode 100644 acceptance/bundle/destroy/signal-cleanup/databricks.yml create mode 100644 acceptance/bundle/destroy/signal-cleanup/out.test.toml create mode 100644 acceptance/bundle/destroy/signal-cleanup/output.txt create mode 100755 acceptance/bundle/destroy/signal-cleanup/script create mode 100644 acceptance/bundle/destroy/signal-cleanup/test.toml diff --git a/acceptance/bundle/deploy/signal-cleanup/databricks.yml b/acceptance/bundle/deploy/signal-cleanup/databricks.yml new file mode 100644 index 0000000000..17a6fc84e9 --- /dev/null +++ b/acceptance/bundle/deploy/signal-cleanup/databricks.yml @@ -0,0 +1,10 @@ +bundle: + name: signal-test + +resources: + jobs: + job1: + name: job1 + + job2: + name: job2 (deploy after ${resources.jobs.job1.id}) diff --git a/acceptance/bundle/deploy/signal-cleanup/out.test.toml b/acceptance/bundle/deploy/signal-cleanup/out.test.toml new file mode 100644 index 0000000000..e092fd5ed6 --- /dev/null +++ b/acceptance/bundle/deploy/signal-cleanup/out.test.toml @@ -0,0 +1,5 @@ +Local = true +Cloud = false + +[EnvMatrix] + DATABRICKS_BUNDLE_ENGINE = ["terraform", "direct-exp"] diff --git a/acceptance/bundle/deploy/signal-cleanup/output.txt b/acceptance/bundle/deploy/signal-cleanup/output.txt new file mode 100644 index 0000000000..fab0526c9d --- /dev/null +++ b/acceptance/bundle/deploy/signal-cleanup/output.txt @@ -0,0 +1,40 @@ + +=== Wait until the deployment has started.Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/signal-test/default/files... +Deploying resources... +Deployment in progress, sending interrupt signal... + +>>> kill -INT [PID] + +>>> wait [PID] +Operation interrupted. Gracefully shutting down... + +Exit code: 130 + +>>> cat out.requests.txt +{ + "method": "POST", + "path": "/api/2.0/workspace/delete", + "body": { + "path": "/Workspace/Users/[USERNAME]/.bundle/signal-test/default/state/deploy.lock" + } +} + +=== A creation request for job1 should be recorded in the requests file. No request for job2 should exist since the process was terminated mid deployment. 
+>>> cat out.requests.txt +{ + "method": "POST", + "path": "/api/2.2/jobs/create", + "body": { + "deployment": { + "kind": "BUNDLE", + "metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/signal-test/default/state/metadata.json" + }, + "edit_mode": "UI_LOCKED", + "format": "MULTI_TASK", + "max_concurrent_runs": 1, + "name": "job1", + "queue": { + "enabled": true + } + } +} diff --git a/acceptance/bundle/deploy/signal-cleanup/script b/acceptance/bundle/deploy/signal-cleanup/script new file mode 100644 index 0000000000..1321778167 --- /dev/null +++ b/acceptance/bundle/deploy/signal-cleanup/script @@ -0,0 +1,30 @@ +#!/bin/bash + +# Start deployment in background, redirecting stderr to capture when deployment starts +$CLI bundle deploy 2>&1 & +DEPLOY_PID=$! + +# Wait for deployment to start by monitoring the requests file +# Once we see the job creation request starting, we know deployment is in progress +title "Wait until the deployment has started." +for i in {1..30}; do + if [ -f out.requests.txt ] && jq -e 'select(.method == "POST" and (.path | contains("/api/2.2/jobs/create")))' out.requests.txt >/dev/null 2>&1; then + echo "Deployment in progress, sending interrupt signal..." + break + fi + sleep 0.1 +done + +# Send interrupt signal +trace kill -INT $DEPLOY_PID + +# Wait for process to complete +errcode trace wait $DEPLOY_PID + +# A deletion request for deploy.lock should have been recorded in the requests file +trace cat out.requests.txt | jq 'select(.method == "POST" and (.path | contains("workspace/delete")) and (.body.path | contains("deploy.lock")))' + +title "A creation request for job1 should be recorded in the requests file. No request for job2 should exist since the process was terminated mid deployment." +trace cat out.requests.txt | jq 'select(.method == "POST" and (.path | contains("/api/2.2/jobs/create")))' + +rm out.requests.txt diff --git a/acceptance/bundle/deploy/signal-cleanup/test.toml b/acceptance/bundle/deploy/signal-cleanup/test.toml new file mode 100644 index 0000000000..13cfea3c93 --- /dev/null +++ b/acceptance/bundle/deploy/signal-cleanup/test.toml @@ -0,0 +1,17 @@ +Local = true +Cloud = false +RecordRequests = true + +# Add delay to first job creation to ensure we can interrupt during deployment +[[Server]] +Pattern = "POST /api/2.2/jobs/create" +Response.StatusCode = 200 +Response.Body = '{"job_id": 1111}' + +# Large time to ensure deployment gets stuck when trying to create the first job. 
+Delay = "300s" + +# Replace PID numbers in kill/wait commands +[[Repls]] +Old = "(kill -INT |wait )\\d+" +New = "$1[PID]" diff --git a/acceptance/bundle/destroy/signal-cleanup/databricks.yml b/acceptance/bundle/destroy/signal-cleanup/databricks.yml new file mode 100644 index 0000000000..1eb4342413 --- /dev/null +++ b/acceptance/bundle/destroy/signal-cleanup/databricks.yml @@ -0,0 +1,10 @@ +bundle: + name: signal-test + +resources: + jobs: + job1: + name: job1 + + job2: + name: job2 (depends on ${resources.jobs.job1.id}) diff --git a/acceptance/bundle/destroy/signal-cleanup/out.test.toml b/acceptance/bundle/destroy/signal-cleanup/out.test.toml new file mode 100644 index 0000000000..90061dedb1 --- /dev/null +++ b/acceptance/bundle/destroy/signal-cleanup/out.test.toml @@ -0,0 +1,5 @@ +Local = true +Cloud = false + +[EnvMatrix] + DATABRICKS_BUNDLE_ENGINE = ["terraform"] diff --git a/acceptance/bundle/destroy/signal-cleanup/output.txt b/acceptance/bundle/destroy/signal-cleanup/output.txt new file mode 100644 index 0000000000..c0697f44c7 --- /dev/null +++ b/acceptance/bundle/destroy/signal-cleanup/output.txt @@ -0,0 +1,33 @@ +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/signal-test/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! + +=== Wait until the destroy has started.The following resources will be deleted: + delete job job1 + delete job job2 + +All files and directories at the following location will be deleted: /Workspace/Users/[USERNAME]/.bundle/signal-test/default + +Destroy in progress, sending interrupt signal... + +>>> kill -INT [PID] + +>>> wait [PID] +Operation interrupted. Gracefully shutting down... + +Exit code: 130 + +>>> cat out.requests.txt + +=== A deletion request for job2 should be recorded in the requests file. No request for job1 should exist since the process was terminated mid destroy. +>>> cat out.requests.txt +{ + "method": "POST", + "path": "/api/2.2/jobs/delete", + "body": { + "job_id": [NUMID] + } +} + +=== [CLI] \ No newline at end of file diff --git a/acceptance/bundle/destroy/signal-cleanup/script b/acceptance/bundle/destroy/signal-cleanup/script new file mode 100755 index 0000000000..459dae2cf9 --- /dev/null +++ b/acceptance/bundle/destroy/signal-cleanup/script @@ -0,0 +1,32 @@ +#!/bin/bash + +# First deploy the bundle so we have something to destroy +$CLI bundle deploy --auto-approve + +# Start destroy in background, redirecting stderr to capture when destroy starts +$CLI bundle destroy --auto-approve 2>&1 & +DESTROY_PID=$! + +# Wait for destroy to start by monitoring for job deletion request +title "Wait until the destroy has started." +for i in {1..30}; do + if [ -f out.requests.txt ] && jq -e 'select(.method == "POST" and (.path | contains("/api/2.2/jobs/delete")))' out.requests.txt >/dev/null 2>&1; then + echo "Destroy in progress, sending interrupt signal..." + break + fi + sleep 0.1 +done + +# Send interrupt signal +trace kill -INT $DESTROY_PID + +# Wait for process to complete +errcode trace wait $DESTROY_PID + +# A deletion request for destroy.lock should have been recorded in the requests file +trace cat out.requests.txt | jq 'select(.method == "POST" and (.path | contains("workspace/delete")) and (.body.path | contains("destroy.lock")))' + +title "A deletion request for job2 should be recorded in the requests file. No request for job1 should exist since the process was terminated mid destroy." 
+trace cat out.requests.txt | jq 'select(.method == "POST" and (.path | contains("/api/2.2/jobs/delete")))' + +rm out.requests.txt diff --git a/acceptance/bundle/destroy/signal-cleanup/test.toml b/acceptance/bundle/destroy/signal-cleanup/test.toml new file mode 100644 index 0000000000..f8186b73d9 --- /dev/null +++ b/acceptance/bundle/destroy/signal-cleanup/test.toml @@ -0,0 +1,20 @@ +Local = true +Cloud = false +RecordRequests = true + +# Test only terraform engine (signal handling is the same for direct) +EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["terraform"] + +# Add delay to first job deletion to ensure we can interrupt during destroy +[[Server]] +Pattern = "POST /api/2.2/jobs/delete" +Response.StatusCode = 200 +Response.Body = '{}' + +# Large time to ensure destroy gets stuck when deleting the first job. +Delay = "300s" + +# Replace PID numbers in kill/wait commands +[[Repls]] +Old = "(kill -INT |wait )\\d+" +New = "$1[PID]" diff --git a/bundle/phases/deploy.go b/bundle/phases/deploy.go index 85d1f04eeb..ec620947fe 100644 --- a/bundle/phases/deploy.go +++ b/bundle/phases/deploy.go @@ -3,6 +3,9 @@ package phases import ( "context" "errors" + "os" + "os/signal" + "syscall" "github.com/databricks/cli/bundle" "github.com/databricks/cli/bundle/artifacts" @@ -132,6 +135,43 @@ func uploadLibraries(ctx context.Context, b *bundle.Bundle, libs map[string][]li ) } +// registerGracefulCleanup sets up signal handlers to release the lock +// before the process terminates. Returns a cleanup function for the normal exit path. +// +// Catches SIGINT (Ctrl+C), SIGTERM, SIGHUP, and SIGQUIT. +// Note: SIGKILL and SIGSTOP cannot be caught - the kernel terminates the process directly. +func registerGracefulCleanup(ctx context.Context, b *bundle.Bundle, goal lock.Goal) func() { + sigChan := make(chan os.Signal, 1) + signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM, syscall.SIGHUP, syscall.SIGQUIT) + + // Start goroutine to handle signals + go func() { + sig := <-sigChan + // Stop listening for more signals + signal.Stop(sigChan) + + cmdio.LogString(ctx, "Operation interrupted. Gracefully shutting down...") + + // Release the lock. + bundle.ApplyContext(ctx, b, lock.Release(goal)) + + // Exit immediately with standard signal exit code (128 + signal number). + // The deferred cleanup function returned below won't run because we exit here. + exitCode := 128 + if s, ok := sig.(syscall.Signal); ok { + exitCode += int(s) + } + os.Exit(exitCode) + }() + + // Return cleanup function for normal exit path + return func() { + signal.Stop(sigChan) + // Don't close the channel - it causes the goroutine to receive nil + bundle.ApplyContext(ctx, b, lock.Release(goal)) + } +} + // The deploy phase deploys artifacts and resources. 
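The handler above exits with the conventional status of 128 plus the signal number, which is why both acceptance tests expect "Exit code: 130" after kill -INT. A minimal, standalone Go sketch of that mapping follows; it is illustrative only and not part of the patch:

package main

import (
	"fmt"
	"syscall"
)

// exitCodeFor mirrors the handler's convention: a process terminated by
// signal N reports exit status 128+N.
func exitCodeFor(sig syscall.Signal) int {
	return 128 + int(sig)
}

func main() {
	fmt.Println(exitCodeFor(syscall.SIGINT))  // 130 — the value asserted in output.txt
	fmt.Println(exitCodeFor(syscall.SIGTERM)) // 143
}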
func Deploy(ctx context.Context, b *bundle.Bundle, outputHandler sync.OutputHandler, directDeployment bool) { log.Info(ctx, "Phase: deploy") @@ -148,10 +188,8 @@ func Deploy(ctx context.Context, b *bundle.Bundle, outputHandler sync.OutputHand return } - // lock is acquired here - defer func() { - bundle.ApplyContext(ctx, b, lock.Release(lock.GoalDeploy)) - }() + // lock is acquired here - set up signal handlers and defer cleanup + defer registerGracefulCleanup(ctx, b, lock.GoalDeploy)() libs := deployPrepare(ctx, b, false, directDeployment) if logdiag.HasError(ctx) { diff --git a/bundle/phases/destroy.go b/bundle/phases/destroy.go index bea8d25c4e..29faaa7aef 100644 --- a/bundle/phases/destroy.go +++ b/bundle/phases/destroy.go @@ -125,12 +125,12 @@ func Destroy(ctx context.Context, b *bundle.Bundle, directDeployment bool) { bundle.ApplyContext(ctx, b, lock.Acquire()) if logdiag.HasError(ctx) { + // lock is not acquired here return } - defer func() { - bundle.ApplyContext(ctx, b, lock.Release(lock.GoalDestroy)) - }() + // lock is acquired here - set up signal handlers and defer cleanup + defer registerGracefulCleanup(ctx, b, lock.GoalDestroy)() if !directDeployment { bundle.ApplySeqContext(ctx, b, From 3023134c88a496788f209aa05bed5d54986573ea Mon Sep 17 00:00:00 2001 From: Shreyas Goenka Date: Thu, 6 Nov 2025 12:55:44 +0100 Subject: [PATCH 2/6] address comments --- .../deploy/signal-cleanup/out.test.toml | 2 +- .../bundle/deploy/signal-cleanup/output.txt | 17 ++++ .../bundle/deploy/signal-cleanup/script | 2 + .../bundle/deploy/signal-cleanup/test.toml | 3 + .../bundle/destroy/signal-cleanup/output.txt | 14 ++- .../bundle/destroy/signal-cleanup/script | 2 + bundle/bundle.go | 3 + bundle/deploy/lock/release.go | 29 +++--- bundle/phases/deploy.go | 89 +++++++++++++++---- bundle/phases/destroy.go | 3 +- 10 files changed, 135 insertions(+), 29 deletions(-) diff --git a/acceptance/bundle/deploy/signal-cleanup/out.test.toml b/acceptance/bundle/deploy/signal-cleanup/out.test.toml index e092fd5ed6..90061dedb1 100644 --- a/acceptance/bundle/deploy/signal-cleanup/out.test.toml +++ b/acceptance/bundle/deploy/signal-cleanup/out.test.toml @@ -2,4 +2,4 @@ Local = true Cloud = false [EnvMatrix] - DATABRICKS_BUNDLE_ENGINE = ["terraform", "direct-exp"] + DATABRICKS_BUNDLE_ENGINE = ["terraform"] diff --git a/acceptance/bundle/deploy/signal-cleanup/output.txt b/acceptance/bundle/deploy/signal-cleanup/output.txt index fab0526c9d..9f44243750 100644 --- a/acceptance/bundle/deploy/signal-cleanup/output.txt +++ b/acceptance/bundle/deploy/signal-cleanup/output.txt @@ -7,6 +7,11 @@ Deployment in progress, sending interrupt signal... >>> wait [PID] Operation interrupted. Gracefully shutting down... +Error: terraform apply: exit status 1 + +Updating deployment state... +Error: failed in rate limiter: context canceled + Exit code: 130 @@ -38,3 +43,15 @@ Exit code: 130 } } } + +>>> [CLI] bundle debug plan +{ + "plan": { + "resources.jobs.job1": { + "action": "create" + }, + "resources.jobs.job2": { + "action": "create" + } + } +} diff --git a/acceptance/bundle/deploy/signal-cleanup/script b/acceptance/bundle/deploy/signal-cleanup/script index 1321778167..046adb2aab 100644 --- a/acceptance/bundle/deploy/signal-cleanup/script +++ b/acceptance/bundle/deploy/signal-cleanup/script @@ -27,4 +27,6 @@ trace cat out.requests.txt | jq 'select(.method == "POST" and (.path | contains( title "A creation request for job1 should be recorded in the requests file. 
No request for job2 should exist since the process was terminated mid deployment." trace cat out.requests.txt | jq 'select(.method == "POST" and (.path | contains("/api/2.2/jobs/create")))' +trace $CLI bundle debug plan + rm out.requests.txt diff --git a/acceptance/bundle/deploy/signal-cleanup/test.toml b/acceptance/bundle/deploy/signal-cleanup/test.toml index 13cfea3c93..0c55bb6aa9 100644 --- a/acceptance/bundle/deploy/signal-cleanup/test.toml +++ b/acceptance/bundle/deploy/signal-cleanup/test.toml @@ -2,6 +2,9 @@ Local = true Cloud = false RecordRequests = true +# Test only terraform engine (signal handling is the same for direct) +EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["terraform"] + # Add delay to first job creation to ensure we can interrupt during deployment [[Server]] Pattern = "POST /api/2.2/jobs/create" diff --git a/acceptance/bundle/destroy/signal-cleanup/output.txt b/acceptance/bundle/destroy/signal-cleanup/output.txt index c0697f44c7..6ee581ddb3 100644 --- a/acceptance/bundle/destroy/signal-cleanup/output.txt +++ b/acceptance/bundle/destroy/signal-cleanup/output.txt @@ -15,6 +15,8 @@ Destroy in progress, sending interrupt signal... >>> wait [PID] Operation interrupted. Gracefully shutting down... +Error: terraform apply: exit status 1 + Exit code: 130 @@ -30,4 +32,14 @@ Exit code: 130 } } -=== [CLI] \ No newline at end of file +>>> [CLI] bundle debug plan +{ + "plan": { + "resources.jobs.job1": { + "action": "skip" + }, + "resources.jobs.job2": { + "action": "skip" + } + } +} diff --git a/acceptance/bundle/destroy/signal-cleanup/script b/acceptance/bundle/destroy/signal-cleanup/script index 459dae2cf9..7936106904 100755 --- a/acceptance/bundle/destroy/signal-cleanup/script +++ b/acceptance/bundle/destroy/signal-cleanup/script @@ -29,4 +29,6 @@ trace cat out.requests.txt | jq 'select(.method == "POST" and (.path | contains( title "A deletion request for job2 should be recorded in the requests file. No request for job1 should exist since the process was terminated mid destroy." trace cat out.requests.txt | jq 'select(.method == "POST" and (.path | contains("/api/2.2/jobs/delete")))' +trace $CLI bundle debug plan + rm out.requests.txt diff --git a/bundle/bundle.go b/bundle/bundle.go index 25112574dc..68fb0a7676 100644 --- a/bundle/bundle.go +++ b/bundle/bundle.go @@ -123,6 +123,9 @@ type Bundle struct { // Stores the locker responsible for acquiring/releasing a deployment lock. 
Locker *locker.Locker + // LockReleaseOnce ensures the lock is only released once + LockReleaseOnce sync.Once + // TerraformPlanPath is the path to the plan from the terraform CLI TerraformPlanPath string diff --git a/bundle/deploy/lock/release.go b/bundle/deploy/lock/release.go index 26f95edfc9..3252bfc9ad 100644 --- a/bundle/deploy/lock/release.go +++ b/bundle/deploy/lock/release.go @@ -44,15 +44,22 @@ func (m *release) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnostics return nil } - log.Infof(ctx, "Releasing deployment lock") - switch m.goal { - case GoalDeploy: - return diag.FromErr(b.Locker.Unlock(ctx)) - case GoalBind, GoalUnbind: - return diag.FromErr(b.Locker.Unlock(ctx)) - case GoalDestroy: - return diag.FromErr(b.Locker.Unlock(ctx, locker.AllowLockFileNotExist)) - default: - return diag.Errorf("unknown goal for lock release: %s", m.goal) - } + // Make lock release idempotent using sync.Once to prevent race conditions + // This ensures the lock is only released once even if called multiple times + var diags diag.Diagnostics + b.LockReleaseOnce.Do(func() { + log.Infof(ctx, "Releasing deployment lock") + switch m.goal { + case GoalDeploy: + diags = diag.FromErr(b.Locker.Unlock(ctx)) + case GoalBind, GoalUnbind: + diags = diag.FromErr(b.Locker.Unlock(ctx)) + case GoalDestroy: + diags = diag.FromErr(b.Locker.Unlock(ctx, locker.AllowLockFileNotExist)) + default: + diags = diag.Errorf("unknown goal for lock release: %s", m.goal) + } + }) + + return diags } diff --git a/bundle/phases/deploy.go b/bundle/phases/deploy.go index ec620947fe..8cb015d3fb 100644 --- a/bundle/phases/deploy.go +++ b/bundle/phases/deploy.go @@ -136,40 +136,98 @@ func uploadLibraries(ctx context.Context, b *bundle.Bundle, libs map[string][]li } // registerGracefulCleanup sets up signal handlers to release the lock -// before the process terminates. Returns a cleanup function for the normal exit path. +// before the process terminates. Returns a new context that will be cancelled +// when a signal is received, and a cleanup function for the exit path. +// +// This follows idiomatic Go patterns for graceful shutdown: +// 1. Use context cancellation to signal shutdown to the main routine +// 2. Use a done channel to wait for the main routine to complete +// 3. Only exit after confirming the main routine has terminated // // Catches SIGINT (Ctrl+C), SIGTERM, SIGHUP, and SIGQUIT. // Note: SIGKILL and SIGSTOP cannot be caught - the kernel terminates the process directly. -func registerGracefulCleanup(ctx context.Context, b *bundle.Bundle, goal lock.Goal) func() { +func registerGracefulCleanup(ctx context.Context, b *bundle.Bundle, goal lock.Goal) (context.Context, func()) { + // Create a cancellable context to propagate cancellation to the main routine + ctx, cancel := context.WithCancel(ctx) + + // Channel to signal that the main routine has completed + done := make(chan struct{}) + + // Channel to signal that a signal was received and handled + signalReceived := make(chan struct{}) + + // Channel to receive OS signals sigChan := make(chan os.Signal, 1) signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM, syscall.SIGHUP, syscall.SIGQUIT) - // Start goroutine to handle signals - go func() { + signalHandler := func() { + // Wait for a signal to be received. sig := <-sigChan - // Stop listening for more signals + + // Stop listening for more signals. This allows for multiple interrupts + // to cause the program to force exit. 
signal.Stop(sigChan) + // Signal that we received an interrupt + close(signalReceived) + cmdio.LogString(ctx, "Operation interrupted. Gracefully shutting down...") - // Release the lock. - bundle.ApplyContext(ctx, b, lock.Release(goal)) + // Cancel the context to signal the main routine to stop + cancel() + + // Wait for the main routine to complete before releasing the lock + // This ensures we don't exit while operations are still in progress + <-done - // Exit immediately with standard signal exit code (128 + signal number). - // The deferred cleanup function returned below won't run because we exit here. + // Release the lock using a context without cancellation to avoid cancellation issues + // We use context.WithoutCancel to preserve context values (like user agent) + // but remove the cancellation signal so the lock release can complete + releaseCtx := context.WithoutCancel(ctx) + bundle.ApplyContext(releaseCtx, b, lock.Release(goal)) + + // Calculate exit code (128 + signal number) exitCode := 128 if s, ok := sig.(syscall.Signal); ok { exitCode += int(s) } + + // Exit with the appropriate signal exit code os.Exit(exitCode) - }() + } + + // Start goroutine to handle signals + go signalHandler() - // Return cleanup function for normal exit path - return func() { + // Return cleanup function for the exit path + // This should be called via defer to ensure it runs even if there's a panic + cleanup := func() { + // Stop listening for signals signal.Stop(sigChan) - // Don't close the channel - it causes the goroutine to receive nil - bundle.ApplyContext(ctx, b, lock.Release(goal)) + + // Release the lock (idempotent thanks to sync.Once in lock.Release) + // Use context.WithoutCancel to preserve context values but remove cancellation + releaseCtx := context.WithoutCancel(ctx) + bundle.ApplyContext(releaseCtx, b, lock.Release(goal)) + + // Signal that the main routine has completed. + // Once the signal is recieved, + // This must be done AFTER all cleanup is complete + close(done) + + // If a signal was received, wait indefinitely for the signal handler to exit + // This prevents the main function from returning and exiting with a different code + // If no signal was received, signalReceived will never be closed, so we just return + select { + case <-signalReceived: + // Signal was received, wait forever for os.Exit() in signal handler + select {} + default: + // No signal received, proceed with normal exit + } } + + return ctx, cleanup } // The deploy phase deploys artifacts and resources. 
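The reworked handler coordinates shutdown through a cancellable context, a completion channel, and a final blocking wait so that, once a signal arrives, the process always exits via os.Exit in the signal handler rather than by returning from main. A self-contained sketch of the same pattern, using illustrative names (cleanupDone, a placeholder workload) rather than the patch's code:

package main

import (
	"context"
	"os"
	"os/signal"
	"syscall"
	"time"
)

func main() {
	ctx, cancel := context.WithCancel(context.Background())
	cleanupDone := make(chan struct{})

	sigs := make(chan os.Signal, 1)
	signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM)

	go func() {
		sig := <-sigs
		signal.Stop(sigs) // a second interrupt falls through to the default handler
		cancel()          // ask the worker to stop
		<-cleanupDone     // wait until its cleanup (e.g. lock release) has run
		code := 128
		if s, ok := sig.(syscall.Signal); ok {
			code += int(s)
		}
		os.Exit(code)
	}()

	// Worker: runs until cancelled or finished, then performs cleanup.
	select {
	case <-ctx.Done():
	case <-time.After(10 * time.Second):
	}
	close(cleanupDone) // cleanup complete

	if ctx.Err() != nil {
		select {} // a signal arrived: block until the handler calls os.Exit
	}
}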
@@ -189,7 +247,8 @@ func Deploy(ctx context.Context, b *bundle.Bundle, outputHandler sync.OutputHand } // lock is acquired here - set up signal handlers and defer cleanup - defer registerGracefulCleanup(ctx, b, lock.GoalDeploy)() + ctx, cleanup := registerGracefulCleanup(ctx, b, lock.GoalDeploy) + defer cleanup() libs := deployPrepare(ctx, b, false, directDeployment) if logdiag.HasError(ctx) { diff --git a/bundle/phases/destroy.go b/bundle/phases/destroy.go index 29faaa7aef..622a76a018 100644 --- a/bundle/phases/destroy.go +++ b/bundle/phases/destroy.go @@ -130,7 +130,8 @@ func Destroy(ctx context.Context, b *bundle.Bundle, directDeployment bool) { } // lock is acquired here - set up signal handlers and defer cleanup - defer registerGracefulCleanup(ctx, b, lock.GoalDestroy)() + ctx, cleanup := registerGracefulCleanup(ctx, b, lock.GoalDestroy) + defer cleanup() if !directDeployment { bundle.ApplySeqContext(ctx, b, From 9d2fb046916da4bd9035025d911ac43448f81350 Mon Sep 17 00:00:00 2001 From: Shreyas Goenka Date: Thu, 6 Nov 2025 15:19:35 +0100 Subject: [PATCH 3/6] update with latest --- NEXT_CHANGELOG.md | 1 + bundle/bundle.go | 3 - bundle/deploy/lock/release.go | 39 +----- bundle/deploy/lock/release_test.go | 181 +++++++++++++++++++++++++ bundle/phases/bind.go | 4 +- bundle/phases/deploy.go | 10 +- bundle/phases/destroy.go | 2 +- integration/libs/locker/locker_test.go | 4 +- libs/locker/locker.go | 22 ++- 9 files changed, 207 insertions(+), 59 deletions(-) create mode 100644 bundle/deploy/lock/release_test.go diff --git a/NEXT_CHANGELOG.md b/NEXT_CHANGELOG.md index 88f45aac96..8c34587667 100644 --- a/NEXT_CHANGELOG.md +++ b/NEXT_CHANGELOG.md @@ -10,5 +10,6 @@ ### Bundles * Add validation that served_models and served_entities are not used at the same time. Add client side translation logic. ([#3880](https://github.com/databricks/cli/pull/3880)) +* Gracefully handle interrupts (SIGINT, SIGTERM, SIGHUP, SIGQUIT) during bundle deployment and destruction by releasing locks before exiting ([#3758](https://github.com/databricks/cli/pull/3758)) ### API Changes diff --git a/bundle/bundle.go b/bundle/bundle.go index 68fb0a7676..25112574dc 100644 --- a/bundle/bundle.go +++ b/bundle/bundle.go @@ -123,9 +123,6 @@ type Bundle struct { // Stores the locker responsible for acquiring/releasing a deployment lock. 
Locker *locker.Locker - // LockReleaseOnce ensures the lock is only released once - LockReleaseOnce sync.Once - // TerraformPlanPath is the path to the plan from the terraform CLI TerraformPlanPath string diff --git a/bundle/deploy/lock/release.go b/bundle/deploy/lock/release.go index 3252bfc9ad..71e42b82e5 100644 --- a/bundle/deploy/lock/release.go +++ b/bundle/deploy/lock/release.go @@ -5,25 +5,13 @@ import ( "github.com/databricks/cli/bundle" "github.com/databricks/cli/libs/diag" - "github.com/databricks/cli/libs/locker" "github.com/databricks/cli/libs/log" ) -type Goal string +type release struct{} -const ( - GoalBind = Goal("bind") - GoalUnbind = Goal("unbind") - GoalDeploy = Goal("deploy") - GoalDestroy = Goal("destroy") -) - -type release struct { - goal Goal -} - -func Release(goal Goal) bundle.Mutator { - return &release{goal} +func Release() bundle.Mutator { + return &release{} } func (m *release) Name() string { @@ -44,22 +32,7 @@ func (m *release) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnostics return nil } - // Make lock release idempotent using sync.Once to prevent race conditions - // This ensures the lock is only released once even if called multiple times - var diags diag.Diagnostics - b.LockReleaseOnce.Do(func() { - log.Infof(ctx, "Releasing deployment lock") - switch m.goal { - case GoalDeploy: - diags = diag.FromErr(b.Locker.Unlock(ctx)) - case GoalBind, GoalUnbind: - diags = diag.FromErr(b.Locker.Unlock(ctx)) - case GoalDestroy: - diags = diag.FromErr(b.Locker.Unlock(ctx, locker.AllowLockFileNotExist)) - default: - diags = diag.Errorf("unknown goal for lock release: %s", m.goal) - } - }) - - return diags + log.Infof(ctx, "Releasing deployment lock") + err := b.Locker.Unlock(ctx) + return diag.FromErr(err) } diff --git a/bundle/deploy/lock/release_test.go b/bundle/deploy/lock/release_test.go new file mode 100644 index 0000000000..0cd68ef074 --- /dev/null +++ b/bundle/deploy/lock/release_test.go @@ -0,0 +1,181 @@ +package lock_test + +import ( + "context" + "reflect" + "testing" + + "github.com/databricks/cli/bundle" + "github.com/databricks/cli/bundle/config" + "github.com/databricks/cli/bundle/deploy/lock" + "github.com/databricks/cli/libs/filer" + lockpkg "github.com/databricks/cli/libs/locker" + "github.com/google/uuid" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// createTestLocker creates a locker with a filer for testing +func createTestLocker(f filer.Filer, targetDir string) *lockpkg.Locker { + l := &lockpkg.Locker{ + TargetDir: targetDir, + Active: false, + State: &lockpkg.LockState{ + ID: uuid.New(), + User: "test-user", + }, + } + // Use reflection to set the private filer field + v := reflect.ValueOf(l).Elem() + filerField := v.FieldByName("filer") + filerField = reflect.NewAt(filerField.Type(), filerField.Addr().UnsafePointer()).Elem() + filerField.Set(reflect.ValueOf(f)) + return l +} + +func TestReleaseIdempotent(t *testing.T) { + ctx := context.Background() + + // Create a temporary directory for the filer + tmpDir := t.TempDir() + f, err := filer.NewLocalClient(tmpDir) + require.NoError(t, err) + + // Create a bundle with a locker + enabled := true + b := &bundle.Bundle{ + Config: config.Root{ + Bundle: config.Bundle{ + Target: "test", + Deployment: config.Deployment{ + Lock: config.Lock{ + Enabled: &enabled, + }, + }, + }, + }, + } + + // Initialize locker with the filer + locker := createTestLocker(f, tmpDir) + b.Locker = locker + + // Acquire lock + err = locker.Lock(ctx, false) + 
require.NoError(t, err) + assert.True(t, locker.Active) + + // Verify lock file exists + _, err = f.Stat(ctx, "deploy.lock") + require.NoError(t, err) + + // First release - should succeed + mutator := lock.Release() + diags := bundle.Apply(ctx, b, mutator) + require.NoError(t, diags.Error()) + assert.False(t, locker.Active) + + // Verify lock file is deleted + _, err = f.Stat(ctx, "deploy.lock") + require.Error(t, err) + + // Second release - should be idempotent and succeed + diags = bundle.Apply(ctx, b, mutator) + require.NoError(t, diags.Error()) + assert.False(t, locker.Active) + + // Third release - should still be idempotent and succeed + diags = bundle.Apply(ctx, b, mutator) + require.NoError(t, diags.Error()) + assert.False(t, locker.Active) +} + +func TestReleaseFileAlreadyDeleted(t *testing.T) { + ctx := context.Background() + + // Create a temporary directory for the filer + tmpDir := t.TempDir() + f, err := filer.NewLocalClient(tmpDir) + require.NoError(t, err) + + // Create a bundle with a locker + enabled := true + b := &bundle.Bundle{ + Config: config.Root{ + Bundle: config.Bundle{ + Target: "test", + Deployment: config.Deployment{ + Lock: config.Lock{ + Enabled: &enabled, + }, + }, + }, + }, + } + + // Initialize locker with the filer + locker := createTestLocker(f, tmpDir) + b.Locker = locker + + // Acquire lock + err = locker.Lock(ctx, false) + require.NoError(t, err) + assert.True(t, locker.Active) + + // Verify lock file exists + _, err = f.Stat(ctx, "deploy.lock") + require.NoError(t, err) + + // Manually delete lock file + err = f.Delete(ctx, "deploy.lock") + require.NoError(t, err) + + // Release lock - should succeed even though lock file is already deleted + mutator := lock.Release() + diags := bundle.Apply(ctx, b, mutator) + require.NoError(t, diags.Error()) + assert.False(t, locker.Active) +} + +func TestReleaseWhenAnotherProcessHoldsLock(t *testing.T) { + ctx := context.Background() + + // Create a temporary directory for the filer + tmpDir := t.TempDir() + f, err := filer.NewLocalClient(tmpDir) + require.NoError(t, err) + + // Create two lockers - simulating two different processes + locker1 := createTestLocker(f, tmpDir) + locker2 := createTestLocker(f, tmpDir) + + // First locker acquires the lock + err = locker1.Lock(ctx, false) + require.NoError(t, err) + assert.True(t, locker1.Active) + + // Create bundle with second locker (different process) + enabled := true + b := &bundle.Bundle{ + Config: config.Root{ + Bundle: config.Bundle{ + Target: "test", + Deployment: config.Deployment{ + Lock: config.Lock{ + Enabled: &enabled, + }, + }, + }, + }, + Locker: locker2, + } + + // Set locker2 as active (simulating it thinks it has the lock, but it doesn't) + locker2.Active = true + + // Try to release with locker2 - should error because locker1 holds the lock + mutator := lock.Release() + diags := bundle.Apply(ctx, b, mutator) + require.Error(t, diags.Error()) + assert.Contains(t, diags.Error().Error(), "deploy lock acquired by") +} diff --git a/bundle/phases/bind.go b/bundle/phases/bind.go index c2a2721438..a3526a2250 100644 --- a/bundle/phases/bind.go +++ b/bundle/phases/bind.go @@ -20,7 +20,7 @@ func Bind(ctx context.Context, b *bundle.Bundle, opts *terraform.BindOptions) { } defer func() { - bundle.ApplyContext(ctx, b, lock.Release(lock.GoalBind)) + bundle.ApplyContext(ctx, b, lock.Release()) }() bundle.ApplySeqContext(ctx, b, @@ -40,7 +40,7 @@ func Unbind(ctx context.Context, b *bundle.Bundle, bundleType, tfResourceType, r } defer func() { - 
bundle.ApplyContext(ctx, b, lock.Release(lock.GoalUnbind)) + bundle.ApplyContext(ctx, b, lock.Release()) }() bundle.ApplySeqContext(ctx, b, diff --git a/bundle/phases/deploy.go b/bundle/phases/deploy.go index 8cb015d3fb..55a19d4185 100644 --- a/bundle/phases/deploy.go +++ b/bundle/phases/deploy.go @@ -146,7 +146,7 @@ func uploadLibraries(ctx context.Context, b *bundle.Bundle, libs map[string][]li // // Catches SIGINT (Ctrl+C), SIGTERM, SIGHUP, and SIGQUIT. // Note: SIGKILL and SIGSTOP cannot be caught - the kernel terminates the process directly. -func registerGracefulCleanup(ctx context.Context, b *bundle.Bundle, goal lock.Goal) (context.Context, func()) { +func registerGracefulCleanup(ctx context.Context, b *bundle.Bundle) (context.Context, func()) { // Create a cancellable context to propagate cancellation to the main routine ctx, cancel := context.WithCancel(ctx) @@ -184,7 +184,7 @@ func registerGracefulCleanup(ctx context.Context, b *bundle.Bundle, goal lock.Go // We use context.WithoutCancel to preserve context values (like user agent) // but remove the cancellation signal so the lock release can complete releaseCtx := context.WithoutCancel(ctx) - bundle.ApplyContext(releaseCtx, b, lock.Release(goal)) + bundle.ApplyContext(releaseCtx, b, lock.Release()) // Calculate exit code (128 + signal number) exitCode := 128 @@ -205,10 +205,10 @@ func registerGracefulCleanup(ctx context.Context, b *bundle.Bundle, goal lock.Go // Stop listening for signals signal.Stop(sigChan) - // Release the lock (idempotent thanks to sync.Once in lock.Release) + // Release the lock (idempotent) // Use context.WithoutCancel to preserve context values but remove cancellation releaseCtx := context.WithoutCancel(ctx) - bundle.ApplyContext(releaseCtx, b, lock.Release(goal)) + bundle.ApplyContext(releaseCtx, b, lock.Release()) // Signal that the main routine has completed. 
// Once the signal is recieved, @@ -247,7 +247,7 @@ func Deploy(ctx context.Context, b *bundle.Bundle, outputHandler sync.OutputHand } // lock is acquired here - set up signal handlers and defer cleanup - ctx, cleanup := registerGracefulCleanup(ctx, b, lock.GoalDeploy) + ctx, cleanup := registerGracefulCleanup(ctx, b) defer cleanup() libs := deployPrepare(ctx, b, false, directDeployment) diff --git a/bundle/phases/destroy.go b/bundle/phases/destroy.go index 622a76a018..be2493bc85 100644 --- a/bundle/phases/destroy.go +++ b/bundle/phases/destroy.go @@ -130,7 +130,7 @@ func Destroy(ctx context.Context, b *bundle.Bundle, directDeployment bool) { } // lock is acquired here - set up signal handlers and defer cleanup - ctx, cleanup := registerGracefulCleanup(ctx, b, lock.GoalDestroy) + ctx, cleanup := registerGracefulCleanup(ctx, b) defer cleanup() if !directDeployment { diff --git a/integration/libs/locker/locker_test.go b/integration/libs/locker/locker_test.go index 3ae80f8e71..2521dc5d43 100644 --- a/integration/libs/locker/locker_test.go +++ b/integration/libs/locker/locker_test.go @@ -186,8 +186,8 @@ func TestLockUnlockWithAllowsLockFileNotExist(t *testing.T) { err = f.Delete(ctx, "deploy.lock") assert.NoError(t, err) - // Assert error, because lock file does not exist - err = locker.Unlock(ctx, lockpkg.AllowLockFileNotExist) + // Unlock should succeed even though lock file does not exist + err = locker.Unlock(ctx) assert.NoError(t, err) assert.False(t, locker.Active) } diff --git a/libs/locker/locker.go b/libs/locker/locker.go index aadc50b587..3f348da7b8 100644 --- a/libs/locker/locker.go +++ b/libs/locker/locker.go @@ -8,7 +8,6 @@ import ( "fmt" "io" "io/fs" - "slices" "time" "github.com/databricks/cli/libs/filer" @@ -16,12 +15,6 @@ import ( "github.com/google/uuid" ) -type UnlockOption int - -const ( - AllowLockFileNotExist UnlockOption = iota -) - const LockFileName = "deploy.lock" // Locker object enables exclusive access to TargetDir's scope for a client. This @@ -171,15 +164,18 @@ func (locker *Locker) Lock(ctx context.Context, isForced bool) error { return nil } -func (locker *Locker) Unlock(ctx context.Context, opts ...UnlockOption) error { +func (locker *Locker) Unlock(ctx context.Context) error { + // If locker is already inactive, return without an error. + // This keeps the locker idempotent in the face of multiple unlock calls. if !locker.Active { - return errors.New("unlock called when lock is not held") + return nil } - // if allowLockFileNotExist is set, do not throw an error if the lock file does - // not exist. This is helpful when destroying a bundle in which case the lock - // file will be deleted before we have a chance to unlock - if _, err := locker.filer.Stat(ctx, LockFileName); errors.Is(err, fs.ErrNotExist) && slices.Contains(opts, AllowLockFileNotExist) { + // Check if the lock file exists. If it doesn't exist, this is not an error + // as it makes unlock idempotent. This is helpful in several scenarios: + // 1. When destroying a bundle, the lock file may be deleted before unlock + // 2. 
When unlock is called multiple times, which is possible when handling signals like SIGINT, SIGTERM, SIGHUP, SIGQUIT + if _, err := locker.filer.Stat(ctx, LockFileName); errors.Is(err, fs.ErrNotExist) { locker.Active = false return nil } From cec6a88a522dd2df7c05623c92b4c4ad6bb2bf58 Mon Sep 17 00:00:00 2001 From: Shreyas Goenka Date: Thu, 6 Nov 2025 16:28:03 +0100 Subject: [PATCH 4/6] better test --- .../bundle/deploy/signal-cleanup/output.txt | 30 +++++++++++++++++-- .../bundle/deploy/signal-cleanup/script | 5 +++- bundle/phases/deploy.go | 5 +++- 3 files changed, 36 insertions(+), 4 deletions(-) diff --git a/acceptance/bundle/deploy/signal-cleanup/output.txt b/acceptance/bundle/deploy/signal-cleanup/output.txt index 9f44243750..b139caf23b 100644 --- a/acceptance/bundle/deploy/signal-cleanup/output.txt +++ b/acceptance/bundle/deploy/signal-cleanup/output.txt @@ -10,11 +10,10 @@ Operation interrupted. Gracefully shutting down... Error: terraform apply: exit status 1 Updating deployment state... -Error: failed in rate limiter: context canceled - Exit code: 130 +=== A deletion request for deploy.lock should have been recorded in the requests file >>> cat out.requests.txt { "method": "POST", @@ -24,6 +23,33 @@ Exit code: 130 } } +=== An upload request for the state file should have been recorded in the requests file +>>> cat out.requests.txt +{ + "method": "POST", + "path": "/api/2.0/workspace-files/import-file/Workspace/Users/[USERNAME]/.bundle/signal-test/default/state/terraform.tfstate", + "q": { + "overwrite": "true" + }, + "body": { + "version": 4, + "terraform_version": "1.5.5", + "serial": 1, + "lineage": "[UUID]", + "outputs": {}, + "resources": [ + { + "mode": "managed", + "type": "databricks_job", + "name": "job2", + "provider": "provider[\"registry.terraform.io/databricks/databricks\"]", + "instances": [] + } + ], + "check_results": null + } +} + === A creation request for job1 should be recorded in the requests file. No request for job2 should exist since the process was terminated mid deployment. >>> cat out.requests.txt { diff --git a/acceptance/bundle/deploy/signal-cleanup/script b/acceptance/bundle/deploy/signal-cleanup/script index 046adb2aab..ec9d92383b 100644 --- a/acceptance/bundle/deploy/signal-cleanup/script +++ b/acceptance/bundle/deploy/signal-cleanup/script @@ -21,9 +21,12 @@ trace kill -INT $DEPLOY_PID # Wait for process to complete errcode trace wait $DEPLOY_PID -# A deletion request for deploy.lock should have been recorded in the requests file +title "A deletion request for deploy.lock should have been recorded in the requests file" trace cat out.requests.txt | jq 'select(.method == "POST" and (.path | contains("workspace/delete")) and (.body.path | contains("deploy.lock")))' +title "An upload request for the state file should have been recorded in the requests file" +trace cat out.requests.txt | jq 'select(.method == "POST" and (.path | contains("workspace-files/import-file")) and (.path | contains("terraform.tfstate")))' + title "A creation request for job1 should be recorded in the requests file. No request for job2 should exist since the process was terminated mid deployment." 
trace cat out.requests.txt | jq 'select(.method == "POST" and (.path | contains("/api/2.2/jobs/create")))' diff --git a/bundle/phases/deploy.go b/bundle/phases/deploy.go index 55a19d4185..c9e1cc6856 100644 --- a/bundle/phases/deploy.go +++ b/bundle/phases/deploy.go @@ -107,7 +107,10 @@ func deployCore(ctx context.Context, b *bundle.Bundle, plan *deployplan.Plan, di } // Even if deployment failed, there might be updates in states that we need to upload - bundle.ApplyContext(ctx, b, + // Use context.WithoutCancel to ensure state push completes even if context was cancelled + // (e.g., due to signal interruption). We want to save the current state before exiting. + statePushCtx := context.WithoutCancel(ctx) + bundle.ApplyContext(statePushCtx, b, statemgmt.StatePush(directDeployment), ) if logdiag.HasError(ctx) { From 59328842fca945698279f7481606894b322e838a Mon Sep 17 00:00:00 2001 From: Shreyas Goenka Date: Thu, 6 Nov 2025 16:35:55 +0100 Subject: [PATCH 5/6] update tests --- .../destroy/signal-cleanup/databricks.yml | 3 -- .../destroy/signal-cleanup/out.test.toml | 2 +- .../bundle/destroy/signal-cleanup/output.txt | 36 +++++++++++++------ .../bundle/destroy/signal-cleanup/script | 10 +++--- .../bundle/destroy/signal-cleanup/test.toml | 4 +-- 5 files changed, 33 insertions(+), 22 deletions(-) diff --git a/acceptance/bundle/destroy/signal-cleanup/databricks.yml b/acceptance/bundle/destroy/signal-cleanup/databricks.yml index 1eb4342413..64d09d3762 100644 --- a/acceptance/bundle/destroy/signal-cleanup/databricks.yml +++ b/acceptance/bundle/destroy/signal-cleanup/databricks.yml @@ -5,6 +5,3 @@ resources: jobs: job1: name: job1 - - job2: - name: job2 (depends on ${resources.jobs.job1.id}) diff --git a/acceptance/bundle/destroy/signal-cleanup/out.test.toml b/acceptance/bundle/destroy/signal-cleanup/out.test.toml index 90061dedb1..54146af564 100644 --- a/acceptance/bundle/destroy/signal-cleanup/out.test.toml +++ b/acceptance/bundle/destroy/signal-cleanup/out.test.toml @@ -2,4 +2,4 @@ Local = true Cloud = false [EnvMatrix] - DATABRICKS_BUNDLE_ENGINE = ["terraform"] + DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git a/acceptance/bundle/destroy/signal-cleanup/output.txt b/acceptance/bundle/destroy/signal-cleanup/output.txt index 6ee581ddb3..b3308aaa97 100644 --- a/acceptance/bundle/destroy/signal-cleanup/output.txt +++ b/acceptance/bundle/destroy/signal-cleanup/output.txt @@ -5,7 +5,6 @@ Deployment complete! === Wait until the destroy has started.The following resources will be deleted: delete job job1 - delete job job2 All files and directories at the following location will be deleted: /Workspace/Users/[USERNAME]/.bundle/signal-test/default @@ -15,31 +14,46 @@ Destroy in progress, sending interrupt signal... >>> wait [PID] Operation interrupted. Gracefully shutting down... -Error: terraform apply: exit status 1 +Error: cannot delete resources.jobs.job1: deleting id=[NUMID]: Post "[DATABRICKS_URL]/api/2.2/jobs/delete": context canceled Exit code: 130 ->>> cat out.requests.txt - -=== A deletion request for job2 should be recorded in the requests file. No request for job1 should exist since the process was terminated mid destroy. 
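The state-push change above leans on context.WithoutCancel (Go 1.21+), which returns a context that keeps the parent's values (for example the user agent) but never observes its cancellation, so the final StatePush can finish even after the signal handler cancels the deploy context. A small sketch of that behaviour, using a hypothetical context key:

package main

import (
	"context"
	"fmt"
)

type ctxKey string

func main() {
	parent := context.WithValue(context.Background(), ctxKey("user-agent"), "cli/0.0.0")
	parent, cancel := context.WithCancel(parent)

	detached := context.WithoutCancel(parent) // keeps values, drops cancellation
	cancel()

	fmt.Println(parent.Err())                         // context canceled
	fmt.Println(detached.Err())                       // <nil> — still usable for cleanup calls
	fmt.Println(detached.Value(ctxKey("user-agent"))) // cli/0.0.0
}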
+=== A deletion request for deploy.lock should have been recorded in the requests file >>> cat out.requests.txt { "method": "POST", - "path": "/api/2.2/jobs/delete", + "path": "/api/2.0/workspace/delete", "body": { - "job_id": [NUMID] + "path": "/Workspace/Users/[USERNAME]/.bundle/signal-test/default/state/deploy.lock" } } +=== No deletion request for resources.json should be recorded. We still need state to complete the destroy. +>>> cat out.requests.txt + >>> [CLI] bundle debug plan { "plan": { "resources.jobs.job1": { - "action": "skip" - }, - "resources.jobs.job2": { - "action": "skip" + "action": "skip", + "remote_state": { + "creator_user_name": "[USERNAME]", + "job_id": [NUMID], + "settings": { + "deployment": { + "kind": "BUNDLE", + "metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/signal-test/default/state/metadata.json" + }, + "edit_mode": "UI_LOCKED", + "format": "MULTI_TASK", + "max_concurrent_runs": 1, + "name": "job1", + "queue": { + "enabled": true + } + } + } } } } diff --git a/acceptance/bundle/destroy/signal-cleanup/script b/acceptance/bundle/destroy/signal-cleanup/script index 7936106904..d2ae482cc2 100755 --- a/acceptance/bundle/destroy/signal-cleanup/script +++ b/acceptance/bundle/destroy/signal-cleanup/script @@ -2,6 +2,7 @@ # First deploy the bundle so we have something to destroy $CLI bundle deploy --auto-approve +rm out.requests.txt # Start destroy in background, redirecting stderr to capture when destroy starts $CLI bundle destroy --auto-approve 2>&1 & @@ -23,12 +24,11 @@ trace kill -INT $DESTROY_PID # Wait for process to complete errcode trace wait $DESTROY_PID -# A deletion request for destroy.lock should have been recorded in the requests file -trace cat out.requests.txt | jq 'select(.method == "POST" and (.path | contains("workspace/delete")) and (.body.path | contains("destroy.lock")))' +title "A deletion request for deploy.lock should have been recorded in the requests file" +trace cat out.requests.txt | jq 'select(.method == "POST" and (.path | contains("workspace/delete")) and (.body.path | contains("deploy.lock")))' -title "A deletion request for job2 should be recorded in the requests file. No request for job1 should exist since the process was terminated mid destroy." -trace cat out.requests.txt | jq 'select(.method == "POST" and (.path | contains("/api/2.2/jobs/delete")))' +title "No deletion request for resources.json should be recorded. We still need state to complete the destroy." 
+trace cat out.requests.txt | jq 'select(.method == "POST" and (.path | contains("workspace/delete")) and (.body.path | contains("resources.json")))' trace $CLI bundle debug plan - rm out.requests.txt diff --git a/acceptance/bundle/destroy/signal-cleanup/test.toml b/acceptance/bundle/destroy/signal-cleanup/test.toml index f8186b73d9..5d6ce2f750 100644 --- a/acceptance/bundle/destroy/signal-cleanup/test.toml +++ b/acceptance/bundle/destroy/signal-cleanup/test.toml @@ -2,8 +2,8 @@ Local = true Cloud = false RecordRequests = true -# Test only terraform engine (signal handling is the same for direct) -EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["terraform"] +# Test only direct engine (signal handling is the same for terraform) +EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["direct"] # Add delay to first job deletion to ensure we can interrupt during destroy [[Server]] From 42f03eb99440072e9d7284c6dc7909960f6a317f Mon Sep 17 00:00:00 2001 From: Shreyas Goenka Date: Thu, 6 Nov 2025 16:48:11 +0100 Subject: [PATCH 6/6] minor rename --- bundle/phases/deploy.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bundle/phases/deploy.go b/bundle/phases/deploy.go index c9e1cc6856..765eb088ce 100644 --- a/bundle/phases/deploy.go +++ b/bundle/phases/deploy.go @@ -153,8 +153,8 @@ func registerGracefulCleanup(ctx context.Context, b *bundle.Bundle) (context.Con // Create a cancellable context to propagate cancellation to the main routine ctx, cancel := context.WithCancel(ctx) - // Channel to signal that the main routine has completed - done := make(chan struct{}) + // Channel to signal that the main + cleanup routine has completed + cleanupDone := make(chan struct{}) // Channel to signal that a signal was received and handled signalReceived := make(chan struct{}) @@ -181,7 +181,7 @@ func registerGracefulCleanup(ctx context.Context, b *bundle.Bundle) (context.Con // Wait for the main routine to complete before releasing the lock // This ensures we don't exit while operations are still in progress - <-done + <-cleanupDone // Release the lock using a context without cancellation to avoid cancellation issues // We use context.WithoutCancel to preserve context values (like user agent) @@ -216,7 +216,7 @@ func registerGracefulCleanup(ctx context.Context, b *bundle.Bundle) (context.Con // Signal that the main routine has completed. // Once the signal is recieved, // This must be done AFTER all cleanup is complete - close(done) + close(cleanupDone) // If a signal was received, wait indefinitely for the signal handler to exit // This prevents the main function from returning and exiting with a different code
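To close the loop on the libs/locker change earlier in the series: Unlock is made idempotent by treating an inactive locker and an already-deleted lock file as successful no-ops, so the deferred release and the signal-handler release can both run without erroring. A minimal file-based analogue is sketched below; the lock type and release method are hypothetical, built on plain os calls instead of the filer abstraction, and it omits the ownership check the real locker performs:

package main

import (
	"errors"
	"fmt"
	"io/fs"
	"os"
	"path/filepath"
)

// lock is a minimal stand-in for the real Locker: Active tracks whether this
// process believes it holds the lock, path is the lock file location.
type lock struct {
	Active bool
	path   string
}

// release is idempotent: calling it when the lock is not held, or after the
// lock file has already been removed, succeeds without error.
func (l *lock) release() error {
	if !l.Active {
		return nil // already released (e.g. signal handler and defer both ran)
	}
	if err := os.Remove(l.path); err != nil && !errors.Is(err, fs.ErrNotExist) {
		return err
	}
	l.Active = false
	return nil
}

func main() {
	path := filepath.Join(os.TempDir(), "deploy.lock")
	_ = os.WriteFile(path, []byte("{}"), 0o644)

	l := &lock{Active: true, path: path}
	fmt.Println(l.release()) // <nil> — removes the lock file
	fmt.Println(l.release()) // <nil> — second call is a no-op
}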