-
Notifications
You must be signed in to change notification settings - Fork 112
Gracefully exit when terminated mid deploy / destroy #3758
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 6 commits
48bd034
3023134
9d2fb04
5610096
cec6a88
5932884
42f03eb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,10 @@ | ||
| bundle: | ||
| name: signal-test | ||
|
|
||
| resources: | ||
| jobs: | ||
| job1: | ||
| name: job1 | ||
|
|
||
| job2: | ||
| name: job2 (deploy after ${resources.jobs.job1.id}) |
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,83 @@ | ||
|
|
||
| === Wait until the deployment has started.Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/signal-test/default/files... | ||
| Deploying resources... | ||
| Deployment in progress, sending interrupt signal... | ||
|
|
||
| >>> kill -INT [PID] | ||
|
|
||
| >>> wait [PID] | ||
| Operation interrupted. Gracefully shutting down... | ||
| Error: terraform apply: exit status 1 | ||
|
|
||
| Updating deployment state... | ||
|
|
||
| Exit code: 130 | ||
|
|
||
| === A deletion request for deploy.lock should have been recorded in the requests file | ||
| >>> cat out.requests.txt | ||
| { | ||
| "method": "POST", | ||
| "path": "/api/2.0/workspace/delete", | ||
| "body": { | ||
| "path": "/Workspace/Users/[USERNAME]/.bundle/signal-test/default/state/deploy.lock" | ||
| } | ||
| } | ||
|
|
||
| === An upload request for the state file should have been recorded in the requests file | ||
| >>> cat out.requests.txt | ||
| { | ||
| "method": "POST", | ||
| "path": "/api/2.0/workspace-files/import-file/Workspace/Users/[USERNAME]/.bundle/signal-test/default/state/terraform.tfstate", | ||
| "q": { | ||
| "overwrite": "true" | ||
| }, | ||
| "body": { | ||
| "version": 4, | ||
| "terraform_version": "1.5.5", | ||
| "serial": 1, | ||
| "lineage": "[UUID]", | ||
| "outputs": {}, | ||
| "resources": [ | ||
| { | ||
| "mode": "managed", | ||
| "type": "databricks_job", | ||
| "name": "job2", | ||
| "provider": "provider[\"registry.terraform.io/databricks/databricks\"]", | ||
| "instances": [] | ||
| } | ||
| ], | ||
| "check_results": null | ||
| } | ||
| } | ||
|
|
||
| === A creation request for job1 should be recorded in the requests file. No request for job2 should exist since the process was terminated mid deployment. | ||
| >>> cat out.requests.txt | ||
| { | ||
| "method": "POST", | ||
| "path": "/api/2.2/jobs/create", | ||
| "body": { | ||
| "deployment": { | ||
| "kind": "BUNDLE", | ||
| "metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/signal-test/default/state/metadata.json" | ||
| }, | ||
| "edit_mode": "UI_LOCKED", | ||
| "format": "MULTI_TASK", | ||
| "max_concurrent_runs": 1, | ||
| "name": "job1", | ||
| "queue": { | ||
| "enabled": true | ||
| } | ||
| } | ||
| } | ||
|
|
||
| >>> [CLI] bundle debug plan | ||
| { | ||
| "plan": { | ||
| "resources.jobs.job1": { | ||
| "action": "create" | ||
| }, | ||
| "resources.jobs.job2": { | ||
| "action": "create" | ||
| } | ||
| } | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,35 @@ | ||
| #!/bin/bash | ||
|
|
||
| # Start deployment in the background, merging stderr into stdout; the loop below detects when deployment has started | ||
| $CLI bundle deploy 2>&1 & | ||
| DEPLOY_PID=$! | ||
|
|
||
| # Wait for deployment to start by monitoring the requests file | ||
| # Once we see the job creation request starting, we know deployment is in progress | ||
| title "Wait until the deployment has started." | ||
| for i in {1..30}; do | ||
| if [ -f out.requests.txt ] && jq -e 'select(.method == "POST" and (.path | contains("/api/2.2/jobs/create")))' out.requests.txt >/dev/null 2>&1; then | ||
| echo "Deployment in progress, sending interrupt signal..." | ||
| break | ||
| fi | ||
| sleep 0.1 | ||
| done | ||
|
|
||
| # Send interrupt signal | ||
| trace kill -INT $DEPLOY_PID | ||
|
|
||
| # Wait for process to complete | ||
| errcode trace wait $DEPLOY_PID | ||
|
|
||
| title "A deletion request for deploy.lock should have been recorded in the requests file" | ||
| trace cat out.requests.txt | jq 'select(.method == "POST" and (.path | contains("workspace/delete")) and (.body.path | contains("deploy.lock")))' | ||
|
|
||
| title "An upload request for the state file should have been recorded in the requests file" | ||
| trace cat out.requests.txt | jq 'select(.method == "POST" and (.path | contains("workspace-files/import-file")) and (.path | contains("terraform.tfstate")))' | ||
|
|
||
| title "A creation request for job1 should be recorded in the requests file. No request for job2 should exist since the process was terminated mid deployment." | ||
| trace cat out.requests.txt | jq 'select(.method == "POST" and (.path | contains("/api/2.2/jobs/create")))' | ||
|
|
||
| trace $CLI bundle debug plan | ||
|
|
||
| rm out.requests.txt | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,20 @@ | ||
| Local = true | ||
| Cloud = false | ||
| RecordRequests = true | ||
|
|
||
| # Test only terraform engine (signal handling is the same for direct) | ||
| EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["terraform"] | ||
|
|
||
| # Add delay to first job creation to ensure we can interrupt during deployment | ||
| [[Server]] | ||
| Pattern = "POST /api/2.2/jobs/create" | ||
| Response.StatusCode = 200 | ||
| Response.Body = '{"job_id": 1111}' | ||
|
|
||
| # Long delay to ensure the deployment gets stuck while trying to create the first job. | ||
| Delay = "300s" | ||
|
|
||
| # Replace PID numbers in kill/wait commands | ||
| [[Repls]] | ||
| Old = "(kill -INT |wait )\\d+" | ||
| New = "$1[PID]" |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,7 @@ | ||
| bundle: | ||
| name: signal-test | ||
|
|
||
| resources: | ||
| jobs: | ||
| job1: | ||
| name: job1 |
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,59 @@ | ||
| Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/signal-test/default/files... | ||
| Deploying resources... | ||
| Updating deployment state... | ||
| Deployment complete! | ||
|
|
||
| === Wait until the destroy has started.The following resources will be deleted: | ||
| delete job job1 | ||
|
|
||
| All files and directories at the following location will be deleted: /Workspace/Users/[USERNAME]/.bundle/signal-test/default | ||
|
|
||
| Destroy in progress, sending interrupt signal... | ||
|
|
||
| >>> kill -INT [PID] | ||
|
|
||
| >>> wait [PID] | ||
| Operation interrupted. Gracefully shutting down... | ||
| Error: cannot delete resources.jobs.job1: deleting id=[NUMID]: Post "[DATABRICKS_URL]/api/2.2/jobs/delete": context canceled | ||
|
|
||
|
|
||
| Exit code: 130 | ||
|
|
||
| === A deletion request for deploy.lock should have been recorded in the requests file | ||
| >>> cat out.requests.txt | ||
| { | ||
| "method": "POST", | ||
| "path": "/api/2.0/workspace/delete", | ||
| "body": { | ||
| "path": "/Workspace/Users/[USERNAME]/.bundle/signal-test/default/state/deploy.lock" | ||
| } | ||
| } | ||
|
|
||
| === No deletion request for resources.json should be recorded. We still need state to complete the destroy. | ||
| >>> cat out.requests.txt | ||
|
|
||
| >>> [CLI] bundle debug plan | ||
| { | ||
| "plan": { | ||
| "resources.jobs.job1": { | ||
| "action": "skip", | ||
| "remote_state": { | ||
| "creator_user_name": "[USERNAME]", | ||
| "job_id": [NUMID], | ||
| "settings": { | ||
| "deployment": { | ||
| "kind": "BUNDLE", | ||
| "metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/signal-test/default/state/metadata.json" | ||
| }, | ||
| "edit_mode": "UI_LOCKED", | ||
| "format": "MULTI_TASK", | ||
| "max_concurrent_runs": 1, | ||
| "name": "job1", | ||
| "queue": { | ||
| "enabled": true | ||
| } | ||
| } | ||
| } | ||
| } | ||
| } | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,34 @@ | ||
| #!/bin/bash | ||
|
|
||
| # First deploy the bundle so we have something to destroy | ||
| $CLI bundle deploy --auto-approve | ||
| rm out.requests.txt | ||
|
|
||
| # Start destroy in the background, merging stderr into stdout; the loop below detects when destroy has started | ||
| $CLI bundle destroy --auto-approve 2>&1 & | ||
| DESTROY_PID=$! | ||
|
|
||
| # Wait for destroy to start by monitoring for job deletion request | ||
| title "Wait until the destroy has started." | ||
| for i in {1..30}; do | ||
| if [ -f out.requests.txt ] && jq -e 'select(.method == "POST" and (.path | contains("/api/2.2/jobs/delete")))' out.requests.txt >/dev/null 2>&1; then | ||
| echo "Destroy in progress, sending interrupt signal..." | ||
| break | ||
| fi | ||
| sleep 0.1 | ||
| done | ||
|
|
||
| # Send interrupt signal | ||
| trace kill -INT $DESTROY_PID | ||
|
|
||
| # Wait for process to complete | ||
| errcode trace wait $DESTROY_PID | ||
|
|
||
| title "A deletion request for deploy.lock should have been recorded in the requests file" | ||
| trace cat out.requests.txt | jq 'select(.method == "POST" and (.path | contains("workspace/delete")) and (.body.path | contains("deploy.lock")))' | ||
|
|
||
| title "No deletion request for resources.json should be recorded. We still need state to complete the destroy." | ||
| trace cat out.requests.txt | jq 'select(.method == "POST" and (.path | contains("workspace/delete")) and (.body.path | contains("resources.json")))' | ||
|
|
||
| trace $CLI bundle debug plan | ||
| rm out.requests.txt |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,20 @@ | ||
| Local = true | ||
| Cloud = false | ||
| RecordRequests = true | ||
|
|
||
| # Test only direct engine (signal handling is the same for terraform) | ||
| EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["direct"] | ||
|
|
||
| # Add delay to first job deletion to ensure we can interrupt during destroy | ||
| [[Server]] | ||
| Pattern = "POST /api/2.2/jobs/delete" | ||
| Response.StatusCode = 200 | ||
| Response.Body = '{}' | ||
|
|
||
| # Long delay to ensure the destroy gets stuck while deleting the first job. | ||
| Delay = "300s" | ||
|
|
||
| # Replace PID numbers in kill/wait commands | ||
| [[Repls]] | ||
| Old = "(kill -INT |wait )\\d+" | ||
| New = "$1[PID]" |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -5,25 +5,13 @@ import ( | |
|
|
||
| "github.com/databricks/cli/bundle" | ||
| "github.com/databricks/cli/libs/diag" | ||
| "github.com/databricks/cli/libs/locker" | ||
| "github.com/databricks/cli/libs/log" | ||
| ) | ||
|
|
||
| type Goal string | ||
| type release struct{} | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We don't need goal anymore since lock.Release is idempotent. |
||
|
|
||
| const ( | ||
| GoalBind = Goal("bind") | ||
| GoalUnbind = Goal("unbind") | ||
| GoalDeploy = Goal("deploy") | ||
| GoalDestroy = Goal("destroy") | ||
| ) | ||
|
|
||
| type release struct { | ||
| goal Goal | ||
| } | ||
|
|
||
| func Release(goal Goal) bundle.Mutator { | ||
| return &release{goal} | ||
| func Release() bundle.Mutator { | ||
| return &release{} | ||
| } | ||
|
|
||
| func (m *release) Name() string { | ||
|
|
@@ -45,14 +33,6 @@ func (m *release) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnostics | |
| } | ||
|
|
||
| log.Infof(ctx, "Releasing deployment lock") | ||
| switch m.goal { | ||
| case GoalDeploy: | ||
| return diag.FromErr(b.Locker.Unlock(ctx)) | ||
| case GoalBind, GoalUnbind: | ||
| return diag.FromErr(b.Locker.Unlock(ctx)) | ||
| case GoalDestroy: | ||
| return diag.FromErr(b.Locker.Unlock(ctx, locker.AllowLockFileNotExist)) | ||
| default: | ||
| return diag.Errorf("unknown goal for lock release: %s", m.goal) | ||
| } | ||
| err := b.Locker.Unlock(ctx) | ||
| return diag.FromErr(err) | ||
| } | ||
Uh oh!
There was an error while loading. Please reload this page.