1 change: 1 addition & 0 deletions NEXT_CHANGELOG.md
@@ -10,5 +10,6 @@

### Bundles
* Add validation that served_models and served_entities are not used at the same time. Add client-side translation logic. ([#3880](https://github.com/databricks/cli/pull/3880))
* Gracefully handle interrupts (SIGINT, SIGTERM, SIGHUP, SIGQUIT) during bundle deployment and destruction by releasing locks before exiting ([#3758](https://github.com/databricks/cli/pull/3758))

### API Changes
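A minimal sketch of the shutdown pattern the new changelog entry above describes, using Go's signal.NotifyContext; the helper name releaseLock and the overall wiring are illustrative assumptions, not the CLI's actual code:

```go
package main

import (
	"context"
	"fmt"
	"os"
	"os/signal"
	"syscall"
)

// releaseLock is a hypothetical stand-in for the bundle's lock-release step.
func releaseLock(ctx context.Context) error {
	fmt.Println("releasing deployment lock...")
	return nil
}

func main() {
	// Cancel the context as soon as any of the handled signals arrives.
	ctx, stop := signal.NotifyContext(context.Background(),
		syscall.SIGINT, syscall.SIGTERM, syscall.SIGHUP, syscall.SIGQUIT)
	defer stop()

	// The deployment would run here and return early once ctx is canceled.
	<-ctx.Done()

	fmt.Println("Operation interrupted. Gracefully shutting down...")
	// Release the lock with a fresh context; the canceled one can't carry API calls.
	if err := releaseLock(context.Background()); err != nil {
		fmt.Fprintln(os.Stderr, err)
	}
	os.Exit(130) // 128 + SIGINT(2), matching the exit code in the tests below
}
```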
10 changes: 10 additions & 0 deletions acceptance/bundle/deploy/signal-cleanup/databricks.yml
@@ -0,0 +1,10 @@
bundle:
  name: signal-test

resources:
  jobs:
    job1:
      name: job1

    job2:
      name: job2 (deploy after ${resources.jobs.job1.id})
5 changes: 5 additions & 0 deletions acceptance/bundle/deploy/signal-cleanup/out.test.toml


83 changes: 83 additions & 0 deletions acceptance/bundle/deploy/signal-cleanup/output.txt
@@ -0,0 +1,83 @@

=== Wait until the deployment has started.
Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/signal-test/default/files...
Deploying resources...
Deployment in progress, sending interrupt signal...

>>> kill -INT [PID]

>>> wait [PID]
Operation interrupted. Gracefully shutting down...
Error: terraform apply: exit status 1

Updating deployment state...

Exit code: 130

=== A deletion request for deploy.lock should have been recorded in the requests file
>>> cat out.requests.txt
{
  "method": "POST",
  "path": "/api/2.0/workspace/delete",
  "body": {
    "path": "/Workspace/Users/[USERNAME]/.bundle/signal-test/default/state/deploy.lock"
  }
}

=== An upload request for the state file should have been recorded in the requests file
>>> cat out.requests.txt
{
  "method": "POST",
  "path": "/api/2.0/workspace-files/import-file/Workspace/Users/[USERNAME]/.bundle/signal-test/default/state/terraform.tfstate",
  "q": {
    "overwrite": "true"
  },
  "body": {
    "version": 4,
    "terraform_version": "1.5.5",
    "serial": 1,
    "lineage": "[UUID]",
    "outputs": {},
    "resources": [
      {
        "mode": "managed",
        "type": "databricks_job",
        "name": "job2",
        "provider": "provider[\"registry.terraform.io/databricks/databricks\"]",
        "instances": []
      }
    ],
    "check_results": null
  }
}

=== A creation request for job1 should be recorded in the requests file. No request for job2 should exist since the process was terminated mid deployment.
>>> cat out.requests.txt
{
  "method": "POST",
  "path": "/api/2.2/jobs/create",
  "body": {
    "deployment": {
      "kind": "BUNDLE",
      "metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/signal-test/default/state/metadata.json"
    },
    "edit_mode": "UI_LOCKED",
    "format": "MULTI_TASK",
    "max_concurrent_runs": 1,
    "name": "job1",
    "queue": {
      "enabled": true
    }
  }
}

>>> [CLI] bundle debug plan
{
  "plan": {
    "resources.jobs.job1": {
      "action": "create"
    },
    "resources.jobs.job2": {
      "action": "create"
    }
  }
}
35 changes: 35 additions & 0 deletions acceptance/bundle/deploy/signal-cleanup/script
@@ -0,0 +1,35 @@
#!/bin/bash

# Start the deployment in the background, merging stderr into stdout so all of its output is captured
$CLI bundle deploy 2>&1 &
DEPLOY_PID=$!

# Wait for deployment to start by monitoring the requests file
# Once we see the job creation request starting, we know deployment is in progress
title "Wait until the deployment has started."
for i in {1..30}; do
    if [ -f out.requests.txt ] && jq -e 'select(.method == "POST" and (.path | contains("/api/2.2/jobs/create")))' out.requests.txt >/dev/null 2>&1; then
        echo "Deployment in progress, sending interrupt signal..."
        break
    fi
    sleep 0.1
done

# Send interrupt signal
trace kill -INT $DEPLOY_PID

# Wait for process to complete
errcode trace wait $DEPLOY_PID

title "A deletion request for deploy.lock should have been recorded in the requests file"
trace cat out.requests.txt | jq 'select(.method == "POST" and (.path | contains("workspace/delete")) and (.body.path | contains("deploy.lock")))'

title "An upload request for the state file should have been recorded in the requests file"
trace cat out.requests.txt | jq 'select(.method == "POST" and (.path | contains("workspace-files/import-file")) and (.path | contains("terraform.tfstate")))'

title "A creation request for job1 should be recorded in the requests file. No request for job2 should exist since the process was terminated mid deployment."
trace cat out.requests.txt | jq 'select(.method == "POST" and (.path | contains("/api/2.2/jobs/create")))'

trace $CLI bundle debug plan

rm out.requests.txt
20 changes: 20 additions & 0 deletions acceptance/bundle/deploy/signal-cleanup/test.toml
@@ -0,0 +1,20 @@
Local = true
Cloud = false
RecordRequests = true

# Test only terraform engine (signal handling is the same for direct)
EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["terraform"]

# Add delay to first job creation to ensure we can interrupt during deployment
[[Server]]
Pattern = "POST /api/2.2/jobs/create"
Response.StatusCode = 200
Response.Body = '{"job_id": 1111}'

# Long delay so the deployment blocks while creating the first job.
Delay = "300s"

# Replace PID numbers in kill/wait commands
[[Repls]]
Old = "(kill -INT |wait )\\d+"
New = "$1[PID]"
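For illustration, the Repls rule above can be reproduced with Go's regexp package (an assumption about the harness's regex engine, though the CLI is written in Go and the escaped `\\d+`/`$1` syntax matches RE2):

```go
package main

import (
	"fmt"
	"regexp"
)

func main() {
	// Capture the command prefix, then match and replace the raw PID.
	re := regexp.MustCompile(`(kill -INT |wait )\d+`)
	fmt.Println(re.ReplaceAllString(">>> kill -INT 48213", "$1[PID]"))
	// Output: >>> kill -INT [PID]
}
```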
7 changes: 7 additions & 0 deletions acceptance/bundle/destroy/signal-cleanup/databricks.yml
@@ -0,0 +1,7 @@
bundle:
  name: signal-test

resources:
  jobs:
    job1:
      name: job1
5 changes: 5 additions & 0 deletions acceptance/bundle/destroy/signal-cleanup/out.test.toml


59 changes: 59 additions & 0 deletions acceptance/bundle/destroy/signal-cleanup/output.txt
@@ -0,0 +1,59 @@
Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/signal-test/default/files...
Deploying resources...
Updating deployment state...
Deployment complete!

=== Wait until the destroy has started.
The following resources will be deleted:
  delete job job1

All files and directories at the following location will be deleted: /Workspace/Users/[USERNAME]/.bundle/signal-test/default

Destroy in progress, sending interrupt signal...

>>> kill -INT [PID]

>>> wait [PID]
Operation interrupted. Gracefully shutting down...
Error: cannot delete resources.jobs.job1: deleting id=[NUMID]: Post "[DATABRICKS_URL]/api/2.2/jobs/delete": context canceled


Exit code: 130

=== A deletion request for deploy.lock should have been recorded in the requests file
>>> cat out.requests.txt
{
  "method": "POST",
  "path": "/api/2.0/workspace/delete",
  "body": {
    "path": "/Workspace/Users/[USERNAME]/.bundle/signal-test/default/state/deploy.lock"
  }
}

=== No deletion request for resources.json should be recorded. We still need state to complete the destroy.
>>> cat out.requests.txt

>>> [CLI] bundle debug plan
{
  "plan": {
    "resources.jobs.job1": {
      "action": "skip",
      "remote_state": {
        "creator_user_name": "[USERNAME]",
        "job_id": [NUMID],
        "settings": {
          "deployment": {
            "kind": "BUNDLE",
            "metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/signal-test/default/state/metadata.json"
          },
          "edit_mode": "UI_LOCKED",
          "format": "MULTI_TASK",
          "max_concurrent_runs": 1,
          "name": "job1",
          "queue": {
            "enabled": true
          }
        }
      }
    }
  }
}
34 changes: 34 additions & 0 deletions acceptance/bundle/destroy/signal-cleanup/script
@@ -0,0 +1,34 @@
#!/bin/bash

# First deploy the bundle so we have something to destroy
$CLI bundle deploy --auto-approve
rm out.requests.txt

# Start the destroy in the background, merging stderr into stdout so all of its output is captured
$CLI bundle destroy --auto-approve 2>&1 &
DESTROY_PID=$!

# Wait for destroy to start by monitoring for job deletion request
title "Wait until the destroy has started."
for i in {1..30}; do
    if [ -f out.requests.txt ] && jq -e 'select(.method == "POST" and (.path | contains("/api/2.2/jobs/delete")))' out.requests.txt >/dev/null 2>&1; then
        echo "Destroy in progress, sending interrupt signal..."
        break
    fi
    sleep 0.1
done

# Send interrupt signal
trace kill -INT $DESTROY_PID

# Wait for process to complete
errcode trace wait $DESTROY_PID

title "A deletion request for deploy.lock should have been recorded in the requests file"
trace cat out.requests.txt | jq 'select(.method == "POST" and (.path | contains("workspace/delete")) and (.body.path | contains("deploy.lock")))'

title "No deletion request for resources.json should be recorded. We still need state to complete the destroy."
trace cat out.requests.txt | jq 'select(.method == "POST" and (.path | contains("workspace/delete")) and (.body.path | contains("resources.json")))'

trace $CLI bundle debug plan
rm out.requests.txt
20 changes: 20 additions & 0 deletions acceptance/bundle/destroy/signal-cleanup/test.toml
@@ -0,0 +1,20 @@
Local = true
Cloud = false
RecordRequests = true

# Test only direct engine (signal handling is the same for terraform)
EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["direct"]

# Add delay to first job deletion to ensure we can interrupt during destroy
[[Server]]
Pattern = "POST /api/2.2/jobs/delete"
Response.StatusCode = 200
Response.Body = '{}'

# Long delay so the destroy blocks while deleting the first job.
Delay = "300s"

# Replace PID numbers in kill/wait commands
[[Repls]]
Old = "(kill -INT |wait )\\d+"
New = "$1[PID]"
30 changes: 5 additions & 25 deletions bundle/deploy/lock/release.go
@@ -5,25 +5,13 @@ import (

"github.com/databricks/cli/bundle"
"github.com/databricks/cli/libs/diag"
"github.com/databricks/cli/libs/locker"
"github.com/databricks/cli/libs/log"
)

type Goal string
type release struct{}
Author comment: We don't need goal anymore since lock.Release is idempotent.


const (
    GoalBind    = Goal("bind")
    GoalUnbind  = Goal("unbind")
    GoalDeploy  = Goal("deploy")
    GoalDestroy = Goal("destroy")
)

type release struct {
    goal Goal
}

func Release(goal Goal) bundle.Mutator {
    return &release{goal}
func Release() bundle.Mutator {
    return &release{}
}

func (m *release) Name() string {
@@ -45,14 +33,6 @@ func (m *release) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnostics
    }

    log.Infof(ctx, "Releasing deployment lock")
    switch m.goal {
    case GoalDeploy:
        return diag.FromErr(b.Locker.Unlock(ctx))
    case GoalBind, GoalUnbind:
        return diag.FromErr(b.Locker.Unlock(ctx))
    case GoalDestroy:
        return diag.FromErr(b.Locker.Unlock(ctx, locker.AllowLockFileNotExist))
    default:
        return diag.Errorf("unknown goal for lock release: %s", m.goal)
    }
    err := b.Locker.Unlock(ctx)
    return diag.FromErr(err)
}
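The author's comment above explains that lock.Release is idempotent, which is why the Goal parameter could be dropped. A minimal sketch of that property, with hypothetical types and os.Remove standing in for the workspace/delete API call; this is not the CLI's actual implementation:

```go
package main

import (
	"context"
	"errors"
	"fmt"
	"os"
)

// Locker is a stand-in for the CLI's locker type (names are illustrative).
type Locker struct {
	path string
}

// Unlock treats a missing lock file as success, so deploy, destroy, and the
// signal handler can all call it unconditionally without tracking a goal.
func (l *Locker) Unlock(ctx context.Context) error {
	err := os.Remove(l.path) // stands in for the workspace/delete request
	if errors.Is(err, os.ErrNotExist) {
		return nil // already released: nothing to do
	}
	return err
}

func main() {
	l := &Locker{path: "deploy.lock"}
	fmt.Println(l.Unlock(context.Background())) // <nil> even if the lock was never created
	fmt.Println(l.Unlock(context.Background())) // <nil> again: safe to call twice
}
```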