1 change: 1 addition & 0 deletions NEXT_CHANGELOG.md
@@ -10,5 +10,6 @@

### Bundles
* Add validation that served_models and served_entities are not used at the same time. Add client-side translation logic. ([#3880](https://github.com/databricks/cli/pull/3880))
* Gracefully handle interrupts (SIGINT, SIGTERM, SIGHUP, SIGQUIT) during bundle deployment and destruction by releasing locks before exiting ([#3758](https://github.com/databricks/cli/pull/3758))

### API Changes
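
For context, the shutdown pattern this changelog entry describes can be sketched in a few lines of Go. This is an illustrative sketch, not the CLI's actual code: deploy and releaseLock are hypothetical placeholders for the real deployment phase and the lock-release mutator.

package main

import (
	"context"
	"errors"
	"fmt"
	"os"
	"os/signal"
	"syscall"
	"time"
)

// deploy stands in for the real deployment; it aborts when ctx is cancelled.
func deploy(ctx context.Context) error {
	select {
	case <-ctx.Done():
		return ctx.Err()
	case <-time.After(5 * time.Second):
		return nil
	}
}

// releaseLock stands in for the CLI's lock-release step.
func releaseLock() { fmt.Println("releasing deploy.lock") }

func main() {
	// Cancel the context when any of the four handled signals arrives.
	ctx, stop := signal.NotifyContext(context.Background(),
		syscall.SIGINT, syscall.SIGTERM, syscall.SIGHUP, syscall.SIGQUIT)
	defer stop()

	err := deploy(ctx)

	// The lock is released on both the success and the interrupt path.
	releaseLock()

	if errors.Is(err, context.Canceled) {
		fmt.Println("Operation interrupted. Gracefully shutting down...")
		os.Exit(130) // 128 + SIGINT (2), matching the acceptance test output
	}
}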
10 changes: 10 additions & 0 deletions acceptance/bundle/deploy/signal-cleanup/databricks.yml
@@ -0,0 +1,10 @@
bundle:
name: signal-test

resources:
jobs:
job1:
name: job1

job2:
name: job2 (deploy after ${resources.jobs.job1.id})
5 changes: 5 additions & 0 deletions acceptance/bundle/deploy/signal-cleanup/out.test.toml

(Generated file; diff contents not rendered.)

83 changes: 83 additions & 0 deletions acceptance/bundle/deploy/signal-cleanup/output.txt
@@ -0,0 +1,83 @@

=== Wait until the deployment has started.Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/signal-test/default/files...
Deploying resources...
Deployment in progress, sending interrupt signal...

>>> kill -INT [PID]

>>> wait [PID]
Operation interrupted. Gracefully shutting down...
Error: terraform apply: exit status 1

Updating deployment state...

Exit code: 130

=== A deletion request for deploy.lock should have been recorded in the requests file
>>> cat out.requests.txt
{
"method": "POST",
"path": "/api/2.0/workspace/delete",
"body": {
"path": "/Workspace/Users/[USERNAME]/.bundle/signal-test/default/state/deploy.lock"
}
}

=== An upload request for the state file should have been recorded in the requests file
>>> cat out.requests.txt
{
"method": "POST",
"path": "/api/2.0/workspace-files/import-file/Workspace/Users/[USERNAME]/.bundle/signal-test/default/state/terraform.tfstate",
"q": {
"overwrite": "true"
},
"body": {
"version": 4,
"terraform_version": "1.5.5",
"serial": 1,
"lineage": "[UUID]",
"outputs": {},
"resources": [
{
"mode": "managed",
"type": "databricks_job",
"name": "job2",
"provider": "provider[\"registry.terraform.io/databricks/databricks\"]",
"instances": []
}
],
"check_results": null
}
}

=== A creation request for job1 should be recorded in the requests file. No request for job2 should exist since the process was terminated mid deployment.
>>> cat out.requests.txt
{
"method": "POST",
"path": "/api/2.2/jobs/create",
"body": {
"deployment": {
"kind": "BUNDLE",
"metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/signal-test/default/state/metadata.json"
},
"edit_mode": "UI_LOCKED",
"format": "MULTI_TASK",
"max_concurrent_runs": 1,
"name": "job1",
"queue": {
"enabled": true
}
}
}

>>> [CLI] bundle debug plan
{
"plan": {
"resources.jobs.job1": {
"action": "create"
},
"resources.jobs.job2": {
"action": "create"
}
}
}
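
A note on the Exit code: 130 line above: shells conventionally report a process killed by a signal as 128 plus the signal number, and SIGINT is signal number 2, hence 130. A one-line check in Go:

package main

import (
	"fmt"
	"syscall"
)

func main() {
	// Convention: termination by signal is reported as 128 + signal number.
	fmt.Println(128 + int(syscall.SIGINT)) // prints 130
}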
35 changes: 35 additions & 0 deletions acceptance/bundle/deploy/signal-cleanup/script
@@ -0,0 +1,35 @@
#!/bin/bash

# Start deployment in the background, merging stderr into stdout so its output is captured
$CLI bundle deploy 2>&1 &
DEPLOY_PID=$!

# Wait for deployment to start by monitoring the requests file
# Once we see the job creation request starting, we know deployment is in progress
title "Wait until the deployment has started."
for i in {1..30}; do
if [ -f out.requests.txt ] && jq -e 'select(.method == "POST" and (.path | contains("/api/2.2/jobs/create")))' out.requests.txt >/dev/null 2>&1; then
echo "Deployment in progress, sending interrupt signal..."
break
fi
sleep 0.1
done

# Send interrupt signal
trace kill -INT $DEPLOY_PID

# Wait for process to complete
errcode trace wait $DEPLOY_PID

title "A deletion request for deploy.lock should have been recorded in the requests file"
trace cat out.requests.txt | jq 'select(.method == "POST" and (.path | contains("workspace/delete")) and (.body.path | contains("deploy.lock")))'

title "An upload request for the state file should have been recorded in the requests file"
trace cat out.requests.txt | jq 'select(.method == "POST" and (.path | contains("workspace-files/import-file")) and (.path | contains("terraform.tfstate")))'

title "A creation request for job1 should be recorded in the requests file. No request for job2 should exist since the process was terminated mid deployment."
trace cat out.requests.txt | jq 'select(.method == "POST" and (.path | contains("/api/2.2/jobs/create")))'

trace $CLI bundle debug plan

rm out.requests.txt
20 changes: 20 additions & 0 deletions acceptance/bundle/deploy/signal-cleanup/test.toml
@@ -0,0 +1,20 @@
Local = true
Cloud = false
RecordRequests = true

# Test only terraform engine (signal handling is the same for direct)
EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["terraform"]

# Add delay to first job creation to ensure we can interrupt during deployment
[[Server]]
Pattern = "POST /api/2.2/jobs/create"
Response.StatusCode = 200
Response.Body = '{"job_id": 1111}'

# Long delay to ensure the deployment gets stuck while creating the first job.
Delay = "300s"

# Replace PID numbers in kill/wait commands
[[Repls]]
Old = "(kill -INT |wait )\\d+"
New = "$1[PID]"
7 changes: 7 additions & 0 deletions acceptance/bundle/destroy/signal-cleanup/databricks.yml
@@ -0,0 +1,7 @@
bundle:
name: signal-test

resources:
jobs:
job1:
name: job1
5 changes: 5 additions & 0 deletions acceptance/bundle/destroy/signal-cleanup/out.test.toml

(Generated file; diff contents not rendered.)

59 changes: 59 additions & 0 deletions acceptance/bundle/destroy/signal-cleanup/output.txt
@@ -0,0 +1,59 @@
Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/signal-test/default/files...
Deploying resources...
Updating deployment state...
Deployment complete!

=== Wait until the destroy has started.The following resources will be deleted:
delete job job1

All files and directories at the following location will be deleted: /Workspace/Users/[USERNAME]/.bundle/signal-test/default

Destroy in progress, sending interrupt signal...

>>> kill -INT [PID]

>>> wait [PID]
Operation interrupted. Gracefully shutting down...
Error: cannot delete resources.jobs.job1: deleting id=[NUMID]: Post "[DATABRICKS_URL]/api/2.2/jobs/delete": context canceled


Exit code: 130

=== A deletion request for deploy.lock should have been recorded in the requests file
>>> cat out.requests.txt
{
"method": "POST",
"path": "/api/2.0/workspace/delete",
"body": {
"path": "/Workspace/Users/[USERNAME]/.bundle/signal-test/default/state/deploy.lock"
}
}

=== No deletion request for resources.json should be recorded. We still need state to complete the destroy.
>>> cat out.requests.txt

>>> [CLI] bundle debug plan
{
"plan": {
"resources.jobs.job1": {
"action": "skip",
"remote_state": {
"creator_user_name": "[USERNAME]",
"job_id": [NUMID],
"settings": {
"deployment": {
"kind": "BUNDLE",
"metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/signal-test/default/state/metadata.json"
},
"edit_mode": "UI_LOCKED",
"format": "MULTI_TASK",
"max_concurrent_runs": 1,
"name": "job1",
"queue": {
"enabled": true
}
}
}
}
}
}
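
The context canceled error in the output above is standard net/http behavior: once the signal handler cancels the request context, any in-flight or subsequent call fails with that error. A self-contained illustration (the URL is a placeholder, not a real endpoint):

package main

import (
	"context"
	"fmt"
	"net/http"
	"strings"
)

func main() {
	ctx, cancel := context.WithCancel(context.Background())
	cancel() // simulate the signal handler having already cancelled the context

	req, err := http.NewRequestWithContext(ctx, http.MethodPost,
		"https://example.com/api/2.2/jobs/delete", strings.NewReader("{}"))
	if err != nil {
		panic(err)
	}
	_, err = http.DefaultClient.Do(req)
	fmt.Println(err) // Post "https://example.com/api/2.2/jobs/delete": context canceled
}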
34 changes: 34 additions & 0 deletions acceptance/bundle/destroy/signal-cleanup/script
@@ -0,0 +1,34 @@
#!/bin/bash

# First deploy the bundle so we have something to destroy
$CLI bundle deploy --auto-approve
rm out.requests.txt

# Start destroy in the background, merging stderr into stdout so its output is captured
$CLI bundle destroy --auto-approve 2>&1 &
DESTROY_PID=$!

# Wait for destroy to start by monitoring for job deletion request
title "Wait until the destroy has started."
for i in {1..30}; do
if [ -f out.requests.txt ] && jq -e 'select(.method == "POST" and (.path | contains("/api/2.2/jobs/delete")))' out.requests.txt >/dev/null 2>&1; then
echo "Destroy in progress, sending interrupt signal..."
break
fi
sleep 0.1
done

# Send interrupt signal
trace kill -INT $DESTROY_PID

# Wait for process to complete
errcode trace wait $DESTROY_PID

title "A deletion request for deploy.lock should have been recorded in the requests file"
trace cat out.requests.txt | jq 'select(.method == "POST" and (.path | contains("workspace/delete")) and (.body.path | contains("deploy.lock")))'

title "No deletion request for resources.json should be recorded. We still need state to complete the destroy."
trace cat out.requests.txt | jq 'select(.method == "POST" and (.path | contains("workspace/delete")) and (.body.path | contains("resources.json")))'

trace $CLI bundle debug plan
rm out.requests.txt
20 changes: 20 additions & 0 deletions acceptance/bundle/destroy/signal-cleanup/test.toml
@@ -0,0 +1,20 @@
Local = true
Cloud = false
RecordRequests = true

# Test only direct engine (signal handling is the same for terraform)
EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["direct"]

# Add delay to first job deletion to ensure we can interrupt during destroy
[[Server]]
Pattern = "POST /api/2.2/jobs/delete"
Response.StatusCode = 200
Response.Body = '{}'

# Long delay to ensure the destroy gets stuck while deleting the first job.
Delay = "300s"

# Replace PID numbers in kill/wait commands
[[Repls]]
Old = "(kill -INT |wait )\\d+"
New = "$1[PID]"
30 changes: 5 additions & 25 deletions bundle/deploy/lock/release.go
@@ -5,25 +5,13 @@ import (

"github.com/databricks/cli/bundle"
"github.com/databricks/cli/libs/diag"
"github.com/databricks/cli/libs/locker"
"github.com/databricks/cli/libs/log"
)

-type Goal string
+type release struct{}

[Review comment from the PR author: We don't need goal anymore since lock.Release is idempotent.]

-
-const (
-	GoalBind    = Goal("bind")
-	GoalUnbind  = Goal("unbind")
-	GoalDeploy  = Goal("deploy")
-	GoalDestroy = Goal("destroy")
-)
-
-type release struct {
-	goal Goal
-}
-
-func Release(goal Goal) bundle.Mutator {
-	return &release{goal}
+func Release() bundle.Mutator {
+	return &release{}
 }

func (m *release) Name() string {
@@ -45,14 +33,6 @@ func (m *release) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnostics
 	}

 	log.Infof(ctx, "Releasing deployment lock")
-	switch m.goal {
-	case GoalDeploy:
-		return diag.FromErr(b.Locker.Unlock(ctx))
-	case GoalBind, GoalUnbind:
-		return diag.FromErr(b.Locker.Unlock(ctx))
-	case GoalDestroy:
-		return diag.FromErr(b.Locker.Unlock(ctx, locker.AllowLockFileNotExist))
-	default:
-		return diag.Errorf("unknown goal for lock release: %s", m.goal)
-	}
+	err := b.Locker.Unlock(ctx)
+	return diag.FromErr(err)
 }
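
As the review comment above notes, the goal parameter is gone because Unlock is now idempotent, which also removes the need for the GoalDestroy-only AllowLockFileNotExist option. A rough sketch of what that property means, using the local filesystem as a stand-in for the workspace API (this is not the actual libs/locker code):

package locker

import (
	"errors"
	"io/fs"
	"os"
)

type Locker struct {
	LockFilePath string
}

// Unlock removes the lock file and treats an already-missing file as
// success, so callers no longer need an allow-not-exist escape hatch.
func (l *Locker) Unlock() error {
	err := os.Remove(l.LockFilePath)
	if errors.Is(err, fs.ErrNotExist) {
		return nil // idempotent: the lock was already released
	}
	return err
}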