Skip to content

Commit b2c1182

Browse files
authored
Introduce retries to databricks psql command (#3492)
## Why <!-- Why are these changes needed? Provide the context that the reviewer might be missing. For example, were there any decisions behind the change that are not reflected in the code itself? --> Sometimes the connection to the database instance might fail, which can be remediated with a retry. This change makes the command try the connection up to 3 times by default. To disable retries, users can set the `--max-retries` flag to zero: ``` $ databricks psql my-instance --max-retries 0 ``` ## Tests <!-- How have you tested the changes? --> Added a new acceptance test, modified an existing acceptance test; ran manual tests on Mac and Windows <!-- If your PR needs to be included in the release notes for next release, add a separate entry in NEXT_CHANGELOG.md as part of your PR. -->
1 parent 1d38fdc commit b2c1182

File tree

11 files changed

+299
-14
lines changed

11 files changed

+299
-14
lines changed

NEXT_CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
### Notable Changes
66

77
### CLI
8+
* Introduce retries to `databricks psql` command ([#3492](https://github.com/databricks/cli/pull/3492))
89
* Add rule files for coding agents working on the CLI code base ([#3245](https://github.com/databricks/cli/pull/3245))
910

1011
### Dependency updates
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
#!/bin/bash
2+
#
3+
# This script simulates a failed psql connection: it prints a message and exits with code 2.
4+
# The test script renames this script to "psql" so that the CLI launches it instead of the real psql command.
5+
#
6+
echo "Simulating connection failure with exit code '2'"
7+
exit 2
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
Local = true
2+
Cloud = false
3+
4+
[GOOS]
5+
windows = false
6+
7+
[EnvMatrix]
8+
DATABRICKS_CLI_DEPLOYMENT = ["terraform", "direct-exp"]
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
2+
=== Command should use default number of retries:
3+
>>> musterr [CLI] psql my-database -- --dbname=db1 -p 3000
4+
Connecting to Databricks Database Instance my-database ...
5+
Postgres version: 14
6+
Database instance status: AVAILABLE
7+
Successfully fetched database credentials
8+
Launching psql session to my-database.my-host.com (attempt 1/3)...
9+
Simulating connection failure with exit code '2'
10+
Connection failed with retryable error: connection failed (retryable): psql exited with code 2
11+
Connection attempt 1/3 failed, retrying in 1s...
12+
Launching psql session to my-database.my-host.com (attempt 2/3)...
13+
Simulating connection failure with exit code '2'
14+
Connection failed with retryable error: connection failed (retryable): psql exited with code 2
15+
Connection attempt 2/3 failed, retrying in 2s...
16+
Launching psql session to my-database.my-host.com (attempt 3/3)...
17+
Simulating connection failure with exit code '2'
18+
Connection failed with retryable error: connection failed (retryable): psql exited with code 2
19+
Error: failed to connect after 3 attempts, last error: connection failed (retryable): psql exited with code 2
20+
21+
Exit code (musterr): 1
22+
23+
=== Command should use custom number of retries:
24+
>>> musterr [CLI] psql my-database --max-retries 5 -- --dbname=db1 -p 3000
25+
Connecting to Databricks Database Instance my-database ...
26+
Postgres version: 14
27+
Database instance status: AVAILABLE
28+
Successfully fetched database credentials
29+
Launching psql session to my-database.my-host.com (attempt 1/5)...
30+
Simulating connection failure with exit code '2'
31+
Connection failed with retryable error: connection failed (retryable): psql exited with code 2
32+
Connection attempt 1/5 failed, retrying in 1s...
33+
Launching psql session to my-database.my-host.com (attempt 2/5)...
34+
Simulating connection failure with exit code '2'
35+
Connection failed with retryable error: connection failed (retryable): psql exited with code 2
36+
Connection attempt 2/5 failed, retrying in 2s...
37+
Launching psql session to my-database.my-host.com (attempt 3/5)...
38+
Simulating connection failure with exit code '2'
39+
Connection failed with retryable error: connection failed (retryable): psql exited with code 2
40+
Connection attempt 3/5 failed, retrying in 4s...
41+
Launching psql session to my-database.my-host.com (attempt 4/5)...
42+
Simulating connection failure with exit code '2'
43+
Connection failed with retryable error: connection failed (retryable): psql exited with code 2
44+
Connection attempt 4/5 failed, retrying in 8s...
45+
Launching psql session to my-database.my-host.com (attempt 5/5)...
46+
Simulating connection failure with exit code '2'
47+
Connection failed with retryable error: connection failed (retryable): psql exited with code 2
48+
Error: failed to connect after 5 attempts, last error: connection failed (retryable): psql exited with code 2
49+
50+
Exit code (musterr): 1
51+
52+
=== Command should not use retries:
53+
>>> musterr [CLI] psql my-database --max-retries 0 -- --dbname=db1 -p 3000
54+
Connecting to Databricks Database Instance my-database ...
55+
Postgres version: 14
56+
Database instance status: AVAILABLE
57+
Successfully fetched database credentials
58+
Launching psql with connection to my-database.my-host.com...
59+
Simulating connection failure with exit code '2'
60+
61+
Exit code (musterr): 2
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
mv always-fail.sh psql
2+
3+
cleanup() {
4+
rm psql
5+
}
6+
trap cleanup EXIT
7+
8+
export PATH="$(pwd):$PATH"
9+
10+
title "Command should use default number of retries:"
11+
trace musterr $CLI psql my-database -- --dbname=db1 -p 3000
12+
13+
title "Command should use custom number of retries:"
14+
trace musterr $CLI psql my-database --max-retries 5 -- --dbname=db1 -p 3000
15+
16+
title "Command should not use retries:"
17+
trace musterr $CLI psql my-database --max-retries 0 -- --dbname=db1 -p 3000
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
# This acceptance test is disabled on Windows runners because
2+
# the current argument capturing method does not work on windows-latest GitHub Runner.
3+
#
4+
# See PR #3228 for documented attempts to fix this issue:
5+
# https://github.com/databricks/cli/pull/3228
6+
GOOS.windows = false
7+
8+
[[Server]]
9+
Pattern = "GET /api/2.0/database/instances/my-database"
10+
Response.Body = '''
11+
{
12+
"state": "AVAILABLE",
13+
"pg_version": "14",
14+
"read_write_dns": "my-database.my-host.com"
15+
}
16+
'''
17+
18+
[[Server]]
19+
Pattern = "GET /api/2.0/database/instances"
20+
Response.Body = '''
21+
{
22+
"database_instances": []
23+
}
24+
'''
25+
26+
[[Server]]
27+
Pattern = "POST /api/2.0/database/credentials"
28+
Response.Body = '''
29+
{
30+
"token": "my-secret-token"
31+
}
32+
'''

acceptance/cmd/psql/simple/output.txt

Lines changed: 26 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ Connecting to Databricks Database Instance my-database ...
2323
Postgres version: 14
2424
Database instance status: AVAILABLE
2525
Successfully fetched database credentials
26-
Launching psql with connection to my-database.my-host.com...
26+
Launching psql session to my-database.my-host.com (attempt 1/3)...
2727
echo-arguments.sh was called with the following arguments: --host=my-database.my-host.com --username=[USERNAME] --port=5432 --dbname=databricks_postgres
2828
PGPASSWORD=my-secret-token
2929
PGSSLMODE=require
@@ -34,7 +34,7 @@ Connecting to Databricks Database Instance my-database ...
3434
Postgres version: 14
3535
Database instance status: AVAILABLE
3636
Successfully fetched database credentials
37-
Launching psql with connection to my-database.my-host.com...
37+
Launching psql session to my-database.my-host.com (attempt 1/3)...
3838
echo-arguments.sh was called with the following arguments: --host=my-database.my-host.com --username=[USERNAME] --port=5432 --dbname=databricks_postgres -c SELECT * FROM my_table --echo-all
3939
PGPASSWORD=my-secret-token
4040
PGSSLMODE=require
@@ -45,7 +45,7 @@ Connecting to Databricks Database Instance my-database ...
4545
Postgres version: 14
4646
Database instance status: AVAILABLE
4747
Successfully fetched database credentials
48-
Launching psql with connection to my-database.my-host.com...
48+
Launching psql session to my-database.my-host.com (attempt 1/3)...
4949
echo-arguments.sh was called with the following arguments: --host=my-database.my-host.com --username=[USERNAME] --port=5432 --dbname=db1
5050
PGPASSWORD=my-secret-token
5151
PGSSLMODE=require
@@ -55,7 +55,7 @@ Connecting to Databricks Database Instance my-database ...
5555
Postgres version: 14
5656
Database instance status: AVAILABLE
5757
Successfully fetched database credentials
58-
Launching psql with connection to my-database.my-host.com...
58+
Launching psql session to my-database.my-host.com (attempt 1/3)...
5959
echo-arguments.sh was called with the following arguments: --host=my-database.my-host.com --username=[USERNAME] --port=5432 -d db2
6060
PGPASSWORD=my-secret-token
6161
PGSSLMODE=require
@@ -66,7 +66,7 @@ Connecting to Databricks Database Instance my-database ...
6666
Postgres version: 14
6767
Database instance status: AVAILABLE
6868
Successfully fetched database credentials
69-
Launching psql with connection to my-database.my-host.com...
69+
Launching psql session to my-database.my-host.com (attempt 1/3)...
7070
echo-arguments.sh was called with the following arguments: --host=my-database.my-host.com --username=[USERNAME] --dbname=db1 -p 3000
7171
PGPASSWORD=my-secret-token
7272
PGSSLMODE=require
@@ -76,6 +76,27 @@ Connecting to Databricks Database Instance my-database ...
7676
Postgres version: 14
7777
Database instance status: AVAILABLE
7878
Successfully fetched database credentials
79+
Launching psql session to my-database.my-host.com (attempt 1/3)...
80+
echo-arguments.sh was called with the following arguments: --host=my-database.my-host.com --username=[USERNAME] -d db2 --port=3001
81+
PGPASSWORD=my-secret-token
82+
PGSSLMODE=require
83+
84+
=== Command should not use retries if max-retries is set to 0:
85+
>>> [CLI] psql my-database --max-retries 0 -- --dbname=db1 -p 3000
86+
Connecting to Databricks Database Instance my-database ...
87+
Postgres version: 14
88+
Database instance status: AVAILABLE
89+
Successfully fetched database credentials
90+
Launching psql with connection to my-database.my-host.com...
91+
echo-arguments.sh was called with the following arguments: --host=my-database.my-host.com --username=[USERNAME] --dbname=db1 -p 3000
92+
PGPASSWORD=my-secret-token
93+
PGSSLMODE=require
94+
95+
>>> [CLI] psql my-database --max-retries 0 -- -d db2 --port=3001
96+
Connecting to Databricks Database Instance my-database ...
97+
Postgres version: 14
98+
Database instance status: AVAILABLE
99+
Successfully fetched database credentials
79100
Launching psql with connection to my-database.my-host.com...
80101
echo-arguments.sh was called with the following arguments: --host=my-database.my-host.com --username=[USERNAME] -d db2 --port=3001
81102
PGPASSWORD=my-secret-token

acceptance/cmd/psql/simple/script

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,3 +29,7 @@ trace $CLI psql my-database -- -d db2
2929
title "Command should use the port from extra arguments when specified:"
3030
trace $CLI psql my-database -- --dbname=db1 -p 3000
3131
trace $CLI psql my-database -- -d db2 --port=3001
32+
33+
title "Command should not use retries if max-retries is set to 0:"
34+
trace $CLI psql my-database --max-retries 0 -- --dbname=db1 -p 3000
35+
trace $CLI psql my-database --max-retries 0 -- -d db2 --port=3001

cmd/psql/psql.go

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package psql
33
import (
44
"errors"
55
"fmt"
6+
"time"
67

78
"github.com/databricks/cli/libs/cmdctx"
89
"github.com/databricks/cli/libs/cmdio"
@@ -26,12 +27,17 @@ func newLakebaseConnectCommand() *cobra.Command {
2627
2728
This command requires a psql client to be installed on your machine for the connection to work.
2829
30+
The command includes automatic retry logic for connection failures. You can configure the retry behavior using the flags below.
31+
2932
You can pass additional arguments to psql after a double-dash (--):
3033
databricks psql my-database -- -c "SELECT * FROM my_table"
3134
databricks psql my-database -- --echo-all -d "my-db"
3235
`,
3336
}
3437

38+
// Add retry configuration flag
39+
cmd.Flags().Int("max-retries", 3, "Maximum number of connection retry attempts (set to 0 to disable retries)")
40+
3541
cmd.PreRunE = root.MustWorkspaceClient
3642
cmd.RunE = func(cmd *cobra.Command, args []string) error {
3743
ctx := cmd.Context()
@@ -74,7 +80,17 @@ You can pass additional arguments to psql after a double-dash (--):
7480
databaseInstanceName := args[0]
7581
extraArgs := args[1:]
7682

77-
return lakebase.Connect(cmd.Context(), databaseInstanceName, extraArgs...)
83+
// Read retry configuration from flags
84+
maxRetries, _ := cmd.Flags().GetInt("max-retries")
85+
86+
retryConfig := lakebase.RetryConfig{
87+
MaxRetries: maxRetries, // Retries are disabled when max-retries is 0
88+
InitialDelay: time.Second, // Fixed initial delay
89+
MaxDelay: 10 * time.Second, // Fixed max delay
90+
BackoffFactor: 2.0, // Fixed backoff factor
91+
}
92+
93+
return lakebase.ConnectWithRetryConfig(cmd.Context(), databaseInstanceName, retryConfig, extraArgs...)
7894
}
7995

8096
cmd.ValidArgsFunction = func(cmd *cobra.Command, args []string, toComplete string) ([]string, cobra.ShellCompDirective) {

libs/lakebase/connect.go

Lines changed: 95 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,16 +5,53 @@ import (
55
"errors"
66
"fmt"
77
"os"
8+
"os/exec"
89
"strings"
10+
"time"
911

1012
"github.com/databricks/cli/libs/cmdctx"
1113
"github.com/databricks/cli/libs/cmdio"
12-
"github.com/databricks/cli/libs/exec"
14+
execlib "github.com/databricks/cli/libs/exec"
1315
"github.com/databricks/databricks-sdk-go/service/database"
1416
"github.com/google/uuid"
1517
)
1618

17-
func Connect(ctx context.Context, databaseInstanceName string, extraArgs ...string) error {
19+
// RetryConfig holds configuration for connection retry behavior.
//
// ConnectWithRetryConfig waits InitialDelay before the second attempt and,
// after each wait, multiplies the delay by BackoffFactor, capping it at MaxDelay.
20+
type RetryConfig struct {
// MaxRetries is the total number of psql launch attempts made by the retry
// loop. A value <= 0 disables the retry loop and psql is executed directly.
21+
MaxRetries int
// InitialDelay is the wait before the first retry (i.e. before the second attempt).
22+
InitialDelay time.Duration
// MaxDelay is the upper bound applied to the delay after each backoff step.
23+
MaxDelay time.Duration
// BackoffFactor is the multiplier applied to the delay after each wait.
24+
BackoffFactor float64
25+
}
26+
27+
// attemptConnection launches psql as a child process that inherits the
// current process's stdin, stdout and stderr, and waits for it to exit.
//
// args[0] is the executable to run and args[1:] are its arguments; env is
// assigned to the child's environment as-is.
//
// It returns nil when the process exits successfully. When the process exits
// with code 2 (the code psql uses for connection failures) it returns an error
// whose message starts with "connection failed (retryable)" and which unwraps
// to the underlying *exec.ExitError, so callers can inspect it with errors.As.
// Any other error from running the command is returned unchanged.
func attemptConnection(ctx context.Context, args, env []string) error {
	// psql returns exit code 2 for connection failures.
	const connectionFailureExitCode = 2

	// Guard against indexing an empty slice below.
	if len(args) == 0 {
		return errors.New("no command specified for psql connection attempt")
	}

	cmd := exec.CommandContext(ctx, args[0], args[1:]...)
	cmd.Env = env
	cmd.Stdin = os.Stdin
	cmd.Stdout = os.Stdout
	cmd.Stderr = os.Stderr

	err := cmd.Run()
	if err == nil {
		return nil
	}

	// Since stderr goes straight to the terminal when running interactively,
	// it cannot be captured here; the exit code is the only signal available.
	// We do not use the Databricks SDK for checking whether the error is
	// retryable because the call in question is not to the API.
	var exitErr *exec.ExitError
	if errors.As(err, &exitErr) && exitErr.ExitCode() == connectionFailureExitCode {
		return &retryableConnectionError{exitCode: exitErr.ExitCode(), cause: err}
	}

	return err
}

// retryableConnectionError reports a psql exit that is considered a transient
// connection failure. Its message is kept stable because callers match on the
// "connection failed (retryable)" prefix.
type retryableConnectionError struct {
	exitCode int
	cause    error
}

// Error returns the stable, user-facing description of the failure.
func (e *retryableConnectionError) Error() string {
	return fmt.Sprintf("connection failed (retryable): psql exited with code %d", e.exitCode)
}

// Unwrap exposes the original error returned by the child process.
func (e *retryableConnectionError) Unwrap() error {
	return e.cause
}
53+
54+
func ConnectWithRetryConfig(ctx context.Context, databaseInstanceName string, retryConfig RetryConfig, extraArgs ...string) error {
1855
cmdio.LogString(ctx, fmt.Sprintf("Connecting to Databricks Database Instance %s ...", databaseInstanceName))
1956

2057
w := cmdctx.WorkspaceClient(ctx)
@@ -91,11 +128,61 @@ func Connect(ctx context.Context, databaseInstanceName string, extraArgs ...stri
91128
"PGSSLMODE=require",
92129
)
93130

94-
cmdio.LogString(ctx, fmt.Sprintf("Launching psql with connection to %s...", db.ReadWriteDns))
131+
// If retries are disabled, go directly to interactive session
132+
if retryConfig.MaxRetries <= 0 {
133+
cmdio.LogString(ctx, fmt.Sprintf("Launching psql with connection to %s...", db.ReadWriteDns))
134+
return execlib.Execv(execlib.ExecvOptions{
135+
Args: args,
136+
Env: cmdEnv,
137+
})
138+
}
95139

96-
// Execute psql command inline
97-
return exec.Execv(exec.ExecvOptions{
98-
Args: args,
99-
Env: cmdEnv,
100-
})
140+
// Try launching psql with retry logic
141+
maxRetries := retryConfig.MaxRetries
142+
delay := retryConfig.InitialDelay
143+
144+
var lastErr error
145+
for attempt := range maxRetries {
146+
if attempt > 0 {
147+
cmdio.LogString(ctx, fmt.Sprintf("Connection attempt %d/%d failed, retrying in %v...", attempt, maxRetries, delay))
148+
149+
// Wait with context cancellation support
150+
select {
151+
case <-ctx.Done():
152+
return ctx.Err()
153+
case <-time.After(delay):
154+
}
155+
156+
// Exponential backoff
157+
delay = time.Duration(float64(delay) * retryConfig.BackoffFactor)
158+
if delay > retryConfig.MaxDelay {
159+
delay = retryConfig.MaxDelay
160+
}
161+
}
162+
163+
cmdio.LogString(ctx, fmt.Sprintf("Launching psql session to %s (attempt %d/%d)...", db.ReadWriteDns, attempt+1, maxRetries))
164+
165+
// Try to launch psql and capture the exit status
166+
err := attemptConnection(ctx, args, cmdEnv)
167+
if err == nil {
168+
// psql exited normally (user quit)
169+
return nil
170+
}
171+
172+
lastErr = err
173+
174+
// Check if this is a retryable error
175+
// We do not use the Databricks SDK for checking whether the error is retryable because the call in question is not to the API
176+
if !strings.Contains(err.Error(), "connection failed (retryable)") {
177+
// Non-retryable error, fail immediately
178+
return err
179+
}
180+
181+
if attempt < maxRetries {
182+
cmdio.LogString(ctx, fmt.Sprintf("Connection failed with retryable error: %v", err))
183+
}
184+
}
185+
186+
// All retries exhausted
187+
return fmt.Errorf("failed to connect after %d attempts, last error: %w", maxRetries, lastErr)
101188
}

0 commit comments

Comments
 (0)