Skip to content

Commit 1e5443d

Browse files
[Internal] Retry on 504 when calling the permission API (#4355)
## Changes

This PR adds logic to retry failed get calls to the permissions API when the error is a 504. This solution is meant to be temporary and will be removed as soon as such retries are handled natively in the Databricks Go SDK.

## Tests

Complete coverage of the added retrier.
1 parent 151ed6d commit 1e5443d

File tree

4 files changed

+144
-5
lines changed

4 files changed

+144
-5
lines changed

common/retry.go

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,11 @@ package common
22

33
import (
44
"context"
5-
"log"
5+
"errors"
66
"regexp"
77

8+
"github.com/databricks/databricks-sdk-go/apierr"
9+
"github.com/databricks/databricks-sdk-go/logger"
810
"github.com/databricks/databricks-sdk-go/retries"
911
)
1012

@@ -15,11 +17,27 @@ func RetryOnTimeout[T any](ctx context.Context, f func(context.Context) (*T, err
1517
msg := err.Error()
1618
isTimeout := timeoutRegex.MatchString(msg)
1719
if isTimeout {
18-
log.Printf("[DEBUG] Retrying due to timeout: %s", msg)
20+
logger.Debugf(ctx, "Retrying due to timeout: %s", msg)
1921
}
2022
return isTimeout
2123
}))
2224
return r.Run(ctx, func(ctx context.Context) (*T, error) {
2325
return f(ctx)
2426
})
2527
}
28+
29+
// RetryOn504 uses a [retries.Retrier] to call the given method repeatedly
30+
// until it either succeeds or returns an error that is different from
31+
// [apierr.ErrDeadlineExceeded].
32+
func RetryOn504[T any](ctx context.Context, f func(context.Context) (*T, error)) (*T, error) {
33+
r := retries.New[T](retries.WithTimeout(-1), retries.WithRetryFunc(func(err error) bool {
34+
if !errors.Is(err, apierr.ErrDeadlineExceeded) {
35+
return false
36+
}
37+
logger.Debugf(ctx, "Retrying on error 504")
38+
return true
39+
}))
40+
return r.Run(ctx, func(ctx context.Context) (*T, error) {
41+
return f(ctx)
42+
})
43+
}

common/retry_test.go

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"errors"
66
"testing"
77

8+
"github.com/databricks/databricks-sdk-go/apierr"
89
"github.com/databricks/databricks-sdk-go/experimental/mocks"
910
"github.com/databricks/databricks-sdk-go/service/workspace"
1011
"github.com/stretchr/testify/assert"
@@ -47,3 +48,87 @@ func TestRetryOnTimeout_NonRetriableError(t *testing.T) {
4748
})
4849
assert.ErrorIs(t, err, expected)
4950
}
51+
52+
func TestRetryOn504_noError(t *testing.T) {
53+
wantErr := error(nil)
54+
wantRes := (*workspace.ObjectInfo)(nil)
55+
wantCalls := 1
56+
57+
w := mocks.NewMockWorkspaceClient(t)
58+
api := w.GetMockWorkspaceAPI().EXPECT()
59+
api.GetStatusByPath(mock.Anything, mock.Anything).Return(wantRes, wantErr)
60+
61+
gotCalls := 0
62+
gotRes, gotErr := RetryOn504(context.Background(), func(ctx context.Context) (*workspace.ObjectInfo, error) {
63+
gotCalls += 1
64+
return w.WorkspaceClient.Workspace.GetStatusByPath(ctx, "path")
65+
})
66+
67+
assert.ErrorIs(t, gotErr, wantErr)
68+
assert.Equal(t, gotRes, wantRes)
69+
assert.Equal(t, gotCalls, wantCalls)
70+
}
71+
72+
func TestRetryOn504_errorNot504(t *testing.T) {
73+
wantErr := errors.New("test error")
74+
wantRes := (*workspace.ObjectInfo)(nil)
75+
wantCalls := 1
76+
77+
w := mocks.NewMockWorkspaceClient(t)
78+
api := w.GetMockWorkspaceAPI().EXPECT()
79+
api.GetStatusByPath(mock.Anything, mock.Anything).Return(wantRes, wantErr)
80+
81+
gotCalls := 0
82+
gotRes, gotErr := RetryOn504(context.Background(), func(ctx context.Context) (*workspace.ObjectInfo, error) {
83+
gotCalls += 1
84+
return w.WorkspaceClient.Workspace.GetStatusByPath(ctx, "path")
85+
})
86+
87+
assert.ErrorIs(t, gotErr, wantErr)
88+
assert.Equal(t, gotRes, wantRes)
89+
assert.Equal(t, gotCalls, wantCalls)
90+
}
91+
92+
func TestRetryOn504_error504ThenFail(t *testing.T) {
93+
wantErr := errors.New("test error")
94+
wantRes := (*workspace.ObjectInfo)(nil)
95+
wantCalls := 2
96+
97+
w := mocks.NewMockWorkspaceClient(t)
98+
api := w.GetMockWorkspaceAPI().EXPECT()
99+
call := api.GetStatusByPath(mock.Anything, mock.Anything).Return(nil, apierr.ErrDeadlineExceeded)
100+
call.Repeatability = 1
101+
api.GetStatusByPath(mock.Anything, mock.Anything).Return(wantRes, wantErr)
102+
103+
gotCalls := 0
104+
gotRes, gotErr := RetryOn504(context.Background(), func(ctx context.Context) (*workspace.ObjectInfo, error) {
105+
gotCalls++
106+
return w.WorkspaceClient.Workspace.GetStatusByPath(ctx, "path")
107+
})
108+
109+
assert.ErrorIs(t, gotErr, wantErr)
110+
assert.Equal(t, gotRes, wantRes)
111+
assert.Equal(t, gotCalls, wantCalls)
112+
}
113+
114+
func TestRetryOn504_error504ThenSuccess(t *testing.T) {
115+
wantErr := error(nil)
116+
wantRes := &workspace.ObjectInfo{}
117+
wantCalls := 2
118+
119+
w := mocks.NewMockWorkspaceClient(t)
120+
api := w.GetMockWorkspaceAPI().EXPECT()
121+
call := api.GetStatusByPath(mock.Anything, mock.Anything).Return(nil, apierr.ErrDeadlineExceeded)
122+
call.Repeatability = 1
123+
api.GetStatusByPath(mock.Anything, mock.Anything).Return(wantRes, wantErr)
124+
125+
gotCalls := 0
126+
gotRes, gotErr := RetryOn504(context.Background(), func(ctx context.Context) (*workspace.ObjectInfo, error) {
127+
gotCalls++
128+
return w.WorkspaceClient.Workspace.GetStatusByPath(ctx, "path")
129+
})
130+
131+
assert.ErrorIs(t, gotErr, wantErr)
132+
assert.Equal(t, gotRes, wantRes)
133+
assert.Equal(t, gotCalls, wantCalls)
134+
}

permissions/resource_permissions.go

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -122,10 +122,16 @@ func (a PermissionsAPI) readRaw(objectID string, mapping resourcePermissions) (*
122122
}
123123
idParts := strings.Split(objectID, "/")
124124
id := idParts[len(idParts)-1]
125-
permissions, err := w.Permissions.Get(a.context, iam.GetPermissionRequest{
126-
RequestObjectId: id,
127-
RequestObjectType: mapping.requestObjectType,
125+
126+
// TODO: This is a temporary measure to implement retry on 504 until this is
127+
// supported natively in the Go SDK.
128+
permissions, err := common.RetryOn504(a.context, func(ctx context.Context) (*iam.ObjectPermissions, error) {
129+
return w.Permissions.Get(a.context, iam.GetPermissionRequest{
130+
RequestObjectId: id,
131+
RequestObjectType: mapping.requestObjectType,
132+
})
128133
})
134+
129135
var apiErr *apierr.APIError
130136
// https://github.com/databricks/terraform-provider-databricks/issues/1227
131137
// platform propagates INVALID_STATE error for auto-purged clusters in

permissions/resource_permissions_test.go

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -484,6 +484,36 @@ func TestResourcePermissionsRead_EmptyListResultsInRemoval(t *testing.T) {
484484
}.ApplyNoError(t)
485485
}
486486

487+
func TestResourcePermissionsRead_EmptyListResultsInRemovalWith504Errors(t *testing.T) {
488+
qa.ResourceFixture{
489+
MockWorkspaceClientFunc: func(mwc *mocks.MockWorkspaceClient) {
490+
mwc.GetMockCurrentUserAPI().EXPECT().Me(mock.Anything).Return(&iam.User{UserName: "admin"}, nil)
491+
492+
req := iam.GetPermissionRequest{
493+
RequestObjectId: "abc",
494+
RequestObjectType: "clusters",
495+
}
496+
497+
// Fail 3 times with a 504 error. These should be retried
498+
// transparently.
499+
call := mwc.GetMockPermissionsAPI().EXPECT().Get(mock.Anything, req).Return(nil, apierr.ErrDeadlineExceeded)
500+
call.Repeatability = 3
501+
502+
mwc.GetMockPermissionsAPI().EXPECT().Get(mock.Anything, req).Return(&iam.ObjectPermissions{
503+
ObjectId: "/clusters/abc",
504+
ObjectType: "cluster",
505+
}, nil)
506+
},
507+
Resource: ResourcePermissions(),
508+
Read: true,
509+
Removed: true,
510+
InstanceState: map[string]string{
511+
"cluster_id": "abc",
512+
},
513+
ID: "/clusters/abc",
514+
}.ApplyNoError(t)
515+
}
516+
487517
func TestResourcePermissionsDelete(t *testing.T) {
488518
d, err := qa.ResourceFixture{
489519
MockWorkspaceClientFunc: func(mwc *mocks.MockWorkspaceClient) {

0 commit comments

Comments
 (0)