Skip to content

Commit 81be591

Browse files
[Fix] Retry cluster update on "INVALID_STATE" (#3890)
## Changes Clusters can only be updated while they are in the Running or Terminated state. As a result, Terraform fails to update autoscaling clusters while a resize is in progress. This change retries the cluster update when the API returns `INVALID_STATE`. ## Tests - [X] `make test` run locally - [ ] relevant change in `docs/` folder - [ ] covered with integration tests in `internal/acceptance` - [ ] relevant acceptance tests are passing - [X] using Go SDK
1 parent 9490aa8 commit 81be591

File tree

2 files changed

+114
-1
lines changed

2 files changed

+114
-1
lines changed

clusters/resource_cluster.go

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,15 +2,18 @@ package clusters
22

33
import (
44
"context"
5+
"errors"
56
"fmt"
67
"log"
78
"strings"
89
"time"
910

1011
"github.com/hashicorp/go-cty/cty"
12+
"github.com/hashicorp/terraform-plugin-sdk/v2/helper/retry"
1113
"github.com/hashicorp/terraform-plugin-sdk/v2/helper/schema"
1214
"github.com/hashicorp/terraform-plugin-sdk/v2/helper/validation"
1315

16+
"github.com/databricks/databricks-sdk-go/apierr"
1417
"github.com/databricks/databricks-sdk-go/service/compute"
1518
"github.com/databricks/terraform-provider-databricks/common"
1619
"github.com/databricks/terraform-provider-databricks/libraries"
@@ -604,7 +607,21 @@ func resourceClusterUpdate(ctx context.Context, d *schema.ResourceData, c *commo
604607
return err
605608
}
606609
cluster.ForceSendFields = []string{"NumWorkers"}
607-
_, err = clusters.Edit(ctx, cluster)
610+
611+
err = retry.RetryContext(ctx, 15*time.Minute, func() *retry.RetryError {
612+
_, err = clusters.Edit(ctx, cluster)
613+
if err == nil {
614+
return nil
615+
}
616+
var apiErr *apierr.APIError
617+
// Only Running and Terminated clusters can be modified. In particular, autoscaling clusters cannot be modified
618+
// while the resizing is ongoing. We retry in this case. Scaling can take several minutes.
619+
if errors.As(err, &apiErr) && apiErr.ErrorCode == "INVALID_STATE" {
620+
return retry.RetryableError(fmt.Errorf("cluster %s cannot be modified in its current state", clusterId))
621+
}
622+
return retry.NonRetryableError(err)
623+
})
624+
608625
}
609626
if err != nil {
610627
return err

clusters/resource_cluster_test.go

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -965,6 +965,102 @@ func TestResourceClusterUpdate(t *testing.T) {
965965
assert.Equal(t, "abc", d.Id(), "Id should be the same as in reading")
966966
}
967967

968+
// TestResourceClusterUpdate_WhileScaling exercises the update path of
// ResourceCluster when the first POST to /api/2.1/clusters/edit is answered
// with an INVALID_STATE error body and a second, identical POST has no error
// response configured. It asserts that the update finishes without error and
// that the resource ID is still "abc".
// NOTE(review): presumably the two edit fixtures are consumed in declaration
// order by the qa harness — confirm against the qa package.
func TestResourceClusterUpdate_WhileScaling(t *testing.T) {
969+
d, err := qa.ResourceFixture{
970+
Fixtures: []qa.HTTPFixture{
971+
{
972+
Method: "GET",
973+
Resource: "/api/2.1/clusters/get?cluster_id=abc",
974+
ReuseRequest: true,
975+
Response: compute.ClusterDetails{
976+
ClusterId: "abc",
977+
NumWorkers: 100,
978+
ClusterName: "Shared Autoscaling",
979+
SparkVersion: "7.1-scala12",
980+
NodeTypeId: "i3.xlarge",
981+
AutoterminationMinutes: 15,
982+
State: compute.StateRunning,
983+
},
984+
},
985+
{
986+
Method: "POST",
987+
Resource: "/api/2.1/clusters/events",
988+
ExpectedRequest: compute.GetEvents{
989+
ClusterId: "abc",
990+
Limit: 1,
991+
Order: compute.GetEventsOrderDesc,
992+
EventTypes: []compute.EventType{compute.EventTypePinned, compute.EventTypeUnpinned},
993+
},
994+
Response: compute.GetEventsResponse{
995+
Events: []compute.ClusterEvent{},
996+
TotalCount: 0,
997+
},
998+
},
999+
{
1000+
Method: "POST",
1001+
Resource: "/api/2.1/clusters/start",
1002+
ExpectedRequest: compute.StartCluster{
1003+
ClusterId: "abc",
1004+
},
1005+
},
1006+
{
1007+
Method: "GET",
1008+
Resource: "/api/2.0/libraries/cluster-status?cluster_id=abc",
1009+
Response: compute.ClusterLibraryStatuses{
1010+
LibraryStatuses: []compute.LibraryFullStatus{},
1011+
},
1012+
},
1013+
// First edit request: answered with an INVALID_STATE error body (HTTP 404 in this fixture).
{
1014+
Method: "POST",
1015+
Resource: "/api/2.1/clusters/edit",
1016+
ExpectedRequest: compute.ClusterDetails{
1017+
AutoterminationMinutes: 15,
1018+
ClusterId: "abc",
1019+
NumWorkers: 100,
1020+
ClusterName: "Shared Autoscaling",
1021+
SparkVersion: "7.1-scala12",
1022+
NodeTypeId: "i3.xlarge",
1023+
},
1024+
Response: common.APIErrorBody{
1025+
ErrorCode: "INVALID_STATE",
1026+
},
1027+
Status: 404,
1028+
},
1029+
// Second edit request: same expected payload, no error response or status configured.
{
1030+
Method: "POST",
1031+
Resource: "/api/2.1/clusters/edit",
1032+
ExpectedRequest: compute.ClusterDetails{
1033+
AutoterminationMinutes: 15,
1034+
ClusterId: "abc",
1035+
NumWorkers: 100,
1036+
ClusterName: "Shared Autoscaling",
1037+
SparkVersion: "7.1-scala12",
1038+
NodeTypeId: "i3.xlarge",
1039+
},
1040+
},
1041+
{
1042+
Method: "GET",
1043+
Resource: "/api/2.0/libraries/cluster-status?cluster_id=abc",
1044+
Response: compute.ClusterLibraryStatuses{
1045+
LibraryStatuses: []compute.LibraryFullStatus{},
1046+
},
1047+
},
1048+
},
1049+
ID: "abc",
1050+
Update: true,
1051+
Resource: ResourceCluster(),
1052+
State: map[string]any{
1053+
"autotermination_minutes": 15,
1054+
"cluster_name": "Shared Autoscaling",
1055+
"spark_version": "7.1-scala12",
1056+
"node_type_id": "i3.xlarge",
1057+
"num_workers": 100,
1058+
},
1059+
}.Apply(t)
1060+
assert.NoError(t, err)
1061+
assert.Equal(t, "abc", d.Id(), "Id should be the same as in reading")
1062+
}
1063+
9681064
func TestResourceClusterUpdateWithPinned(t *testing.T) {
9691065
d, err := qa.ResourceFixture{
9701066
Fixtures: []qa.HTTPFixture{

0 commit comments

Comments
 (0)