
Commit fb178f9

[Fix] Permanently delete ERROR and TERMINATED state clusters if their creation fails (#4021)
## Changes

If the cluster comes back in an `ERROR` or `TERMINATED` state (i.e. `WaitGetClusterRunning` returns an error during `Create`), we permanently delete it before returning the error.

## Tests

Unit tests.

- [ ] `make test` run locally
- [ ] relevant change in `docs/` folder
- [ ] covered with integration tests in `internal/acceptance`
- [ ] relevant acceptance tests are passing
- [ ] using Go SDK
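The shape of the fix is simple: wait for the cluster to reach RUNNING, and on failure permanently delete the half-created cluster before surfacing the error. Below is a minimal, self-contained sketch of that flow; `waitRunning`, `permanentDelete`, and `createCluster` are hypothetical stand-ins for illustration only, not the provider's real helpers (which are `clusterWaiter.GetWithTimeout` and `resourceClusterDelete`, shown in the diff further down).

```go
package main

import (
	"context"
	"errors"
	"fmt"
)

// Hypothetical stand-ins for the SDK waiter and the permanent-delete call.
var errTerminated = errors.New("failed to reach RUNNING, got TERMINATED")

func waitRunning(ctx context.Context, clusterID string) error     { return errTerminated }
func permanentDelete(ctx context.Context, clusterID string) error { return nil }

// createCluster illustrates the error-path cleanup: if the wait fails,
// permanently delete the cluster; if that also fails, report both errors.
func createCluster(ctx context.Context, clusterID string) error {
	if err := waitRunning(ctx, clusterID); err != nil {
		if deleteErr := permanentDelete(ctx, clusterID); deleteErr != nil {
			return fmt.Errorf("failed to create cluster: %v and failed to delete it during cleanup: %v", err, deleteErr)
		}
		return err
	}
	return nil
}

func main() {
	fmt.Println(createCluster(context.Background(), "abc"))
	// Prints: failed to reach RUNNING, got TERMINATED
}
```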
1 parent 1153bba commit fb178f9

File tree: 2 files changed (+116, -0 lines)

clusters/resource_cluster.go

Lines changed: 5 additions & 0 deletions
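The hunk below is the entire fix: when the waiter gives up, the resource permanently deletes the cluster and only then returns. If the cleanup delete succeeds, the original waiter error is returned unchanged; if it fails as well, both errors are combined into a single message so neither failure is hidden.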
@@ -472,6 +472,11 @@ func resourceClusterCreate(ctx context.Context, d *schema.ResourceData, c *commo
 
 	clusterInfo, err := clusterWaiter.GetWithTimeout(timeout)
 	if err != nil {
+		// In case of "ERROR" or "TERMINATED" state, WaitGetClusterRunning returns an error and we should delete the cluster before returning
+		deleteError := resourceClusterDelete(ctx, d, c)
+		if deleteError != nil {
+			return fmt.Errorf("failed to create cluster: %v and failed to delete it during cleanup: %v", err, deleteError)
+		}
 		return err
 	}
 
clusters/resource_cluster_test.go

Lines changed: 111 additions & 0 deletions
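Two new unit tests cover both branches of the cleanup: one where the permanent delete succeeds and the original "failed to reach RUNNING, got TERMINATED" error is surfaced, and one where the delete endpoint returns a 500 and the combined create-plus-cleanup error message is asserted.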
@@ -164,6 +164,117 @@ func TestResourceClusterCreatePinned(t *testing.T) {
 	assert.Equal(t, "abc", d.Id())
 }
 
+func TestResourceClusterCreateErrorFollowedByDeletion(t *testing.T) {
+	d, err := qa.ResourceFixture{
+		Fixtures: []qa.HTTPFixture{
+			{
+				Method: "POST",
+				Resource: "/api/2.1/clusters/create",
+				ExpectedRequest: compute.CreateCluster{
+					NumWorkers: 100,
+					ClusterName: "Shared Autoscaling",
+					SparkVersion: "7.1-scala12",
+					NodeTypeId: "i3.xlarge",
+					AutoterminationMinutes: 15,
+				},
+				Response: compute.ClusterDetails{
+					ClusterId: "abc",
+				},
+			},
+			{
+				Method: "GET",
+				ReuseRequest: true,
+				Resource: "/api/2.1/clusters/get?cluster_id=abc",
+				Response: compute.ClusterDetails{
+					ClusterId: "abc",
+					NumWorkers: 100,
+					ClusterName: "Shared Autoscaling",
+					SparkVersion: "7.1-scala12",
+					NodeTypeId: "i3.xlarge",
+					AutoterminationMinutes: 15,
+					State: compute.StateTerminated,
+				},
+			},
+			{
+				Method: "POST",
+				Resource: "/api/2.1/clusters/permanent-delete",
+				ExpectedRequest: compute.PermanentDeleteCluster{
+					ClusterId: "abc",
+				},
+			},
+		},
+		Create: true,
+		Resource: ResourceCluster(),
+		State: map[string]any{
+			"autotermination_minutes": 15,
+			"cluster_name": "Shared Autoscaling",
+			"spark_version": "7.1-scala12",
+			"node_type_id": "i3.xlarge",
+			"num_workers": 100,
+		},
+	}.Apply(t)
+	assert.ErrorContains(t, err, "failed to reach RUNNING, got TERMINATED")
+	assert.Equal(t, "abc", d.Id())
+}
+
+func TestResourceClusterCreateErrorFollowedByDeletionError(t *testing.T) {
+	d, err := qa.ResourceFixture{
+		Fixtures: []qa.HTTPFixture{
+			{
+				Method: "POST",
+				Resource: "/api/2.1/clusters/create",
+				ExpectedRequest: compute.CreateCluster{
+					NumWorkers: 100,
+					ClusterName: "Shared Autoscaling",
+					SparkVersion: "7.1-scala12",
+					NodeTypeId: "i3.xlarge",
+					AutoterminationMinutes: 15,
+				},
+				Response: compute.ClusterDetails{
+					ClusterId: "abc",
+				},
+			},
+			{
+				Method: "GET",
+				ReuseRequest: true,
+				Resource: "/api/2.1/clusters/get?cluster_id=abc",
+				Response: compute.ClusterDetails{
+					ClusterId: "abc",
+					NumWorkers: 100,
+					ClusterName: "Shared Autoscaling",
+					SparkVersion: "7.1-scala12",
+					NodeTypeId: "i3.xlarge",
+					AutoterminationMinutes: 15,
+					State: compute.StateTerminated,
+				},
+			},
+			{
+				Method: "POST",
+				Resource: "/api/2.1/clusters/permanent-delete",
+				ExpectedRequest: compute.PermanentDeleteCluster{
+					ClusterId: "abc",
+				},
+				Status: 500,
+				Response: common.APIErrorBody{
+					ErrorCode: "INTERNAL_ERROR",
+					Message: "Internal error happened",
+				},
+			},
+		},
+		Create: true,
+		Resource: ResourceCluster(),
+		State: map[string]any{
+			"autotermination_minutes": 15,
+			"cluster_name": "Shared Autoscaling",
+			"spark_version": "7.1-scala12",
+			"node_type_id": "i3.xlarge",
+			"num_workers": 100,
+		},
+	}.Apply(t)
+	assert.ErrorContains(t, err, "failed to create cluster: failed to reach RUNNING, got TERMINATED: and failed to delete it during cleanup: Internal error happened")
+	assert.Equal(t, "abc", d.Id())
+}
+
 func TestResourceClusterCreate_WithLibraries(t *testing.T) {
 	d, err := qa.ResourceFixture{
 		Fixtures: []qa.HTTPFixture{
