Skip to content

Commit 4697bcf

Browse files
authored
feat: Long-running operation improvements for mongodbatlas_cluster_outage_simulation resource (#3541)
* implement delete_on_create_timeout * wait for delete in clean up to avoid trying to delete cluster before outage has been deleted * use delete timeout (or default if not present) for clean up * delete when state is SIMULATING * increase time to reach simulating * add missing timeout field in docs
1 parent 46d3b12 commit 4697bcf

File tree

6 files changed

+144
-20
lines changed

6 files changed

+144
-20
lines changed

.changelog/3541.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
```release-note:enhancement
2+
resource/mongodbatlas_cluster_outage_simulation: Adds `delete_on_create_timeout` attribute to indicate whether to delete the resource if its creation times out
3+
```

docs/resources/cluster_outage_simulation.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,8 @@ resource "mongodbatlas_cluster_outage_simulation" "outage_simulation" {
4343
* `GCP`
4444
* `AZURE`
4545
* `region_name` - (Required) The Atlas name of the region to undergo an outage simulation.
46+
* `timeouts` - (Optional) The duration of time to wait for Cluster Outage Simulation to be created or deleted. The timeout value is defined by a signed sequence of decimal numbers with a time unit suffix such as: `1h45m`, `300s`, `10m`, etc. The valid time units are: `ns`, `us` (or `µs`), `ms`, `s`, `m`, `h`. The default timeout for Cluster Outage Simulation create and delete is `25m`. Learn more about timeouts [here](https://www.terraform.io/plugin/sdkv2/resources/retries-and-customizable-timeouts).
47+
* `delete_on_create_timeout` - (Optional) Flag that indicates whether to delete the resource if creation times out. Default is `true`. When Terraform apply fails, it returns immediately without waiting for cleanup to complete. If you suspect a transient error, wait before retrying to allow resource deletion to finish.
4648

4749
## Attributes Reference
4850

Lines changed: 94 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"github.com/hashicorp/terraform-plugin-sdk/v2/diag"
1010
"github.com/hashicorp/terraform-plugin-sdk/v2/helper/retry"
1111
"github.com/hashicorp/terraform-plugin-sdk/v2/helper/schema"
12+
"github.com/mongodb/terraform-provider-mongodbatlas/internal/common/cleanup"
1213
"github.com/mongodb/terraform-provider-mongodbatlas/internal/common/conversion"
1314
"github.com/mongodb/terraform-provider-mongodbatlas/internal/common/validate"
1415
"github.com/mongodb/terraform-provider-mongodbatlas/internal/config"
@@ -21,16 +22,18 @@ const (
2122
errorClusterOutageSimulationDelete = "error ending MongoDB Atlas Cluster Outage Simulation for Project (%s), Cluster (%s): %s"
2223
errorClusterOutageSimulationSetting = "error setting `%s` for MongoDB Atlas Cluster Outage Simulation: %s"
2324
defaultOutageFilterType = "REGION"
25+
oneMinute = 1 * time.Minute
2426
)
2527

2628
func Resource() *schema.Resource {
2729
return &schema.Resource{
28-
CreateContext: resourceCreate,
29-
ReadContext: resourceRead,
30-
UpdateContext: resourceUpdate,
31-
DeleteContext: resourceDelete,
30+
CreateWithoutTimeout: resourceCreate,
31+
ReadWithoutTimeout: resourceRead,
32+
UpdateWithoutTimeout: resourceUpdate,
33+
DeleteWithoutTimeout: resourceDelete,
3234
Timeouts: &schema.ResourceTimeout{
3335
Delete: schema.DefaultTimeout(25 * time.Minute),
36+
Create: schema.DefaultTimeout(25 * time.Minute),
3437
},
3538
Schema: map[string]*schema.Schema{
3639
"project_id": {
@@ -74,6 +77,11 @@ func Resource() *schema.Resource {
7477
Type: schema.TypeString,
7578
Computed: true,
7679
},
80+
"delete_on_create_timeout": { // Don't use Default: true to avoid unplanned changes when upgrading from previous versions.
81+
Type: schema.TypeBool,
82+
Optional: true,
83+
Description: "Flag that indicates whether to delete the resource if creation times out. Default is true.",
84+
},
7785
},
7886
}
7987
}
@@ -97,14 +105,28 @@ func resourceCreate(ctx context.Context, d *schema.ResourceData, meta any) diag.
97105
Pending: []string{"START_REQUESTED", "STARTING"},
98106
Target: []string{"SIMULATING"},
99107
Refresh: resourceRefreshFunc(ctx, clusterName, projectID, connV2),
100-
Timeout: d.Timeout(schema.TimeoutCreate) - time.Minute, // When using a CRUD function with a timeout, any StateChangeConf timeouts must be configured below that duration to avoid returning the SDK context: deadline exceeded error instead of the retry logic error.
101-
MinTimeout: 1 * time.Minute,
102-
Delay: 3 * time.Minute,
108+
Timeout: d.Timeout(schema.TimeoutCreate) - oneMinute, // When using a CRUD function with a timeout, any StateChangeConf timeouts must be configured below that duration to avoid returning the SDK context: deadline exceeded error instead of the retry logic error.
109+
MinTimeout: oneMinute,
110+
Delay: oneMinute,
103111
}
104112

105-
_, err = stateConf.WaitForStateContext(ctx)
106-
if err != nil {
107-
return diag.FromErr(fmt.Errorf(errorClusterOutageSimulationCreate, projectID, clusterName, err))
113+
_, errWait := stateConf.WaitForStateContext(ctx)
114+
deleteOnCreateTimeout := true // default value when not set
115+
if v, ok := d.GetOkExists("delete_on_create_timeout"); ok {
116+
deleteOnCreateTimeout = v.(bool)
117+
}
118+
errWait = cleanup.HandleCreateTimeout(deleteOnCreateTimeout, errWait, func(ctxCleanup context.Context) error {
119+
return deleteOutageSimulationWithCleanup(
120+
ctxCleanup,
121+
connV2,
122+
projectID,
123+
clusterName,
124+
20*time.Minute, // wait timeout for reaching SIMULATING before trying to delete
125+
d.Timeout(schema.TimeoutDelete),
126+
)
127+
})
128+
if errWait != nil {
129+
return diag.FromErr(fmt.Errorf(errorClusterOutageSimulationCreate, projectID, clusterName, errWait))
108130
}
109131

110132
d.SetId(conversion.EncodeStateID(map[string]string{
@@ -158,16 +180,53 @@ func resourceRead(ctx context.Context, d *schema.ResourceData, meta any) diag.Di
158180
return nil
159181
}
160182

161-
func resourceDelete(ctx context.Context, d *schema.ResourceData, meta any) diag.Diagnostics {
162-
connV2 := meta.(*config.MongoDBClient).AtlasV2
183+
// waitForDeletableState waits for the outage simulation to reach a deletable state
184+
func waitForDeletableState(ctx context.Context, connV2 *admin.APIClient, projectID, clusterName string, timeout time.Duration) (*admin.ClusterOutageSimulation, error) {
185+
stateConf := &retry.StateChangeConf{
186+
Pending: []string{"START_REQUESTED", "STARTING"},
187+
Target: []string{"SIMULATING", "FAILED", "DELETED"},
188+
Refresh: resourceRefreshFunc(ctx, clusterName, projectID, connV2),
189+
Timeout: timeout,
190+
MinTimeout: oneMinute,
191+
Delay: oneMinute,
192+
}
163193

164-
ids := conversion.DecodeStateID(d.Id())
165-
projectID := ids["project_id"]
166-
clusterName := ids["cluster_name"]
194+
result, err := stateConf.WaitForStateContext(ctx)
195+
if err != nil {
196+
return nil, err
197+
}
198+
199+
if result == nil {
200+
return nil, fmt.Errorf("no result returned from state change")
201+
}
202+
203+
simulation := result.(*admin.ClusterOutageSimulation)
204+
return simulation, nil
205+
}
167206

207+
// deleteOutageSimulationWithCleanup waits for SIMULATING state and then deletes the simulation
208+
func deleteOutageSimulationWithCleanup(ctx context.Context, connV2 *admin.APIClient, projectID, clusterName string, waitTimeout, deleteTimeout time.Duration) error {
209+
simulation, err := waitForDeletableState(ctx, connV2, projectID, clusterName, waitTimeout)
210+
if err != nil {
211+
return nil // Don't fail cleanup if we can't reach a deletable state
212+
}
213+
214+
finalState := simulation.GetState()
215+
switch finalState {
216+
case "SIMULATING": // If this isn't the state when triggering the delete, the API returns a 400 error: "INVALID_CLUSTER_OUTAGE_SIMULATION_STATE") Detail: Invalid cluster outage simulation state: START_REQUESTED, expected state: SIMULATING
217+
return endOutageSimulationAndWait(ctx, connV2, projectID, clusterName, deleteTimeout)
218+
case "FAILED", "DELETED":
219+
return nil
220+
default:
221+
return nil
222+
}
223+
}
224+
225+
// endOutageSimulationAndWait ends the outage simulation and waits for it to complete
226+
func endOutageSimulationAndWait(ctx context.Context, connV2 *admin.APIClient, projectID, clusterName string, timeout time.Duration) error {
168227
_, _, err := connV2.ClusterOutageSimulationApi.EndOutageSimulation(ctx, projectID, clusterName).Execute()
169228
if err != nil {
170-
return diag.FromErr(fmt.Errorf(errorClusterOutageSimulationDelete, projectID, clusterName, err))
229+
return fmt.Errorf(errorClusterOutageSimulationDelete, projectID, clusterName, err)
171230
}
172231

173232
log.Println("[INFO] Waiting for MongoDB Cluster Outage Simulation to end")
@@ -176,14 +235,29 @@ func resourceDelete(ctx context.Context, d *schema.ResourceData, meta any) diag.
176235
Pending: []string{"RECOVERY_REQUESTED", "RECOVERING", "COMPLETE"},
177236
Target: []string{"DELETED"},
178237
Refresh: resourceRefreshFunc(ctx, clusterName, projectID, connV2),
179-
Timeout: d.Timeout(schema.TimeoutDelete),
180-
MinTimeout: 30 * time.Second,
181-
Delay: 1 * time.Minute,
238+
Timeout: timeout,
239+
MinTimeout: oneMinute,
240+
Delay: oneMinute,
182241
}
183242

184243
_, err = stateConf.WaitForStateContext(ctx)
185244
if err != nil {
186-
return diag.FromErr(fmt.Errorf(errorClusterOutageSimulationDelete, projectID, clusterName, err))
245+
return fmt.Errorf(errorClusterOutageSimulationDelete, projectID, clusterName, err)
246+
}
247+
248+
return nil
249+
}
250+
251+
func resourceDelete(ctx context.Context, d *schema.ResourceData, meta any) diag.Diagnostics {
252+
connV2 := meta.(*config.MongoDBClient).AtlasV2
253+
254+
ids := conversion.DecodeStateID(d.Id())
255+
projectID := ids["project_id"]
256+
clusterName := ids["cluster_name"]
257+
258+
err := endOutageSimulationAndWait(ctx, connV2, projectID, clusterName, d.Timeout(schema.TimeoutDelete))
259+
if err != nil {
260+
return diag.FromErr(err)
187261
}
188262

189263
return nil
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package clusteroutagesimulation_test
33
import (
44
"context"
55
"fmt"
6+
"regexp"
67
"testing"
78

89
"github.com/hashicorp/terraform-plugin-testing/helper/resource"
@@ -153,6 +154,50 @@ func configMultiRegion(info *acc.ClusterInfo) string {
153154
`, info.TerraformStr, info.ProjectID, info.Name, info.ResourceName)
154155
}
155156

157+
func TestAccClusterOutageSimulation_deleteOnCreateTimeout(t *testing.T) {
158+
var (
159+
singleRegionRequest = acc.ClusterRequest{
160+
ReplicationSpecs: []acc.ReplicationSpecRequest{
161+
{Region: "US_WEST_2", InstanceSize: "M10"},
162+
},
163+
}
164+
clusterInfo = acc.GetClusterInfo(t, &singleRegionRequest)
165+
)
166+
167+
resource.ParallelTest(t, resource.TestCase{
168+
PreCheck: acc.PreCheckBasicSleep(t, &clusterInfo, "", ""),
169+
ProtoV6ProviderFactories: acc.TestAccProviderV6Factories,
170+
Steps: []resource.TestStep{
171+
{
172+
Config: configDeleteOnCreateTimeout(&clusterInfo, "1s", true),
173+
ExpectError: regexp.MustCompile("will run cleanup because delete_on_create_timeout is true"),
174+
},
175+
},
176+
})
177+
}
178+
179+
func configDeleteOnCreateTimeout(info *acc.ClusterInfo, timeout string, deleteOnTimeout bool) string {
180+
return fmt.Sprintf(`
181+
%[1]s
182+
resource "mongodbatlas_cluster_outage_simulation" "test_outage" {
183+
project_id = %[2]q
184+
cluster_name = %[3]q
185+
delete_on_create_timeout = %[5]t
186+
187+
timeouts {
188+
create = %[4]q
189+
}
190+
191+
outage_filters {
192+
cloud_provider = "AWS"
193+
region_name = "US_WEST_2"
194+
}
195+
196+
depends_on = [%[6]s]
197+
}
198+
`, info.TerraformStr, info.ProjectID, info.Name, timeout, deleteOnTimeout, info.ResourceName)
199+
}
200+
156201
func checkDestroy(s *terraform.State) error {
157202
for _, rs := range s.RootModule().Resources {
158203
if rs.Type != "mongodbatlas_cluster_outage_simulation" {

0 commit comments

Comments
 (0)