Skip to content

Commit 4697bcf

Browse files
authored
feat: Long-running operation improvements for mongodbatlas_cluster_outage_simulation resource (#3541)
* implement delete_on_create_timeout * wait for delete in clean up to avoid trying to delete cluster before outage has been deleted * use delete timeout (or default if not present) for clean up * delete when state is SIMULATING * increase time to reach simulating * add missing timeout field in docs
1 parent 46d3b12 commit 4697bcf

File tree

6 files changed

+144
-20
lines changed

6 files changed

+144
-20
lines changed

.changelog/3541.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
```release-note:enhancement
2+
resource/mongodbatlas_cluster_outage_simulation: Adds `delete_on_create_timeout` attribute to indicate whether to delete the resource if its creation times out
3+
```

docs/resources/cluster_outage_simulation.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,8 @@ resource "mongodbatlas_cluster_outage_simulation" "outage_simulation" {
4343
* `GCP`
4444
* `AZURE`
4545
* `region_name` - (Required) The Atlas name of the region to undergo an outage simulation.
46+
* `timeouts` - (Optional) The duration of time to wait for Cluster Outage Simulation to be created or deleted. The timeout value is defined by a signed sequence of decimal numbers with a time unit suffix such as: `1h45m`, `300s`, `10m`, etc. The valid time units are: `ns`, `us` (or `µs`), `ms`, `s`, `m`, `h`. The default timeout for Cluster Outage Simulation create and delete is `25m`. Learn more about timeouts [here](https://www.terraform.io/plugin/sdkv2/resources/retries-and-customizable-timeouts).
47+
* `delete_on_create_timeout` - (Optional) Flag that indicates whether to delete the resource if creation times out. Default is `true`. When Terraform apply fails, it returns immediately without waiting for cleanup to complete. If you suspect a transient error, wait before retrying to allow resource deletion to finish.
4648

4749
## Attributes Reference
4850

Lines changed: 94 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"github.com/hashicorp/terraform-plugin-sdk/v2/diag"
1010
"github.com/hashicorp/terraform-plugin-sdk/v2/helper/retry"
1111
"github.com/hashicorp/terraform-plugin-sdk/v2/helper/schema"
12+
"github.com/mongodb/terraform-provider-mongodbatlas/internal/common/cleanup"
1213
"github.com/mongodb/terraform-provider-mongodbatlas/internal/common/conversion"
1314
"github.com/mongodb/terraform-provider-mongodbatlas/internal/common/validate"
1415
"github.com/mongodb/terraform-provider-mongodbatlas/internal/config"
@@ -21,16 +22,18 @@ const (
2122
errorClusterOutageSimulationDelete = "error ending MongoDB Atlas Cluster Outage Simulation for Project (%s), Cluster (%s): %s"
2223
errorClusterOutageSimulationSetting = "error setting `%s` for MongoDB Atlas Cluster Outage Simulation: %s"
2324
defaultOutageFilterType = "REGION"
25+
oneMinute = 1 * time.Minute
2426
)
2527

2628
func Resource() *schema.Resource {
2729
return &schema.Resource{
28-
CreateContext: resourceCreate,
29-
ReadContext: resourceRead,
30-
UpdateContext: resourceUpdate,
31-
DeleteContext: resourceDelete,
30+
CreateWithoutTimeout: resourceCreate,
31+
ReadWithoutTimeout: resourceRead,
32+
UpdateWithoutTimeout: resourceUpdate,
33+
DeleteWithoutTimeout: resourceDelete,
3234
Timeouts: &schema.ResourceTimeout{
3335
Delete: schema.DefaultTimeout(25 * time.Minute),
36+
Create: schema.DefaultTimeout(25 * time.Minute),
3437
},
3538
Schema: map[string]*schema.Schema{
3639
"project_id": {
@@ -74,6 +77,11 @@ func Resource() *schema.Resource {
7477
Type: schema.TypeString,
7578
Computed: true,
7679
},
80+
"delete_on_create_timeout": { // Don't use Default: true to avoid unplanned changes when upgrading from previous versions.
81+
Type: schema.TypeBool,
82+
Optional: true,
83+
Description: "Flag that indicates whether to delete the resource if creation times out. Default is true.",
84+
},
7785
},
7886
}
7987
}
@@ -97,14 +105,28 @@ func resourceCreate(ctx context.Context, d *schema.ResourceData, meta any) diag.
97105
Pending: []string{"START_REQUESTED", "STARTING"},
98106
Target: []string{"SIMULATING"},
99107
Refresh: resourceRefreshFunc(ctx, clusterName, projectID, connV2),
100-
Timeout: d.Timeout(schema.TimeoutCreate) - time.Minute, // When using a CRUD function with a timeout, any StateChangeConf timeouts must be configured below that duration to avoid returning the SDK context: deadline exceeded error instead of the retry logic error.
101-
MinTimeout: 1 * time.Minute,
102-
Delay: 3 * time.Minute,
108+
Timeout: d.Timeout(schema.TimeoutCreate) - oneMinute, // When using a CRUD function with a timeout, any StateChangeConf timeouts must be configured below that duration to avoid returning the SDK context: deadline exceeded error instead of the retry logic error.
109+
MinTimeout: oneMinute,
110+
Delay: oneMinute,
103111
}
104112

105-
_, err = stateConf.WaitForStateContext(ctx)
106-
if err != nil {
107-
return diag.FromErr(fmt.Errorf(errorClusterOutageSimulationCreate, projectID, clusterName, err))
113+
_, errWait := stateConf.WaitForStateContext(ctx)
114+
deleteOnCreateTimeout := true // default value when not set
115+
if v, ok := d.GetOkExists("delete_on_create_timeout"); ok {
116+
deleteOnCreateTimeout = v.(bool)
117+
}
118+
errWait = cleanup.HandleCreateTimeout(deleteOnCreateTimeout, errWait, func(ctxCleanup context.Context) error {
119+
return deleteOutageSimulationWithCleanup(
120+
ctxCleanup,
121+
connV2,
122+
projectID,
123+
clusterName,
124+
20*time.Minute, // wait timeout for reaching SIMULATING before trying to delete
125+
d.Timeout(schema.TimeoutDelete),
126+
)
127+
})
128+
if errWait != nil {
129+
return diag.FromErr(fmt.Errorf(errorClusterOutageSimulationCreate, projectID, clusterName, errWait))
108130
}
109131

110132
d.SetId(conversion.EncodeStateID(map[string]string{
@@ -158,16 +180,53 @@ func resourceRead(ctx context.Context, d *schema.ResourceData, meta any) diag.Di
158180
return nil
159181
}
160182

161-
func resourceDelete(ctx context.Context, d *schema.ResourceData, meta any) diag.Diagnostics {
162-
connV2 := meta.(*config.MongoDBClient).AtlasV2
183+
// waitForDeletableState waits for the outage simulation to reach a deletable state
184+
func waitForDeletableState(ctx context.Context, connV2 *admin.APIClient, projectID, clusterName string, timeout time.Duration) (*admin.ClusterOutageSimulation, error) {
185+
stateConf := &retry.StateChangeConf{
186+
Pending: []string{"START_REQUESTED", "STARTING"},
187+
Target: []string{"SIMULATING", "FAILED", "DELETED"},
188+
Refresh: resourceRefreshFunc(ctx, clusterName, projectID, connV2),
189+
Timeout: timeout,
190+
MinTimeout: oneMinute,
191+
Delay: oneMinute,
192+
}
163193

164-
ids := conversion.DecodeStateID(d.Id())
165-
projectID := ids["project_id"]
166-
clusterName := ids["cluster_name"]
194+
result, err := stateConf.WaitForStateContext(ctx)
195+
if err != nil {
196+
return nil, err
197+
}
198+
199+
if result == nil {
200+
return nil, fmt.Errorf("no result returned from state change")
201+
}
202+
203+
simulation := result.(*admin.ClusterOutageSimulation)
204+
return simulation, nil
205+
}
167206

207+
// deleteOutageSimulationWithCleanup waits for SIMULATING state and then deletes the simulation
208+
func deleteOutageSimulationWithCleanup(ctx context.Context, connV2 *admin.APIClient, projectID, clusterName string, waitTimeout, deleteTimeout time.Duration) error {
209+
simulation, err := waitForDeletableState(ctx, connV2, projectID, clusterName, waitTimeout)
210+
if err != nil {
211+
return nil // Don't fail cleanup if we can't reach a deletable state
212+
}
213+
214+
finalState := simulation.GetState()
215+
switch finalState {
216+
case "SIMULATING": // If this isn't the state when triggering the delete, the API returns a 400 error: "INVALID_CLUSTER_OUTAGE_SIMULATION_STATE") Detail: Invalid cluster outage simulation state: START_REQUESTED, expected state: SIMULATING
217+
return endOutageSimulationAndWait(ctx, connV2, projectID, clusterName, deleteTimeout)
218+
case "FAILED", "DELETED":
219+
return nil
220+
default:
221+
return nil
222+
}
223+
}
224+
225+
// endOutageSimulationAndWait ends the outage simulation and waits for it to complete
226+
func endOutageSimulationAndWait(ctx context.Context, connV2 *admin.APIClient, projectID, clusterName string, timeout time.Duration) error {
168227
_, _, err := connV2.ClusterOutageSimulationApi.EndOutageSimulation(ctx, projectID, clusterName).Execute()
169228
if err != nil {
170-
return diag.FromErr(fmt.Errorf(errorClusterOutageSimulationDelete, projectID, clusterName, err))
229+
return fmt.Errorf(errorClusterOutageSimulationDelete, projectID, clusterName, err)
171230
}
172231

173232
log.Println("[INFO] Waiting for MongoDB Cluster Outage Simulation to end")
@@ -176,14 +235,29 @@ func resourceDelete(ctx context.Context, d *schema.ResourceData, meta any) diag.
176235
Pending: []string{"RECOVERY_REQUESTED", "RECOVERING", "COMPLETE"},
177236
Target: []string{"DELETED"},
178237
Refresh: resourceRefreshFunc(ctx, clusterName, projectID, connV2),
179-
Timeout: d.Timeout(schema.TimeoutDelete),
180-
MinTimeout: 30 * time.Second,
181-
Delay: 1 * time.Minute,
238+
Timeout: timeout,
239+
MinTimeout: oneMinute,
240+
Delay: oneMinute,
182241
}
183242

184243
_, err = stateConf.WaitForStateContext(ctx)
185244
if err != nil {
186-
return diag.FromErr(fmt.Errorf(errorClusterOutageSimulationDelete, projectID, clusterName, err))
245+
return fmt.Errorf(errorClusterOutageSimulationDelete, projectID, clusterName, err)
246+
}
247+
248+
return nil
249+
}
250+
251+
func resourceDelete(ctx context.Context, d *schema.ResourceData, meta any) diag.Diagnostics {
252+
connV2 := meta.(*config.MongoDBClient).AtlasV2
253+
254+
ids := conversion.DecodeStateID(d.Id())
255+
projectID := ids["project_id"]
256+
clusterName := ids["cluster_name"]
257+
258+
err := endOutageSimulationAndWait(ctx, connV2, projectID, clusterName, d.Timeout(schema.TimeoutDelete))
259+
if err != nil {
260+
return diag.FromErr(err)
187261
}
188262

189263
return nil
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package clusteroutagesimulation_test
33
import (
44
"context"
55
"fmt"
6+
"regexp"
67
"testing"
78

89
"github.com/hashicorp/terraform-plugin-testing/helper/resource"
@@ -153,6 +154,50 @@ func configMultiRegion(info *acc.ClusterInfo) string {
153154
`, info.TerraformStr, info.ProjectID, info.Name, info.ResourceName)
154155
}
155156

157+
func TestAccClusterOutageSimulation_deleteOnCreateTimeout(t *testing.T) {
158+
var (
159+
singleRegionRequest = acc.ClusterRequest{
160+
ReplicationSpecs: []acc.ReplicationSpecRequest{
161+
{Region: "US_WEST_2", InstanceSize: "M10"},
162+
},
163+
}
164+
clusterInfo = acc.GetClusterInfo(t, &singleRegionRequest)
165+
)
166+
167+
resource.ParallelTest(t, resource.TestCase{
168+
PreCheck: acc.PreCheckBasicSleep(t, &clusterInfo, "", ""),
169+
ProtoV6ProviderFactories: acc.TestAccProviderV6Factories,
170+
Steps: []resource.TestStep{
171+
{
172+
Config: configDeleteOnCreateTimeout(&clusterInfo, "1s", true),
173+
ExpectError: regexp.MustCompile("will run cleanup because delete_on_create_timeout is true"),
174+
},
175+
},
176+
})
177+
}
178+
179+
func configDeleteOnCreateTimeout(info *acc.ClusterInfo, timeout string, deleteOnTimeout bool) string {
180+
return fmt.Sprintf(`
181+
%[1]s
182+
resource "mongodbatlas_cluster_outage_simulation" "test_outage" {
183+
project_id = %[2]q
184+
cluster_name = %[3]q
185+
delete_on_create_timeout = %[5]t
186+
187+
timeouts {
188+
create = %[4]q
189+
}
190+
191+
outage_filters {
192+
cloud_provider = "AWS"
193+
region_name = "US_WEST_2"
194+
}
195+
196+
depends_on = [%[6]s]
197+
}
198+
`, info.TerraformStr, info.ProjectID, info.Name, timeout, deleteOnTimeout, info.ResourceName)
199+
}
200+
156201
func checkDestroy(s *terraform.State) error {
157202
for _, rs := range s.RootModule().Resources {
158203
if rs.Type != "mongodbatlas_cluster_outage_simulation" {

0 commit comments

Comments
 (0)