Skip to content

Commit bcbc194

Browse files
Cloud Stack: More reliable retrying (#1035)
Closes #996 There have been reliability issues with stacks related to Terraform Grafana.com has a very asynchronous process that works well in the web but not so much in Terraform While an API 500 in the web UI is not that annoying, since the stack shows up on the next refresh, it disrupts Terraform In this PR, I make the stack creation process retry on pretty much every error To test that everything works well, I added two test cases: - Tainting a stack to test that it can recreate right after deletion - Creating a stack in double, making sure we can't take-over a stack that already exists
1 parent aa32d25 commit bcbc194

File tree

2 files changed

+65
-29
lines changed

2 files changed

+65
-29
lines changed

internal/resources/cloud/resource_cloud_stack.go

Lines changed: 25 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -238,20 +238,32 @@ func CreateStack(ctx context.Context, d *schema.ResourceData, meta interface{})
238238
Region: d.Get("region_slug").(string),
239239
}
240240

241-
stackID, err := client.NewStack(stack)
242-
switch {
243-
case err != nil && strings.Contains(err.Error(), "409"):
244-
return diag.Errorf("Error: A Grafana stack with the name '%s' already exists.", stack.Name)
245-
case err != nil:
246-
// If we had an error that isn't a 409 (already exists), try to read the stack
247-
// Sometimes, the stack is created but the API returns an error (e.g. 504)
248-
readStack, readErr := client.StackBySlug(stack.Slug)
249-
if readErr != nil {
250-
return diag.Errorf("Failed to create stack: %v", err)
241+
err := retry.RetryContext(ctx, 1*time.Minute, func() *retry.RetryError {
242+
stackID, err := client.NewStack(stack)
243+
switch {
244+
case err != nil && strings.Contains(err.Error(), "409"):
245+
// If the API returns a 409, it means that the stack already exists
246+
// It may also mean that the stack was recently deleted and is still in the process of being deleted
247+
// In that case, we want to retry
248+
time.Sleep(10 * time.Second) // Do not retry too fast, default is 500ms
249+
return retry.RetryableError(fmt.Errorf("a stack with the name '%s' already exists", stack.Name))
250+
case err != nil:
251+
// If we had an error that isn't a 409 (already exists), try to read the stack
252+
// Sometimes, the stack is created but the API returns an error (e.g. 504)
253+
readStack, readErr := client.StackBySlug(stack.Slug)
254+
if readErr == nil {
255+
d.SetId(strconv.FormatInt(readStack.ID, 10))
256+
return nil
257+
}
258+
time.Sleep(10 * time.Second) // Do not retry too fast, default is 500ms
259+
return retry.RetryableError(fmt.Errorf("failed to create stack: %v", err))
260+
default:
261+
d.SetId(strconv.FormatInt(stackID, 10))
251262
}
252-
d.SetId(strconv.FormatInt(readStack.ID, 10))
253-
default:
254-
d.SetId(strconv.FormatInt(stackID, 10))
263+
return nil
264+
})
265+
if err != nil {
266+
return diag.FromErr(err)
255267
}
256268

257269
if diag := ReadStack(ctx, d, meta); diag != nil {

internal/resources/cloud/resource_cloud_stack_test.go

Lines changed: 40 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package cloud_test
22

33
import (
44
"fmt"
5+
"regexp"
56
"strings"
67
"time"
78

@@ -25,29 +26,47 @@ func TestResourceStack_Basic(t *testing.T) {
2526
resourceName := GetRandomStackName(prefix)
2627
stackDescription := "This is a test stack"
2728

29+
firstStepChecks := resource.ComposeTestCheckFunc(
30+
testAccStackCheckExists("grafana_cloud_stack.test", &stack),
31+
resource.TestMatchResourceAttr("grafana_cloud_stack.test", "id", common.IDRegexp),
32+
resource.TestCheckResourceAttr("grafana_cloud_stack.test", "name", resourceName),
33+
resource.TestCheckResourceAttr("grafana_cloud_stack.test", "slug", resourceName),
34+
resource.TestCheckResourceAttr("grafana_cloud_stack.test", "status", "active"),
35+
resource.TestCheckResourceAttr("grafana_cloud_stack.test", "prometheus_remote_endpoint", "https://prometheus-prod-01-eu-west-0.grafana.net/api/prom"),
36+
resource.TestCheckResourceAttr("grafana_cloud_stack.test", "prometheus_remote_write_endpoint", "https://prometheus-prod-01-eu-west-0.grafana.net/api/prom/push"),
37+
resource.TestCheckResourceAttrSet("grafana_cloud_stack.test", "prometheus_user_id"),
38+
resource.TestCheckResourceAttrSet("grafana_cloud_stack.test", "alertmanager_user_id"),
39+
resource.TestCheckResourceAttrSet("grafana_cloud_stack.test", "logs_user_id"),
40+
resource.TestCheckResourceAttrSet("grafana_cloud_stack.test", "traces_user_id"),
41+
resource.TestCheckResourceAttrSet("grafana_cloud_stack.test", "graphite_user_id"),
42+
)
43+
2844
resource.ParallelTest(t, resource.TestCase{
2945
PreCheck: func() {
3046
testAccDeleteExistingStacks(t, prefix)
3147
},
3248
ProviderFactories: testutils.ProviderFactories,
3349
CheckDestroy: testAccStackCheckDestroy(&stack),
3450
Steps: []resource.TestStep{
51+
// Create a basic stack
3552
{
3653
Config: testAccStackConfigBasic(resourceName, resourceName),
37-
Check: resource.ComposeTestCheckFunc(
38-
testAccStackCheckExists("grafana_cloud_stack.test", &stack),
39-
resource.TestMatchResourceAttr("grafana_cloud_stack.test", "id", common.IDRegexp),
40-
resource.TestCheckResourceAttr("grafana_cloud_stack.test", "name", resourceName),
41-
resource.TestCheckResourceAttr("grafana_cloud_stack.test", "slug", resourceName),
42-
resource.TestCheckResourceAttr("grafana_cloud_stack.test", "status", "active"),
43-
resource.TestCheckResourceAttr("grafana_cloud_stack.test", "prometheus_remote_endpoint", "https://prometheus-prod-01-eu-west-0.grafana.net/api/prom"),
44-
resource.TestCheckResourceAttr("grafana_cloud_stack.test", "prometheus_remote_write_endpoint", "https://prometheus-prod-01-eu-west-0.grafana.net/api/prom/push"),
45-
resource.TestCheckResourceAttrSet("grafana_cloud_stack.test", "prometheus_user_id"),
46-
resource.TestCheckResourceAttrSet("grafana_cloud_stack.test", "alertmanager_user_id"),
47-
resource.TestCheckResourceAttrSet("grafana_cloud_stack.test", "logs_user_id"),
48-
resource.TestCheckResourceAttrSet("grafana_cloud_stack.test", "traces_user_id"),
49-
resource.TestCheckResourceAttrSet("grafana_cloud_stack.test", "graphite_user_id"),
50-
),
54+
Check: firstStepChecks,
55+
},
56+
// Check that we can't takeover a stack without importing it
57+
// The retrying logic for creation is very permissive,
58+
// but it shouldn't allow to apply an already existing stack on a new resource
59+
{
60+
Config: testAccStackConfigBasic(resourceName, resourceName) +
61+
testAccStackConfigBasicWithCustomResourceName(resourceName, resourceName, "test2"), // new stack with same name/slug
62+
ExpectError: regexp.MustCompile(fmt.Sprintf(".*a stack with the name '%s' already exists.*", resourceName)),
63+
},
64+
// Test that the stack is correctly recreated if it's tainted and reapplied
65+
// This is a special case because stack deletion is asynchronous
66+
{
67+
Config: testAccStackConfigBasic(resourceName, resourceName),
68+
Check: firstStepChecks,
69+
Taint: []string{"grafana_cloud_stack.test"},
5170
},
5271
{
5372
// Delete the stack outside of the test and make sure it is recreated
@@ -65,6 +84,7 @@ func TestResourceStack_Basic(t *testing.T) {
6584
resource.TestCheckResourceAttr("grafana_cloud_stack.test", "status", "active"),
6685
),
6786
},
87+
// Update the stack
6888
{
6989
Config: testAccStackConfigUpdate(resourceName+"new", resourceName, stackDescription),
7090
Check: resource.ComposeTestCheckFunc(
@@ -150,13 +170,17 @@ func testAccStackCheckDestroy(a *gapi.Stack) resource.TestCheckFunc {
150170
}
151171

152172
func testAccStackConfigBasic(name string, slug string) string {
173+
return testAccStackConfigBasicWithCustomResourceName(name, slug, "test")
174+
}
175+
176+
func testAccStackConfigBasicWithCustomResourceName(name string, slug string, resourceName string) string {
153177
return fmt.Sprintf(`
154-
resource "grafana_cloud_stack" "test" {
178+
resource "grafana_cloud_stack" "%s" {
155179
name = "%s"
156180
slug = "%s"
157181
region_slug = "eu"
158182
}
159-
`, name, slug)
183+
`, resourceName, name, slug)
160184
}
161185

162186
func testAccStackConfigUpdate(name string, slug string, description string) string {

0 commit comments

Comments
 (0)