chainguard-dev
diff --git a/‎docs/resources/tests.md‎
Lines changed: 26 additions & 0 deletions b/‎docs/resources/tests.md‎
Lines changed: 26 additions & 0 deletions
diff --git a/‎internal/provider/tests_resource.go‎
Lines changed: 142 additions & 7 deletions b/‎internal/provider/tests_resource.go‎
Lines changed: 142 additions & 7 deletions
diff --git a/‎internal/provider/tests_resource_test.go‎
Lines changed: 110 additions & 0 deletions b/‎internal/provider/tests_resource_test.go‎
Lines changed: 110 additions & 0 deletions
@@ -26,6 +26,7 @@ description: |-
 - `labels` (Map of String) Metadata to attach to the tests resource. Used for filtering and grouping.
 - `name` (String) The name of the test. If one is not provided, a random name will be generated.
 - `repo` (String) The target repository the provider will use for pushing/pulling dynamically built images, overriding provider config.
+- `retry` (Attributes) On failure, tears down the driver completely, creates a fresh one, and re-runs all tests from scratch. This gives each attempt a clean driver, but external side effects from previous attempts are not rolled back: pushed images, written files, cloud resources created outside the driver (e.g. IAM roles, DNS records), and any other out-of-band mutations will still exist. All per-test retry blocks also reset — every test runs from its first attempt on each resource-level retry. (see [below for nested schema](#nestedatt--retry))
 - `skipped` (Boolean) Whether or not the tests were skipped. This is set to true if the tests were skipped, and false otherwise.
 - `tests` (Attributes List) An ordered list of test suites to run (see [below for nested schema](#nestedatt--tests))
 - `timeout` (String) The maximum amount of time to wait for all tests to complete. This includes the time it takes to start and destroy the driver.
@@ -231,6 +232,18 @@ Optional:
 
 
 
+<a id="nestedatt--retry"></a>
+### Nested Schema for `retry`
+
+Required:
+
+- `attempts` (Number) Total number of attempts including the initial run. Must be >= 1.
+
+Optional:
+
+- `delay` (String) Delay between retry attempts as a Go duration string (e.g. "5s", "1m"). Defaults to 5s.
+
+
 <a id="nestedatt--tests"></a>
 ### Nested Schema for `tests`
 
@@ -246,6 +259,7 @@ Optional:
 - `content` (Attributes List) The content to use for the test (see [below for nested schema](#nestedatt--tests--content))
 - `envs` (Map of String) Environment variables to set on the test container. These will overwrite the environment variables set in the image's config on conflicts.
 - `on_failure` (List of String) Commands to run in the sandbox on test failure for diagnostic collection. Each command runs independently (best-effort); failures do not prevent subsequent commands from executing.
+- `retry` (Attributes) Re-runs this individual test within the same driver instance. Each retry launches a fresh test sandbox container, but all driver-level state persists: for Kubernetes-based drivers (k3s_in_docker, EKS, AKS) this means the cluster, namespace, RBAC, secrets, and any objects created by previous attempts are still present. For EC2, the instance filesystem and Docker daemon state carry over. Tests must be idempotent — use create-or-update patterns, unique names, or explicit cleanup to avoid conflicts with leftover state from failed attempts. (see [below for nested schema](#nestedatt--tests--retry))
 - `timeout` (String) The maximum amount of time to wait for the individual test to complete. This is encompassed by the overall timeout of the parent tests resource.
 
 <a id="nestedatt--tests--artifact"></a>
@@ -267,3 +281,15 @@ Required:
 Optional:
 
 - `target` (String) The target path to use for the test
+
+
+<a id="nestedatt--tests--retry"></a>
+### Nested Schema for `tests.retry`
+
+Required:
+
+- `attempts` (Number) Total number of attempts including the initial run. Must be >= 1.
+
+Optional:
+
+- `delay` (String) Delay between retry attempts as a Go duration string (e.g. "5s", "1m"). Defaults to 5s.
@@ -8,6 +8,7 @@ import (
 	"maps"
 	"net/url"
 	"os"
+	"strconv"
 	"strings"
 	"time"
 
@@ -18,6 +19,7 @@ import (
 	internallog "github.com/chainguard-dev/terraform-provider-imagetest/internal/log"
 	"github.com/chainguard-dev/terraform-provider-imagetest/internal/o11y"
 	"github.com/chainguard-dev/terraform-provider-imagetest/internal/provider/framework"
+	"github.com/chainguard-dev/terraform-provider-imagetest/internal/retry"
 	"github.com/chainguard-dev/terraform-provider-imagetest/internal/skip"
 	"github.com/google/go-containerregistry/pkg/name"
 	v1 "github.com/google/go-containerregistry/pkg/v1"
@@ -77,6 +79,7 @@ type TestsResourceModel struct {
 	Labels       map[string]string          `tfsdk:"labels"`
 	Skipped      types.Bool                 `tfsdk:"skipped"`
 	RepoOverride types.String               `tfsdk:"repo"`
+	Retry        *RetryResourceModel        `tfsdk:"retry"`
 }
 
 type TestsImageResource map[string]string
@@ -114,6 +117,41 @@ type TestResourceModel struct {
 	Timeout   types.String               `tfsdk:"timeout"`
 	Artifact  types.Object               `tfsdk:"artifact"`
 	OnFailure []string                   `tfsdk:"on_failure"`
+	Retry     *RetryResourceModel        `tfsdk:"retry"`
+}
+
+// RetryResourceModel is the Terraform model for a `retry` block. The same
+// model backs both the resource-level block and the per-test block.
+//
+// Attempts is the total number of attempts including the initial run, and
+// Delay is an optional Go duration string for the pause between attempts.
+type RetryResourceModel struct {
+	Attempts types.Int64  `tfsdk:"attempts"`
+	Delay    types.String `tfsdk:"delay"`
+}
+
+// config converts the retry block into a retry.Config.
+//
+// A nil receiver (no retry block configured) yields the zero retry.Config and
+// no diagnostics; the environment override below is not consulted in that
+// case. Otherwise:
+//
+//   - Attempts comes from the block, unless IMAGETEST_RETRY_ATTEMPTS is set to
+//     a valid integer, in which case that value wins. An unparsable value is
+//     ignored and reported as a warning diagnostic.
+//   - Attempts below 1 are raised to 1.
+//   - Delay defaults to 5s. A configured, non-empty delay must parse with
+//     time.ParseDuration and must not be negative.
+//
+// When an error diagnostic is returned the retry.Config is the zero value and
+// must not be used.
+func (r *RetryResourceModel) config() (retry.Config, diag.Diagnostics) {
+	if r == nil {
+		return retry.Config{}, nil
+	}
+
+	var diags diag.Diagnostics
+	cfg := retry.Config{
+		Attempts: int(r.Attempts.ValueInt64()),
+		Delay:    5 * time.Second,
+	}
+
+	if v := os.Getenv("IMAGETEST_RETRY_ATTEMPTS"); v != "" {
+		n, err := strconv.Atoi(v)
+		if err != nil {
+			// Keep the configured attempts, but do not hide the typo from the
+			// operator who set the override.
+			diags = append(diags, diag.NewWarningDiagnostic(
+				"ignoring invalid IMAGETEST_RETRY_ATTEMPTS",
+				fmt.Sprintf("failed to parse %q as an integer, using the configured attempts instead: %s", v, err),
+			))
+		} else {
+			cfg.Attempts = n
+		}
+	}
+	if cfg.Attempts < 1 {
+		cfg.Attempts = 1
+	}
+
+	if !r.Delay.IsNull() && r.Delay.ValueString() != "" {
+		raw := r.Delay.ValueString()
+		d, err := time.ParseDuration(raw)
+		if err != nil {
+			return retry.Config{}, append(diags, diag.NewErrorDiagnostic(
+				"invalid retry delay",
+				fmt.Sprintf("failed to parse delay %q: %s", raw, err),
+			))
+		}
+		// time.ParseDuration accepts signed values such as "-5s"; a negative
+		// pause between attempts is meaningless, so reject it up front.
+		if d < 0 {
+			return retry.Config{}, append(diags, diag.NewErrorDiagnostic(
+				"invalid retry delay",
+				fmt.Sprintf("delay %q must not be negative", raw),
+			))
+		}
+		cfg.Delay = d
+	}
+
+	return cfg, diags
 }
 
 type TestContentResourceModel struct {
@@ -206,6 +244,11 @@ func (t *TestsResource) Schema(ctx context.Context, req resource.SchemaRequest,
 							Optional:    true,
 							ElementType: types.StringType,
 						},
+						"retry": retrySchema("Re-runs this individual test within the same driver instance. " +
+							"Each retry launches a fresh test sandbox container, but all driver-level state persists: " +
+							"for Kubernetes-based drivers (k3s_in_docker, EKS, AKS) this means the cluster, namespace, RBAC, secrets, and any objects created by previous attempts are still present. " +
+							"For EC2, the instance filesystem and Docker daemon state carry over. " +
+							"Tests must be idempotent — use create-or-update patterns, unique names, or explicit cleanup to avoid conflicts with leftover state from failed attempts."),
 						"artifact": schema.SingleNestedAttribute{
 							Description: "The bundled artifact generated by the test.",
 							Optional:    true,
@@ -240,6 +283,27 @@ func (t *TestsResource) Schema(ctx context.Context, req resource.SchemaRequest,
 				Optional:    true,
 				Computed:    true,
 			},
+			"retry": retrySchema("On failure, tears down the driver completely, creates a fresh one, and re-runs all tests from scratch. " +
+				"This gives each attempt a clean driver, but external side effects from previous attempts are not rolled back: " +
+				"pushed images, written files, cloud resources created outside the driver (e.g. IAM roles, DNS records), and any other out-of-band mutations will still exist. " +
+				"All per-test retry blocks also reset — every test runs from its first attempt on each resource-level retry."),
+		},
+	}
+}
+
+// retrySchema builds the optional `retry` nested attribute, using the given
+// text as the block's description. The block has a required integer
+// `attempts` and an optional string `delay`.
+func retrySchema(description string) schema.SingleNestedAttribute {
+	attempts := schema.Int64Attribute{
+		Required:    true,
+		Description: "Total number of attempts including the initial run. Must be >= 1.",
+	}
+	delay := schema.StringAttribute{
+		Optional:    true,
+		Description: "Delay between retry attempts as a Go duration string (e.g. \"5s\", \"1m\"). Defaults to 5s.",
+	}
+
+	return schema.SingleNestedAttribute{
+		Optional:    true,
+		Description: description,
+		Attributes: map[string]schema.Attribute{
+			"attempts": attempts,
+			"delay":    delay,
 		},
 	}
 }
@@ -380,17 +444,13 @@ func (t *TestsResource) do(ctx context.Context, data *TestsResourceModel) (ds di
 		return []diag.Diagnostic{diag.NewErrorDiagnostic("failed to create target repository", err.Error())}
 	}
 
-	tracer := otel.Tracer("imagetest")
-
+	// Build test images once — refs are digest-based and stable across retries.
 	trefs, buildDiags := t.buildTestImages(ctx, data, trepo, imgsResolvedData, id)
 	if buildDiags.HasError() {
 		return buildDiags
 	}
 
-	dr, err := t.LoadDriver(ctx, data)
-	if err != nil {
-		return []diag.Diagnostic{diag.NewErrorDiagnostic("failed to load driver", err.Error())}
-	}
+	tracer := otel.Tracer("imagetest")
 
 	ctx, suiteSpan := tracer.Start(ctx, "imagetest.suite",
 		trace.WithAttributes(
@@ -412,6 +472,51 @@ func (t *TestsResource) do(ctx context.Context, data *TestsResourceModel) (ds di
 		suiteSpan.End()
 	}()
 
+	// Resource-level retry: on failure, tear down the driver, create a fresh
+	// one, and re-run all tests from scratch.
+	retryCfg, cfgDiags := data.Retry.config()
+	if cfgDiags.HasError() {
+		return cfgDiags
+	}
+
+	result := retry.Do(ctx, retryCfg, func(ctx context.Context, attempt int) error {
+		if attempt > 1 {
+			suiteSpan.AddEvent("retry", trace.WithAttributes(
+				attribute.Int("test.attempt", attempt),
+			))
+		}
+
+		ds = t.doAttempt(ctx, data, trefs, tracer)
+		if ds.HasError() {
+			return fmt.Errorf("%s", ds[len(ds)-1].Detail())
+		}
+		return nil
+	})
+
+	if result.Retried {
+		suiteSpan.SetAttributes(
+			attribute.Int("test.attempts", result.Attempts),
+			attribute.Bool("test.retried", true),
+		)
+		if !ds.HasError() {
+			ds = append(ds, diag.NewWarningDiagnostic(
+				fmt.Sprintf("tests passed after retry (attempt %d/%d)", result.Attempts, retryCfg.Attempts),
+				fmt.Sprintf("previous attempt failed: %s", result.LastError),
+			))
+		}
+	}
+
+	return ds
+}
+
+// doAttempt runs a single attempt of the full driver lifecycle: load → setup →
+// run tests → teardown. Each resource-level retry calls this with a fresh driver.
+func (t *TestsResource) doAttempt(ctx context.Context, data *TestsResourceModel, trefs []name.Reference, tracer trace.Tracer) (ds diag.Diagnostics) {
+	dr, err := t.LoadDriver(ctx, data)
+	if err != nil {
+		return []diag.Diagnostic{diag.NewErrorDiagnostic("failed to load driver", err.Error())}
+	}
+
 	defer func() {
 		ctx, teardownSpan := tracer.Start(ctx, "imagetest.teardown",
 			trace.WithAttributes(
@@ -443,7 +548,7 @@ func (t *TestsResource) do(ctx context.Context, data *TestsResourceModel) (ds di
 	setupSpan.End()
 
 	for i, tref := range trefs {
-		ds.Append(t.doTest(ctx, dr, data.Tests[i], tref)...)
+		ds.Append(t.doTestWithRetry(ctx, dr, data.Tests[i], tref)...)
 		if ds.HasError() {
 			return ds
 		}
@@ -452,6 +557,36 @@ func (t *TestsResource) do(ctx context.Context, data *TestsResourceModel) (ds di
 	return ds
 }
 
+// doTestWithRetry wraps doTest with per-test retry. Each retry re-runs doTest
+// against the same driver d — the test author asserts idempotency.
+//
+// The retry settings come from test.Retry. If they are invalid the error
+// diagnostics are returned and the test is not run. With one attempt (or no
+// retry block) doTest is called exactly once. Otherwise doTest is handed to
+// retry.Do, and the diagnostics of the most recent attempt are returned;
+// diagnostics from earlier attempts are discarded. When the retry result
+// reports a retry and the final attempt has no error, a warning diagnostic
+// naming the test and the reported previous error is appended.
+//
+// Non-error diagnostics produced while reading the retry settings are
+// prepended to the returned diagnostics.
+func (t *TestsResource) doTestWithRetry(ctx context.Context, d drivers.Tester, test *TestResourceModel, ref name.Reference) diag.Diagnostics {
+	cfg, cfgDiags := test.Retry.config()
+	if cfgDiags.HasError() {
+		return cfgDiags
+	}
+	if cfg.Attempts <= 1 {
+		return append(cfgDiags, t.doTest(ctx, d, test, ref)...)
+	}
+
+	var lastDiags diag.Diagnostics
+	result := retry.Do(ctx, cfg, func(ctx context.Context, attempt int) error {
+		lastDiags = t.doTest(ctx, d, test, ref)
+		if !lastDiags.HasError() {
+			return nil
+		}
+		// The final diagnostic is not necessarily the failure: a warning may
+		// follow the error. Report the last error-severity entry instead.
+		errs := lastDiags.Errors()
+		return fmt.Errorf("%s", errs[len(errs)-1].Detail())
+	})
+
+	if result.Retried && !lastDiags.HasError() {
+		lastDiags = append(lastDiags, diag.NewWarningDiagnostic(
+			fmt.Sprintf("test %q passed after retry (attempt %d/%d)", test.Name.ValueString(), result.Attempts, cfg.Attempts),
+			fmt.Sprintf("previous attempt failed: %s", result.LastError),
+		))
+	}
+
+	return append(cfgDiags, lastDiags...)
+}
+
 func (t *TestsResource) doTest(ctx context.Context, d drivers.Tester, test *TestResourceModel, ref name.Reference) diag.Diagnostics {
 	// Get the test_id from context
 	testID, ok := ctx.Value(contextKeyResourceTestID).(string)
 
@@ -296,6 +296,116 @@ resource "imagetest_tests" "foo" {
 				Check: checkArtifact(t),
 			},
 		},
+		// Per-test retry on a passing test: retry block is accepted, test still passes.
+		"dockerindocker-per-test-retry-passes": {
+			{
+				Config: fmt.Sprintf(`
+resource "imagetest_tests" "foo" {
+  name   = "dind-per-test-retry-passes"
+  driver = "docker_in_docker"
+
+  images = {
+    foo = "cgr.dev/chainguard/busybox:latest@sha256:c546e746013d75c1fc9bf01b7a645ce7caa1ec46c45cb618c6e28d7b57bccc85"
+  }
+
+  tests = [
+    {
+      name    = "sample"
+      image   = "cgr.dev/chainguard/busybox:latest"
+      content = [{ source = "${path.module}/testdata/TestAccTestsResource" }]
+      cmd     = "./%[1]s"
+      retry   = { attempts = 3, delay = "1s" }
+    }
+  ]
+
+  timeout = "5m"
+}
+        `, "docker-in-docker-basic.sh"),
+			},
+		},
+		// Per-test retry on a failing test: all attempts exhausted, error surfaces.
+		"dockerindocker-per-test-retry-exhausted": {
+			{
+				Config: fmt.Sprintf(`
+resource "imagetest_tests" "foo" {
+  name   = "dind-per-test-retry-exhausted"
+  driver = "docker_in_docker"
+
+  images = {
+    foo = "cgr.dev/chainguard/busybox:latest@sha256:c546e746013d75c1fc9bf01b7a645ce7caa1ec46c45cb618c6e28d7b57bccc85"
+  }
+
+  tests = [
+    {
+      name    = "sample"
+      image   = "cgr.dev/chainguard/busybox:latest"
+      content = [{ source = "${path.module}/testdata/TestAccTestsResource" }]
+      cmd     = "./%[1]s"
+      retry   = { attempts = 2, delay = "1s" }
+    }
+  ]
+
+  timeout = "5m"
+}
+        `, "docker-in-docker-fails.sh"),
+				ExpectError: regexp.MustCompile(`.*can't open 'imalittleteapot'.*`),
+			},
+		},
+		// Resource-level retry on a passing test: retry block is accepted, test still passes.
+		"dockerindocker-resource-retry-passes": {
+			{
+				Config: fmt.Sprintf(`
+resource "imagetest_tests" "foo" {
+  name   = "dind-resource-retry-passes"
+  driver = "docker_in_docker"
+
+  images = {
+    foo = "cgr.dev/chainguard/busybox:latest@sha256:c546e746013d75c1fc9bf01b7a645ce7caa1ec46c45cb618c6e28d7b57bccc85"
+  }
+
+  tests = [
+    {
+      name    = "sample"
+      image   = "cgr.dev/chainguard/busybox:latest"
+      content = [{ source = "${path.module}/testdata/TestAccTestsResource" }]
+      cmd     = "./%[1]s"
+    }
+  ]
+
+  retry   = { attempts = 2, delay = "1s" }
+  timeout = "5m"
+}
+        `, "docker-in-docker-basic.sh"),
+			},
+		},
+		// Resource-level retry on a failing test: all attempts exhausted, error surfaces.
+		"dockerindocker-resource-retry-exhausted": {
+			{
+				Config: fmt.Sprintf(`
+resource "imagetest_tests" "foo" {
+  name   = "dind-resource-retry-exhausted"
+  driver = "docker_in_docker"
+
+  images = {
+    foo = "cgr.dev/chainguard/busybox:latest@sha256:c546e746013d75c1fc9bf01b7a645ce7caa1ec46c45cb618c6e28d7b57bccc85"
+  }
+
+  tests = [
+    {
+      name    = "sample"
+      image   = "cgr.dev/chainguard/busybox:latest"
+      content = [{ source = "${path.module}/testdata/TestAccTestsResource" }]
+      cmd     = "./%[1]s"
+    }
+  ]
+
+  retry   = { attempts = 2, delay = "1s" }
+  timeout = "5m"
+}
+        `, "docker-in-docker-fails.sh"),
+				ExpectError: regexp.MustCompile(`.*can't open 'imalittleteapot'.*`),
+			},
+		},
 	}
 
 	for name, tc := range testCases {