Merge #147835

craig[bot] · herkolategan · craig[bot] · commit 8e52c9687fb1 · 2025-06-17T09:01:28.000Z
147835: roachprod: add VM reset to failure injection framework r=DarrylWong,golgeek a=herkolategan This PR adds VM reset functionality to the failure injection framework, allowing for controlled node resets during testing. 1. Updated `roachprod` reset command to operate on a subset of nodes instead of all nodes 2. Implemented a new reset-vm failure mode that: - Captures running processes before reset - Resets specified nodes - Restarts processes after reset 3. Added smoke tests to validate the reset failure mode Fixes: #147361 Epic: None Release note: None Co-authored-by: Herko Lategan <herko@cockroachlabs.com>
diff --git a/pkg/cmd/roachprod/cli/commands.go b/pkg/cmd/roachprod/cli/commands.go
@@ -199,15 +199,17 @@ Removing nodes from the middle of the cluster is not supported yet.
 }
 
 func (cr *commandRegistry) buildResetCmd() *cobra.Command {
-	return &cobra.Command{
+	resetCmd := &cobra.Command{
 		Use:   "reset <cluster>",
-		Short: "reset *all* VMs in a cluster",
-		Long:  `Reset a cloud VM.`,
+		Short: "reset VMs in a cluster",
+		Long:  `Reset cloud VMs in a cluster.`,
 		Args:  cobra.ExactArgs(1),
 		Run: wrap(func(cmd *cobra.Command, args []string) (retErr error) {
 			return roachprod.Reset(config.Logger, args[0])
 		}),
 	}
+	addHelpAboutNodes(resetCmd)
+	return resetCmd
 }
 
 func (cr *commandRegistry) buildDestroyCmd() *cobra.Command {
diff --git a/pkg/cmd/roachtest/cluster.go b/pkg/cmd/roachtest/cluster.go
@@ -1852,6 +1852,12 @@ func (c *clusterImpl) doDestroy(ctx context.Context, l *logger.Logger) <-chan st
 	return ch
 }
 
+// Reset passes the node selector produced by c.MakeNodes(nodes) to
+// roachprod.Reset and returns its error. The ctx argument is not used.
+func (c *clusterImpl) Reset(
+	ctx context.Context, l *logger.Logger, nodes option.NodeListOption,
+) error {
+	target := c.MakeNodes(nodes)
+	return roachprod.Reset(l, target)
+}
+
 func (c *clusterImpl) addLabels(labels map[string]string) error {
 	// N.B. we must sanitize the values; e.g., some test names can exceed the maximum length (63 chars in GCE).
 	// N.B. we don't sanitize the keys; unlike values, they are typically _not_ (dynamically) generated.
diff --git a/pkg/cmd/roachtest/cluster/cluster_interface.go b/pkg/cmd/roachtest/cluster/cluster_interface.go
@@ -160,6 +160,10 @@ type Cluster interface {
 	) error
 	PopulateEtcHosts(ctx context.Context, l *logger.Logger) error
 
+	// VM management
+
+	Reset(ctx context.Context, l *logger.Logger, nodes option.NodeListOption) error
+
 	// Methods whose inclusion on this interface is purely historical.
 	// These should be removed over time.
 
diff --git a/pkg/cmd/roachtest/clusterstats/mock_cluster_generated_test.go b/pkg/cmd/roachtest/clusterstats/mock_cluster_generated_test.go
diff --git a/pkg/cmd/roachtest/tests/failure_injection.go b/pkg/cmd/roachtest/tests/failure_injection.go
@@ -726,6 +726,48 @@ var processKillTests = func(c cluster.Cluster) []failureSmokeTest {
 	return tests
 }
 
+// resetVMTests builds the smoke test for the reset-vm failure mode. One CRDB
+// node, chosen via SeededRandNode, is the target of the reset.
+var resetVMTests = func(c cluster.Cluster) failureSmokeTest {
+	rng, _ := randutil.NewPseudoRand()
+	rebootedNode := c.CRDBNodes().SeededRandNode(rng)
+
+	// Check that we aren't able to establish a SQL connection to the rebooted node.
+	// waitForFailureToPropagate already does a similar check, but we do it here
+	// to satisfy the smoke test framework since this is a fairly simple failure
+	// mode with less to validate.
+	validateFailure := func(
+		ctx context.Context, l *logger.Logger, c cluster.Cluster, f *failures.Failer,
+	) error {
+		return testutils.SucceedsSoonError(func() error {
+			if ctxErr := ctx.Err(); ctxErr != nil {
+				return ctxErr
+			}
+
+			killedDB, connErr := c.ConnE(ctx, l, rebootedNode[0])
+			if connErr != nil {
+				// Not being able to connect is the outcome we are looking for.
+				l.Printf("unable to establish SQL connection to node %d", rebootedNode)
+				return nil
+			}
+			defer killedDB.Close()
+
+			pingErr := killedDB.Ping()
+			if pingErr == nil {
+				return errors.Errorf("expected node %d to be dead, but it is alive", rebootedNode)
+			}
+			l.Printf("failed to connect to node %d: %v", rebootedNode, pingErr)
+			return nil
+		})
+	}
+
+	return failureSmokeTest{
+		testName:    failures.ResetVMFailureName,
+		failureName: failures.ResetVMFailureName,
+		args: failures.ResetVMArgs{
+			Nodes: rebootedNode.InstallNodes(),
+		},
+		validateFailure: validateFailure,
+		// Nothing extra is validated once the failure has been recovered.
+		validateRecover: func(ctx context.Context, l *logger.Logger, c cluster.Cluster, f *failures.Failer) error {
+			return nil
+		},
+		workload: func(ctx context.Context, c cluster.Cluster, args ...string) error {
+			return defaultFailureSmokeTestWorkload(ctx, c, "--tolerate-errors")
+		},
+	}
+}
+
 func defaultFailureSmokeTestWorkload(ctx context.Context, c cluster.Cluster, args ...string) error {
 	workloadArgs := strings.Join(args, " ")
 	cmd := roachtestutil.NewCommand("./cockroach workload run kv %s", workloadArgs).
@@ -772,6 +814,7 @@ func runFailureSmokeTest(ctx context.Context, t test.Test, c cluster.Cluster, no
 		asymmetricOutgoingNetworkPartitionTest(c),
 		latencyTest(c),
 		dmsetupDiskStallTest(c),
+		resetVMTests(c),
 	}
 	failureSmokeTests = append(failureSmokeTests, cgroupsDiskStallTests(c)...)
 	failureSmokeTests = append(failureSmokeTests, processKillTests(c)...)
diff --git a/pkg/roachprod/failureinjection/failures/BUILD.bazel b/pkg/roachprod/failureinjection/failures/BUILD.bazel
@@ -11,6 +11,7 @@ go_library(
         "noop.go",
         "process_kill.go",
         "registry.go",
+        "reset.go",
     ],
     importpath = "github.com/cockroachdb/cockroach/pkg/roachprod/failureinjection/failures",
     visibility = ["//visibility:public"],
diff --git a/pkg/roachprod/failureinjection/failures/registry.go b/pkg/roachprod/failureinjection/failures/registry.go
@@ -33,6 +33,7 @@ func (r *FailureRegistry) Register() {
 	registerDmsetupDiskStall(r)
 	registerIPTablesPartitionFailure(r)
 	registerNetworkLatencyFailure(r)
+	registerResetVM(r)
 	registerNoopFailure(r)
 	registerProcessKillFailure(r)
 }
diff --git a/pkg/roachprod/failureinjection/failures/reset.go b/pkg/roachprod/failureinjection/failures/reset.go
@@ -0,0 +1,121 @@
+// Copyright 2025 The Cockroach Authors.
+//
+// Use of this software is governed by the CockroachDB Software License
+// included in the /LICENSE file.
+
+package failures
+
+import (
+	"context"
+	"time"
+
+	"github.com/cockroachdb/cockroach/pkg/roachprod"
+	"github.com/cockroachdb/cockroach/pkg/roachprod/install"
+	"github.com/cockroachdb/cockroach/pkg/roachprod/logger"
+)
+
+// ResetVMArgs holds the arguments of the reset-vm failure mode.
+type ResetVMArgs struct {
+	// Nodes lists the nodes that are monitored and then reset by Inject.
+	Nodes install.Nodes
+}
+
+// resetVMFailure is the FailureMode built by MakeResetVMFailure.
+type resetVMFailure struct {
+	GenericFailure
+	// Processes records, per node, the processes that Inject saw running
+	// before the reset; Recover starts them again.
+	Processes map[install.Node][]install.MonitorProcessRunning
+}
+
+// Compile-time check that *resetVMFailure satisfies FailureMode.
+var _ FailureMode = (*resetVMFailure)(nil)
+
+// ResetVMFailureName is the name under which the VM reset failure mode is
+// registered.
+const ResetVMFailureName = "reset-vm"
+
+func registerResetVM(r *FailureRegistry) {
+	r.add(ResetVMFailureName, ResetVMArgs{}, MakeResetVMFailure)
+}
+
+// MakeResetVMFailure looks up clusterName in the roachprod cluster cache
+// (honoring the secure setting) and returns a reset-vm FailureMode bound to
+// that cluster. The lookup error, if any, is returned unchanged.
+func MakeResetVMFailure(clusterName string, l *logger.Logger, secure bool) (FailureMode, error) {
+	cached, err := roachprod.GetClusterFromCache(l, clusterName, install.SecureOption(secure))
+	if err != nil {
+		return nil, err
+	}
+
+	failure := &resetVMFailure{}
+	failure.c = cached
+	return failure, nil
+}
+
+// Description implements FailureMode; it returns ResetVMFailureName ("reset-vm").
+func (r *resetVMFailure) Description() string {
+	return ResetVMFailureName
+}
+
+// Setup implements FailureMode. It does nothing and always returns nil.
+func (r *resetVMFailure) Setup(ctx context.Context, l *logger.Logger, args FailureArgs) error {
+	return nil
+}
+
+// Inject implements FailureMode. It first records, per node, every process
+// that a one-shot monitor reports as running on the target nodes, so that
+// Recover can start them again, and then resets those nodes. args must be a
+// ResetVMArgs; any other type makes the type assertion panic.
+func (r *resetVMFailure) Inject(ctx context.Context, l *logger.Logger, args FailureArgs) error {
+	targets := args.(ResetVMArgs).Nodes
+
+	// Capture the processes running on the nodes.
+	events := r.c.WithNodes(targets).Monitor(l, ctx, install.MonitorOpts{OneShot: true})
+	r.Processes = make(map[install.Node][]install.MonitorProcessRunning, 0)
+	for ev := range events {
+		proc, isRunning := ev.Event.(install.MonitorProcessRunning)
+		if !isRunning {
+			// Every other kind of monitor event is ignored.
+			continue
+		}
+		r.Processes[ev.Node] = append(r.Processes[ev.Node], proc)
+	}
+
+	return r.c.WithNodes(targets).Reset(l)
+}
+
+// Cleanup implements FailureMode. It does nothing and always returns nil.
+func (r *resetVMFailure) Cleanup(ctx context.Context, l *logger.Logger, args FailureArgs) error {
+	return nil
+}
+
+// Recover implements FailureMode. It restarts every process that Inject
+// recorded in r.Processes, issuing one Start call per process on the node it
+// was seen on, and returns the first Start error encountered. The args
+// argument is not used.
+func (r *resetVMFailure) Recover(ctx context.Context, l *logger.Logger, args FailureArgs) error {
+	// Restart the processes.
+	for node, processes := range r.Processes {
+		for _, p := range processes {
+			// Report the node itself; the virtual cluster name is a separate
+			// piece of information and must not stand in for the node.
+			l.Printf(
+				"Starting process %s (virtual cluster %q) on node %d",
+				p.PID, p.VirtualClusterName, node,
+			)
+			err := r.c.WithNodes([]install.Node{node}).Start(ctx, l, install.StartOpts{
+				VirtualClusterName: p.VirtualClusterName,
+				SQLInstance:        p.SQLInstance,
+				IsRestart:          true,
+			})
+			if err != nil {
+				return err
+			}
+		}
+	}
+	return nil
+}
+
+// WaitForFailureToPropagate implements FailureMode. For each target node it
+// waits up to 15 minutes for WaitForSQLUnavailable to succeed.
+func (r *resetVMFailure) WaitForFailureToPropagate(
+	ctx context.Context, l *logger.Logger, args FailureArgs,
+) error {
+	targets := args.(ResetVMArgs).Nodes
+	l.Printf("Waiting for nodes to become unavailable: %v", targets)
+
+	// Some providers take a while to stop VMs (>10 minutes).
+	waitUnavailable := func(n install.Nodes) error {
+		return r.WaitForSQLUnavailable(ctx, l, n, 15*time.Minute)
+	}
+	return forEachNode(targets, waitUnavailable)
+}
+
+// WaitForFailureToRecover implements FailureMode. For each target node it
+// waits up to 15 minutes for WaitForSQLReady to succeed.
+func (r *resetVMFailure) WaitForFailureToRecover(
+	ctx context.Context, l *logger.Logger, args FailureArgs,
+) error {
+	targets := args.(ResetVMArgs).Nodes
+	l.Printf("Waiting for nodes to become available: %v", targets)
+
+	// Some providers take a while to start VMs (>10 minutes).
+	waitReady := func(n install.Nodes) error {
+		return r.WaitForSQLReady(ctx, l, n, 15*time.Minute)
+	}
+	return forEachNode(targets, waitReady)
+}
diff --git a/pkg/roachprod/install/cluster_synced.go b/pkg/roachprod/install/cluster_synced.go
@@ -2725,3 +2725,20 @@ done <<< "$HOSTS_LIST"
 
 	return nil
 }
+
+// Reset resets the VMs behind the cluster's target nodes, grouping them per
+// provider through vm.FanOut. It returns nil without doing anything when the
+// cluster is local.
+func (c *SyncedCluster) Reset(l *logger.Logger) error {
+	if c.IsLocal() {
+		return nil
+	}
+
+	// c.VMs is indexed by node number minus one.
+	targets := c.TargetNodes()
+	selected := make(vm.List, 0, len(targets))
+	for _, n := range targets {
+		selected = append(selected, c.VMs[n-1])
+	}
+
+	resetVMs := func(p vm.Provider, vms vm.List) error {
+		return p.Reset(l, vms)
+	}
+	return vm.FanOut(selected, resetVMs)
+}
diff --git a/pkg/roachprod/roachprod.go b/pkg/roachprod/roachprod.go
@@ -626,24 +626,14 @@ func Stage(
 	return install.StageApplication(ctx, l, c, applicationName, version, os, vm.CPUArch(arch), dir)
 }
 
-// Reset resets all VMs in a cluster.
+// Reset resets VMs in a cluster.
 func Reset(l *logger.Logger, clusterName string) error {
-	if err := LoadClusters(); err != nil {
-		return err
-	}
-
-	if config.IsLocalClusterName(clusterName) {
-		return nil
-	}
-
-	c, err := getClusterFromCloud(l, clusterName)
+	c, err := GetClusterFromCache(l, clusterName)
 	if err != nil {
 		return err
 	}
 
-	return vm.FanOut(c.VMs, func(p vm.Provider, vms vm.List) error {
-		return p.Reset(l, vms)
-	})
+	return c.Reset(l)
 }
 
 // SetupSSH sets up the keys and host keys for the vms in the cluster.
diff --git a/pkg/roachprod/vm/ibm/provider.go b/pkg/roachprod/vm/ibm/provider.go
@@ -639,7 +639,7 @@ func (p *Provider) Reset(l *logger.Logger, vms vm.List) error {
 		g.Go(func() error {
 			_, _, err = vpcService.CreateInstanceAction(&vpcv1.CreateInstanceActionOptions{
 				InstanceID: instance.instance.ID,
-				Type:       core.StringPtr("reboot"),
+				Type:       core.StringPtr(vpcv1.CreateInstanceActionOptionsTypeRebootConst),
 				Force:      core.BoolPtr(true),
 			})
 			if err != nil {

Original file line number	Diff line number	Diff line change
`@@ -199,15 +199,17 @@ Removing nodes from the middle of the cluster is not supported yet.`
`199`	`199`	`}`
`200`	`200`
`201`	`201`	`func (cr *commandRegistry) buildResetCmd() *cobra.Command {`
`202`		`- return &cobra.Command{`
	`202`	`+ resetCmd := &cobra.Command{`
`203`	`203`	`Use: "reset <cluster>",`
`204`		`- Short: "reset *all* VMs in a cluster",`
`205`		- Long: `Reset a cloud VM.`,
	`204`	`+ Short: "reset VMs in a cluster",`
	`205`	+ Long: `Reset cloud VMs in a cluster.`,
`206`	`206`	`Args: cobra.ExactArgs(1),`
`207`	`207`	`Run: wrap(func(cmd *cobra.Command, args []string) (retErr error) {`
`208`	`208`	`return roachprod.Reset(config.Logger, args[0])`
`209`	`209`	`}),`
`210`	`210`	`}`
	`211`	`+ addHelpAboutNodes(resetCmd)`
	`212`	`+ return resetCmd`
`211`	`213`	`}`
`212`	`214`
`213`	`215`	`func (cr *commandRegistry) buildDestroyCmd() *cobra.Command {`
Original file line number	Diff line number	Diff line change
`@@ -33,6 +33,7 @@ func (r *FailureRegistry) Register() {`
`33`	`33`	`registerDmsetupDiskStall(r)`
`34`	`34`	`registerIPTablesPartitionFailure(r)`
`35`	`35`	`registerNetworkLatencyFailure(r)`
	`36`	`+ registerResetVM(r)`
`36`	`37`	`registerNoopFailure(r)`
`37`	`38`	`registerProcessKillFailure(r)`
`38`	`39`	`}`