Merge #151053

craig[bot] · nameisbhaskar · craig[bot] · commit 585a797eb9ab · 2025-08-01T07:32:22.000Z
151053: drtprod: add wait_before and wait_after yaml support r=shailendra-patel a=nameisbhaskar

Currently, a step in the YAML configuration has a single wait parameter. This parameter is ambiguous, because the wait could take place either before or after the command runs. This PR replaces it with two explicit parameters: wait_before and wait_after.

Epic: None
Release note: None

Co-authored-by: Bhaskarjyoti Bora <bhaskar.bora@cockroachlabs.com>
diff --git a/pkg/cmd/drtprod/cli/commands/yamlprocessor.go b/pkg/cmd/drtprod/cli/commands/yamlprocessor.go
@@ -118,7 +118,8 @@ type step struct {
 	Flags             map[string]interface{} `yaml:"flags"`               // Flags to pass to the command or script
 	ContinueOnFailure bool                   `yaml:"continue_on_failure"` // Whether to continue on failure
 	OnRollback        []step                 `yaml:"on_rollback"`         // Steps to execute if rollback is needed
-	Wait              int                    `yaml:"wait"`                // Wait time in seconds before executing the next step
+	WaitBefore        int                    `yaml:"wait_before"`         // Wait time in seconds before executing the step
+	WaitAfter         int                    `yaml:"wait_after"`          // Wait time in seconds after executing the step
 }
 
 // target defines a target cluster with associated steps to be executed.
@@ -143,7 +144,8 @@ type command struct {
 	args              []string   // Command arguments
 	continueOnFailure bool       // Whether to continue on failure
 	rollbackCmds      []*command // Rollback commands to execute in case of failure
-	wait              int        // Wait time in seconds before executing the next step
+	waitAfter         int        // Wait time in seconds after executing the command
+	waitBefore        int        // Wait time in seconds before executing the command
 }
 
 // String returns the command as a string for easy printing.
@@ -550,6 +552,10 @@ func executeCommands(ctx context.Context, logPrefix string, cmds []*command) err
 	}()
 
 	for _, cmd := range cmds {
+		if cmd.waitBefore > 0 {
+			fmt.Printf("[%s] Waiting for %d seconds\n", logPrefix, cmd.waitBefore)
+			time.Sleep(time.Duration(cmd.waitBefore) * time.Second)
+		}
 		fmt.Printf("[%s] Starting <%v>\n", logPrefix, cmd)
 		err := commandExecutor(ctx, logPrefix, cmd.name, cmd.args...)
 		if err != nil {
@@ -561,9 +567,9 @@ func executeCommands(ctx context.Context, logPrefix string, cmds []*command) err
 			fmt.Printf("[%s] Failed <%v>, Error Ignored: %v\n", logPrefix, cmd, err)
 		} else {
 			fmt.Printf("[%s] Completed <%v>\n", logPrefix, cmd)
-			if cmd.wait > 0 {
-				fmt.Printf("[%s] Waiting for %d seconds\n", logPrefix, cmd.wait)
-				time.Sleep(time.Duration(cmd.wait) * time.Second)
+			if cmd.waitAfter > 0 {
+				fmt.Printf("[%s] Waiting for %d seconds\n", logPrefix, cmd.waitAfter)
+				time.Sleep(time.Duration(cmd.waitAfter) * time.Second)
 			}
 		}
 
@@ -622,7 +628,8 @@ func generateStepCmd(clusterName string, s step) (*command, error) {
 			return nil, err
 		}
 	}
-	cmd.wait = s.Wait
+	cmd.waitAfter = s.WaitAfter
+	cmd.waitBefore = s.WaitBefore
 	return cmd, err
 }
 
diff --git a/pkg/cmd/drtprod/configs/drt_pua_9.yaml b/pkg/cmd/drtprod/configs/drt_pua_9.yaml
@@ -221,11 +221,11 @@ targets:
           active-warehouses: $TPCC_ACTIVE_WAREHOUSES
           duration: $RUN_DURATION
           ramp: 5m
-          wait: true
+          wait_after: true
           max-conn-lifetime: $MAX_CONN_LIFETIME
           conns: $CONNS
       - script: "pkg/cmd/drtprod/scripts/pua_operations.sh"
-        wait: 10
+        wait_after: 10
   - target_name: "Data Import"
     dependent_targets:
       - "Setup Certs & SSH Keys"
@@ -235,7 +235,7 @@ targets:
           - $WORKLOAD_CLUSTER:1
           - --
           - "sudo systemd-run --unit tpcc_init --same-dir --uid $(id -u) --gid $(id -g) bash ./tpcc_init_cct_tpcc.sh"
-        wait: 3600
+        wait_after: 3600
   - target_name: "Phase-1: Baseline Performance"
     dependent_targets:
       - "Data Import"
@@ -245,7 +245,7 @@ targets:
           - $WORKLOAD_CLUSTER
           - --
           - "sudo systemd-run --unit tpcc_run --same-dir --uid $(id -u) --gid $(id -g) bash ./tpcc_run_cct_tpcc.sh"
-        wait: 3600
+        wait_after: 3600
   - target_name: "Phase-2: Internal Operational Stress"
     dependent_targets:
       - "Phase-1: Baseline Performance"
@@ -258,21 +258,21 @@ targets:
           - |
             BACKUP INTO 'gs://$BUCKET_US_EAST_1/$CLUSTER?AUTH=implicit'
             WITH OPTIONS (revision_history = true, detached)
-        wait: 1800
+        wait_after: 1800
       - command: sql # create changefeed without initial scan
         args:
           - $CLUSTER:1
           - --
           - -e
           - "CREATE CHANGEFEED FOR TABLE cct_tpcc.public.order_line INTO 'null://' WITH initial_scan = 'no'"
-        wait: 600
+        wait_after: 600
       - command: sql # create index on order table
         args:
           - $CLUSTER:1
           - --
           - -e
           - "CREATE INDEX add_index_o_w_id ON cct_tpcc.public.order (o_w_id)"
-        wait: 700
+        wait_after: 700
       - command: deploy # rolling upgrade
         args:
           - $CLUSTER
@@ -281,7 +281,7 @@ targets:
         flags:
           pause: 5m
           grace-period: 500
-        wait: 300
+        wait_after: 300
   - target_name: "Phase-3: Disk Stalls"
     dependent_targets:
       - "Phase-2: Internal Operational Stress"
@@ -291,7 +291,7 @@ targets:
           - $WORKLOAD_CLUSTER:1
           - --
           - "./run_ops_disk-stall.sh"
-        wait: 1200
+        wait_after: 1200
   - target_name: "Phase-4: Network Failures"
     dependent_targets:
       - "Phase-3: Disk Stalls"
@@ -301,58 +301,58 @@ targets:
           - $WORKLOAD_CLUSTER:1
           - --
           - "./run_ops_network-partition-partial.sh"
-        wait: 1500
+        wait_after: 1500
       - command: run
         args:
           - $WORKLOAD_CLUSTER:1
           - --
           - "./run_ops_network-partition-full.sh"
-        wait: 1500
+        wait_after: 1500
   - target_name: "Phase-5: Node Restarts"
     dependent_targets:
       - "Phase-4: Network Failures"
     steps:
       - command: stop # ungraceful shutdown of node 2
         args:
           - $CLUSTER:2
-        wait: 30
+        wait_after: 30
       - command: start # restart node 2
         args:
           - $CLUSTER:2
         flags:
           restart: true
-        wait: 600
+        wait_after: 600
       - command: stop # ungraceful shutdown of node 6
         args:
           - $CLUSTER:6
-        wait: 30
+        wait_after: 30
       - command: start # restart node 6
         args:
           - $CLUSTER:6
         flags:
           restart: true
-        wait: 1500
+        wait_after: 1500
       - command: stop # ungraceful shutdown of node 7
         args:
           - $CLUSTER:7
-        wait: 30
+        wait_after: 30
       - command: start # restart node 7
         args:
           - $CLUSTER:7
         flags:
           restart: true
-        wait: 1500
+        wait_after: 1500
   - target_name: "Phase-6: Zone Outages"
     dependent_targets:
       - "Phase-5: Node Restarts"
     steps:
       - command: stop # ungraceful shutdown of nodes 7-9 to simulate zone outage
         args:
           - $CLUSTER:7-9
-        wait: 300
+        wait_after: 300
       - command: start # restart nodes 7-9
         args:
           - $CLUSTER:7-9
         flags:
           restart: true
-        wait: 3300
+        wait_after: 3300
diff --git a/pkg/cmd/drtprod/configs/drt_pua_mr.yaml b/pkg/cmd/drtprod/configs/drt_pua_mr.yaml
@@ -229,7 +229,7 @@ targets:
           regions: $REGIONS
       - script: "pkg/cmd/drtprod/scripts/tpcc_run_multiregion.sh"
       - script: "pkg/cmd/drtprod/scripts/pua_operations.sh"
-        wait: 10
+        wait_after: 10
   - target_name: "Data Import"
     dependent_targets:
       - "Setup Certs & SSH Keys"
@@ -239,7 +239,7 @@ targets:
           - $WORKLOAD_CLUSTER:1
           - --
           - "sudo systemd-run --unit tpcc_init --same-dir --uid $(id -u) --gid $(id -g) bash ./tpcc_init_cct_tpcc.sh"
-        wait: 3600
+        wait_after: 3600
   - target_name: "Phase-1: Baseline Performance"
     dependent_targets:
       - "Data Import"
@@ -249,7 +249,7 @@ targets:
           - $WORKLOAD_CLUSTER
           - --
           - "sudo systemd-run --unit tpcc_run --same-dir --uid $(id -u) --gid $(id -g) bash ./tpcc_run.sh"
-        wait: 3600
+        wait_after: 3600
   - target_name: "Phase-2: Internal Operational Stress"
     dependent_targets:
       - "Phase-1: Baseline Performance"
@@ -264,21 +264,21 @@ targets:
                             'gs://$BUCKET_US_EAST_5/$CLUSTER?AUTH=implicit&COCKROACH_LOCALITY=region%3Dus-east5',
                             'gs://$BUCKET_US_EAST_1/$CLUSTER?AUTH=implicit&COCKROACH_LOCALITY=region%3Dus-east1')
             WITH OPTIONS (revision_history = true, detached)
-        wait: 1500
+        wait_after: 1500
       - command: sql # create changefeed without initial scan
         args:
           - $CLUSTER:1
           - --
           - -e
           - "CREATE CHANGEFEED FOR TABLE cct_tpcc.public.order_line INTO 'null://' WITH initial_scan = 'no'"
-        wait: 900
+        wait_after: 900
       - command: sql # create index on order table
         args:
           - $CLUSTER:1
           - --
           - -e
           - "CREATE INDEX add_index_o_w_id ON cct_tpcc.public.order (o_w_id)"
-        wait: 700
+        wait_after: 700
       - command: deploy # rolling upgrade
         args:
           - $CLUSTER
@@ -287,7 +287,7 @@ targets:
         flags:
           pause: 5m
           grace-period: 500
-        wait: 300
+        wait_after: 300
   - target_name: "Phase-3: Disk Stalls"
     dependent_targets:
       - "Phase-2: Internal Operational Stress"
@@ -297,7 +297,7 @@ targets:
           - $WORKLOAD_CLUSTER:1
           - --
           - "./run_ops_disk-stall.sh"
-        wait: 1200
+        wait_after: 1200
   - target_name: "Phase-4: Network Failures"
     dependent_targets:
       - "Phase-3: Disk Stalls"
@@ -307,69 +307,69 @@ targets:
           - $WORKLOAD_CLUSTER:1
           - --
           - "./run_ops_network-partition-partial.sh"
-        wait: 1500
+        wait_after: 1500
       - command: run
         args:
           - $WORKLOAD_CLUSTER:1
           - --
           - "./run_ops_network-partition-full.sh"
-        wait: 1500
+        wait_after: 1500
   - target_name: "Phase-5: Node Restarts"
     dependent_targets:
       - "Phase-4: Network Failures"
     steps:
       - command: stop # ungraceful shutdown of node 4
         args:
           - $CLUSTER:4
-        wait: 30
+        wait_after: 30
       - command: start # restart node 4
         args:
           - $CLUSTER:4
         flags:
           restart: true
-        wait: 600
+        wait_after: 600
       - command: stop # ungraceful shutdown of node 6
         args:
           - $CLUSTER:6
-        wait: 30
+        wait_after: 30
       - command: start # restart node 6
         args:
           - $CLUSTER:6
         flags:
           restart: true
-        wait: 1500
+        wait_after: 1500
       - command: stop # ungraceful shutdown of node 15
         args:
           - $CLUSTER:15
-        wait: 30
+        wait_after: 30
       - command: start # restart node 15
         args:
           - $CLUSTER:15
         flags:
           restart: true
-        wait: 1500
+        wait_after: 1500
   - target_name: "Phase-6: Zone Outages"
     dependent_targets:
       - "Phase-5: Node Restarts"
     steps:
       - command: stop # ungraceful shutdown of nodes 3, 4 to simulate zone outage
         args:
           - $CLUSTER:3-4
-        wait: 300
+        wait_after: 300
       - command: start # restart nodes 3, 4
         args:
           - $CLUSTER:3-4
         flags:
           restart: true
-        wait: 3300
+        wait_after: 3300
   - target_name: "Phase-7: Region Outages"
     dependent_targets:
       - "Phase-6: Zone Outages"
     steps:
       - command: stop # ungraceful shutdown of nodes 11-15 to simulate region outage
         args:
           - $CLUSTER:11-15
-        wait: 300
+        wait_after: 300
       - command: start # restart nodes 11-15
         args:
           - $CLUSTER:11-15