drtprod: add wait_before and wait_after yaml support

nameisbhaskar · nameisbhaskar · commit 95fd0d7cbdf5 · 2025-08-01T13:01:17.000+05:30
Currently, a step in the yaml configuration has a single wait parameter. This parameter is confusing because it is unclear whether the wait happens before or after the command runs. This PR replaces it with two explicit parameters: wait_before and wait_after.

Epic: None
Release note: None
diff --git a/pkg/cmd/drtprod/cli/commands/yamlprocessor.go b/pkg/cmd/drtprod/cli/commands/yamlprocessor.go
@@ -118,7 +118,8 @@ type step struct {
 	Flags             map[string]interface{} `yaml:"flags"`               // Flags to pass to the command or script
 	ContinueOnFailure bool                   `yaml:"continue_on_failure"` // Whether to continue on failure
 	OnRollback        []step                 `yaml:"on_rollback"`         // Steps to execute if rollback is needed
-	Wait              int                    `yaml:"wait"`                // Wait time in seconds before executing the next step
+	WaitBefore        int                    `yaml:"wait_before"`         // Wait time in seconds before executing the step
+	WaitAfter         int                    `yaml:"wait_after"`          // Wait time in seconds after executing the step
 }
 
 // target defines a target cluster with associated steps to be executed.
@@ -143,7 +144,8 @@ type command struct {
 	args              []string   // Command arguments
 	continueOnFailure bool       // Whether to continue on failure
 	rollbackCmds      []*command // Rollback commands to execute in case of failure
-	wait              int        // Wait time in seconds before executing the next step
+	waitAfter         int        // Wait time in seconds after executing the command
+	waitBefore        int        // Wait time in seconds before executing the command
 }
 
 // String returns the command as a string for easy printing.
@@ -550,6 +552,10 @@ func executeCommands(ctx context.Context, logPrefix string, cmds []*command) err
 	}()
 
 	for _, cmd := range cmds {
+		if cmd.waitBefore > 0 {
+			fmt.Printf("[%s] Waiting for %d seconds\n", logPrefix, cmd.waitBefore)
+			time.Sleep(time.Duration(cmd.waitBefore) * time.Second)
+		}
 		fmt.Printf("[%s] Starting <%v>\n", logPrefix, cmd)
 		err := commandExecutor(ctx, logPrefix, cmd.name, cmd.args...)
 		if err != nil {
@@ -561,9 +567,9 @@ func executeCommands(ctx context.Context, logPrefix string, cmds []*command) err
 			fmt.Printf("[%s] Failed <%v>, Error Ignored: %v\n", logPrefix, cmd, err)
 		} else {
 			fmt.Printf("[%s] Completed <%v>\n", logPrefix, cmd)
-			if cmd.wait > 0 {
-				fmt.Printf("[%s] Waiting for %d seconds\n", logPrefix, cmd.wait)
-				time.Sleep(time.Duration(cmd.wait) * time.Second)
+			if cmd.waitAfter > 0 {
+				fmt.Printf("[%s] Waiting for %d seconds\n", logPrefix, cmd.waitAfter)
+				time.Sleep(time.Duration(cmd.waitAfter) * time.Second)
 			}
 		}
 
@@ -622,7 +628,8 @@ func generateStepCmd(clusterName string, s step) (*command, error) {
 			return nil, err
 		}
 	}
-	cmd.wait = s.Wait
+	cmd.waitAfter = s.WaitAfter
+	cmd.waitBefore = s.WaitBefore
 	return cmd, err
 }
 
diff --git a/pkg/cmd/drtprod/configs/drt_pua_9.yaml b/pkg/cmd/drtprod/configs/drt_pua_9.yaml
@@ -221,11 +221,11 @@ targets:
           active-warehouses: $TPCC_ACTIVE_WAREHOUSES
           duration: $RUN_DURATION
           ramp: 5m
-          wait: true
+          wait_after: true
           max-conn-lifetime: $MAX_CONN_LIFETIME
           conns: $CONNS
       - script: "pkg/cmd/drtprod/scripts/pua_operations.sh"
-        wait: 10
+        wait_after: 10
   - target_name: "Data Import"
     dependent_targets:
       - "Setup Certs & SSH Keys"
@@ -235,7 +235,7 @@ targets:
           - $WORKLOAD_CLUSTER:1
           - --
           - "sudo systemd-run --unit tpcc_init --same-dir --uid $(id -u) --gid $(id -g) bash ./tpcc_init_cct_tpcc.sh"
-        wait: 3600
+        wait_after: 3600
   - target_name: "Phase-1: Baseline Performance"
     dependent_targets:
       - "Data Import"
@@ -245,7 +245,7 @@ targets:
           - $WORKLOAD_CLUSTER
           - --
           - "sudo systemd-run --unit tpcc_run --same-dir --uid $(id -u) --gid $(id -g) bash ./tpcc_run_cct_tpcc.sh"
-        wait: 3600
+        wait_after: 3600
   - target_name: "Phase-2: Internal Operational Stress"
     dependent_targets:
       - "Phase-1: Baseline Performance"
@@ -258,21 +258,21 @@ targets:
           - |
             BACKUP INTO 'gs://$BUCKET_US_EAST_1/$CLUSTER?AUTH=implicit'
             WITH OPTIONS (revision_history = true, detached)
-        wait: 1800
+        wait_after: 1800
       - command: sql # create changefeed without initial scan
         args:
           - $CLUSTER:1
           - --
           - -e
           - "CREATE CHANGEFEED FOR TABLE cct_tpcc.public.order_line INTO 'null://' WITH initial_scan = 'no'"
-        wait: 600
+        wait_after: 600
       - command: sql # create index on order table
         args:
           - $CLUSTER:1
           - --
           - -e
           - "CREATE INDEX add_index_o_w_id ON cct_tpcc.public.order (o_w_id)"
-        wait: 700
+        wait_after: 700
       - command: deploy # rolling upgrade
         args:
           - $CLUSTER
@@ -281,7 +281,7 @@ targets:
         flags:
           pause: 5m
           grace-period: 500
-        wait: 300
+        wait_after: 300
   - target_name: "Phase-3: Disk Stalls"
     dependent_targets:
       - "Phase-2: Internal Operational Stress"
@@ -291,7 +291,7 @@ targets:
           - $WORKLOAD_CLUSTER:1
           - --
           - "./run_ops_disk-stall.sh"
-        wait: 1200
+        wait_after: 1200
   - target_name: "Phase-4: Network Failures"
     dependent_targets:
       - "Phase-3: Disk Stalls"
@@ -301,58 +301,58 @@ targets:
           - $WORKLOAD_CLUSTER:1
           - --
           - "./run_ops_network-partition-partial.sh"
-        wait: 1500
+        wait_after: 1500
       - command: run
         args:
           - $WORKLOAD_CLUSTER:1
           - --
           - "./run_ops_network-partition-full.sh"
-        wait: 1500
+        wait_after: 1500
   - target_name: "Phase-5: Node Restarts"
     dependent_targets:
       - "Phase-4: Network Failures"
     steps:
       - command: stop # ungraceful shutdown of node 2
         args:
           - $CLUSTER:2
-        wait: 30
+        wait_after: 30
       - command: start # restart node 2
         args:
           - $CLUSTER:2
         flags:
           restart: true
-        wait: 600
+        wait_after: 600
       - command: stop # ungraceful shutdown of node 6
         args:
           - $CLUSTER:6
-        wait: 30
+        wait_after: 30
       - command: start # restart node 6
         args:
           - $CLUSTER:6
         flags:
           restart: true
-        wait: 1500
+        wait_after: 1500
       - command: stop # ungraceful shutdown of node 7
         args:
           - $CLUSTER:7
-        wait: 30
+        wait_after: 30
       - command: start # restart node 7
         args:
           - $CLUSTER:7
         flags:
           restart: true
-        wait: 1500
+        wait_after: 1500
   - target_name: "Phase-6: Zone Outages"
     dependent_targets:
       - "Phase-5: Node Restarts"
     steps:
       - command: stop # ungraceful shutdown of nodes 7-9 to simulate zone outage
         args:
           - $CLUSTER:7-9
-        wait: 300
+        wait_after: 300
       - command: start # restart nodes 7-9
         args:
           - $CLUSTER:7-9
         flags:
           restart: true
-        wait: 3300
+        wait_after: 3300
diff --git a/pkg/cmd/drtprod/configs/drt_pua_mr.yaml b/pkg/cmd/drtprod/configs/drt_pua_mr.yaml
@@ -229,7 +229,7 @@ targets:
           regions: $REGIONS
       - script: "pkg/cmd/drtprod/scripts/tpcc_run_multiregion.sh"
       - script: "pkg/cmd/drtprod/scripts/pua_operations.sh"
-        wait: 10
+        wait_after: 10
   - target_name: "Data Import"
     dependent_targets:
       - "Setup Certs & SSH Keys"
@@ -239,7 +239,7 @@ targets:
           - $WORKLOAD_CLUSTER:1
           - --
           - "sudo systemd-run --unit tpcc_init --same-dir --uid $(id -u) --gid $(id -g) bash ./tpcc_init_cct_tpcc.sh"
-        wait: 3600
+        wait_after: 3600
   - target_name: "Phase-1: Baseline Performance"
     dependent_targets:
       - "Data Import"
@@ -249,7 +249,7 @@ targets:
           - $WORKLOAD_CLUSTER
           - --
           - "sudo systemd-run --unit tpcc_run --same-dir --uid $(id -u) --gid $(id -g) bash ./tpcc_run.sh"
-        wait: 3600
+        wait_after: 3600
   - target_name: "Phase-2: Internal Operational Stress"
     dependent_targets:
       - "Phase-1: Baseline Performance"
@@ -264,21 +264,21 @@ targets:
                             'gs://$BUCKET_US_EAST_5/$CLUSTER?AUTH=implicit&COCKROACH_LOCALITY=region%3Dus-east5',
                             'gs://$BUCKET_US_EAST_1/$CLUSTER?AUTH=implicit&COCKROACH_LOCALITY=region%3Dus-east1')
             WITH OPTIONS (revision_history = true, detached)
-        wait: 1500
+        wait_after: 1500
       - command: sql # create changefeed without initial scan
         args:
           - $CLUSTER:1
           - --
           - -e
           - "CREATE CHANGEFEED FOR TABLE cct_tpcc.public.order_line INTO 'null://' WITH initial_scan = 'no'"
-        wait: 900
+        wait_after: 900
       - command: sql # create index on order table
         args:
           - $CLUSTER:1
           - --
           - -e
           - "CREATE INDEX add_index_o_w_id ON cct_tpcc.public.order (o_w_id)"
-        wait: 700
+        wait_after: 700
       - command: deploy # rolling upgrade
         args:
           - $CLUSTER
@@ -287,7 +287,7 @@ targets:
         flags:
           pause: 5m
           grace-period: 500
-        wait: 300
+        wait_after: 300
   - target_name: "Phase-3: Disk Stalls"
     dependent_targets:
       - "Phase-2: Internal Operational Stress"
@@ -297,7 +297,7 @@ targets:
           - $WORKLOAD_CLUSTER:1
           - --
           - "./run_ops_disk-stall.sh"
-        wait: 1200
+        wait_after: 1200
   - target_name: "Phase-4: Network Failures"
     dependent_targets:
       - "Phase-3: Disk Stalls"
@@ -307,69 +307,69 @@ targets:
           - $WORKLOAD_CLUSTER:1
           - --
           - "./run_ops_network-partition-partial.sh"
-        wait: 1500
+        wait_after: 1500
       - command: run
         args:
           - $WORKLOAD_CLUSTER:1
           - --
           - "./run_ops_network-partition-full.sh"
-        wait: 1500
+        wait_after: 1500
   - target_name: "Phase-5: Node Restarts"
     dependent_targets:
       - "Phase-4: Network Failures"
     steps:
       - command: stop # ungraceful shutdown of node 4
         args:
           - $CLUSTER:4
-        wait: 30
+        wait_after: 30
       - command: start # restart node 4
         args:
           - $CLUSTER:4
         flags:
           restart: true
-        wait: 600
+        wait_after: 600
       - command: stop # ungraceful shutdown of node 6
         args:
           - $CLUSTER:6
-        wait: 30
+        wait_after: 30
       - command: start # restart node 6
         args:
           - $CLUSTER:6
         flags:
           restart: true
-        wait: 1500
+        wait_after: 1500
       - command: stop # ungraceful shutdown of node 15
         args:
           - $CLUSTER:15
-        wait: 30
+        wait_after: 30
       - command: start # restart node 15
         args:
           - $CLUSTER:15
         flags:
           restart: true
-        wait: 1500
+        wait_after: 1500
   - target_name: "Phase-6: Zone Outages"
     dependent_targets:
       - "Phase-5: Node Restarts"
     steps:
       - command: stop # ungraceful shutdown of nodes 3, 4 to simulate zone outage
         args:
           - $CLUSTER:3-4
-        wait: 300
+        wait_after: 300
       - command: start # restart nodes 3, 4
         args:
           - $CLUSTER:3-4
         flags:
           restart: true
-        wait: 3300
+        wait_after: 3300
   - target_name: "Phase-7: Region Outages"
     dependent_targets:
       - "Phase-6: Zone Outages"
     steps:
       - command: stop # ungraceful shutdown of nodes 11-15 to simulate region outage
         args:
           - $CLUSTER:11-15
-        wait: 300
+        wait_after: 300
       - command: start # restart nodes 11-15
         args:
           - $CLUSTER:11-15