Skip to content

Commit 95fd0d7

Browse files
committed
drtprod: add wait_before and wait_after yaml support
Currently, there is a wait parameter for a step in the YAML configuration. However, this parameter can be confusing, as the wait could happen either before or after the command runs. So, in this PR, two explicit waits are added: wait_before and wait_after. Epic: None Release note: None
1 parent 9831be5 commit 95fd0d7

File tree

3 files changed

+51
-44
lines changed

3 files changed

+51
-44
lines changed

pkg/cmd/drtprod/cli/commands/yamlprocessor.go

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,8 @@ type step struct {
118118
Flags map[string]interface{} `yaml:"flags"` // Flags to pass to the command or script
119119
ContinueOnFailure bool `yaml:"continue_on_failure"` // Whether to continue on failure
120120
OnRollback []step `yaml:"on_rollback"` // Steps to execute if rollback is needed
121-
Wait int `yaml:"wait"` // Wait time in seconds before executing the next step
121+
WaitBefore int `yaml:"wait_before"` // Wait time in seconds before executing the step
122+
WaitAfter int `yaml:"wait_after"` // Wait time in seconds after executing the step
122123
}
123124

124125
// target defines a target cluster with associated steps to be executed.
@@ -143,7 +144,8 @@ type command struct {
143144
args []string // Command arguments
144145
continueOnFailure bool // Whether to continue on failure
145146
rollbackCmds []*command // Rollback commands to execute in case of failure
146-
wait int // Wait time in seconds before executing the next step
147+
waitAfter int // Wait time in seconds after executing the command
148+
waitBefore int // Wait time in seconds before executing the command
147149
}
148150

149151
// String returns the command as a string for easy printing.
@@ -550,6 +552,10 @@ func executeCommands(ctx context.Context, logPrefix string, cmds []*command) err
550552
}()
551553

552554
for _, cmd := range cmds {
555+
if cmd.waitBefore > 0 {
556+
fmt.Printf("[%s] Waiting for %d seconds\n", logPrefix, cmd.waitBefore)
557+
time.Sleep(time.Duration(cmd.waitBefore) * time.Second)
558+
}
553559
fmt.Printf("[%s] Starting <%v>\n", logPrefix, cmd)
554560
err := commandExecutor(ctx, logPrefix, cmd.name, cmd.args...)
555561
if err != nil {
@@ -561,9 +567,9 @@ func executeCommands(ctx context.Context, logPrefix string, cmds []*command) err
561567
fmt.Printf("[%s] Failed <%v>, Error Ignored: %v\n", logPrefix, cmd, err)
562568
} else {
563569
fmt.Printf("[%s] Completed <%v>\n", logPrefix, cmd)
564-
if cmd.wait > 0 {
565-
fmt.Printf("[%s] Waiting for %d seconds\n", logPrefix, cmd.wait)
566-
time.Sleep(time.Duration(cmd.wait) * time.Second)
570+
if cmd.waitAfter > 0 {
571+
fmt.Printf("[%s] Waiting for %d seconds\n", logPrefix, cmd.waitAfter)
572+
time.Sleep(time.Duration(cmd.waitAfter) * time.Second)
567573
}
568574
}
569575

@@ -622,7 +628,8 @@ func generateStepCmd(clusterName string, s step) (*command, error) {
622628
return nil, err
623629
}
624630
}
625-
cmd.wait = s.Wait
631+
cmd.waitAfter = s.WaitAfter
632+
cmd.waitBefore = s.WaitBefore
626633
return cmd, err
627634
}
628635

pkg/cmd/drtprod/configs/drt_pua_9.yaml

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -221,11 +221,11 @@ targets:
221221
active-warehouses: $TPCC_ACTIVE_WAREHOUSES
222222
duration: $RUN_DURATION
223223
ramp: 5m
224-
wait: true
224+
wait_after: true
225225
max-conn-lifetime: $MAX_CONN_LIFETIME
226226
conns: $CONNS
227227
- script: "pkg/cmd/drtprod/scripts/pua_operations.sh"
228-
wait: 10
228+
wait_after: 10
229229
- target_name: "Data Import"
230230
dependent_targets:
231231
- "Setup Certs & SSH Keys"
@@ -235,7 +235,7 @@ targets:
235235
- $WORKLOAD_CLUSTER:1
236236
- --
237237
- "sudo systemd-run --unit tpcc_init --same-dir --uid $(id -u) --gid $(id -g) bash ./tpcc_init_cct_tpcc.sh"
238-
wait: 3600
238+
wait_after: 3600
239239
- target_name: "Phase-1: Baseline Performance"
240240
dependent_targets:
241241
- "Data Import"
@@ -245,7 +245,7 @@ targets:
245245
- $WORKLOAD_CLUSTER
246246
- --
247247
- "sudo systemd-run --unit tpcc_run --same-dir --uid $(id -u) --gid $(id -g) bash ./tpcc_run_cct_tpcc.sh"
248-
wait: 3600
248+
wait_after: 3600
249249
- target_name: "Phase-2: Internal Operational Stress"
250250
dependent_targets:
251251
- "Phase-1: Baseline Performance"
@@ -258,21 +258,21 @@ targets:
258258
- |
259259
BACKUP INTO 'gs://$BUCKET_US_EAST_1/$CLUSTER?AUTH=implicit'
260260
WITH OPTIONS (revision_history = true, detached)
261-
wait: 1800
261+
wait_after: 1800
262262
- command: sql # create changefeed without initial scan
263263
args:
264264
- $CLUSTER:1
265265
- --
266266
- -e
267267
- "CREATE CHANGEFEED FOR TABLE cct_tpcc.public.order_line INTO 'null://' WITH initial_scan = 'no'"
268-
wait: 600
268+
wait_after: 600
269269
- command: sql # create index on order table
270270
args:
271271
- $CLUSTER:1
272272
- --
273273
- -e
274274
- "CREATE INDEX add_index_o_w_id ON cct_tpcc.public.order (o_w_id)"
275-
wait: 700
275+
wait_after: 700
276276
- command: deploy # rolling upgrade
277277
args:
278278
- $CLUSTER
@@ -281,7 +281,7 @@ targets:
281281
flags:
282282
pause: 5m
283283
grace-period: 500
284-
wait: 300
284+
wait_after: 300
285285
- target_name: "Phase-3: Disk Stalls"
286286
dependent_targets:
287287
- "Phase-2: Internal Operational Stress"
@@ -291,7 +291,7 @@ targets:
291291
- $WORKLOAD_CLUSTER:1
292292
- --
293293
- "./run_ops_disk-stall.sh"
294-
wait: 1200
294+
wait_after: 1200
295295
- target_name: "Phase-4: Network Failures"
296296
dependent_targets:
297297
- "Phase-3: Disk Stalls"
@@ -301,58 +301,58 @@ targets:
301301
- $WORKLOAD_CLUSTER:1
302302
- --
303303
- "./run_ops_network-partition-partial.sh"
304-
wait: 1500
304+
wait_after: 1500
305305
- command: run
306306
args:
307307
- $WORKLOAD_CLUSTER:1
308308
- --
309309
- "./run_ops_network-partition-full.sh"
310-
wait: 1500
310+
wait_after: 1500
311311
- target_name: "Phase-5: Node Restarts"
312312
dependent_targets:
313313
- "Phase-4: Network Failures"
314314
steps:
315315
- command: stop # ungraceful shutdown of node 2
316316
args:
317317
- $CLUSTER:2
318-
wait: 30
318+
wait_after: 30
319319
- command: start # restart node 2
320320
args:
321321
- $CLUSTER:2
322322
flags:
323323
restart: true
324-
wait: 600
324+
wait_after: 600
325325
- command: stop # ungraceful shutdown of node 6
326326
args:
327327
- $CLUSTER:6
328-
wait: 30
328+
wait_after: 30
329329
- command: start # restart node 6
330330
args:
331331
- $CLUSTER:6
332332
flags:
333333
restart: true
334-
wait: 1500
334+
wait_after: 1500
335335
- command: stop # ungraceful shutdown of node 7
336336
args:
337337
- $CLUSTER:7
338-
wait: 30
338+
wait_after: 30
339339
- command: start # restart node 7
340340
args:
341341
- $CLUSTER:7
342342
flags:
343343
restart: true
344-
wait: 1500
344+
wait_after: 1500
345345
- target_name: "Phase-6: Zone Outages"
346346
dependent_targets:
347347
- "Phase-5: Node Restarts"
348348
steps:
349349
- command: stop # ungraceful shutdown of nodes 7-9 to simulate zone outage
350350
args:
351351
- $CLUSTER:7-9
352-
wait: 300
352+
wait_after: 300
353353
- command: start # restart nodes 7-9
354354
args:
355355
- $CLUSTER:7-9
356356
flags:
357357
restart: true
358-
wait: 3300
358+
wait_after: 3300

pkg/cmd/drtprod/configs/drt_pua_mr.yaml

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -229,7 +229,7 @@ targets:
229229
regions: $REGIONS
230230
- script: "pkg/cmd/drtprod/scripts/tpcc_run_multiregion.sh"
231231
- script: "pkg/cmd/drtprod/scripts/pua_operations.sh"
232-
wait: 10
232+
wait_after: 10
233233
- target_name: "Data Import"
234234
dependent_targets:
235235
- "Setup Certs & SSH Keys"
@@ -239,7 +239,7 @@ targets:
239239
- $WORKLOAD_CLUSTER:1
240240
- --
241241
- "sudo systemd-run --unit tpcc_init --same-dir --uid $(id -u) --gid $(id -g) bash ./tpcc_init_cct_tpcc.sh"
242-
wait: 3600
242+
wait_after: 3600
243243
- target_name: "Phase-1: Baseline Performance"
244244
dependent_targets:
245245
- "Data Import"
@@ -249,7 +249,7 @@ targets:
249249
- $WORKLOAD_CLUSTER
250250
- --
251251
- "sudo systemd-run --unit tpcc_run --same-dir --uid $(id -u) --gid $(id -g) bash ./tpcc_run.sh"
252-
wait: 3600
252+
wait_after: 3600
253253
- target_name: "Phase-2: Internal Operational Stress"
254254
dependent_targets:
255255
- "Phase-1: Baseline Performance"
@@ -264,21 +264,21 @@ targets:
264264
'gs://$BUCKET_US_EAST_5/$CLUSTER?AUTH=implicit&COCKROACH_LOCALITY=region%3Dus-east5',
265265
'gs://$BUCKET_US_EAST_1/$CLUSTER?AUTH=implicit&COCKROACH_LOCALITY=region%3Dus-east1')
266266
WITH OPTIONS (revision_history = true, detached)
267-
wait: 1500
267+
wait_after: 1500
268268
- command: sql # create changefeed without initial scan
269269
args:
270270
- $CLUSTER:1
271271
- --
272272
- -e
273273
- "CREATE CHANGEFEED FOR TABLE cct_tpcc.public.order_line INTO 'null://' WITH initial_scan = 'no'"
274-
wait: 900
274+
wait_after: 900
275275
- command: sql # create index on order table
276276
args:
277277
- $CLUSTER:1
278278
- --
279279
- -e
280280
- "CREATE INDEX add_index_o_w_id ON cct_tpcc.public.order (o_w_id)"
281-
wait: 700
281+
wait_after: 700
282282
- command: deploy # rolling upgrade
283283
args:
284284
- $CLUSTER
@@ -287,7 +287,7 @@ targets:
287287
flags:
288288
pause: 5m
289289
grace-period: 500
290-
wait: 300
290+
wait_after: 300
291291
- target_name: "Phase-3: Disk Stalls"
292292
dependent_targets:
293293
- "Phase-2: Internal Operational Stress"
@@ -297,7 +297,7 @@ targets:
297297
- $WORKLOAD_CLUSTER:1
298298
- --
299299
- "./run_ops_disk-stall.sh"
300-
wait: 1200
300+
wait_after: 1200
301301
- target_name: "Phase-4: Network Failures"
302302
dependent_targets:
303303
- "Phase-3: Disk Stalls"
@@ -307,69 +307,69 @@ targets:
307307
- $WORKLOAD_CLUSTER:1
308308
- --
309309
- "./run_ops_network-partition-partial.sh"
310-
wait: 1500
310+
wait_after: 1500
311311
- command: run
312312
args:
313313
- $WORKLOAD_CLUSTER:1
314314
- --
315315
- "./run_ops_network-partition-full.sh"
316-
wait: 1500
316+
wait_after: 1500
317317
- target_name: "Phase-5: Node Restarts"
318318
dependent_targets:
319319
- "Phase-4: Network Failures"
320320
steps:
321321
- command: stop # ungraceful shutdown of node 4
322322
args:
323323
- $CLUSTER:4
324-
wait: 30
324+
wait_after: 30
325325
- command: start # restart node 4
326326
args:
327327
- $CLUSTER:4
328328
flags:
329329
restart: true
330-
wait: 600
330+
wait_after: 600
331331
- command: stop # ungraceful shutdown of node 6
332332
args:
333333
- $CLUSTER:6
334-
wait: 30
334+
wait_after: 30
335335
- command: start # restart node 6
336336
args:
337337
- $CLUSTER:6
338338
flags:
339339
restart: true
340-
wait: 1500
340+
wait_after: 1500
341341
- command: stop # ungraceful shutdown of node 15
342342
args:
343343
- $CLUSTER:15
344-
wait: 30
344+
wait_after: 30
345345
- command: start # restart node 15
346346
args:
347347
- $CLUSTER:15
348348
flags:
349349
restart: true
350-
wait: 1500
350+
wait_after: 1500
351351
- target_name: "Phase-6: Zone Outages"
352352
dependent_targets:
353353
- "Phase-5: Node Restarts"
354354
steps:
355355
- command: stop # ungraceful shutdown of nodes 3, 4 to simulate zone outage
356356
args:
357357
- $CLUSTER:3-4
358-
wait: 300
358+
wait_after: 300
359359
- command: start # restart nodes 3, 4
360360
args:
361361
- $CLUSTER:3-4
362362
flags:
363363
restart: true
364-
wait: 3300
364+
wait_after: 3300
365365
- target_name: "Phase-7: Region Outages"
366366
dependent_targets:
367367
- "Phase-6: Zone Outages"
368368
steps:
369369
- command: stop # ungraceful shutdown of nodes 11-15 to simulate region outage
370370
args:
371371
- $CLUSTER:11-15
372-
wait: 300
372+
wait_after: 300
373373
- command: start # restart nodes 11-15
374374
args:
375375
- $CLUSTER:11-15

0 commit comments

Comments
 (0)