Skip to content

Commit 585a797

Browse files
craig[bot]nameisbhaskar
andcommitted
Merge #151053
151053: drtprod: add wait_before and wait_after yaml support r=shailendra-patel a=nameisbhaskar Currently, a step in the YAML configuration has a single wait parameter. This parameter can be confusing because it is unclear whether the wait happens before or after the command runs. So, in this PR, two separate parameters are added to make this explicit: wait_before and wait_after. Epic: None Release note: None Co-authored-by: Bhaskarjyoti Bora <[email protected]>
2 parents 9831be5 + 95fd0d7 commit 585a797

File tree

3 files changed

+51
-44
lines changed

3 files changed

+51
-44
lines changed

pkg/cmd/drtprod/cli/commands/yamlprocessor.go

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,8 @@ type step struct {
118118
Flags map[string]interface{} `yaml:"flags"` // Flags to pass to the command or script
119119
ContinueOnFailure bool `yaml:"continue_on_failure"` // Whether to continue on failure
120120
OnRollback []step `yaml:"on_rollback"` // Steps to execute if rollback is needed
121-
Wait int `yaml:"wait"` // Wait time in seconds before executing the next step
121+
WaitBefore int `yaml:"wait_before"` // Wait time in seconds before executing the step
122+
WaitAfter int `yaml:"wait_after"` // Wait time in seconds after executing the step
122123
}
123124

124125
// target defines a target cluster with associated steps to be executed.
@@ -143,7 +144,8 @@ type command struct {
143144
args []string // Command arguments
144145
continueOnFailure bool // Whether to continue on failure
145146
rollbackCmds []*command // Rollback commands to execute in case of failure
146-
wait int // Wait time in seconds before executing the next step
147+
waitAfter int // Wait time in seconds after executing the command
148+
waitBefore int // Wait time in seconds before executing the command
147149
}
148150

149151
// String returns the command as a string for easy printing.
@@ -550,6 +552,10 @@ func executeCommands(ctx context.Context, logPrefix string, cmds []*command) err
550552
}()
551553

552554
for _, cmd := range cmds {
555+
if cmd.waitBefore > 0 {
556+
fmt.Printf("[%s] Waiting for %d seconds\n", logPrefix, cmd.waitBefore)
557+
time.Sleep(time.Duration(cmd.waitBefore) * time.Second)
558+
}
553559
fmt.Printf("[%s] Starting <%v>\n", logPrefix, cmd)
554560
err := commandExecutor(ctx, logPrefix, cmd.name, cmd.args...)
555561
if err != nil {
@@ -561,9 +567,9 @@ func executeCommands(ctx context.Context, logPrefix string, cmds []*command) err
561567
fmt.Printf("[%s] Failed <%v>, Error Ignored: %v\n", logPrefix, cmd, err)
562568
} else {
563569
fmt.Printf("[%s] Completed <%v>\n", logPrefix, cmd)
564-
if cmd.wait > 0 {
565-
fmt.Printf("[%s] Waiting for %d seconds\n", logPrefix, cmd.wait)
566-
time.Sleep(time.Duration(cmd.wait) * time.Second)
570+
if cmd.waitAfter > 0 {
571+
fmt.Printf("[%s] Waiting for %d seconds\n", logPrefix, cmd.waitAfter)
572+
time.Sleep(time.Duration(cmd.waitAfter) * time.Second)
567573
}
568574
}
569575

@@ -622,7 +628,8 @@ func generateStepCmd(clusterName string, s step) (*command, error) {
622628
return nil, err
623629
}
624630
}
625-
cmd.wait = s.Wait
631+
cmd.waitAfter = s.WaitAfter
632+
cmd.waitBefore = s.WaitBefore
626633
return cmd, err
627634
}
628635

pkg/cmd/drtprod/configs/drt_pua_9.yaml

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -221,11 +221,11 @@ targets:
221221
active-warehouses: $TPCC_ACTIVE_WAREHOUSES
222222
duration: $RUN_DURATION
223223
ramp: 5m
224-
wait: true
224+
wait_after: true
225225
max-conn-lifetime: $MAX_CONN_LIFETIME
226226
conns: $CONNS
227227
- script: "pkg/cmd/drtprod/scripts/pua_operations.sh"
228-
wait: 10
228+
wait_after: 10
229229
- target_name: "Data Import"
230230
dependent_targets:
231231
- "Setup Certs & SSH Keys"
@@ -235,7 +235,7 @@ targets:
235235
- $WORKLOAD_CLUSTER:1
236236
- --
237237
- "sudo systemd-run --unit tpcc_init --same-dir --uid $(id -u) --gid $(id -g) bash ./tpcc_init_cct_tpcc.sh"
238-
wait: 3600
238+
wait_after: 3600
239239
- target_name: "Phase-1: Baseline Performance"
240240
dependent_targets:
241241
- "Data Import"
@@ -245,7 +245,7 @@ targets:
245245
- $WORKLOAD_CLUSTER
246246
- --
247247
- "sudo systemd-run --unit tpcc_run --same-dir --uid $(id -u) --gid $(id -g) bash ./tpcc_run_cct_tpcc.sh"
248-
wait: 3600
248+
wait_after: 3600
249249
- target_name: "Phase-2: Internal Operational Stress"
250250
dependent_targets:
251251
- "Phase-1: Baseline Performance"
@@ -258,21 +258,21 @@ targets:
258258
- |
259259
BACKUP INTO 'gs://$BUCKET_US_EAST_1/$CLUSTER?AUTH=implicit'
260260
WITH OPTIONS (revision_history = true, detached)
261-
wait: 1800
261+
wait_after: 1800
262262
- command: sql # create changefeed without initial scan
263263
args:
264264
- $CLUSTER:1
265265
- --
266266
- -e
267267
- "CREATE CHANGEFEED FOR TABLE cct_tpcc.public.order_line INTO 'null://' WITH initial_scan = 'no'"
268-
wait: 600
268+
wait_after: 600
269269
- command: sql # create index on order table
270270
args:
271271
- $CLUSTER:1
272272
- --
273273
- -e
274274
- "CREATE INDEX add_index_o_w_id ON cct_tpcc.public.order (o_w_id)"
275-
wait: 700
275+
wait_after: 700
276276
- command: deploy # rolling upgrade
277277
args:
278278
- $CLUSTER
@@ -281,7 +281,7 @@ targets:
281281
flags:
282282
pause: 5m
283283
grace-period: 500
284-
wait: 300
284+
wait_after: 300
285285
- target_name: "Phase-3: Disk Stalls"
286286
dependent_targets:
287287
- "Phase-2: Internal Operational Stress"
@@ -291,7 +291,7 @@ targets:
291291
- $WORKLOAD_CLUSTER:1
292292
- --
293293
- "./run_ops_disk-stall.sh"
294-
wait: 1200
294+
wait_after: 1200
295295
- target_name: "Phase-4: Network Failures"
296296
dependent_targets:
297297
- "Phase-3: Disk Stalls"
@@ -301,58 +301,58 @@ targets:
301301
- $WORKLOAD_CLUSTER:1
302302
- --
303303
- "./run_ops_network-partition-partial.sh"
304-
wait: 1500
304+
wait_after: 1500
305305
- command: run
306306
args:
307307
- $WORKLOAD_CLUSTER:1
308308
- --
309309
- "./run_ops_network-partition-full.sh"
310-
wait: 1500
310+
wait_after: 1500
311311
- target_name: "Phase-5: Node Restarts"
312312
dependent_targets:
313313
- "Phase-4: Network Failures"
314314
steps:
315315
- command: stop # ungraceful shutdown of node 2
316316
args:
317317
- $CLUSTER:2
318-
wait: 30
318+
wait_after: 30
319319
- command: start # restart node 2
320320
args:
321321
- $CLUSTER:2
322322
flags:
323323
restart: true
324-
wait: 600
324+
wait_after: 600
325325
- command: stop # ungraceful shutdown of node 6
326326
args:
327327
- $CLUSTER:6
328-
wait: 30
328+
wait_after: 30
329329
- command: start # restart node 6
330330
args:
331331
- $CLUSTER:6
332332
flags:
333333
restart: true
334-
wait: 1500
334+
wait_after: 1500
335335
- command: stop # ungraceful shutdown of node 7
336336
args:
337337
- $CLUSTER:7
338-
wait: 30
338+
wait_after: 30
339339
- command: start # restart node 7
340340
args:
341341
- $CLUSTER:7
342342
flags:
343343
restart: true
344-
wait: 1500
344+
wait_after: 1500
345345
- target_name: "Phase-6: Zone Outages"
346346
dependent_targets:
347347
- "Phase-5: Node Restarts"
348348
steps:
349349
- command: stop # ungraceful shutdown of nodes 7-9 to simulate zone outage
350350
args:
351351
- $CLUSTER:7-9
352-
wait: 300
352+
wait_after: 300
353353
- command: start # restart nodes 7-9
354354
args:
355355
- $CLUSTER:7-9
356356
flags:
357357
restart: true
358-
wait: 3300
358+
wait_after: 3300

pkg/cmd/drtprod/configs/drt_pua_mr.yaml

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -229,7 +229,7 @@ targets:
229229
regions: $REGIONS
230230
- script: "pkg/cmd/drtprod/scripts/tpcc_run_multiregion.sh"
231231
- script: "pkg/cmd/drtprod/scripts/pua_operations.sh"
232-
wait: 10
232+
wait_after: 10
233233
- target_name: "Data Import"
234234
dependent_targets:
235235
- "Setup Certs & SSH Keys"
@@ -239,7 +239,7 @@ targets:
239239
- $WORKLOAD_CLUSTER:1
240240
- --
241241
- "sudo systemd-run --unit tpcc_init --same-dir --uid $(id -u) --gid $(id -g) bash ./tpcc_init_cct_tpcc.sh"
242-
wait: 3600
242+
wait_after: 3600
243243
- target_name: "Phase-1: Baseline Performance"
244244
dependent_targets:
245245
- "Data Import"
@@ -249,7 +249,7 @@ targets:
249249
- $WORKLOAD_CLUSTER
250250
- --
251251
- "sudo systemd-run --unit tpcc_run --same-dir --uid $(id -u) --gid $(id -g) bash ./tpcc_run.sh"
252-
wait: 3600
252+
wait_after: 3600
253253
- target_name: "Phase-2: Internal Operational Stress"
254254
dependent_targets:
255255
- "Phase-1: Baseline Performance"
@@ -264,21 +264,21 @@ targets:
264264
'gs://$BUCKET_US_EAST_5/$CLUSTER?AUTH=implicit&COCKROACH_LOCALITY=region%3Dus-east5',
265265
'gs://$BUCKET_US_EAST_1/$CLUSTER?AUTH=implicit&COCKROACH_LOCALITY=region%3Dus-east1')
266266
WITH OPTIONS (revision_history = true, detached)
267-
wait: 1500
267+
wait_after: 1500
268268
- command: sql # create changefeed without initial scan
269269
args:
270270
- $CLUSTER:1
271271
- --
272272
- -e
273273
- "CREATE CHANGEFEED FOR TABLE cct_tpcc.public.order_line INTO 'null://' WITH initial_scan = 'no'"
274-
wait: 900
274+
wait_after: 900
275275
- command: sql # create index on order table
276276
args:
277277
- $CLUSTER:1
278278
- --
279279
- -e
280280
- "CREATE INDEX add_index_o_w_id ON cct_tpcc.public.order (o_w_id)"
281-
wait: 700
281+
wait_after: 700
282282
- command: deploy # rolling upgrade
283283
args:
284284
- $CLUSTER
@@ -287,7 +287,7 @@ targets:
287287
flags:
288288
pause: 5m
289289
grace-period: 500
290-
wait: 300
290+
wait_after: 300
291291
- target_name: "Phase-3: Disk Stalls"
292292
dependent_targets:
293293
- "Phase-2: Internal Operational Stress"
@@ -297,7 +297,7 @@ targets:
297297
- $WORKLOAD_CLUSTER:1
298298
- --
299299
- "./run_ops_disk-stall.sh"
300-
wait: 1200
300+
wait_after: 1200
301301
- target_name: "Phase-4: Network Failures"
302302
dependent_targets:
303303
- "Phase-3: Disk Stalls"
@@ -307,69 +307,69 @@ targets:
307307
- $WORKLOAD_CLUSTER:1
308308
- --
309309
- "./run_ops_network-partition-partial.sh"
310-
wait: 1500
310+
wait_after: 1500
311311
- command: run
312312
args:
313313
- $WORKLOAD_CLUSTER:1
314314
- --
315315
- "./run_ops_network-partition-full.sh"
316-
wait: 1500
316+
wait_after: 1500
317317
- target_name: "Phase-5: Node Restarts"
318318
dependent_targets:
319319
- "Phase-4: Network Failures"
320320
steps:
321321
- command: stop # ungraceful shutdown of node 4
322322
args:
323323
- $CLUSTER:4
324-
wait: 30
324+
wait_after: 30
325325
- command: start # restart node 4
326326
args:
327327
- $CLUSTER:4
328328
flags:
329329
restart: true
330-
wait: 600
330+
wait_after: 600
331331
- command: stop # ungraceful shutdown of node 6
332332
args:
333333
- $CLUSTER:6
334-
wait: 30
334+
wait_after: 30
335335
- command: start # restart node 6
336336
args:
337337
- $CLUSTER:6
338338
flags:
339339
restart: true
340-
wait: 1500
340+
wait_after: 1500
341341
- command: stop # ungraceful shutdown of node 15
342342
args:
343343
- $CLUSTER:15
344-
wait: 30
344+
wait_after: 30
345345
- command: start # restart node 15
346346
args:
347347
- $CLUSTER:15
348348
flags:
349349
restart: true
350-
wait: 1500
350+
wait_after: 1500
351351
- target_name: "Phase-6: Zone Outages"
352352
dependent_targets:
353353
- "Phase-5: Node Restarts"
354354
steps:
355355
- command: stop # ungraceful shutdown of nodes 3, 4 to simulate zone outage
356356
args:
357357
- $CLUSTER:3-4
358-
wait: 300
358+
wait_after: 300
359359
- command: start # restart nodes 3, 4
360360
args:
361361
- $CLUSTER:3-4
362362
flags:
363363
restart: true
364-
wait: 3300
364+
wait_after: 3300
365365
- target_name: "Phase-7: Region Outages"
366366
dependent_targets:
367367
- "Phase-6: Zone Outages"
368368
steps:
369369
- command: stop # ungraceful shutdown of nodes 11-15 to simulate region outage
370370
args:
371371
- $CLUSTER:11-15
372-
wait: 300
372+
wait_after: 300
373373
- command: start # restart nodes 11-15
374374
args:
375375
- $CLUSTER:11-15

0 commit comments

Comments
 (0)