Skip to content

Commit 70d0ff8

Browse files
committed
roachtest: adjust first sampling time in wal-failover/among-stores/with-progress
Increase the time to first sample from 30s to 40s. The rate interval used by the sampling query is 30s, so we should let the workload run for at least that time with a little buffer before sampling.

Release note: None

Closes: #148143
1 parent 1c2c92b commit 70d0ff8

File tree

1 file changed

+11
-6
lines changed

1 file changed

+11
-6
lines changed

pkg/cmd/roachtest/tests/disk_stall.go

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -444,7 +444,7 @@ func registerDiskStalledWALFailoverWithProgress(r registry.Registry) {
444444
func runDiskStalledWALFailoverWithProgress(ctx context.Context, t test.Test, c cluster.Cluster) {
445445
const (
446446
testDuration = 1 * time.Hour
447-
// We'll issue short stalls every 10s to keep us in the failover state.
447+
// We'll issue short stalls every 5s to keep us in the failover state.
448448
stallInterval = 5 * time.Second
449449
shortStallDur = 200 * time.Millisecond
450450
// For each loop, each operation will start after a random wait between [30s, 150s).
@@ -560,13 +560,13 @@ func runDiskStalledWALFailoverWithProgress(ctx context.Context, t test.Test, c c
560560
select {
561561
case <-ctx.Done():
562562
t.Fatalf("context done before workload started: %s", ctx.Err())
563-
case <-time.After(20 * time.Second):
563+
case <-time.After(30 * time.Second):
564564
t.Status("starting QPS sampling")
565565
}
566566

567-
// Calculate approx how many samples we can take before workload ends.
568567
// We want to stop sampling 10s before workload ends to avoid sampling during shutdown.
569-
samplingDuration := operationDur - 30*time.Second // 20s initial wait + 10s buffer at workload end
568+
// We'll take approx. 14 samples with this configuration.
569+
samplingDuration := operationDur - 40*time.Second // 30s initial wait + 10s buffer at workload end
570570
sampleCount := int(samplingDuration / sampleInterval)
571571

572572
sampleTimer := time.NewTicker(sampleInterval)
@@ -620,19 +620,24 @@ func runDiskStalledWALFailoverWithProgress(ctx context.Context, t test.Test, c c
620620
t.Status("starting disk stall")
621621
}
622622
stallStart := timeutil.Now()
623-
// Execute short 200ms stalls every 10s.
623+
// Execute short 200ms stalls every 5s.
624624
for timeutil.Since(stallStart) < operationDur {
625625
select {
626626
case <-ctx.Done():
627627
t.Fatalf("context done while stall induced: %s", ctx.Err())
628628
case <-time.After(stallInterval):
629629
func() {
630-
s.Stall(ctx, c.Node(1))
631630
t.Status("short disk stall on n1")
631+
s.Stall(ctx, c.Node(1))
632632
defer func() {
633+
// NB: We use a background context in the deferred unstall command,
634+
// otherwise on test failure our Unstall calls will be ignored. Leaving
635+
// the disk stalled will prevent artifact collection, making debugging
636+
// difficult.
633637
ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
634638
defer cancel()
635639
s.Unstall(ctx, c.Node(1))
640+
t.Status("unstalled disk on n1")
636641
}()
637642
select {
638643
case <-ctx.Done():

0 commit comments

Comments
 (0)