Skip to content

Commit f594ee2

Browse files
committed
roachtest: update wal-failover/among-stores/with-progress to use process monitor
Epic: none
1 parent e4d1523 commit f594ee2

File tree

1 file changed

+11
-24
lines changed

1 file changed

+11
-24
lines changed

pkg/cmd/roachtest/tests/disk_stall.go

Lines changed: 11 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -11,17 +11,18 @@ import (
1111
"math/rand"
1212
"strconv"
1313
"strings"
14-
"sync"
1514
"time"
1615

1716
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster"
1817
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/clusterstats"
1918
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
2019
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry"
2120
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil"
21+
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/task"
2222
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec"
2323
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
2424
"github.com/cockroachdb/cockroach/pkg/roachprod/install"
25+
"github.com/cockroachdb/cockroach/pkg/roachprod/logger"
2526
"github.com/cockroachdb/cockroach/pkg/roachprod/prometheus"
2627
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
2728
"github.com/stretchr/testify/require"
@@ -435,6 +436,7 @@ func registerDiskStalledWALFailoverWithProgress(r registry.Registry) {
435436
SkipPostValidations: registry.PostValidationNoDeadNodes,
436437
EncryptionSupport: registry.EncryptionMetamorphic,
437438
Leases: registry.MetamorphicLeases,
439+
Monitor: true,
438440
Run: func(ctx context.Context, t test.Test, c cluster.Cluster) {
439441
runDiskStalledWALFailoverWithProgress(ctx, t, c)
440442
},
@@ -484,7 +486,7 @@ func runDiskStalledWALFailoverWithProgress(ctx context.Context, t test.Test, c c
484486

485487
t.Status("starting oscillating workload and disk stall pattern")
486488
testStartedAt := timeutil.Now()
487-
m := c.NewMonitor(ctx, c.CRDBNodes())
489+
g := t.NewGroup(task.WithContext(ctx))
488490

489491
// Setup stats collector.
490492
promCfg := &prometheus.Config{}
@@ -511,7 +513,6 @@ func runDiskStalledWALFailoverWithProgress(ctx context.Context, t test.Test, c c
511513
for timeutil.Since(testStartedAt) < testDuration {
512514
if t.Failed() {
513515
t.Fatalf("test failed, stopping further iterations")
514-
return
515516
}
516517

517518
workloadWaitDur := operationWaitBase + time.Duration(rand.Int63n(int64(waitJitterMax)))
@@ -521,11 +522,7 @@ func runDiskStalledWALFailoverWithProgress(ctx context.Context, t test.Test, c c
521522
workloadStarted := make(chan struct{})
522523
workloadFinished := make(chan struct{})
523524

524-
var wg sync.WaitGroup
525-
wg.Add(1)
526-
m.Go(func(ctx context.Context) error {
527-
defer wg.Done()
528-
525+
g.Go(func(ctx context.Context, _ *logger.Logger) error {
529526
select {
530527
case <-ctx.Done():
531528
t.Fatalf("context done before workload started: %s", ctx.Err())
@@ -540,14 +537,12 @@ func runDiskStalledWALFailoverWithProgress(ctx context.Context, t test.Test, c c
540537
return nil
541538
}
542539
return nil
543-
})
540+
}, task.Name("workload-run"))
544541

545542
// Collecting QPS samples while the workload is running and verify
546543
// that the throughput is within errorTolerance of the mean.
547544
var samples []float64
548-
wg.Add(1)
549-
m.Go(func(ctx context.Context) error {
550-
defer wg.Done()
545+
g.Go(func(ctx context.Context, _ *logger.Logger) error {
551546

552547
// Wait for workload to start.
553548
select {
@@ -602,17 +597,15 @@ func runDiskStalledWALFailoverWithProgress(ctx context.Context, t test.Test, c c
602597

603598
t.Status(fmt.Sprintf("workload finished, %d samples collected", len(samples)))
604599
return nil
605-
})
600+
}, task.Name("qps-sampling"))
606601

607602
// Every 4th iteration, we'll skip the disk stall phase.
608603
if iteration%4 != 0 {
609604
// Calculate next stall phase with jitter.
610605
diskStallWaitDur := operationWaitBase + time.Duration(rand.Int63n(int64(waitJitterMax)))
611606
t.Status("next stall phase in ", diskStallWaitDur)
612607

613-
wg.Add(1)
614-
m.Go(func(ctx context.Context) error {
615-
defer wg.Done()
608+
g.Go(func(ctx context.Context, _ *logger.Logger) error {
616609
select {
617610
case <-ctx.Done():
618611
t.Fatalf("context done before stall started: %s", ctx.Err())
@@ -650,13 +643,13 @@ func runDiskStalledWALFailoverWithProgress(ctx context.Context, t test.Test, c c
650643
}
651644

652645
return nil
653-
})
646+
}, task.Name("disk-stall-phase"))
654647
} else {
655648
t.Status("skipping disk stall phase for this iteration")
656649
}
657650

658651
// Wait for all goroutines to complete.
659-
wg.Wait()
652+
g.Wait()
660653

661654
// Validate throughput samples are within tolerance.
662655
meanThroughput := roachtestutil.GetMeanOverLastN(len(samples), samples)
@@ -698,12 +691,6 @@ func runDiskStalledWALFailoverWithProgress(ctx context.Context, t test.Test, c c
698691
if durInFailover < 10*time.Minute {
699692
t.Errorf("expected s1 to spend at least 10m writing to secondary, but spent %s", durInFailover)
700693
}
701-
702-
// Wait for the workload to finish (if it hasn't already).
703-
m.Wait()
704-
705-
// Shut down the nodes, allowing any devices to be unmounted during cleanup.
706-
c.Stop(ctx, t.L(), option.DefaultStopOpts(), c.CRDBNodes())
707694
}
708695

709696
func getProcessStartMonotonic(

0 commit comments

Comments
 (0)