Skip to content

Commit fd2f8d1

Browse files
craig[bot] and srosenberg committed
Merge #143399
143399: roachtest: "large" cluster tweaks r=herkolategan,darrylwong a=srosenberg While triaging recent failures of the _weekly_ `multi-region/mixed-version` test, we observed that the 1h upgrade timeout was exceeded, potentially due to [1]. We also observed that 10m wasn't sufficient to construct `debug.zip`. Thus, we bump the upgrade timeout to 2h and, for "large" clusters (30 or more nodes), the `FetchDebugZip` timeout to 20m. We also reduce the cluster size from 80 nodes down to 52, in the hope of stabilizing it. Finally, we skip `FetchTimeseriesData` for "large" clusters since we have no direct use for it, and it has non-negligible overhead. [1] #141420 Informs: #121455 Epic: none Release note: None Co-authored-by: Stan Rosenberg <[email protected]>
2 parents 5af35e3 + 6c3f497 commit fd2f8d1

File tree

3 files changed

+15
-5
lines changed

3 files changed

+15
-5
lines changed

pkg/cmd/roachtest/cluster.go

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1395,9 +1395,16 @@ func (c *clusterImpl) FetchDebugZip(
13951395
c.status("fetching debug zip")
13961396

13971397
nodes := selectedNodesOrDefault(opts, c.All())
1398+
// Shuffle the nodes to avoid always trying the same node first.
1399+
rand.Shuffle(len(nodes), func(i, j int) { nodes[i], nodes[j] = nodes[j], nodes[i] })
1400+
defaultTimeout := 10 * time.Minute
1401+
if c.spec.NodeCount >= 30 {
1402+
// For "large" clusters, double the timeout.
1403+
defaultTimeout *= 2
1404+
}
13981405

13991406
// Don't hang forever if we can't fetch the debug zip.
1400-
return timeutil.RunWithTimeout(ctx, "debug zip", 10*time.Minute, func(ctx context.Context) error {
1407+
return timeutil.RunWithTimeout(ctx, "debug zip", defaultTimeout, func(ctx context.Context) error {
14011408
const zipName = "debug.zip"
14021409
path := filepath.Join(c.t.ArtifactsDir(), dest)
14031410
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {

pkg/cmd/roachtest/test_runner.go

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1753,8 +1753,11 @@ func (r *testRunner) collectArtifacts(
17531753
if err := c.FetchPebbleCheckpoints(ctx, t.L()); err != nil {
17541754
t.L().Printf("failed to fetch Pebble checkpoints: %s", err)
17551755
}
1756-
if err := c.FetchTimeseriesData(ctx, t.L()); err != nil {
1757-
t.L().Printf("failed to fetch timeseries data: %s", err)
1756+
// Bypass the collection of timeseries data for "large" clusters.
1757+
if c.spec.NodeCount < 30 {
1758+
if err := c.FetchTimeseriesData(ctx, t.L()); err != nil {
1759+
t.L().Printf("failed to fetch timeseries data: %s", err)
1760+
}
17581761
}
17591762
if err := c.FetchDebugZip(ctx, t.L(), "debug.zip"); err != nil {
17601763
t.L().Printf("failed to collect zip: %s", err)

pkg/cmd/roachtest/tests/mixed_version_multi_region.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ func registerMultiRegionMixedVersion(r registry.Registry) {
3636
}
3737

3838
const (
39-
nodesPerRegion = 20
39+
nodesPerRegion = 13
4040
// These values are somewhat arbitrary: currently, they are
4141
// sufficient to keep the cluster relatively busy (CPU utilization
4242
// varying from 10-60%). In the future, these values might be
@@ -75,7 +75,7 @@ func registerMultiRegionMixedVersion(r registry.Registry) {
7575
mixedversion.NeverUseFixtures,
7676
// Allow migrations to run for a longer period of time due to
7777
// added latency and cluster size.
78-
mixedversion.UpgradeTimeout(1*time.Hour),
78+
mixedversion.UpgradeTimeout(2*time.Hour),
7979
// There are known issues upgrading from older patch releases
8080
// in MR clusters (e.g., #113908), so use the latest patch
8181
// releases to avoid flakes.

0 commit comments

Comments
 (0)