Skip to content

Commit e1f4739

Browse files
committed
roachtest: wait for stability in rebalance load
The `rebalance/by-load` roachtests would immediately pass once the balance target was hit. However, it was possible that the cluster was only transiently balanced. Bump the timeout of all tests to be uniformly 10 minutes, from a previous 5-10 minutes, require that the load remains balanced for at least 1 minute to pass. Informs: cockroachdb#107247 Release note: None
1 parent 480c277 commit e1f4739

File tree

1 file changed

+18
-8
lines changed

1 file changed

+18
-8
lines changed

pkg/cmd/roachtest/tests/rebalance_load.go

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,9 @@ const (
4848
meanCPUTolerance = 0.15
4949
// statSamplePeriod is the period at which timeseries stats are sampled.
5050
statSamplePeriod = 10 * time.Second
51+
// stableDuration is the duration which the cluster's load must remain
52+
// balanced for to pass.
53+
stableDuration = time.Minute
5154
)
5255

5356
func registerRebalanceLoad(r registry.Registry) {
@@ -150,7 +153,8 @@ func registerRebalanceLoad(r registry.Registry) {
150153
}
151154

152155
var reason string
153-
var done bool
156+
var balancedStartTime time.Time
157+
var prevIsBalanced bool
154158
for tBegin := timeutil.Now(); timeutil.Since(tBegin) <= maxDuration; {
155159
// Wait out the sample period initially to allow the timeseries to
156160
// populate meaningful information for the test to query.
@@ -160,14 +164,20 @@ func registerRebalanceLoad(r registry.Registry) {
160164
case <-time.After(statSamplePeriod):
161165
}
162166

167+
now := timeutil.Now()
163168
clusterStoresCPU, err := storeCPUFn(ctx)
164169
if err != nil {
165170
t.L().Printf("unable to get the cluster stores CPU %s\n", err.Error())
171+
continue
166172
}
167-
168-
done, reason = isLoadEvenlyDistributed(clusterStoresCPU, meanCPUTolerance)
173+
var curIsBalanced bool
174+
curIsBalanced, reason = isLoadEvenlyDistributed(clusterStoresCPU, meanCPUTolerance)
169175
t.L().Printf("cpu %s", reason)
170-
if done {
176+
if !prevIsBalanced && curIsBalanced {
177+
balancedStartTime = now
178+
}
179+
prevIsBalanced = curIsBalanced
180+
if prevIsBalanced && now.Sub(balancedStartTime) > stableDuration {
171181
t.Status("successfully achieved CPU balance; waiting for kv to finish running")
172182
cancel()
173183
return nil
@@ -194,7 +204,7 @@ func registerRebalanceLoad(r registry.Registry) {
194204
concurrency = 32
195205
fmt.Printf("lowering concurrency to %d in local testing\n", concurrency)
196206
}
197-
rebalanceLoadRun(ctx, t, c, "leases", 5*time.Minute, concurrency, false /* mixedVersion */)
207+
rebalanceLoadRun(ctx, t, c, "leases", 10*time.Minute, concurrency, false /* mixedVersion */)
198208
},
199209
},
200210
)
@@ -208,7 +218,7 @@ func registerRebalanceLoad(r registry.Registry) {
208218
concurrency = 32
209219
fmt.Printf("lowering concurrency to %d in local testing\n", concurrency)
210220
}
211-
rebalanceLoadRun(ctx, t, c, "leases", 5*time.Minute, concurrency, true /* mixedVersion */)
221+
rebalanceLoadRun(ctx, t, c, "leases", 10*time.Minute, concurrency, true /* mixedVersion */)
212222
},
213223
},
214224
)
@@ -224,7 +234,7 @@ func registerRebalanceLoad(r registry.Registry) {
224234
fmt.Printf("lowering concurrency to %d in local testing\n", concurrency)
225235
}
226236
rebalanceLoadRun(
227-
ctx, t, c, "leases and replicas", 5*time.Minute, concurrency, false, /* mixedVersion */
237+
ctx, t, c, "leases and replicas", 10*time.Minute, concurrency, false, /* mixedVersion */
228238
)
229239
},
230240
},
@@ -240,7 +250,7 @@ func registerRebalanceLoad(r registry.Registry) {
240250
fmt.Printf("lowering concurrency to %d in local testing\n", concurrency)
241251
}
242252
rebalanceLoadRun(
243-
ctx, t, c, "leases and replicas", 5*time.Minute, concurrency, true, /* mixedVersion */
253+
ctx, t, c, "leases and replicas", 10*time.Minute, concurrency, true, /* mixedVersion */
244254
)
245255
},
246256
},

0 commit comments

Comments
 (0)