@@ -85,18 +85,13 @@ func TestStartupFailure(t *testing.T) {
85
85
func TestStartupFailureRandomRange (t * testing.T ) {
86
86
defer leaktest .AfterTest (t )()
87
87
defer log .Scope (t ).Close (t )
88
- // This test takes 30s and so we don't want it to run in the "blocking path"
89
- // of CI at all, and we also don't want to stress it in nightlies as part of
90
- // a big package (where it will take a lot of time that could be spent running
91
- // "faster" tests). In this package, it is the only test and so it's fine to
92
- // run it under nightly (skipping race builds because with many nodes they are
93
- // very resource intensive and tend to collapse).
88
+ // This test takes 30s+, so we don't want it to run in the "blocking path" of
89
+ // CI. We also skip race builds as the test uses multiple nodes, which can
90
+ // cause the test to grind to a halt and flake out.
94
91
skip .UnderRace (t , "6 nodes with replication is too slow for race" )
95
- skip .WithIssue (t , 9999999999 , "nicktrav will have a fix shortly" )
96
- // TODO(nicktrav): re-enable only under nightlies once the fix is out.
97
- //if !skip.NightlyStress() {
98
- // skip.IgnoreLint(t, "test takes 30s to run due to circuit breakers and timeouts")
99
- //}
92
+ if ! skip .Stress () {
93
+ skip .IgnoreLint (t , "test takes 30s to run due to circuit breakers and timeouts" )
94
+ }
100
95
101
96
rng , seed := randutil .NewTestRand ()
102
97
t .Log ("TestStartupFailureRandomRange using seed" , seed )
@@ -148,6 +143,11 @@ func runCircuitBreakerTestForKey(
148
143
args := base.TestClusterArgs {
149
144
ServerArgsPerNode : make (map [int ]base.TestServerArgs ),
150
145
ReusableListenerReg : lReg ,
146
+ // TODO(travers): This test has a lingering issue when run in UA mode
147
+ // that needs to be addressed before the following can be removed.
148
+ ServerArgs : base.TestServerArgs {
149
+ DefaultTestTenant : base .TestIsSpecificToStorageLayerAndNeedsASystemTenant ,
150
+ },
151
151
}
152
152
var enableFaults atomic.Bool
153
153
for i := 0 ; i < nodes ; i ++ {
@@ -229,6 +229,7 @@ func runCircuitBreakerTestForKey(
229
229
return d .StartKey
230
230
}
231
231
232
+ t .Log ("segmenting ranges" )
232
233
var rangeSpans []roachpb.Span
233
234
r , err := c .QueryContext (ctx , "select range_id, start_key, end_key from crdb_internal.ranges_no_leases order by start_key" )
234
235
require .NoError (t , err , "failed to query ranges" )
@@ -243,9 +244,11 @@ func runCircuitBreakerTestForKey(
243
244
})
244
245
}
245
246
good , bad := faultyRangeSelector (rangeSpans )
247
+ t .Logf ("prepping %d good ranges" , len (good ))
246
248
for _ , span := range good {
247
249
prepRange (span .Key , false )
248
250
}
251
+ t .Logf ("prepping %d faulty ranges" , len (bad ))
249
252
var ranges []string
250
253
for _ , span := range bad {
251
254
prepRange (span .Key , true )
@@ -254,27 +257,33 @@ func runCircuitBreakerTestForKey(
254
257
rangesList := fmt .Sprintf ("[%s]" , strings .Join (ranges , ", " ))
255
258
256
259
// Remove nodes permanently to only leave quorum on planned ranges.
260
+ t .Log ("stopping n3 and n4" )
257
261
tc .StopServer (3 )
258
262
tc .StopServer (4 )
259
263
260
264
// Stop node with replicas that would leave ranges without quorum.
265
+ t .Log ("stopping n5" )
261
266
tc .StopServer (5 )
262
267
263
268
// Probe compromised ranges to trigger circuit breakers on them. If we don't
264
269
// do this, then restart queries will wait for quorum to be reestablished with
265
270
// restarting node without failing.
271
+ t .Logf ("waiting for %d compromised ranges to trigger CBs" , len (bad ))
266
272
var wg sync.WaitGroup
267
273
wg .Add (len (bad ))
268
274
for _ , span := range bad {
269
275
go func (key roachpb.Key ) {
270
276
defer wg .Done ()
277
+ t .Logf ("waiting for compromised range: %s" , key )
271
278
_ = db .Put (context .Background (), keys .RangeProbeKey (roachpb .RKey (key )), "" )
279
+ t .Logf ("done waiting for compromised range: %s" , key )
272
280
}(span .Key )
273
281
}
274
282
wg .Wait ()
275
283
276
284
// Restart node and check that it succeeds in reestablishing range quorum
277
285
// necessary for startup actions.
286
+ t .Log ("starting n5" )
278
287
require .NoError (t , lReg .MustGet (t , 5 ).Reopen ())
279
288
err = tc .RestartServer (5 )
280
289
require .NoError (t , err , "restarting server with range(s) %s tripping circuit breaker" , rangesList )
0 commit comments