Skip to content

Commit 1e8d027

Browse files
committed
startup: unskip TestStartupFailureRandomRange; disable UA; improve logs
TestStartupFailureRandomRange is marked as a nightly-only test. However, given the intricacies of how various environment variables are passed into the test infrastructure, this test is effectively never run. Update the skip logic to build on top of #153135, allowing the test to be run in nightly-only stress runs. Update the test to disable multi-tenant mode, which currently causes the test to fail. Fixes #123908. Release note: None.
1 parent ffca318 commit 1e8d027

File tree

1 file changed

+20
-11
lines changed

1 file changed

+20
-11
lines changed

pkg/util/startup/startup_test.go

Lines changed: 20 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -85,18 +85,13 @@ func TestStartupFailure(t *testing.T) {
8585
func TestStartupFailureRandomRange(t *testing.T) {
8686
defer leaktest.AfterTest(t)()
8787
defer log.Scope(t).Close(t)
88-
// This test takes 30s and so we don't want it to run in the "blocking path"
89-
// of CI at all, and we also don't want to stress it in nightlies as part of
90-
// a big package (where it will take a lot of time that could be spent running
91-
// "faster" tests). In this package, it is the only test and so it's fine to
92-
// run it under nightly (skipping race builds because with many nodes they are
93-
// very resource intensive and tend to collapse).
88+
// This test takes 30s+, so we don't want it to run in the "blocking path" of
89+
// CI. We also skip race builds as the test uses multiple nodes, which can
90+
// cause the test to grind to a halt and flake out.
9491
skip.UnderRace(t, "6 nodes with replication is too slow for race")
95-
skip.WithIssue(t, 9999999999, "nicktrav will have a fix shortly")
96-
// TODO(nicktrav): re-enable only under nightlies once the fix is out.
97-
//if !skip.NightlyStress() {
98-
// skip.IgnoreLint(t, "test takes 30s to run due to circuit breakers and timeouts")
99-
//}
92+
if !skip.Stress() {
93+
skip.IgnoreLint(t, "test takes 30s to run due to circuit breakers and timeouts")
94+
}
10095

10196
rng, seed := randutil.NewTestRand()
10297
t.Log("TestStartupFailureRandomRange using seed", seed)
@@ -148,6 +143,11 @@ func runCircuitBreakerTestForKey(
148143
args := base.TestClusterArgs{
149144
ServerArgsPerNode: make(map[int]base.TestServerArgs),
150145
ReusableListenerReg: lReg,
146+
// TODO(travers): This test has a lingering issue when run in UA mode
147+
// that needs to be addressed before the following can be removed.
148+
ServerArgs: base.TestServerArgs{
149+
DefaultTestTenant: base.TestIsSpecificToStorageLayerAndNeedsASystemTenant,
150+
},
151151
}
152152
var enableFaults atomic.Bool
153153
for i := 0; i < nodes; i++ {
@@ -229,6 +229,7 @@ func runCircuitBreakerTestForKey(
229229
return d.StartKey
230230
}
231231

232+
t.Log("segmenting ranges")
232233
var rangeSpans []roachpb.Span
233234
r, err := c.QueryContext(ctx, "select range_id, start_key, end_key from crdb_internal.ranges_no_leases order by start_key")
234235
require.NoError(t, err, "failed to query ranges")
@@ -243,9 +244,11 @@ func runCircuitBreakerTestForKey(
243244
})
244245
}
245246
good, bad := faultyRangeSelector(rangeSpans)
247+
t.Logf("prepping %d good ranges", len(good))
246248
for _, span := range good {
247249
prepRange(span.Key, false)
248250
}
251+
t.Logf("prepping %d faulty ranges", len(bad))
249252
var ranges []string
250253
for _, span := range bad {
251254
prepRange(span.Key, true)
@@ -254,27 +257,33 @@ func runCircuitBreakerTestForKey(
254257
rangesList := fmt.Sprintf("[%s]", strings.Join(ranges, ", "))
255258

256259
// Remove nodes permanently to only leave quorum on planned ranges.
260+
t.Log("stopping n3 and n4")
257261
tc.StopServer(3)
258262
tc.StopServer(4)
259263

260264
// Stop node with replicas that would leave ranges without quorum.
265+
t.Log("stopping n5")
261266
tc.StopServer(5)
262267

263268
// Probe compromised ranges to trigger circuit breakers on them. If we don't
264269
// do this, then restart queries will wait for quorum to be reestablished with
265270
// restarting node without failing.
271+
t.Logf("waiting for %d compromised ranges to trigger CBs", len(bad))
266272
var wg sync.WaitGroup
267273
wg.Add(len(bad))
268274
for _, span := range bad {
269275
go func(key roachpb.Key) {
270276
defer wg.Done()
277+
t.Logf("waiting for compromised range: %s", key)
271278
_ = db.Put(context.Background(), keys.RangeProbeKey(roachpb.RKey(key)), "")
279+
t.Logf("done waiting for compromised range: %s", key)
272280
}(span.Key)
273281
}
274282
wg.Wait()
275283

276284
// Restart node and check that it succeeds in reestablishing range quorum
277285
// necessary for startup actions.
286+
t.Log("starting n5")
278287
require.NoError(t, lReg.MustGet(t, 5).Reopen())
279288
err = tc.RestartServer(5)
280289
require.NoError(t, err, "restarting server with range(s) %s tripping circuit breaker", rangesList)

0 commit comments

Comments
 (0)