Skip to content

Commit 1cddb3a

Browse files
authored
Merge pull request #156313 from cockroachdb/blathers/backport-staging-v25.3.4-155634
staging-v25.3.4: release-25.3: server: return draining progress if the server controller is draining
2 parents 1afaade + 3cf9de4 commit 1cddb3a

File tree

3 files changed

+54
-9
lines changed

3 files changed

+54
-9
lines changed

pkg/server/drain.go

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -389,10 +389,13 @@ func (s *drainServer) drainInner(
389389
return s.drainNode(ctx, reporter, verbose)
390390
}
391391

392-
// isDraining returns true if either SQL client connections are being drained
393-
// or if one of the stores on the node is not accepting replicas.
392+
// isDraining returns true if the SQL client connections are being drained,
393+
// if one of the stores on the node is not accepting replicas, or if the
394+
// server controller is being drained.
394395
func (s *drainServer) isDraining() bool {
395-
return s.sqlServer.pgServer.IsDraining() || (s.kvServer.node != nil && s.kvServer.node.IsDraining())
396+
return s.sqlServer.pgServer.IsDraining() ||
397+
(s.kvServer.node != nil && s.kvServer.node.IsDraining()) ||
398+
(s.serverCtl != nil && s.serverCtl.IsDraining())
396399
}
397400

398401
// drainClients starts draining the SQL layer.
@@ -404,7 +407,10 @@ func (s *drainServer) drainClients(
404407
var cancel context.CancelFunc
405408
ctx, cancel = context.WithCancel(ctx)
406409
defer cancel()
407-
shouldDelayDraining := !s.isDraining()
410+
411+
// If this is the first time we are draining the clients, we delay draining
412+
// to allow health probes to notice that the node is not ready.
413+
shouldDelayDraining := !s.sqlServer.pgServer.IsDraining()
408414

409415
// Set the gRPC mode of the node to "draining" and mark the node as "not ready".
410416
// Probes to /health?ready=1 will now notice the change in the node's readiness.

pkg/server/drain_test.go

Lines changed: 40 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ package server_test
77

88
import (
99
"context"
10+
gosql "database/sql"
1011
"io"
1112
"testing"
1213
"time"
@@ -20,6 +21,7 @@ import (
2021
"github.com/cockroachdb/cockroach/pkg/sql/sqlstats/persistedsqlstats/sqlstatstestutil"
2122
"github.com/cockroachdb/cockroach/pkg/testutils"
2223
"github.com/cockroachdb/cockroach/pkg/testutils/serverutils"
24+
"github.com/cockroachdb/cockroach/pkg/testutils/skip"
2325
"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
2426
"github.com/cockroachdb/cockroach/pkg/testutils/testcluster"
2527
"github.com/cockroachdb/cockroach/pkg/util/grpcutil"
@@ -34,15 +36,39 @@ import (
3436
func TestDrain(t *testing.T) {
3537
defer leaktest.AfterTest(t)()
3638
defer log.Scope(t).Close(t)
37-
doTestDrain(t)
39+
testutils.RunTrueAndFalse(t, "secondaryTenants", func(t *testing.T, secondaryTenants bool) {
40+
doTestDrain(t, secondaryTenants)
41+
})
3842
}
3943

4044
// doTestDrain runs the drain test.
41-
func doTestDrain(tt *testing.T) {
45+
func doTestDrain(tt *testing.T, secondaryTenants bool) {
46+
if secondaryTenants {
47+
// Draining the tenant server takes a long time under stress. See #155229.
48+
skip.UnderRace(tt)
49+
}
4250
var drainSleepCallCount = 0
4351
t := newTestDrainContext(tt, &drainSleepCallCount)
4452
defer t.Close()
4553

54+
var tenantConn *gosql.DB
55+
if secondaryTenants {
56+
_, err := t.tc.ServerConn(0).Exec("CREATE TENANT hello")
57+
require.NoError(tt, err)
58+
_, err = t.tc.ServerConn(0).Exec("ALTER TENANT hello START SERVICE SHARED")
59+
require.NoError(tt, err)
60+
61+
// Wait for the tenant to be ready by establishing a test connection.
62+
testutils.SucceedsSoon(tt, func() error {
63+
tenantConn, err = t.tc.Server(0).SystemLayer().SQLConnE(
64+
serverutils.DBName("cluster:hello/defaultdb"))
65+
if err != nil {
66+
return err
67+
}
68+
return tenantConn.Ping()
69+
})
70+
}
71+
4672
// Issue a probe. We're not draining yet, so the probe should
4773
// reflect that.
4874
resp := t.sendProbe()
@@ -54,7 +80,12 @@ func doTestDrain(tt *testing.T) {
5480
resp = t.sendDrainNoShutdown()
5581
t.assertDraining(resp, true)
5682
t.assertRemaining(resp, true)
57-
t.assertEqual(1, drainSleepCallCount)
83+
if !secondaryTenants {
84+
// If we have secondary tenants, we might not have waited before draining
85+
// the clients at this point because we might have returned early if we are
86+
// still draining the serverController.
87+
t.assertEqual(1, drainSleepCallCount)
88+
}
5889

5990
// Issue another probe. This checks that the server is still running
6091
// (i.e. Shutdown: false was effective), the draining status is
@@ -64,12 +95,14 @@ func doTestDrain(tt *testing.T) {
6495
t.assertDraining(resp, true)
6596
// probe-only has no remaining.
6697
t.assertRemaining(resp, false)
67-
t.assertEqual(1, drainSleepCallCount)
68-
98+
if !secondaryTenants {
99+
t.assertEqual(1, drainSleepCallCount)
100+
}
69101
// Repeat drain commands until we verify that there are zero remaining leases
70102
// (i.e. complete). Also validate that the server did not sleep again.
71103
testutils.SucceedsSoon(t, func() error {
72104
resp = t.sendDrainNoShutdown()
105+
t.Logf("drain response: %+v", resp)
73106
if !resp.IsDraining {
74107
return errors.Newf("expected draining")
75108
}
@@ -79,6 +112,7 @@ func doTestDrain(tt *testing.T) {
79112
}
80113
return nil
81114
})
115+
82116
t.assertEqual(1, drainSleepCallCount)
83117

84118
// Now issue a drain request without drain but with shutdown.
@@ -189,6 +223,7 @@ func newTestDrainContext(t *testing.T, drainSleepCallCount *int) *testDrainConte
189223
// We need to start the cluster insecure in order to not
190224
// care about TLS settings for the RPC client connection.
191225
ServerArgs: base.TestServerArgs{
226+
DefaultTestTenant: base.TestControlsTenantsExplicitly,
192227
Knobs: base.TestingKnobs{
193228
Server: &server.TestingKnobs{
194229
DrainSleepFn: func(time.Duration) {

pkg/server/server_controller.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,10 @@ func (s *serverController) SetDraining() {
168168
}
169169
}
170170

171+
func (s *serverController) IsDraining() bool {
172+
return s.draining.Load()
173+
}
174+
171175
// start monitors changes to the service mode and updates
172176
// the running servers accordingly.
173177
func (c *serverController) start(ctx context.Context, ie isql.Executor) error {

0 commit comments

Comments
 (0)