Skip to content

Commit 164c8cb

Browse files
craig[bot]knz
andcommitted
106414: server: properly check existence of tenant record r=yuzefovich,stevendanna a=knz PR extracted from cockroachdb#105441. Informs cockroachdb#83650. Part of cockroachdb#98431. Epic: CRDB-26691 Prior to this patch, the API on the tenant info watcher would return an "undefined" metadata payload if called prior to the tenant record being created. This was most visible in the following scenario: 1. new cluster starts. watcher+rangefeed start successfully (tenant table empty) 2. tenant client connects. At this time there is no metadata for its tenant ID, so the metadata payload is available but empty. 3. CREATE TENANT is executed for the new tenant. 4. only then (later) the rangefeed introduces the metadata into the cache. This is insufficient for use by the KV tenant client RPCs: there we only want to accept incoming requests from tenant clients after we actually have seen metadata for them. This patch improves the situation by checking whether the tenant record is present before returning bogus data to the SQL tenant client. Simultaneously, we add error handling logic in the SQL tenant client to handle this case gracefully. In a mixed-version deployment (with and without this patch applied), the following behaviors will occur if one starts the SQL tenant server without a tenant record defined: - Unpatched server: Late startup error in client with "database 1 does not exist". - Patched server: Client loops forever, waiting for tenant record. Behavior when the tenant record is created shortly *after* the SQL tenant server starts up: - Unpatched server: Inconsistent / non-deterministic behavior. - Patched server: Clean startup of client after tenant record appears. Co-authored-by: Raphael 'kena' Poss <[email protected]>
2 parents 8d10f15 + 35ea6d5 commit 164c8cb

File tree

39 files changed

+912
-213
lines changed

39 files changed

+912
-213
lines changed

pkg/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1424,6 +1424,7 @@ GO_TARGETS = [
14241424
"//pkg/kv/kvserver:kvserver_test",
14251425
"//pkg/kv:kv",
14261426
"//pkg/kv:kv_test",
1427+
"//pkg/multitenant/mtinfo:mtinfo",
14271428
"//pkg/multitenant/mtinfopb:mtinfopb",
14281429
"//pkg/multitenant/multitenantcpu:multitenantcpu",
14291430
"//pkg/multitenant/multitenantio:multitenantio",

pkg/base/test_server_args.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -443,6 +443,10 @@ type TestTenantArgs struct {
443443
// Skip check for tenant existence when running the test.
444444
SkipTenantCheck bool
445445

446+
// Do not wait for tenant record cache to be populated before
447+
// starting a tenant server.
448+
SkipWaitForTenantCache bool
449+
446450
// Locality is used to initialize the same-named field on the server.Config
447451
// struct.
448452
Locality roachpb.Locality

pkg/ccl/serverccl/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ go_test(
5656
"//pkg/ccl/utilccl/licenseccl",
5757
"//pkg/clusterversion",
5858
"//pkg/jobs",
59+
"//pkg/kv/kvpb",
5960
"//pkg/kv/kvserver/liveness",
6061
"//pkg/kv/kvserver/liveness/livenesspb",
6162
"//pkg/multitenant/tenantcapabilities",

pkg/ccl/serverccl/server_sql_test.go

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@ package serverccl
1010

1111
import (
1212
"context"
13-
"errors"
1413
"fmt"
1514
"io"
1615
"net/http"
@@ -21,9 +20,11 @@ import (
2120
"github.com/cockroachdb/cockroach/pkg/base"
2221
"github.com/cockroachdb/cockroach/pkg/ccl"
2322
"github.com/cockroachdb/cockroach/pkg/ccl/utilccl/licenseccl"
23+
"github.com/cockroachdb/cockroach/pkg/kv/kvpb"
2424
"github.com/cockroachdb/cockroach/pkg/multitenant/tenantcapabilities"
2525
"github.com/cockroachdb/cockroach/pkg/roachpb"
2626
"github.com/cockroachdb/cockroach/pkg/security"
27+
"github.com/cockroachdb/cockroach/pkg/server"
2728
"github.com/cockroachdb/cockroach/pkg/server/systemconfigwatcher/systemconfigwatchertest"
2829
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
2930
"github.com/cockroachdb/cockroach/pkg/sql"
@@ -34,6 +35,7 @@ import (
3435
"github.com/cockroachdb/cockroach/pkg/util/envutil"
3536
"github.com/cockroachdb/cockroach/pkg/util/leaktest"
3637
"github.com/cockroachdb/cockroach/pkg/util/log"
38+
"github.com/cockroachdb/errors"
3739
"github.com/lib/pq"
3840
"github.com/stretchr/testify/require"
3941
)
@@ -327,8 +329,16 @@ func TestNonExistentTenant(t *testing.T) {
327329
TenantID: serverutils.TestTenantID(),
328330
DisableCreateTenant: true,
329331
SkipTenantCheck: true,
332+
333+
SkipWaitForTenantCache: true,
334+
335+
TestingKnobs: base.TestingKnobs{
336+
Server: &server.TestingKnobs{
337+
ShutdownTenantConnectorEarlyIfNoRecordPresent: true,
338+
},
339+
},
330340
})
331-
require.EqualError(t, err, `database "[1]" does not exist`)
341+
require.True(t, errors.Is(err, &kvpb.MissingRecordError{}))
332342
}
333343

334344
// TestTenantRowIDs confirms `unique_rowid()` works as expected in a

pkg/kv/kvclient/kvtenant/connector.go

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,8 @@ type connector struct {
160160
defaultZoneCfg *zonepb.ZoneConfig
161161
addrs []string
162162

163+
earlyShutdownIfMissingTenantRecord bool
164+
163165
startCh chan struct{} // closed when connector has started up
164166
startErr error
165167

@@ -251,6 +253,8 @@ func NewConnector(cfg ConnectorConfig, addrs []string) Connector {
251253
rpcRetryOptions: cfg.RPCRetryOptions,
252254
defaultZoneCfg: cfg.DefaultZoneConfig,
253255
addrs: addrs,
256+
257+
earlyShutdownIfMissingTenantRecord: cfg.ShutdownTenantConnectorEarlyIfNoRecordPresent,
254258
}
255259

256260
c.mu.nodeDescs = make(map[roachpb.NodeID]*roachpb.NodeDescriptor)
@@ -311,7 +315,7 @@ func (c *connector) Start(ctx context.Context) error {
311315

312316
func (c *connector) internalStart(ctx context.Context) error {
313317
gossipStartupCh := make(chan struct{})
314-
settingsStartupCh := make(chan struct{})
318+
settingsStartupCh := make(chan error)
315319
bgCtx := c.AnnotateCtx(context.Background())
316320

317321
if err := c.rpcContext.Stopper.RunAsyncTask(bgCtx, "connector-gossip", func(ctx context.Context) {
@@ -339,9 +343,13 @@ func (c *connector) internalStart(ctx context.Context) error {
339343
case <-gossipStartupCh:
340344
log.Infof(ctx, "kv connector gossip subscription started")
341345
gossipStartupCh = nil
342-
case <-settingsStartupCh:
343-
log.Infof(ctx, "kv connector tenant settings started")
346+
case err := <-settingsStartupCh:
344347
settingsStartupCh = nil
348+
if err != nil {
349+
log.Infof(ctx, "kv connector initialization error: %v", err)
350+
return err
351+
}
352+
log.Infof(ctx, "kv connector tenant settings started")
345353
case <-ctx.Done():
346354
return ctx.Err()
347355
case <-c.rpcContext.Stopper.ShouldQuiesce():

pkg/kv/kvclient/kvtenant/connector_factory.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,12 @@ type ConnectorConfig struct {
3131
RPCContext *rpc.Context
3232
RPCRetryOptions retry.Options
3333
DefaultZoneConfig *zonepb.ZoneConfig
34+
35+
// ShutdownTenantConnectorEarlyIfNoRecordPresent, if set, will cause the
36+
// tenant connector to be shut down early if no record is present in the
37+
// system.tenants table. This is useful for tests that want to verify that
38+
// the tenant connector can't start when the record doesn't exist.
39+
ShutdownTenantConnectorEarlyIfNoRecordPresent bool
3440
}
3541

3642
// KVAddressConfig encompasses the network addresses, pointing to KV nodes,

pkg/kv/kvclient/kvtenant/setting_overrides.go

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ package kvtenant
1313
import (
1414
"context"
1515
"io"
16+
"time"
1617

1718
"github.com/cockroachdb/cockroach/pkg/kv/kvpb"
1819
"github.com/cockroachdb/cockroach/pkg/settings"
@@ -24,7 +25,7 @@ import (
2425
// runTenantSettingsSubscription listens for tenant setting override changes.
2526
// It closes the given channel once the initial set of overrides were obtained.
2627
// Exits when the context is done.
27-
func (c *connector) runTenantSettingsSubscription(ctx context.Context, startupCh chan struct{}) {
28+
func (c *connector) runTenantSettingsSubscription(ctx context.Context, startupCh chan<- error) {
2829
for ctx.Err() == nil {
2930
client, err := c.getClient(ctx)
3031
if err != nil {
@@ -60,9 +61,29 @@ func (c *connector) runTenantSettingsSubscription(ctx context.Context, startupCh
6061
break
6162
}
6263
if e.Error != (errorspb.EncodedError{}) {
63-
// Hard logical error. We expect io.EOF next.
64+
// Hard logical error.
6465
err := errors.DecodeError(ctx, e.Error)
6566
log.Errorf(ctx, "error consuming TenantSettings RPC: %v", err)
67+
if startupCh != nil && errors.Is(err, &kvpb.MissingRecordError{}) && c.earlyShutdownIfMissingTenantRecord {
68+
startupCh <- err
69+
close(startupCh)
70+
c.tryForgetClient(ctx, client)
71+
return
72+
}
73+
// Other errors, or configuration tells us to continue if the
74+
// tenant record in missing: in that case we continue the
75+
// loop. We're expecting io.EOF from the server next, which
76+
// will lead us to reconnect and retry.
77+
//
78+
// However, don't hammer the server with retries if there was
79+
// an actual error reported: we wait a bit before the retry.
80+
select {
81+
case <-time.After(1 * time.Second):
82+
83+
case <-ctx.Done():
84+
// Shutdown or cancellation short circuits the wait and retry.
85+
return
86+
}
6687
continue
6788
}
6889

@@ -79,6 +100,7 @@ func (c *connector) runTenantSettingsSubscription(ctx context.Context, startupCh
79100
log.Infof(ctx, "received initial tenant settings")
80101

81102
if startupCh != nil {
103+
startupCh <- nil
82104
close(startupCh)
83105
startupCh = nil
84106
}

pkg/kv/kvpb/BUILD.bazel

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,7 @@ proto_library(
9797
deps = [
9898
"//pkg/kv/kvserver/concurrency/lock:lock_proto",
9999
"//pkg/kv/kvserver/readsummary/rspb:rspb_proto",
100+
"//pkg/multitenant/tenantcapabilities/tenantcapabilitiespb:tenantcapabilitiespb_proto",
100101
"//pkg/roachpb:roachpb_proto",
101102
"//pkg/settings:settings_proto",
102103
"//pkg/sql/catalog/fetchpb:fetchpb_proto",
@@ -118,6 +119,7 @@ go_proto_library(
118119
deps = [
119120
"//pkg/kv/kvserver/concurrency/lock",
120121
"//pkg/kv/kvserver/readsummary/rspb",
122+
"//pkg/multitenant/tenantcapabilities/tenantcapabilitiespb",
121123
"//pkg/roachpb",
122124
"//pkg/settings",
123125
"//pkg/sql/catalog/fetchpb",

pkg/kv/kvpb/api.proto

Lines changed: 90 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ import "util/tracing/tracingpb/recorded_span.proto";
2828
import "util/tracing/tracingpb/tracing.proto";
2929
import "gogoproto/gogo.proto";
3030
import "google/protobuf/duration.proto";
31+
import "multitenant/tenantcapabilities/tenantcapabilitiespb/capabilities.proto";
3132

3233
// ReadConsistencyType specifies what type of consistency is observed
3334
// during read operations.
@@ -3117,19 +3118,81 @@ message TenantSettingsRequest {
31173118
TenantID tenant_id = 1 [(gogoproto.customname) = "TenantID", (gogoproto.nullable) = false];
31183119
}
31193120

3120-
// TenantSettingsEvent is used to report changes to setting overrides for a
3121-
// particular tenant.
3122-
// The protocol is as follows:
3121+
// TenantSettingsEvent is used to report changes to setting overrides and
3122+
// other metadata for a particular tenant.
3123+
//
3124+
// When used to report changes to setting overrides, the protocol is as follows:
31233125
// - When a tenant server first connects, a non-incremental TenantSettingsEvent
31243126
// settings event is sent to the client for each precedence value.
31253127
// This reports to the client the initial values of all the cluster setting
31263128
// overrides.
31273129
// - Afterwards, more TenantSettingsEvent are sent (with Incremental set or not)
31283130
// whenever settings are updated.
31293131
//
3132+
// TODO(knz): The name of the message should be updated to reflect
3133+
// its more general purpose.
3134+
//
31303135
// Note: this API is designed to allow flexibility of implementation on the
31313136
// server side (e.g. to make it maintain very little state per tenant).
31323137
message TenantSettingsEvent {
3138+
enum EventType {
3139+
// The event is about an update to cluster setting overrides.
3140+
// This must be zero for backward-compatibility with pre-v23.1
3141+
// CockroachDB.
3142+
SETTING_EVENT = 0;
3143+
3144+
// Note: for compatibility with pre-23.1 tenant clients, it is
3145+
// important that all event types that convey data about the
3146+
// initial state of a tenant service be sent after the first
3147+
// SETTING_EVENT message that communicates overrides (for one of
3148+
// the two precedence levels), and before the second one (for the
3149+
// other precedence level).
3150+
//
3151+
// This is necessary because of a combination of factors:
3152+
//
3153+
// - For compatibility with older version tenant clients,
3154+
// all non-setting event types must fake being a no-op
3155+
// setting event (see the docstring on event_type below).
3156+
// A no-op fake setting event must have `Incremental` set to
3157+
// `true`.
3158+
// - Meanwhile, older version tenant clients also assert that the
3159+
// very first message sent by the server must have `Incremental`
3160+
// set to `false`.
3161+
//
3162+
// This means we can only send events of other types after the
3163+
// first setting overrides event.
3164+
//
3165+
// - Then, separately, newer version tenant clients also
3166+
// synchronize their startup on the reception of a tenant
3167+
// setting event for each precedence level. This is because
3168+
// these tenant clients must, in turn, remain compatible with
3169+
// older version *KV servers* that do not send other event types
3170+
// and send just 2 setting overrides events initially.
3171+
//
3172+
// This means we cannot send other event types after the second
3173+
// setting overrides event.
3174+
3175+
// The event is about an update to the tenant's metadata.
3176+
METADATA_EVENT = 1;
3177+
}
3178+
3179+
// The type of event. For backward-compatibility with early 23.1
3180+
// servers that do not check the event_type field, server of any
3181+
// 23.2 version must ensure all events of other types than
3182+
// SETTING_EVENT must appear as a no-op event when interpreted as a
3183+
// setting event. This means: 1) set precedence to any value; 2) set
3184+
// incremental to true 3) provide a nil slice in overrides.
3185+
// This constraint can be lifted once all servers are at least 23.2.
3186+
EventType event_type = 5;
3187+
3188+
// If non-nil, the other fields will be empty and this will be the final event
3189+
// sent on the stream before it is terminated.
3190+
errorspb.EncodedError error = 4 [(gogoproto.nullable) = false];
3191+
3192+
//
3193+
// Fields that pertain to cluster setting updates.
3194+
//
3195+
31333196
enum Precedence {
31343197
// Sentinel value to ensure that the 0 value is not a valid precedence.
31353198
INVALID = 0;
@@ -3160,9 +3223,30 @@ message TenantSettingsEvent {
31603223
// fields).
31613224
repeated TenantSetting overrides = 3 [(gogoproto.nullable) = false];
31623225

3163-
// If non-nil, the other fields will be empty and this will be the final event
3164-
// sent on the stream before it is terminated.
3165-
errorspb.EncodedError error = 4 [(gogoproto.nullable) = false];
3226+
//
3227+
// Fields that pertain to other tenant metadata updates.
3228+
//
3229+
3230+
// Name is the tenant's current name.
3231+
string name = 6 [(gogoproto.casttype) = "github.com/cockroachdb/cockroach/pkg/roachpb.TenantName"];
3232+
3233+
// Capabilities is the tenant's current capabilities.
3234+
// Note that this field is advisory: the server may know of a more
3235+
// recent (and different) set of capabilities, and server-side
3236+
// capabilities checks always prevail.
3237+
cockroach.multitenant.tenantcapabilitiespb.TenantCapabilities capabilities = 7;
3238+
3239+
// DataState is the tenant's current data state.
3240+
// TODO(knz): This should really be casted to go type mtinfopb.TenantDataState but we
3241+
// can't do that yet due to a dependency cycle. We should break the cycle.
3242+
uint32 data_state = 8;
3243+
3244+
// ServiceMode is the tenant's current service mode.
3245+
// TODO(knz): This should really be casted to go type mtinfopb.TenantServiceMode but we
3246+
// can't do that yet due to a dependency cycle. We should break the cycle.
3247+
uint32 service_mode = 9;
3248+
3249+
// NEXT ID: 10
31663250
}
31673251

31683252
// TenantSetting contains the name and value of a tenant setting.

pkg/kv/kvpb/errors.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ import (
2525
"github.com/cockroachdb/errors"
2626
_ "github.com/cockroachdb/errors/extgrpc" // register EncodeError support for gRPC Status
2727
"github.com/cockroachdb/redact"
28+
"github.com/gogo/protobuf/proto"
2829
)
2930

3031
// Printer is an interface that lets us use what's common between the
@@ -1536,6 +1537,25 @@ func NewNotLeaseHolderErrorWithSpeculativeLease(
15361537
return NewNotLeaseHolderError(speculativeLease, proposerStoreID, rangeDesc, msg)
15371538
}
15381539

1540+
// MissingRecordError is reported when a record is missing.
1541+
type MissingRecordError struct{}
1542+
1543+
func (e *MissingRecordError) Error() string {
1544+
return redact.Sprint(e).StripMarkers()
1545+
}
1546+
1547+
func (e *MissingRecordError) SafeFormatError(p errors.Printer) (next error) {
1548+
p.Printf("missing record")
1549+
return nil
1550+
}
1551+
1552+
func init() {
1553+
errors.RegisterLeafDecoder(errors.GetTypeKey((*MissingRecordError)(nil)), func(_ context.Context, _ string, _ []string, _ proto.Message) error {
1554+
return &MissingRecordError{}
1555+
})
1556+
}
1557+
1558+
var _ errors.SafeFormatter = &MissingRecordError{}
15391559
var _ errors.SafeFormatter = &NotLeaseHolderError{}
15401560
var _ errors.SafeFormatter = &RangeNotFoundError{}
15411561
var _ errors.SafeFormatter = &RangeKeyMismatchError{}

0 commit comments

Comments
 (0)