knz
diff --git a/pkg/BUILD.bazel b/pkg/BUILD.bazel
Lines changed: 1 addition & 0 deletions
diff --git a/pkg/base/test_server_args.go b/pkg/base/test_server_args.go
Lines changed: 4 additions & 0 deletions
diff --git a/pkg/ccl/serverccl/BUILD.bazel b/pkg/ccl/serverccl/BUILD.bazel
Lines changed: 1 addition & 0 deletions
diff --git a/pkg/ccl/serverccl/server_sql_test.go b/pkg/ccl/serverccl/server_sql_test.go
Lines changed: 12 additions & 2 deletions
diff --git a/pkg/kv/kvclient/kvtenant/connector.go b/pkg/kv/kvclient/kvtenant/connector.go
Lines changed: 11 additions & 3 deletions
diff --git a/pkg/kv/kvclient/kvtenant/connector_factory.go b/pkg/kv/kvclient/kvtenant/connector_factory.go
Lines changed: 6 additions & 0 deletions
diff --git a/pkg/kv/kvclient/kvtenant/setting_overrides.go b/pkg/kv/kvclient/kvtenant/setting_overrides.go
Lines changed: 24 additions & 2 deletions
diff --git a/pkg/kv/kvpb/BUILD.bazel b/pkg/kv/kvpb/BUILD.bazel
Lines changed: 2 additions & 0 deletions
diff --git a/pkg/kv/kvpb/api.proto b/pkg/kv/kvpb/api.proto
Lines changed: 90 additions & 6 deletions
diff --git a/pkg/kv/kvpb/errors.go b/pkg/kv/kvpb/errors.go
Lines changed: 20 additions & 0 deletions
@@ -1424,6 +1424,7 @@ GO_TARGETS = [
     "//pkg/kv/kvserver:kvserver_test",
     "//pkg/kv:kv",
     "//pkg/kv:kv_test",
+    "//pkg/multitenant/mtinfo:mtinfo",
     "//pkg/multitenant/mtinfopb:mtinfopb",
     "//pkg/multitenant/multitenantcpu:multitenantcpu",
     "//pkg/multitenant/multitenantio:multitenantio",
 
@@ -443,6 +443,10 @@ type TestTenantArgs struct {
 	// Skip check for tenant existence when running the test.
 	SkipTenantCheck bool
 
+	// Do not wait for tenant record cache to be populated before
+	// starting a tenant server.
+	SkipWaitForTenantCache bool
+
 	// Locality is used to initialize the same-named field on the server.Config
 	// struct.
 	Locality roachpb.Locality
 
@@ -56,6 +56,7 @@ go_test(
         "//pkg/ccl/utilccl/licenseccl",
         "//pkg/clusterversion",
         "//pkg/jobs",
+        "//pkg/kv/kvpb",
         "//pkg/kv/kvserver/liveness",
         "//pkg/kv/kvserver/liveness/livenesspb",
         "//pkg/multitenant/tenantcapabilities",
 
@@ -10,7 +10,6 @@ package serverccl
 
 import (
 	"context"
-	"errors"
 	"fmt"
 	"io"
 	"net/http"
@@ -21,9 +20,11 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/base"
 	"github.com/cockroachdb/cockroach/pkg/ccl"
 	"github.com/cockroachdb/cockroach/pkg/ccl/utilccl/licenseccl"
+	"github.com/cockroachdb/cockroach/pkg/kv/kvpb"
 	"github.com/cockroachdb/cockroach/pkg/multitenant/tenantcapabilities"
 	"github.com/cockroachdb/cockroach/pkg/roachpb"
 	"github.com/cockroachdb/cockroach/pkg/security"
+	"github.com/cockroachdb/cockroach/pkg/server"
 	"github.com/cockroachdb/cockroach/pkg/server/systemconfigwatcher/systemconfigwatchertest"
 	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
 	"github.com/cockroachdb/cockroach/pkg/sql"
@@ -34,6 +35,7 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/util/envutil"
 	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
 	"github.com/cockroachdb/cockroach/pkg/util/log"
+	"github.com/cockroachdb/errors"
 	"github.com/lib/pq"
 	"github.com/stretchr/testify/require"
 )
@@ -327,8 +329,16 @@ func TestNonExistentTenant(t *testing.T) {
 			TenantID:            serverutils.TestTenantID(),
 			DisableCreateTenant: true,
 			SkipTenantCheck:     true,
+
+			SkipWaitForTenantCache: true,
+
+			TestingKnobs: base.TestingKnobs{
+				Server: &server.TestingKnobs{
+					ShutdownTenantConnectorEarlyIfNoRecordPresent: true,
+				},
+			},
 		})
-	require.EqualError(t, err, `database "[1]" does not exist`)
+	require.True(t, errors.Is(err, &kvpb.MissingRecordError{}))
 }
 
 // TestTenantRowIDs confirms `unique_rowid()` works as expected in a
 
@@ -160,6 +160,8 @@ type connector struct {
 	defaultZoneCfg  *zonepb.ZoneConfig
 	addrs           []string
 
+	earlyShutdownIfMissingTenantRecord bool
+
 	startCh  chan struct{} // closed when connector has started up
 	startErr error
 
@@ -251,6 +253,8 @@ func NewConnector(cfg ConnectorConfig, addrs []string) Connector {
 		rpcRetryOptions: cfg.RPCRetryOptions,
 		defaultZoneCfg:  cfg.DefaultZoneConfig,
 		addrs:           addrs,
+
+		earlyShutdownIfMissingTenantRecord: cfg.ShutdownTenantConnectorEarlyIfNoRecordPresent,
 	}
 
 	c.mu.nodeDescs = make(map[roachpb.NodeID]*roachpb.NodeDescriptor)
@@ -311,7 +315,7 @@ func (c *connector) Start(ctx context.Context) error {
 
 func (c *connector) internalStart(ctx context.Context) error {
 	gossipStartupCh := make(chan struct{})
-	settingsStartupCh := make(chan struct{})
+	settingsStartupCh := make(chan error)
 	bgCtx := c.AnnotateCtx(context.Background())
 
 	if err := c.rpcContext.Stopper.RunAsyncTask(bgCtx, "connector-gossip", func(ctx context.Context) {
@@ -339,9 +343,13 @@ func (c *connector) internalStart(ctx context.Context) error {
 		case <-gossipStartupCh:
 			log.Infof(ctx, "kv connector gossip subscription started")
 			gossipStartupCh = nil
-		case <-settingsStartupCh:
-			log.Infof(ctx, "kv connector tenant settings started")
+		case err := <-settingsStartupCh:
 			settingsStartupCh = nil
+			if err != nil {
+				log.Infof(ctx, "kv connector initialization error: %v", err)
+				return err
+			}
+			log.Infof(ctx, "kv connector tenant settings started")
 		case <-ctx.Done():
 			return ctx.Err()
 		case <-c.rpcContext.Stopper.ShouldQuiesce():
 
@@ -31,6 +31,12 @@ type ConnectorConfig struct {
 	RPCContext        *rpc.Context
 	RPCRetryOptions   retry.Options
 	DefaultZoneConfig *zonepb.ZoneConfig
+
+	// ShutdownTenantConnectorEarlyIfNoRecordPresent, if set, will cause the
+	// tenant connector to be shut down early if no record is present in the
+	// system.tenants table. This is useful for tests that want to verify that
+	// the tenant connector can't start when the record doesn't exist.
+	ShutdownTenantConnectorEarlyIfNoRecordPresent bool
 }
 
 // KVAddressConfig encompasses the network addresses, pointing to KV nodes,
 
@@ -13,6 +13,7 @@ package kvtenant
 import (
 	"context"
 	"io"
+	"time"
 
 	"github.com/cockroachdb/cockroach/pkg/kv/kvpb"
 	"github.com/cockroachdb/cockroach/pkg/settings"
@@ -24,7 +25,7 @@ import (
 // runTenantSettingsSubscription listens for tenant setting override changes.
 // It closes the given channel once the initial set of overrides were obtained.
 // Exits when the context is done.
-func (c *connector) runTenantSettingsSubscription(ctx context.Context, startupCh chan struct{}) {
+func (c *connector) runTenantSettingsSubscription(ctx context.Context, startupCh chan<- error) {
 	for ctx.Err() == nil {
 		client, err := c.getClient(ctx)
 		if err != nil {
@@ -60,9 +61,29 @@ func (c *connector) runTenantSettingsSubscription(ctx context.Context, startupCh
 				break
 			}
 			if e.Error != (errorspb.EncodedError{}) {
-				// Hard logical error. We expect io.EOF next.
+				// Hard logical error.
 				err := errors.DecodeError(ctx, e.Error)
 				log.Errorf(ctx, "error consuming TenantSettings RPC: %v", err)
+				if startupCh != nil && errors.Is(err, &kvpb.MissingRecordError{}) && c.earlyShutdownIfMissingTenantRecord {
+					startupCh <- err
+					close(startupCh)
+					c.tryForgetClient(ctx, client)
+					return
+				}
+				// Other errors, or configuration tells us to continue if the
+				// tenant record is missing: in that case we continue the
+				// loop. We're expecting io.EOF from the server next, which
+				// will lead us to reconnect and retry.
+				//
+				// However, don't hammer the server with retries if there was
+				// an actual error reported: we wait a bit before the retry.
+				select {
+				case <-time.After(1 * time.Second):
+
+				case <-ctx.Done():
+					// Shutdown or cancellation short circuits the wait and retry.
+					return
+				}
 				continue
 			}
 
@@ -79,6 +100,7 @@ func (c *connector) runTenantSettingsSubscription(ctx context.Context, startupCh
 				log.Infof(ctx, "received initial tenant settings")
 
 				if startupCh != nil {
+					startupCh <- nil
 					close(startupCh)
 					startupCh = nil
 				}
 
@@ -97,6 +97,7 @@ proto_library(
     deps = [
         "//pkg/kv/kvserver/concurrency/lock:lock_proto",
         "//pkg/kv/kvserver/readsummary/rspb:rspb_proto",
+        "//pkg/multitenant/tenantcapabilities/tenantcapabilitiespb:tenantcapabilitiespb_proto",
         "//pkg/roachpb:roachpb_proto",
         "//pkg/settings:settings_proto",
         "//pkg/sql/catalog/fetchpb:fetchpb_proto",
@@ -118,6 +119,7 @@ go_proto_library(
     deps = [
         "//pkg/kv/kvserver/concurrency/lock",
         "//pkg/kv/kvserver/readsummary/rspb",
+        "//pkg/multitenant/tenantcapabilities/tenantcapabilitiespb",
         "//pkg/roachpb",
         "//pkg/settings",
         "//pkg/sql/catalog/fetchpb",
 
@@ -28,6 +28,7 @@ import "util/tracing/tracingpb/recorded_span.proto";
 import "util/tracing/tracingpb/tracing.proto";
 import "gogoproto/gogo.proto";
 import "google/protobuf/duration.proto";
+import "multitenant/tenantcapabilities/tenantcapabilitiespb/capabilities.proto";
 
 // ReadConsistencyType specifies what type of consistency is observed
 // during read operations.
@@ -3117,19 +3118,81 @@ message TenantSettingsRequest {
   TenantID tenant_id = 1 [(gogoproto.customname) = "TenantID", (gogoproto.nullable) = false];
 }
 
-// TenantSettingsEvent is used to report changes to setting overrides for a
-// particular tenant.
-// The protocol is as follows:
+// TenantSettingsEvent is used to report changes to setting overrides and
+// other metadata for a particular tenant.
+//
+// When used to report changes to setting overrides, the protocol is as follows:
 // - When a tenant server first connects, a non-incremental TenantSettingsEvent
 //   settings event is sent to the client for each precedence value.
 //   This reports to the client the initial values of all the cluster setting
 //   overrides.
 // - Afterwards, more TenantSettingsEvent are sent (with Incremental set or not)
 //   whenever settings are updated.
 //
+// TODO(knz): The name of the message should be updated to reflect
+// its more general purpose.
+//
 // Note: this API is designed to allow flexibility of implementation on the
 // server side (e.g. to make it maintain very little state per tenant).
 message TenantSettingsEvent {
+  enum EventType {
+    // The event is about an update to cluster setting overrides.
+    // This must be zero for backward-compatibility with pre-v23.1
+    // CockroachDB.
+    SETTING_EVENT = 0;
+
+    // Note: for compatibility with pre-23.1 tenant clients, it is
+    // important that all event types that convey data about the
+    // initial state of a tenant service be sent after the first
+    // SETTING_EVENT message that communicates overrides (for one of
+    // the two precedence levels), and before the second one (for the
+    // other precedence level).
+    //
+    // This is necessary because of a combination of factors:
+    //
+    // - For compatibility with older version tenant clients,
+    //   all non-setting event types must fake being a no-op
+    //   setting event (see the docstring on event_type below).
+    //   A no-op fake setting event must have `Incremental` set to
+    //   `true`.
+    // - Meanwhile, older version tenant clients also assert that the
+    //   very first message sent by the server must have `Incremental`
+    //   set to `false`.
+    //
+    //   This means we can only send events of other types after the
+    //   first setting overrides event.
+    //
+    // - Then, separately, newer version tenant clients also
+    //   synchronize their startup on the reception of a tenant
+    //   setting event for each precedence level. This is because
+    //   these tenant clients must, in turn, remain compatible with
+    //   older version *KV servers* that do not send other event types
+    //   and send just 2 setting overrides events initially.
+    //
+    //   This means we cannot send other event types after the second
+    //   setting overrides event.
+
+    // The event is about an update to the tenant's metadata.
+    METADATA_EVENT = 1;
+  }
+
+  // The type of event. For backward-compatibility with early 23.1
+  // servers that do not check the event_type field, servers of any
+  // 23.2 version must ensure that all events of types other than
+  // SETTING_EVENT appear as a no-op event when interpreted as a
+  // setting event. This means: 1) set precedence to any value; 2) set
+  // incremental to true; 3) provide a nil slice in overrides.
+  // This constraint can be lifted once all servers are at least 23.2.
+  EventType event_type = 5;
+
+  // If non-empty, the other fields will be empty and this will be the final event
+  // sent on the stream before it is terminated.
+  errorspb.EncodedError error = 4 [(gogoproto.nullable) = false];
+
+  //
+  // Fields that pertain to cluster setting updates.
+  //
+
   enum Precedence {
     // Sentinel value to ensure that the 0 value is not a valid precedence.
     INVALID = 0;
@@ -3160,9 +3223,30 @@ message TenantSettingsEvent {
   //    fields).
   repeated TenantSetting overrides = 3 [(gogoproto.nullable) = false];
 
-  // If non-nil, the other fields will be empty and this will be the final event
-  // sent on the stream before it is terminated.
-  errorspb.EncodedError error = 4 [(gogoproto.nullable) = false];
+  //
+  // Fields that pertain to other tenant metadata updates.
+  //
+
+  // Name is the tenant's current name.
+  string name = 6 [(gogoproto.casttype) = "github.com/cockroachdb/cockroach/pkg/roachpb.TenantName"];
+
+  // Capabilities is the tenant's current capabilities.
+  // Note that this field is advisory: the server may know of a more
+  // recent (and different) set of capabilities, and server-side
+  // capabilities checks always prevail.
+  cockroach.multitenant.tenantcapabilitiespb.TenantCapabilities capabilities = 7;
+
+  // DataState is the tenant's current data state.
+  // TODO(knz): This should really be cast to Go type mtinfopb.TenantDataState but we
+  // can't do that yet due to a dependency cycle. We should break the cycle.
+  uint32 data_state = 8;
+
+  // ServiceMode is the tenant's current service mode.
+  // TODO(knz): This should really be cast to Go type mtinfopb.TenantServiceMode but we
+  // can't do that yet due to a dependency cycle. We should break the cycle.
+  uint32 service_mode = 9;
+
+  // NEXT ID: 10
 }
 
 // TenantSetting contains the name and value of a tenant setting.
 
@@ -25,6 +25,7 @@ import (
 	"github.com/cockroachdb/errors"
 	_ "github.com/cockroachdb/errors/extgrpc" // register EncodeError support for gRPC Status
 	"github.com/cockroachdb/redact"
+	"github.com/gogo/protobuf/proto"
 )
 
 // Printer is an interface that lets us use what's common between the
@@ -1536,6 +1537,25 @@ func NewNotLeaseHolderErrorWithSpeculativeLease(
 	return NewNotLeaseHolderError(speculativeLease, proposerStoreID, rangeDesc, msg)
 }
 
+// MissingRecordError is reported when a record is missing.
+type MissingRecordError struct{}
+
+func (e *MissingRecordError) Error() string {
+	return redact.Sprint(e).StripMarkers()
+}
+
+func (e *MissingRecordError) SafeFormatError(p errors.Printer) (next error) {
+	p.Printf("missing record")
+	return nil
+}
+
+func init() {
+	errors.RegisterLeafDecoder(errors.GetTypeKey((*MissingRecordError)(nil)), func(_ context.Context, _ string, _ []string, _ proto.Message) error {
+		return &MissingRecordError{}
+	})
+}
+
+var _ errors.SafeFormatter = &MissingRecordError{}
 var _ errors.SafeFormatter = &NotLeaseHolderError{}
 var _ errors.SafeFormatter = &RangeNotFoundError{}
 var _ errors.SafeFormatter = &RangeKeyMismatchError{}