cockroachdb
diff --git a/‎pkg/ccl/kvccl/kvfollowerreadsccl/BUILD.bazel‎
Lines changed: 1 addition & 0 deletions b/‎pkg/ccl/kvccl/kvfollowerreadsccl/BUILD.bazel‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎pkg/ccl/kvccl/kvfollowerreadsccl/followerreads_test.go‎
Lines changed: 33 additions & 9 deletions b/‎pkg/ccl/kvccl/kvfollowerreadsccl/followerreads_test.go‎
Lines changed: 33 additions & 9 deletions
diff --git a/‎pkg/kv/kvclient/kvcoord/dist_sender.go‎
Lines changed: 32 additions & 10 deletions b/‎pkg/kv/kvclient/kvcoord/dist_sender.go‎
Lines changed: 32 additions & 10 deletions
diff --git a/‎pkg/kv/kvclient/kvcoord/dist_sender_test.go‎
Lines changed: 82 additions & 27 deletions b/‎pkg/kv/kvclient/kvcoord/dist_sender_test.go‎
Lines changed: 82 additions & 27 deletions
diff --git a/‎pkg/sql/colcontainer/diskqueue.go‎
Lines changed: 6 additions & 0 deletions b/‎pkg/sql/colcontainer/diskqueue.go‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎pkg/sql/inspect/BUILD.bazel‎
Lines changed: 4 additions & 1 deletion b/‎pkg/sql/inspect/BUILD.bazel‎
Lines changed: 4 additions & 1 deletion
@@ -60,6 +60,7 @@ go_test(
         "//pkg/kv/kvserver/concurrency/lock",
         "//pkg/kv/kvserver/kvserverbase",
         "//pkg/kv/kvtestutils",
+        "//pkg/multitenant/tenantcapabilitiespb",
         "//pkg/roachpb",
         "//pkg/rpc",
         "//pkg/rpc/rpcbase",
 
@@ -27,6 +27,7 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/concurrency/lock"
 	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverbase"
 	"github.com/cockroachdb/cockroach/pkg/kv/kvtestutils"
+	"github.com/cockroachdb/cockroach/pkg/multitenant/tenantcapabilitiespb"
 	"github.com/cockroachdb/cockroach/pkg/roachpb"
 	"github.com/cockroachdb/cockroach/pkg/rpc"
 	"github.com/cockroachdb/cockroach/pkg/rpc/rpcbase"
@@ -804,9 +805,8 @@ func TestFollowerReadsWithStaleDescriptor(t *testing.T) {
 		base.TestClusterArgs{
 			ReplicationMode: base.ReplicationManual,
 			ServerArgs: base.TestServerArgs{
-				Settings:          settings,
-				DefaultTestTenant: base.TODOTestTenantDisabled,
-				UseDatabase:       "t",
+				Settings:    settings,
+				UseDatabase: "t",
 			},
 			// n4 pretends to have low latency to n2 and n3, so that it tries to use
 			// them for follower reads.
@@ -847,15 +847,24 @@ func TestFollowerReadsWithStaleDescriptor(t *testing.T) {
 	// load based rebalancing to make sure it doesn't move.
 	kvserverbase.LoadBasedRebalancingMode.Override(ctx, &settings.SV, kvserverbase.LBRebalancingOff)
 
+	// NB: Tenants need capabilities to be able to relocate ranges.
+	if !tc.Server(0).DeploymentMode().IsSingleTenant() {
+		require.NoError(t, tc.Server(0).GrantTenantCapabilities(
+			context.Background(), serverutils.TestTenantID(),
+			map[tenantcapabilitiespb.ID]string{
+				tenantcapabilitiespb.CanAdminRelocateRange: "true",
+			}))
+	}
+
 	n1 := sqlutils.MakeSQLRunner(tc.Conns[0])
 	n1.Exec(t, `CREATE DATABASE t`)
 	n1.Exec(t, `CREATE TABLE test (k INT PRIMARY KEY)`)
 	n1.Exec(t, `ALTER TABLE test EXPERIMENTAL_RELOCATE VOTERS VALUES (ARRAY[1,2], 1)`)
 	// Speed up closing of timestamps, in order to sleep less below before we can
 	// use follower_read_timestamp(). follower_read_timestamp() uses the sum of
 	// the following settings.
-	n1.Exec(t, `SET CLUSTER SETTING kv.closed_timestamp.target_duration = '0.1s'`)
-	n1.Exec(t, `SET CLUSTER SETTING kv.closed_timestamp.side_transport_interval = '0.1s'`)
+	closedts.TargetDuration.Override(ctx, &settings.SV, 100*time.Millisecond)
+	closedts.SideTransportCloseInterval.Override(ctx, &settings.SV, 100*time.Millisecond)
 
 	// Sleep so that we can perform follower reads. The read timestamp needs to be
 	// above the timestamp when the table was created.
@@ -865,11 +874,16 @@ func TestFollowerReadsWithStaleDescriptor(t *testing.T) {
 
 	// Run a query on n4 to populate its cache.
 	n4 := sqlutils.MakeSQLRunner(tc.Conns[3])
-	n4.Exec(t, "SELECT * from test WHERE k=1")
 	// Check that the cache was indeed populated.
 	var tableID uint32
 	n1.QueryRow(t, `SELECT id from system.namespace WHERE name='test'`).Scan(&tableID)
-	tablePrefix := keys.MustAddr(keys.SystemSQLCodec.TablePrefix(tableID))
+	tablePrefix := keys.MustAddr(tc.Server(0).Codec().TablePrefix(tableID))
+	// NB: Splitting a range helps prevent tenants from getting an out of band
+	// RangeDescriptor update to the RangeCache. Unclear about the exact mechanics
+	// at play here, but they don't matter for our test.
+	_, _, err := tc.SplitRange(roachpb.Key(tablePrefix))
+	require.NoError(t, err)
+	n4.Exec(t, "SELECT * from test WHERE k=1")
 	n4Cache := tc.Server(3).DistSenderI().(*kvcoord.DistSender).RangeDescriptorCache()
 	entry, err := n4Cache.TestingGetCached(ctx, tablePrefix, false, roachpb.LAG_BY_CLUSTER_SETTING)
 	require.NoError(t, err)
@@ -964,8 +978,18 @@ func TestFollowerReadsWithStaleDescriptor(t *testing.T) {
 
 	// Sanity check that the plan was distributed.
 	require.True(t, strings.Contains(rec.String(), "creating DistSQL plan with isLocal=false"))
-	// Look at the trace and check that we've served a follower read.
-	require.True(t, kvtestutils.OnlyFollowerReads(rec), "query was not served through follower reads: %s", rec)
+	// NB: We're distributing the plan here, and it (the DistSender) is running on
+	// n3. Note that we've only injected latencies on n4, so any replica is fair
+	// game for n3. When run in normal or shared-process multi-tenancy
+	// deployments, n3 will route to its local replica (which is a follower), as
+	// that replica always sorts first. However, for external process multi-tenancy,
+	// there's no such concept of a local replica. Requests can therefore be
+	// routed to either n1 or n3, so we can't make any assertions about whether
+	// there'll be a follower read or not.
+	if !tc.Server(0).DeploymentMode().IsExternal() {
+		// Look at the trace and check that we've served a follower read.
+		require.True(t, kvtestutils.OnlyFollowerReads(rec), "query was not served through follower reads: %s", rec)
+	}
 	// Verify that we didn't produce the "misplanned ranges" metadata that would
 	// purge the non-stale entries from the range cache on n4.
 	require.False(t, strings.Contains(rec.String(), "clearing entries overlapping"))
 
@@ -419,6 +419,20 @@ var ProxyBatchRequest = settings.RegisterBoolSetting(
 	true,
 )
 
+// NonTransactionalWritesNotIdempotent controls whether non-transactional writes
+// are considered idempotent or not. When this setting is true, a
+// non-transactional write that experiences an RPC error is not retried, and
+// returns an ambiguous error. This is the same behavior as commit batches (or
+// batches issued concurrently with a commit batch). This is arguably the
+// correct behavior for non-transactional writes, but it's behind a default-off
+// cluster setting to get some kvnemesis mileage first.
+var NonTransactionalWritesNotIdempotent = settings.RegisterBoolSetting(
+	settings.ApplicationLevel,
+	"kv.dist_sender.non_transactional_writes_not_idempotent.enabled",
+	"when true, non-transactional writes are not retried and may return an ambiguous error",
+	false,
+)
+
 // DistSenderMetrics is the set of metrics for a given distributed sender.
 type DistSenderMetrics struct {
 	BatchCount                         *metric.Counter
@@ -2541,7 +2555,13 @@ const slowDistSenderReplicaThreshold = 10 * time.Second
 func (ds *DistSender) sendToReplicas(
 	ctx context.Context, ba *kvpb.BatchRequest, routing rangecache.EvictionToken, withCommit bool,
 ) (*kvpb.BatchResponse, error) {
-
+	// In addition to batches where withCommit is true, non-transactional write
+	// batches are also not safe to be retried as they are not guaranteed to be
+	// idempotent. Returning ambiguous errors for those batches is controlled by a
+	// cluster setting for now.
+	nonIdempotentWrite :=
+		ba.Txn == nil && ba.IsWrite() && NonTransactionalWritesNotIdempotent.Get(&ds.st.SV)
+	nonIdempotent := withCommit || nonIdempotentWrite
 	// If this request can be sent to a follower to perform a consistent follower
 	// read under the closed timestamp, promote its routing policy to NEAREST.
 	// If we don't know the closed timestamp policy, we ought to optimistically
@@ -2759,7 +2779,9 @@ func (ds *DistSender) sendToReplicas(
 		}
 
 		tBegin := crtime.NowMono() // for slow log message
-		sendCtx, cbToken, cbErr := ds.circuitBreakers.ForReplica(desc, &curReplica).Track(ctx, ba, withCommit, tBegin)
+		sendCtx, cbToken, cbErr := ds.circuitBreakers.ForReplica(desc, &curReplica).Track(
+			ctx, ba, nonIdempotent, tBegin,
+		)
 		if cbErr != nil {
 			// Circuit breaker is tripped. err will be handled below.
 			err = cbErr
@@ -2861,12 +2883,12 @@ func (ds *DistSender) sendToReplicas(
 			// prevents them from double evaluation. This can result in, for example,
 			// an increment applying twice, or more subtle problems like a blind write
 			// evaluating twice, overwriting another unrelated write that fell
-			// in-between.
+			// in-between. This is fixed under the cluster setting
+			// NonTransactionalWritesNotIdempotent. Consider enabling it by default.
 			//
-			// NB: If this partial batch does not contain the EndTxn request but the
-			// batch contains a commit, the ambiguous error should be caught on
-			// retrying the writes, should it need to be propagated.
-			if withCommit && !grpcutil.RequestDidNotStart(err) {
+			// NB: If this partial batch is not idempotent, the ambiguous error should
+			// be caught on retrying the writes, should it need to be propagated.
+			if nonIdempotent && !grpcutil.RequestDidNotStart(err) {
 				ambiguousError = err
 			}
 			// If we get a gRPC error against the leaseholder, we don't want to
@@ -2987,9 +3009,9 @@ func (ds *DistSender) sendToReplicas(
 					// return it if all other replicas fail (regardless of error).
 					replicaUnavailableError = br.Error.GoError()
 				}
-				// The circuit breaker may have tripped while a commit proposal was in
-				// flight, so we have to mark it as ambiguous as well.
-				if withCommit && ambiguousError == nil {
+				// The circuit breaker may have tripped while a non-idempotent request
+				// was in flight, so we have to mark it as ambiguous as well.
+				if nonIdempotent && ambiguousError == nil {
 					ambiguousError = br.Error.GoError()
 				}
 			case *kvpb.NotLeaseHolderError:
 
@@ -3664,6 +3664,7 @@ func TestReplicaErrorsMerged(t *testing.T) {
 
 	notLeaseHolderErr := kvpb.NewError(kvpb.NewNotLeaseHolderError(lease3, 0, &descriptor2, ""))
 	startedRequestError := errors.New("request might have started")
+	notStartedRequestError := grpcstatus.Errorf(codes.PermissionDenied, "request did not start")
 	unavailableError1 := kvpb.NewError(kvpb.NewReplicaUnavailableError(errors.New("unavailable"), &initDescriptor, initDescriptor.InternalReplicas[0]))
 	unavailableError2 := kvpb.NewError(kvpb.NewReplicaUnavailableError(errors.New("unavailable"), &initDescriptor, initDescriptor.InternalReplicas[1]))
 
@@ -3675,52 +3676,101 @@ func TestReplicaErrorsMerged(t *testing.T) {
 	// See https://cockroachlabs.com/blog/demonic-nondeterminism/#appendix for
 	// the gory details.
 	testCases := []struct {
+		transactional      bool
 		withCommit         bool
 		sendErr1, sendErr2 error
 		err1, err2         *kvpb.Error
 		expErr             string
 	}{
 		// The ambiguous error is returned with higher priority for withCommit.
 		{
-			withCommit: true,
-			sendErr1:   startedRequestError,
-			err2:       notLeaseHolderErr,
-			expErr:     "result is ambiguous",
+			transactional: true,
+			withCommit:    true,
+			sendErr1:      startedRequestError,
+			err2:          notLeaseHolderErr,
+			expErr:        "result is ambiguous",
 		},
 		// The not-leaseholder error is the last error.
 		{
-			withCommit: false,
-			sendErr1:   startedRequestError,
-			err2:       notLeaseHolderErr,
-			expErr:     "leaseholder not found in transport",
+			transactional: true,
+			withCommit:    false,
+			sendErr1:      startedRequestError,
+			err2:          notLeaseHolderErr,
+			expErr:        "leaseholder not found in transport",
 		},
 		// The ambiguous error is returned with higher priority for withCommit.
 		{
-			withCommit: true,
-			sendErr1:   startedRequestError,
-			err2:       unavailableError2,
-			expErr:     "result is ambiguous",
+			transactional: true,
+			withCommit:    true,
+			sendErr1:      startedRequestError,
+			err2:          unavailableError2,
+			expErr:        "result is ambiguous",
 		},
 		// The unavailable error is the last error.
 		{
-			withCommit: false,
-			sendErr1:   startedRequestError,
-			err2:       unavailableError2,
-			expErr:     "unavailable",
+			transactional: true,
+			withCommit:    false,
+			sendErr1:      startedRequestError,
+			err2:          unavailableError2,
+			expErr:        "unavailable",
+		},
+		// The ambiguous error is returned with higher priority for
+		// non-transactional batches (next 2 test cases). This is the case only
+		// because the test sets NonTransactionalWritesNotIdempotent = true.
+		// Otherwise, the non-transactional requests would be treated like they are
+		// idempotent and the NLHE/RUE would be returned as the last error.
+		{
+			transactional: false,
+			withCommit:    false,
+			sendErr1:      startedRequestError,
+			err2:          notLeaseHolderErr,
+			expErr:        "result is ambiguous",
 		},
-		// The unavailable error is returned with higher priority regardless of withCommit.
 		{
-			withCommit: true,
-			err1:       unavailableError1,
-			err2:       notLeaseHolderErr,
-			expErr:     "unavailable",
+			transactional: false,
+			withCommit:    false,
+			sendErr1:      startedRequestError,
+			err2:          unavailableError2,
+			expErr:        "result is ambiguous",
+		},
+		// If we know the request did not start, do not return an ambiguous error
+		// (next 2 test cases).
+		{
+			transactional: false,
+			withCommit:    false,
+			sendErr1:      notStartedRequestError,
+			err2:          notLeaseHolderErr,
+			expErr:        "leaseholder not found in transport",
+		},
+		{
+			transactional: false,
+			withCommit:    false,
+			sendErr1:      notStartedRequestError,
+			err2:          unavailableError2,
+			expErr:        "unavailable",
+		},
+		// The unavailable error is returned with higher priority regardless of
+		// withCommit and transactional (next 3 test cases).
+		{
+			transactional: true,
+			withCommit:    true,
+			err1:          unavailableError1,
+			err2:          notLeaseHolderErr,
+			expErr:        "unavailable",
 		},
-		// The unavailable error is returned with higher priority regardless of withCommit.
 		{
-			withCommit: false,
-			err1:       unavailableError1,
-			err2:       notLeaseHolderErr,
-			expErr:     "unavailable",
+			transactional: true,
+			withCommit:    false,
+			err1:          unavailableError1,
+			err2:          notLeaseHolderErr,
+			expErr:        "unavailable",
+		},
+		{
+			transactional: false,
+			withCommit:    false,
+			err1:          unavailableError1,
+			err2:          notLeaseHolderErr,
+			expErr:        "unavailable",
 		},
 	}
 	clock := hlc.NewClockForTesting(nil)
@@ -3744,6 +3794,7 @@ func TestReplicaErrorsMerged(t *testing.T) {
 				stopper := stop.NewStopper()
 				defer stopper.Stop(ctx)
 				st := cluster.MakeTestingClusterSettings()
+				NonTransactionalWritesNotIdempotent.Override(ctx, &st.SV, true)
 				rc := rangecache.NewRangeCache(st, nil /* db */, func() int64 { return 100 }, stopper)
 				rc.Insert(ctx, roachpb.RangeInfo{
 					Desc:  initDescriptor,
@@ -3786,12 +3837,16 @@ func TestReplicaErrorsMerged(t *testing.T) {
 						return nil, nil, errors.New("range desc db unexpectedly used")
 					}),
 					TransportFactory: adaptSimpleTransport(transportFn),
-					Settings:         cluster.MakeTestingClusterSettings(),
+					Settings:         st,
 				}
 				ds := NewDistSender(cfg)
 
 				ba := &kvpb.BatchRequest{}
 				ba.Add(kvpb.NewGet(roachpb.Key("a")))
+				ba.Add(kvpb.NewPut(roachpb.Key("b"), roachpb.MakeValueFromString("value")))
+				if tc.transactional {
+					ba.Txn = &roachpb.Transaction{Name: "test"}
+				}
 				tok, err := rc.LookupWithEvictionToken(ctx, roachpb.RKeyMin, rangecache.EvictionToken{}, false)
 				require.NoError(t, err)
 				br, err := ds.sendToReplicas(ctx, ba, tok, tc.withCommit)
 
@@ -597,6 +597,12 @@ func (d *diskQueue) writeFooterAndFlush(ctx context.Context) (err error) {
 			d.serializer = nil
 		}
 	}()
+	if d.writer != nil {
+		// The context that we captured when we created the diskQueueWriter
+		// might have a tracing span that has already been finished. To go
+		// around this, we capture the fresh context.
+		d.writer.ctx = ctx
+	}
 	if err := d.serializer.Finish(); err != nil {
 		return err
 	}
 
@@ -18,6 +18,7 @@ go_library(
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/jobs",
+        "//pkg/jobs/jobfrontier",
         "//pkg/jobs/jobspb",
         "//pkg/keys",
         "//pkg/kv/kvserver/protectedts/ptpb",
@@ -47,7 +48,8 @@ go_library(
         "//pkg/util/ctxgroup",
         "//pkg/util/hlc",
         "//pkg/util/log",
-        "//pkg/util/stop",
+        "//pkg/util/protoutil",
+        "//pkg/util/span",
         "//pkg/util/syncutil",
         "//pkg/util/timeutil",
         "//pkg/util/tracing",
@@ -77,6 +79,7 @@ go_test(
     deps = [
         "//pkg/base",
         "//pkg/jobs",
+        "//pkg/jobs/jobfrontier",
         "//pkg/jobs/jobspb",
         "//pkg/keys",
         "//pkg/kv",
Original file line number	Diff line number	Diff line change
`@@ -597,6 +597,12 @@ func (d *diskQueue) writeFooterAndFlush(ctx context.Context) (err error) {`
`597`	`597`	`d.serializer = nil`
`598`	`598`	`}`
`599`	`599`	`}()`
	`600`	`+ if d.writer != nil {`
	`601`	`+ // The context that we captured when we created the diskQueueWriter`
	`602`	`+ // might have a tracing span that has already been finished. To go`
	`603`	`+ // around this, we capture the fresh context.`
	`604`	`+ d.writer.ctx = ctx`
	`605`	`+ }`
`600`	`606`	`if err := d.serializer.Finish(); err != nil {`
`601`	`607`	`return err`
`602`	`608`	`}`