kvcoord: fix txnWriteBuffer for batches with limits and Dels

yuzefovich · yuzefovich · commit 174d42fb8ad4 · 2025-08-07T19:53:48.000Z
Previously, the txnWriteBuffer was oblivious to the fact that some
transformed requests might be returned incomplete due to limits set on
the BatchRequest (either TargetBytes or MaxSpanRequestKeys), so it would
incorrectly think that it has acquired locks on some keys when it
hasn't. Usage from SQL was only exposed to the bug via the special
delete-range fast-path where we used point Dels (i.e. a stmt of the form
`DELETE FROM t WHERE k IN (<ids>)` where there are gaps between `id`s)
since it always sets a key limit of 600. This commit fixes this
particular issue for Dels transformed into Gets and adds a couple of
assertions that we don't see batches with CPuts and/or Puts with the
limits set.

Additionally, it adjusts the comment to indicate which requests are
allowed in batches with limits.

Given that this feature is disabled by default and in private preview,
and that the bug is limited to the DELETE fast-path when more than 600
keys are deleted, I decided to omit the release note.

Release note: None
diff --git a/pkg/kv/kvclient/kvcoord/dist_sender.go b/pkg/kv/kvclient/kvcoord/dist_sender.go
@@ -1070,9 +1070,8 @@ func (ds *DistSender) initAndVerifyBatch(ctx context.Context, ba *kvpb.BatchRequ
 	}
 
 	if ba.MaxSpanRequestKeys != 0 || ba.TargetBytes != 0 {
-		// Verify that the batch contains only specific range requests or the
-		// EndTxnRequest. Verify that a batch with a ReverseScan only contains
-		// ReverseScan range requests.
+		// Verify that the batch contains only specific requests. Verify that a
+		// batch with a ReverseScan only contains ReverseScan range requests.
 		var foundForward, foundReverse bool
 		for _, req := range ba.Requests {
 			inner := req.GetInner()
diff --git a/pkg/kv/kvclient/kvcoord/txn_interceptor_write_buffer.go b/pkg/kv/kvclient/kvcoord/txn_interceptor_write_buffer.go
@@ -326,7 +326,15 @@ func (twb *txnWriteBuffer) validateRequests(ba *kvpb.BatchRequest) error {
 			if t.OriginTimestamp.IsSet() {
 				return unsupportedOptionError(t.Method(), "OriginTimestamp")
 			}
+			assertTrue(ba.MaxSpanRequestKeys == 0 && ba.TargetBytes == 0, "unexpectedly found CPut in a BatchRequest with a limit")
 		case *kvpb.PutRequest:
+			// TODO(yuzefovich): the DistSender allows Puts to be in batches
+			// with limits, which can happen when we're forced to flush the
+			// buffered Puts, and the batch we piggy-back on has a limit set.
+			// However, SQL never constructs such a batch on its own, so we're
+			// asserting the expectations from SQL. Figure out how to reconcile
+			// this with more permissive DistSender-level checks.
+			assertTrue(ba.MaxSpanRequestKeys == 0 && ba.TargetBytes == 0, "unexpectedly found Put in a BatchRequest with a limit")
 		case *kvpb.DeleteRequest:
 		case *kvpb.GetRequest:
 			// ReturnRawMVCCValues is unsupported because we don't know how to serve
@@ -1058,6 +1066,11 @@ func (rr requestRecord) toResp(
 			// We only use the response from KV if there wasn't already a
 			// buffered value for this key that our transaction wrote
 			// previously.
+			// TODO(yuzefovich): for completeness, we should check whether
+			// ResumeSpan is non-nil, in which case the response from KV is
+			// incomplete. This can happen when MaxSpanRequestKeys and/or
+			// TargetBytes limits are set on the batch, and SQL currently
+			// doesn't do that for batches with CPuts.
 			val = br.GetInner().(*kvpb.GetResponse).Value
 		}
 
@@ -1078,17 +1091,22 @@ func (rr requestRecord) toResp(
 		twb.addToBuffer(req.Key, req.Value, req.Sequence, req.KVNemesisSeq)
 
 	case *kvpb.PutRequest:
+		// TODO(yuzefovich): for completeness, we should check whether
+		// ResumeSpan is non-nil if we transformed the request, in which case
+		// the response from KV is incomplete. This can happen when
+		// MaxSpanRequestKeys and/or TargetBytes limits are set on the batch,
+		// and SQL currently doesn't do that for batches with Puts.
 		ru.MustSetInner(&kvpb.PutResponse{})
 		twb.addToBuffer(req.Key, req.Value, req.Sequence, req.KVNemesisSeq)
 
 	case *kvpb.DeleteRequest:
 		// To correctly populate FoundKey in the response, we must prefer any
 		// buffered values (if they exist).
-		var foundKey bool
+		var resp kvpb.DeleteResponse
 		val, served := twb.maybeServeRead(req.Key, req.Sequence)
 		if served {
 			log.VEventf(ctx, 2, "serving read portion of %s on key %s from the buffer", req.Method(), req.Key)
-			foundKey = val.IsPresent()
+			resp.FoundKey = val.IsPresent()
 		} else if req.MustAcquireExclusiveLock {
 			// We sent a GetRequest to the KV layer to acquire an exclusive lock
 			// on the key, regardless of whether the key already exists or not.
@@ -1097,7 +1115,8 @@ func (rr requestRecord) toResp(
 			if log.ExpensiveLogEnabled(ctx, 2) {
 				log.Eventf(ctx, "synthesizing DeleteResponse from GetResponse: %#v", getResp)
 			}
-			foundKey = getResp.Value.IsPresent()
+			resp.FoundKey = getResp.Value.IsPresent()
+			resp.ResumeSpan = getResp.ResumeSpan
 		} else {
 			// NB: If MustAcquireExclusiveLock wasn't set by the client then we
 			// eschew sending a Get request to the KV layer just to populate
@@ -1109,16 +1128,26 @@ func (rr requestRecord) toResp(
 			// TODO(arul): improve the FoundKey semantics to have callers opt
 			// into whether the care about the key being found. Alternatively,
 			// clarify the behaviour on DeleteRequest.
-			foundKey = false
+			resp.FoundKey = false
 		}
-		ru.MustSetInner(&kvpb.DeleteResponse{
-			FoundKey: foundKey,
-		})
+
+		ru.MustSetInner(&resp)
+		if resp.ResumeSpan != nil {
+			// When the Get was incomplete, we haven't actually processed this
+			// Del, so we cannot buffer the write.
+			break
+		}
+
 		twb.addToBuffer(req.Key, roachpb.Value{}, req.Sequence, req.KVNemesisSeq)
 
 	case *kvpb.GetRequest:
 		val, served := twb.maybeServeRead(req.Key, req.Sequence)
 		if served {
+			// TODO(yuzefovich): we're effectively ignoring the limits of
+			// BatchRequest when serving the Get from the buffer. We should
+			// consider setting the ResumeSpan if a limit has already been
+			// reached by this point. This will force us to set ResumeSpan on
+			// all remaining requests in the batch.
 			getResp := &kvpb.GetResponse{}
 			if val.IsPresent() {
 				getResp.Value = val
@@ -1612,8 +1641,6 @@ func (s *respIter) startKey() roachpb.Key {
 	// For ReverseScans, the EndKey of the ResumeSpan is updated to indicate the
 	// start key for the "next" page, which is exactly the last key that was
 	// reverse-scanned for the current response.
-	// TODO(yuzefovich): we should have some unit tests that exercise the
-	// ResumeSpan case.
 	if s.resumeSpan != nil {
 		return s.resumeSpan.EndKey
 	}
@@ -1684,6 +1711,11 @@ func makeRespSizeHelper(it *respIter) respSizeHelper {
 }
 
 func (h *respSizeHelper) acceptBuffer(key roachpb.Key, value *roachpb.Value) {
+	// TODO(yuzefovich): we're effectively ignoring the limits of BatchRequest
+	// when serving the reads from the buffer. We should consider checking how
+	// many keys and bytes have already been included to see whether we've
+	// reached a limit, and set the ResumeSpan if so (which can result in some
+	// wasted work by the server).
 	h.numKeys++
 	lenKV, _ := encKVLength(key, value)
 	h.numBytes += int64(lenKV)
diff --git a/pkg/kv/kvclient/kvcoord/txn_interceptor_write_buffer_test.go b/pkg/kv/kvclient/kvcoord/txn_interceptor_write_buffer_test.go
@@ -1310,6 +1310,75 @@ func TestTxnWriteBufferRespectsMustAcquireExclusiveLock(t *testing.T) {
 	require.IsType(t, &kvpb.EndTxnResponse{}, br.Responses[0].GetInner())
 }
 
+// TestTxnWriteBufferResumeSpans verifies that the txnWriteBuffer behaves
+// correctly in the presence of BatchRequest's limits that result in non-nil
+// ResumeSpans.
+// TODO(yuzefovich): extend the test for Scans and ReverseScans.
+func TestTxnWriteBufferResumeSpans(t *testing.T) {
+	defer leaktest.AfterTest(t)()
+	defer log.Scope(t).Close(t)
+	ctx := context.Background()
+	twb, mockSender := makeMockTxnWriteBuffer(cluster.MakeClusterSettings())
+
+	txn := makeTxnProto()
+	txn.Sequence = 1
+	keyA, keyB, keyC := roachpb.Key("a"), roachpb.Key("b"), roachpb.Key("c")
+
+	// Delete 3 keys while setting MaxSpanRequestKeys to 2 (only the first two
+	// Dels should be processed).
+	ba := &kvpb.BatchRequest{}
+	ba.Header = kvpb.Header{Txn: &txn, MaxSpanRequestKeys: 2}
+	for _, k := range []roachpb.Key{keyA, keyB, keyC} {
+		del := delArgs(k, txn.Sequence)
+		// Set MustAcquireExclusiveLock so that Del is transformed into Get.
+		del.MustAcquireExclusiveLock = true
+		ba.Add(del)
+	}
+
+	// Simulate a scenario where each transformed Get finds something and the
+	// limit is reached after the second Get.
+	mockSender.MockSend(func(ba *kvpb.BatchRequest) (*kvpb.BatchResponse, *kvpb.Error) {
+		require.Equal(t, int64(2), ba.MaxSpanRequestKeys)
+		require.Len(t, ba.Requests, 3)
+		require.IsType(t, &kvpb.GetRequest{}, ba.Requests[0].GetInner())
+		require.IsType(t, &kvpb.GetRequest{}, ba.Requests[1].GetInner())
+		require.IsType(t, &kvpb.GetRequest{}, ba.Requests[2].GetInner())
+
+		br := ba.CreateReply()
+		br.Txn = ba.Txn
+		br.Responses = []kvpb.ResponseUnion{
+			{Value: &kvpb.ResponseUnion_Get{
+				Get: &kvpb.GetResponse{Value: &roachpb.Value{RawBytes: []byte("a")}},
+			}},
+			{Value: &kvpb.ResponseUnion_Get{
+				Get: &kvpb.GetResponse{Value: &roachpb.Value{RawBytes: []byte("b")}},
+			}},
+			{Value: &kvpb.ResponseUnion_Get{
+				Get: &kvpb.GetResponse{ResponseHeader: kvpb.ResponseHeader{
+					ResumeSpan: &roachpb.Span{Key: keyC},
+				}},
+			}},
+		}
+		return br, nil
+	})
+
+	br, pErr := twb.SendLocked(ctx, ba)
+	require.Nil(t, pErr)
+	require.NotNil(t, br)
+
+	// Even though the txnWriteBuffer did not send any Del requests to the KV
+	// layer above, the responses should still be populated.
+	require.Len(t, br.Responses, 3)
+	require.Equal(t, &kvpb.DeleteResponse{FoundKey: true}, br.Responses[0].GetInner())
+	require.Equal(t, &kvpb.DeleteResponse{FoundKey: true}, br.Responses[1].GetInner())
+	// The last Del wasn't processed, so we should see the ResumeSpan set in the
+	// header.
+	require.NotNil(t, br.Responses[2].GetInner().(*kvpb.DeleteResponse).ResumeSpan)
+
+	// Verify that only two writes are buffered.
+	require.Equal(t, 2, len(twb.testingBufferedWritesAsSlice()))
+}
+
 // TestTxnWriteBufferMustSortBatchesBySequenceNumber verifies that flushed
 // batches are sorted in sequence number order, as currently required by the txn
 // pipeliner interceptor.
diff --git a/pkg/kv/kvpb/api.proto b/pkg/kv/kvpb/api.proto
@@ -2808,11 +2808,16 @@ message Header {
   // - RevertRangeRequest
   // - ResolveIntentRangeRequest
   // - QueryLocksRequest
+  // - IsSpanEmptyRequest
   //
-  // The following two requests types are also allowed in the batch, although
-  // the limit has no effect on them:
+  // The following request types are also allowed in the batch, although the
+  // limit has no effect on them:
+  // - ExportRequest
   // - QueryIntentRequest
   // - EndTxnRequest
+  // - ResolveIntentRequest
+  // - DeleteRequest
+  // - PutRequest
   //
   // [*] DeleteRangeRequests are generally not allowed to be batched together
   // with a commit (i.e. 1PC), except if Require1PC is also set. See #37457.