
Commit 202d7ad

kvclient: flush the write buffer if it gets too large
This patch introduces a new cluster setting, kv.transaction.write_buffering.max_buffer_size, which dictates how large a transaction's write buffer can get before we flush all buffered writes to KV. It defaults to 4MB for now. Once a transaction's buffer has been flushed, subsequent writes are no longer buffered on the client; instead, the transaction writes intents, as it would have in a pre-buffered-writes world.

I briefly considered other schemes that don't disable buffered writes entirely once a transaction goes over budget -- either flushing the buffer only partially, or flushing it in its entirety while continuing to buffer subsequent writes as long as the transaction has budget. I decided against both: many of the benefits of buffered writes (e.g. 1PC) are no longer possible after the first flush, and other benefits (e.g. batching, cheaper read-your-own-writes) don't generalize either. For now, we do the simple thing.

Resolves #139056

Release note: None
1 parent 2733799 commit 202d7ad
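
To make the policy above concrete, here is a small self-contained Go sketch of the decision the buffer makes per batch. This is not the kvcoord implementation -- writeBuffer, maxBufferSize, and send below are simplified stand-ins for the interceptor, the cluster setting, and SendLocked -- but it captures the shape of the logic: estimate the incoming batch's contribution, compare it against the budget, and once the budget would be exceeded, flush and stop buffering for the rest of the transaction.

// Sketch of the flush-on-overflow policy described in the commit message.
// Simplified stand-ins, not kvcoord code.
package main

import "fmt"

const maxBufferSize = 4 << 20 // mirrors the 4MB default of the new cluster setting

type writeBuffer struct {
    enabled    bool  // once false, writes go straight to KV as intents
    bufferSize int64 // bytes currently buffered on the client
}

// send decides what to do with a batch whose buffered footprint is estimated
// at estimatedBatchSize bytes.
func (b *writeBuffer) send(estimatedBatchSize int64) string {
    if !b.enabled {
        return "write intents directly (buffering already disabled)"
    }
    if b.bufferSize+estimatedBatchSize > maxBufferSize {
        // Over budget: flush everything buffered so far and disable
        // buffering for the rest of the transaction.
        b.bufferSize = 0
        b.enabled = false
        return "flush buffer, then write intents from here on"
    }
    b.bufferSize += estimatedBatchSize
    return "buffer the writes on the client"
}

func main() {
    b := &writeBuffer{enabled: true}
    fmt.Println(b.send(1 << 20)) // buffered
    fmt.Println(b.send(2 << 20)) // buffered
    fmt.Println(b.send(2 << 20)) // would exceed 4MB: flush and disable
    fmt.Println(b.send(1 << 10)) // stays disabled
}

Note how, after the first flush, every later batch takes the intent-writing path; that is the simple scheme the commit message settles on.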


5 files changed, +394 -92 lines changed

docs/generated/settings/settings-for-tenants.txt

Lines changed: 1 addition & 0 deletions
@@ -98,6 +98,7 @@ kv.transaction.max_refresh_spans_bytes integer 4194304 maximum number of bytes u
 kv.transaction.randomized_anchor_key.enabled boolean false dictates whether a transactions anchor key is randomized or not application
 kv.transaction.reject_over_max_intents_budget.enabled boolean false if set, transactions that exceed their lock tracking budget (kv.transaction.max_intents_bytes) are rejected instead of having their lock spans imprecisely compressed application
 kv.transaction.write_buffering.enabled boolean false if enabled, transactional writes are buffered on the client application
+kv.transaction.write_buffering.max_buffer_size integer 4194304 if non-zero, defines the maximum size of the buffer that will be used to buffer transactional writes per-transaction application
 kv.transaction.write_pipelining.locking_reads.enabled boolean true if enabled, transactional locking reads are pipelined through Raft consensus application
 kv.transaction.write_pipelining.ranged_writes.enabled boolean true if enabled, transactional ranged writes are pipelined through Raft consensus application
 kv.transaction.write_pipelining.enabled (alias: kv.transaction.write_pipelining_enabled) boolean true if enabled, transactional writes are pipelined through Raft consensus application

docs/generated/settings/settings.html

Lines changed: 1 addition & 0 deletions
@@ -127,6 +127,7 @@
 <tr><td><div id="setting-kv-transaction-randomized-anchor-key-enabled" class="anchored"><code>kv.transaction.randomized_anchor_key.enabled</code></div></td><td>boolean</td><td><code>false</code></td><td>dictates whether a transactions anchor key is randomized or not</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
 <tr><td><div id="setting-kv-transaction-reject-over-max-intents-budget-enabled" class="anchored"><code>kv.transaction.reject_over_max_intents_budget.enabled</code></div></td><td>boolean</td><td><code>false</code></td><td>if set, transactions that exceed their lock tracking budget (kv.transaction.max_intents_bytes) are rejected instead of having their lock spans imprecisely compressed</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
 <tr><td><div id="setting-kv-transaction-write-buffering-enabled" class="anchored"><code>kv.transaction.write_buffering.enabled</code></div></td><td>boolean</td><td><code>false</code></td><td>if enabled, transactional writes are buffered on the client</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
+<tr><td><div id="setting-kv-transaction-write-buffering-max-buffer-size" class="anchored"><code>kv.transaction.write_buffering.max_buffer_size</code></div></td><td>integer</td><td><code>4194304</code></td><td>if non-zero, defines the maximum size of the buffer that will be used to buffer transactional writes per-transaction</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
 <tr><td><div id="setting-kv-transaction-write-pipelining-locking-reads-enabled" class="anchored"><code>kv.transaction.write_pipelining.locking_reads.enabled</code></div></td><td>boolean</td><td><code>true</code></td><td>if enabled, transactional locking reads are pipelined through Raft consensus</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
 <tr><td><div id="setting-kv-transaction-write-pipelining-ranged-writes-enabled" class="anchored"><code>kv.transaction.write_pipelining.ranged_writes.enabled</code></div></td><td>boolean</td><td><code>true</code></td><td>if enabled, transactional ranged writes are pipelined through Raft consensus</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
 <tr><td><div id="setting-kv-transaction-write-pipelining-enabled" class="anchored"><code>kv.transaction.write_pipelining.enabled<br />(alias: kv.transaction.write_pipelining_enabled)</code></div></td><td>boolean</td><td><code>true</code></td><td>if enabled, transactional writes are pipelined through Raft consensus</td><td>Serverless/Dedicated/Self-Hosted</td></tr>

pkg/kv/kvclient/kvcoord/txn_coord_sender.go

Lines changed: 1 addition & 0 deletions
@@ -321,6 +321,7 @@ func (tc *TxnCoordSender) initCommonInterceptors(
   if ds, ok := tcf.wrapped.(*DistSender); ok {
     riGen.ds = ds
   }
+  tc.interceptorAlloc.txnWriteBuffer.st = tcf.st
   tc.interceptorAlloc.txnPipeliner = txnPipeliner{
     st:    tcf.st,
     riGen: riGen,

pkg/kv/kvclient/kvcoord/txn_interceptor_write_buffer.go

Lines changed: 152 additions & 19 deletions
@@ -9,11 +9,13 @@ import (
   "context"
   "encoding/binary"
   "slices"
+  "unsafe"

   "github.com/cockroachdb/cockroach/pkg/kv/kvpb"
   "github.com/cockroachdb/cockroach/pkg/kv/kvserver/concurrency/lock"
   "github.com/cockroachdb/cockroach/pkg/roachpb"
   "github.com/cockroachdb/cockroach/pkg/settings"
+  "github.com/cockroachdb/cockroach/pkg/settings/cluster"
   "github.com/cockroachdb/cockroach/pkg/storage/enginepb"
   "github.com/cockroachdb/cockroach/pkg/storage/mvccencoding"
   "github.com/cockroachdb/cockroach/pkg/storage/mvcceval"
@@ -30,6 +32,16 @@ var BufferedWritesEnabled = settings.RegisterBoolSetting(
   settings.WithPublic,
 )

+var bufferedWritesMaxBufferSize = settings.RegisterIntSetting(
+  settings.ApplicationLevel,
+  "kv.transaction.write_buffering.max_buffer_size",
+  "if non-zero, defines the maximum size of the "+
+    "buffer that will be used to buffer transactional writes per-transaction",
+  1<<22, // 4MB
+  settings.NonNegativeInt,
+  settings.WithPublic,
+)
+
 // txnWriteBuffer is a txnInterceptor that buffers transactional writes until
 // commit time. Moreover, it also decomposes read-write KV operations (e.g.
 // CPuts, InitPuts) into separate (locking) read and write operations, buffering
@@ -111,11 +123,40 @@ var BufferedWritesEnabled = settings.RegisterBoolSetting(
 // TODO(arul): In various places below, there's potential to optimize things by
 // batch allocating misc objects and pre-allocating some slices.
 type txnWriteBuffer struct {
+  st *cluster.Settings
+  // enabled indicates whether write buffering is currently enabled for the
+  // transaction or not. Write buffering may only be enabled on RootTxns, and
+  // before the transaction has sent any requests. However, a transaction that
+  // has previously buffered writes may flush its buffer and disable write
+  // buffering for subsequent requests. This can happen for a few different
+  // reasons:
+  //
+  // 1. If the buffer has exceeded its configured budget, or if the transaction
+  // issues a DeleteRange request, we flush the buffer and disable write
+  // buffering going forward. In either case, we're dealing with large writing
+  // transactions, and there isn't much benefit from write buffering.
+  // 2. If the transaction is performing a DDL operation, we flush the buffer
+  // and disable write buffering going forward out of an abundance of caution.
+  // This is opted into by SQL.
+  //
+  // As a result, we have a nice invariant: if write buffering is enabled, then
+  // all writes performed by the transaction are buffered in memory. We can
+  // never have the case where a part of the write set is buffered, and the
+  // other part is replicated.
+  //
+  // In the future, the invariant above allows us to omit checking the AbortSpan
+  // for transactions that have buffered writes enabled. The AbortSpan is used
+  // to ensure we don't violate read-your-own-write semantics for transactions
+  // that have been aborted by a conflicting transaction. As read-your-own-write
+  // semantics are upheld by the client, not the server, for transactions that
+  // use buffered writes, we can skip the AbortSpan check on the server.
   enabled bool

   buffer        btree
-  bufferSeek    bufferedWrite // re-use while seeking
   bufferIDAlloc uint64
+  bufferSize    int64
+
+  bufferSeek bufferedWrite // re-use while seeking

   wrapped lockedSender

@@ -137,7 +178,15 @@ func (twb *txnWriteBuffer) SendLocked(
       // anything.
       return twb.wrapped.SendLocked(ctx, ba)
     }
-    return twb.flushWithEndTxn(ctx, ba)
+    return twb.flushBufferAndSendBatch(ctx, ba)
+  }
+
+  // Check if buffering writes from the supplied batch will run us over
+  // budget. If it will, we shouldn't buffer writes from the current batch,
+  // and flush the buffer.
+  if twb.estimateSize(ba)+twb.bufferSize > bufferedWritesMaxBufferSize.Get(&twb.st.SV) {
+    // TODO(arul): add some metrics for this case.
+    return twb.flushBufferAndSendBatch(ctx, ba)
   }

   transformedBa, ts := twb.applyTransformations(ctx, ba)
@@ -165,6 +214,50 @@ func (twb *txnWriteBuffer) SendLocked(
   return twb.mergeResponseWithTransformations(ctx, ts, br)
 }

+// estimateSize returns a conservative estimate by which the buffer will grow in
+// size if the writes from the supplied batch request are buffered.
+func (twb *txnWriteBuffer) estimateSize(ba *kvpb.BatchRequest) int64 {
+  var scratch bufferedWrite
+  estimate := int64(0)
+  scratch.vals = make([]bufferedValue, 1)
+  for _, ru := range ba.Requests {
+    req := ru.GetInner()
+    switch t := req.(type) {
+    case *kvpb.ConditionalPutRequest:
+      // At this point, we don't know whether the condition will evaluate
+      // successfully or not, and by extension, whether the KV will be added to
+      // the buffer. We therefore assume the worst case scenario (where the KV
+      // is added to the buffer) in our estimate.
+      scratch.key = t.Key
+      scratch.vals[0] = bufferedValue{
+        val: t.Value,
+        seq: t.Sequence,
+      }
+      estimate += scratch.size()
+    case *kvpb.PutRequest:
+      // NB: when estimating, we're being conservative by assuming the Put is to
+      // a key that isn't already present in the buffer. If it were, we could
+      // omit the key's size from the estimate.
+      scratch.key = t.Key
+      scratch.vals[0] = bufferedValue{
+        val: t.Value,
+        seq: t.Sequence,
+      }
+      estimate += scratch.size()
+    case *kvpb.DeleteRequest:
+      // NB: Similar to Put, we're assuming we're deleting a key that isn't
+      // already present in the buffer.
+      scratch.key = t.Key
+      scratch.vals[0] = bufferedValue{
+        seq: t.Sequence,
+      }
+      estimate += scratch.size()
+    }
+    // No other request is buffered.
+  }
+  return estimate
+}
+
 // adjustError adjusts the provided error based on the transformations made by
 // the txnWriteBuffer to the batch request before sending it to KV.
 func (twb *txnWriteBuffer) adjustError(
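
To see what a single request contributes to the running total, here is a small self-contained sketch of the same accounting idea. The types are simplified stand-ins for kvcoord's bufferedWrite and bufferedValue (the real ones also carry an end key and an ID, and use the existing keySize helper, assumed here to be roughly the key's length in bytes): each entry is charged a fixed struct overhead obtained via unsafe.Sizeof plus the raw key and value bytes it references, mirroring the quantities estimateSize sums per Put, CPut, and Delete.

// Standalone sketch of the size accounting that estimateSize and the size()
// methods later in this diff rely on. The types here are simplified stand-ins
// for kvcoord's bufferedWrite/bufferedValue (no end key, no ID).
package main

import (
    "fmt"
    "unsafe"
)

type bufferedValue struct {
    raw []byte // stand-in for roachpb.Value.RawBytes
    seq int32  // stand-in for the write's sequence number
}

type bufferedWrite struct {
    key  []byte
    vals []bufferedValue
}

// Fixed per-entry overhead, taken from the in-memory struct size.
const (
    bufferedWriteStructOverhead = int64(unsafe.Sizeof(bufferedWrite{}))
    bufferedValueStructOverhead = int64(unsafe.Sizeof(bufferedValue{}))
)

func (bv *bufferedValue) size() int64 {
    return int64(len(bv.raw)) + bufferedValueStructOverhead
}

func (bw *bufferedWrite) size() int64 {
    size := int64(len(bw.key)) + bufferedWriteStructOverhead
    for i := range bw.vals {
        size += bw.vals[i].size()
    }
    return size
}

func main() {
    // A Put of a 10-byte key and a 100-byte value is charged the two struct
    // overheads plus 110 bytes of payload -- regardless of whether the key is
    // already in the buffer, which is what makes the estimate conservative.
    bw := bufferedWrite{
        key:  make([]byte, 10),
        vals: []bufferedValue{{raw: make([]byte, 100)}},
    }
    fmt.Println("estimated contribution:", bw.size(), "bytes")
}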
@@ -922,36 +1015,61 @@ func (twb *txnWriteBuffer) addToBuffer(key roachpb.Key, val roachpb.Value, seq e
   if it.Valid() {
     // We've already seen a write for this key.
     bw := it.Cur()
-    bw.vals = append(bw.vals, bufferedValue{val: val, seq: seq})
+    val := bufferedValue{val: val, seq: seq}
+    bw.vals = append(bw.vals, val)
+    twb.bufferSize += val.size()
   } else {
     twb.bufferIDAlloc++
-    twb.buffer.Set(&bufferedWrite{
+    bw := &bufferedWrite{
       id:   twb.bufferIDAlloc,
       key:  key,
       vals: []bufferedValue{{val: val, seq: seq}},
-    })
+    }
+    twb.buffer.Set(bw)
+    twb.bufferSize += bw.size()
   }
 }

-// flushWithEndTxn flushes all buffered writes to the KV layer along with the
-// EndTxn request. Responses from the flushing are stripped before returning.
-func (twb *txnWriteBuffer) flushWithEndTxn(
+// flushBufferAndSendBatch flushes all buffered writes when sending the supplied
+// batch request to the KV layer. This is done by pre-pending the buffered
+// writes to the requests in the batch.
+//
+// The response is transformed to hide the fact that requests were added to the
+// batch to flush the buffer. Upper layers remain oblivious to the flush and any
+// buffering in general.
+func (twb *txnWriteBuffer) flushBufferAndSendBatch(
   ctx context.Context, ba *kvpb.BatchRequest,
 ) (*kvpb.BatchResponse, *kvpb.Error) {
+  defer func() {
+    assertTrue(twb.buffer.Len() == 0, "buffer should be empty after flush")
+    assertTrue(twb.bufferSize == 0, "buffer size should be 0 after flush")
+  }()
+
   numBuffered := twb.buffer.Len()
   if numBuffered == 0 {
     return twb.wrapped.SendLocked(ctx, ba) // nothing to flush
   }
-  // Iterate over the buffered writes and flush all buffered writes to the KV
-  // layer by adding them to the batch.
-  //
-  // TODO(arul): If the batch request with the EndTxn request also contains an
-  // overlapping write to a key that's already in the buffer, we could exclude
-  // that write from the buffer.
-  reqs := make([]kvpb.RequestUnion, 0, numBuffered+len(ba.Requests))
+
+  // Once we've flushed the buffer, we disable write buffering going forward.
+  twb.enabled = false
+
+  // Flush all buffered writes by pre-pending them to the requests being sent
+  // in the batch.
+  // First, collect the requests we'll need to flush.
+  toFlushBufferedWrites := make([]bufferedWrite, 0, twb.buffer.Len())
+
   it := twb.buffer.MakeIter()
   for it.First(); it.Valid(); it.Next() {
-    reqs = append(reqs, it.Cur().toRequest())
+    toFlushBufferedWrites = append(toFlushBufferedWrites, *it.Cur())
+  }
+
+  reqs := make([]kvpb.RequestUnion, 0, numBuffered+len(ba.Requests))
+
+  // Next, remove the buffered writes from the buffer and collect them into requests.
+  for _, bw := range toFlushBufferedWrites {
+    reqs = append(reqs, bw.toRequest())
+    twb.buffer.Delete(&bw)
+    twb.bufferSize -= bw.size()
   }

   // Layers below us expect that writes inside a batch are in sequence number
@@ -970,13 +1088,12 @@ func (twb *txnWriteBuffer) flushWithEndTxn(
   })

   ba = ba.ShallowCopy()
-  reqs = append(reqs, ba.Requests...)
-  ba.Requests = reqs
-
+  ba.Requests = append(reqs, ba.Requests...)
   br, pErr := twb.wrapped.SendLocked(ctx, ba)
   if pErr != nil {
     return nil, twb.adjustErrorUponFlush(ctx, numBuffered, pErr)
   }
+
   // Strip out responses for all the flushed buffered writes.
   br.Responses = br.Responses[numBuffered:]
   return br, nil
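
The two hunks above amount to batch surgery: collect one request per buffered write, sort those requests in sequence-number order (per the "Layers below us expect..." comment), prepend them to the caller's batch, send, and strip the first numBuffered responses so callers never observe the flush. A minimal sketch of that reshaping, with plain strings standing in for kvpb requests and responses and a stub send in place of the wrapped lockedSender:

// Sketch of the flush-and-strip reshaping done by flushBufferAndSendBatch.
// Strings stand in for kvpb requests/responses; send stands in for the
// wrapped lockedSender. Not the real kvcoord code.
package main

import "fmt"

// send pretends to evaluate a batch, producing one response per request.
func send(reqs []string) []string {
    resps := make([]string, len(reqs))
    for i, r := range reqs {
        resps[i] = "resp(" + r + ")"
    }
    return resps
}

// flushAndSend prepends the buffered writes to the caller's batch and hides
// their responses from the caller.
func flushAndSend(buffered, batch []string) []string {
    numBuffered := len(buffered)
    reqs := make([]string, 0, numBuffered+len(batch))
    reqs = append(reqs, buffered...) // flushed writes go first
    reqs = append(reqs, batch...)
    resps := send(reqs)
    // Mirrors br.Responses = br.Responses[numBuffered:] in the diff: the
    // responses to the flushed writes are stripped before returning.
    return resps[numBuffered:]
}

func main() {
    buffered := []string{"Put(a)", "Put(b)"}  // previously buffered writes
    batch := []string{"Scan(c, d)", "EndTxn"} // the caller's batch
    fmt.Println(flushAndSend(buffered, batch))
    // Output: [resp(Scan(c, d)) resp(EndTxn)]
}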
@@ -1003,6 +1120,8 @@ func (twb *txnWriteBuffer) testingBufferedWritesAsSlice() []bufferedWrite {
   return writes
 }

+const bufferedWriteStructOverhead = int64(unsafe.Sizeof(bufferedWrite{}))
+
 // bufferedWrite is a buffered write operation to a given key. It maps a key to
 // possibly multiple values[1], each with an associated sequence number.
 //
@@ -1022,6 +1141,16 @@ type bufferedWrite struct {
   vals []bufferedValue // sorted in increasing sequence number order
 }

+func (bw *bufferedWrite) size() int64 {
+  size := keySize(bw.key) + keySize(bw.endKey) + bufferedWriteStructOverhead
+  for _, v := range bw.vals {
+    size += v.size()
+  }
+  return size
+}
+
+const bufferedValueStructOverhead = int64(unsafe.Sizeof(bufferedValue{}))
+
 // bufferedValue is a value written to a key at a given sequence number.
 type bufferedValue struct {
   val roachpb.Value
@@ -1037,6 +1166,10 @@ func (bv *bufferedValue) valPtr() *roachpb.Value {
   return &valCpy
 }

+func (bv *bufferedValue) size() int64 {
+  return int64(len(bv.val.RawBytes)) + bufferedValueStructOverhead
+}
+
 //go:generate ../../../util/interval/generic/gen.sh *bufferedWrite kvcoord

 // Methods required by util/interval/generic type contract.
