Merge #147065

craig[bot] · stevendanna · craig[bot] · commit 4819df357c0e · 2025-09-26T12:51:28.000Z
147065: concurrency: update wait queues in AddDiscoveredLock r=miraradeva a=stevendanna Concurrent readers in the face of a concurrent, retrying writer can result in a situation where AddDiscoveredLock moves the timestamp of a held intent past the read timestamp of a waiting reader. The reader should be unblocked in this case but previously wasn't, resulting in a lock table verification assertion failure in the form of: error: non locking reader ... does not conflict with lock holder This is my best theory for what is happening in #146749 based on increased logging. It is difficult to be certain given the required ordering of events is hard to observe with locking. See the comment in the test for a more complete timeline that can lead to the bug. Fixes #146749 Release note: None Co-authored-by: Steven Danna <danna@cockroachlabs.com>
diff --git a/pkg/kv/kvserver/concurrency/lock_table.go b/pkg/kv/kvserver/concurrency/lock_table.go
@@ -3132,6 +3132,7 @@ func (kl *keyLocks) discoveredLock(
 	accessStrength lock.Strength,
 	notRemovable bool,
 	clock *hlc.Clock,
+	st *cluster.Settings,
 ) error {
 	kl.mu.Lock()
 	defer kl.mu.Unlock()
@@ -3144,7 +3145,14 @@ func (kl *keyLocks) discoveredLock(
 	if kl.isLockedBy(foundLock.Txn.ID) {
 		e := kl.heldBy[foundLock.Txn.ID]
 		tl = e.Value
+
+		beforeTs := tl.writeTS()
+
 		tl.replicatedInfo.acquire(foundLock.Strength, foundLock.Txn.WriteTimestamp)
+
+		if beforeTs.Less(tl.writeTS()) {
+			kl.recomputeWaitQueues(st)
+		}
 		// TODO(arul): If the discovered lock indicates a newer epoch than what's
 		// being tracked, should we clear out unreplicatedLockInfo here?
 	} else {
@@ -4352,7 +4360,7 @@ func (t *lockTableImpl) AddDiscoveredLock(
 		g.notRemovableLock = l
 		notRemovableLock = true
 	}
-	err = l.discoveredLock(foundLock, g, str, notRemovableLock, g.lt.clock)
+	err = l.discoveredLock(foundLock, g, str, notRemovableLock, g.lt.clock, g.lt.settings)
 	// Can't release tree.mu until call l.discoveredLock() since someone may
 	// find an empty lock and remove it from the tree.
 	t.locks.mu.Unlock()
diff --git a/pkg/kv/kvserver/concurrency/lock_table_test.go b/pkg/kv/kvserver/concurrency/lock_table_test.go
@@ -312,6 +312,7 @@ func TestLockTableBasic(t *testing.T) {
 				if d.HasArg("skip-locked") {
 					waitPolicy = lock.WaitPolicy_SkipLocked
 				}
+				updateRetainedTxn := !d.HasArg("no-update-retained-txn")
 				var maxLockWaitQueueLength int
 				if d.HasArg("max-lock-wait-queue-length") {
 					d.ScanArgs(t, "max-lock-wait-queue-length", &maxLockWaitQueueLength)
@@ -331,11 +332,17 @@ func TestLockTableBasic(t *testing.T) {
 				if txnMeta != nil {
 					// Update the transaction's timestamp, if necessary. The transaction
 					// may have needed to move its timestamp for any number of reasons.
-					txnMeta.WriteTimestamp = ts
+					if updateRetainedTxn {
+						txnMeta.WriteTimestamp = ts
+					}
 					ba.Txn = &roachpb.Transaction{
 						TxnMeta:       *txnMeta,
 						ReadTimestamp: ts,
 					}
+					if !updateRetainedTxn {
+						ba.Txn.WriteTimestamp = ts
+					}
+
 					req.Txn = ba.Txn
 				}
 				requestsByName[reqName] = req
diff --git a/pkg/kv/kvserver/concurrency/testdata/lock_table/add_discovered b/pkg/kv/kvserver/concurrency/testdata/lock_table/add_discovered
@@ -141,3 +141,157 @@ new: state=doneWaiting
 dequeue r=req2
 ----
 num=0
+
+# -------------------------------------------------------------
+# This is a regression test for #146749
+#
+# It attempts to reproduce the following series of events on a single key.
+#
+# s | w1@ts10                  | w2@ts12                  | r1@ts10                    |r2@ts12                      | r3@ts11
+# --+--------------------------+--------------------------+----------------------------+-----------------------------+-----------------------------
+# 1 | Seq                      |                          |                            |                             |
+# 2 | AcquireLock(intent@ts10) |                          |                            |                             |
+# 3 | FinishReq                |                          |                            |                             |
+# 4 |                          | SeqReq                   | SeqReq                     |                             |
+# 5 |                          | AcquireLock(intent@ts12) |                            |                             |
+# 6 |                          | FinishReq                |                            |                             |
+# 8 |                          |                          |                            | SeqReq                      |
+# 9 |                          |                          | AddDiscovered(intent@ts10) |                             |
+# 10|                          |                          | FinishReq                  |                             |
+# 11|                          |                          |                            |                             | SeqReq(waits on intent@10)
+# 12|                          |                          |                            | AddDiscovered(intent@ts12)  |
+# 13|                          |                          |                            | FinishReq                   |
+#
+# Note that w1 and w2 are from the same txn.
+#
+# The invariant that the test cares about is that r3 that starts waiting at step 11 is unblocked by the lock table
+# update at step 12.
+#
+# Note that in the test as written, the requests map onto the above
+# diagram as follows:
+#
+# w1 = req1
+# w2 = req3
+# r1 = req2
+# r2 = req4
+# r3 = req5
+#
+# -------------------------------------------------------------
+
+new-lock-table maxlocks=10000
+----
+
+# txn1 is used for the writer (req1 and req3)
+new-txn txn=txn1 ts=10 epoch=0
+----
+
+# txn2-4 are our readers.
+new-txn txn=txn2 ts=10 epoch=0
+----
+
+new-txn txn=txn3 ts=12 epoch=0
+----
+
+new-txn txn=txn4 ts=11 epoch=0
+----
+
+# w1
+new-request r=req1 txn=txn1 ts=10 spans=intent@
+----
+
+scan r=req1
+----
+start-waiting: false
+
+acquire r=req1 k=a durability=r strength=intent
+----
+num=0
+
+dequeue r=req1
+----
+num=0
+
+# r1
+new-request r=req2 txn=txn2 ts=10 spans=none@a
+----
+
+# w2
+# Note that we don't allow the retained txn to be updated because that will erroneously impact
+# the first call to add-discovered-lock.
+new-request r=req3 txn=txn1 ts=12 spans=intent@a no-update-retained-txn
+----
+
+scan r=req3
+----
+start-waiting: false
+
+scan r=req2
+----
+start-waiting: false
+
+acquire r=req3 k=a durability=r strength=intent
+----
+num=0
+
+dequeue r=req3
+----
+num=0
+
+# r2
+new-request r=req4 txn=txn3 ts=12 spans=none@a
+----
+
+scan r=req4
+----
+start-waiting: false
+
+# The first reader now discovers the intent at ts10.
+add-discovered r=req2 k=a txn=txn1
+----
+num=1
+ lock: "a"
+  holder: txn: 00000000-0000-0000-0000-000000000001 epoch: 0, iso: Serializable, ts: 10.000000000,0, info: repl [Intent]
+
+dequeue r=req2
+----
+num=1
+ lock: "a"
+  holder: txn: 00000000-0000-0000-0000-000000000001 epoch: 0, iso: Serializable, ts: 10.000000000,0, info: repl [Intent]
+
+# Third reader that waits on intents.
+new-request r=req5 txn=txn4 ts=11 spans=none@a
+----
+
+scan r=req5
+----
+start-waiting: true
+
+print
+----
+num=1
+ lock: "a"
+  holder: txn: 00000000-0000-0000-0000-000000000001 epoch: 0, iso: Serializable, ts: 10.000000000,0, info: repl [Intent]
+   waiting readers:
+    req: 5, txn: 00000000-0000-0000-0000-000000000004
+
+# Now update txn1's retained record to ts12.
+update-txn-not-observed txn=txn1 ts=12 epoch=1
+----
+
+add-discovered r=req4 k=a txn=txn1
+----
+num=1
+ lock: "a"
+  holder: txn: 00000000-0000-0000-0000-000000000001 epoch: 0, iso: Serializable, ts: 12.000000000,0, info: repl [Intent]
+
+dequeue r=req4
+----
+num=1
+ lock: "a"
+  holder: txn: 00000000-0000-0000-0000-000000000001 epoch: 0, iso: Serializable, ts: 12.000000000,0, info: repl [Intent]
+
+dequeue r=req5
+----
+num=1
+ lock: "a"
+  holder: txn: 00000000-0000-0000-0000-000000000001 epoch: 0, iso: Serializable, ts: 12.000000000,0, info: repl [Intent]