Skip to content

Commit f76dd4a

Browse files
committed
execstats: add latch wait time into cumulative contention time
This patch adds a field `IsLatch` to the `ContentionEvent` protobuf struct that indicates that a contention event was emitted by the latch table. The event is added to the trace and used to increment the contention time metrics, but is only added to the contention registry if `register_latch_wait_contention_events` is set to true. This commit also propagates the transaction ID into the latch manager and the contention events listener. This information is used during a latch wait event to distinguish between time waiting for a transaction's own requests (e.g. for QueryIntent) and for real contention between different transactions. Fixes #103713 Release note (ui change): The contention time metric now includes time spent waiting to acquire latches in addition to time spent acquiring locks. This applies to both the cumulative contention time visible in the db console and the annotations of an `EXPLAIN ANALYZE` output.
1 parent 3a25007 commit f76dd4a

27 files changed

+244
-85
lines changed

pkg/kv/kvpb/api.go

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2349,7 +2349,13 @@ func (r *IsSpanEmptyResponse) IsEmpty() bool {
23492349

23502350
// SafeFormat implements redact.SafeFormatter.
23512351
func (c *ContentionEvent) SafeFormat(w redact.SafePrinter, _ rune) {
2352-
w.Printf("conflicted with %s on %s for %.3fs", c.TxnMeta.ID, c.Key, c.Duration.Seconds())
2352+
prefix := redact.SafeString("conflicted")
2353+
if c.IsLatch {
2354+
prefix = "latch conflict"
2355+
}
2356+
w.Printf("%s with %s on %s for %.3fs",
2357+
prefix, c.TxnMeta.ID, c.Key, c.Duration.Seconds(),
2358+
)
23532359
}
23542360

23552361
// String implements fmt.Stringer.

pkg/kv/kvpb/api.proto

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3737,6 +3737,9 @@ message ContentionEvent {
37373737
// Duration spent contending against the other transaction.
37383738
google.protobuf.Duration duration = 3 [(gogoproto.nullable) = false,
37393739
(gogoproto.stdduration) = true];
3740+
// IsLatch, if true, indicates that the contention event was due to waiting to
3741+
// acquire a latch.
3742+
bool is_latch = 4;
37403743
}
37413744

37423745
// ScanStats is a message that tracks miscellaneous statistics of all Gets,

pkg/kv/kvserver/concurrency/concurrency_control.go

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@ import (
2222
"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
2323
"github.com/cockroachdb/cockroach/pkg/util/hlc"
2424
"github.com/cockroachdb/cockroach/pkg/util/uuid"
25-
"github.com/cockroachdb/redact"
2625
)
2726

2827
// Manager is a structure that sequences incoming requests and provides
@@ -446,9 +445,8 @@ type Request struct {
446445
// not also passed an exiting Guard.
447446
LockSpans *lockspanset.LockSpanSet
448447

449-
// The SafeFormatter capable of formatting the request. This is used to enrich
450-
// logging with request level information when latches conflict.
451-
BaFmt redact.SafeFormatter
448+
// Batch is the batch to which the request belongs.
449+
Batch *kvpb.BatchRequest
452450

453451
// DeadlockTimeout is the amount of time that the request will wait on a lock
454452
// before pushing the lock holder's transaction for deadlock detection.
@@ -531,7 +529,12 @@ type latchManager interface {
531529
// WaitFor waits for conflicting latches on the specified spans without adding
532530
// any latches itself. Fast path for operations that only require flushing out
533531
// old operations without blocking any new ones.
534-
WaitFor(ctx context.Context, spans *spanset.SpanSet, pp poison.Policy, baFmt redact.SafeFormatter) *Error
532+
WaitFor(
533+
ctx context.Context,
534+
spans *spanset.SpanSet,
535+
pp poison.Policy,
536+
ba *kvpb.BatchRequest,
537+
) *Error
535538

536539
// Poison a guard's latches, allowing waiters to fail fast.
537540
Poison(latchGuard)

pkg/kv/kvserver/concurrency/concurrency_manager.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,7 @@ func NewManager(cfg Config) Manager {
181181
cfg.SlowLatchGauge,
182182
cfg.Settings,
183183
cfg.LatchWaitDurations,
184+
cfg.Clock,
184185
),
185186
},
186187
lt: lt,
@@ -269,7 +270,7 @@ func (m *managerImpl) sequenceReqWithGuard(
269270
// them.
270271
if shouldWaitOnLatchesWithoutAcquiring(g.Req) {
271272
log.Event(ctx, "waiting on latches without acquiring")
272-
return nil, m.lm.WaitFor(ctx, g.Req.LatchSpans, g.Req.PoisonPolicy, g.Req.BaFmt)
273+
return nil, m.lm.WaitFor(ctx, g.Req.LatchSpans, g.Req.PoisonPolicy, g.Req.Batch)
273274
}
274275

275276
// Provide the manager with an opportunity to intercept the request. It

pkg/kv/kvserver/concurrency/concurrency_manager_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -220,7 +220,7 @@ func TestConcurrencyManagerBasic(t *testing.T) {
220220
LatchSpans: latchSpans,
221221
LockSpans: lockSpans,
222222
PoisonPolicy: pp,
223-
BaFmt: ba,
223+
Batch: ba,
224224
}
225225
return ""
226226

pkg/kv/kvserver/concurrency/latch_manager.go

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@ import (
1212
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/concurrency/poison"
1313
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/spanlatch"
1414
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/spanset"
15-
"github.com/cockroachdb/redact"
1615
)
1716

1817
// latchManagerImpl implements the latchManager interface.
@@ -21,15 +20,15 @@ type latchManagerImpl struct {
2120
}
2221

2322
func (m *latchManagerImpl) Acquire(ctx context.Context, req Request) (latchGuard, *Error) {
24-
lg, err := m.m.Acquire(ctx, req.LatchSpans, req.PoisonPolicy, req.BaFmt)
23+
lg, err := m.m.Acquire(ctx, req.LatchSpans, req.PoisonPolicy, req.Batch)
2524
if err != nil {
2625
return nil, kvpb.NewError(err)
2726
}
2827
return lg, nil
2928
}
3029

3130
func (m *latchManagerImpl) AcquireOptimistic(req Request) latchGuard {
32-
lg := m.m.AcquireOptimistic(req.LatchSpans, req.PoisonPolicy, req.BaFmt)
31+
lg := m.m.AcquireOptimistic(req.LatchSpans, req.PoisonPolicy, req.Batch)
3332
return lg
3433
}
3534

@@ -48,9 +47,9 @@ func (m *latchManagerImpl) WaitUntilAcquired(
4847
}
4948

5049
func (m *latchManagerImpl) WaitFor(
51-
ctx context.Context, ss *spanset.SpanSet, pp poison.Policy, baFmt redact.SafeFormatter,
50+
ctx context.Context, ss *spanset.SpanSet, pp poison.Policy, ba *kvpb.BatchRequest,
5251
) *Error {
53-
err := m.m.WaitFor(ctx, ss, pp, baFmt)
52+
err := m.m.WaitFor(ctx, ss, pp, ba)
5453
if err != nil {
5554
return kvpb.NewError(err)
5655
}

pkg/kv/kvserver/concurrency/lock_table_test.go

Lines changed: 28 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -326,7 +326,7 @@ func TestLockTableBasic(t *testing.T) {
326326
MaxLockWaitQueueLength: maxLockWaitQueueLength,
327327
LatchSpans: latchSpans,
328328
LockSpans: lockSpans,
329-
BaFmt: ba,
329+
Batch: ba,
330330
}
331331
if txnMeta != nil {
332332
// Update the transaction's timestamp, if necessary. The transaction
@@ -890,7 +890,7 @@ func TestLockTableMaxLocks(t *testing.T) {
890890
Timestamp: ba.Timestamp,
891891
LatchSpans: latchSpans,
892892
LockSpans: lockSpans,
893-
BaFmt: ba,
893+
Batch: ba,
894894
}
895895
reqs = append(reqs, req)
896896
ltg, err := lt.ScanAndEnqueue(req, nil)
@@ -1030,7 +1030,7 @@ func TestLockTableMaxLocksWithMultipleNotRemovableRefs(t *testing.T) {
10301030
Timestamp: ba.Timestamp,
10311031
LatchSpans: latchSpans,
10321032
LockSpans: lockSpans,
1033-
BaFmt: ba,
1033+
Batch: ba,
10341034
}
10351035
ltg, err := lt.ScanAndEnqueue(req, nil)
10361036
require.Nil(t, err)
@@ -1129,7 +1129,7 @@ func doWork(ctx context.Context, item *workItem, e *workloadExecutor) error {
11291129
// cancellation, the code makes sure to release latches when returning
11301130
// early due to error. Otherwise other requests will get stuck and
11311131
// group.Wait() will not return until the test times out.
1132-
lg, err = e.lm.Acquire(context.Background(), item.request.LatchSpans, poison.Policy_Error, item.request.BaFmt)
1132+
lg, err = e.lm.Acquire(context.Background(), item.request.LatchSpans, poison.Policy_Error, item.request.Batch)
11331133
if err != nil {
11341134
return err
11351135
}
@@ -1269,7 +1269,7 @@ func makeWorkItemForRequest(wi workloadItem) workItem {
12691269
}
12701270

12711271
type workloadExecutor struct {
1272-
lm spanlatch.Manager
1272+
lm *spanlatch.Manager
12731273
lt lockTable
12741274

12751275
// Protects the following fields in transactionState: acquiredLocks and
@@ -1285,19 +1285,27 @@ type workloadExecutor struct {
12851285

12861286
func newWorkLoadExecutor(items []workloadItem, concurrency int) *workloadExecutor {
12871287
const maxLocks = 100000
1288-
ltImpl := newLockTable(
1289-
maxLocks, roachpb.RangeID(3), hlc.NewClockForTesting(nil), cluster.MakeTestingClusterSettings(),
1288+
clock := hlc.NewClockForTesting(nil)
1289+
settings := cluster.MakeTestingClusterSettings()
1290+
lm := spanlatch.Make(
1291+
nil, /* stopper */
1292+
nil, /* slowReqs */
1293+
settings,
1294+
nil, /* latchWaitDurations */
1295+
clock,
12901296
)
1297+
ltImpl := newLockTable(maxLocks, roachpb.RangeID(3), clock, settings)
12911298
ltImpl.enabled = true
12921299
lt := maybeWrapInVerifyingLockTable(ltImpl)
1293-
return &workloadExecutor{
1294-
lm: spanlatch.Manager{},
1300+
ex := &workloadExecutor{
1301+
lm: &lm,
12951302
lt: lt,
12961303
items: items,
12971304
transactions: make(map[uuid.UUID]*transactionState),
12981305
doneWork: make(chan *workItem),
12991306
concurrency: concurrency,
13001307
}
1308+
return ex
13011309
}
13021310

13031311
func (e *workloadExecutor) acquireLock(txn *roachpb.Transaction, toAcq lockToAcquire) error {
@@ -1553,7 +1561,7 @@ func TestLockTableConcurrentSingleRequests(t *testing.T) {
15531561
Timestamp: ba.Timestamp,
15541562
LatchSpans: latchSpans,
15551563
LockSpans: lockSpans,
1556-
BaFmt: ba,
1564+
Batch: ba,
15571565
}
15581566
items = append(items, workloadItem{request: request})
15591567
if txn != nil {
@@ -1693,7 +1701,7 @@ func testLockTableConcurrentRequests(
16931701
Timestamp: ba.Timestamp,
16941702
LatchSpans: latchSpans,
16951703
LockSpans: lockSpans,
1696-
BaFmt: ba,
1704+
Batch: ba,
16971705
}
16981706
if txnMeta != nil {
16991707
request.Txn = &roachpb.Transaction{
@@ -1809,7 +1817,7 @@ func doBenchWork(item *benchWorkItem, env benchEnv, doneCh chan<- error) {
18091817
firstIter := true
18101818
ctx := context.Background()
18111819
for {
1812-
if lg, err = env.lm.Acquire(context.Background(), item.LatchSpans, poison.Policy_Error, item.BaFmt); err != nil {
1820+
if lg, err = env.lm.Acquire(context.Background(), item.LatchSpans, poison.Policy_Error, item.Batch); err != nil {
18131821
doneCh <- err
18141822
return
18151823
}
@@ -1856,7 +1864,7 @@ func doBenchWork(item *benchWorkItem, env benchEnv, doneCh chan<- error) {
18561864
return
18571865
}
18581866
// Release locks.
1859-
if lg, err = env.lm.Acquire(context.Background(), item.LatchSpans, poison.Policy_Error, item.BaFmt); err != nil {
1867+
if lg, err = env.lm.Acquire(context.Background(), item.LatchSpans, poison.Policy_Error, item.Batch); err != nil {
18601868
doneCh <- err
18611869
return
18621870
}
@@ -1891,7 +1899,7 @@ func createRequests(index int, numOutstanding int, numKeys int, numReadKeys int)
18911899
Timestamp: ts,
18921900
LatchSpans: latchSpans,
18931901
LockSpans: lockSpans,
1894-
BaFmt: ba,
1902+
Batch: ba,
18951903
},
18961904
}
18971905
for i := 0; i < numKeys; i++ {
@@ -1986,15 +1994,15 @@ func BenchmarkLockTable(b *testing.B) {
19861994
var numRequestsWaited uint64
19871995
var numScanCalls uint64
19881996
const maxLocks = 100000
1989-
lt := newLockTable(
1990-
maxLocks,
1991-
roachpb.RangeID(3),
1992-
hlc.NewClockForTesting(nil),
1993-
cluster.MakeTestingClusterSettings(),
1997+
clock := hlc.NewClockForTesting(nil)
1998+
settings := cluster.MakeTestingClusterSettings()
1999+
lm := spanlatch.Make(
2000+
nil /* stopper */, nil /* slowReqs */, settings, nil /* latchWaitDurations */, clock,
19942001
)
2002+
lt := newLockTable(maxLocks, roachpb.RangeID(3), clock, settings)
19952003
lt.enabled = true
19962004
env := benchEnv{
1997-
lm: &spanlatch.Manager{},
2005+
lm: &lm,
19982006
lt: lt,
19992007
numRequestsWaited: &numRequestsWaited,
20002008
numScanCalls: &numScanCalls,

pkg/kv/kvserver/concurrency/lock_table_waiter_test.go

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,7 @@ func TestLockTableWaiterWithTxn(t *testing.T) {
147147
return Request{
148148
Txn: &txn,
149149
Timestamp: ba.Timestamp,
150-
BaFmt: ba,
150+
Batch: ba,
151151
}
152152
}
153153

@@ -230,7 +230,7 @@ func TestLockTableWaiterWithNonTxn(t *testing.T) {
230230
return Request{
231231
Timestamp: ba.Timestamp,
232232
NonTxnPriority: ba.UserPriority,
233-
BaFmt: ba,
233+
Batch: ba,
234234
}
235235
}
236236

@@ -431,7 +431,7 @@ func TestLockTableWaiterWithErrorWaitPolicy(t *testing.T) {
431431
Txn: &txn,
432432
Timestamp: ba.Timestamp,
433433
WaitPolicy: ba.WaitPolicy,
434-
BaFmt: ba,
434+
Batch: ba,
435435
}
436436
}
437437
makeHighPriReq := func() Request {
@@ -599,7 +599,7 @@ func TestLockTableWaiterWithLockTimeout(t *testing.T) {
599599
Txn: ba.Txn,
600600
Timestamp: ba.Timestamp,
601601
LockTimeout: ba.LockTimeout,
602-
BaFmt: ba,
602+
Batch: ba,
603603
}
604604
}
605605
if !txn {
@@ -610,7 +610,7 @@ func TestLockTableWaiterWithLockTimeout(t *testing.T) {
610610
return Request{
611611
Timestamp: ba.Timestamp,
612612
LockTimeout: ba.LockTimeout,
613-
BaFmt: ba,
613+
Batch: ba,
614614
}
615615
}
616616
}
@@ -787,7 +787,7 @@ func TestLockTableWaiterIntentResolverError(t *testing.T) {
787787
req := Request{
788788
Txn: ba.Txn,
789789
Timestamp: ba.Timestamp,
790-
BaFmt: ba,
790+
Batch: ba,
791791
}
792792

793793
// Test with both synchronous and asynchronous pushes.
@@ -847,7 +847,7 @@ func TestLockTableWaiterDeferredIntentResolverError(t *testing.T) {
847847
req := Request{
848848
Txn: ba.Txn,
849849
Timestamp: ba.Timestamp,
850-
BaFmt: ba,
850+
Batch: ba,
851851
}
852852
keyA := roachpb.Key("keyA")
853853
pusheeTxn := makeTxnProto("pushee")

pkg/kv/kvserver/concurrency/testdata/concurrency_manager/basic

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -151,9 +151,13 @@ debug-latch-manager
151151
write count: 1
152152
read count: 1
153153

154+
debug-advance-clock ts=123
155+
----
156+
154157
finish req=req3
155158
----
156159
[-] finish req3: finishing request
160+
[3] sequence req4: latch conflict with 00000003-0000-0000-0000-000000000000 on ‹"k"› for 123.000s
157161
[3] sequence req4: scanning lock table for conflicting locks
158162
[3] sequence req4: sequencing complete, returned guard
159163

@@ -226,13 +230,13 @@ debug-advance-clock ts=123
226230
on-txn-updated txn=txn2 status=pending ts=18,1
227231
----
228232
[-] update txn: increasing timestamp of txn2
229-
[2] sequence req5: resolving intent ‹"k"› for txn 00000002 with PENDING status and clock observation {1 246.000000000,0}
233+
[2] sequence req5: resolving intent ‹"k"› for txn 00000002 with PENDING status and clock observation {1 369.000000000,0}
230234
[2] sequence req5: lock wait-queue event: done waiting
231235
[2] sequence req5: conflicted with 00000002-0000-0000-0000-000000000000 on ‹"k"› for 123.000s
232236
[2] sequence req5: acquiring latches
233237
[2] sequence req5: scanning lock table for conflicting locks
234238
[2] sequence req5: sequencing complete, returned guard
235-
[3] sequence req6: resolving intent ‹"k"› for txn 00000002 with PENDING status and clock observation {1 246.000000000,1}
239+
[3] sequence req6: resolving intent ‹"k"› for txn 00000002 with PENDING status and clock observation {1 369.000000000,1}
236240
[3] sequence req6: lock wait-queue event: done waiting
237241
[3] sequence req6: conflicted with 00000002-0000-0000-0000-000000000000 on ‹"k"› for 123.000s
238242
[3] sequence req6: acquiring latches

pkg/kv/kvserver/concurrency/testdata/concurrency_manager/optimistic

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -175,10 +175,14 @@ sequence req=req6 eval-kind=pess-after-opt
175175
[8] sequence req6: waiting to acquire read latch ‹{a-e}›@12.000000000,1 for request Scan [‹"a"›,‹"e"›), [txn: 00000002], held by write latch ‹d›@10.000000000,1 for request Put [‹"d"›], [txn: 00000003]
176176
[8] sequence req6: blocked on select in spanlatch.(*Manager).waitForSignal
177177

178+
debug-advance-clock ts=123
179+
----
180+
178181
# req4 finishing releases the latch and allows req6 to proceed.
179182
finish req=req4
180183
----
181184
[-] finish req4: finishing request
185+
[8] sequence req6: latch conflict with 00000003-0000-0000-0000-000000000000 on ‹"d"› for 123.000s
182186
[8] sequence req6: scanning lock table for conflicting locks
183187
[8] sequence req6: sequencing complete, returned guard
184188

0 commit comments

Comments
 (0)