Skip to content

Commit 1314dc0

Browse files
committed
sql: add lock and latch wait time metrics
This commit plumbs cumulative lock and latch wait durations from the contention events listener up to `QueryLevelStats`. The metrics are displayed for `EXPLAIN ANALYZE` variants. Surfacing these metrics in the DB console, statement/txn insights etc. is left for future work. Informs #103713 Release note (ui change): The lock and latch wait time components of a query's cumulative contention time are now tracked separately and surfaced as annotations in `EXPLAIN ANALYZE` output.
1 parent f76dd4a commit 1314dc0

33 files changed

+318
-30
lines changed

pkg/ccl/logictestccl/testdata/logic_test/triggers_explain

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,8 @@ quality of service: regular
194194
vectorized batch count: 0
195195
KV time: 0µs
196196
KV contention time: 0µs
197+
KV lock wait time: 0µs
198+
KV latch wait time: 0µs
197199
KV rows decoded: 0
198200
KV pairs read: 0
199201
KV bytes read: 0 B
@@ -367,6 +369,8 @@ quality of service: regular
367369
│ vectorized batch count: 0
368370
│ KV time: 0µs
369371
│ KV contention time: 0µs
372+
│ KV lock wait time: 0µs
373+
│ KV latch wait time: 0µs
370374
│ KV rows decoded: 0
371375
│ KV pairs read: 0
372376
│ KV bytes read: 0 B
@@ -441,6 +445,8 @@ quality of service: regular
441445
│ vectorized batch count: 0
442446
│ KV time: 0µs
443447
│ KV contention time: 0µs
448+
│ KV lock wait time: 0µs
449+
│ KV latch wait time: 0µs
444450
│ KV rows decoded: 1
445451
│ KV pairs read: 2
446452
│ KV bytes read: 8 B
@@ -710,6 +716,8 @@ quality of service: regular
710716
│ vectorized batch count: 0
711717
│ KV time: 0µs
712718
│ KV contention time: 0µs
719+
│ KV lock wait time: 0µs
720+
│ KV latch wait time: 0µs
713721
│ KV rows decoded: 0
714722
│ KV pairs read: 0
715723
│ KV bytes read: 0 B
@@ -783,6 +791,8 @@ quality of service: regular
783791
│ vectorized batch count: 0
784792
│ KV time: 0µs
785793
│ KV contention time: 0µs
794+
│ KV lock wait time: 0µs
795+
│ KV latch wait time: 0µs
786796
│ KV rows decoded: 1
787797
│ KV pairs read: 2
788798
│ KV bytes read: 8 B
@@ -880,6 +890,8 @@ quality of service: regular
880890
│ │ vectorized batch count: 0
881891
│ │ KV time: 0µs
882892
│ │ KV contention time: 0µs
893+
│ │ KV lock wait time: 0µs
894+
│ │ KV latch wait time: 0µs
883895
│ │ KV rows decoded: 1
884896
│ │ KV pairs read: 2
885897
│ │ KV bytes read: 8 B
@@ -927,6 +939,8 @@ quality of service: regular
927939
│ │ vectorized batch count: 0
928940
│ │ KV time: 0µs
929941
│ │ KV contention time: 0µs
942+
│ │ KV lock wait time: 0µs
943+
│ │ KV latch wait time: 0µs
930944
│ │ KV rows decoded: 1
931945
│ │ KV pairs read: 2
932946
│ │ KV bytes read: 8 B
@@ -1103,6 +1117,8 @@ quality of service: regular
11031117
│ vectorized batch count: 0
11041118
│ KV time: 0µs
11051119
│ KV contention time: 0µs
1120+
│ KV lock wait time: 0µs
1121+
│ KV latch wait time: 0µs
11061122
│ KV rows decoded: 1
11071123
│ KV pairs read: 2
11081124
│ KV bytes read: 8 B

pkg/sql/colexecop/operator.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,12 @@ type KVReader interface {
7676
// GetContentionTime returns the amount of time KV reads spent
7777
// contending. It must be safe for concurrent use.
7878
GetContentionTime() time.Duration
79+
// GetLockWaitTime returns the amount of time KV reads spent
80+
// waiting in the lock table. It must be safe for concurrent use.
81+
GetLockWaitTime() time.Duration
82+
// GetLatchWaitTime returns the amount of time KV reads spent
83+
// waiting to acquire a latch. It must be safe for concurrent use.
84+
GetLatchWaitTime() time.Duration
7985
// GetScanStats returns statistics about the scan that happened during the
8086
// KV reads. It must be safe for concurrent use.
8187
GetScanStats() execstats.ScanStats

pkg/sql/colflow/stats.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -293,6 +293,8 @@ func (vsc *vectorizedStatsCollectorImpl) GetStats() *execinfrapb.ComponentStats
293293
s.KV.TuplesRead.Set(uint64(vsc.kvReader.GetRowsRead()))
294294
s.KV.BatchRequestsIssued.Set(uint64(vsc.kvReader.GetBatchRequestsIssued()))
295295
s.KV.ContentionTime.Set(vsc.kvReader.GetContentionTime())
296+
s.KV.LockWaitTime.Set(vsc.kvReader.GetLockWaitTime())
297+
s.KV.LatchWaitTime.Set(vsc.kvReader.GetLatchWaitTime())
296298
s.KV.UsedStreamer = vsc.kvReader.UsedStreamer()
297299
scanStats := vsc.kvReader.GetScanStats()
298300
execstats.PopulateKVMVCCStats(&s.KV, &scanStats)

pkg/sql/execinfrapb/component_stats.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,12 @@ func (s *ComponentStats) formatStats(fn func(suffix string, value interface{}))
188188
if s.KV.ContentionTime.HasValue() {
189189
fn("KV contention time", humanizeutil.Duration(s.KV.ContentionTime.Value()))
190190
}
191+
if s.KV.LockWaitTime.HasValue() {
192+
fn("KV lock wait time", humanizeutil.Duration(s.KV.LockWaitTime.Value()))
193+
}
194+
if s.KV.LatchWaitTime.HasValue() {
195+
fn("KV latch wait time", humanizeutil.Duration(s.KV.LatchWaitTime.Value()))
196+
}
191197
if s.KV.TuplesRead.HasValue() {
192198
fn("KV rows decoded", humanizeutil.Count(s.KV.TuplesRead.Value()))
193199
}
@@ -293,6 +299,12 @@ func (s *ComponentStats) Union(other *ComponentStats) *ComponentStats {
293299
if !result.KV.ContentionTime.HasValue() {
294300
result.KV.ContentionTime = other.KV.ContentionTime
295301
}
302+
if !result.KV.LockWaitTime.HasValue() {
303+
result.KV.LockWaitTime = other.KV.LockWaitTime
304+
}
305+
if !result.KV.LatchWaitTime.HasValue() {
306+
result.KV.LatchWaitTime = other.KV.LatchWaitTime
307+
}
296308
if !result.KV.NumInterfaceSteps.HasValue() {
297309
result.KV.NumInterfaceSteps = other.KV.NumInterfaceSteps
298310
}
@@ -436,6 +448,8 @@ func (s *ComponentStats) MakeDeterministic() {
436448
// KV.
437449
timeVal(&s.KV.KVTime)
438450
timeVal(&s.KV.ContentionTime)
451+
timeVal(&s.KV.LockWaitTime)
452+
timeVal(&s.KV.LatchWaitTime)
439453
resetUint(&s.KV.NumInterfaceSteps)
440454
resetUint(&s.KV.NumInternalSteps)
441455
resetUint(&s.KV.NumInterfaceSeeks)

pkg/sql/execinfrapb/component_stats.proto

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,14 @@ message KVStats {
134134
// other transactions. This time accounts for a portion of KVTime above.
135135
optional util.optional.Duration contention_time = 4 [(gogoproto.nullable) = false];
136136

137+
// LockWaitTime is the cumulative time a KV request spent waiting in the lock
138+
// table. This time accounts for a portion of ContentionTime above.
139+
optional util.optional.Duration lock_wait_time = 29 [(gogoproto.nullable) = false];
140+
141+
// LatchWaitTime is the cumulative time a KV request spent waiting to acquire
142+
// a latch. This time accounts for a portion of ContentionTime above.
143+
optional util.optional.Duration latch_wait_time = 30 [(gogoproto.nullable) = false];
144+
137145
// Internal iteration statistics.
138146
// For details, see pkg/sql/opt/exec/factory.go, pkg/storage/engine.go,
139147
// pebble.InternalIteratorStats, and pebble.RangeKeyIteratorStats.
@@ -177,6 +185,8 @@ message KVStats {
177185
// UsedFollowerRead indicates whether at least some reads were served by the
178186
// follower replicas.
179187
optional bool used_follower_read = 28 [(gogoproto.nullable) = false];
188+
189+
// Next ID: 31
180190
}
181191

182192
// ExecStats contains statistics about the execution of a component.

pkg/sql/execstats/stats.go

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,14 @@ func ShouldCollectStats(ctx context.Context, collectStats bool) bool {
3333
type ContentionEventsListener struct {
3434
cumulativeContentionTime int64 // atomic
3535

36+
// lockWaitTime is the cumulative time spent waiting in the lock table. It
37+
// accounts for a portion of the time in cumulativeContentionTime.
38+
lockWaitTime int64 // atomic
39+
40+
// latchWaitTime is the cumulative time spent waiting to acquire latches. It
41+
// accounts for a portion of the time in cumulativeContentionTime.
42+
latchWaitTime int64 // atomic
43+
3644
// txnID is the ID of the transaction that this listener is associated with.
3745
// This is used to distinguish self-induced latch wait time
3846
// (e.g. for QueryIntent) from contention-induced latch wait time.
@@ -59,6 +67,11 @@ func (c *ContentionEventsListener) Notify(event tracing.Structured) tracing.Even
5967
// waits for a pipelined write to finish replication.
6068
if c.txnID == uuid.Nil || c.txnID != ce.TxnMeta.ID {
6169
atomic.AddInt64(&c.cumulativeContentionTime, int64(ce.Duration))
70+
if ce.IsLatch {
71+
atomic.AddInt64(&c.latchWaitTime, int64(ce.Duration))
72+
} else {
73+
atomic.AddInt64(&c.lockWaitTime, int64(ce.Duration))
74+
}
6275
}
6376
return tracing.EventConsumed
6477
}
@@ -69,6 +82,18 @@ func (c *ContentionEventsListener) GetContentionTime() time.Duration {
6982
return time.Duration(atomic.LoadInt64(&c.cumulativeContentionTime))
7083
}
7184

85+
// GetLockWaitTime returns the cumulative lock wait time this listener has seen
86+
// so far.
87+
func (c *ContentionEventsListener) GetLockWaitTime() time.Duration {
88+
return time.Duration(atomic.LoadInt64(&c.lockWaitTime))
89+
}
90+
91+
// GetLatchWaitTime returns the cumulative latch wait time this listener has
92+
// seen so far.
93+
func (c *ContentionEventsListener) GetLatchWaitTime() time.Duration {
94+
return time.Duration(atomic.LoadInt64(&c.latchWaitTime))
95+
}
96+
7297
// ScanStatsListener aggregates all kvpb.ScanStats objects into a single
7398
// ScanStats object. It additionally looks for kvpb.UsedFollowerRead objects.
7499
type ScanStatsListener struct {

pkg/sql/execstats/traceanalyzer.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,8 @@ type QueryLevelStats struct {
118118
MvccRangeKeySkippedPoints int64
119119
NetworkMessages int64
120120
ContentionTime time.Duration
121+
LockWaitTime time.Duration
122+
LatchWaitTime time.Duration
121123
ContentionEvents []kvpb.ContentionEvent
122124
RUEstimate float64
123125
CPUTime time.Duration
@@ -181,6 +183,8 @@ func (s *QueryLevelStats) Accumulate(other QueryLevelStats) {
181183
s.MvccRangeKeySkippedPoints += other.MvccRangeKeySkippedPoints
182184
s.NetworkMessages += other.NetworkMessages
183185
s.ContentionTime += other.ContentionTime
186+
s.LockWaitTime += other.LockWaitTime
187+
s.LatchWaitTime += other.LatchWaitTime
184188
s.ContentionEvents = append(s.ContentionEvents, other.ContentionEvents...)
185189
s.RUEstimate += other.RUEstimate
186190
s.CPUTime += other.CPUTime
@@ -280,6 +284,8 @@ func (a *TraceAnalyzer) ProcessStats() {
280284
s.MvccRangeKeyContainedPoints += int64(stats.KV.RangeKeyContainedPoints.Value())
281285
s.MvccRangeKeySkippedPoints += int64(stats.KV.RangeKeySkippedPoints.Value())
282286
s.ContentionTime += stats.KV.ContentionTime.Value()
287+
s.LockWaitTime += stats.KV.LockWaitTime.Value()
288+
s.LatchWaitTime += stats.KV.LatchWaitTime.Value()
283289
s.RUEstimate += float64(stats.Exec.ConsumedRU.Value())
284290
s.CPUTime += stats.Exec.CPUTime.Value()
285291
}

pkg/sql/execstats/traceanalyzer_test.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,8 @@ func TestQueryLevelStatsAccumulate(t *testing.T) {
242242
KVTime: 5 * time.Second,
243243
NetworkMessages: 6,
244244
ContentionTime: 7 * time.Second,
245+
LockWaitTime: 4 * time.Second,
246+
LatchWaitTime: 3 * time.Second,
245247
ContentionEvents: []kvpb.ContentionEvent{aEvent},
246248
MaxDiskUsage: 8,
247249
RUEstimate: 9,
@@ -276,6 +278,8 @@ func TestQueryLevelStatsAccumulate(t *testing.T) {
276278
KVTime: 12 * time.Second,
277279
NetworkMessages: 13,
278280
ContentionTime: 14 * time.Second,
281+
LockWaitTime: 10 * time.Second,
282+
LatchWaitTime: 4 * time.Second,
279283
ContentionEvents: []kvpb.ContentionEvent{bEvent},
280284
MaxDiskUsage: 15,
281285
RUEstimate: 16,
@@ -309,6 +313,8 @@ func TestQueryLevelStatsAccumulate(t *testing.T) {
309313
KVTime: 17 * time.Second,
310314
NetworkMessages: 19,
311315
ContentionTime: 21 * time.Second,
316+
LockWaitTime: 14 * time.Second,
317+
LatchWaitTime: 7 * time.Second,
312318
ContentionEvents: []kvpb.ContentionEvent{aEvent, bEvent},
313319
MaxDiskUsage: 15,
314320
RUEstimate: 25,

pkg/sql/instrumentation.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -830,6 +830,12 @@ func (ih *instrumentationHelper) emitExplainAnalyzePlanToOutputBuilder(
830830
if queryStats.ContentionTime != 0 {
831831
ob.AddContentionTime(queryStats.ContentionTime)
832832
}
833+
if queryStats.LockWaitTime != 0 {
834+
ob.AddLockWaitTime(queryStats.LockWaitTime)
835+
}
836+
if queryStats.LatchWaitTime != 0 {
837+
ob.AddLatchWaitTime(queryStats.LatchWaitTime)
838+
}
833839

834840
ob.AddMaxMemUsage(queryStats.MaxMemUsage)
835841
ob.AddNetworkStats(queryStats.NetworkMessages, queryStats.NetworkBytesSent)
@@ -1031,6 +1037,8 @@ func (m execNodeTraceMetadata) annotateExplain(
10311037
}
10321038
nodeStats.KVTime.MaybeAdd(stats.KV.KVTime)
10331039
nodeStats.KVContentionTime.MaybeAdd(stats.KV.ContentionTime)
1040+
nodeStats.KVLockWaitTime.MaybeAdd(stats.KV.LockWaitTime)
1041+
nodeStats.KVLatchWaitTime.MaybeAdd(stats.KV.LatchWaitTime)
10341042
nodeStats.KVBytesRead.MaybeAdd(stats.KV.BytesRead)
10351043
nodeStats.KVPairsRead.MaybeAdd(stats.KV.KVPairsRead)
10361044
nodeStats.KVRowsRead.MaybeAdd(stats.KV.TuplesRead)

pkg/sql/opt/exec/execbuilder/testdata/cascade

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1099,6 +1099,8 @@ quality of service: regular
10991099
│ actual row count: 1
11001100
│ KV time: 0µs
11011101
│ KV contention time: 0µs
1102+
│ KV lock wait time: 0µs
1103+
│ KV latch wait time: 0µs
11021104
│ KV rows decoded: 1
11031105
│ KV pairs read: 2
11041106
│ KV bytes read: 8 B
@@ -1140,6 +1142,8 @@ quality of service: regular
11401142
│ │ actual row count: 3
11411143
│ │ KV time: 0µs
11421144
│ │ KV contention time: 0µs
1145+
│ │ KV lock wait time: 0µs
1146+
│ │ KV latch wait time: 0µs
11431147
│ │ KV rows decoded: 3
11441148
│ │ KV pairs read: 6
11451149
│ │ KV bytes read: 24 B
@@ -1180,6 +1184,8 @@ quality of service: regular
11801184
│ │ actual row count: 1
11811185
│ │ KV time: 0µs
11821186
│ │ KV contention time: 0µs
1187+
│ │ KV lock wait time: 0µs
1188+
│ │ KV latch wait time: 0µs
11831189
│ │ KV rows decoded: 1
11841190
│ │ KV pairs read: 2
11851191
│ │ KV bytes read: 8 B
@@ -1509,6 +1515,8 @@ quality of service: regular
15091515
│ actual row count: 0
15101516
│ KV time: 0µs
15111517
│ KV contention time: 0µs
1518+
│ KV lock wait time: 0µs
1519+
│ KV latch wait time: 0µs
15121520
│ KV rows decoded: 0
15131521
│ KV bytes read: 0 B
15141522
│ KV gRPC calls: 0

0 commit comments

Comments
 (0)