
Commit ee80cea
kvserver: add metrics for when the lock table sheds locks

This commit adds two metrics: one to track the number of times the lock table shed locks because it ran into memory limits, and another to track the number of locks shed as a result of running into those limits. I've modified TestLockTableMaxLocks to make use of these.

Epic: none

Release note: None

1 parent 8f858b5

File tree

8 files changed, +184 -54 lines changed

docs/generated/metrics/metrics.yaml

Lines changed: 16 additions & 0 deletions
@@ -12420,6 +12420,14 @@ layers:
       unit: COUNT
       aggregation: AVG
       derivative: NONE
+    - name: kv.concurrency.locks_shed_due_to_memory_limit
+      exported_name: kv_concurrency_locks_shed_due_to_memory_limit
+      description: The number of locks that were shed because the lock table ran into memory limits
+      y_axis_label: Locks
+      type: COUNTER
+      unit: COUNT
+      aggregation: AVG
+      derivative: NON_NEGATIVE_DERIVATIVE
     - name: kv.concurrency.locks_with_wait_queues
       exported_name: kv_concurrency_locks_with_wait_queues
       description: Number of active locks held in lock tables with active wait-queues
@@ -12452,6 +12460,14 @@ layers:
       unit: COUNT
       aggregation: AVG
       derivative: NONE
+    - name: kv.concurrency.num_lock_shed_due_to_memory_limit_events
+      exported_name: kv_concurrency_num_lock_shed_due_to_memory_limit_events
+      description: The number of times the lock table shed locks because it ran into memory limits
+      y_axis_label: Lock Shed Events
+      type: COUNTER
+      unit: COUNT
+      aggregation: AVG
+      derivative: NON_NEGATIVE_DERIVATIVE
     - name: kv.loadsplitter.cleardirection
       exported_name: kv_loadsplitter_cleardirection
       description: Load-based splitter observed an access direction greater than 80% left or right in the samples.

pkg/kv/kvserver/concurrency/concurrency_manager.go

Lines changed: 9 additions & 4 deletions
@@ -186,9 +186,11 @@ type Config struct {
     Stopper        *stop.Stopper
     IntentResolver IntentResolver
     // Metrics.
-    TxnWaitMetrics     *txnwait.Metrics
-    SlowLatchGauge     *metric.Gauge
-    LatchWaitDurations metric.IHistogram
+    TxnWaitMetrics                    *txnwait.Metrics
+    SlowLatchGauge                    *metric.Gauge
+    LatchWaitDurations                metric.IHistogram
+    LocksShedDueToMemoryLimit         *metric.Counter
+    NumLockShedDueToMemoryLimitEvents *metric.Counter
    // Configs + Knobs.
    MaxLockTableSize  int64
    DisableTxnPushing bool
@@ -206,7 +208,10 @@ func NewManager(cfg Config) Manager {
    cfg.initDefaults()
    m := new(managerImpl)
    lt := maybeWrapInVerifyingLockTable(
-       newLockTable(cfg.MaxLockTableSize, cfg.RangeDesc.RangeID, cfg.Clock, cfg.Settings),
+       newLockTable(
+           cfg.MaxLockTableSize, cfg.RangeDesc.RangeID, cfg.Clock, cfg.Settings,
+           cfg.LocksShedDueToMemoryLimit, cfg.NumLockShedDueToMemoryLimitEvents,
+       ),
    )
    *m = managerImpl{
        st: cfg.Settings,

pkg/kv/kvserver/concurrency/concurrency_manager_test.go

Lines changed: 9 additions & 6 deletions
@@ -755,13 +755,16 @@ func newClusterWithSettings(st *clustersettings.Settings) *cluster {
 }
 
 func (c *cluster) makeConfig() concurrency.Config {
+   m := concurrency.TestingMakeLockTableMetricsCfg()
    return concurrency.Config{
-       NodeDesc:       c.nodeDesc,
-       RangeDesc:      c.rangeDesc,
-       Settings:       c.st,
-       Clock:          c.clock,
-       IntentResolver: c,
-       TxnWaitMetrics: txnwait.NewMetrics(time.Minute),
+       NodeDesc:                          c.nodeDesc,
+       RangeDesc:                         c.rangeDesc,
+       Settings:                          c.st,
+       Clock:                             c.clock,
+       IntentResolver:                    c,
+       TxnWaitMetrics:                    txnwait.NewMetrics(time.Minute),
+       LocksShedDueToMemoryLimit:         m.LocksShedDueToMemoryLimit,
+       NumLockShedDueToMemoryLimitEvents: m.NumLockShedDueToMemoryLimitEvents,
    }
 }
 
pkg/kv/kvserver/concurrency/lock_table.go

Lines changed: 30 additions & 9 deletions
@@ -23,6 +23,7 @@ import (
    "github.com/cockroachdb/cockroach/pkg/util/buildutil"
    "github.com/cockroachdb/cockroach/pkg/util/container/list"
    "github.com/cockroachdb/cockroach/pkg/util/hlc"
+   "github.com/cockroachdb/cockroach/pkg/util/metric"
    "github.com/cockroachdb/cockroach/pkg/util/syncutil"
    "github.com/cockroachdb/cockroach/pkg/util/uuid"
    "github.com/cockroachdb/errors"
@@ -289,17 +290,31 @@ type lockTableImpl struct {
 
    // settings provides a handle to cluster settings.
    settings *cluster.Settings
+
+   // The number of locks that are shed due to the lock table running into memory
+   // limits.
+   locksShedDueToMemoryLimit *metric.Counter
+   // The number of times the lock table ran into memory limits and shed locks as
+   // a result.
+   numLockShedDueToMemoryLimitEvents *metric.Counter
 }
 
 var _ lockTable = &lockTableImpl{}
 
 func newLockTable(
-   maxLocks int64, rangeID roachpb.RangeID, clock *hlc.Clock, settings *cluster.Settings,
+   maxLocks int64,
+   rangeID roachpb.RangeID,
+   clock *hlc.Clock,
+   settings *cluster.Settings,
+   locksShedDueToMemoryLimit *metric.Counter,
+   numLockShedDueToMemoryLimitEvents *metric.Counter,
 ) *lockTableImpl {
    lt := &lockTableImpl{
-       rID:      rangeID,
-       clock:    clock,
-       settings: settings,
+       rID:                               rangeID,
+       clock:                             clock,
+       settings:                          settings,
+       locksShedDueToMemoryLimit:         locksShedDueToMemoryLimit,
+       numLockShedDueToMemoryLimitEvents: numLockShedDueToMemoryLimitEvents,
    }
    lt.setMaxKeysLocked(maxLocks)
    return lt
@@ -4494,7 +4509,12 @@ func (t *lockTableImpl) checkMaxKeysLockedAndTryClear() {
    totalLocks := t.locks.numKeysLocked.Load()
    if totalLocks > t.maxKeysLocked {
        numToClear := totalLocks - t.minKeysLocked
-       t.tryClearLocks(false /* force */, int(numToClear))
+       numCleared := t.tryClearLocks(false /* force */, int(numToClear))
+       // Update metrics if we successfully cleared any number of locks.
+       if numCleared != 0 {
+           t.locksShedDueToMemoryLimit.Inc(numCleared)
+           t.numLockShedDueToMemoryLimitEvents.Inc(1)
+       }
    }
 }
 
@@ -4509,23 +4529,24 @@
 //
 // Waiters of removed locks are told to wait elsewhere or that they are done
 // waiting.
-func (t *lockTableImpl) tryClearLocks(force bool, numToClear int) {
-   clearCount := 0
+func (t *lockTableImpl) tryClearLocks(force bool, numToClear int) int64 {
+   var clearCount int64
    t.locks.mu.Lock()
+   defer t.locks.mu.Unlock()
    var locksToClear []*keyLocks
    iter := t.locks.MakeIter()
    for iter.First(); iter.Valid(); iter.Next() {
        l := iter.Cur()
        if l.tryClearLock(force) {
            locksToClear = append(locksToClear, l)
            clearCount++
-           if !force && clearCount >= numToClear {
+           if !force && clearCount >= int64(numToClear) {
                break
            }
        }
    }
    t.clearLocksMuLocked(locksToClear)
-   t.locks.mu.Unlock()
+   return clearCount
 }
 
 // tryClearLockGE attempts to clear all locks greater or equal to given key.
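To make the accounting above easier to follow in isolation, here is a minimal, runnable sketch of the same pattern. The table and counter types below are simplified stand-ins invented for illustration (the real code uses lockTableImpl and *metric.Counter); only the control flow mirrors the hunks above: tryClearLocks reports how many locks it shed, and the caller bumps the per-lock counter by that amount and the event counter by one, but only when something was actually shed.

// Sketch only: simplified stand-ins for illustration; not CockroachDB code.
package main

import "fmt"

// counter is a stand-in for *metric.Counter.
type counter struct{ v int64 }

func (c *counter) Inc(n int64)  { c.v += n }
func (c *counter) Count() int64 { return c.v }

// table is a stand-in for lockTableImpl, keeping only what the sketch needs.
type table struct {
    locks                             []string // keys currently locked
    maxKeysLocked                     int64
    minKeysLocked                     int64
    locksShedDueToMemoryLimit         *counter
    numLockShedDueToMemoryLimitEvents *counter
}

// tryClearLocks mirrors the new signature: it returns how many locks it shed.
func (t *table) tryClearLocks(numToClear int) int64 {
    var cleared int64
    for len(t.locks) > 0 && cleared < int64(numToClear) {
        t.locks = t.locks[1:]
        cleared++
    }
    return cleared
}

// checkMaxKeysLockedAndTryClear mirrors the caller: shed down toward
// minKeysLocked, then update both counters only if anything was shed.
func (t *table) checkMaxKeysLockedAndTryClear() {
    totalLocks := int64(len(t.locks))
    if totalLocks > t.maxKeysLocked {
        numToClear := totalLocks - t.minKeysLocked
        numCleared := t.tryClearLocks(int(numToClear))
        if numCleared != 0 {
            t.locksShedDueToMemoryLimit.Inc(numCleared)
            t.numLockShedDueToMemoryLimitEvents.Inc(1)
        }
    }
}

func main() {
    t := &table{
        locks:                             []string{"a", "b", "c", "d", "e", "f", "g"},
        maxKeysLocked:                     5,
        locksShedDueToMemoryLimit:         &counter{},
        numLockShedDueToMemoryLimitEvents: &counter{},
    }
    t.checkMaxKeysLockedAndTryClear()
    // 7 locks exceed the limit of 5, so all 7 are shed (minKeysLocked is 0):
    // the lock counter reads 7 and the event counter reads 1.
    fmt.Println(t.locksShedDueToMemoryLimit.Count(), t.numLockShedDueToMemoryLimitEvents.Count())
}

Note that, as in the real change, the event counter counts shed events rather than attempts, which is why the update is gated on numCleared != 0.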

pkg/kv/kvserver/concurrency/lock_table_test.go

Lines changed: 52 additions & 6 deletions
@@ -196,8 +196,10 @@ func TestLockTableBasic(t *testing.T) {
        case "new-lock-table":
            var maxLocks int
            d.ScanArgs(t, "maxlocks", &maxLocks)
+           m := TestingMakeLockTableMetricsCfg()
            ltImpl := newLockTable(
                int64(maxLocks), roachpb.RangeID(3), clock, cluster.MakeTestingClusterSettings(),
+               m.LocksShedDueToMemoryLimit, m.NumLockShedDueToMemoryLimitEvents,
            )
            ltImpl.enabled = true
            ltImpl.enabledSeq = 1
@@ -888,8 +890,10 @@ func newLock(txn *enginepb.TxnMeta, key roachpb.Key, str lock.Strength) *roachpb
 }
 
 func TestLockTableMaxLocks(t *testing.T) {
+   m := TestingMakeLockTableMetricsCfg()
    lt := newLockTable(
        5, roachpb.RangeID(3), hlc.NewClockForTesting(nil), cluster.MakeTestingClusterSettings(),
+       m.LocksShedDueToMemoryLimit, m.NumLockShedDueToMemoryLimitEvents,
    )
    lt.minKeysLocked = 0
    lt.enabled = true
@@ -926,6 +930,9 @@
        ID:             uuid.MakeV4(),
        WriteTimestamp: hlc.Timestamp{WallTime: 10},
    }
+   // Sanity check: both counters should be zero before we start adding locks.
+   require.Equal(t, int64(0), m.NumLockShedDueToMemoryLimitEvents.Count())
+   require.Equal(t, int64(0), m.LocksShedDueToMemoryLimit.Count())
    for i := range guards {
        for j := 0; j < 10; j++ {
            k := i*20 + j
@@ -938,11 +945,16 @@
    }
    // Only the notRemovable locks survive after addition.
    require.Equal(t, int64(10), lt.lockCountForTesting())
-   // Two guards are dequeued.
+   // The other 90 locks should be shed.
+   require.Equal(t, int64(90), m.LocksShedDueToMemoryLimit.Count())
+   require.Equal(t, int64(66), m.NumLockShedDueToMemoryLimitEvents.Count())
+   // Two guards are dequeued. This marks 2 notRemovable locks as removable.
+   // We're at 8 notRemovable locks now.
    lt.Dequeue(guards[0])
    lt.Dequeue(guards[1])
    require.Equal(t, int64(10), lt.lockCountForTesting())
-   // Two guards do ScanAndEnqueue.
+   // Two guards do ScanAndEnqueue. This marks 2 notRemovable locks as
+   // removable. We're at 6 notRemovable locks now.
    for i := 2; i < 4; i++ {
        var err *Error
        guards[i], err = lt.ScanAndEnqueue(reqs[i], guards[i])
@@ -958,6 +970,10 @@
    require.NoError(t, err)
    // The 6 notRemovable locks remain.
    require.Equal(t, int64(6), lt.lockCountForTesting())
+   // NB: 4 locks that became removable are cleared + the lock that was added by
+   // AddDiscoveredLock, taking the total number of locks removed to 5.
+   require.Equal(t, int64(95), m.LocksShedDueToMemoryLimit.Count())
+   require.Equal(t, int64(67), m.NumLockShedDueToMemoryLimitEvents.Count())
    require.Equal(t, int64(101), int64(lt.locks.lockIDSeqNum))
    // Add another discovered lock, to trigger tryClearLocks.
    added, err = lt.AddDiscoveredLock(
@@ -967,7 +983,9 @@
    require.NoError(t, err)
    // Still the 6 notRemovable locks remain.
    require.Equal(t, int64(6), lt.lockCountForTesting())
-   require.Equal(t, int64(102), int64(lt.locks.lockIDSeqNum))
+   // NB: We cleared the lock added by AddDiscoveredLock above.
+   require.Equal(t, int64(96), m.LocksShedDueToMemoryLimit.Count())
+   require.Equal(t, int64(68), m.NumLockShedDueToMemoryLimitEvents.Count())
    // Two more guards are dequeued, so we are down to 4 notRemovable locks.
    lt.Dequeue(guards[4])
    lt.Dequeue(guards[5])
@@ -981,6 +999,9 @@
    require.NoError(t, err)
    // This notRemovable=false lock is also added, since enforcement not done.
    require.Equal(t, int64(7), lt.lockCountForTesting())
+   // Metrics shouldn't change as enforcement wasn't done.
+   require.Equal(t, int64(96), m.LocksShedDueToMemoryLimit.Count())
+   require.Equal(t, int64(68), m.NumLockShedDueToMemoryLimitEvents.Count())
    // Add another discovered lock, to trigger tryClearLocks.
    added, err = lt.AddDiscoveredLock(
        newLock(&txnMeta, keys[9*20+13], lock.Intent),
@@ -989,10 +1010,14 @@
    require.NoError(t, err)
    // Now enforcement is done, so only 4 remain.
    require.Equal(t, int64(4), lt.lockCountForTesting())
+   // We made 2 locks removable above + added 2 locks by calling
+   // AddDiscoveredLock, resulting in 4 locks being cleared in total.
+   require.Equal(t, int64(100), m.LocksShedDueToMemoryLimit.Count())
+   require.Equal(t, int64(69), m.NumLockShedDueToMemoryLimitEvents.Count())
    // Bump down the enforcement interval manually, and bump up minKeysLocked.
    lt.locks.lockAddMaxLocksCheckInterval = 1
    lt.minKeysLocked = 2
-   // Three more guards dequeued.
+   // Three more guards dequeued. Now only 1 lock is notRemovable.
    lt.Dequeue(guards[6])
    lt.Dequeue(guards[7])
    lt.Dequeue(guards[8])
@@ -1003,6 +1028,10 @@
    require.True(t, added)
    require.NoError(t, err)
    require.Equal(t, int64(5), lt.lockCountForTesting())
+   // NB: We're allowed 5 locks. We won't trigger tryClearLocks and the metrics
+   // should reflect this.
+   require.Equal(t, int64(100), m.LocksShedDueToMemoryLimit.Count())
+   require.Equal(t, int64(69), m.NumLockShedDueToMemoryLimitEvents.Count())
    // Add another discovered lock, to trigger tryClearLocks, and push us over 5
    // locks.
    added, err = lt.AddDiscoveredLock(
@@ -1013,6 +1042,9 @@
    // Enforcement keeps the 1 notRemovable lock, and another, since
    // minKeysLocked=2.
    require.Equal(t, int64(2), lt.lockCountForTesting())
+   // Which means that in total, 4 more locks are cleared.
+   require.Equal(t, int64(104), m.LocksShedDueToMemoryLimit.Count())
+   require.Equal(t, int64(70), m.NumLockShedDueToMemoryLimitEvents.Count())
    // Restore minKeysLocked to 0.
    lt.minKeysLocked = 0
    // Add locks to push us over 5 locks.
@@ -1025,13 +1057,19 @@
    }
    // Only the 1 notRemovable lock remains.
    require.Equal(t, int64(1), lt.lockCountForTesting())
+   // Locks were cleared once we got to 6 locks; 5 of them were cleared, since
+   // one of them is notRemovable.
+   require.Equal(t, int64(109), m.LocksShedDueToMemoryLimit.Count())
+   require.Equal(t, int64(71), m.NumLockShedDueToMemoryLimitEvents.Count())
 }
 
 // TestLockTableMaxLocksWithMultipleNotRemovableRefs tests the notRemovable
 // ref counting.
 func TestLockTableMaxLocksWithMultipleNotRemovableRefs(t *testing.T) {
+   m := TestingMakeLockTableMetricsCfg()
    lt := newLockTable(
        2, roachpb.RangeID(3), hlc.NewClockForTesting(nil), cluster.MakeTestingClusterSettings(),
+       m.LocksShedDueToMemoryLimit, m.NumLockShedDueToMemoryLimitEvents,
    )
    lt.minKeysLocked = 0
    lt.enabled = true
@@ -1316,7 +1354,9 @@ func newWorkLoadExecutor(items []workloadItem, concurrency int) *workloadExecuto
        nil, /* latchWaitDurations */
        clock,
    )
-   ltImpl := newLockTable(maxLocks, roachpb.RangeID(3), clock, settings)
+   m := TestingMakeLockTableMetricsCfg()
+   ltImpl := newLockTable(maxLocks, roachpb.RangeID(3), clock, settings,
+       m.LocksShedDueToMemoryLimit, m.NumLockShedDueToMemoryLimitEvents)
    ltImpl.enabled = true
    lt := maybeWrapInVerifyingLockTable(ltImpl)
    ex := &workloadExecutor{
@@ -2018,7 +2058,10 @@ func BenchmarkLockTable(b *testing.B) {
    lm := spanlatch.Make(
        nil /* stopper */, nil /* slowReqs */, settings, nil /* latchWaitDurations */, clock,
    )
-   lt := newLockTable(maxLocks, roachpb.RangeID(3), clock, settings)
+   m := TestingMakeLockTableMetricsCfg()
+   lt := newLockTable(maxLocks, roachpb.RangeID(3), clock, settings,
+       m.LocksShedDueToMemoryLimit, m.NumLockShedDueToMemoryLimitEvents,
+   )
    lt.enabled = true
    env := benchEnv{
        lm: &lm,
@@ -2058,11 +2101,14 @@ func BenchmarkLockTableMetrics(b *testing.B) {
    for _, locks := range []int{0, 1 << 0, 1 << 4, 1 << 8, 1 << 12} {
        b.Run(fmt.Sprintf("locks=%d", locks), func(b *testing.B) {
            const maxLocks = 100000
+           m := TestingMakeLockTableMetricsCfg()
            lt := newLockTable(
                maxLocks,
                roachpb.RangeID(3),
                hlc.NewClockForTesting(nil),
                cluster.MakeTestingClusterSettings(),
+               m.LocksShedDueToMemoryLimit,
+               m.NumLockShedDueToMemoryLimitEvents,
            )
            lt.enabled = true
 
pkg/kv/kvserver/concurrency/metrics.go

Lines changed: 31 additions & 0 deletions
@@ -8,6 +8,7 @@ package concurrency
 import (
    "github.com/cockroachdb/cockroach/pkg/kv/kvserver/spanlatch"
    "github.com/cockroachdb/cockroach/pkg/roachpb"
+   "github.com/cockroachdb/cockroach/pkg/util/metric"
 )
 
 // LatchMetrics holds information about the state of a latchManager.
@@ -127,3 +128,33 @@ func addToTopK(topK []LockMetrics, lm LockMetrics, cmp func(LockMetrics) int64)
        }
    }
 }
+
+var MetaConcurrencyLocksShedDueToMemoryLimit = metric.Metadata{
+   Name:        "kv.concurrency.locks_shed_due_to_memory_limit",
+   Help:        "The number of locks that were shed because the lock table ran into memory limits",
+   Measurement: "Locks",
+   Unit:        metric.Unit_COUNT,
+}
+
+var MetaConcurrencyNumLockShedDueToMemoryLimitEvents = metric.Metadata{
+   Name:        "kv.concurrency.num_lock_shed_due_to_memory_limit_events",
+   Help:        "The number of times the lock table shed locks because it ran into memory limits",
+   Measurement: "Lock Shed Events",
+   Unit:        metric.Unit_COUNT,
+}
+
+// TestingLockTableMetricsCfg is a subset of store metrics that are required to
+// construct a new lock table to be used for testing purposes.
+type TestingLockTableMetricsCfg struct {
+   LocksShedDueToMemoryLimit         *metric.Counter
+   NumLockShedDueToMemoryLimitEvents *metric.Counter
+}
+
+// TestingMakeLockTableMetricsCfg returns a new TestingLockTableMetricsCfg for
+// testing.
+func TestingMakeLockTableMetricsCfg() TestingLockTableMetricsCfg {
+   return TestingLockTableMetricsCfg{
+       LocksShedDueToMemoryLimit:         metric.NewCounter(MetaConcurrencyLocksShedDueToMemoryLimit),
+       NumLockShedDueToMemoryLimitEvents: metric.NewCounter(MetaConcurrencyNumLockShedDueToMemoryLimitEvents),
+   }
+}
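As a usage note, this helper is how the tests in this commit hand counters to newLockTable without a full store metrics struct. Below is a minimal sketch of that wiring, assuming it sits in package concurrency alongside the code above; the test name and scaffolding are illustrative, not part of the commit.

// Sketch only: illustrative wiring of the testing counters into a lock table.
func TestLockShedMetricsWiringSketch(t *testing.T) {
    m := TestingMakeLockTableMetricsCfg()
    lt := newLockTable(
        5 /* maxLocks */, roachpb.RangeID(3), hlc.NewClockForTesting(nil),
        cluster.MakeTestingClusterSettings(),
        m.LocksShedDueToMemoryLimit, m.NumLockShedDueToMemoryLimitEvents,
    )
    lt.enabled = true

    // Nothing has been shed yet, so both counters read zero.
    require.Equal(t, int64(0), m.LocksShedDueToMemoryLimit.Count())
    require.Equal(t, int64(0), m.NumLockShedDueToMemoryLimitEvents.Count())
    // Once the table sheds locks (see TestLockTableMaxLocks above), the event
    // counter rises by 1 per shed event and the lock counter by the number of
    // locks shed in that event.
}

The same two counters are what NewManager receives through Config.LocksShedDueToMemoryLimit and Config.NumLockShedDueToMemoryLimitEvents in the concurrency_manager.go hunk above.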
