Skip to content

Commit 8de4c80

Browse files
contention: periodically log resolved contention events
This commit adds logging to the flushAndResolve method in the contention event store. This method is called periodically at an interval defined by the (non-public) cluster setting sql.contention.event_store.resolution_interval. By logging aggregated contention event information, users now have the ability to find which waiting statements/txns are experiencing high contention through logs (previously this was only available via insights or SQL activity). Additionally, this aggregated event information contains the blocking txn and contended key - information that was previously only available via querying crdb_internal.transaction_contention_events. The logs are aggregated by a composite identifier of (waiting stmt, waiting txn, blocking txn, key), and the total contention time observed for that combination. They are structured logs. For example: ``` {"Timestamp":1756224167482848000,"EventType":"aggregated_contention_info", "WaitingStmtFingerprintId":"\\x000000000000007b", "WaitingTxnFingerprintId":"\\x00000000000001c8", "BlockingTxnFingerprintId":"\\x0000000000000315", "ContendedKey":"test-key-1","Duration":300000000} ``` Since this information is available via logs, it provides a solution for users who desire more persistence of contention event information. Part of: https://cockroachlabs.atlassian.net/browse/CRDB-53592 Release note (ops change): Adds logging of contention events.
1 parent f5c0bbd commit 8de4c80

File tree

12 files changed

+367
-4
lines changed

12 files changed

+367
-4
lines changed

docs/generated/eventlog.md

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -409,6 +409,35 @@ is shut down inside the same process as the KV layer.
409409
| `InstanceID` | The ID of the server instance. | no |
410410
| `TenantName` | The name of the tenant at the time the event was emitted. | yes |
411411

412+
## Contention events
413+
414+
Aggregated information about contention events.
415+
416+
Events in this category are logged to the `SQL_EXEC` channel.
417+
418+
419+
### `aggregated_contention_info`
420+
421+
An event of type `aggregated_contention_info` is recorded periodically when contention events
422+
are resolved.
423+
424+
425+
| Field | Description | Sensitive |
426+
|--|--|--|
427+
| `WaitingStmtFingerprintId` | | no |
428+
| `WaitingTxnFingerprintId` | | no |
429+
| `BlockingTxnFingerprintId` | | no |
430+
| `ContendedKey` | | partially |
431+
| `Duration` | | no |
432+
433+
434+
#### Common fields
435+
436+
| Field | Description | Sensitive |
437+
|--|--|--|
438+
| `Timestamp` | The timestamp of the event. Expressed as nanoseconds since the Unix epoch. | no |
439+
| `EventType` | The type of the event. | no |
440+
412441
## Debugging events
413442

414443
Events in this category pertain to debugging operations performed by

pkg/sql/contention/BUILD.bazel

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,15 +23,19 @@ go_library(
2323
"//pkg/sql/catalog/descpb",
2424
"//pkg/sql/contention/contentionutils",
2525
"//pkg/sql/contentionpb",
26+
"//pkg/sql/sqlstats/persistedsqlstats/sqlstatsutil",
2627
"//pkg/util/cache",
2728
"//pkg/util/log",
29+
"//pkg/util/log/eventpb",
30+
"//pkg/util/log/logpb",
2831
"//pkg/util/metric",
2932
"//pkg/util/stop",
3033
"//pkg/util/syncutil",
3134
"//pkg/util/timeutil",
3235
"//pkg/util/uuid",
3336
"@com_github_biogo_store//llrb",
3437
"@com_github_cockroachdb_errors//:errors",
38+
"@com_github_cockroachdb_redact//:redact",
3539
],
3640
)
3741

@@ -53,18 +57,24 @@ go_test(
5357
"//pkg/settings/cluster",
5458
"//pkg/sql/appstatspb",
5559
"//pkg/sql/contentionpb",
60+
"//pkg/sql/sqlstats/persistedsqlstats/sqlstatsutil",
5661
"//pkg/storage/enginepb",
5762
"//pkg/testutils",
5863
"//pkg/testutils/datapathutils",
5964
"//pkg/util/cache",
6065
"//pkg/util/encoding",
6166
"//pkg/util/leaktest",
67+
"//pkg/util/log",
68+
"//pkg/util/log/eventpb",
69+
"//pkg/util/log/logpb",
70+
"//pkg/util/log/logtestutils",
6271
"//pkg/util/randutil",
6372
"//pkg/util/stop",
6473
"//pkg/util/timeutil",
6574
"//pkg/util/uuid",
6675
"@com_github_cockroachdb_datadriven//:datadriven",
6776
"@com_github_cockroachdb_errors//:errors",
77+
"@com_github_cockroachdb_redact//:redact",
6878
"@com_github_stretchr_testify//require",
6979
],
7080
)

pkg/sql/contention/event_store.go

Lines changed: 46 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,18 @@ import (
1313
"time"
1414

1515
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
16+
"github.com/cockroachdb/cockroach/pkg/sql/appstatspb"
1617
"github.com/cockroachdb/cockroach/pkg/sql/contention/contentionutils"
1718
"github.com/cockroachdb/cockroach/pkg/sql/contentionpb"
19+
"github.com/cockroachdb/cockroach/pkg/sql/sqlstats/persistedsqlstats/sqlstatsutil"
1820
"github.com/cockroachdb/cockroach/pkg/util/cache"
1921
"github.com/cockroachdb/cockroach/pkg/util/log"
22+
"github.com/cockroachdb/cockroach/pkg/util/log/eventpb"
23+
"github.com/cockroachdb/cockroach/pkg/util/log/logpb"
2024
"github.com/cockroachdb/cockroach/pkg/util/stop"
2125
"github.com/cockroachdb/cockroach/pkg/util/syncutil"
2226
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
27+
"github.com/cockroachdb/redact"
2328
)
2429

2530
const (
@@ -296,14 +301,14 @@ func (s *eventStore) getEventByEventHash(
296301
}
297302

298303
// flushAndResolve is the main method called by the resolver goroutine each
299-
// time the timer fires. This method does two things:
304+
// time the timer fires. This method does three things:
300305
// 1. it triggers the batching buffer to flush its content into the intake
301306
// goroutine. This is to ensure that in the case where we have very low
302307
// rate of contentions, the contention events won't be permanently trapped
303308
// in the batching buffer.
304309
// 2. it invokes the dequeue() method on the resolverQueue. This cause the
305310
// resolver to perform txnID resolution. See inline comments on the method
306-
// for details.
311+
// 3. Lastly, it logs the resolved events.
307312
func (s *eventStore) flushAndResolve(ctx context.Context) error {
308313
// This forces the write-buffer flushes its batch into the intake goroutine.
309314
// The intake goroutine will asynchronously add all events in the batch
@@ -320,6 +325,9 @@ func (s *eventStore) flushAndResolve(ctx context.Context) error {
320325
// before we bubble up the error.
321326
s.upsertBatch(result)
322327

328+
// Aggregate the resolved event information for logging.
329+
logResolvedEvents(ctx, result)
330+
323331
return err
324332
}
325333

@@ -348,6 +356,42 @@ func (s *eventStore) resolutionIntervalWithJitter() time.Duration {
348356
return jitteredInterval
349357
}
350358

359+
// contentionKey is used as a key in the eventsAggregated map to group
360+
// contention events by their characteristics.
361+
type contentionKey struct {
362+
waitingStmtFingerprintID appstatspb.StmtFingerprintID
363+
waitingTxnFingerprintID appstatspb.TransactionFingerprintID
364+
blockingTxnFingerprintID appstatspb.TransactionFingerprintID
365+
contendedKey redact.RedactableString
366+
}
367+
368+
func logResolvedEvents(ctx context.Context, events []contentionpb.ExtendedContentionEvent) {
369+
eventsAggregated := make(map[contentionKey]time.Duration)
370+
for _, event := range events {
371+
// Create a struct key instead of a composite string
372+
key := contentionKey{
373+
waitingStmtFingerprintID: event.WaitingStmtFingerprintID,
374+
waitingTxnFingerprintID: event.WaitingTxnFingerprintID,
375+
blockingTxnFingerprintID: event.BlockingTxnFingerprintID,
376+
contendedKey: redact.Sprint(event.BlockingEvent.Key),
377+
}
378+
// Add the contention time to the existing total for this key combination
379+
eventsAggregated[key] += event.BlockingEvent.Duration
380+
}
381+
382+
for key, duration := range eventsAggregated {
383+
// Create an AggregatedContentionInfo event for structured logging
384+
event := &eventpb.AggregatedContentionInfo{
385+
WaitingStmtFingerprintId: "\\x" + sqlstatsutil.EncodeStmtFingerprintIDToString(key.waitingStmtFingerprintID),
386+
WaitingTxnFingerprintId: "\\x" + sqlstatsutil.EncodeTxnFingerprintIDToString(key.waitingTxnFingerprintID),
387+
BlockingTxnFingerprintId: "\\x" + sqlstatsutil.EncodeTxnFingerprintIDToString(key.blockingTxnFingerprintID),
388+
ContendedKey: key.contendedKey,
389+
Duration: duration.Nanoseconds(),
390+
}
391+
log.StructuredEvent(ctx, logpb.Severity_INFO, event)
392+
}
393+
}
394+
351395
func entryBytes(event *contentionpb.ExtendedContentionEvent) int {
352396
// Since we store the event's hash as the key to the unordered cache,
353397
// this is means we are storing another copy of uint64 (8 bytes).

pkg/sql/contention/event_store_test.go

Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,18 @@ import (
1919
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
2020
"github.com/cockroachdb/cockroach/pkg/sql/appstatspb"
2121
"github.com/cockroachdb/cockroach/pkg/sql/contentionpb"
22+
"github.com/cockroachdb/cockroach/pkg/sql/sqlstats/persistedsqlstats/sqlstatsutil"
2223
"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
2324
"github.com/cockroachdb/cockroach/pkg/testutils"
25+
"github.com/cockroachdb/cockroach/pkg/util/log"
26+
"github.com/cockroachdb/cockroach/pkg/util/log/eventpb"
27+
"github.com/cockroachdb/cockroach/pkg/util/log/logpb"
28+
"github.com/cockroachdb/cockroach/pkg/util/log/logtestutils"
2429
"github.com/cockroachdb/cockroach/pkg/util/stop"
2530
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
2631
"github.com/cockroachdb/cockroach/pkg/util/uuid"
2732
"github.com/cockroachdb/errors"
33+
"github.com/cockroachdb/redact"
2834
"github.com/stretchr/testify/require"
2935
)
3036

@@ -249,6 +255,171 @@ func BenchmarkEventStoreIntake(b *testing.B) {
249255
}
250256
}
251257

258+
func TestLogResolvedEvents(t *testing.T) {
259+
ctx := context.Background()
260+
261+
logCommonDetails := logpb.CommonEventDetails{
262+
Timestamp: timeutil.Now().UnixNano(),
263+
EventType: "aggregated_contention_info",
264+
}
265+
// Set up log interception to capture structured AggregatedContentionInfo events
266+
spy := logtestutils.NewStructuredLogSpy(
267+
t,
268+
[]logpb.Channel{logpb.Channel_SQL_EXEC},
269+
[]string{"aggregated_contention_info"},
270+
func(entry logpb.Entry) (eventpb.AggregatedContentionInfo, error) {
271+
le, err := logtestutils.FromLogEntry[eventpb.AggregatedContentionInfo](entry)
272+
if err != nil {
273+
return le, err
274+
}
275+
le.CommonEventDetails = logCommonDetails
276+
return le, nil
277+
},
278+
)
279+
cleanup := log.InterceptWith(ctx, spy)
280+
defer cleanup()
281+
282+
fingerprintIDToHex := func(id uint64) string {
283+
return "\\x" + sqlstatsutil.EncodeStmtFingerprintIDToString(appstatspb.StmtFingerprintID(id))
284+
}
285+
txnFingerprintIDToHex := func(id uint64) string {
286+
return "\\x" + sqlstatsutil.EncodeTxnFingerprintIDToString(appstatspb.TransactionFingerprintID(id))
287+
}
288+
289+
type testCase struct {
290+
events []contentionpb.ExtendedContentionEvent
291+
expected []eventpb.AggregatedContentionInfo
292+
}
293+
294+
testCases := []testCase{
295+
{
296+
// Two events with same waiting (stmt, txn), blocking txn and key
297+
events: []contentionpb.ExtendedContentionEvent{
298+
{
299+
WaitingStmtFingerprintID: 123,
300+
WaitingTxnFingerprintID: 456,
301+
BlockingTxnFingerprintID: 789,
302+
BlockingEvent: kvpb.ContentionEvent{
303+
Key: []byte("test-key-1"),
304+
Duration: 100 * time.Millisecond,
305+
},
306+
},
307+
{
308+
WaitingStmtFingerprintID: 123,
309+
WaitingTxnFingerprintID: 456,
310+
BlockingTxnFingerprintID: 789,
311+
BlockingEvent: kvpb.ContentionEvent{
312+
Key: []byte("test-key-1"),
313+
Duration: 200 * time.Millisecond,
314+
},
315+
},
316+
},
317+
expected: []eventpb.AggregatedContentionInfo{
318+
{
319+
CommonEventDetails: logCommonDetails,
320+
WaitingStmtFingerprintId: fingerprintIDToHex(123),
321+
WaitingTxnFingerprintId: txnFingerprintIDToHex(456),
322+
BlockingTxnFingerprintId: txnFingerprintIDToHex(789),
323+
ContendedKey: redact.Sprint("\"test-key-1\""),
324+
Duration: 300000000,
325+
},
326+
},
327+
},
328+
{
329+
// Different key.
330+
events: []contentionpb.ExtendedContentionEvent{
331+
{
332+
WaitingStmtFingerprintID: 123,
333+
WaitingTxnFingerprintID: 456,
334+
BlockingTxnFingerprintID: 789,
335+
BlockingEvent: kvpb.ContentionEvent{
336+
Key: []byte("test-key-1"),
337+
Duration: 100 * time.Millisecond,
338+
},
339+
},
340+
{
341+
WaitingStmtFingerprintID: 123,
342+
WaitingTxnFingerprintID: 456,
343+
BlockingTxnFingerprintID: 789,
344+
BlockingEvent: kvpb.ContentionEvent{
345+
Key: []byte("test-key-2"),
346+
Duration: 200 * time.Millisecond,
347+
},
348+
},
349+
},
350+
expected: []eventpb.AggregatedContentionInfo{
351+
{
352+
CommonEventDetails: logCommonDetails,
353+
WaitingStmtFingerprintId: fingerprintIDToHex(123),
354+
WaitingTxnFingerprintId: txnFingerprintIDToHex(456),
355+
BlockingTxnFingerprintId: txnFingerprintIDToHex(789),
356+
ContendedKey: redact.Sprint("\"test-key-1\""),
357+
Duration: 100000000,
358+
},
359+
{
360+
CommonEventDetails: logCommonDetails,
361+
WaitingStmtFingerprintId: fingerprintIDToHex(123),
362+
WaitingTxnFingerprintId: txnFingerprintIDToHex(456),
363+
BlockingTxnFingerprintId: txnFingerprintIDToHex(789),
364+
ContendedKey: redact.Sprint("\"test-key-2\""),
365+
Duration: 200000000,
366+
},
367+
},
368+
},
369+
{
370+
// Different fingerprint IDs.
371+
events: []contentionpb.ExtendedContentionEvent{
372+
{
373+
WaitingStmtFingerprintID: 123,
374+
WaitingTxnFingerprintID: 456,
375+
BlockingTxnFingerprintID: 789,
376+
BlockingEvent: kvpb.ContentionEvent{
377+
Key: []byte("test-key-1"),
378+
Duration: 100 * time.Millisecond,
379+
},
380+
},
381+
{
382+
WaitingStmtFingerprintID: 123,
383+
WaitingTxnFingerprintID: 654,
384+
BlockingTxnFingerprintID: 789,
385+
BlockingEvent: kvpb.ContentionEvent{
386+
Key: []byte("test-key-1"),
387+
Duration: 200 * time.Millisecond,
388+
},
389+
},
390+
},
391+
expected: []eventpb.AggregatedContentionInfo{
392+
{
393+
CommonEventDetails: logCommonDetails,
394+
WaitingStmtFingerprintId: fingerprintIDToHex(123),
395+
WaitingTxnFingerprintId: txnFingerprintIDToHex(456),
396+
BlockingTxnFingerprintId: txnFingerprintIDToHex(789),
397+
ContendedKey: redact.Sprint("\"test-key-1\""),
398+
Duration: 100000000,
399+
},
400+
{
401+
CommonEventDetails: logCommonDetails,
402+
WaitingStmtFingerprintId: fingerprintIDToHex(123),
403+
WaitingTxnFingerprintId: txnFingerprintIDToHex(654),
404+
BlockingTxnFingerprintId: txnFingerprintIDToHex(789),
405+
ContendedKey: redact.Sprint("\"test-key-1\""),
406+
Duration: 200000000,
407+
},
408+
},
409+
},
410+
}
411+
412+
for _, tc := range testCases {
413+
spy.Reset()
414+
logResolvedEvents(ctx, tc.events)
415+
contentionLogs := spy.GetLogs(logpb.Channel_SQL_EXEC)
416+
require.Len(t, contentionLogs, len(tc.expected))
417+
for _, exepectedContentionLog := range tc.expected {
418+
require.Contains(t, contentionLogs, exepectedContentionLog)
419+
}
420+
}
421+
}
422+
252423
func eventSliceToMap(
253424
events []contentionpb.ExtendedContentionEvent,
254425
) map[uuid.UUID]contentionpb.ExtendedContentionEvent {

pkg/sql/sqlstats/persistedsqlstats/sqlstatsutil/json_encoding.go

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -455,6 +455,12 @@ func EncodeUint64ToBytes(id uint64) []byte {
455455
return encoding.EncodeUint64Ascending(result, id)
456456
}
457457

458-
func encodeStmtFingerprintIDToString(id appstatspb.StmtFingerprintID) string {
458+
// EncodeStmtFingerprintIDToString returns the hex string representation of a statement fingerprint ID.
459+
func EncodeStmtFingerprintIDToString(id appstatspb.StmtFingerprintID) string {
460+
return hex.EncodeToString(EncodeUint64ToBytes(uint64(id)))
461+
}
462+
463+
// EncodeTxnFingerprintIDToString returns the hex string representation of a transaction fingerprint ID.
464+
func EncodeTxnFingerprintIDToString(id appstatspb.TransactionFingerprintID) string {
459465
return hex.EncodeToString(EncodeUint64ToBytes(uint64(id)))
460466
}

pkg/sql/sqlstats/persistedsqlstats/sqlstatsutil/json_impl.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -289,7 +289,7 @@ func (s *stmtFingerprintID) decodeJSON(js json.JSON) error {
289289

290290
func (s *stmtFingerprintID) encodeJSON() (json.JSON, error) {
291291
return json.FromString(
292-
encodeStmtFingerprintIDToString((appstatspb.StmtFingerprintID)(*s))), nil
292+
EncodeStmtFingerprintIDToString((appstatspb.StmtFingerprintID)(*s))), nil
293293
}
294294

295295
type innerTxnStats appstatspb.TransactionStatistics

pkg/util/log/eventpb/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ proto_library(
4949
srcs = [
5050
"changefeed_events.proto",
5151
"cluster_events.proto",
52+
"contention_events.proto",
5253
"ddl_events.proto",
5354
"debug_events.proto",
5455
"events.proto",

pkg/util/log/eventpb/PROTOS.bzl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ EVENTPB_PROTOS = [
2020
"storage_events.proto",
2121
"telemetry.proto",
2222
"changefeed_events.proto",
23+
"contention_events.proto",
2324
]
2425

2526
EVENTPB_PROTO_DEPS = [ "//pkg/util/log/logpb:event.proto", ] + EVENTPB_PROTOS

0 commit comments

Comments
 (0)