Skip to content

Commit f5a38d8

Browse files
craig[bot]iskettaneh
andcommitted
Merge #153816
153816: kvprober: log traced planning query r=iskettaneh a=iskettaneh This commit adds a cluster setting kv.prober.log_tracing_on_plan_failure.enabled. If enabled, along with kv.prober.tracing.enabled, the read/write probers will log the traced planning query (targetting Meta2) if there is an error in the planning phase. Example of the output log: ``` E250919 20:38:53.527962 2486 2@kv/kvprober/kvprober.go:385 ⋮ [T1,Vsystem,n1,kvprober] 985 can't make a plan: failed to get meta2 rows: operation "db.Scan" timed out after 1ms (given timeout 1ms): aborted in DistSender: result is ambiguous: context deadline exceeded, recorded trace: 0.000ms 0.000ms === operation:read probe gid:2486 _verbose:‹1› node:‹1› kvprober: E250919 20:38:53.527962 2486 2@kv/kvprober/kvprober.go:385 ⋮ [T1,Vsystem,n1,kvprober] 985 + 0.000ms 0.000ms [kvprober.meta2-scan: {count: 1, duration 1ms}] E250919 20:38:53.527962 2486 2@kv/kvprober/kvprober.go:385 ⋮ [T1,Vsystem,n1,kvprober] 985 + 0.000ms 0.000ms [dist sender send: {count: 1, duration 1ms}] E250919 20:38:53.527962 2486 2@kv/kvprober/kvprober.go:385 ⋮ [T1,Vsystem,n1,kvprober] 985 + 0.050ms 0.050ms event:kv/kvprober/planner.go:223 [n1,kvprober] Meta2 start scan E250919 20:38:53.527962 2486 2@kv/kvprober/kvprober.go:385 ⋮ [T1,Vsystem,n1,kvprober] 985 + 0.069ms 0.019ms === operation:kvprober.meta2-scan gid:2486 _verbose:‹1› node:‹1› kvprober: E250919 20:38:53.527962 2486 2@kv/kvprober/kvprober.go:385 ⋮ [T1,Vsystem,n1,kvprober] 985 + 0.069ms 0.000ms [dist sender send: {count: 1, duration 1ms}] E250919 20:38:53.527962 2486 2@kv/kvprober/kvprober.go:385 ⋮ [T1,Vsystem,n1,kvprober] 985 + 0.081ms 0.012ms === operation:dist sender send gid:2486 _verbose:‹1› node:‹1› kvprober: E250919 20:38:53.527962 2486 2@kv/kvprober/kvprober.go:385 ⋮ [T1,Vsystem,n1,kvprober] 985 + 0.118ms 0.037ms event:kv/kvclient/kvcoord/range_iter.go:183 [n1,kvprober] querying next range at /Meta2/Table/106/1 /‹-5113857622831569008›/‹NULL› E250919 20:38:53.527962 2486 2@kv/kvprober/kvprober.go:385 ⋮ [T1,Vsystem,n1,kvprober] 985 + 0.161ms 0.043ms event:kv/kvclient/kvcoord/range_iter.go:220 [n1,kvprober] key: /Meta2/Table/106/1/‹-511385762283156 9008›/‹NULL›, desc: r1:/{Min-System/NodeLiveness} [(n1,s1):1, (n2,s2):2, (n3,s3):3, next=4, gen=4] E250919 20:38:53.527962 2486 2@kv/kvprober/kvprober.go:385 ⋮ [T1,Vsystem,n1,kvprober] 985 + 0.200ms 0.039ms event:kv/kvclient/kvcoord/dist_sender.go:2662 [n1,kvprober] r1: sending batch ‹1 Scan› to (n3,s3):3 E250919 20:38:53.527962 2486 2@kv/kvprober/kvprober.go:385 ⋮ [T1,Vsystem,n1,kvprober] 985 + 0.214ms 0.014ms event:rpc/nodedialer/nodedialer.go:230 [n1,kvprober] sending request to ‹10.142.0.124:26257› E250919 20:38:53.527962 2486 2@kv/kvprober/kvprober.go:385 ⋮ [T1,Vsystem,n1,kvprober] 985 + 0.224ms 0.010ms event:kv/kvclient/kvcoord/transport.go:210 [n1,kvprober] ‹sending batch request› E250919 20:38:53.527962 2486 2@kv/kvprober/kvprober.go:385 ⋮ [T1,Vsystem,n1,kvprober] 985 + 1.175ms 0.951ms event:kv/kvclient/kvcoord/transport.go:212 [n1,kvprober] ‹received batch response› E250919 20:38:53.527962 2486 2@kv/kvprober/kvprober.go:385 ⋮ [T1,Vsystem,n1,kvprober] 985 + 1.277ms 0.102ms event:kv/kvclient/kvcoord/dist_sender.go:2813 [n1,kvprober] RPC error: ba: ‹Scan [/Meta2/Table/106/ 1/-5113857622831569008/NULL,/Meta2/"\xff\xff\x00"), [max_span_request_keys: 100], [target_bytes: 0]› RPC error: context deadline exceeded E250919 20:38:53.527962 2486 2@kv/kvprober/kvprober.go:385 ⋮ [T1,Vsystem,n1,kvprober] 985 + 1.303ms 0.026ms event:kv/kvclient/kvcoord/dist_sender.go:3137 [n1,kvprober] context deadline exceeded E250919 20:38:53.527962 2486 2@kv/kvprober/kvprober.go:385 ⋮ [T1,Vsystem,n1,kvprober] 985 + 1.366ms 0.063ms event:kv/kvclient/kvcoord/dist_sender.go:2331 [n1,kvprober] replace error context deadline exceeded with aborted in DistSender: result is ambiguous: context deadline exceeded ``` Epic: None Release note: None Co-authored-by: Ibrahim Kettaneh <[email protected]>
2 parents 3ac1c85 + 99cfb5b commit f5a38d8

File tree

2 files changed

+19
-2
lines changed

2 files changed

+19
-2
lines changed

pkg/kv/kvprober/kvprober.go

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -368,6 +368,7 @@ func (p *Prober) readProbeImpl(ctx context.Context, ops proberOpsI, txns proberT
368368
var probeCtx = ctx
369369

370370
isTracingEnabled := tracingEnabled.Get(&p.settings.SV)
371+
logTracingOnPlanFailureEnabled := isTracingEnabled && logTracingOnPlanFailureEnabled.Get(&p.settings.SV)
371372
if isTracingEnabled {
372373
probeCtx, finishAndGetRecording = tracing.ContextWithRecordingSpan(ctx, p.tracer, "read probe")
373374
}
@@ -380,7 +381,11 @@ func (p *Prober) readProbeImpl(ctx context.Context, ops proberOpsI, txns proberT
380381
if errorIsExpectedDuringNormalOperation(err) {
381382
log.Health.Warningf(ctx, "making a plan failed with expected error: %v", err)
382383
} else {
383-
log.Health.Errorf(ctx, "can't make a plan: %v", err)
384+
if logTracingOnPlanFailureEnabled {
385+
log.Health.Errorf(ctx, "can't make a plan: %v, recorded trace: %s", err, finishAndGetRecording())
386+
} else {
387+
log.Health.Errorf(ctx, "can't make a plan: %v", err)
388+
}
384389
p.metrics.ProbePlanFailures.Inc(1)
385390
}
386391
return
@@ -453,6 +458,7 @@ func (p *Prober) writeProbeImpl(ctx context.Context, ops proberOpsI, txns prober
453458
var probeCtx = ctx
454459

455460
isTracingEnabled := tracingEnabled.Get(&p.settings.SV)
461+
logTracingOnPlanFailureEnabled := isTracingEnabled && logTracingOnPlanFailureEnabled.Get(&p.settings.SV)
456462
if isTracingEnabled {
457463
probeCtx, finishAndGetRecording = tracing.ContextWithRecordingSpan(ctx, p.tracer, "write probe")
458464
}
@@ -467,7 +473,11 @@ func (p *Prober) writeProbeImpl(ctx context.Context, ops proberOpsI, txns prober
467473
if errorIsExpectedDuringNormalOperation(err) {
468474
log.Health.Warningf(ctx, "making a plan failed with expected error: %v", err)
469475
} else {
470-
log.Health.Errorf(ctx, "can't make a plan: %v", err)
476+
if logTracingOnPlanFailureEnabled {
477+
log.Health.Errorf(ctx, "can't make a plan: %v, recorded trace: %s", err, finishAndGetRecording())
478+
} else {
479+
log.Health.Errorf(ctx, "can't make a plan: %v", err)
480+
}
471481
p.metrics.ProbePlanFailures.Inc(1)
472482
}
473483
return

pkg/kv/kvprober/settings.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,3 +136,10 @@ var tracingEnabled = settings.RegisterBoolSetting(
136136
"whether the KV prober should collect traces, in order to log info about"+
137137
"leaseholders of ranges probed",
138138
false)
139+
140+
var logTracingOnPlanFailureEnabled = settings.RegisterBoolSetting(
141+
settings.ApplicationLevel,
142+
"kv.prober.log_tracing_on_plan_failure.enabled",
143+
"whether the KV prober should print log traces when planning fails; "+
144+
"requires kv.prober.tracing.enabled to be set to true",
145+
false)

0 commit comments

Comments
 (0)