Commit c9d9621

craig[bot] and tbg committed
Merge #156368
156368: kvcoord: more tracing for leaseholder backoffs r=tbg a=tbg

Informs #156284.

Epic: none

Co-authored-by: Tobias Grieger <[email protected]>
2 parents fa11d2f + 8d9ec35 commit c9d9621

2 files changed: +24, -8 lines

pkg/kv/kvclient/kvcoord/dist_sender.go

Lines changed: 11 additions & 1 deletion
@@ -370,6 +370,9 @@ const (
 	// The maximum number of times a replica is retried when it repeatedly returns
 	// stale lease info.
 	sameReplicaRetryLimit = 10
+	// InLeaseTransferBackoffTraceMessage is traced when DistSender backs off as a
+	// result of a NotLeaseholderError. It is exported for testing.
+	InLeaseTransferBackoffTraceMessage = "backing off due to NotLeaseHolderErr with stale info"
 )

 var rangeDescriptorCacheSize = settings.RegisterIntSetting(
@@ -2899,6 +2902,7 @@ func (ds *DistSender) sendToReplicas(
 			// If we get a gRPC error against the leaseholder, we don't want to
 			// backoff and keep trying the request against the same leaseholder.
 			if lh := routing.Leaseholder(); lh != nil && lh.IsSame(curReplica) {
+				log.VEventf(ctx, 2, "RPC error and lh %s != curReplica %s; marking leaseholder as unavailable", lh, curReplica)
 				leaseholderUnavailable = true
 			}
 		} else {
@@ -3008,6 +3012,7 @@ func (ds *DistSender) sendToReplicas(
 				// error out when the transport is exhausted even if multiple replicas
 				// return NLHEs to different replicas all returning RUEs.
 				replicaUnavailableError = br.Error.GoError()
+				log.VEventf(ctx, 2, "got RUE from lh and lh equals curReplica; marking as leaseholderUnavailable")
 				leaseholderUnavailable = true
 			} else if replicaUnavailableError == nil {
 				// This is the first time we see a RUE. Record it, such that we'll
@@ -3054,6 +3059,8 @@ func (ds *DistSender) sendToReplicas(
 				// prevents accidentally returning a replica unavailable
 				// error too aggressively.
 				if updatedLeaseholder {
+					log.VEventf(ctx, 2,
+						"updated leaseholder; resetting leaseholderUnavailable and routing to leaseholder")
 					leaseholderUnavailable = false
 					routeToLeaseholder = true
 					// If we changed the leaseholder, reset the transport to try all the
@@ -3130,7 +3137,10 @@ func (ds *DistSender) sendToReplicas(
 		shouldBackoff := !updatedLeaseholder && !intentionallySentToFollower && !leaseholderUnavailable
 		if shouldBackoff {
 			ds.metrics.InLeaseTransferBackoffs.Inc(1)
-			log.VErrEventf(ctx, 2, "backing off due to NotLeaseHolderErr with stale info")
+			log.VErrEventf(ctx, 2,
+				InLeaseTransferBackoffTraceMessage+
+					" (updatedLH=%t intentionallySentToFollower=%t leaseholderUnavailable=%t)",
+				updatedLeaseholder, intentionallySentToFollower, leaseholderUnavailable)
 		} else {
 			inTransferRetry.Reset() // The following Next() call will not block.
 		}
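
The newly exported InLeaseTransferBackoffTraceMessage gives tests a stable string to search for in a request's trace. Below is a minimal sketch of the positive assertion (the test change further down asserts the opposite, namely that no backoff event appears). It is a fragment, not a full test: it assumes t, db, key, and value are already in scope, reuses db.Tracer the way the updated test does, and the span name "expect-backoff" is arbitrary (assumed imports: context, fmt, tracing, kvcoord, require).

	ctx, finishAndGetRec := tracing.ContextWithRecordingSpan(
		context.Background(), db.Tracer, "expect-backoff")
	// Issue a request that is expected to hit the backoff path.
	_ = db.Put(ctx, key, value)
	rec := finishAndGetRec()
	// The recording formats as text (the test below logs it with %s), so a
	// substring check can look for the backoff event.
	require.Contains(t, fmt.Sprintf("%s", rec), kvcoord.InLeaseTransferBackoffTraceMessage)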

pkg/kv/kvserver/client_replica_circuit_breaker_test.go

Lines changed: 13 additions & 7 deletions
@@ -863,14 +863,20 @@ func TestReplicaCircuitBreaker_Partial_Retry(t *testing.T) {
 	requireRUEs := func(t *testing.T, dbs []*kv.DB) {
 		t.Helper()
 		for _, db := range dbs {
-			backoffMetric := (db.NonTransactionalSender().(*kv.CrossRangeTxnWrapperSender)).Wrapped().(*kvcoord.DistSender).Metrics().InLeaseTransferBackoffs
-			initialBackoff := backoffMetric.Count()
+			ctx, finishAndGetRec := tracing.ContextWithRecordingSpan(context.Background(), db.Tracer, "requireRUEs")
+			defer func() {
+				rec := finishAndGetRec()
+				if !t.Failed() {
+					return
+				}
+				t.Logf("failure trace:\n%s", rec)
+			}()
 			err := db.Put(ctx, key, value)
-			// Verify that we did not perform any backoff while executing this request.
-			require.EqualValues(t, 0, backoffMetric.Count()-initialBackoff)
 			require.Error(t, err)
 			require.True(t, errors.HasType(err, (*kvpb.ReplicaUnavailableError)(nil)),
 				"expected ReplicaUnavailableError, got %v", err)
+			// Verify that we did not perform any backoff while executing this request.
+			require.NotContains(t, finishAndGetRec(), kvcoord.InLeaseTransferBackoffTraceMessage)
 		}
 		t.Logf("writes failed with ReplicaUnavailableError")
 	}
@@ -921,7 +927,7 @@ func TestReplicaCircuitBreaker_Partial_Retry(t *testing.T) {
 	// and it will return RUE.
 	lease, _ := repl3.GetLease()
 	manualClock.Increment(tc.Servers[0].RaftConfig().RangeLeaseDuration.Nanoseconds())
-	t.Logf("expired n%d lease", lease.Replica.ReplicaID)
+	t.Logf("expired first lease (n%d)", lease.Replica.ReplicaID) // always n3

 	requireRUEs(t, dbs)

@@ -968,7 +974,7 @@ func TestReplicaCircuitBreaker_Partial_Retry(t *testing.T) {
 	// because the leader's circuit breaker is tripped.
 	lease, _ = repl1.GetLease()
 	manualClock.Increment(tc.Servers[0].RaftConfig().RangeLeaseDuration.Nanoseconds())
-	t.Logf("expired n%d lease", lease.Replica.ReplicaID)
+	t.Logf("expired second lease (n%d)", lease.Replica.ReplicaID)

 	requireRUEs(t, dbs)

@@ -982,7 +988,7 @@ func TestReplicaCircuitBreaker_Partial_Retry(t *testing.T) {
 		}
 		return true
 	}, 10*time.Second, 100*time.Millisecond)
-	t.Logf("no raft leader")
+	t.Logf("expired raft leader")

 	requireRUEs(t, dbs)

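The exported constant is only a prefix of the event DistSender now emits; the three decision flags are appended after it, which is why a substring check against a trace recording, like the NotContains assertion in requireRUEs above, still detects the backoff event. A small self-contained sketch of what the emitted text looks like, using hypothetical flag values rather than anything the test produces:

	package main

	import (
		"fmt"
		"strings"

		"github.com/cockroachdb/cockroach/pkg/kv/kvclient/kvcoord"
	)

	func main() {
		// Hypothetical flag values; the real event reports whatever DistSender computed.
		event := fmt.Sprintf(
			kvcoord.InLeaseTransferBackoffTraceMessage+
				" (updatedLH=%t intentionallySentToFollower=%t leaseholderUnavailable=%t)",
			false, false, false)
		fmt.Println(event)
		// A prefix match is enough to find the event in a trace.
		fmt.Println(strings.Contains(event, kvcoord.InLeaseTransferBackoffTraceMessage)) // true
	}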