Skip to content

Commit d6665c0

Browse files
craig[bot]joshimhoff
andcommitted
104365: kvprober: special case node-is-decommissioned errors r=joshimhoff a=joshimhoff **kvprober: special case node-is-decommissioned errors** kvprober runs on decommissioned node. In CC, this is generally fine, since automation fully takes down nodes once they reach the decommissioned state. But there is a brief period where a node is running and in the decommissioned state, and we see kvprober errors in metrics during this period, as in below. This sometimes leads to false positive kvprober pages in CC production. ‹rpc error: code = PermissionDenied desc = n1 was permanently removed from... To be clear, the errors are not wrong per say. They just are expected to happen, once a node is decommissioned. This commit adds special handling for errors of the kind above, by doing a substring match on the error string. To be exact, kvprober now logs such errors at warning level and does not increment any error counters. This way, an operation like decommissioning a node does not cause false positive kvprober pages in CC production. Fixes cockroachdb#104367 Release note: None, since kvprober is not used by customers. (It is not documented.) Co-authored-by: Josh Carp <[email protected]> Co-authored-by: Josh Imhoff <[email protected]>
2 parents b6b5d28 + 2611c74 commit d6665c0

File tree

7 files changed

+153
-30
lines changed

7 files changed

+153
-30
lines changed

pkg/kv/kvprober/BUILD.bazel

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ go_library(
1313
deps = [
1414
"//pkg/keys",
1515
"//pkg/kv",
16+
"//pkg/kv/kvpb",
1617
"//pkg/roachpb",
1718
"//pkg/settings",
1819
"//pkg/settings/cluster",
@@ -64,5 +65,6 @@ go_test(
6465
"//pkg/util/tracing",
6566
"@com_github_cockroachdb_errors//:errors",
6667
"@com_github_stretchr_testify//require",
68+
"@org_golang_google_grpc//codes",
6769
],
6870
)

pkg/kv/kvprober/kvprober.go

Lines changed: 52 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import (
2424

2525
"github.com/cockroachdb/cockroach/pkg/keys"
2626
"github.com/cockroachdb/cockroach/pkg/kv"
27+
"github.com/cockroachdb/cockroach/pkg/kv/kvpb"
2728
"github.com/cockroachdb/cockroach/pkg/roachpb"
2829
"github.com/cockroachdb/cockroach/pkg/settings"
2930
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
@@ -209,6 +210,27 @@ func (p *ProberOps) Write(key roachpb.Key) func(context.Context, *kv.Txn) error
209210
}
210211
}
211212

213+
// errorIsExpectedDuringNormalOperation filters out errors that may be returned
214+
// during normal operation of CRDB.
215+
//
216+
// One such example is the `was permanently removed from the cluster at` error
217+
// that is returned to the kvclient of decommissioned nodes. This error does not
218+
// affect user traffic, since such traffic is drained off the node by the time it
219+
// becomes decommissioned.
220+
//
221+
// Since such errors do not indicate a problem with CRDB, kvprober does not report
222+
// them as an error in its metrics.
223+
func errorIsExpectedDuringNormalOperation(err error) bool {
224+
// Note that errors *other* than decommissioned status errors, such as
225+
// `use of closed network connection`, happen *occasionally* on the kvclient
226+
// of a decommissioned node. The full set of other errors is not known exactly,
227+
// and the errors mostly lack structure. Since they happen rarely, and since
228+
// the intended use of kvprober is to page on a sustained error rate, not a
229+
// single error, we choose to only filter out errors via the
230+
// kvpb.IsDecommissionedStatusErr function.
231+
return kvpb.IsDecommissionedStatusErr(err)
232+
}
233+
212234
// validateKey returns an error if the key is not valid for use by the kvprober.
213235
// This is a sanity check to ensure that the kvprober does not corrupt user data
214236
// in the global keyspace or other system data in the local keyspace.
@@ -351,8 +373,12 @@ func (p *Prober) readProbeImpl(ctx context.Context, ops proberOpsI, txns proberT
351373
return
352374
}
353375
if err != nil {
354-
log.Health.Errorf(ctx, "can't make a plan: %v", err)
355-
p.metrics.ProbePlanFailures.Inc(1)
376+
if errorIsExpectedDuringNormalOperation(err) {
377+
log.Health.Warningf(ctx, "making a plan failed with expected error: %v", err)
378+
} else {
379+
log.Health.Errorf(ctx, "can't make a plan: %v", err)
380+
p.metrics.ProbePlanFailures.Inc(1)
381+
}
356382
return
357383
}
358384

@@ -382,9 +408,13 @@ func (p *Prober) readProbeImpl(ctx context.Context, ops proberOpsI, txns proberT
382408
return txns.TxnRootKV(ctx, f)
383409
})
384410
if err != nil {
385-
// TODO(josh): Write structured events with log.Structured.
386-
log.Health.Errorf(ctx, "kv.Get(%s), r=%v failed with: %v", step.Key, step.RangeID, err)
387-
p.metrics.ReadProbeFailures.Inc(1)
411+
if errorIsExpectedDuringNormalOperation(err) {
412+
log.Health.Warningf(ctx, "kv.Get(%s), r=%v failed with expected error: %v", step.Key, step.RangeID, err)
413+
} else {
414+
// TODO(josh): Write structured events with log.Structured.
415+
log.Health.Errorf(ctx, "kv.Get(%s), r=%v failed with: %v", step.Key, step.RangeID, err)
416+
p.metrics.ReadProbeFailures.Inc(1)
417+
}
388418
return
389419
}
390420

@@ -414,8 +444,12 @@ func (p *Prober) writeProbeImpl(ctx context.Context, ops proberOpsI, txns prober
414444
return
415445
}
416446
if err != nil {
417-
log.Health.Errorf(ctx, "can't make a plan: %v", err)
418-
p.metrics.ProbePlanFailures.Inc(1)
447+
if errorIsExpectedDuringNormalOperation(err) {
448+
log.Health.Warningf(ctx, "making a plan failed with expected error: %v", err)
449+
} else {
450+
log.Health.Errorf(ctx, "can't make a plan: %v", err)
451+
p.metrics.ProbePlanFailures.Inc(1)
452+
}
419453
return
420454
}
421455

@@ -434,11 +468,17 @@ func (p *Prober) writeProbeImpl(ctx context.Context, ops proberOpsI, txns prober
434468
return txns.TxnRootKV(ctx, f)
435469
})
436470
if err != nil {
437-
added := p.quarantineWritePool.maybeAdd(ctx, step)
438-
log.Health.Errorf(
439-
ctx, "kv.Txn(Put(%s); Del(-)), r=%v failed with: %v [quarantined=%t]", step.Key, step.RangeID, err, added,
440-
)
441-
p.metrics.WriteProbeFailures.Inc(1)
471+
if errorIsExpectedDuringNormalOperation(err) {
472+
log.Health.Warningf(
473+
ctx, "kv.Txn(Put(%s); Del(-)), r=%v failed with expected error: %v", step.Key, step.RangeID, err,
474+
)
475+
} else {
476+
added := p.quarantineWritePool.maybeAdd(ctx, step)
477+
log.Health.Errorf(
478+
ctx, "kv.Txn(Put(%s); Del(-)), r=%v failed with: %v [quarantined=%t]", step.Key, step.RangeID, err, added,
479+
)
480+
p.metrics.WriteProbeFailures.Inc(1)
481+
}
442482
return
443483
}
444484
// This will no-op if not in the quarantine pool.

pkg/kv/kvprober/kvprober_integration_test.go

Lines changed: 12 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,6 @@ import (
3232
"github.com/cockroachdb/cockroach/pkg/util/leaktest"
3333
"github.com/cockroachdb/cockroach/pkg/util/log"
3434
"github.com/cockroachdb/cockroach/pkg/util/syncutil"
35-
"github.com/cockroachdb/cockroach/pkg/util/tracing"
3635
"github.com/cockroachdb/errors"
3736
"github.com/stretchr/testify/require"
3837
)
@@ -46,7 +45,7 @@ func TestProberDoesReadsAndWrites(t *testing.T) {
4645
ctx := context.Background()
4746

4847
t.Run("disabled by default", func(t *testing.T) {
49-
s, _, p, cleanup := initTestProber(t, base.TestingKnobs{})
48+
s, _, p, cleanup := initTestServer(t, base.TestingKnobs{})
5049
defer cleanup()
5150

5251
kvprober.ReadInterval.Override(ctx, &s.ClusterSettings().SV, 5*time.Millisecond)
@@ -61,7 +60,7 @@ func TestProberDoesReadsAndWrites(t *testing.T) {
6160
})
6261

6362
t.Run("happy path", func(t *testing.T) {
64-
s, _, p, cleanup := initTestProber(t, base.TestingKnobs{})
63+
s, _, p, cleanup := initTestServer(t, base.TestingKnobs{})
6564
defer cleanup()
6665

6766
kvprober.ReadEnabled.Override(ctx, &s.ClusterSettings().SV, true)
@@ -87,7 +86,7 @@ func TestProberDoesReadsAndWrites(t *testing.T) {
8786
})
8887

8988
t.Run("a single range is unavailable for all KV ops", func(t *testing.T) {
90-
s, _, p, cleanup := initTestProber(t, base.TestingKnobs{
89+
s, _, p, cleanup := initTestServer(t, base.TestingKnobs{
9190
Store: &kvserver.StoreTestingKnobs{
9291
TestingRequestFilter: func(i context.Context, ba *kvpb.BatchRequest) *kvpb.Error {
9392
for _, ru := range ba.Requests {
@@ -129,7 +128,7 @@ func TestProberDoesReadsAndWrites(t *testing.T) {
129128
var dbIsAvailable syncutil.AtomicBool
130129
dbIsAvailable.Set(true)
131130

132-
s, _, p, cleanup := initTestProber(t, base.TestingKnobs{
131+
s, _, p, cleanup := initTestServer(t, base.TestingKnobs{
133132
Store: &kvserver.StoreTestingKnobs{
134133
TestingRequestFilter: func(i context.Context, ba *kvpb.BatchRequest) *kvpb.Error {
135134
if !dbIsAvailable.Get() {
@@ -174,7 +173,7 @@ func TestProberDoesReadsAndWrites(t *testing.T) {
174173
var dbIsAvailable syncutil.AtomicBool
175174
dbIsAvailable.Set(true)
176175

177-
s, _, p, cleanup := initTestProber(t, base.TestingKnobs{
176+
s, _, p, cleanup := initTestServer(t, base.TestingKnobs{
178177
Store: &kvserver.StoreTestingKnobs{
179178
TestingRequestFilter: func(i context.Context, ba *kvpb.BatchRequest) *kvpb.Error {
180179
if !dbIsAvailable.Get() {
@@ -225,7 +224,7 @@ func TestWriteProbeDoesNotLeaveLiveData(t *testing.T) {
225224

226225
ctx := context.Background()
227226

228-
s, _, p, cleanup := initTestProber(t, base.TestingKnobs{})
227+
s, _, p, cleanup := initTestServer(t, base.TestingKnobs{})
229228
defer cleanup()
230229

231230
kvprober.WriteEnabled.Override(ctx, &s.ClusterSettings().SV, true)
@@ -259,7 +258,7 @@ func TestPlannerMakesPlansCoveringAllRanges(t *testing.T) {
259258

260259
ctx := context.Background()
261260
// Disable split and merge queue just in case.
262-
_, sqlDB, p, cleanup := initTestProber(t, base.TestingKnobs{
261+
_, sqlDB, p, cleanup := initTestServer(t, base.TestingKnobs{
263262
Store: &kvserver.StoreTestingKnobs{DisableSplitQueue: true, DisableMergeQueue: true},
264263
})
265264
defer cleanup()
@@ -307,7 +306,7 @@ func TestProberOpsValidatesProbeKey(t *testing.T) {
307306
defer log.Scope(t).Close(t)
308307

309308
ctx := context.Background()
310-
s, _, _, cleanup := initTestProber(t, base.TestingKnobs{})
309+
s, _, _, cleanup := initTestServer(t, base.TestingKnobs{})
311310
defer cleanup()
312311

313312
var ops kvprober.ProberOps
@@ -350,23 +349,18 @@ func TestProberOpsValidatesProbeKey(t *testing.T) {
350349
}
351350
}
352351

353-
func initTestProber(
352+
func initTestServer(
354353
t *testing.T, knobs base.TestingKnobs,
355354
) (serverutils.TestServerInterface, *gosql.DB, *kvprober.Prober, func()) {
356-
357-
s, sqlDB, kvDB := serverutils.StartServer(t, base.TestServerArgs{
355+
s, sqlDB, _ := serverutils.StartServer(t, base.TestServerArgs{
358356
Settings: cluster.MakeClusterSettings(),
359357
Knobs: knobs,
360358
})
361-
p := kvprober.NewProber(kvprober.Opts{
362-
Tracer: s.TracerI().(*tracing.Tracer),
363-
DB: kvDB,
364-
HistogramWindowInterval: time.Minute, // actual value not important to test
365-
Settings: s.ClusterSettings(),
366-
})
367359

368360
// Given small test cluster, this better exercises the planning logic.
369361
kvprober.NumStepsToPlanAtOnce.Override(context.Background(), &s.ClusterSettings().SV, 10)
362+
363+
p := s.KvProber()
370364
// Want these tests to run as fast as possible; see planner_test.go for a
371365
// unit test of the rate limiting.
372366
p.SetPlanningRateLimits(0)

pkg/kv/kvprober/kvprober_test.go

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,12 @@ import (
1818

1919
"github.com/cockroachdb/cockroach/pkg/keys"
2020
"github.com/cockroachdb/cockroach/pkg/kv"
21+
"github.com/cockroachdb/cockroach/pkg/kv/kvpb"
2122
"github.com/cockroachdb/cockroach/pkg/roachpb"
2223
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
2324
"github.com/cockroachdb/cockroach/pkg/util/tracing"
2425
"github.com/stretchr/testify/require"
26+
"google.golang.org/grpc/codes"
2527
)
2628

2729
func TestReadProbe(t *testing.T) {
@@ -78,6 +80,31 @@ func TestReadProbe(t *testing.T) {
7880
require.Zero(t, p.Metrics().ReadProbeFailures.Count())
7981
})
8082

83+
// Once a node is fully decommissioned, neither kvclient nor kvprober work from
84+
// the node. This does not indicate a service health issue; it is expected behavior.
85+
//
86+
// This is not tested with an integration test, since the kvclient of a decommissioned
87+
// node will occasionally return other errors. We choose not to filter those out for
88+
// reasons given at errorIsExpectedDuringNormalOperation. As a result, an integration test
89+
// would be flaky. We believe a unit test is sufficient, largely because the main risk
90+
// in only having a unit test is false positive pages on SRE, due to changes in what errors
91+
// are returned from the kvclient of a decommissioned node. Though false positive pages add
92+
// ops load, they do not directly affect the customer experience.
93+
t.Run("planning fails due to decommissioning but not counted as error", func(t *testing.T) {
94+
m := &mock{
95+
t: t,
96+
read: true,
97+
planErr: kvpb.NewDecommissionedStatusErrorf(codes.PermissionDenied, "foobar"),
98+
}
99+
p := initTestProber(ctx, m)
100+
p.readProbeImpl(ctx, m, m, m)
101+
102+
require.Equal(t, int64(1), p.Metrics().ProbePlanAttempts.Count())
103+
require.Zero(t, p.Metrics().ReadProbeAttempts.Count())
104+
require.Zero(t, p.Metrics().ProbePlanFailures.Count())
105+
require.Zero(t, p.Metrics().ReadProbeFailures.Count())
106+
})
107+
81108
t.Run("txn fails", func(t *testing.T) {
82109
m := &mock{
83110
t: t,
@@ -107,6 +134,22 @@ func TestReadProbe(t *testing.T) {
107134
require.Zero(t, p.Metrics().ProbePlanFailures.Count())
108135
require.Equal(t, int64(1), p.Metrics().ReadProbeFailures.Count())
109136
})
137+
138+
// See comment above matching case in TestReadProbe regarding planning.
139+
t.Run("read fails due to decommissioning but not counted as error", func(t *testing.T) {
140+
m := &mock{
141+
t: t,
142+
read: true,
143+
readErr: kvpb.NewDecommissionedStatusErrorf(codes.PermissionDenied, "foobar"),
144+
}
145+
p := initTestProber(ctx, m)
146+
p.readProbeImpl(ctx, m, m, m)
147+
148+
require.Equal(t, int64(1), p.Metrics().ProbePlanAttempts.Count())
149+
require.Equal(t, int64(1), p.Metrics().ReadProbeAttempts.Count())
150+
require.Zero(t, p.Metrics().ProbePlanFailures.Count())
151+
require.Zero(t, p.Metrics().ReadProbeFailures.Count())
152+
})
110153
}
111154

112155
func TestWriteProbe(t *testing.T) {
@@ -163,6 +206,22 @@ func TestWriteProbe(t *testing.T) {
163206
require.Zero(t, p.Metrics().WriteProbeFailures.Count())
164207
})
165208

209+
// See comment above matching case in TestReadProbe regarding planning.
210+
t.Run("planning fails due to decommissioning but not counted as error", func(t *testing.T) {
211+
m := &mock{
212+
t: t,
213+
write: true,
214+
planErr: kvpb.NewDecommissionedStatusErrorf(codes.PermissionDenied, "foobar"),
215+
}
216+
p := initTestProber(ctx, m)
217+
p.writeProbeImpl(ctx, m, m, m)
218+
219+
require.Equal(t, int64(1), p.Metrics().ProbePlanAttempts.Count())
220+
require.Zero(t, p.Metrics().WriteProbeAttempts.Count())
221+
require.Zero(t, p.Metrics().ProbePlanFailures.Count())
222+
require.Zero(t, p.Metrics().WriteProbeFailures.Count())
223+
})
224+
166225
t.Run("open txn fails", func(t *testing.T) {
167226
m := &mock{
168227
t: t,
@@ -192,6 +251,22 @@ func TestWriteProbe(t *testing.T) {
192251
require.Zero(t, p.Metrics().ProbePlanFailures.Count())
193252
require.Equal(t, int64(1), p.Metrics().WriteProbeFailures.Count())
194253
})
254+
255+
// See comment above matching case in TestReadProbe regarding planning.
256+
t.Run("write fails due to decommissioning but not counted as error", func(t *testing.T) {
257+
m := &mock{
258+
t: t,
259+
write: true,
260+
writeErr: kvpb.NewDecommissionedStatusErrorf(codes.PermissionDenied, "foobar"),
261+
}
262+
p := initTestProber(ctx, m)
263+
p.writeProbeImpl(ctx, m, m, m)
264+
265+
require.Equal(t, int64(1), p.Metrics().ProbePlanAttempts.Count())
266+
require.Equal(t, int64(1), p.Metrics().WriteProbeAttempts.Count())
267+
require.Zero(t, p.Metrics().ProbePlanFailures.Count())
268+
require.Zero(t, p.Metrics().WriteProbeFailures.Count())
269+
})
195270
}
196271

197272
func initTestProber(ctx context.Context, m *mock) *Prober {

pkg/server/testserver.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ import (
3535
"github.com/cockroachdb/cockroach/pkg/kv/kvclient/kvcoord"
3636
"github.com/cockroachdb/cockroach/pkg/kv/kvclient/rangefeed"
3737
"github.com/cockroachdb/cockroach/pkg/kv/kvpb"
38+
"github.com/cockroachdb/cockroach/pkg/kv/kvprober"
3839
"github.com/cockroachdb/cockroach/pkg/kv/kvserver"
3940
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/liveness"
4041
"github.com/cockroachdb/cockroach/pkg/multitenant/tenantcapabilities"
@@ -1836,6 +1837,11 @@ func (ts *TestServer) BinaryVersionOverride() roachpb.Version {
18361837
return knobs.(*TestingKnobs).BinaryVersionOverride
18371838
}
18381839

1840+
// KvProber is part of the TestServerInterface.
1841+
func (ts *TestServer) KvProber() *kvprober.Prober {
1842+
return ts.Server.kvProber
1843+
}
1844+
18391845
type testServerFactoryImpl struct{}
18401846

18411847
// TestServerFactory can be passed to serverutils.InitTestServerFactory

pkg/testutils/serverutils/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ go_library(
1515
"//pkg/config",
1616
"//pkg/keys",
1717
"//pkg/kv",
18+
"//pkg/kv/kvprober",
1819
"//pkg/kv/kvserver/liveness/livenesspb",
1920
"//pkg/multitenant/tenantcapabilities",
2021
"//pkg/roachpb",

pkg/testutils/serverutils/test_server_shim.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ import (
2626

2727
"github.com/cockroachdb/cockroach/pkg/base"
2828
"github.com/cockroachdb/cockroach/pkg/kv"
29+
"github.com/cockroachdb/cockroach/pkg/kv/kvprober"
2930
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/liveness/livenesspb"
3031
"github.com/cockroachdb/cockroach/pkg/roachpb"
3132
"github.com/cockroachdb/cockroach/pkg/security"
@@ -302,6 +303,10 @@ type TestServerInterface interface {
302303
// BinaryVersionOverride returns the value of an override if set using
303304
// TestingKnobs.
304305
BinaryVersionOverride() roachpb.Version
306+
307+
// KvProber returns a *kvprober.Prober, which is useful when asserting the
308+
//correctness of the prober from integration tests.
309+
KvProber() *kvprober.Prober
305310
}
306311

307312
// TestServerFactory encompasses the actual implementation of the shim

0 commit comments

Comments
 (0)