Skip to content

Commit 37f7b9a

Browse files
committed
kv: add TestDecommissionPreCheckRetryThrottledStores
Previously, we made decommission prechecks retry on errors, since some transient issues resolve quickly and shouldn’t cause the precheck to fail. This commit adds a test that verifies the precheck retries when it encounters transient throttled errors.
1 parent 5d90692 commit 37f7b9a

File tree

3 files changed

+62
-0
lines changed

3 files changed

+62
-0
lines changed

pkg/kv/kvserver/store.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3965,6 +3965,13 @@ func (s *Store) AllocatorCheckRange(
39653965
collectTraces bool,
39663966
overrideStorePool storepool.AllocatorStorePool,
39673967
) (allocatorimpl.AllocatorAction, roachpb.ReplicationTarget, tracingpb.Recording, error) {
3968+
// Testing knob to inject errors.
3969+
if interceptor := s.cfg.TestingKnobs.AllocatorCheckRangeInterceptor; interceptor != nil {
3970+
if err := interceptor(); err != nil {
3971+
return allocatorimpl.AllocatorNoop, roachpb.ReplicationTarget{}, tracingpb.Recording{}, err
3972+
}
3973+
}
3974+
39683975
var spanOptions []tracing.SpanOption
39693976
if collectTraces {
39703977
spanOptions = append(spanOptions, tracing.WithRecording(tracingpb.RecordingVerbose))

pkg/kv/kvserver/testing_knobs.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -483,6 +483,12 @@ type StoreTestingKnobs struct {
483483
// EnqueueReplicaInterceptor intercepts calls to `store.Enqueue()`.
484484
EnqueueReplicaInterceptor func(queueName string, replica *Replica)
485485

486+
// AllocatorCheckRangeInterceptor intercepts calls to
487+
// `Store.AllocatorCheckRange()` and can be used to inject errors for testing.
488+
// If returns non-nil error, the error is returned instead of calling the
489+
// actual AllocatorCheckRange logic.
490+
AllocatorCheckRangeInterceptor func() error
491+
486492
// GlobalMVCCRangeTombstone will write a global MVCC range tombstone across
487493
// the entire user keyspace during cluster bootstrapping. This will be written
488494
// below all other data, and thus won't affect query results, but it does

pkg/server/storage_api/decommission_test.go

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"context"
1010
"fmt"
1111
"strings"
12+
"sync/atomic"
1213
"testing"
1314
"time"
1415

@@ -1050,3 +1051,51 @@ func checkRangeCheckResult(
10501051
}
10511052
passed = true
10521053
}
1054+
1055+
// TestDecommissionPreCheckRetryThrottledStores tests that the decommission
1056+
// pre-check retries when it encounters transient throttled errors. Regression
1057+
// test for #156849.
1058+
func TestDecommissionPreCheckRetryThrottledStores(t *testing.T) {
1059+
defer leaktest.AfterTest(t)()
1060+
defer log.Scope(t).Close(t)
1061+
1062+
ctx := context.Background()
1063+
1064+
var returnedError atomic.Bool
1065+
returnError := func() error {
1066+
if returnedError.CompareAndSwap(false, true) {
1067+
return errors.New("injected error")
1068+
}
1069+
return nil
1070+
}
1071+
1072+
tc := serverutils.StartCluster(t, 4, base.TestClusterArgs{
1073+
ServerArgs: base.TestServerArgs{
1074+
DefaultTestTenant: base.TestIsSpecificToStorageLayerAndNeedsASystemTenant,
1075+
Knobs: base.TestingKnobs{
1076+
Store: &kvserver.StoreTestingKnobs{
1077+
AllocatorCheckRangeInterceptor: returnError,
1078+
},
1079+
},
1080+
},
1081+
ReplicationMode: base.ReplicationManual,
1082+
})
1083+
defer tc.Stopper().Stop(ctx)
1084+
1085+
scratchKey := tc.ScratchRange(t)
1086+
1087+
// Add replicas to the scratch range on nodes 2 and 3.
1088+
scratchDesc := tc.AddVotersOrFatal(t, scratchKey, tc.Target(1), tc.Target(2))
1089+
require.Len(t, scratchDesc.InternalReplicas, 3)
1090+
1091+
// Perform decommission pre-check on node 3. AllocatorCheckRange will fail
1092+
// once due to the injected error, then succeed after retries.
1093+
decommissioningNodeIDs := []roachpb.NodeID{tc.Server(2).NodeID()}
1094+
result, err := tc.Server(0).DecommissionPreCheck(ctx, decommissioningNodeIDs,
1095+
true /* strictReadiness */, false /* collectTraces */, 0 /* maxErrors */)
1096+
1097+
require.NoError(t, err)
1098+
require.Greater(t, result.RangesChecked, 0)
1099+
require.Empty(t, result.RangesNotReady)
1100+
require.True(t, returnedError.Load())
1101+
}

0 commit comments

Comments
 (0)