
Commit 220787d

kvnemesis: address fault mode todos
This commit addresses TODOs left after introducing network partitions and node restarts to kvnemesis:

- All fault mode tests run with 4 nodes to ensure that the liveness tests can always maintain a majority quorum of replicas on nodes 1 and 2 (see the sketch below). If the tests ran with 5 nodes, we would need to take extra care to ensure 3 replicas stay healthy for all RF=5 ranges (e.g. the system ranges).

- Splits are now allowed in both the partition and restart liveness modes. Previously, if r3 was unavailable, the range ID allocator would get stuck retrying the increment operation that generates a new range ID. This commit ensures all system ranges are available in liveness mode. Moreover, for the restart liveness variant, some error handling tweaks were needed to make sure the split is retried if it races with a node shutdown.

- A few operations are re-enabled across the two partition test variants: lease transfers and delete range. It's not clear what changed, but these no longer cause the tests to fail (or at least the failures are hard to reproduce). Possibly the original theory for why they could fail was not quite right. Will investigate more if they fail in CI.

Part of: #64828
Part of: #114814

Release note: None
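To make the first bullet's quorum arithmetic concrete, here is a minimal, self-contained Go sketch. It is illustrative only and not part of the commit; the helper name, node numbering, and voter placements are assumptions chosen to mirror the zone config constraints (one voter pinned to n1 and one to n2).

package main

import "fmt"

// survivingMajority reports whether a range keeps a majority of its voters
// alive when the given nodes are faulted. (Hypothetical helper, not part of
// kvnemesis.)
func survivingMajority(voterNodes []int, faulted map[int]bool) bool {
	healthy := 0
	for _, n := range voterNodes {
		if !faulted[n] {
			healthy++
		}
	}
	return healthy > len(voterNodes)/2
}

func main() {
	// 4-node cluster: fault every node except n1 and n2.
	faulted := map[int]bool{3: true, 4: true}

	// RF=3 range with one voter pinned to n1 and one to n2. In the worst
	// case the third voter sits on a faulted node, leaving 2 of 3 healthy.
	fmt.Println(survivingMajority([]int{1, 2, 3}, faulted)) // true

	// By contrast, an RF=5 range on a 5-node cluster with only n1 and n2
	// guaranteed healthy keeps just 2 of 5 voters: no majority.
	fmt.Println(survivingMajority([]int{1, 2, 3, 4, 5},
		map[int]bool{3: true, 4: true, 5: true})) // false
}

This is why the test clusters shrink from 5 nodes to 4: with 4 nodes and RF=3, the n1/n2 voter constraints alone guarantee a surviving quorum under the faults the tests inject.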
1 parent: 8abc131; commit: 220787d

File tree

1 file changed: +16 / -45 lines changed

pkg/kv/kvnemesis/kvnemesis_test.go

Lines changed: 16 additions & 45 deletions
@@ -470,7 +470,7 @@ func TestKVNemesisMultiNode_Partition_Safety(t *testing.T) {
 	defer log.Scope(t).Close(t)
 
 	testKVNemesisImpl(t, kvnemesisTestCfg{
-		numNodes:     5,
+		numNodes:     4,
 		numSteps:     defaultNumSteps,
 		concurrency:  5,
 		seedOverride: 0,
@@ -481,16 +481,6 @@ func TestKVNemesisMultiNode_Partition_Safety(t *testing.T) {
 		testGeneratorConfig: func(cfg *GeneratorConfig) {
 			cfg.Ops.Fault.AddNetworkPartition = 1
 			cfg.Ops.Fault.RemoveNetworkPartition = 1
-			// TODO(mira): DeleteRangeUsingTombstone and AddSSTable are always
-			// non-transactional, and as such are susceptible to double-application.
-			// The cluster setting kvcoord.NonTransactionalWritesNotIdempotent is
-			// enabled for this test to protect against double-application, but these
-			// requests don't propagate the flag AmbiguousReplayProtection to ensure
-			// the second application fails. We should fix this.
-			cfg.Ops.DB.DeleteRangeUsingTombstone = 0
-			cfg.Ops.DB.AddSSTable = 0
-			// The same issue above occurs for non-transactional DeleteRange requests.
-			cfg.Ops.DB.DeleteRange = 0
 		},
 	})
 }
@@ -500,7 +490,7 @@ func TestKVNemesisMultiNode_Partition_Liveness(t *testing.T) {
 	defer log.Scope(t).Close(t)
 
 	testKVNemesisImpl(t, kvnemesisTestCfg{
-		numNodes:     5,
+		numNodes:     4,
 		numSteps:     defaultNumSteps,
 		concurrency:  5,
 		seedOverride: 0,
@@ -516,20 +506,9 @@ func TestKVNemesisMultiNode_Partition_Liveness(t *testing.T) {
 			// constraints (at least one replica on nodes 1 and 2).
 			cfg.Ops.ChangeReplicas = ChangeReplicasConfig{}
 			// Epoch leases can experience indefinite unavailability in the case of a
-			// leader-leaseholder split and a network partition, so only leader leases
-			// are allowed.
+			// leader-leaseholder split and a network partition. This test starts off
+			// with leader leases, and lease type changes are disallowed.
 			cfg.Ops.ChangeSetting = ChangeSettingConfig{}
-			// TODO(mira): Transfers can result in RUEs because in the intermediate
-			// expiration-lease state, a request can get stuck holding latches until
-			// the replica circuit breaker trips and poisons the latches. This results
-			// in RUEs returned to the client. The behavior is expected; we can enable
-			// this setting if we allow the test to tolerate these RUEs.
-			cfg.Ops.ChangeLease = ChangeLeaseConfig{}
-			// TODO(mira): We should investigate splits more. So far I've seen then
-			// fail for two reasons: (1) r1 can become uvavailable (we can fix this by
-			// setting the right zone configs), and (2) if a partition races with the
-			// split, the range ID allocator can get stuck waiting for a response.
-			cfg.Ops.Split = SplitConfig{}
 		},
 	})
 }
@@ -559,9 +538,6 @@ func TestKVNemesisMultiNode_Restart_Liveness(t *testing.T) {
 			// Disallow replica changes because they interfere with the zone config
 			// constraints (at least one replica on nodes 1 and 2).
 			cfg.Ops.ChangeReplicas = ChangeReplicasConfig{}
-			// TODO(mira): Similar issue to Partition_Liveness, except the failure
-			// mode (2) here looks like "could not allocate ID; system is draining".
-			cfg.Ops.Split = SplitConfig{}
 		},
 	})
 }
@@ -817,22 +793,17 @@ func setAndVerifyZoneConfigs(
 		voter_constraints = '{"+node=n1": 1, "+node=n2": 1}'`,
 	)
 
-	// Ensure the liveness and meta ranges are also constrained appropriately.
-	sqlRunner.Exec(
-		t, `ALTER RANGE meta CONFIGURE ZONE USING
-		num_replicas = 3,
-		num_voters = 3,
-		constraints = '{"+node=n1": 1, "+node=n2": 1}',
-		voter_constraints = '{"+node=n1": 1, "+node=n2": 1}'`,
-	)
-
-	sqlRunner.Exec(
-		t, `ALTER RANGE liveness CONFIGURE ZONE USING
-		num_replicas = 3,
-		num_voters = 3,
-		constraints = '{"+node=n1": 1, "+node=n2": 1}',
-		voter_constraints = '{"+node=n1": 1, "+node=n2": 1}'`,
-	)
+	// Ensure the liveness, meta and system ranges are also constrained.
+	systemRanges := []string{"meta", "liveness", "system"}
+	for _, r := range systemRanges {
+		sqlRunner.Exec(
+			t, fmt.Sprintf(`ALTER RANGE %s CONFIGURE ZONE USING
+			num_replicas = 3,
+			num_voters = 3,
+			constraints = '{"+node=n1": 1, "+node=n2": 1}',
+			voter_constraints = '{"+node=n1": 1, "+node=n2": 1}'`, r),
+		)
+	}
 
 	// Wait for zone configs to propagate to all span config subscribers.
 	require.NoError(t, tc.WaitForZoneConfigPropagation())
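For reference, the consolidated loop above issues one statement per entry in systemRanges. Expanded for the meta range (the liveness and system statements differ only in the range name), the SQL generated by the fmt.Sprintf is, modulo whitespace:

ALTER RANGE meta CONFIGURE ZONE USING
	num_replicas = 3,
	num_voters = 3,
	constraints = '{"+node=n1": 1, "+node=n2": 1}',
	voter_constraints = '{"+node=n1": 1, "+node=n2": 1}'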
@@ -853,7 +824,7 @@ func setAndVerifyZoneConfigs(
 			Key:    desc.StartKey.AsRawKey(),
 			EndKey: desc.EndKey.AsRawKey(),
 		}
-		if replicaSpan.Overlaps(dataSpan) || desc.RangeID <= 2 {
+		if replicaSpan.Overlaps(dataSpan) || desc.RangeID <= 3 {
 			overlappingReplicas = append(overlappingReplicas, replica)
 		}
 		return true // continue
