Skip to content

Commit bb2f6ed

Browse files
committed
kvnemesis: introduce safety and liveness modes
Currently, all kvnemesis test variants test both safety and liveness properties: safety in the sense of validating serializability, and liveness in the sense of ensuring availability. With the introduction of various faults (e.g. network partitions), kvnemesis tests can continue to operate the same way only if we carefully craft the fault patterns in order to avoid unavailability. While this is valuable, it is also important to test for safety testing in the presense of unavailability. This commit introduces two new modes of running kvnemesis tests, both of which still validate serializability: - Safety mode: all fault patterns are allowed; unavailability errors are ignored. - Liveness mode: faults are introduced carefully to ensure a well-connected quorum is preserved; unavailability errors fail the test. Fixes: #114814 Release note: None
1 parent 8c93138 commit bb2f6ed

File tree

8 files changed

+346
-60
lines changed

8 files changed

+346
-60
lines changed

pkg/kv/kvnemesis/BUILD.bazel

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,8 +96,10 @@ go_test(
9696
"//pkg/security/securitytest",
9797
"//pkg/server",
9898
"//pkg/settings/cluster",
99+
"//pkg/spanconfig",
99100
"//pkg/storage",
100101
"//pkg/storage/enginepb",
102+
"//pkg/testutils",
101103
"//pkg/testutils/datapathutils",
102104
"//pkg/testutils/echotest",
103105
"//pkg/testutils/serverutils",

pkg/kv/kvnemesis/applier.go

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ package kvnemesis
77

88
import (
99
"context"
10+
"strings"
1011
"time"
1112

1213
"github.com/cockroachdb/cockroach/pkg/config/zonepb"
@@ -98,7 +99,8 @@ func exceptUnhandledRetry(err error) bool {
9899
}
99100

100101
func exceptAmbiguous(err error) bool { // true if ambiguous result
101-
return errors.HasInterface(err, (*kvpb.ClientVisibleAmbiguousError)(nil))
102+
return errors.HasInterface(err, (*kvpb.ClientVisibleAmbiguousError)(nil)) ||
103+
strings.Contains(err.Error(), "result is ambiguous")
102104
}
103105

104106
func exceptDelRangeUsingTombstoneStraddlesRangeBoundary(err error) bool {
@@ -109,6 +111,15 @@ func exceptConditionFailed(err error) bool {
109111
return errors.HasType(err, (*kvpb.ConditionFailedError)(nil))
110112
}
111113

114+
func exceptReplicaUnavailable(err error) bool {
115+
return errors.HasType(err, (*kvpb.ReplicaUnavailableError)(nil))
116+
}
117+
118+
func exceptContextCanceled(err error) bool {
119+
return errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) ||
120+
strings.Contains(err.Error(), "query execution canceled")
121+
}
122+
112123
func applyOp(ctx context.Context, env *Env, db *kv.DB, op *Operation) {
113124
switch o := op.GetValue().(type) {
114125
case *GetOperation,
@@ -751,7 +762,12 @@ func getRangeDesc(ctx context.Context, key roachpb.Key, dbs ...*kv.DB) roachpb.R
751762
var opts = retry.Options{}
752763
for r := retry.StartWithCtx(ctx, opts); r.Next(); dbIdx = (dbIdx + 1) % len(dbs) {
753764
sender := dbs[dbIdx].NonTransactionalSender()
754-
descs, _, err := kv.RangeLookup(ctx, sender, key, kvpb.CONSISTENT, 0, false)
765+
// Use kvpb.INCONSISTENT because kv.CONSISTENT requires a transactional
766+
// sender. In the generator, range lookups are usually used for finding
767+
// replica/lease change targets, so it's ok if these are not consistent.
768+
// Using kv.CONSISTENT with a non-transactional sender and in the presence
769+
// of network partitions can lead to infinitely stuck lookups.
770+
descs, _, err := kv.RangeLookup(ctx, sender, key, kvpb.INCONSISTENT, 0, false)
755771
if err != nil {
756772
log.Dev.Infof(ctx, "looking up descriptor for %s: %+v", key, err)
757773
continue
@@ -762,12 +778,11 @@ func getRangeDesc(ctx context.Context, key roachpb.Key, dbs ...*kv.DB) roachpb.R
762778
}
763779
return descs[0]
764780
}
765-
panic(`unreachable`)
781+
return roachpb.RangeDescriptor{}
766782
}
767783

768784
func newGetReplicasFn(dbs ...*kv.DB) GetReplicasFn {
769-
ctx := context.Background()
770-
return func(key roachpb.Key) ([]roachpb.ReplicationTarget, []roachpb.ReplicationTarget) {
785+
return func(ctx context.Context, key roachpb.Key) ([]roachpb.ReplicationTarget, []roachpb.ReplicationTarget) {
771786
desc := getRangeDesc(ctx, key, dbs...)
772787
replicas := desc.Replicas().Descriptors()
773788
var voters []roachpb.ReplicationTarget

pkg/kv/kvnemesis/generator.go

Lines changed: 48 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ import (
1515
"slices"
1616
"sort"
1717
"sync/atomic"
18+
"time"
1819

1920
"github.com/cockroachdb/cockroach/pkg/keys"
2021
"github.com/cockroachdb/cockroach/pkg/kv/kvnemesis/kvnemesisutil"
@@ -28,6 +29,7 @@ import (
2829
"github.com/cockroachdb/cockroach/pkg/util/encoding"
2930
"github.com/cockroachdb/cockroach/pkg/util/hlc"
3031
"github.com/cockroachdb/cockroach/pkg/util/syncutil"
32+
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
3133
"github.com/cockroachdb/errors"
3234
"golang.org/x/exp/maps"
3335
)
@@ -645,7 +647,7 @@ func GeneratorDataSpan() roachpb.Span {
645647

646648
// GetReplicasFn is a function that returns the current voting and non-voting
647649
// replicas, respectively, for the range containing a key.
648-
type GetReplicasFn func(roachpb.Key) ([]roachpb.ReplicationTarget, []roachpb.ReplicationTarget)
650+
type GetReplicasFn func(context.Context, roachpb.Key) ([]roachpb.ReplicationTarget, []roachpb.ReplicationTarget)
649651

650652
// Generator incrementally constructs KV traffic designed to maximally test edge
651653
// cases.
@@ -676,7 +678,9 @@ type Generator struct {
676678
}
677679

678680
// MakeGenerator constructs a Generator.
679-
func MakeGenerator(config GeneratorConfig, replicasFn GetReplicasFn) (*Generator, error) {
681+
func MakeGenerator(
682+
config GeneratorConfig, replicasFn GetReplicasFn, mode TestMode,
683+
) (*Generator, error) {
680684
if config.NumNodes <= 0 {
681685
return nil, errors.Errorf(`NumNodes must be positive got: %d`, config.NumNodes)
682686
}
@@ -693,7 +697,11 @@ func MakeGenerator(config GeneratorConfig, replicasFn GetReplicasFn) (*Generator
693697
}
694698
for i := 1; i <= config.NumNodes; i++ {
695699
for j := 1; j <= config.NumNodes; j++ {
696-
if i == j {
700+
// In liveness mode, we don't allow adding partitions between the two
701+
// protected nodes (node 1 and node 2), so we don't include those
702+
// connections in the set of healthy connections at all.
703+
protectedConn := (i == 1 && j == 2) || (i == 2 && j == 1)
704+
if i == j || (mode == Liveness && protectedConn) {
697705
continue
698706
}
699707
conn := connection{from: i, to: j}
@@ -708,6 +716,7 @@ func MakeGenerator(config GeneratorConfig, replicasFn GetReplicasFn) (*Generator
708716
currentSplits: make(map[string]struct{}),
709717
historicalSplits: make(map[string]struct{}),
710718
partitions: p,
719+
mode: mode,
711720
}
712721
return g, nil
713722
}
@@ -745,6 +754,10 @@ type generator struct {
745754
// partitions contains the sets of healthy and partitioned connections
746755
// between nodes.
747756
partitions
757+
758+
// mode is the test mode (e.g. Liveness or Safety). The generator needs it in
759+
// order to set a timeout for range lookups under safety mode.
760+
mode TestMode
748761
}
749762

750763
type connection struct {
@@ -777,26 +790,45 @@ func (g *generator) RandStep(rng *rand.Rand) Step {
777790
}
778791

779792
key := randKey(rng)
780-
voters, nonVoters := g.replicasFn(roachpb.Key(key))
793+
var voters, nonVoters []roachpb.ReplicationTarget
794+
if g.mode == Safety {
795+
if err := timeutil.RunWithTimeout(context.Background(), "getting replicas", 3*time.Second,
796+
func(ctx context.Context) error {
797+
voters, nonVoters = g.replicasFn(ctx, roachpb.Key(key))
798+
return nil
799+
}); err != nil {
800+
voters, nonVoters = []roachpb.ReplicationTarget{}, []roachpb.ReplicationTarget{}
801+
}
802+
} else {
803+
voters, nonVoters = g.replicasFn(context.Background(), roachpb.Key(key))
804+
}
781805
numVoters, numNonVoters := len(voters), len(nonVoters)
782806
numReplicas := numVoters + numNonVoters
783807
if numReplicas < g.Config.NumNodes {
784-
addVoterFn := makeAddReplicaFn(key, voters, false /* atomicSwap */, true /* voter */)
785-
addOpGen(&allowed, addVoterFn, g.Config.Ops.ChangeReplicas.AddVotingReplica)
786-
addNonVoterFn := makeAddReplicaFn(key, nonVoters, false /* atomicSwap */, false /* voter */)
787-
addOpGen(&allowed, addNonVoterFn, g.Config.Ops.ChangeReplicas.AddNonVotingReplica)
808+
if len(voters) > 0 {
809+
addVoterFn := makeAddReplicaFn(key, voters, false /* atomicSwap */, true /* voter */)
810+
addOpGen(&allowed, addVoterFn, g.Config.Ops.ChangeReplicas.AddVotingReplica)
811+
}
812+
if len(nonVoters) > 0 {
813+
addNonVoterFn := makeAddReplicaFn(key, nonVoters, false /* atomicSwap */, false /* voter */)
814+
addOpGen(&allowed, addNonVoterFn, g.Config.Ops.ChangeReplicas.AddNonVotingReplica)
815+
}
788816
}
789817
if numReplicas == g.Config.NumReplicas && numReplicas < g.Config.NumNodes {
790-
atomicSwapVoterFn := makeAddReplicaFn(key, voters, true /* atomicSwap */, true /* voter */)
791-
addOpGen(&allowed, atomicSwapVoterFn, g.Config.Ops.ChangeReplicas.AtomicSwapVotingReplica)
818+
if len(voters) > 0 {
819+
atomicSwapVoterFn := makeAddReplicaFn(key, voters, true /* atomicSwap */, true /* voter */)
820+
addOpGen(&allowed, atomicSwapVoterFn, g.Config.Ops.ChangeReplicas.AtomicSwapVotingReplica)
821+
}
792822
if numNonVoters > 0 {
793823
atomicSwapNonVoterFn := makeAddReplicaFn(key, nonVoters, true /* atomicSwap */, false /* voter */)
794824
addOpGen(&allowed, atomicSwapNonVoterFn, g.Config.Ops.ChangeReplicas.AtomicSwapNonVotingReplica)
795825
}
796826
}
797827
if numReplicas > g.Config.NumReplicas {
798-
removeVoterFn := makeRemoveReplicaFn(key, voters, true /* voter */)
799-
addOpGen(&allowed, removeVoterFn, g.Config.Ops.ChangeReplicas.RemoveVotingReplica)
828+
if len(voters) > 0 {
829+
removeVoterFn := makeRemoveReplicaFn(key, voters, true /* voter */)
830+
addOpGen(&allowed, removeVoterFn, g.Config.Ops.ChangeReplicas.RemoveVotingReplica)
831+
}
800832
if numNonVoters > 0 {
801833
removeNonVoterFn := makeRemoveReplicaFn(key, nonVoters, false /* voter */)
802834
addOpGen(&allowed, removeNonVoterFn, g.Config.Ops.ChangeReplicas.RemoveNonVotingReplica)
@@ -810,8 +842,10 @@ func (g *generator) RandStep(rng *rand.Rand) Step {
810842
promoteNonVoterFn := makePromoteReplicaFn(key, nonVoters)
811843
addOpGen(&allowed, promoteNonVoterFn, g.Config.Ops.ChangeReplicas.PromoteReplica)
812844
}
813-
transferLeaseFn := makeTransferLeaseFn(key, append(voters, nonVoters...))
814-
addOpGen(&allowed, transferLeaseFn, g.Config.Ops.ChangeLease.TransferLease)
845+
if numVoters > 0 {
846+
transferLeaseFn := makeTransferLeaseFn(key, append(voters, nonVoters...))
847+
addOpGen(&allowed, transferLeaseFn, g.Config.Ops.ChangeLease.TransferLease)
848+
}
815849

816850
addOpGen(&allowed, setLeaseType, g.Config.Ops.ChangeSetting.SetLeaseType)
817851
addOpGen(&allowed, toggleGlobalReads, g.Config.Ops.ChangeZone.ToggleGlobalReads)

pkg/kv/kvnemesis/generator_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,11 +73,11 @@ func TestRandStep(t *testing.T) {
7373
config := newAllOperationsConfig()
7474
config.NumNodes, config.NumReplicas = 3, 2
7575
rng, _ := randutil.NewTestRand()
76-
getReplicasFn := func(_ roachpb.Key) ([]roachpb.ReplicationTarget, []roachpb.ReplicationTarget) {
76+
getReplicasFn := func(ctx context.Context, _ roachpb.Key) ([]roachpb.ReplicationTarget, []roachpb.ReplicationTarget) {
7777
return make([]roachpb.ReplicationTarget, rng.Intn(config.NumNodes)+1),
7878
make([]roachpb.ReplicationTarget, rng.Intn(config.NumNodes)+1)
7979
}
80-
g, err := MakeGenerator(config, getReplicasFn)
80+
g, err := MakeGenerator(config, getReplicasFn, 0)
8181
require.NoError(t, err)
8282

8383
keys := make(map[string]struct{})

pkg/kv/kvnemesis/kvnemesis.go

Lines changed: 63 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,13 @@ import (
1515
"reflect"
1616
"strings"
1717
"sync/atomic"
18+
"time"
1819

1920
"github.com/cockroachdb/cockroach/pkg/kv"
2021
"github.com/cockroachdb/cockroach/pkg/testutils/datapathutils"
2122
"github.com/cockroachdb/cockroach/pkg/util/ctxgroup"
2223
"github.com/cockroachdb/cockroach/pkg/util/log"
24+
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
2325
"github.com/cockroachdb/errors"
2426
)
2527

@@ -66,6 +68,25 @@ func l(ctx context.Context, basename string, format string, args ...interface{})
6668
return ""
6769
}
6870

71+
// TestMode defines how faults are inserted and validated.
72+
type TestMode int
73+
74+
const (
75+
// The default value of TestMode is 0, which corresponds to no faults.
76+
_ TestMode = iota
77+
// Safety mode is used to test for safety properties (i.e. serializability) in
78+
// the presence of unlimited faults. Unavailability errors are expected and
79+
// ignored.
80+
Safety = 1
81+
// Liveness mode is used to test for liveness properties (i.e. availability).
82+
// To do so in the presence of faults, the test will inject faults carefully,
83+
// ensuring a well-connected quorum of replicas is always available, and the
84+
// tests connects to one of the nodes in it. Without loss of generality, we
85+
// keep nodes 1 and 2 available and connected to each other.
86+
// TODO(mira): don't hardcode the protected nodes.
87+
Liveness = 2
88+
)
89+
6990
// RunNemesis generates and applies a series of Operations to exercise the KV
7091
// api. It returns a slice of the logical failures encountered.
7192
func RunNemesis(
@@ -75,6 +96,7 @@ func RunNemesis(
7596
config GeneratorConfig,
7697
concurrency int,
7798
numSteps int,
99+
mode TestMode,
78100
dbs ...*kv.DB,
79101
) ([]error, error) {
80102
if env.L != nil {
@@ -86,11 +108,17 @@ func RunNemesis(
86108

87109
dataSpan := GeneratorDataSpan()
88110

89-
g, err := MakeGenerator(config, newGetReplicasFn(dbs...))
111+
g, err := MakeGenerator(config, newGetReplicasFn(dbs...), mode)
90112
if err != nil {
91113
return nil, err
92114
}
93-
a := MakeApplier(env, dbs...)
115+
applierDBs := dbs
116+
// In Liveness mode, only nodes 1 and 2 are guaranteed to be available, so use
117+
// only the first two DBs to apply operations.
118+
if mode == Liveness && len(applierDBs) >= 2 {
119+
applierDBs = applierDBs[:2]
120+
}
121+
a := MakeApplier(env, applierDBs...)
94122
w, err := Watch(ctx, env, dbs, dataSpan)
95123
if err != nil {
96124
return nil, err
@@ -117,28 +145,35 @@ func RunNemesis(
117145
step.format(&buf, formatCtx{indent: ` ` + workerName + ` PRE `})
118146
l(ctx, basename, "%s", &buf)
119147
}
120-
121-
trace, err := a.Apply(ctx, &step)
122-
step.Trace = l(ctx, fmt.Sprintf("%s_trace", stepPrefix), "%s", trace.String())
123-
124-
stepsByWorker[workerIdx] = append(stepsByWorker[workerIdx], step)
125-
126-
prefix := ` OP `
127-
if err != nil {
128-
prefix = ` ERR `
129-
}
130-
131-
{
132-
var buf strings.Builder
133-
fmt.Fprintf(&buf, " before: %s", step.Before)
134-
step.format(&buf, formatCtx{indent: ` ` + workerName + prefix})
135-
fmt.Fprintf(&buf, "\n after: %s", step.After)
136-
l(ctx, basename, "%s", &buf)
137-
}
138-
139-
if err != nil {
148+
applyAndLogOp := func(ctx context.Context) error {
149+
trace, err := a.Apply(ctx, &step)
150+
step.Trace = l(ctx, fmt.Sprintf("%s_trace", stepPrefix), "%s", trace.String())
151+
152+
stepsByWorker[workerIdx] = append(stepsByWorker[workerIdx], step)
153+
154+
prefix := ` OP `
155+
if err != nil {
156+
prefix = ` ERR `
157+
}
158+
159+
{
160+
var buf strings.Builder
161+
fmt.Fprintf(&buf, " before: %s", step.Before)
162+
step.format(&buf, formatCtx{indent: ` ` + workerName + prefix})
163+
fmt.Fprintf(&buf, "\n after: %s", step.After)
164+
l(ctx, basename, "%s", &buf)
165+
}
140166
return err
141167
}
168+
if mode == Safety {
169+
if err = timeutil.RunWithTimeout(ctx, "applying op", 10*time.Second, applyAndLogOp); err != nil {
170+
return err
171+
}
172+
} else {
173+
if err = applyAndLogOp(ctx); err != nil {
174+
return err
175+
}
176+
}
142177
}
143178
return nil
144179
}
@@ -162,12 +197,16 @@ func RunNemesis(
162197
defer kvs.Close()
163198

164199
failures := Validate(allSteps, kvs, env.Tracker)
165-
166200
var filteredFailures []error
167201
for _, f := range failures {
168202
// ConditionFailedErrors are expected and can be ignored.
169203
canBeIgnored := exceptConditionFailed(f)
170-
if !canBeIgnored {
204+
// The following errors are expected in safety mode.
205+
canBeIgnoredSafety := mode == Safety &&
206+
(exceptReplicaUnavailable(f) || exceptAmbiguous(f) || exceptContextCanceled(f))
207+
// Ambiguous errors are expected in liveness mode.
208+
canBeIgnoredLiveness := mode == Liveness && exceptAmbiguous(f)
209+
if !canBeIgnored && !canBeIgnoredSafety && !canBeIgnoredLiveness {
171210
filteredFailures = append(filteredFailures, f)
172211
}
173212
}

0 commit comments

Comments
 (0)