Skip to content

Commit f26dd5b

Browse files
craig[bot], arulajmani, DarrylWong
committed
148707: kvserver: introduce TestStoreRangeSplitRaftSnapshotAfterRHSRebalanced r=arulajmani a=arulajmani This patch adds a new test, TestStoreRangeSplitRaftSnapshotAfterRHSRebalanced, to show that the hazard described in #73462 is real. In particular, when a replica learns about a split through a snapshot after the post-split RHS has been rebalanced away, we leak the RHS's on-disk data. References #73462 Release note: None 148750: roachtest: fix failover disk stall r=golgeek a=DarrylWong This fixes the failover disk staller to be compatible with the FI library. Specifically, it no longer attempts to recover in cleanup, as this is already handled by the FI framework. Additionally, it temporarily prevents concurrent disk stall failures from being injected, because the FI framework does not allow concurrent failures to be injected without first recovering from them. Instead, they should be injected in one shot. Fixes: #148739 Fixes: #148742 Fixes: #148741 Fixes: #148740 Fixes: #148738 Fixes: #148737 Release note: none Co-authored-by: Arul Ajmani <[email protected]> Co-authored-by: DarrylWong <[email protected]>
3 parents 1250b4e + 5446eec + d37deb6 commit f26dd5b

File tree

4 files changed

+512
-344
lines changed

4 files changed

+512
-344
lines changed

pkg/cmd/roachtest/tests/failover.go

Lines changed: 12 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -1594,18 +1594,24 @@ type diskStallFailer struct {
15941594
staller diskStaller
15951595
}
15961596

1597-
func (f *diskStallFailer) Mode() failureMode { return failureModeDiskStall }
1598-
func (f *diskStallFailer) String() string { return string(f.Mode()) }
1599-
func (f *diskStallFailer) CanUseLocal() bool { return false } // needs dmsetup
1600-
func (f *diskStallFailer) CanUseChaos() bool { return true }
1601-
func (f *diskStallFailer) CanRunWith(failureMode) bool { return true }
1597+
func (f *diskStallFailer) Mode() failureMode { return failureModeDiskStall }
1598+
func (f *diskStallFailer) String() string { return string(f.Mode()) }
1599+
func (f *diskStallFailer) CanUseLocal() bool { return false } // needs dmsetup
1600+
func (f *diskStallFailer) CanUseChaos() bool { return true }
1601+
1602+
// CanRunWith returns false for other disk stalls, as the FI library it uses
1603+
// does not allow concurrent failure modes to be injected without recovering
1604+
// from them first.
1605+
// TODO(darryl): This is a temporary workaround to reduce test failure noise.
1606+
// We should fix this by merging concurrent disk stall failures and injecting
1607+
// them in one shot.
1608+
func (f *diskStallFailer) CanRunWith(other failureMode) bool { return other != failureModeDiskStall }
16021609

16031610
func (f *diskStallFailer) Setup(ctx context.Context) {
16041611
f.staller.Setup(ctx)
16051612
}
16061613

16071614
func (f *diskStallFailer) Cleanup(ctx context.Context) {
1608-
f.staller.Unstall(ctx, f.c.All())
16091615
// We have to stop the cluster before cleaning up the staller.
16101616
f.m.ExpectDeaths(int32(f.c.Spec().NodeCount))
16111617
f.c.Stop(ctx, f.t.L(), option.DefaultStopOpts(), f.c.All())

pkg/kv/kvserver/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -576,6 +576,7 @@ go_test(
576576
"@com_github_stretchr_testify//require",
577577
"@org_golang_google_grpc//:grpc",
578578
"@org_golang_google_grpc//metadata",
579+
"@org_golang_x_exp//maps",
579580
"@org_golang_x_sync//errgroup",
580581
"@org_golang_x_sync//syncmap",
581582
],

0 commit comments

Comments
 (0)