Skip to content

Commit 232b919

Browse files
craig[bot]arulajmani
andcommitted
Merge #148781
148781: kvserver: clear RHS when learning about a split through a snapshot r=pav-kv a=arulajmani Previously, it was possible for us to leak the (post-split) RHS keyspans if we learned about a split through a snapshot. This leak could be long lived if the RHS replica was moved away from the store, as the store wouldn't eventually get a snapshot for the RHS which would force the keyspans to be cleared out. This hazard was shown in #148707. We know there's been a split if the store receives a snapshot that's narrower than the replica it was previously tracking. In the general case, when there are merges involved, a store knows that there's been a N merges followed by a split if there are N (necessarily contiguous) overlapping (with the snapshot) replicas whose keyspan is narrower than that of the snapshot. Interestingly, we were handling the general case correctly, but it required for there to be at least one merge before the split. This patch lifts that logic from clearSubsumedReplicaDiskData and moves it to a similarly named clearSplitReplicaDiskData, tying the narrowing logic more directly to splits and not confusing it with merges. Closes #73462 Release note: None Co-authored-by: Arul Ajmani <[email protected]>
2 parents 47aa793 + 1ca06e4 commit 232b919

File tree

4 files changed

+137
-83
lines changed

4 files changed

+137
-83
lines changed

pkg/kv/kvserver/client_split_test.go

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4566,8 +4566,6 @@ func TestStoreRangeSplitRaftSnapshotAfterRHSRebalanced(t *testing.T) {
45664566
defer leaktest.AfterTest(t)()
45674567
defer log.Scope(t).Close(t)
45684568

4569-
skip.WithIssue(t, 73462)
4570-
45714569
ctx := context.Background()
45724570
// Start a 5 node cluster.
45734571
tc := testcluster.StartTestCluster(t, 5, base.TestClusterArgs{

pkg/kv/kvserver/replica_raftstorage.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ package kvserver
77

88
import (
99
"context"
10+
"slices"
1011
"time"
1112

1213
"github.com/cockroachdb/cockroach/pkg/keys"
@@ -24,6 +25,7 @@ import (
2425
"github.com/cockroachdb/cockroach/pkg/storage"
2526
"github.com/cockroachdb/cockroach/pkg/storage/fs"
2627
"github.com/cockroachdb/cockroach/pkg/util"
28+
"github.com/cockroachdb/cockroach/pkg/util/buildutil"
2729
"github.com/cockroachdb/cockroach/pkg/util/hlc"
2830
"github.com/cockroachdb/cockroach/pkg/util/humanizeutil"
2931
"github.com/cockroachdb/cockroach/pkg/util/log"
@@ -587,6 +589,13 @@ func (r *Replica) applySnapshotRaftMuLocked(
587589
subsume = append(subsume, destroyReplicaInfo{id: sr.ID(), desc: srDesc})
588590
}
589591

592+
// NB: subsumedDescs in snapWriteBuilder must be sorted by start key. This
593+
// should be the case, by construction, but add a test-only assertion just in
594+
// case this ever changes.
595+
testingAssert(slices.IsSortedFunc(subsume, func(a, b destroyReplicaInfo) int {
596+
return a.desc.StartKey.Compare(b.desc.StartKey)
597+
}), "subsumedDescs must be sorted by start key")
598+
590599
sb := snapWriteBuilder{
591600
id: r.ID(),
592601

@@ -597,6 +606,7 @@ func (r *Replica) applySnapshotRaftMuLocked(
597606
truncState: truncState,
598607
hardState: hs,
599608
desc: desc,
609+
origDesc: r.shMu.state.Desc,
600610
subsume: subsume,
601611

602612
cleared: inSnap.clearedSpans,
@@ -826,3 +836,9 @@ func (r *Replica) clearSubsumedReplicaInMemoryData(
826836
}
827837
return phs, nil
828838
}
839+
840+
func testingAssert(cond bool, msg string) {
841+
if buildutil.CrdbTestBuild && !cond {
842+
panic(msg)
843+
}
844+
}

pkg/kv/kvserver/snapshot_apply_prepare.go

Lines changed: 120 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,11 @@ type snapWriteBuilder struct {
3939

4040
truncState kvserverpb.RaftTruncatedState
4141
hardState raftpb.HardState
42-
desc *roachpb.RangeDescriptor
43-
subsume []destroyReplicaInfo
42+
desc *roachpb.RangeDescriptor // corresponds to the range descriptor in the snapshot
43+
origDesc *roachpb.RangeDescriptor // pre-snapshot range descriptor
44+
// NB: subsume, if set, must be in sorted (by destroyReplicaInfo.desc start
45+
// key) order.
46+
subsume []destroyReplicaInfo
4447

4548
// cleared contains the spans that this snapshot application clears before
4649
// writing new state on top.
@@ -54,7 +57,12 @@ func (s *snapWriteBuilder) prepareSnapApply(ctx context.Context) error {
5457
return err
5558
}
5659
_ = applySnapshotTODO // 1.2 + 2.1 + 2.2 + 2.3 (diff) + 3.2
57-
return s.clearSubsumedReplicaDiskData(ctx)
60+
if err := s.clearSubsumedReplicaDiskData(ctx); err != nil {
61+
return err
62+
}
63+
64+
_ = applySnapshotTODO // 2.3 (split)
65+
return s.clearResidualDataOnNarrowSnapshot(ctx)
5866
}
5967

6068
// rewriteRaftState clears and rewrites the unreplicated rangeID-local key space
@@ -104,25 +112,50 @@ func (s *snapWriteBuilder) rewriteRaftState(ctx context.Context, w storage.Write
104112
// the Reader reflects the latest I/O each of the subsumed replicas has done
105113
// (i.e. Reader was instantiated after all raftMu were acquired).
106114
//
107-
// NB: does nothing if subsumedDescs is empty.
115+
// NB: does nothing if s.subsumedDescs is empty.
108116
func (s *snapWriteBuilder) clearSubsumedReplicaDiskData(ctx context.Context) error {
117+
if len(s.subsume) == 0 {
118+
return nil // no subsumed replicas to speak of; early return
119+
}
120+
// NB: The snapshot must never subsume a replica that extends the range of the
121+
// replica to the left. This is because splits and merges (the only operation
122+
// that change the key bounds) always leave the start key intact. Extending to
123+
// the left implies that either we merged "to the left" (we don't), or that
124+
// we're applying a snapshot for another range (we don't do that either).
125+
// Something is severely wrong for this to happen, so perform a sanity check.
126+
if s.subsume[0].desc.StartKey.Compare(s.desc.StartKey) < 0 { // subsumedDescs are sorted by StartKey
127+
log.Dev.Fatalf(ctx,
128+
"subsuming replica to our left; subsumed desc start key: %v; snapshot desc start key %v",
129+
s.subsume[0].desc.StartKey, s.desc.StartKey,
130+
)
131+
}
132+
133+
// In the common case, the subsumed replicas' end key does not extend beyond
134+
// the snapshot end key (sn <= b):
135+
//
136+
// subsumed: [a---s1---...---sn)
137+
// snapshot: [a---------------b)
138+
// or snapshot: [a-------------------b)
139+
//
140+
// In this case, we do not need to clear the range-local, lock table, and user
141+
// keys owned by subsumed replicas, since clearing [a, b) will cover it. We
142+
// only need to clear the per-replica RangeID-local key spans here.
143+
//
144+
// This leaves the only other case, where the subsumed replicas extend past
145+
// the snapshot's end key (s[n-1] < b < sn):
146+
//
147+
// subsumed: [a---s1---...---s[n-1]---sn)
148+
// snapshot: [a--------------------b)
149+
//
150+
// This is only possible if we're not only learning about merges through the
151+
// snapshot, but also a split -- that's the only way the bounds of the
152+
// snapshot can be narrower than the bounds of all the subsumed replicas. In
153+
// this case, we do need to clear range-local, lock table, and user keys in
154+
// the span [b, sn). We do this in
155+
// clearResidualDataOnNarrowSnapshot, not here.
156+
109157
// TODO(sep-raft-log): need different readers for raft and state engine.
110158
reader := storage.Reader(s.todoEng)
111-
112-
// NB: we don't clear RangeID local key spans here. That happens
113-
// via the call to DestroyReplica.
114-
getKeySpans := func(d *roachpb.RangeDescriptor) []roachpb.Span {
115-
return rditer.Select(d.RangeID, rditer.SelectOpts{
116-
Ranged: rditer.SelectRangedOptions{
117-
RSpan: d.RSpan(),
118-
SystemKeys: true,
119-
UserKeys: true,
120-
LockTable: true,
121-
},
122-
})
123-
}
124-
keySpans := getKeySpans(s.desc)
125-
totalKeySpans := append([]roachpb.Span(nil), keySpans...)
126159
for _, sub := range s.subsume {
127160
// We have to create an SST for the subsumed replica's range-id local keys.
128161
if err := s.writeSST(ctx, func(ctx context.Context, w storage.Writer) error {
@@ -138,83 +171,89 @@ func (s *snapWriteBuilder) clearSubsumedReplicaDiskData(ctx context.Context) err
138171
ReplicatedByRangeID: opts.ClearReplicatedByRangeID,
139172
UnreplicatedByRangeID: opts.ClearUnreplicatedByRangeID,
140173
})...)
174+
// NB: Actually clear RangeID local key spans.
141175
return kvstorage.DestroyReplica(ctx, sub.id, reader, w, mergedTombstoneReplicaID, opts)
142176
}); err != nil {
143177
return err
144178
}
145-
146-
srKeySpans := getKeySpans(sub.desc)
147-
// Compute the total key space covered by the current replica and all
148-
// subsumed replicas.
149-
for i := range srKeySpans {
150-
if srKeySpans[i].Key.Compare(totalKeySpans[i].Key) < 0 {
151-
totalKeySpans[i].Key = srKeySpans[i].Key
152-
}
153-
if srKeySpans[i].EndKey.Compare(totalKeySpans[i].EndKey) > 0 {
154-
totalKeySpans[i].EndKey = srKeySpans[i].EndKey
155-
}
156-
}
157179
}
158180

159-
// We might have to create SSTs for the range local keys, lock table keys,
160-
// and user keys depending on if the subsumed replicas are not fully
161-
// contained by the replica in our snapshot. The following is an example to
162-
// this case happening.
163-
//
164-
// a b c d
165-
// |---1---|-------2-------| S1
166-
// |---1-------------------| S2
167-
// |---1-----------|---3---| S3
168-
//
169-
// Since the merge is the first operation to happen, a follower could be down
170-
// before it completes. It is reasonable for a snapshot for r1 from S3 to
171-
// subsume both r1 and r2 in S1.
172-
for i := range keySpans {
173-
// The snapshot must never subsume a replica that extends the range of the
174-
// replica to the left. This is because splits and merges (the only
175-
// operation that change the key bounds) always leave the start key intact.
176-
// Extending to the left implies that either we merged "to the left" (we
177-
// don't), or that we're applying a snapshot for another range (we don't do
178-
// that either). Something is severely wrong for this to happen.
179-
if totalKeySpans[i].Key.Compare(keySpans[i].Key) < 0 {
180-
log.Dev.Fatalf(ctx, "subsuming replica to our left; key span: %v; total key span %v",
181-
keySpans[i], totalKeySpans[i])
182-
}
181+
return nil
182+
}
183183

184-
// In the comments below, s1, ..., sn are the end keys for the subsumed
185-
// replicas (for the current keySpan).
186-
// Note that if there aren't any subsumed replicas (the common case), the
187-
// next comparison is always zero and this loop is a no-op.
188-
189-
if totalKeySpans[i].EndKey.Compare(keySpans[i].EndKey) <= 0 {
190-
// The subsumed replicas are fully contained in the snapshot:
191-
//
192-
// [a---s1---...---sn)
193-
// [a---------------------b)
194-
//
195-
// We don't need to clear additional keyspace here, since clearing `[a,b)`
196-
// will also clear all subsumed replicas.
197-
continue
198-
}
184+
// clearResidualDataOnNarrowSnapshot clears the overlapping replicas' data not
185+
// covered by the snapshot. Specifically, the data between the snapshot's end
186+
// key and that of the keyspace covered by s.origDesc + s.subsume.descs, if the
187+
// former is lower (i.e. the snapshot is "narrower"). Note that the start keys
188+
// of the snapshot and s.origDesc[^1] match, so the residual data may only exist
189+
// between the end keys. clearResidualDataOnNarrowSnapshot is a no-op if there
190+
// is no narrowing business to speak of.
191+
//
192+
// Visually, the picture looks as follows:
193+
//
194+
// The simplest case is when the snapshot isn't subsuming any replicas.
195+
// original descriptor: [a-----------------------------c)
196+
// snapshot descriptor: [a---------------------b)
197+
// cleared: [b, c)
198+
//
199+
// In the more general case[^2]:
200+
//
201+
// store descriptors: [a----------------s1---...---sn)
202+
// snapshot descriptor: [a---------------------b)
203+
// cleared: [b, sn)
204+
//
205+
// Practically speaking, the simple case above corresponds to a replica learning
206+
// about a split through a snapshot. The more general case corresponds to a
207+
// replica learning about a series of merges and at least one split through the
208+
// snapshot.
209+
//
210+
// [1] Assuming s.origDesc is initialized. If it isn't, and we're applying an
211+
// initial snapshot, the snapshot may still be narrower than the end key of the
212+
// right-most subsumed replica.
213+
//
214+
// [2] In the diagram , S1...Sn correspond to subsumed replicas with end keys
215+
// s1...sn respectively. These are all replicas on the store that overlap with
216+
// the snapshot descriptor, covering the range [a,b), and the right-most
217+
// descriptor is that of replica Sn.
218+
func (s *snapWriteBuilder) clearResidualDataOnNarrowSnapshot(ctx context.Context) error {
219+
if !s.origDesc.IsInitialized() && len(s.subsume) == 0 {
220+
// Early return in the case where we're ingesting an initial snapshot and
221+
// there are no subsumed replicas that the snapshot could be making
222+
// narrower.
223+
return nil
224+
}
199225

200-
// The subsumed replicas extend past the snapshot:
201-
//
202-
// [a----------------s1---...---sn)
203-
// [a---------------------b)
204-
//
205-
// We need to additionally clear [b,sn).
226+
rightMostDesc := s.origDesc
227+
if len(s.subsume) != 0 {
228+
// NB: s.subsume are non-overlapping and sorted by start key. Pick the last
229+
// one to determine whether the snapshot is narrowing the keyspace or not.
230+
rightMostDesc = s.subsume[len(s.subsume)-1].desc
231+
}
206232

233+
if rightMostDesc.EndKey.Compare(s.desc.EndKey) <= 0 {
234+
return nil // we aren't narrowing anything; no-op
235+
}
236+
237+
// TODO(sep-raft-log): read from the state machine engine here.
238+
reader := storage.Reader(s.todoEng)
239+
for _, span := range rditer.Select(0, rditer.SelectOpts{
240+
Ranged: rditer.SelectRangedOptions{RSpan: roachpb.RSpan{
241+
Key: s.desc.EndKey, EndKey: rightMostDesc.EndKey,
242+
},
243+
SystemKeys: true,
244+
LockTable: true,
245+
UserKeys: true,
246+
},
247+
}) {
207248
if err := s.writeSST(ctx, func(ctx context.Context, w storage.Writer) error {
208249
return storage.ClearRangeWithHeuristic(
209-
ctx, reader, w,
210-
keySpans[i].EndKey, totalKeySpans[i].EndKey,
211-
kvstorage.ClearRangeThresholdPointKeys,
250+
ctx, reader, w, span.Key, span.EndKey, kvstorage.ClearRangeThresholdPointKeys,
212251
)
213252
}); err != nil {
214253
return err
215254
}
216-
s.cleared = append(s.cleared,
217-
roachpb.Span{Key: keySpans[i].EndKey, EndKey: totalKeySpans[i].EndKey})
255+
s.cleared = append(s.cleared, span)
218256
}
257+
219258
return nil
220259
}

pkg/kv/kvserver/snapshot_apply_prepare_test.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,7 @@ func TestPrepareSnapApply(t *testing.T) {
9292
truncState: kvserverpb.RaftTruncatedState{Index: 100, Term: 20},
9393
hardState: raftpb.HardState{Term: 20, Commit: 100},
9494
desc: desc(id.RangeID, "a", "k"),
95+
origDesc: desc(id.RangeID, "a", "k"),
9596
subsume: []destroyReplicaInfo{
9697
{id: roachpb.FullReplicaID{RangeID: descA.RangeID, ReplicaID: replicaID}, desc: descA},
9798
{id: roachpb.FullReplicaID{RangeID: descB.RangeID, ReplicaID: replicaID}, desc: descB},

0 commit comments

Comments
 (0)