Skip to content

Commit 1ca06e4

Browse files
committed
kvserver: clear RHS when learning about a split through a snapshot
Previously, it was possible for us to leak keyspans on receiving a narrower snapshot. This could happen if the end key of the keyspace covered by the original descriptor + any subsumed descriptors was less than the end key of the snapshot. This hazard was shown in #148707. Practically speaking, the simplest case that's fixed here corresponds to a replica learning about a split through the snapshot. The more general case involves the replica learning about a series of merges and at least one split through the snapshot. Closes #73462 Release note: None
1 parent 47aa793 commit 1ca06e4

File tree

4 files changed

+137
-83
lines changed

4 files changed

+137
-83
lines changed

pkg/kv/kvserver/client_split_test.go

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4566,8 +4566,6 @@ func TestStoreRangeSplitRaftSnapshotAfterRHSRebalanced(t *testing.T) {
45664566
defer leaktest.AfterTest(t)()
45674567
defer log.Scope(t).Close(t)
45684568

4569-
skip.WithIssue(t, 73462)
4570-
45714569
ctx := context.Background()
45724570
// Start a 5 node cluster.
45734571
tc := testcluster.StartTestCluster(t, 5, base.TestClusterArgs{

pkg/kv/kvserver/replica_raftstorage.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ package kvserver
77

88
import (
99
"context"
10+
"slices"
1011
"time"
1112

1213
"github.com/cockroachdb/cockroach/pkg/keys"
@@ -24,6 +25,7 @@ import (
2425
"github.com/cockroachdb/cockroach/pkg/storage"
2526
"github.com/cockroachdb/cockroach/pkg/storage/fs"
2627
"github.com/cockroachdb/cockroach/pkg/util"
28+
"github.com/cockroachdb/cockroach/pkg/util/buildutil"
2729
"github.com/cockroachdb/cockroach/pkg/util/hlc"
2830
"github.com/cockroachdb/cockroach/pkg/util/humanizeutil"
2931
"github.com/cockroachdb/cockroach/pkg/util/log"
@@ -587,6 +589,13 @@ func (r *Replica) applySnapshotRaftMuLocked(
587589
subsume = append(subsume, destroyReplicaInfo{id: sr.ID(), desc: srDesc})
588590
}
589591

592+
// NB: subsumedDescs in snapWriteBuilder must be sorted by start key. This
593+
// should be the case, by construction, but add a test-only assertion just in
594+
// case this ever changes.
595+
testingAssert(slices.IsSortedFunc(subsume, func(a, b destroyReplicaInfo) int {
596+
return a.desc.StartKey.Compare(b.desc.StartKey)
597+
}), "subsumedDescs must be sorted by start key")
598+
590599
sb := snapWriteBuilder{
591600
id: r.ID(),
592601

@@ -597,6 +606,7 @@ func (r *Replica) applySnapshotRaftMuLocked(
597606
truncState: truncState,
598607
hardState: hs,
599608
desc: desc,
609+
origDesc: r.shMu.state.Desc,
600610
subsume: subsume,
601611

602612
cleared: inSnap.clearedSpans,
@@ -826,3 +836,9 @@ func (r *Replica) clearSubsumedReplicaInMemoryData(
826836
}
827837
return phs, nil
828838
}
839+
840+
func testingAssert(cond bool, msg string) {
841+
if buildutil.CrdbTestBuild && !cond {
842+
panic(msg)
843+
}
844+
}

pkg/kv/kvserver/snapshot_apply_prepare.go

Lines changed: 120 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,11 @@ type snapWriteBuilder struct {
3939

4040
truncState kvserverpb.RaftTruncatedState
4141
hardState raftpb.HardState
42-
desc *roachpb.RangeDescriptor
43-
subsume []destroyReplicaInfo
42+
desc *roachpb.RangeDescriptor // corresponds to the range descriptor in the snapshot
43+
origDesc *roachpb.RangeDescriptor // pre-snapshot range descriptor
44+
// NB: subsume, if set, must be in sorted (by destroyReplicaInfo.desc start
45+
// key) order.
46+
subsume []destroyReplicaInfo
4447

4548
// cleared contains the spans that this snapshot application clears before
4649
// writing new state on top.
@@ -54,7 +57,12 @@ func (s *snapWriteBuilder) prepareSnapApply(ctx context.Context) error {
5457
return err
5558
}
5659
_ = applySnapshotTODO // 1.2 + 2.1 + 2.2 + 2.3 (diff) + 3.2
57-
return s.clearSubsumedReplicaDiskData(ctx)
60+
if err := s.clearSubsumedReplicaDiskData(ctx); err != nil {
61+
return err
62+
}
63+
64+
_ = applySnapshotTODO // 2.3 (split)
65+
return s.clearResidualDataOnNarrowSnapshot(ctx)
5866
}
5967

6068
// rewriteRaftState clears and rewrites the unreplicated rangeID-local key space
@@ -104,25 +112,50 @@ func (s *snapWriteBuilder) rewriteRaftState(ctx context.Context, w storage.Write
104112
// the Reader reflects the latest I/O each of the subsumed replicas has done
105113
// (i.e. Reader was instantiated after all raftMu were acquired).
106114
//
107-
// NB: does nothing if subsumedDescs is empty.
115+
// NB: does nothing if s.subsumedDescs is empty.
108116
func (s *snapWriteBuilder) clearSubsumedReplicaDiskData(ctx context.Context) error {
117+
if len(s.subsume) == 0 {
118+
return nil // no subsumed replicas to speak of; early return
119+
}
120+
// NB: The snapshot must never subsume a replica that extends the range of the
121+
// replica to the left. This is because splits and merges (the only operation
122+
// that change the key bounds) always leave the start key intact. Extending to
123+
// the left implies that either we merged "to the left" (we don't), or that
124+
// we're applying a snapshot for another range (we don't do that either).
125+
// Something is severely wrong for this to happen, so perform a sanity check.
126+
if s.subsume[0].desc.StartKey.Compare(s.desc.StartKey) < 0 { // subsumedDescs are sorted by StartKey
127+
log.Dev.Fatalf(ctx,
128+
"subsuming replica to our left; subsumed desc start key: %v; snapshot desc start key %v",
129+
s.subsume[0].desc.StartKey, s.desc.StartKey,
130+
)
131+
}
132+
133+
// In the common case, the subsumed replicas' end key does not extend beyond
134+
// the snapshot end key (sn <= b):
135+
//
136+
// subsumed: [a---s1---...---sn)
137+
// snapshot: [a---------------b)
138+
// or snapshot: [a-------------------b)
139+
//
140+
// In this case, we do not need to clear the range-local, lock table, and user
141+
// keys owned by subsumed replicas, since clearing [a, b) will cover it. We
142+
// only need to clear the per-replica RangeID-local key spans here.
143+
//
144+
// This leaves the only other case, where the subsumed replicas extend past
145+
// the snapshot's end key (s[n-1] < b < sn):
146+
//
147+
// subsumed: [a---s1---...---s[n-1]---sn)
148+
// snapshot: [a--------------------b)
149+
//
150+
// This is only possible if we're not only learning about merges through the
151+
// snapshot, but also a split -- that's the only way the bounds of the
152+
// snapshot can be narrower than the bounds of all the subsumed replicas. In
153+
// this case, we do need to clear range-local, lock table, and user keys in
154+
// the span [b, sn). We do this in
155+
// clearResidualDataOnNarrowSnapshot, not here.
156+
109157
// TODO(sep-raft-log): need different readers for raft and state engine.
110158
reader := storage.Reader(s.todoEng)
111-
112-
// NB: we don't clear RangeID local key spans here. That happens
113-
// via the call to DestroyReplica.
114-
getKeySpans := func(d *roachpb.RangeDescriptor) []roachpb.Span {
115-
return rditer.Select(d.RangeID, rditer.SelectOpts{
116-
Ranged: rditer.SelectRangedOptions{
117-
RSpan: d.RSpan(),
118-
SystemKeys: true,
119-
UserKeys: true,
120-
LockTable: true,
121-
},
122-
})
123-
}
124-
keySpans := getKeySpans(s.desc)
125-
totalKeySpans := append([]roachpb.Span(nil), keySpans...)
126159
for _, sub := range s.subsume {
127160
// We have to create an SST for the subsumed replica's range-id local keys.
128161
if err := s.writeSST(ctx, func(ctx context.Context, w storage.Writer) error {
@@ -138,83 +171,89 @@ func (s *snapWriteBuilder) clearSubsumedReplicaDiskData(ctx context.Context) err
138171
ReplicatedByRangeID: opts.ClearReplicatedByRangeID,
139172
UnreplicatedByRangeID: opts.ClearUnreplicatedByRangeID,
140173
})...)
174+
// NB: Actually clear RangeID local key spans.
141175
return kvstorage.DestroyReplica(ctx, sub.id, reader, w, mergedTombstoneReplicaID, opts)
142176
}); err != nil {
143177
return err
144178
}
145-
146-
srKeySpans := getKeySpans(sub.desc)
147-
// Compute the total key space covered by the current replica and all
148-
// subsumed replicas.
149-
for i := range srKeySpans {
150-
if srKeySpans[i].Key.Compare(totalKeySpans[i].Key) < 0 {
151-
totalKeySpans[i].Key = srKeySpans[i].Key
152-
}
153-
if srKeySpans[i].EndKey.Compare(totalKeySpans[i].EndKey) > 0 {
154-
totalKeySpans[i].EndKey = srKeySpans[i].EndKey
155-
}
156-
}
157179
}
158180

159-
// We might have to create SSTs for the range local keys, lock table keys,
160-
// and user keys depending on if the subsumed replicas are not fully
161-
// contained by the replica in our snapshot. The following is an example to
162-
// this case happening.
163-
//
164-
// a b c d
165-
// |---1---|-------2-------| S1
166-
// |---1-------------------| S2
167-
// |---1-----------|---3---| S3
168-
//
169-
// Since the merge is the first operation to happen, a follower could be down
170-
// before it completes. It is reasonable for a snapshot for r1 from S3 to
171-
// subsume both r1 and r2 in S1.
172-
for i := range keySpans {
173-
// The snapshot must never subsume a replica that extends the range of the
174-
// replica to the left. This is because splits and merges (the only
175-
// operation that change the key bounds) always leave the start key intact.
176-
// Extending to the left implies that either we merged "to the left" (we
177-
// don't), or that we're applying a snapshot for another range (we don't do
178-
// that either). Something is severely wrong for this to happen.
179-
if totalKeySpans[i].Key.Compare(keySpans[i].Key) < 0 {
180-
log.Dev.Fatalf(ctx, "subsuming replica to our left; key span: %v; total key span %v",
181-
keySpans[i], totalKeySpans[i])
182-
}
181+
return nil
182+
}
183183

184-
// In the comments below, s1, ..., sn are the end keys for the subsumed
185-
// replicas (for the current keySpan).
186-
// Note that if there aren't any subsumed replicas (the common case), the
187-
// next comparison is always zero and this loop is a no-op.
188-
189-
if totalKeySpans[i].EndKey.Compare(keySpans[i].EndKey) <= 0 {
190-
// The subsumed replicas are fully contained in the snapshot:
191-
//
192-
// [a---s1---...---sn)
193-
// [a---------------------b)
194-
//
195-
// We don't need to clear additional keyspace here, since clearing `[a,b)`
196-
// will also clear all subsumed replicas.
197-
continue
198-
}
184+
// clearResidualDataOnNarrowSnapshot clears the overlapping replicas' data not
185+
// covered by the snapshot. Specifically, the data between the snapshot's end
186+
// key and that of the keyspace covered by s.origDesc + s.subsume.descs, if the
187+
// former is lower (i.e. the snapshot is "narrower"). Note that the start keys
188+
// of the snapshot and s.origDesc[^1] match, so the residual data may only exist
189+
// between the end keys. clearResidualDataOnNarrowSnapshot is a no-op if there
190+
// is no narrowing business to speak of.
191+
//
192+
// Visually, the picture looks as follows:
193+
//
194+
// The simplest case is when the snapshot isn't subsuming any replicas.
195+
// original descriptor: [a-----------------------------c)
196+
// snapshot descriptor: [a---------------------b)
197+
// cleared: [b, c)
198+
//
199+
// In the more general case[^2]:
200+
//
201+
// store descriptors: [a----------------s1---...---sn)
202+
// snapshot descriptor: [a---------------------b)
203+
// cleared: [b, sn)
204+
//
205+
// Practically speaking, the simple case above corresponds to a replica learning
206+
// about a split through a snapshot. The more general case corresponds to a
207+
// replica learning about a series of merges and at least one split through the
208+
// snapshot.
209+
//
210+
// [1] Assuming s.origDesc is initialized. If it isn't, and we're applying an
211+
// initial snapshot, the snapshot may still be narrower than the end key of the
212+
// right-most subsumed replica.
213+
//
214+
// [2] In the diagram , S1...Sn correspond to subsumed replicas with end keys
215+
// s1...sn respectively. These are all replicas on the store that overlap with
216+
// the snapshot descriptor, covering the range [a,b), and the right-most
217+
// descriptor is that of replica Sn.
218+
func (s *snapWriteBuilder) clearResidualDataOnNarrowSnapshot(ctx context.Context) error {
219+
if !s.origDesc.IsInitialized() && len(s.subsume) == 0 {
220+
// Early return in the case where we're ingesting an initial snapshot and
221+
// there are no subsumed replicas that the snapshot could be making
222+
// narrower.
223+
return nil
224+
}
199225

200-
// The subsumed replicas extend past the snapshot:
201-
//
202-
// [a----------------s1---...---sn)
203-
// [a---------------------b)
204-
//
205-
// We need to additionally clear [b,sn).
226+
rightMostDesc := s.origDesc
227+
if len(s.subsume) != 0 {
228+
// NB: s.subsume are non-overlapping and sorted by start key. Pick the last
229+
// one to determine whether the snapshot is narrowing the keyspace or not.
230+
rightMostDesc = s.subsume[len(s.subsume)-1].desc
231+
}
206232

233+
if rightMostDesc.EndKey.Compare(s.desc.EndKey) <= 0 {
234+
return nil // we aren't narrowing anything; no-op
235+
}
236+
237+
// TODO(sep-raft-log): read from the state machine engine here.
238+
reader := storage.Reader(s.todoEng)
239+
for _, span := range rditer.Select(0, rditer.SelectOpts{
240+
Ranged: rditer.SelectRangedOptions{RSpan: roachpb.RSpan{
241+
Key: s.desc.EndKey, EndKey: rightMostDesc.EndKey,
242+
},
243+
SystemKeys: true,
244+
LockTable: true,
245+
UserKeys: true,
246+
},
247+
}) {
207248
if err := s.writeSST(ctx, func(ctx context.Context, w storage.Writer) error {
208249
return storage.ClearRangeWithHeuristic(
209-
ctx, reader, w,
210-
keySpans[i].EndKey, totalKeySpans[i].EndKey,
211-
kvstorage.ClearRangeThresholdPointKeys,
250+
ctx, reader, w, span.Key, span.EndKey, kvstorage.ClearRangeThresholdPointKeys,
212251
)
213252
}); err != nil {
214253
return err
215254
}
216-
s.cleared = append(s.cleared,
217-
roachpb.Span{Key: keySpans[i].EndKey, EndKey: totalKeySpans[i].EndKey})
255+
s.cleared = append(s.cleared, span)
218256
}
257+
219258
return nil
220259
}

pkg/kv/kvserver/snapshot_apply_prepare_test.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,7 @@ func TestPrepareSnapApply(t *testing.T) {
9292
truncState: kvserverpb.RaftTruncatedState{Index: 100, Term: 20},
9393
hardState: raftpb.HardState{Term: 20, Commit: 100},
9494
desc: desc(id.RangeID, "a", "k"),
95+
origDesc: desc(id.RangeID, "a", "k"),
9596
subsume: []destroyReplicaInfo{
9697
{id: roachpb.FullReplicaID{RangeID: descA.RangeID, ReplicaID: replicaID}, desc: descA},
9798
{id: roachpb.FullReplicaID{RangeID: descB.RangeID, ReplicaID: replicaID}, desc: descB},

0 commit comments

Comments
 (0)