@@ -39,8 +39,11 @@ type snapWriteBuilder struct {
39
39
40
40
truncState kvserverpb.RaftTruncatedState
41
41
hardState raftpb.HardState
42
- desc * roachpb.RangeDescriptor
43
- subsume []destroyReplicaInfo
42
+ desc * roachpb.RangeDescriptor // corresponds to the range descriptor in the snapshot
43
+ origDesc * roachpb.RangeDescriptor // pre-snapshot range descriptor
44
+ // NB: subsume, if set, must be in sorted (by destroyReplicaInfo.desc start
45
+ // key) order.
46
+ subsume []destroyReplicaInfo
44
47
45
48
// cleared contains the spans that this snapshot application clears before
46
49
// writing new state on top.
@@ -54,7 +57,12 @@ func (s *snapWriteBuilder) prepareSnapApply(ctx context.Context) error {
54
57
return err
55
58
}
56
59
_ = applySnapshotTODO // 1.2 + 2.1 + 2.2 + 2.3 (diff) + 3.2
57
- return s .clearSubsumedReplicaDiskData (ctx )
60
+ if err := s .clearSubsumedReplicaDiskData (ctx ); err != nil {
61
+ return err
62
+ }
63
+
64
+ _ = applySnapshotTODO // 2.3 (split)
65
+ return s .clearResidualDataOnNarrowSnapshot (ctx )
58
66
}
59
67
60
68
// rewriteRaftState clears and rewrites the unreplicated rangeID-local key space
@@ -104,25 +112,50 @@ func (s *snapWriteBuilder) rewriteRaftState(ctx context.Context, w storage.Write
104
112
// the Reader reflects the latest I/O each of the subsumed replicas has done
105
113
// (i.e. Reader was instantiated after all raftMu were acquired).
106
114
//
107
- // NB: does nothing if subsumedDescs is empty.
115
+ // NB: does nothing if s. subsumedDescs is empty.
108
116
func (s * snapWriteBuilder ) clearSubsumedReplicaDiskData (ctx context.Context ) error {
117
+ if len (s .subsume ) == 0 {
118
+ return nil // no subsumed replicas to speak of; early return
119
+ }
120
+ // NB: The snapshot must never subsume a replica that extends the range of the
121
+ // replica to the left. This is because splits and merges (the only operation
122
+ // that change the key bounds) always leave the start key intact. Extending to
123
+ // the left implies that either we merged "to the left" (we don't), or that
124
+ // we're applying a snapshot for another range (we don't do that either).
125
+ // Something is severely wrong for this to happen, so perform a sanity check.
126
+ if s .subsume [0 ].desc .StartKey .Compare (s .desc .StartKey ) < 0 { // subsumedDescs are sorted by StartKey
127
+ log .Dev .Fatalf (ctx ,
128
+ "subsuming replica to our left; subsumed desc start key: %v; snapshot desc start key %v" ,
129
+ s .subsume [0 ].desc .StartKey , s .desc .StartKey ,
130
+ )
131
+ }
132
+
133
+ // In the common case, the subsumed replicas' end key does not extend beyond
134
+ // the snapshot end key (sn <= b):
135
+ //
136
+ // subsumed: [a---s1---...---sn)
137
+ // snapshot: [a---------------b)
138
+ // or snapshot: [a-------------------b)
139
+ //
140
+ // In this case, we do not need to clear the range-local, lock table, and user
141
+ // keys owned by subsumed replicas, since clearing [a, b) will cover it. We
142
+ // only need to clear the per-replica RangeID-local key spans here.
143
+ //
144
+ // This leaves the only other case, where the subsumed replicas extend past
145
+ // the snapshot's end key (s[n-1] < b < sn):
146
+ //
147
+ // subsumed: [a---s1---...---s[n-1]---sn)
148
+ // snapshot: [a--------------------b)
149
+ //
150
+ // This is only possible if we're not only learning about merges through the
151
+ // snapshot, but also a split -- that's the only way the bounds of the
152
+ // snapshot can be narrower than the bounds of all the subsumed replicas. In
153
+ // this case, we do need to clear range-local, lock table, and user keys in
154
+ // the span [b, sn). We do this in
155
+ // clearResidualDataOnNarrowSnapshot, not here.
156
+
109
157
// TODO(sep-raft-log): need different readers for raft and state engine.
110
158
reader := storage .Reader (s .todoEng )
111
-
112
- // NB: we don't clear RangeID local key spans here. That happens
113
- // via the call to DestroyReplica.
114
- getKeySpans := func (d * roachpb.RangeDescriptor ) []roachpb.Span {
115
- return rditer .Select (d .RangeID , rditer.SelectOpts {
116
- Ranged : rditer.SelectRangedOptions {
117
- RSpan : d .RSpan (),
118
- SystemKeys : true ,
119
- UserKeys : true ,
120
- LockTable : true ,
121
- },
122
- })
123
- }
124
- keySpans := getKeySpans (s .desc )
125
- totalKeySpans := append ([]roachpb.Span (nil ), keySpans ... )
126
159
for _ , sub := range s .subsume {
127
160
// We have to create an SST for the subsumed replica's range-id local keys.
128
161
if err := s .writeSST (ctx , func (ctx context.Context , w storage.Writer ) error {
@@ -138,83 +171,89 @@ func (s *snapWriteBuilder) clearSubsumedReplicaDiskData(ctx context.Context) err
138
171
ReplicatedByRangeID : opts .ClearReplicatedByRangeID ,
139
172
UnreplicatedByRangeID : opts .ClearUnreplicatedByRangeID ,
140
173
})... )
174
+ // NB: Actually clear RangeID local key spans.
141
175
return kvstorage .DestroyReplica (ctx , sub .id , reader , w , mergedTombstoneReplicaID , opts )
142
176
}); err != nil {
143
177
return err
144
178
}
145
-
146
- srKeySpans := getKeySpans (sub .desc )
147
- // Compute the total key space covered by the current replica and all
148
- // subsumed replicas.
149
- for i := range srKeySpans {
150
- if srKeySpans [i ].Key .Compare (totalKeySpans [i ].Key ) < 0 {
151
- totalKeySpans [i ].Key = srKeySpans [i ].Key
152
- }
153
- if srKeySpans [i ].EndKey .Compare (totalKeySpans [i ].EndKey ) > 0 {
154
- totalKeySpans [i ].EndKey = srKeySpans [i ].EndKey
155
- }
156
- }
157
179
}
158
180
159
- // We might have to create SSTs for the range local keys, lock table keys,
160
- // and user keys depending on if the subsumed replicas are not fully
161
- // contained by the replica in our snapshot. The following is an example to
162
- // this case happening.
163
- //
164
- // a b c d
165
- // |---1---|-------2-------| S1
166
- // |---1-------------------| S2
167
- // |---1-----------|---3---| S3
168
- //
169
- // Since the merge is the first operation to happen, a follower could be down
170
- // before it completes. It is reasonable for a snapshot for r1 from S3 to
171
- // subsume both r1 and r2 in S1.
172
- for i := range keySpans {
173
- // The snapshot must never subsume a replica that extends the range of the
174
- // replica to the left. This is because splits and merges (the only
175
- // operation that change the key bounds) always leave the start key intact.
176
- // Extending to the left implies that either we merged "to the left" (we
177
- // don't), or that we're applying a snapshot for another range (we don't do
178
- // that either). Something is severely wrong for this to happen.
179
- if totalKeySpans [i ].Key .Compare (keySpans [i ].Key ) < 0 {
180
- log .Dev .Fatalf (ctx , "subsuming replica to our left; key span: %v; total key span %v" ,
181
- keySpans [i ], totalKeySpans [i ])
182
- }
181
+ return nil
182
+ }
183
183
184
- // In the comments below, s1, ..., sn are the end keys for the subsumed
185
- // replicas (for the current keySpan).
186
- // Note that if there aren't any subsumed replicas (the common case), the
187
- // next comparison is always zero and this loop is a no-op.
188
-
189
- if totalKeySpans [i ].EndKey .Compare (keySpans [i ].EndKey ) <= 0 {
190
- // The subsumed replicas are fully contained in the snapshot:
191
- //
192
- // [a---s1---...---sn)
193
- // [a---------------------b)
194
- //
195
- // We don't need to clear additional keyspace here, since clearing `[a,b)`
196
- // will also clear all subsumed replicas.
197
- continue
198
- }
184
+ // clearResidualDataOnNarrowSnapshot clears the overlapping replicas' data not
185
+ // covered by the snapshot. Specifically, the data between the snapshot's end
186
+ // key and that of the keyspace covered by s.origDesc + s.subsume.descs, if the
187
+ // former is lower (i.e. the snapshot is "narrower"). Note that the start keys
188
+ // of the snapshot and s.origDesc[^1] match, so the residual data may only exist
189
+ // between the end keys. clearResidualDataOnNarrowSnapshot is a no-op if there
190
+ // is no narrowing business to speak of.
191
+ //
192
+ // Visually, the picture looks as follows:
193
+ //
194
+ // The simplest case is when the snapshot isn't subsuming any replicas.
195
+ // original descriptor: [a-----------------------------c)
196
+ // snapshot descriptor: [a---------------------b)
197
+ // cleared: [b, c)
198
+ //
199
+ // In the more general case[^2]:
200
+ //
201
+ // store descriptors: [a----------------s1---...---sn)
202
+ // snapshot descriptor: [a---------------------b)
203
+ // cleared: [b, sn)
204
+ //
205
+ // Practically speaking, the simple case above corresponds to a replica learning
206
+ // about a split through a snapshot. The more general case corresponds to a
207
+ // replica learning about a series of merges and at least one split through the
208
+ // snapshot.
209
+ //
210
+ // [1] Assuming s.origDesc is initialized. If it isn't, and we're applying an
211
+ // initial snapshot, the snapshot may still be narrower than the end key of the
212
+ // right-most subsumed replica.
213
+ //
214
+ // [2] In the diagram , S1...Sn correspond to subsumed replicas with end keys
215
+ // s1...sn respectively. These are all replicas on the store that overlap with
216
+ // the snapshot descriptor, covering the range [a,b), and the right-most
217
+ // descriptor is that of replica Sn.
218
+ func (s * snapWriteBuilder ) clearResidualDataOnNarrowSnapshot (ctx context.Context ) error {
219
+ if ! s .origDesc .IsInitialized () && len (s .subsume ) == 0 {
220
+ // Early return in the case where we're ingesting an initial snapshot and
221
+ // there are no subsumed replicas that the snapshot could be making
222
+ // narrower.
223
+ return nil
224
+ }
199
225
200
- // The subsumed replicas extend past the snapshot:
201
- //
202
- // [a----------------s1---...---sn)
203
- // [a---------------------b)
204
- //
205
- // We need to additionally clear [b,sn).
226
+ rightMostDesc := s . origDesc
227
+ if len ( s . subsume ) != 0 {
228
+ // NB: s.subsume are non-overlapping and sorted by start key. Pick the last
229
+ // one to determine whether the snapshot is narrowing the keyspace or not.
230
+ rightMostDesc = s . subsume [ len ( s . subsume ) - 1 ]. desc
231
+ }
206
232
233
+ if rightMostDesc .EndKey .Compare (s .desc .EndKey ) <= 0 {
234
+ return nil // we aren't narrowing anything; no-op
235
+ }
236
+
237
+ // TODO(sep-raft-log): read from the state machine engine here.
238
+ reader := storage .Reader (s .todoEng )
239
+ for _ , span := range rditer .Select (0 , rditer.SelectOpts {
240
+ Ranged : rditer.SelectRangedOptions {RSpan : roachpb.RSpan {
241
+ Key : s .desc .EndKey , EndKey : rightMostDesc .EndKey ,
242
+ },
243
+ SystemKeys : true ,
244
+ LockTable : true ,
245
+ UserKeys : true ,
246
+ },
247
+ }) {
207
248
if err := s .writeSST (ctx , func (ctx context.Context , w storage.Writer ) error {
208
249
return storage .ClearRangeWithHeuristic (
209
- ctx , reader , w ,
210
- keySpans [i ].EndKey , totalKeySpans [i ].EndKey ,
211
- kvstorage .ClearRangeThresholdPointKeys ,
250
+ ctx , reader , w , span .Key , span .EndKey , kvstorage .ClearRangeThresholdPointKeys ,
212
251
)
213
252
}); err != nil {
214
253
return err
215
254
}
216
- s .cleared = append (s .cleared ,
217
- roachpb.Span {Key : keySpans [i ].EndKey , EndKey : totalKeySpans [i ].EndKey })
255
+ s .cleared = append (s .cleared , span )
218
256
}
257
+
219
258
return nil
220
259
}
0 commit comments