@@ -19,56 +19,108 @@ import (
1919 "github.com/cockroachdb/errors"
2020)
2121
22- // splitPreApply is called when the raft command is applied. Any
23- // changes to the given ReadWriter will be written atomically with the
24- // split commit.
+// splitPreApplyInput contains input for the RHS replica required for
+// splitPreApply.
+type splitPreApplyInput struct {
+	// destroyed is set to true iff the RHS replica (the one with the matching
+	// ReplicaID from the SplitTrigger used to construct a splitPreApplyInput)
+	// has already been removed from the store.
+	//
+	// If the RHS replica has already been destroyed on the store, then split
+	// application entails "throwing away" the data that would have belonged to
+	// the RHS. Simply put, user data belonging to the RHS needs to be cleared
+	// and any RangeID-local replicated state in the split batch also needs to
+	// be cleared.
+	destroyed bool
+	// rhsDesc is the descriptor for the post-split RHS range.
+	rhsDesc roachpb.RangeDescriptor
+	// initClosedTimestamp is the initial closed timestamp that the RHS replica
+	// inherits from the pre-split range. Set iff destroyed is false.
+	initClosedTimestamp hlc.Timestamp
+}
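+
+// For illustration only, the two stages are expected to compose roughly as
+// follows (the caller plumbing and error handling below are assumptions, not
+// taken from this change):
+//
+//	in, err := validateAndPrepareSplit(ctx, r, split, initClosedTS)
+//	if err != nil {
+//		log.KvExec.Fatalf(ctx, "%v", err)
+//	}
+//	splitPreApply(ctx, stateRW, raftRW, in)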
+
+// validateAndPrepareSplit performs invariant checks on the supplied
+// splitTrigger and, assuming they hold, returns the corresponding input that
+// should be passed to splitPreApply.
 //
 // initClosedTS is the closed timestamp carried by the split command. It will be
-// used to initialize the new RHS range.
-func splitPreApply(
-	ctx context.Context,
-	r *Replica,
-	stateRW kvstorage.StateRW,
-	raftRW kvstorage.Raft,
-	split roachpb.SplitTrigger,
-	initClosedTS *hlc.Timestamp,
-) {
+// used to initialize the closed timestamp of the RHS replica.
+func validateAndPrepareSplit(
+	ctx context.Context, r *Replica, split roachpb.SplitTrigger, initClosedTS *hlc.Timestamp,
+) (splitPreApplyInput, error) {
 	// Sanity check that the store is in the split.
-	//
-	// The exception to that is if the DisableEagerReplicaRemoval testing flag is
-	// enabled.
-	_, hasRightDesc := split.RightDesc.GetReplicaDescriptor(r.StoreID())
+	splitRightReplDesc, hasRightDesc := split.RightDesc.GetReplicaDescriptor(r.StoreID())
 	_, hasLeftDesc := split.LeftDesc.GetReplicaDescriptor(r.StoreID())
 	if !hasRightDesc || !hasLeftDesc {
-		log.KvExec.Fatalf(ctx, "cannot process split on s%s which does not exist in the split: %+v",
+		return splitPreApplyInput{}, errors.AssertionFailedf("cannot process split on s%s which does not exist in the split: %+v",
 			r.StoreID(), split)
 	}
 
-	// Obtain the RHS replica. In the common case, it exists and its ReplicaID
-	// matches the one in the split trigger. It is the uninitialized replica that
-	// has just been created or obtained in Replica.acquireSplitLock, and its
-	// raftMu is locked.
-	//
-	// In the less common case, the ReplicaID is already removed from this Store,
-	// and rightRepl is either nil (though the replica may be created concurrently
-	// after we got this nil), or an uninitialized replica with a higher
-	// ReplicaID. Its raftMu is not locked, so this replica might as well be in
-	// the process of destruction or being replaced with another higher-ReplicaID
-	// uninitialized replica.
-	//
-	// In any case, the RHS, if exists (one or multiple replicas, throughout this
-	// splitPreApply call), is uninitialized. This is due to the Store invariant
-	// that all initialized replicas don't overlap, thus the RHS one can only be
-	// uninitialized while we still own the RHS part of the keyspace.
-	//
-	// Uninitialized replicas don't have replicated state, and only have non-empty
-	// RaftReplicaID and RaftHardState keys in storage (at the time of writing),
-	// or, more generally, only unreplicated keys. As a rule of thumb, all
-	// unreplicated keys belong to the *current* ReplicaID in the store, rather
-	// than the ReplicaID in the split trigger (which can be stale).
+	// Try to obtain the RHS replica. In the common case, it exists and its
+	// ReplicaID matches the one in the split trigger. In the less common case,
+	// the ReplicaID has already been removed from this store, and it may have
+	// been re-added with a higher ReplicaID one or more times. We use this to
+	// inform the destroyed field.
 	rightRepl := r.store.GetReplicaIfExists(split.RightDesc.RangeID)
+	if rightRepl == nil || rightRepl.isNewerThanSplit(&split) {
+		// We're in the rare case where we know that the RHS has been removed or
+		// re-added with a higher replica ID (one or more times).
+		//
+		// NB: the rightRepl == nil condition is flaky, in a sense that the RHS
+		// replica can be created or destroyed concurrently here, one or more
+		// times. This is because the RHS replica is not locked if its ReplicaID
+		// does not match the one in the SplitTrigger. But we only use it for a
+		// best-effort assertion, so this is not critical.
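+		// (For context, isNewerThanSplit reports whether this replica's ID is
+		// higher than the RHS replica ID in the split trigger, i.e. the RHS was
+		// removed and re-added since the split was proposed.)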
+		if rightRepl != nil {
+			// Assert that the rightRepl is not initialized. We're about to clear
+			// out the data of the RHS of the split; we cannot have already
+			// accepted a snapshot to initialize this newer RHS.
+			if rightRepl.IsInitialized() {
+				return splitPreApplyInput{}, errors.AssertionFailedf(
+					"unexpectedly found initialized newer RHS of split: %v", rightRepl.Desc(),
+				)
+			}
+		}
+
+		return splitPreApplyInput{destroyed: true, rhsDesc: split.RightDesc}, nil
+	}
+	// Sanity check the common case -- the RHS replica that exists should match
+	// the ReplicaID in the split trigger. In particular, it shouldn't be older
+	// than the one in the split trigger; we've already checked for the newer
+	// case above.
+	testingAssert(rightRepl.replicaID == splitRightReplDesc.ReplicaID,
+		"expected RHS replica ID to match split trigger replica ID",
+	)
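+	// (testingAssert above is assumed to be a test-build-only assertion that
+	// compiles to a no-op in production builds.)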
 
-	rsl := kvstorage.MakeStateLoader(split.RightDesc.RangeID)
+	// In order to tolerate a nil initClosedTS input, let's forward to
+	// r.GetCurrentClosedTimestamp(). Generally, initClosedTS is not expected to
+	// be nil (and is expected to be in advance of r.GetCurrentClosedTimestamp()
+	// since it's coming hot off a Raft command), but let's not rely on it being
+	// non-nil. Note that r.GetCurrentClosedTimestamp() does not yet incorporate
+	// initClosedTS because the split command has not been applied yet.
+	//
+	// TODO(arul): we should avoid this and have splits always carry a non-nil
+	// initial closed timestamp; see
+	// https://github.com/cockroachdb/cockroach/issues/148972.
+	if initClosedTS == nil {
+		initClosedTS = &hlc.Timestamp{}
+	}
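+	// NB: Forward only ratchets initClosedTS upward -- it sets the timestamp to
+	// the maximum of its current value and the argument.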
+	initClosedTS.Forward(r.GetCurrentClosedTimestamp(ctx))
+
+	return splitPreApplyInput{
+		destroyed:           false,
+		rhsDesc:             split.RightDesc,
+		initClosedTimestamp: *initClosedTS,
+	}, nil
+}
+
+// splitPreApply is called when the raft command is applied. Any
+// changes to the given ReadWriter will be written atomically with the
+// split commit.
+func splitPreApply(
+	ctx context.Context, stateRW kvstorage.StateRW, raftRW kvstorage.Raft, in splitPreApplyInput,
+) {
+	rsl := kvstorage.MakeStateLoader(in.rhsDesc.RangeID)
 	// After PR #149620, the split trigger batch may only contain replicated state
 	// machine keys, and never contains unreplicated / raft keys. One exception:
 	// there can still be historical split proposals that write the initial
@@ -93,40 +145,23 @@ func splitPreApply(
 		log.KvExec.Fatalf(ctx, "cannot clear RaftTruncatedState: %v", err)
 	}
 
-	// Check to see if we know that the RHS has already been removed from this
-	// store at the replica ID implied by the split.
-	if rightRepl == nil || rightRepl.isNewerThanSplit(&split) {
-		// We're in the rare case where we know that the RHS has been removed or
-		// re-added with a higher replica ID (one or more times).
-		//
-		// If rightRepl is not nil, we are *not* holding raftMu.
+	if in.destroyed {
+		// The RHS replica has already been removed from the store. To apply the
+		// split, we must clear the user data the RHS would have inherited from
+		// the LHS due to the split. We also want to clear any RangeID-local
+		// replicated keys in the split batch.
 		//
-		// To apply the split, we need to "throw away" the data that would belong to
-		// the RHS, i.e. we clear the user data the RHS would have inherited from
-		// the LHS due to the split.
-		//
-		// Leave the RangeID-local state intact, since it either belongs to a newer
-		// replica or does not exist. At the time of writing, it can be a non-empty
-		// HardState and RaftReplicaID. It is important to preserve the HardState
-		// because the replica might have already voted at a higher term. In general
-		// this shouldn't happen because we add learners and then promote them only
-		// after they apply a snapshot, but we're going to be extra careful in case
-		// future versions of cockroach somehow promote replicas without ensuring
-		// that a snapshot has been received.
-		//
-		// NB: the rightRepl == nil condition is flaky, in a sense that the RHS
-		// replica can be created concurrently here, one or more times. But we only
-		// use it for a best effort assertion, so this is not critical.
-		if rightRepl != nil {
-			// Assert that the rightRepl is not initialized. We're about to clear out
-			// the data of the RHS of the split; we cannot have already accepted a
-			// snapshot to initialize this newer RHS.
-			if rightRepl.IsInitialized() {
-				log.KvExec.Fatalf(ctx, "unexpectedly found initialized newer RHS of split: %v", rightRepl.Desc())
-			}
-		}
+		// Note that we leave the RangeID-local state intact, since it either
+		// belongs to a newer replica or does not exist. For the former case, when
+		// a newer RHS replica exists on this store, it must be uninitialized
+		// (this was asserted in validateAndPrepareSplit). Uninitialized replicas
+		// do not have any replicated state; however, at the time of writing, they
+		// do have non-empty RaftReplicaID and RaftHardState keys in storage. More
+		// generally, they are allowed to have unreplicated keys. As a rule of
+		// thumb, all unreplicated keys belong to the *current* ReplicaID in the
+		// store, rather than the ReplicaID in the split trigger (which, in this
+		// case, is stale).
 		if err := kvstorage.RemoveStaleRHSFromSplit(
-			ctx, kvstorage.WrapState(stateRW), split.RightDesc.RangeID, split.RightDesc.RSpan(),
+			ctx, kvstorage.WrapState(stateRW), in.rhsDesc.RangeID, in.rhsDesc.RSpan(),
 		); err != nil {
 			log.KvExec.Fatalf(ctx, "failed to clear range data for removed rhs: %v", err)
 		}
@@ -152,18 +187,7 @@ func splitPreApply(
 		log.KvExec.Fatalf(ctx, "%v", err)
 	}
 	// Persist the closed timestamp.
-	//
-	// In order to tolerate a nil initClosedTS input, let's forward to
-	// r.GetCurrentClosedTimestamp(). Generally, initClosedTS is not expected to
-	// be nil (and is expected to be in advance of r.GetCurrentClosedTimestamp()
-	// since it's coming hot off a Raft command), but let's not rely on the
-	// non-nil. Note that r.GetCurrentClosedTimestamp() does not yet incorporate
-	// initClosedTS because the split command has not been applied yet.
-	if initClosedTS == nil {
-		initClosedTS = &hlc.Timestamp{}
-	}
-	initClosedTS.Forward(r.GetCurrentClosedTimestamp(ctx))
-	if err := rsl.SetClosedTimestamp(ctx, stateRW, *initClosedTS); err != nil {
+	if err := rsl.SetClosedTimestamp(ctx, stateRW, in.initClosedTimestamp); err != nil {
 		log.KvExec.Fatalf(ctx, "%s", err)
 	}
 }