@@ -234,12 +234,83 @@ type EventuallyFileOnlySnapshot struct {
234234}
235235
236236func (d * DB ) makeEventuallyFileOnlySnapshot (keyRanges []KeyRange ) * EventuallyFileOnlySnapshot {
237- isFileOnly := true
238-
237+ var snapshotSeqNum , exciseSeqNumToWaitFor base.SeqNum
238+ // tryGetSnapshotSeqNum attempts to initialize a snapshotSeqNum. The
239+ // snapshotSeqNum should be ignored if exciseSeqNumToWaitFor is > 0. In this
240+ // case the caller should wait for exciseSeqNumToWaitFor to become visible and call
241+ // this function again.
242+ tryGetSnapshotSeqNum := func () {
243+ // In AllocateSeqNum, for an ingest-and-excise, some seqnums are
244+ // allocated, say starting at N. Then AllocateSeqNum calls prepare, which
245+ // acquires DB.mu and grabs all the existing EFOS that have not
246+ // transitioned to FOS (which are in d.mu.snapshots.snapshotList) and
247+ // overlap with the excise. After releasing DB.mu, in apply, the
248+ // ingest-and-excise waits for all the previous EFOSs to transition to FOS
249+ // (as a side effect of waiting for a memtable flush to complete). Note,
250+ // the visible seqnum is still <= N-1, and will not be bumped to N until
251+ // the ingest-and-excise completes. Any EFOS that gets created after
252+ // prepare looked at the existing EFOSs, but before the ingest-and-excise
253+ // completes, will be created with EventuallyFileOnlySnapshot.seqNum=N-1,
254+ // and may not transition to FOS until after the excise, which is
255+ // incorrect (the version at the transition to FOS has already experienced
256+ // the excise). To avoid this incorrectness, tryGetSnapshotSeqNum is
257+ // called in a loop, until there are no such ongoing excises. The loop can
258+ // starve EFOS creation if the keyRanges keep overlapping with new ongoing
259+ // ingest-and-excises. So EFOS should only be used when ingest-and-excises
260+ // are rare over the keyRanges.
261+ //
262+ // Improving this starvation behavior would require this snapshot to
263+ // register itself in a way that blocks future ingest-and-excises. The
264+ // blocking would need to be done before allocating the seqnum, since
265+ // blocking after can delay (latency sensitive) writes that get a seqnum
266+ // later than the ingest-and-excise. Then the problem shifts to not
267+ // starving the ingest-and-excise if EFOS creation is frequent. We observe
268+ // that in the CockroachDB use case (a) ingest-and-excises are not very
269+ // frequent, so EFOS starvation is unlikely, (b) EFOS creation is not
270+ // latency sensitive. Hence, we ignore this starvation problem.
271+ snapshotSeqNum = d .mu .versions .visibleSeqNum .Load ()
272+ // Check if any of the keyRanges overlap with an ongoing
273+ // ingest-and-excise.
274+ //
275+ // NB: The zero seqnum cannot occur in practice, since base.SeqNumStart >
276+ // 0.
277+ exciseSeqNumToWaitFor = base .SeqNum (0 )
278+ for seqNum , span := range d .mu .snapshots .ongoingExcises {
279+ if base .Visible (seqNum , snapshotSeqNum , base .SeqNumMax ) {
280+ // Skip this excise, since this is visible to the snapshot.
281+ continue
282+ }
283+ // INVARIANT: seqNum >= snapshotSeqNum.
284+ if seqNum <= exciseSeqNumToWaitFor {
285+ // We are already waiting for a later excise.
286+ continue
287+ }
288+ for i := range keyRanges {
289+ if keyRanges [i ].OverlapsKeyRange (d .cmp , span ) {
290+ exciseSeqNumToWaitFor = seqNum
291+ break
292+ }
293+ }
294+ }
295+ }
239296 d .mu .Lock ()
240297 defer d .mu .Unlock ()
241- seqNum := d .mu .versions .visibleSeqNum .Load ()
242- // Check if any of the keyRanges overlap with a memtable.
298+ for {
299+ // This call updates snapshotSeqNum and exciseSeqNumToWaitFor.
300+ tryGetSnapshotSeqNum ()
301+ if exciseSeqNumToWaitFor == 0 {
302+ break
303+ }
304+ for ! base .Visible (exciseSeqNumToWaitFor , d .mu .versions .visibleSeqNum .Load (), base .SeqNumMax ) {
305+ d .mu .snapshots .ongoingExcisesRemovedCond .Wait ()
306+ }
307+ }
308+ isFileOnly := true
309+ // Check if any of the keyRanges overlap with a memtable. It is possible
310+ // (with very low probability) that all these memtables have seqnums later
311+ // than snapshotSeqNum, and the overlap is a false positive. This is harmless, and
312+ // this EFOS will transition to FOS, when the false positive memtable is
313+ // flushed.
243314 for i := range d .mu .mem .queue {
244315 d .mu .mem .queue [i ].computePossibleOverlaps (func (bounded ) shouldContinue {
245316 isFileOnly = false
@@ -248,7 +319,7 @@ func (d *DB) makeEventuallyFileOnlySnapshot(keyRanges []KeyRange) *EventuallyFil
248319 }
249320 es := & EventuallyFileOnlySnapshot {
250321 db : d ,
251- seqNum : seqNum ,
322+ seqNum : snapshotSeqNum ,
252323 protectedRanges : keyRanges ,
253324 closed : make (chan struct {}),
254325 }
@@ -258,7 +329,7 @@ func (d *DB) makeEventuallyFileOnlySnapshot(keyRanges []KeyRange) *EventuallyFil
258329 } else {
259330 s := & Snapshot {
260331 db : d ,
261- seqNum : seqNum ,
332+ seqNum : snapshotSeqNum ,
262333 }
263334 s .efos = es
264335 es .mu .snap = s
0 commit comments