@@ -131,7 +131,10 @@ void PassiveDurabilityMonitor::notifySnapshotEndReceived(uint64_t snapEnd) {
131131 int64_t hps{0 };
132132 {
133133 auto s = state.wlock ();
134- s->receivedSnapshotEndSeqnos .push (snapEnd);
134+ s->receivedSnapshotEnds .push ({int64_t (snapEnd),
135+ vb.isReceivingDiskSnapshot ()
136+ ? CheckpointType::Disk
137+ : CheckpointType::Memory});
135138 // Maybe the new tracked Prepare is already satisfied and could be
136139 // ack'ed back to the Active.
137140 prevHps = s->highPreparedSeqno .lastWriteSeqno ;
@@ -307,6 +310,7 @@ void PassiveDurabilityMonitor::State::updateHighPreparedSeqno() {
307310 // at PDM under the following constraints:
308311 //
309312 // (1) Nothing is ack'ed before the complete snapshot is received
313+ // (I.e., do nothing if receivedSnapshotEnds is empty)
310314 //
311315 // (2) Majority and MajorityAndPersistOnMaster Prepares (which don't need to
312316 // be persisted for being locally satisfied) may be satisfied as soon as
@@ -319,6 +323,12 @@ void PassiveDurabilityMonitor::State::updateHighPreparedSeqno() {
319323 // (4) The durability-fence can move (ie, PersistToMajority Prepares are
320324 // locally-satisfied) only when the complete snapshot is persisted.
321325 //
326+ // (5) Once a disk snapshot is fully persisted, the HPS is advanced to the
327+ // snapshot end - even if no prepares were seen during the snapshot
328+ // or if trackedWrites is empty. This accounts for deduping; there may
329+ // have been prepares we have not seen, but they are definitely
330+ // satisfied (they are persisted) and should be acked.
331+ //
322332 // This function implements all the logic necessary for moving the HPS by
323333 // enforcing the rules above. The function is called:
324334 //
@@ -334,16 +344,6 @@ void PassiveDurabilityMonitor::State::updateHighPreparedSeqno() {
334344 // durability-fence. As already mentioned, we can move the
335345 // durability-fence only if the complete snapshot is persisted.
336346
337- if (trackedWrites.empty ()) {
338- return ;
339- }
340-
341- if (receivedSnapshotEndSeqnos.empty ()) {
342- // We have not received a full snapshot, we cannot advance the hps at
343- // all
344- return ;
345- }
346-
347347 const auto prevHPS = highPreparedSeqno.lastWriteSeqno ;
348348
349349 // Helper to keep conditions short and meaningful
@@ -355,49 +355,87 @@ void PassiveDurabilityMonitor::State::updateHighPreparedSeqno() {
355355 snapshotEndSeqno;
356356 };
357357
358- while (!receivedSnapshotEndSeqnos.empty () && !trackedWrites.empty ()) {
359- uint64_t snapshotEndSeqno = receivedSnapshotEndSeqnos.front ();
360-
361- // ** If pdm.vb.getPersistenceSeqno() >= snapshotEndSeqno
362- // we have received and persisted an entire snapshot
363- // All prepares from this snapshot are satisfied and the state
364- // is consistent at snap end. The HPS can advance over Prepares of
365- // PersistToMajority or lower (i.e., everything currently)
366-
367- // ** if pdm.vb.getPersistenceSeqno() < snapshotEndSeqno
368- // we have received but NOT persisted an entire snapshot
369- // We *may* be able to advance the HPS part way
370- // into this snapshot - The HPS can be advanced over all Prepares of
371- // MajorityAndPersistOnMaster level or lower, to the last Prepare
372- // immediately preceding an *unpersisted* Prepare with Level ==
373- // PersistToMajority. We cannot move the HPS past this Prepare until
374- // it *is* persisted.
375-
376- const cb::durability::Level maxLevelCanAdvanceOver =
377- (pdm.vb .getPersistenceSeqno () >= snapshotEndSeqno)
378- ? cb::durability::Level::PersistToMajority
379- : cb::durability::Level::MajorityAndPersistOnMaster;
380-
381- for (auto next = getIteratorNext (highPreparedSeqno.it );
382- inSnapshot (snapshotEndSeqno, next) &&
383- next->getDurabilityReqs ().getLevel () <= maxLevelCanAdvanceOver;
384- next = getIteratorNext (highPreparedSeqno.it )) {
385- // Note: Update last-write-seqno first to enforce monotonicity and
386- // avoid any state-change if monotonicity checks fail
387- highPreparedSeqno.lastWriteSeqno = next->getBySeqno ();
388- highPreparedSeqno.it = next;
358+ while (!receivedSnapshotEnds.empty ()) {
359+ const auto snapshotEnd = receivedSnapshotEnds.front ();
360+
361+ const bool snapshotFullyPersisted =
362+ static_cast <int64_t >(pdm.vb .getPersistenceSeqno ()) >=
363+ snapshotEnd.seqno ;
364+
365+ const bool isDiskSnapshot = snapshotEnd.type == CheckpointType::Disk;
366+
367+ using namespace cb ::durability;
368+
369+ Level maxLevelCanAdvanceOver{};
370+
371+ if (snapshotFullyPersisted) {
372+ // we have received and persisted an entire snapshot
373+ // All prepares from this snapshot are satisfied and the state
374+ // is consistent at snap end. The HPS can advance over Prepares of
375+ // PersistToMajority or lower (i.e., everything currently)
376+ maxLevelCanAdvanceOver = Level::PersistToMajority;
377+ } else if (!isDiskSnapshot) {
378+ // we have received but NOT persisted an entire snapshot
379+ // We *may* be able to advance the HPS part way
380+ // into this snapshot - The HPS can be advanced over all Prepares of
381+ // MajorityAndPersistOnMaster level or lower, to the last Prepare
382+ // immediately preceding an *unpersisted* Prepare with Level ==
383+ // PersistToMajority. We cannot move the HPS past this Prepare until
384+ // it *is* persisted.
385+ maxLevelCanAdvanceOver = Level::MajorityAndPersistOnMaster;
386+ } else {
387+ // We have received but NOT persisted an entire *DISK* snapshot.
388+ // We cannot ack anything until the entire snapshot has been
389+ // persisted, because PersistToMajority-level Prepares may have been
390+ // deduped by lower-level Prepares.
391+ // Therefore, the HPS cannot advance over *any* Prepares.
392+ maxLevelCanAdvanceOver = Level::None;
389393 }
390- // Check if we finished an entire snapshot, and might be able to
391- // continue checking the next one.
392- if (inSnapshot (snapshotEndSeqno,
393- getIteratorNext (highPreparedSeqno.it ))) {
394- // we stopped advancing the HPS before the end of a snapshot
395- // because we reached a PersistToMajority Prepare
396- // HPS now points to the last Prepare before any
397- // PersistToMajority
398- break ;
394+
395+ // Advance the HPS, respecting maxLevelCanAdvanceOver
396+ if (!trackedWrites.empty ()) {
397+ for (auto next = getIteratorNext (highPreparedSeqno.it );
398+ inSnapshot (snapshotEnd.seqno , next) &&
399+ next->getDurabilityReqs ().getLevel () <= maxLevelCanAdvanceOver;
400+ next = getIteratorNext (highPreparedSeqno.it )) {
401+ // Note: Update last-write-seqno first to enforce monotonicity
402+ // and avoid any state-change if monotonicity checks fail
403+ highPreparedSeqno.lastWriteSeqno = next->getBySeqno ();
404+ highPreparedSeqno.it = next;
399405 }
400- receivedSnapshotEndSeqnos.pop ();
406+ }
407+
408+ if (isDiskSnapshot && snapshotFullyPersisted) {
409+ // Special case - prepares in disk snapshots may have been
410+ // deduplicated.
411+ // PRE(persistMajority), CMT, PRE(), ABORT, SET
412+ // may, after the abort has been purged, be sent as:
413+ // SET
414+ // We would have no prepare for this op, but we still need to
415+ // seqno ack something. To resolve this, advance the HPS seqno to
416+ // the snapshotEndSeqno. There may not be an associated prepare.
417+ // NB: lastWriteSeqno is NOT guaranteed to match
418+ // highPreparedSeqno.it->getBySeqno()
419+ // because of this case
420+ highPreparedSeqno.lastWriteSeqno = snapshotEnd.seqno ;
421+ }
422+
423+ // Check if we could have acked everything within the snapshot and
424+ // might be able to continue checking the next one.
425+ if ((isDiskSnapshot && !snapshotFullyPersisted) ||
426+ inSnapshot (snapshotEnd.seqno ,
427+ getIteratorNext (highPreparedSeqno.it ))) {
428+ // Either we have not fully persisted a disk snapshot and
429+ // the HPS is left <= the start of this snapshot
430+ // OR
431+ // we stopped advancing the HPS before the end of a memory
432+ // snapshot because we reached a PersistToMajority Prepare
433+ // HPS now points to the last Prepare before any
434+ // PersistToMajority
435+ break ;
436+ }
437+
438+ receivedSnapshotEnds.pop ();
401439 }
402440
403441 // We have now acked all the complete, persisted snapshots we received,
0 commit comments