MB-35607: Correct HCS flushing logic

BenHuddleston · BenHuddleston · commit 9a5827d437ab · 2019-08-20T16:11:38.000Z
Currently the HCS flushing logic is flawed in two ways: 1) If we disconnect and reconnect a stream then flush a disk snapshot the HCS may be weakly monotonic (if the active node moves on and streams to the replica from disk without doing any other SyncWrites). 2) getItemsForCursor is returning a HCS value before the checkpoint end. This is incorrect as we can flush partial disk snapshots and a subsequent warmup may bring the node back in a bad state (missing prepares). This should work fine if the rest of the disk snapshot is streamed, but if this node is promoted beforehand and streams from memory to a replica then the replica will start firing assertions. This is a dataloss situation already, but undesirable nonetheless. Change-Id: I700e25d248968ce01abd68236a61fe3a960b11a5 Reviewed-on: http://review.couchbase.org/113552 Reviewed-by: Dave Rigby <daver@couchbase.com> Tested-by: Build Bot <build@couchbase.com>
diff --git a/engines/ep/src/checkpoint.cc b/engines/ep/src/checkpoint.cc
@@ -15,6 +15,7 @@
  *   limitations under the License.
  */
 
+#include <boost/optional/optional_io.hpp>
 #include <gsl.h>
 #include <platform/checked_snprintf.h>
 #include <string>
@@ -385,8 +386,8 @@ std::ostream& operator <<(std::ostream& os, const Checkpoint& c) {
        << " snap:{" << c.getSnapshotStartSeqno() << ","
        << c.getSnapshotEndSeqno() << "}"
        << " state:" << to_string(c.getState())
-       << " type:" << to_string(c.getCheckpointType()) << " items:["
-       << std::endl;
+       << " type:" << to_string(c.getCheckpointType())
+       << " hcs:" << c.getHighCompletedSeqno() << " items:[" << std::endl;
     for (const auto& e : c.toWrite) {
         os << "\t{" << e->getBySeqno() << "," << to_string(e->getOperation());
         e->isDeleted() ? os << "[d]," : os << ",";
diff --git a/engines/ep/src/checkpoint.h b/engines/ep/src/checkpoint.h
@@ -525,7 +525,7 @@ class Checkpoint {
         highCompletedSeqno = seqno;
     }
 
-    boost::optional<uint64_t> getHighCompletedSeqno() {
+    boost::optional<uint64_t> getHighCompletedSeqno() const {
         return highCompletedSeqno;
     }
 
diff --git a/engines/ep/src/checkpoint_manager.cc b/engines/ep/src/checkpoint_manager.cc
@@ -855,7 +855,6 @@ CheckpointManager::ItemsForCursor CheckpointManager::getItemsForCursor(
     // limit.
     ItemsForCursor result((*cursor.currentCheckpoint)->getSnapshotStartSeqno(),
                           (*cursor.currentCheckpoint)->getSnapshotEndSeqno(),
-                          (*cursor.currentCheckpoint)->getHighCompletedSeqno(),
                           (*cursor.currentCheckpoint)->getCheckpointType());
 
     size_t itemCount = 0;
@@ -875,18 +874,18 @@ CheckpointManager::ItemsForCursor CheckpointManager::getItemsForCursor(
         itemCount++;
 
         if (qi->getOperation() == queue_op::checkpoint_end) {
+            // Only move the HCS at checkpoint end (don't want to flush a
+            // HCS mid-checkpoint).
+            result.highCompletedSeqno =
+                    (*cursor.currentCheckpoint)->getHighCompletedSeqno();
+
             // Reached the end of a checkpoint; check if we have exceeded
             // our limit.
             if (itemCount >= approxLimit) {
                 // Reached our limit - don't want any more items.
                 result.range.setEnd(
                         (*cursor.currentCheckpoint)->getSnapshotEndSeqno());
 
-                // Only move the HCS at checkpoint end (don't want to flush a
-                // HCS mid-checkpoint).
-                result.highCompletedSeqno =
-                        (*cursor.currentCheckpoint)->getHighCompletedSeqno();
-
                 // However, we *do* want to move the cursor into the next
                 // checkpoint if possible; as that means the checkpoint we just
                 // completed has one less cursor in it (and could potentially be
diff --git a/engines/ep/src/checkpoint_manager.h b/engines/ep/src/checkpoint_manager.h
@@ -59,18 +59,19 @@ class CheckpointManager {
     struct ItemsForCursor {
         ItemsForCursor(uint64_t start,
                        uint64_t end,
-                       boost::optional<uint64_t> highCompletedSeqno = {},
-                       CheckpointType checkpointType = CheckpointType::Memory)
+                       CheckpointType checkpointType = CheckpointType::Memory,
+                       boost::optional<uint64_t> highCompletedSeqno = {})
             : range(start, end),
-              highCompletedSeqno(highCompletedSeqno),
-              checkpointType(checkpointType) {
+              checkpointType(checkpointType),
+              highCompletedSeqno(highCompletedSeqno) {
         }
         snapshot_range_t range;
         bool moreAvailable = {false};
+        CheckpointType checkpointType = CheckpointType::Memory;
 
-        // HCS that should be flushed
+        // HCS that should be flushed. Currently should only be set for Disk
+        // Checkpoint runs.
         boost::optional<uint64_t> highCompletedSeqno = {};
-        CheckpointType checkpointType = CheckpointType::Memory;
     };
 
     /// Return type of expelUnreferencedCheckpointItems()
diff --git a/engines/ep/src/ep_bucket.cc b/engines/ep/src/ep_bucket.cc
@@ -417,7 +417,16 @@ std::pair<bool, size_t> EPBucket::flushVBucket(Vbid vbid) {
             // any other item in this flush batch. This is required because we
             // send mutations instead of commits and would not otherwise
             // update the HCS on disk.
-            boost::optional<uint64_t> hcs = toFlush.highCompletedSeqno;
+            boost::optional<uint64_t> hcs =
+                    boost::make_optional(false, uint64_t());
+
+            // HCS may be weakly monotonic when received via a disk snapshot so
+            // we special case this for the disk snapshot instead of relaxing
+            // the general constraint.
+            if (toFlush.highCompletedSeqno &&
+                *toFlush.highCompletedSeqno != vbstate.highCompletedSeqno) {
+                hcs = toFlush.highCompletedSeqno;
+            }
             // HPS is optional because we have to update it on disk only if a
             // prepare is found in the flush-batch
             boost::optional<uint64_t> hps;
diff --git a/engines/ep/tests/module_tests/dcp_durability_stream_test.cc b/engines/ep/tests/module_tests/dcp_durability_stream_test.cc
@@ -985,9 +985,25 @@ TEST_P(DurabilityPassiveStreamPersistentTest,
     EXPECT_EQ(1, ack.getPreparedSeqno());
 }
 
-TEST_P(DurabilityPassiveStreamPersistentTest, DiskSnapshotHCSPersisted) {
+void DurabilityPassiveStreamPersistentTest::testDiskSnapshotHCSPersisted() {
     testReceiveMutationOrDeletionInsteadOfCommitWhenStreamingFromDisk(
             DocumentState::Alive);
+
+    // We won't flush a HCS from a snapshot marker until we process the entire
+    // checkpoint, this is because we need to be pessimistic with flushing the
+    // HCS in a disk checkpoint for our warmup optimization. If we flush a HCS
+    // that is too high whilst receiving a disk snapshot we may end up in some
+    // inconsistent state due to out of order commit.
+    SnapshotMarker marker(
+            0 /*opaque*/,
+            vbid,
+            5 /*snapStart*/,
+            6 /*snapEnd*/,
+            dcp_marker_flag_t::MARKER_FLAG_MEMORY | MARKER_FLAG_CHK,
+            {} /*HCS*/,
+            {} /*streamId*/);
+    stream->processMarker(&marker);
+
     flushVBucketToDiskIfPersistent(vbid, 2);
     {
         auto vb = store->getVBucket(vbid);
@@ -1009,9 +1025,31 @@ TEST_P(DurabilityPassiveStreamPersistentTest, DiskSnapshotHCSPersisted) {
     {
         auto vb = store->getVBucket(vbid);
         EXPECT_EQ(2, vb->getHighCompletedSeqno());
+        EXPECT_EQ(4, vb->getHighSeqno());
     }
 }
 
+TEST_P(DurabilityPassiveStreamPersistentTest, DiskSnapshotHCSPersisted) {
+    testDiskSnapshotHCSPersisted();
+}
+
+TEST_P(DurabilityPassiveStreamPersistentTest,
+       DiskSnapshotHCSIgnoredIfWeaklyMonotonic) {
+    testDiskSnapshotHCSPersisted();
+    SnapshotMarker marker(0 /*opaque*/,
+                          vbid,
+                          6 /*snapStart*/,
+                          7 /*snapEnd*/,
+                          dcp_marker_flag_t::MARKER_FLAG_DISK | MARKER_FLAG_CHK,
+                          2 /*HCS*/,
+                          {} /*streamId*/);
+    stream->processMarker(&marker);
+
+    // We don't flush any items but we will run the flusher which will advance
+    // us out of the checkpoint
+    flushVBucketToDiskIfPersistent(vbid, 0);
+}
+
 TEST_P(DurabilityPassiveStreamTest,
        NoSeqnoAckOnStreamAcceptanceIfNotSupported) {
     consumer->disableSyncReplication();
diff --git a/engines/ep/tests/module_tests/dcp_durability_stream_test.h b/engines/ep/tests/module_tests/dcp_durability_stream_test.h
@@ -155,7 +155,14 @@ class DurabilityPassiveStreamTest
  * Single-threaded.
  */
 class DurabilityPassiveStreamPersistentTest
-    : public DurabilityPassiveStreamTest {};
+    : public DurabilityPassiveStreamTest {
+protected:
+    /**
+     * Test that the HCS sent in a disk snapshot is persisted by sending
+     * a disk snapshot containing a mutation instead of a commit.
+     */
+    void testDiskSnapshotHCSPersisted();
+};
 
 /**
  * ActiveStream tests for Durability against ephemeral buckets. Single-threaded.

Original file line number	Diff line number	Diff line change
`@@ -525,7 +525,7 @@ class Checkpoint {`
`525`	`525`	`highCompletedSeqno = seqno;`
`526`	`526`	`}`
`527`	`527`
`528`		`- boost::optional<uint64_t> getHighCompletedSeqno() {`
	`528`	`+ boost::optional<uint64_t> getHighCompletedSeqno() const {`
`529`	`529`	`return highCompletedSeqno;`
`530`	`530`	`}`
`531`	`531`