
Commit 80fa82e

BenHuddleston authored and daverigby committed
MB-34091: Manually ack seqnos for new secondChain nodes
It is possible for a new node (one that will exist in the new topology) to ack before ns_server gives us a new replication topology. ns_server does this so that we do not block SyncWrites on vBucket streaming to the new node, as this could take a long time and cause the SyncWrites to time out. However, this means that a new node can ack before it exists in our replication topology. Store a map of acks for nodes that we do not know about, and use this map when the replication topology is set to manually ack each new node at its highest acked seqno. This ensures that our SyncWrites do not time out waiting for the secondChain to be satisfied.

Change-Id: I480abf22878b30d321b3ffb4419f61975d33c5eb
Reviewed-on: http://review.couchbase.org/109575
Reviewed-by: Paolo Cocchi <[email protected]>
Reviewed-by: Dave Rigby <[email protected]>
Tested-by: Build Bot <[email protected]>
1 parent 2aead9c commit 80fa82e
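In outline, the fix works like the following minimal, standalone sketch: acks from nodes outside the current topology are queued rather than dropped, then replayed and erased when the new topology is set. SimpleDurabilityMonitor and its members are hypothetical simplified stand-ins, not the committed code (the real implementation is in ActiveDurabilityMonitor, below).

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <string>
#include <unordered_map>
#include <unordered_set>

struct SimpleDurabilityMonitor {
    // Nodes currently present in the replication topology (the chains).
    std::unordered_set<std::string> knownNodes;
    // Highest ack seen from each node that is not (yet) in the topology.
    std::unordered_map<std::string, int64_t> queuedSeqnoAcks;

    void seqnoAck(const std::string& node, int64_t seqno) {
        if (knownNodes.count(node)) {
            processAck(node, seqno);
        } else {
            // Unknown node: remember its highest ack so it can be replayed
            // when the new topology arrives.
            auto& queued = queuedSeqnoAcks[node];
            queued = std::max(queued, seqno);
        }
    }

    void setTopology(std::unordered_set<std::string> nodes) {
        knownNodes = std::move(nodes);
        // Manually ack any queued node that is now part of the topology.
        for (auto it = queuedSeqnoAcks.begin(); it != queuedSeqnoAcks.end();) {
            if (knownNodes.count(it->first)) {
                processAck(it->first, it->second);
                it = queuedSeqnoAcks.erase(it);
            } else {
                ++it;
            }
        }
    }

    void processAck(const std::string& node, int64_t seqno) {
        std::cout << node << " acked seqno " << seqno << "\n";
    }
};

int main() {
    SimpleDurabilityMonitor dm;
    dm.seqnoAck("newReplica", 5); // queued: node not yet in the topology
    dm.setTopology({"newReplica"}); // replayed: prints "newReplica acked seqno 5"
}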

File tree

7 files changed: +379, -5 lines changed


engines/ep/src/dcp/active_stream.cc

Lines changed: 15 additions & 0 deletions
@@ -1123,6 +1123,21 @@ uint32_t ActiveStream::setDead(end_stream_status_t status) {
     if (status != END_STREAM_DISCONNECTED) {
         notifyStreamReady();
     }
+
+    // Remove any unknown acks for the stream. Why here and not on
+    // destruction of the object? We could be replacing an existing
+    // DcpProducer with another. This old ActiveStream may then live on
+    // (owned by a backfill) and clear a seqno ack from a new ActiveStream.
+    if (supportSyncReplication()) {
+        auto vb = engine->getVBucket(vb_);
+        // Take the vb state lock so that we don't change the state of
+        // this vb
+        folly::SharedMutex::ReadHolder vbStateLh(vb->getStateLock());
+        if (vb->getState() == vbucket_state_active) {
+            vb->removeQueuedAckFromDM(name_);
+        }
+    }
+
     return 0;
 }
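The new setDead() logic holds the vBucket state lock shared while checking the state, so the vBucket cannot flip out of active between the check and the removal of the queued ack. A dependency-free sketch of that pattern, using std::shared_mutex in place of folly::SharedMutex; MiniVBucket and VBState are hypothetical stand-ins for the kv_engine types.

#include <iostream>
#include <shared_mutex>
#include <string>

enum class VBState { Active, Replica };

struct MiniVBucket {
    mutable std::shared_mutex stateLock;
    VBState state = VBState::Active;

    void removeQueuedAck(const std::string& node) {
        std::cout << "removing queued ack for " << node << "\n";
    }
};

// Mirrors the shape of ActiveStream::setDead(): hold the state lock shared
// so the state cannot change (e.g. active -> replica) between the check
// and the removal of the queued ack.
void onStreamDead(MiniVBucket& vb, const std::string& streamName) {
    std::shared_lock<std::shared_mutex> guard(vb.stateLock);
    if (vb.state == VBState::Active) {
        vb.removeQueuedAck(streamName);
    }
}

int main() {
    MiniVBucket vb;
    onStreamDead(vb, "test_producer");
}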

engines/ep/src/durability/active_durability_monitor.cc

Lines changed: 59 additions & 5 deletions
@@ -320,6 +320,10 @@ uint8_t ActiveDurabilityMonitor::getSecondChainMajority() const {
     return s->secondChain ? s->secondChain->majority : 0;
 }
 
+void ActiveDurabilityMonitor::removedQueuedAck(const std::string& node) {
+    state.wlock()->queuedSeqnoAcks.erase(node);
+}
+
 ActiveDurabilityMonitor::Container::iterator
 ActiveDurabilityMonitor::State::getNodeNext(const std::string& node) {
     Expects(firstChain.get());
@@ -431,20 +435,23 @@ void ActiveDurabilityMonitor::State::updateNodeAck(const std::string& node,
         firstChainPos.lastAckSeqno = seqno;
     }
 
+    bool secondChainFound = false;
     if (secondChain) {
         auto secondChainItr = secondChain->positions.find(node);
         if (secondChainItr != secondChain->positions.end()) {
+            secondChainFound = true;
             auto& secondChainPos =
                     const_cast<Position&>(secondChainItr->second);
             secondChainPos.lastAckSeqno = seqno;
         }
     }
 
-    // Just drop out of here if we don't find the node. We could be receiving an
-    // ack from a new replica that is not yet in the second chain. We don't want
-    // to make each sync write wait on a vBucket being (almost) fully
-    // transferred during a rebalance so ns_server deal with these by waiting
-    // for seqno persistence on the new replica.
+    if (!firstChainFound && !secondChainFound) {
+        // We didn't find the node in either of our chains, but we still need to
+        // track the ack for this node in case we are about to get a topology
+        // change in which this node will exist.
+        queuedSeqnoAcks[node] = seqno;
+    }
 }
@@ -617,6 +624,9 @@ void ActiveDurabilityMonitor::State::processSeqnoAck(const std::string& node,
                 std::to_string(lastTrackedSeqno) + "\"");
     }
 
+    // We should never ack for the active
+    Expects(firstChain->active != node);
+
     // Note: process up to the ack'ed seqno
     ActiveDurabilityMonitor::Container::iterator next;
     while ((next = getNodeNext(node)) != trackedWrites.end() &&
@@ -748,6 +758,50 @@ void ActiveDurabilityMonitor::State::setReplicationTopology(
     for (auto& write : trackedWrites) {
         write.resetTopology(*firstChain, secondChain.get());
    }
+
+    // Manually ack any nodes that did not previously exist in either chain
+    performQueuedAckForChain(*firstChain);
+
+    if (secondChain) {
+        performQueuedAckForChain(*secondChain);
+    }
+}
+
+void ActiveDurabilityMonitor::State::performQueuedAckForChain(
+        const DurabilityMonitor::ReplicationChain& chain) {
+    for (const auto& node : chain.positions) {
+        auto existingAck = queuedSeqnoAcks.find(node.first);
+        if (existingAck != queuedSeqnoAcks.end()) {
+            Container toCommit;
+            processSeqnoAck(existingAck->first, existingAck->second, toCommit);
+            // ======================= FIRST CHAIN =============================
+            // @TODO MB-34318 this should no longer be true and we will need
+            // to remove the pre-condition check.
+            //
+            // This is a little bit counter-intuitive. We may actually need to
+            // commit something post-topology change, however, because we have
+            // reset the ackCount of all in flight SyncWrites previously we
+            // should never ack here. If we had Replicas=1 then we would have
+            // already committed due to active ack or would require an active
+            // ack (PERSIST levels) to commit. So, if we do commit something as
+            // a result of a topology change it will only be done when we move
+            // the HighPreparedSeqno. The active can never exist in the
+            // queuedSeqnoAcks map so we should also never attempt to ack it
+            // here.
+            // ===================== SECOND CHAIN ==============================
+            // We don't expect any SyncWrite to currently need committing. Why?
+            // We require that a SyncWrite must satisfy both firstChain and
+            // secondChain. The SyncWrite should have already been committed
+            // if the firstChain is satisfied and we are under a vbState lock
+            // which will block seqno acks until this topology change has been
+            // completed.
+            Expects(toCommit.empty());
+
+            // Remove the existingAck, we don't need to track it any further as
+            // it is in a chain.
+            queuedSeqnoAcks.erase(existingAck);
+        }
+    }
 }
 
 void ActiveDurabilityMonitor::State::addSyncWrite(const void* cookie,
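The updateNodeAck() change above is the core of the fix: update whichever chain positions contain the node, and queue the ack only when the node appears in neither chain. A condensed sketch using plain maps in place of the real ReplicationChain/Position types; MiniState and its members are hypothetical stand-ins.

#include <cstdint>
#include <iostream>
#include <map>
#include <optional>
#include <string>
#include <unordered_map>

// node name -> lastAckSeqno
using ChainPositions = std::map<std::string, int64_t>;

struct MiniState {
    ChainPositions firstChain;
    std::optional<ChainPositions> secondChain;
    std::unordered_map<std::string, int64_t> queuedSeqnoAcks;

    void updateNodeAck(const std::string& node, int64_t seqno) {
        bool firstChainFound = false;
        auto first = firstChain.find(node);
        if (first != firstChain.end()) {
            firstChainFound = true;
            first->second = seqno;
        }

        bool secondChainFound = false;
        if (secondChain) {
            auto second = secondChain->find(node);
            if (second != secondChain->end()) {
                secondChainFound = true;
                second->second = seqno;
            }
        }

        if (!firstChainFound && !secondChainFound) {
            // Unknown node: queue the ack for a future topology change
            // instead of silently dropping it.
            queuedSeqnoAcks[node] = seqno;
        }
    }
};

int main() {
    MiniState state;
    state.firstChain = {{"replica1", 0}};
    state.updateNodeAck("replica1", 3); // known: updates the chain position
    state.updateNodeAck("newNode", 5); // unknown: queued for later
    std::cout << "queued acks: " << state.queuedSeqnoAcks.size() << "\n"; // 1
}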

engines/ep/src/durability/active_durability_monitor.h

Lines changed: 24 additions & 0 deletions
@@ -199,6 +199,16 @@ class ActiveDurabilityMonitor : public DurabilityMonitor {
      */
     void checkForCommit();
 
+    /**
+     * We track acks for unknown nodes as they may precede a topology change
+     * that could cause a SyncWrite to timeout. We only receive these acks via
+     * DCP so we can remove any "unknown" ack for a given node when we close the
+     * ActiveStream serving it.
+     *
+     * @param node Node for which we wish to remove the unknown ack
+     */
+    void removedQueuedAck(const std::string& node);
+
 protected:
     void toOStream(std::ostream& os) const override;
@@ -384,6 +394,14 @@ class ActiveDurabilityMonitor : public DurabilityMonitor {
      */
     Container updateHighPreparedSeqno();
 
+    /**
+     * Perform the manual ack (from the map of queuedSeqnoAcks) that is
+     * required at rebalance for the given chain
+     *
+     * @param chain Chain for which we should manually ack nodes
+     */
+    void performQueuedAckForChain(const ReplicationChain& chain);
+
 private:
     /**
      * Advance the current Position (iterator and seqno).
@@ -436,6 +454,12 @@ class ActiveDurabilityMonitor : public DurabilityMonitor {
         std::chrono::milliseconds defaultTimeout = std::chrono::seconds(30);
 
         const ActiveDurabilityMonitor& adm;
+
+        // Map of node to seqno value for seqno acks that we have seen but
+        // do not exist in the current replication topology. They may be
+        // required to manually ack for a new node if we receive an ack before
+        // ns_server sends us a new replication topology.
+        std::unordered_map<std::string, Monotonic<int64_t>> queuedSeqnoAcks;
     };
 
     // The VBucket owning this DurabilityMonitor instance
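Note that queuedSeqnoAcks stores its values as Monotonic<int64_t>, so a stale (lower) seqno ack can never overwrite a newer queued one. A simplified stand-in for that wrapper, assuming a policy that rejects non-increasing writes (the real kv_engine template, from platform, is policy-configurable; this is illustrative only).

#include <cstdint>
#include <stdexcept>

// Simplified stand-in for kv_engine's Monotonic<T>, with a hard-coded
// throwing policy for strictly increasing writes.
template <typename T>
class MonotonicSketch {
public:
    explicit MonotonicSketch(T initial = T{}) : value(initial) {}

    MonotonicSketch& operator=(T newValue) {
        if (newValue <= value) {
            // Reject non-increasing writes so a stale (lower) seqno ack
            // can never overwrite a newer one.
            throw std::logic_error("MonotonicSketch: non-monotonic write");
        }
        value = newValue;
        return *this;
    }

    operator T() const {
        return value;
    }

private:
    T value;
};

int main() {
    MonotonicSketch<int64_t> lastAck; // starts at 0
    lastAck = 5; // fine: strictly increasing
    // lastAck = 3; // would throw: stale ack rejected
}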

engines/ep/src/vbucket.cc

Lines changed: 4 additions & 0 deletions
@@ -3556,3 +3556,7 @@ ENGINE_ERROR_CODE VBucket::checkDurabilityRequirements(const Item& item) {
 
     return ENGINE_SUCCESS;
 }
+
+void VBucket::removeQueuedAckFromDM(const std::string& node) {
+    getActiveDM().removedQueuedAck(node);
+}

engines/ep/src/vbucket.h

Lines changed: 8 additions & 0 deletions
@@ -1640,6 +1640,14 @@ class VBucket : public std::enable_shared_from_this<VBucket> {
      */
     const DurabilityMonitor& getDurabilityMonitor() const;
 
+    /**
+     * Remove any queued acks for the given node from the Durability Monitor.
+     * (should be Active)
+     *
+     * @param node Name of the node for which we wish to remove the ack
+     */
+    void removeQueuedAckFromDM(const std::string& node);
+
     std::queue<queued_item> rejectQueue;
     std::unique_ptr<FailoverTable> failovers;

engines/ep/tests/module_tests/dcp_durability_stream_test.cc

Lines changed: 42 additions & 0 deletions
@@ -205,6 +205,48 @@ TEST_P(DurabilityActiveStreamTest, SendDcpAbort) {
     ASSERT_FALSE(resp);
 }
 
+TEST_P(DurabilityActiveStreamTest, RemoveUnknownSeqnoAckAtDestruction) {
+    auto vb = engine->getVBucket(vbid);
+
+    const auto key = makeStoredDocKey("key");
+    const auto& value = "value";
+    auto item = makePendingItem(
+            key,
+            value,
+            cb::durability::Requirements(cb::durability::Level::Majority,
+                                         1 /*timeout*/));
+    VBQueueItemCtx ctx;
+    ctx.durability =
+            DurabilityItemCtx{item->getDurabilityReqs(), nullptr /*cookie*/};
+
+    EXPECT_EQ(MutationStatus::WasClean, public_processSet(*vb, *item, ctx));
+    flushVBucketToDiskIfPersistent(vbid, 1);
+
+    // We don't include prepares in the numItems stat (should not exist in here)
+    EXPECT_EQ(0, vb->getNumItems());
+
+    // Our topology gives replica name as "replica" and our producer/stream has
+    // name "test_producer". Simulate a seqno ack by calling the vBucket level
+    // function.
+    vb->seqnoAcknowledged("test_producer", 1);
+
+    // An unknown seqno ack should not have committed the item
+    EXPECT_EQ(0, vb->getNumItems());
+
+    // Disconnect the ActiveStream
+    stream->setDead(END_STREAM_DISCONNECTED);
+
+    // If the seqno ack still existed in the queuedSeqnoAcks map then it would
+    // result in a commit on topology change
+    setVBucketStateAndRunPersistTask(
+            vbid,
+            vbucket_state_active,
+            {{"topology",
+              nlohmann::json::array(
+                      {{"active", "replica1", "test_producer"}})}});
+    EXPECT_EQ(0, vb->getNumItems());
+}
+
 void DurabilityPassiveStreamTest::SetUp() {
     SingleThreadedPassiveStreamTest::SetUp();
     consumer->enableSyncReplication();