Commit 89fa2bf
MB-39815: Add event-driven SyncWrite timeout handling
+Summary+
Adds a new event-driven mode for aborting SyncWrites which have exceeded their durability timeout. This has a much lower idle overhead than the current polling method. The default mode remains "polling"; a subsequent patch will change the default to "event-driven".

+Background+
When SyncWrites were introduced in 6.5.0, each SyncWrite request had a timeout associated with it: if the SyncWrite cannot be completed (Committed or Aborted) within that time, it is aborted and the client informed that it was not successful. This was implemented via simple (naive?) polling: a per-Bucket NonIO task is scheduled to run every 25ms (by default), and when it runs it checks every vBucket for any pending SyncWrites which have now exceeded their timeout.

Functionally this works fine, however it is relatively expensive: every 25ms we must iterate across every vBucket on every Bucket, and call into the DurabilityMonitor to check for SyncWrites which should be timed out. This is the case irrespective of whether any SyncWrites are overdue, or even whether any SyncWrites exist at all. For example, an idle node with 10 Buckets shows 35% CPU utilization, the vast majority of which is NonIO threads running the DurabilityTimeoutTask. This is obviously undesirable, and the issue scales with even larger bucket counts.

+Solution+
To reduce the idle CPU usage, change from a polling to an event-driven model: a per-vBucket task which is scheduled to run only when the next SyncWrite for that vBucket is due to time out. We only need one task per vBucket (and not one per SyncWrite) because SyncWrites within a vBucket must always complete in-order; therefore we only need to consider the timeout of the oldest SyncWrite in the ActiveDurabilityMonitor for a given vBucket.

This task is only executed _if_ the next SyncWrite isn't otherwise Committed before the timeout. When the SyncWrite is Committed, the task is re-scheduled to run when the _next_ SyncWrite is due, or cancelled if there are no more SyncWrites in progress for the vBucket. As such, the CPU cost for SyncWrite timeout handling when the Bucket is idle drops to zero; nothing is executed.

There are some additional costs with the event-driven approach:
1. Additional CPU cost whenever the ActiveDM::trackedWrites container changes (specifically when the head changes), as we must reschedule or cancel the per-vBucket task. However that is less than 1 microsecond with the default FollyExecutorPool, so is likely dwarfed by the other activity around adding / Committing SyncWrites.
2. Additional memory footprint for 1024 tasks instead of 1 (per Bucket). Note that this is relatively insignificant: each ExpiredSWCallback task is 96 bytes, so each Bucket grows by at most 96KB (only active vBuckets have an ExpiredSWCallback).

Change-Id: Ia70a68f4d1551a3407c8bdbb56e91eb5f5f995e2
Reviewed-on: http://review.couchbase.org/c/kv_engine/+/130419
Reviewed-by: Ben Huddleston <[email protected]>
Reviewed-by: Paolo Cocchi <[email protected]>
Tested-by: Build Bot <[email protected]>
1 parent 5cf6c45 commit 89fa2bf

31 files changed (+421, -75 lines)

engines/ep/benchmarks/defragmenter_bench.cc

Lines changed: 1 addition & 0 deletions
@@ -55,6 +55,7 @@ class DefragmentBench : public benchmark::Fixture {
                 /*newSeqnoCb*/ nullptr,
                 [](Vbid) { return; },
                 NoopSyncWriteCompleteCb,
+                NoopSyncWriteTimeoutFactory,
                 NoopSeqnoAckCb,
                 ImmediateCkptDisposer,
                 config,

engines/ep/benchmarks/item_compressor_bench.cc

Lines changed: 1 addition & 0 deletions
@@ -55,6 +55,7 @@ class ItemCompressorBench : public benchmark::Fixture {
                 /*newSeqnoCb*/ nullptr,
                 [](Vbid) { return; },
                 NoopSyncWriteCompleteCb,
+                NoopSyncWriteTimeoutFactory,
                 NoopSeqnoAckCb,
                 ImmediateCkptDisposer,
                 config,

engines/ep/configuration.json

Lines changed: 17 additions & 1 deletion
@@ -467,11 +467,27 @@
             "dynamic": true,
             "type": "size_t"
         },
+        "durability_timeout_mode": {
+            "default": "polling",
+            "descr": "How should durability timeouts be scheduled? polling=periodic task running every 'durability_timeout_task_interval'; event-driven=per-VBucket tasks scheduled based on when next SyncWrite will time out.",
+            "dynamic": false,
+            "type": "std::string",
+            "validator": {
+                "enum": [
+                    "polling",
+                    "event-driven"
+                ]
+            }
+        },
        "durability_timeout_task_interval": {
             "default": "25",
             "descr": "Interval (in ms) between subsequent runs of the DurabilityTimeoutTask",
             "dynamic": true,
-            "type": "size_t"
+            "type": "size_t",
+            "requires": {
+                "durability_timeout_mode": "polling"
+            }
         },
         "durability_min_level": {
             "default": "none",

engines/ep/src/durability/active_durability_monitor.cc

Lines changed: 59 additions & 10 deletions
@@ -143,19 +143,23 @@ class ActiveDurabilityMonitor::ResolvedQueue {
     }
 };
 
-ActiveDurabilityMonitor::ActiveDurabilityMonitor(EPStats& stats, VBucket& vb)
+ActiveDurabilityMonitor::ActiveDurabilityMonitor(
+        EPStats& stats,
+        VBucket& vb,
+        std::unique_ptr<EventDrivenDurabilityTimeoutIface> nextExpiryChanged)
     : stats(stats),
       vb(vb),
-      state(std::make_unique<State>(*this)),
+      state(std::make_unique<State>(*this, std::move(nextExpiryChanged))),
       resolvedQueue(std::make_unique<ResolvedQueue>(vb.getId())) {
 }
 
 ActiveDurabilityMonitor::ActiveDurabilityMonitor(
         EPStats& stats,
         VBucket& vb,
         const vbucket_state& vbs,
+        std::unique_ptr<EventDrivenDurabilityTimeoutIface> nextExpiryChanged,
         std::vector<queued_item>&& outstandingPrepares)
-    : ActiveDurabilityMonitor(stats, vb) {
+    : ActiveDurabilityMonitor(stats, vb, std::move(nextExpiryChanged)) {
     if (!vbs.transition.replicationTopology.is_null()) {
         setReplicationTopology(vbs.transition.replicationTopology);
     }
@@ -165,6 +169,8 @@ ActiveDurabilityMonitor::ActiveDurabilityMonitor(
         // Any outstanding prepares "grandfathered" into the DM from warmup
         // should have an infinite timeout (we cannot abort them as they
         // may already have been Committed before we restarted).
+        // (This also means there's no need to consider scheduling the
+        // timeout callback).
         Expects(prepare->getDurabilityReqs().getTimeout().isInfinite());
         s->trackedWrites.emplace_back(nullptr,
                                       std::move(prepare),
@@ -183,9 +189,11 @@ ActiveDurabilityMonitor::ActiveDurabilityMonitor(
     s->highCompletedSeqno.reset(vbs.persistedCompletedSeqno);
 }
 
-ActiveDurabilityMonitor::ActiveDurabilityMonitor(EPStats& stats,
-                                                 PassiveDurabilityMonitor&& pdm)
-    : ActiveDurabilityMonitor(stats, pdm.vb) {
+ActiveDurabilityMonitor::ActiveDurabilityMonitor(
+        EPStats& stats,
+        PassiveDurabilityMonitor&& pdm,
+        std::unique_ptr<EventDrivenDurabilityTimeoutIface> nextExpiryChanged)
+    : ActiveDurabilityMonitor(stats, pdm.vb, std::move(nextExpiryChanged)) {
     EP_LOG_INFO(
             "ActiveDurabilityMonitor::ctor(PDM&&): {} Transitioning from "
             "PDM: HPS:{}, HCS:{}, numTracked:{}, highestTracked:{}",
@@ -197,7 +205,13 @@ ActiveDurabilityMonitor::ActiveDurabilityMonitor(EPStats& stats,
 
     auto s = state.wlock();
     for (auto& write : pdm.state.wlock()->trackedWrites) {
+        // Any prepares converted from the PDM into the ADM have an infinite
+        // timeout set (we cannot abort them as they may already have been
+        // Committed when we were non-active).
+        // This also means there's no need to consider scheduling the timeout
+        // callback here.
         s->trackedWrites.emplace_back(std::move(write));
+        Expects(!s->trackedWrites.back().getExpiryTime());
     }
 
     if (!s->trackedWrites.empty()) {
@@ -344,7 +358,7 @@ void ActiveDurabilityMonitor::processTimeout(
     }
 
     // Identify SyncWrites which can be timed out as of this time point
-    // and should be aborted, transferring them into the completedQeuue (under
+    // and should be aborted, transferring them into the completedQueue (under
     // the correct locks).
     state.wlock()->removeExpired(asOf, *resolvedQueue);
 
@@ -559,8 +573,10 @@ void ActiveDurabilityMonitor::removedQueuedAck(const std::string& node) {
     state.wlock()->queuedSeqnoAcks.erase(node);
 }
 
-ActiveDurabilityMonitor::State::State(const ActiveDurabilityMonitor& adm)
-    : adm(adm) {
+ActiveDurabilityMonitor::State::State(
+        ActiveDurabilityMonitor& adm,
+        std::unique_ptr<EventDrivenDurabilityTimeoutIface> nextExpiryChanged)
+    : adm(adm), nextExpiryChanged(std::move(nextExpiryChanged)) {
     const auto prefix = "ActiveDM(" + adm.vb.getId().to_string() + ")::State::";
     lastTrackedSeqno.setLabel(prefix + "lastTrackedSeqno");
     lastCommittedSeqno.setLabel(prefix + "lastCommittedSeqno");
@@ -804,11 +820,15 @@ ActiveDurabilityMonitor::State::removeSyncWrite(Container::iterator it,
     // the SyncWrite from trackedWrites.
     it->resetChains();
 
+    // If we are removing the first element then (a) the "previous" item is
+    // different, and (b) we need to re-schedule the SyncWrite timeout task.
+    const bool removingFirstElement = it == trackedWrites.begin();
+
     Container::iterator prev;
     // Note: iterators in trackedWrites are never singular, Container::end
     // is used as placeholder element for when an iterator cannot point to
     // any valid element in Container
-    if (it == trackedWrites.begin()) {
+    if (removingFirstElement) {
         prev = trackedWrites.end();
     } else {
         prev = std::prev(it);
@@ -839,6 +859,14 @@ ActiveDurabilityMonitor::State::removeSyncWrite(Container::iterator it,
 
     Container removed;
     removed.splice(removed.end(), trackedWrites, it);
+
+    if (removingFirstElement) {
+        // If first element was removed, then a new SyncWrite (or possibly none
+        // at all) is at the head of trackedWrites and hence now the next
+        // SyncWrite to be timed out - reschedule the timeout callback.
+        scheduleTimeoutCallback();
+    }
+
     return std::move(removed.front());
 }
 
@@ -1426,11 +1454,20 @@ void ActiveDurabilityMonitor::State::addSyncWrite(const CookieIface* cookie,
                                                   queued_item item) {
     Expects(firstChain.get());
     const auto seqno = item->getBySeqno();
+    const auto wasEmpty = trackedWrites.empty();
     trackedWrites.emplace_back(cookie,
                                std::move(item),
                                defaultTimeout,
                                firstChain.get(),
                                secondChain.get());
+
+    if (wasEmpty) {
+        // trackedWrites transitioned from empty to non-empty; so the front
+        // item has changed (we now have one) and hence the next timeout
+        // callback should be scheduled.
+        scheduleTimeoutCallback();
+    }
+
     lastTrackedSeqno = seqno;
     totalAccepted++;
 }
@@ -1457,6 +1494,18 @@ void ActiveDurabilityMonitor::State::removeExpired(
     }
 }
 
+void ActiveDurabilityMonitor::State::scheduleTimeoutCallback() {
+    if (!trackedWrites.empty()) {
+        const auto nextExpiry = trackedWrites.front().getExpiryTime();
+        if (nextExpiry) {
+            nextExpiryChanged->updateNextExpiryTime(*nextExpiry);
+            return;
+        }
+    }
+    // No SyncWrites exist, or no expiry set - cancel expiry task.
+    nextExpiryChanged->cancelNextExpiryTime();
+}
+
 void ActiveDurabilityMonitor::State::updateHighPreparedSeqno(
         ResolvedQueue& completed) {
     // Note: All the logic below relies on the fact that HPS for Active is

engines/ep/src/durability/active_durability_monitor.h

Lines changed: 17 additions & 4 deletions
@@ -103,9 +103,16 @@ class ActiveDurabilityMonitor : public DurabilityMonitor {
     // Container type used for State::trackedWrites
     using Container = std::list<DurabilityMonitor::ActiveSyncWrite>;
 
-    // Note: constructor and destructor implementation in the .cc file to allow
-    // the forward declaration of ReplicationChain in the header
-    ActiveDurabilityMonitor(EPStats& stats, VBucket& vb);
+    /**
+     * Construct an ActiveDM for the given vBucket.
+     * @param stats EPStats object for the associated Bucket.
+     * @param vb VBucket which owns this Durability Monitor.
+     * @param nextExpiryChanged Object to use for timing out SyncWrites.
+     */
+    ActiveDurabilityMonitor(EPStats& stats,
+                            VBucket& vb,
+                            std::unique_ptr<EventDrivenDurabilityTimeoutIface>
+                                    nextExpiryChanged);
 
     /**
      * Construct an ActiveDM for the given vBucket, with the specified
@@ -114,6 +121,7 @@ class ActiveDurabilityMonitor : public DurabilityMonitor {
      * @param stats EPStats object for the associated Bucket.
      * @param vb VBucket which owns this Durability Monitor.
      * @param vbs reference to the vbucket_state found at warmup
+     * @param nextExpiryChanged Object to use for timing out SyncWrites.
      * @param outstandingPrepares In-flight prepares which the DM should take
      *        responsibility for.
      *        These must be ordered by ascending seqno, otherwise
@@ -122,6 +130,8 @@ class ActiveDurabilityMonitor : public DurabilityMonitor {
     ActiveDurabilityMonitor(EPStats& stats,
                             VBucket& vb,
                             const vbucket_state& vbs,
+                            std::unique_ptr<EventDrivenDurabilityTimeoutIface>
+                                    nextExpiryChanged,
                             std::vector<queued_item>&& outstandingPrepares);
 
     /**
@@ -131,7 +141,10 @@ class ActiveDurabilityMonitor : public DurabilityMonitor {
      * @param stats EPStats object for the associated Bucket.
      * @param pdm The PassiveDM to be converted
      */
-    ActiveDurabilityMonitor(EPStats& stats, PassiveDurabilityMonitor&& pdm);
+    ActiveDurabilityMonitor(EPStats& stats,
+                            PassiveDurabilityMonitor&& pdm,
+                            std::unique_ptr<EventDrivenDurabilityTimeoutIface>
+                                    nextExpiryChanged);
 
     ~ActiveDurabilityMonitor() override;
 

engines/ep/src/durability/durability_monitor_impl.cc

Lines changed: 5 additions & 0 deletions
@@ -123,6 +123,11 @@ DurabilityMonitor::ActiveSyncWrite::getStartTime() const {
     return startTime;
 }
 
+std::optional<std::chrono::steady_clock::time_point>
+DurabilityMonitor::ActiveSyncWrite::getExpiryTime() const {
+    return expiryTime;
+}
+
 void DurabilityMonitor::ActiveSyncWrite::ack(const std::string& node) {
     if (!firstChain) {
         throw std::logic_error(

engines/ep/src/durability/durability_monitor_impl.h

Lines changed: 22 additions & 1 deletion
@@ -165,6 +165,12 @@ class DurabilityMonitor::ActiveSyncWrite : public DurabilityMonitor::SyncWrite {
 
     std::chrono::steady_clock::time_point getStartTime() const;
 
+    /**
+     * @returns The time point this SyncWrite will expire at. Will return
+     * an empty optional for SyncWrites which have no expiry time set.
+     */
+    std::optional<std::chrono::steady_clock::time_point> getExpiryTime() const;
+
     /**
      * Notify this SyncWrite that it has been ack'ed by node.
      *
@@ -406,7 +412,9 @@ struct ActiveDurabilityMonitor::State {
     /**
     * @param adm The owning ActiveDurabilityMonitor
      */
-    explicit State(const ActiveDurabilityMonitor& adm);
+    explicit State(ActiveDurabilityMonitor& adm,
+                   std::unique_ptr<EventDrivenDurabilityTimeoutIface>
+                           nextExpiryChanged);
 
     /**
      * Create a replication chain. Not static as we require an iterator from
@@ -499,6 +507,14 @@ struct ActiveDurabilityMonitor::State {
     void removeExpired(std::chrono::steady_clock::time_point asOf,
                        ResolvedQueue& expired);
 
+    /**
+     * Schedule the timeout callback based on the state of trackedWrites.
+     * If trackedWrites is non-empty then schedule timeout callback to run
+     * when trackedWrites.front() is due to expire; otherwise cancel the
+     * timeout callback.
+     */
+    void scheduleTimeoutCallback();
+
     /// @returns the name of the active node. Assumes the first chain is valid.
     const std::string& getActive() const;
 
@@ -736,6 +752,11 @@ struct ActiveDurabilityMonitor::State {
     std::unordered_map<std::string, Monotonic<int64_t, ThrowExceptionPolicy>>
             queuedSeqnoAcks;
 
+    /// Interface to the VBucket's SyncWriteExpiry task, used to schedule when
+    /// the task should run to cancel (abort) any SyncWrites which have
+    /// exceeded their durability timeout.
+    std::unique_ptr<EventDrivenDurabilityTimeoutIface> nextExpiryChanged;
+
     friend std::ostream& operator<<(std::ostream& os, const State& s) {
         os << "#trackedWrites:" << s.trackedWrites.size()
            << " highPreparedSeqno:" << s.highPreparedSeqno
