
Commit 4e52bca

Author: Andreas Molzer
Fix memory ordering in needs_wake (#281)
* Fix memory ordering in needs_wake

  A memory fence is required to ensure that the decision about whether the kernel needs waking is consistent between user-space and the kernel. The kernel must not go to sleep without us reliably observing the wake flag; conversely, the kernel must see our updates to the submission head if our load does not observe the updated wake flag. The fence is paired with one in the kernel, which ensures that at least one of the two stores is observed by the corresponding load of the other party.

* Relax memory order of needs_wake

  There is no other state that is acquired by this load. The barrier that the library or its callers are required to provide already supplies the necessary synchronization with regard to kernel state.

* Fix Submit use of NEED_WAKEUP flag
1 parent 501ee78 commit 4e52bca
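
The pairing described above is the classic store-buffering pattern: each party stores to one shared location, then loads the other's. The following is a minimal sketch of that reasoning in Rust, assuming stand-in statics HEAD and NEED_WAKEUP in place of the real mapped ring fields, with one thread merely modeling the kernel's documented store/smp_mb/load sequence; it is an illustration, not this crate's API. Under the C++20 memory model that Rust adopts, the two SeqCst fences forbid the outcome in which neither side observes the other's store.

    use std::sync::atomic::{fence, AtomicU32, Ordering};
    use std::thread;

    // Illustrative stand-ins for the shared submission-queue head and flags.
    static HEAD: AtomicU32 = AtomicU32::new(0);
    static NEED_WAKEUP: AtomicU32 = AtomicU32::new(0);

    fn main() {
        let user = thread::spawn(|| {
            HEAD.store(1, Ordering::Relaxed);   // publish a new submission
            fence(Ordering::SeqCst);            // the fence this commit adds
            NEED_WAKEUP.load(Ordering::Relaxed) // then decide whether to wake
        });
        let kernel = thread::spawn(|| {
            NEED_WAKEUP.store(1, Ordering::Relaxed); // announce intent to sleep
            fence(Ordering::SeqCst);                 // models the kernel's smp_mb
            HEAD.load(Ordering::Relaxed)             // re-check for pending work
        });
        let (saw_flag, saw_head) = (user.join().unwrap(), kernel.join().unwrap());
        // Forbidden outcome: saw_flag == 0 && saw_head == 0. At least one side
        // observes the other's store, so either user-space wakes the kernel or
        // the kernel sees the new head before sleeping.
        assert!(saw_flag == 1 || saw_head == 1);
    }

The previous Acquire load provided no such guarantee: acquire only prevents later operations from moving before the load and synchronizes with a release store to the same location, neither of which forces the head store to become visible to the kernel before it commits to sleeping.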

File tree

2 files changed: +44 -2 lines

  src/squeue.rs
  src/submit.rs


src/squeue.rs

Lines changed: 36 additions & 1 deletion
@@ -178,10 +178,45 @@ impl<E: EntryMarker> SubmissionQueue<'_, E> {
 
     /// When [`is_setup_sqpoll`](crate::Parameters::is_setup_sqpoll) is set, whether the kernel
     /// threads has gone to sleep and requires a system call to wake it up.
+    ///
+    /// A result of `false` is only meaningful if the function was called after the latest update
+    /// to the queue head. Other interpretations could lead to a race condition where the kernel
+    /// concurrently put the device to sleep and no further progress is made.
     #[inline]
     pub fn need_wakeup(&self) -> bool {
+        // See discussions that happened in [#197] and its linked threads in liburing. We need to
+        // ensure that writes to the head have been visible _to the kernel_ if this load results in
+        // decision to sleep. This is solved with a SeqCst fence. There is no common modified
+        // memory location that would provide alternative synchronization.
+        //
+        // The kernel, from its sequencing, first writes the wake flag, then performs a full
+        // barrier (`smp_mb`, or `smp_mb__after_atomic`), then reads the head. We assume that our
+        // user first writes the head and then reads the `need_wakeup` flag as documented. It is
+        // necessary to ensure that at least one observes the other write. By establishing a point
+        // of sequential consistency on both sides between their respective write and read, at
+        // least one coherency order holds. With regards to the interpretation of the atomic memory
+        // model of Rust (that is, that of C++20) we're assuming that an `smp_mb` provides at least
+        // the effect of a `fence(SeqCst)`.
+        //
+        // [#197]: https://github.com/tokio-rs/io-uring/issues/197
+        atomic::fence(atomic::Ordering::SeqCst);
+        unsafe {
+            (*self.queue.flags).load(atomic::Ordering::Relaxed) & sys::IORING_SQ_NEED_WAKEUP != 0
+        }
+    }
+
+    /// The effect of [`Self::need_wakeup`], after synchronization work performed by the caller.
+    ///
+    /// This function should only be called if the caller can guarantee that a `SeqCst` fence has
+    /// been inserted after the last write to the queue's head. The function is then a little more
+    /// efficient by avoiding to perform one itself.
+    ///
+    /// Failure to uphold the precondition can result in an effective dead-lock due to a sleeping
+    /// device.
+    #[inline]
+    pub fn need_wakeup_after_intermittent_seqcst(&self) -> bool {
         unsafe {
-            (*self.queue.flags).load(atomic::Ordering::Acquire) & sys::IORING_SQ_NEED_WAKEUP != 0
+            (*self.queue.flags).load(atomic::Ordering::Relaxed) & sys::IORING_SQ_NEED_WAKEUP != 0
         }
     }
 
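As a usage illustration of the precondition documented above, here is a hedged sketch against the crate API; the submit_nop helper and its control flow are assumptions for the example, not code from this commit. The head is published with sync() before need_wakeup() is consulted, which is the ordering the new doc comment requires.

    use io_uring::{opcode, IoUring};

    // Hypothetical helper: push one no-op, then wake the SQPOLL thread if needed.
    fn submit_nop(ring: &mut IoUring) -> std::io::Result<()> {
        let nop = opcode::Nop::new().build();
        let needs_enter = {
            let mut sq = ring.submission();
            unsafe { sq.push(&nop).expect("submission queue full") };
            sq.sync(); // publish the new head to the shared ring first
            // `need_wakeup` issues the SeqCst fence itself, so a `false` here
            // reliably means the SQPOLL thread is still awake.
            sq.need_wakeup()
        };
        if needs_enter {
            // Enter the kernel; as src/submit.rs below shows, `submit` re-checks
            // the flag after its own fence and sets IORING_ENTER_SQ_WAKEUP.
            ring.submit()?;
        }
        Ok(())
    }

A plain submit() call without the manual check would also be correct, since Submitter performs the same fence-then-test internally; the check merely avoids the syscall while the poll thread is awake.
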
src/submit.rs

Lines changed: 8 additions & 1 deletion
@@ -59,7 +59,7 @@ impl<'a> Submitter<'a> {
     #[inline]
     fn sq_need_wakeup(&self) -> bool {
         unsafe {
-            (*self.sq_flags).load(atomic::Ordering::Acquire) & sys::IORING_SQ_NEED_WAKEUP != 0
+            (*self.sq_flags).load(atomic::Ordering::Relaxed) & sys::IORING_SQ_NEED_WAKEUP != 0
         }
     }
 
@@ -119,12 +119,17 @@ impl<'a> Submitter<'a> {
         // each cause an atomic load of the same variable, self.sq_flags.
         // In the hottest paths, when a server is running with sqpoll,
         // this is going to be hit twice, when once would be sufficient.
+        // However, consider that the `SeqCst` barrier required for interpreting
+        // the IORING_ENTER_SQ_WAKEUP bit is required in all paths where sqpoll
+        // is setup when consolidating the reads.
 
         if want > 0 || self.params.is_setup_iopoll() || self.sq_cq_overflow() {
             flags |= sys::IORING_ENTER_GETEVENTS;
         }
 
         if self.params.is_setup_sqpoll() {
+            // See discussion in [`SubmissionQueue::need_wakeup`].
+            atomic::fence(atomic::Ordering::SeqCst);
             if self.sq_need_wakeup() {
                 flags |= sys::IORING_ENTER_SQ_WAKEUP;
             } else if want == 0 {
@@ -150,6 +155,8 @@ impl<'a> Submitter<'a> {
         }
 
         if self.params.is_setup_sqpoll() {
+            // See discussion in [`SubmissionQueue::need_wakeup`].
+            atomic::fence(atomic::Ordering::SeqCst);
             if self.sq_need_wakeup() {
                 flags |= sys::IORING_ENTER_SQ_WAKEUP;
             } else if want == 0 {
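
Callers that can batch work behind a single barrier are the audience for the new need_wakeup_after_intermittent_seqcst. A small sketch of that pattern, assuming a hypothetical flush_batch helper that is not part of this commit:

    use std::sync::atomic::{fence, Ordering};

    // Hypothetical helper: check the wake flag once per batch, providing the
    // SeqCst fence that the `_after_intermittent_seqcst` variant requires.
    fn flush_batch(sq: &mut io_uring::squeue::SubmissionQueue<'_>) -> bool {
        sq.sync(); // write out the head for every entry pushed so far
        // One explicit fence discharges the documented precondition ...
        fence(Ordering::SeqCst);
        // ... so the cheaper, fence-free variant of the check is sound here.
        sq.need_wakeup_after_intermittent_seqcst()
    }

As the doc comment warns, skipping the fence is a liveness bug rather than a performance one: a missed wakeup can leave the SQPOLL thread asleep while submissions sit unprocessed.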

0 commit comments
