
Commit e4a9db6

Optimize SeqCst fences

Author: Stjepan Glavina
1 parent: b2d6bbc

3 files changed: +29 -5 lines changed

src/bounded.rs

Lines changed: 3 additions & 3 deletions
@@ -1,7 +1,7 @@
 use std::cell::UnsafeCell;
 use std::marker::PhantomData;
 use std::mem::{self, MaybeUninit};
-use std::sync::atomic::{self, AtomicUsize, Ordering};
+use std::sync::atomic::{AtomicUsize, Ordering};
 use std::thread;
 
 use cache_padded::CachePadded;
@@ -145,7 +145,7 @@ impl<T> Bounded<T> {
                 }
             }
         } else if stamp.wrapping_add(self.one_lap) == tail + 1 {
-            atomic::fence(Ordering::SeqCst);
+            crate::full_fence();
             let head = self.head.load(Ordering::Relaxed);
 
             // If the head lags one lap behind the tail as well...
@@ -207,7 +207,7 @@ impl<T> Bounded<T> {
                 }
             }
         } else if stamp == head {
-            atomic::fence(Ordering::SeqCst);
+            crate::full_fence();
             let tail = self.tail.load(Ordering::Relaxed);
 
             // If the tail equals the head, that means the queue is empty.
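
Both fence hunks in this file follow the same pattern: a full fence sits between an earlier update to one end of the queue and a relaxed load of the opposite index (`head` in push, `tail` in pop). This is the classic store-load case that only a full SeqCst barrier orders. A minimal standalone sketch of that pattern (illustrative only, not the crate's code); with the fences in place, at least one of the two threads is guaranteed to observe `true`:

    use std::sync::atomic::{fence, AtomicBool, Ordering};

    static A: AtomicBool = AtomicBool::new(false);
    static B: AtomicBool = AtomicBool::new(false);

    // Thread 1: publish our flag, then check the other side.
    fn thread1() -> bool {
        A.store(true, Ordering::Relaxed);
        // Full barrier: the store above cannot be reordered after the load below.
        fence(Ordering::SeqCst);
        B.load(Ordering::Relaxed)
    }

    // Thread 2: the mirror image.
    fn thread2() -> bool {
        B.store(true, Ordering::Relaxed);
        fence(Ordering::SeqCst);
        A.load(Ordering::Relaxed)
    }

Without the fences (or with weaker barriers), both loads may return `false`; in the queue that would mean push and pop each misjudging whether the other side has caught up.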

src/lib.rs

Lines changed: 24 additions & 0 deletions
@@ -32,6 +32,7 @@
 
 use std::error;
 use std::fmt;
+use std::sync::atomic::{self, AtomicUsize, Ordering};
 
 use crate::bounded::Bounded;
 use crate::unbounded::Unbounded;
@@ -422,3 +423,26 @@ impl<T> fmt::Display for PushError<T> {
         }
     }
 }
+
+/// Equivalent to `atomic::fence(Ordering::SeqCst)`, but in some cases faster.
+#[inline]
+fn full_fence() {
+    if cfg!(any(target_arch = "x86", target_arch = "x86_64")) {
+        // HACK(stjepang): On x86 architectures there are two different ways of executing
+        // a `SeqCst` fence.
+        //
+        // 1. `atomic::fence(SeqCst)`, which compiles into a `mfence` instruction.
+        // 2. `_.compare_and_swap(_, _, SeqCst)`, which compiles into a `lock cmpxchg` instruction.
+        //
+        // Both instructions have the effect of a full barrier, but empirical benchmarks have shown
+        // that the second one is sometimes a bit faster.
+        //
+        // The ideal solution here would be to use inline assembly, but we're instead creating a
+        // temporary atomic variable and compare-and-exchanging its value. No sane compiler
+        // targeting x86 platforms is going to optimize this away.
+        let a = AtomicUsize::new(0);
+        a.compare_and_swap(0, 1, Ordering::SeqCst);
+    } else {
+        atomic::fence(Ordering::SeqCst);
+    }
+}
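
The comment's performance claim is easy to probe with a rough micro-benchmark. A hypothetical harness along these lines (results vary by CPU; on later Rust versions `compare_and_swap` is deprecated, so this sketch uses the equivalent `compare_exchange`):

    use std::sync::atomic::{self, AtomicUsize, Ordering};
    use std::time::Instant;

    const ITERS: u32 = 10_000_000;

    fn main() {
        // Variant 1: `atomic::fence(SeqCst)`, an `mfence` on x86.
        let start = Instant::now();
        for _ in 0..ITERS {
            atomic::fence(Ordering::SeqCst);
        }
        println!("fence(SeqCst):    {:?}", start.elapsed());

        // Variant 2: CAS on a throwaway atomic, a `lock cmpxchg` on x86.
        let start = Instant::now();
        for _ in 0..ITERS {
            let a = AtomicUsize::new(0);
            let _ = a.compare_exchange(0, 1, Ordering::SeqCst, Ordering::SeqCst);
        }
        println!("compare_exchange: {:?}", start.elapsed());
    }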

src/unbounded.rs

Lines changed: 2 additions & 2 deletions
@@ -2,7 +2,7 @@ use std::cell::UnsafeCell;
 use std::marker::PhantomData;
 use std::mem::MaybeUninit;
 use std::ptr;
-use std::sync::atomic::{self, AtomicPtr, AtomicUsize, Ordering};
+use std::sync::atomic::{AtomicPtr, AtomicUsize, Ordering};
 use std::thread;
 
 use cache_padded::CachePadded;
@@ -242,7 +242,7 @@ impl<T> Unbounded<T> {
         let mut new_head = head + (1 << SHIFT);
 
         if new_head & MARK_BIT == 0 {
-            atomic::fence(Ordering::SeqCst);
+            crate::full_fence();
             let tail = self.tail.index.load(Ordering::Relaxed);
 
             // If the tail equals the head, that means the queue is empty.
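
The surrounding arithmetic (`head + (1 << SHIFT)` and `new_head & MARK_BIT`) shows the queue packing a flag into the low bits of its head index. A hypothetical sketch of that kind of packing; the constant values here are illustrative and not necessarily what the crate uses:

    // Hypothetical packed index: the low bit is a mark flag and the
    // logical position occupies the bits above it.
    const MARK_BIT: usize = 1; // illustrative value, not the crate's
    const SHIFT: usize = 1;    // position bits start above the mark bit

    // Advance the position by one without disturbing the mark bit.
    fn advance(index: usize) -> usize {
        index + (1 << SHIFT)
    }

    fn position(index: usize) -> usize {
        index >> SHIFT
    }

    fn is_marked(index: usize) -> bool {
        index & MARK_BIT != 0
    }

    fn main() {
        let head = advance(0);
        assert_eq!(position(head), 1);
        assert!(!is_marked(head));
        assert!(is_marked(head | MARK_BIT));
    }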
