
Commit e4a9db6

Optimize SeqCst fences

Author: Stjepan Glavina
1 parent: b2d6bbc

3 files changed: +29 -5 lines changed

src/bounded.rs

Lines changed: 3 additions & 3 deletions
@@ -1,7 +1,7 @@
 use std::cell::UnsafeCell;
 use std::marker::PhantomData;
 use std::mem::{self, MaybeUninit};
-use std::sync::atomic::{self, AtomicUsize, Ordering};
+use std::sync::atomic::{AtomicUsize, Ordering};
 use std::thread;
 
 use cache_padded::CachePadded;
@@ -145,7 +145,7 @@ impl<T> Bounded<T> {
                 }
             }
         } else if stamp.wrapping_add(self.one_lap) == tail + 1 {
-            atomic::fence(Ordering::SeqCst);
+            crate::full_fence();
             let head = self.head.load(Ordering::Relaxed);
 
             // If the head lags one lap behind the tail as well...
@@ -207,7 +207,7 @@ impl<T> Bounded<T> {
                 }
             }
         } else if stamp == head {
-            atomic::fence(Ordering::SeqCst);
+            crate::full_fence();
             let tail = self.tail.load(Ordering::Relaxed);
 
             // If the tail equals the head, that means the queue is empty.
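
Both fence hunks in this file follow the same pattern: a full fence sits between an earlier update to one end of the queue and a relaxed load of the opposite index (`head` in push, `tail` in pop). This is the classic store-load case that only a full SeqCst barrier orders. A minimal standalone sketch of that pattern (illustrative only, not the crate's code); with the fences in place, at least one of the two threads is guaranteed to observe `true`:

    use std::sync::atomic::{fence, AtomicBool, Ordering};

    static A: AtomicBool = AtomicBool::new(false);
    static B: AtomicBool = AtomicBool::new(false);

    // Thread 1: publish our flag, then check the other side.
    fn thread1() -> bool {
        A.store(true, Ordering::Relaxed);
        // Full barrier: the store above cannot be reordered after the load below.
        fence(Ordering::SeqCst);
        B.load(Ordering::Relaxed)
    }

    // Thread 2: the mirror image.
    fn thread2() -> bool {
        B.store(true, Ordering::Relaxed);
        fence(Ordering::SeqCst);
        A.load(Ordering::Relaxed)
    }

Without the fences (or with weaker barriers), both loads may return `false`; in the queue that would mean push and pop each misjudging whether the other side has caught up.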

src/lib.rs

Lines changed: 24 additions & 0 deletions
@@ -32,6 +32,7 @@
 
 use std::error;
 use std::fmt;
+use std::sync::atomic::{self, AtomicUsize, Ordering};
 
 use crate::bounded::Bounded;
 use crate::unbounded::Unbounded;
@@ -422,3 +423,26 @@ impl<T> fmt::Display for PushError<T> {
         }
     }
 }
+
+/// Equivalent to `atomic::fence(Ordering::SeqCst)`, but in some cases faster.
+#[inline]
+fn full_fence() {
+    if cfg!(any(target_arch = "x86", target_arch = "x86_64")) {
+        // HACK(stjepang): On x86 architectures there are two different ways of executing
+        // a `SeqCst` fence.
+        //
+        // 1. `atomic::fence(SeqCst)`, which compiles into a `mfence` instruction.
+        // 2. `_.compare_and_swap(_, _, SeqCst)`, which compiles into a `lock cmpxchg` instruction.
+        //
+        // Both instructions have the effect of a full barrier, but empirical benchmarks have shown
+        // that the second one is sometimes a bit faster.
+        //
+        // The ideal solution here would be to use inline assembly, but we're instead creating a
+        // temporary atomic variable and compare-and-exchanging its value. No sane compiler
+        // targeting x86 platforms is going to optimize this away.
+        let a = AtomicUsize::new(0);
+        a.compare_and_swap(0, 1, Ordering::SeqCst);
+    } else {
+        atomic::fence(Ordering::SeqCst);
+    }
+}
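
The comment's performance claim is easy to probe with a rough micro-benchmark. A hypothetical harness along these lines (results vary by CPU; on later Rust versions `compare_and_swap` is deprecated, so this sketch uses the equivalent `compare_exchange`):

    use std::sync::atomic::{self, AtomicUsize, Ordering};
    use std::time::Instant;

    const ITERS: u32 = 10_000_000;

    fn main() {
        // Variant 1: `atomic::fence(SeqCst)`, an `mfence` on x86.
        let start = Instant::now();
        for _ in 0..ITERS {
            atomic::fence(Ordering::SeqCst);
        }
        println!("fence(SeqCst):    {:?}", start.elapsed());

        // Variant 2: CAS on a throwaway atomic, a `lock cmpxchg` on x86.
        let start = Instant::now();
        for _ in 0..ITERS {
            let a = AtomicUsize::new(0);
            let _ = a.compare_exchange(0, 1, Ordering::SeqCst, Ordering::SeqCst);
        }
        println!("compare_exchange: {:?}", start.elapsed());
    }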

src/unbounded.rs

Lines changed: 2 additions & 2 deletions
@@ -2,7 +2,7 @@ use std::cell::UnsafeCell;
 use std::marker::PhantomData;
 use std::mem::MaybeUninit;
 use std::ptr;
-use std::sync::atomic::{self, AtomicPtr, AtomicUsize, Ordering};
+use std::sync::atomic::{AtomicPtr, AtomicUsize, Ordering};
 use std::thread;
 
 use cache_padded::CachePadded;
@@ -242,7 +242,7 @@ impl<T> Unbounded<T> {
         let mut new_head = head + (1 << SHIFT);
 
         if new_head & MARK_BIT == 0 {
-            atomic::fence(Ordering::SeqCst);
+            crate::full_fence();
             let tail = self.tail.index.load(Ordering::Relaxed);
 
             // If the tail equals the head, that means the queue is empty.
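
The surrounding arithmetic (`head + (1 << SHIFT)` and `new_head & MARK_BIT`) shows the queue packing a flag into the low bits of its head index. A hypothetical sketch of that kind of packing; the constant values here are illustrative and not necessarily what the crate uses:

    // Hypothetical packed index: the low bit is a mark flag and the
    // logical position occupies the bits above it.
    const MARK_BIT: usize = 1; // illustrative value, not the crate's
    const SHIFT: usize = 1;    // position bits start above the mark bit

    // Advance the position by one without disturbing the mark bit.
    fn advance(index: usize) -> usize {
        index + (1 << SHIFT)
    }

    fn position(index: usize) -> usize {
        index >> SHIFT
    }

    fn is_marked(index: usize) -> bool {
        index & MARK_BIT != 0
    }

    fn main() {
        let head = advance(0);
        assert_eq!(position(head), 1);
        assert!(!is_marked(head));
        assert!(is_marked(head | MARK_BIT));
    }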
