@@ -338,18 +338,20 @@ int64_t GetCurrentTimeNanos() {
   // to the same shared data.
   seq_read0 = time_state.seq.load(std::memory_order_acquire);
 
-  base_ns = time_state.last_sample.base_ns.load(std::memory_order_relaxed);
+  // The algorithm does not require that the following four loads be ordered
+  // with respect to one another; it requires only that they precede the load of
+  // time_state.seq below them. Nevertheless, we mark each of them as an
+  // acquire-load, rather than using a barrier immediately before the
+  // time_state.seq load, because the former is likely faster on most CPUs of
+  // interest. Architectures that may see a regression because of this approach
+  // include PowerPC and MIPS.
+  base_ns = time_state.last_sample.base_ns.load(std::memory_order_acquire);
   base_cycles =
-      time_state.last_sample.base_cycles.load(std::memory_order_relaxed);
+      time_state.last_sample.base_cycles.load(std::memory_order_acquire);
   nsscaled_per_cycle =
-      time_state.last_sample.nsscaled_per_cycle.load(std::memory_order_relaxed);
+      time_state.last_sample.nsscaled_per_cycle.load(std::memory_order_acquire);
   min_cycles_per_sample = time_state.last_sample.min_cycles_per_sample.load(
-      std::memory_order_relaxed);
-
-  // This acquire fence pairs with the release fence in SeqAcquire. Since it
-  // is sequenced between reads of shared data and seq_read1, the reads of
-  // shared data are effectively acquiring.
-  std::atomic_thread_fence(std::memory_order_acquire);
+      std::memory_order_acquire);
 
   // The shared-data reads are effectively acquire ordered, and the
   // shared-data writes are effectively release ordered. Therefore if our
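For context on the seqlock pattern this hunk modifies, below is a minimal, self-contained sketch, not the Abseil implementation; the names SampleData, g_seq, g_sample, ReadSample, and WriteSample are illustrative assumptions. It shows why acquire loads on the shared data can stand in for the acquire fence removed above: an acquire load forbids later memory operations, including the trailing re-read of the sequence counter, from being reordered before it.

// Minimal seqlock sketch (assumed names; not the Abseil code itself).
#include <atomic>
#include <cstdint>

struct SampleData {
  std::atomic<uint64_t> base_ns{0};
  std::atomic<uint64_t> base_cycles{0};
};

std::atomic<uint64_t> g_seq{0};  // even: data stable; odd: writer in progress
SampleData g_sample;

// Reader: succeeds only if it observes the same even sequence value before and
// after the data loads. Because the data loads are acquire loads, the final
// relaxed load of g_seq cannot be reordered before them, which is the ordering
// the removed atomic_thread_fence(std::memory_order_acquire) used to provide.
bool ReadSample(uint64_t* ns, uint64_t* cycles) {
  uint64_t s0 = g_seq.load(std::memory_order_acquire);
  if (s0 & 1) return false;  // a writer is mid-update; caller should retry
  *ns = g_sample.base_ns.load(std::memory_order_acquire);
  *cycles = g_sample.base_cycles.load(std::memory_order_acquire);
  uint64_t s1 = g_seq.load(std::memory_order_relaxed);
  return s0 == s1;  // data is consistent only if no writer intervened
}

// Single writer: mark the data unstable, update it, then publish a new even
// sequence value with release semantics so any reader that observes the new
// value also observes the data stores.
void WriteSample(uint64_t ns, uint64_t cycles) {
  uint64_t s = g_seq.load(std::memory_order_relaxed);
  g_seq.store(s + 1, std::memory_order_relaxed);         // now odd
  std::atomic_thread_fence(std::memory_order_release);   // odd store stays
                                                          // before data stores
  g_sample.base_ns.store(ns, std::memory_order_relaxed);
  g_sample.base_cycles.store(cycles, std::memory_order_relaxed);
  g_seq.store(s + 2, std::memory_order_release);          // even again
}

In the hunk above, seq_read0 plays the role of s0, the four last_sample field loads are the acquire data loads, and the seq_read1 mentioned in the removed comment is the trailing sequence re-check.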