Commit d1d6185

Merge pull request #4577 from RalfJung/release-seq
Fix release/acquire synchronization for loads from the store buffer
2 parents 6f1f392 + 724c99e commit d1d6185
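
The gist of the fix, sketched as a standard message-passing litmus test (illustrative only, not a test from this PR): when the Acquire load reads the stale value 0 out of the store buffer, it must not synchronize with the Release store of 1. Previously, the load acquired the location's most recent synchronization clock regardless of which buffered store it actually returned.

```rust
use std::sync::atomic::{AtomicUsize, Ordering};
use std::thread;

static DATA: AtomicUsize = AtomicUsize::new(0);
static FLAG: AtomicUsize = AtomicUsize::new(0);

fn main() {
    let writer = thread::spawn(|| {
        DATA.store(1, Ordering::Relaxed);
        FLAG.store(1, Ordering::Release); // publishes DATA
    });
    let reader = thread::spawn(|| {
        // With weak memory emulation, this load may return either the stale 0
        // (from the store buffer) or the latest 1. Only reading 1 synchronizes
        // with the writer; reading 0 must not acquire the writer's clock.
        if FLAG.load(Ordering::Acquire) == 1 {
            assert_eq!(DATA.load(Ordering::Relaxed), 1);
        }
    });
    writer.join().unwrap();
    reader.join().unwrap();
}
```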

3 files changed: +157 −44 lines changed

src/concurrency/data_race.rs

Lines changed: 36 additions & 13 deletions
@@ -183,6 +183,9 @@ struct AtomicMemoryCellClocks {
     /// contains the vector of timestamps that will
     /// happen-before a thread if an acquire-load is
     /// performed on the data.
+    ///
+    /// With weak memory emulation, this is the clock of the most recent write. It is then only used
+    /// for release sequences, to integrate the most recent clock into the next one for RMWs.
     sync_vector: VClock,

     /// The size of accesses to this atomic location.
@@ -276,7 +279,7 @@ struct MemoryCellClocks {
     /// zero on each write operation.
     read: VClock,

-    /// Atomic access, acquire, release sequence tracking clocks.
+    /// Atomic access tracking clocks.
     /// For non-atomic memory this value is set to None.
     /// For atomic memory, each byte carries this information.
     atomic_ops: Option<Box<AtomicMemoryCellClocks>>,
@@ -504,10 +507,11 @@ impl MemoryCellClocks {
         thread_clocks: &mut ThreadClockSet,
         index: VectorIdx,
         access_size: Size,
+        sync_clock: Option<&VClock>,
     ) -> Result<(), DataRace> {
         self.atomic_read_detect(thread_clocks, index, access_size)?;
-        if let Some(atomic) = self.atomic() {
-            thread_clocks.clock.join(&atomic.sync_vector);
+        if let Some(sync_clock) = sync_clock.or_else(|| self.atomic().map(|a| &a.sync_vector)) {
+            thread_clocks.clock.join(sync_clock);
         }
         Ok(())
     }
@@ -520,10 +524,11 @@ impl MemoryCellClocks {
         thread_clocks: &mut ThreadClockSet,
         index: VectorIdx,
         access_size: Size,
+        sync_clock: Option<&VClock>,
     ) -> Result<(), DataRace> {
         self.atomic_read_detect(thread_clocks, index, access_size)?;
-        if let Some(atomic) = self.atomic() {
-            thread_clocks.fence_acquire.join(&atomic.sync_vector);
+        if let Some(sync_clock) = sync_clock.or_else(|| self.atomic().map(|a| &a.sync_vector)) {
+            thread_clocks.fence_acquire.join(sync_clock);
         }
         Ok(())
     }
@@ -555,7 +560,8 @@ impl MemoryCellClocks {
         // The handling of release sequences was changed in C++20 and so
         // the code here is different to the paper since now all relaxed
         // stores block release sequences. The exception for same-thread
-        // relaxed stores has been removed.
+        // relaxed stores has been removed. We always overwrite the `sync_vector`,
+        // meaning the previous release sequence is broken.
         let atomic = self.atomic_mut_unwrap();
         atomic.sync_vector.clone_from(&thread_clocks.fence_release);
         Ok(())
@@ -571,6 +577,8 @@
     ) -> Result<(), DataRace> {
         self.atomic_write_detect(thread_clocks, index, access_size)?;
         let atomic = self.atomic_mut_unwrap();
+        // This *joining* of `sync_vector` implements release sequences: future
+        // reads of this location will acquire our clock *and* what was here before.
         atomic.sync_vector.join(&thread_clocks.clock);
         Ok(())
     }
@@ -585,6 +593,8 @@
     ) -> Result<(), DataRace> {
         self.atomic_write_detect(thread_clocks, index, access_size)?;
         let atomic = self.atomic_mut_unwrap();
+        // This *joining* of `sync_vector` implements release sequences: future
+        // reads of this location will acquire our fence clock *and* what was here before.
         atomic.sync_vector.join(&thread_clocks.fence_release);
         Ok(())
     }
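
For context on what this joining buys, here is the release-sequence pattern in user code, as a minimal sketch (not a test from this PR): an RMW continues the release sequence headed by an earlier Release store even when the RMW itself is Relaxed, so an Acquire load that reads the RMW's value still synchronizes with the Release store.

```rust
use std::sync::atomic::{AtomicUsize, Ordering};
use std::thread;

static DATA: AtomicUsize = AtomicUsize::new(0);
static X: AtomicUsize = AtomicUsize::new(0);

fn main() {
    let a = thread::spawn(|| {
        DATA.store(1, Ordering::Relaxed);
        X.store(1, Ordering::Release); // heads a release sequence
    });
    let b = thread::spawn(|| {
        // An RMW continues the release sequence, even at Relaxed ordering.
        X.fetch_add(1, Ordering::Relaxed);
    });
    let c = thread::spawn(|| {
        // Reading 2 means the RMW read from a's store, so this Acquire load
        // synchronizes with a's Release store and must observe DATA == 1.
        if X.load(Ordering::Acquire) == 2 {
            assert_eq!(DATA.load(Ordering::Relaxed), 1);
        }
    });
    a.join().unwrap();
    b.join().unwrap();
    c.join().unwrap();
}
```
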
@@ -731,8 +741,8 @@ pub trait EvalContextExt<'tcx>: MiriInterpCxExt<'tcx> {
         }

         let scalar = this.allow_data_races_ref(move |this| this.read_scalar(place))?;
-        let buffered_scalar = this.buffered_atomic_read(place, atomic, scalar, || {
-            this.validate_atomic_load(place, atomic)
+        let buffered_scalar = this.buffered_atomic_read(place, atomic, scalar, |sync_clock| {
+            this.validate_atomic_load(place, atomic, sync_clock)
         })?;
         interp_ok(buffered_scalar.ok_or_else(|| err_ub!(InvalidUninitBytes(None)))?)
     }
@@ -930,7 +940,7 @@ pub trait EvalContextExt<'tcx>: MiriInterpCxExt<'tcx> {
             this.validate_atomic_rmw(place, success)?;
             this.buffered_atomic_rmw(new, place, success, old.to_scalar())?;
         } else {
-            this.validate_atomic_load(place, fail)?;
+            this.validate_atomic_load(place, fail, /* can use latest sync clock */ None)?;
             // A failed compare exchange is equivalent to a load, reading from the latest store
             // in the modification order.
             // Since `old` is only a value and not the store element, we need to separately
@@ -1172,6 +1182,18 @@ impl VClockAlloc {
         }))?
     }

+    /// Return the release/acquire synchronization clock for the given memory range.
+    pub(super) fn sync_clock(&self, access_range: AllocRange) -> VClock {
+        let alloc_ranges = self.alloc_ranges.borrow();
+        let mut clock = VClock::default();
+        for (_, mem_clocks) in alloc_ranges.iter(access_range.start, access_range.size) {
+            if let Some(atomic) = mem_clocks.atomic() {
+                clock.join(&atomic.sync_vector);
+            }
+        }
+        clock
+    }
+
     /// Detect data-races for an unsynchronized read operation. It will not perform
     /// data-race detection if `race_detecting()` is false, either due to no threads
     /// being created or if it is temporarily disabled during a racy read or write
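
For readers new to the clock algebra used here: `join` is the element-wise maximum of two vector clocks, so the joined clock happens-after everything either input happened-after. A hypothetical standalone sketch (Miri's real `VClock` lives in `src/concurrency/vector_clock.rs` and differs in detail):

```rust
/// A toy vector clock, for illustration only.
#[derive(Clone, Debug, Default, PartialEq)]
struct MiniVClock(Vec<u64>);

impl MiniVClock {
    /// Element-wise maximum: afterwards `self` dominates both inputs.
    fn join(&mut self, other: &MiniVClock) {
        if self.0.len() < other.0.len() {
            self.0.resize(other.0.len(), 0);
        }
        for (a, &b) in self.0.iter_mut().zip(other.0.iter()) {
            *a = (*a).max(b);
        }
    }
}

fn main() {
    let mut a = MiniVClock(vec![3, 0, 1]);
    a.join(&MiniVClock(vec![1, 4]));
    assert_eq!(a, MiniVClock(vec![3, 4, 1]));
}
```
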
@@ -1448,6 +1470,7 @@ trait EvalContextPrivExt<'tcx>: MiriInterpCxExt<'tcx> {
         &self,
         place: &MPlaceTy<'tcx>,
         atomic: AtomicReadOrd,
+        sync_clock: Option<&VClock>,
     ) -> InterpResult<'tcx> {
         let this = self.eval_context_ref();
         this.validate_atomic_op(
@@ -1456,9 +1479,9 @@
             AccessType::AtomicLoad,
             move |memory, clocks, index, atomic| {
                 if atomic == AtomicReadOrd::Relaxed {
-                    memory.load_relaxed(&mut *clocks, index, place.layout.size)
+                    memory.load_relaxed(&mut *clocks, index, place.layout.size, sync_clock)
                 } else {
-                    memory.load_acquire(&mut *clocks, index, place.layout.size)
+                    memory.load_acquire(&mut *clocks, index, place.layout.size, sync_clock)
                 }
             },
         )
@@ -1503,9 +1526,9 @@
             AccessType::AtomicRmw,
             move |memory, clocks, index, _| {
                 if acquire {
-                    memory.load_acquire(clocks, index, place.layout.size)?;
+                    memory.load_acquire(clocks, index, place.layout.size, None)?;
                 } else {
-                    memory.load_relaxed(clocks, index, place.layout.size)?;
+                    memory.load_relaxed(clocks, index, place.layout.size, None)?;
                 }
                 if release {
                     memory.rmw_release(clocks, index, place.layout.size)

src/concurrency/weak_memory.rs

Lines changed: 65 additions & 30 deletions
@@ -62,8 +62,11 @@
 //! You can refer to test cases in weak_memory/extra_cpp.rs and weak_memory/extra_cpp_unsafe.rs for examples of these operations.

 // Our and the author's own implementation (tsan11) of the paper have some deviations from the provided operational semantics in §5.3:
-// 1. In the operational semantics, store elements keep a copy of the atomic object's vector clock (AtomicCellClocks::sync_vector in miri),
-//    but this is not used anywhere so it's omitted here.
+// 1. In the operational semantics, loads acquire the vector clock of the atomic location
+//    irrespective of which store buffer element is loaded. That's incorrect; the synchronization clock
+//    needs to be tracked per-store-buffer-element. (The paper has a field "clocks" for that purpose,
+//    but it is not actually used.) tsan11 does this correctly
+//    (https://github.com/ChrisLidbury/tsan11/blob/ecbd6b81e9b9454e01cba78eb9d88684168132c7/lib/tsan/rtl/tsan_relaxed.cc#L305).
 //
 // 2. In the operational semantics, each store element keeps the timestamp of a thread when it loads from the store.
 //    If the same thread loads from the same store element multiple times, then the timestamps at all loads are saved in a list of load elements.
@@ -138,16 +141,17 @@ enum LoadRecency {

 #[derive(Debug, Clone, PartialEq, Eq)]
 struct StoreElement {
-    /// The identifier of the vector index, corresponding to a thread
-    /// that performed the store.
-    store_index: VectorIdx,
+    /// The thread that performed the store.
+    store_thread: VectorIdx,
+    /// The timestamp of the storing thread when it performed the store
+    store_timestamp: VTimestamp,
+
+    /// The vector clock that can be acquired by loading this store.
+    sync_clock: VClock,

     /// Whether this store is SC.
     is_seqcst: bool,

-    /// The timestamp of the storing thread when it performed the store
-    timestamp: VTimestamp,
-
     /// The value of this store. `None` means uninitialized.
     // FIXME: Currently, we cannot represent partial initialization.
     val: Option<Scalar>,
@@ -159,9 +163,12 @@ struct StoreElement {

 #[derive(Debug, Clone, PartialEq, Eq, Default)]
 struct LoadInfo {
-    /// Timestamp of first loads from this store element by each thread
+    /// Timestamp of first loads from this store element by each thread.
     timestamps: FxHashMap<VectorIdx, VTimestamp>,
-    /// Whether this store element has been read by an SC load
+    /// Whether this store element has been read by an SC load.
+    /// This is crucial to ensure we respect coherence-ordered-before. Concretely we use
+    /// this to ensure that if a store element is seen by an SC load, then all later SC loads
+    /// cannot see `mo`-earlier store elements.
     sc_loaded: bool,
 }

@@ -243,8 +250,10 @@ impl<'tcx> StoreBuffer {
         let store_elem = StoreElement {
             // The thread index and timestamp of the initialisation write
             // are never meaningfully used, so it's fine to leave them as 0
-            store_index: VectorIdx::from(0),
-            timestamp: VTimestamp::ZERO,
+            store_thread: VectorIdx::from(0),
+            store_timestamp: VTimestamp::ZERO,
+            // The initialization write is non-atomic so nothing can be acquired.
+            sync_clock: VClock::default(),
             val: init,
             is_seqcst: false,
             load_info: RefCell::new(LoadInfo::default()),
@@ -273,7 +282,7 @@
         thread_mgr: &ThreadManager<'_>,
         is_seqcst: bool,
         rng: &mut (impl rand::Rng + ?Sized),
-        validate: impl FnOnce() -> InterpResult<'tcx>,
+        validate: impl FnOnce(Option<&VClock>) -> InterpResult<'tcx>,
     ) -> InterpResult<'tcx, (Option<Scalar>, LoadRecency)> {
         // Having a live borrow to store_buffer while calling validate_atomic_load is fine
         // because the race detector doesn't touch store_buffer
@@ -290,7 +299,7 @@
             // after we've picked a store element from the store buffer, as presented
             // in ATOMIC LOAD rule of the paper. This is because fetch_store
             // requires access to ThreadClockSet.clock, which is updated by the race detector
-            validate()?;
+            validate(Some(&store_elem.sync_clock))?;

             let (index, clocks) = global.active_thread_state(thread_mgr);
             let loaded = store_elem.load_impl(index, &clocks, is_seqcst);
@@ -303,10 +312,11 @@
         global: &DataRaceState,
         thread_mgr: &ThreadManager<'_>,
         is_seqcst: bool,
+        sync_clock: VClock,
     ) -> InterpResult<'tcx> {
         let (index, clocks) = global.active_thread_state(thread_mgr);

-        self.store_impl(val, index, &clocks.clock, is_seqcst);
+        self.store_impl(val, index, &clocks.clock, is_seqcst, sync_clock);
         interp_ok(())
     }

@@ -333,7 +343,9 @@
                 return false;
             }

-            keep_searching = if store_elem.timestamp <= clocks.clock[store_elem.store_index] {
+            keep_searching = if store_elem.store_timestamp
+                <= clocks.clock[store_elem.store_thread]
+            {
                 // CoWR: if a store happens-before the current load,
                 // then we can't read-from anything earlier in modification order.
                 // C++20 §6.9.2.2 [intro.races] paragraph 18
@@ -345,15 +357,15 @@
                 // then we cannot read-from anything earlier in modification order.
                 // C++20 §6.9.2.2 [intro.races] paragraph 16
                 false
-            } else if store_elem.timestamp <= clocks.write_seqcst[store_elem.store_index]
+            } else if store_elem.store_timestamp <= clocks.write_seqcst[store_elem.store_thread]
                 && store_elem.is_seqcst
             {
                 // The current non-SC load, which may be sequenced-after an SC fence,
                 // cannot read-before the last SC store executed before the fence.
                 // C++17 §32.4 [atomics.order] paragraph 4
                 false
             } else if is_seqcst
-                && store_elem.timestamp <= clocks.read_seqcst[store_elem.store_index]
+                && store_elem.store_timestamp <= clocks.read_seqcst[store_elem.store_thread]
             {
                 // The current SC load cannot read-before the last store sequenced-before
                 // the last SC fence.
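
To make the CoWR rule above concrete, a degenerate single-threaded sketch (not from the patch): once a load happens-after a store to the same location, it can no longer return values that are earlier in modification order, even at Relaxed ordering.

```rust
use std::sync::atomic::{AtomicUsize, Ordering};

fn main() {
    let x = AtomicUsize::new(1);
    x.store(2, Ordering::Relaxed);
    // CoWR: this load is sequenced after (hence happens-after) the store of 2,
    // so it must not read the mo-earlier 1 -- even though 1 may still sit in
    // the store buffer.
    assert_eq!(x.load(Ordering::Relaxed), 2);
}
```
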
@@ -391,17 +403,19 @@
         }
     }

-    /// ATOMIC STORE IMPL in the paper (except we don't need the location's vector clock)
+    /// ATOMIC STORE IMPL in the paper
     fn store_impl(
         &mut self,
         val: Scalar,
         index: VectorIdx,
         thread_clock: &VClock,
         is_seqcst: bool,
+        sync_clock: VClock,
     ) {
         let store_elem = StoreElement {
-            store_index: index,
-            timestamp: thread_clock[index],
+            store_thread: index,
+            store_timestamp: thread_clock[index],
+            sync_clock,
             // In the language provided in the paper, an atomic store takes the value from a
             // non-atomic memory location.
             // But we already have the immediate value here so we don't need to do the memory
@@ -419,7 +433,7 @@ impl<'tcx> StoreBuffer {
         // so that in a later SC load, only the last SC store (i.e. this one) or stores that
         // aren't ordered by hb with the last SC is picked.
         self.buffer.iter_mut().rev().for_each(|elem| {
-            if elem.timestamp <= thread_clock[elem.store_index] {
+            if elem.store_timestamp <= thread_clock[elem.store_thread] {
                 elem.is_seqcst = true;
             }
         })
@@ -462,7 +476,7 @@ pub(super) trait EvalContextExt<'tcx>: crate::MiriInterpCxExt<'tcx> {
         let (alloc_id, base_offset, ..) = this.ptr_get_alloc_id(place.ptr(), 0)?;
         if let (
             crate::AllocExtra {
-                data_race: AllocDataRaceHandler::Vclocks(_, Some(alloc_buffers)),
+                data_race: AllocDataRaceHandler::Vclocks(data_race_clocks, Some(alloc_buffers)),
                 ..
             },
             crate::MiriMachine {
@@ -475,19 +489,29 @@ pub(super) trait EvalContextExt<'tcx>: crate::MiriInterpCxExt<'tcx> {
                 global.sc_write(threads);
             }
             let range = alloc_range(base_offset, place.layout.size);
+            let sync_clock = data_race_clocks.sync_clock(range);
             let buffer = alloc_buffers.get_or_create_store_buffer_mut(range, Some(init))?;
+            // The RMW always reads from the most recent store.
             buffer.read_from_last_store(global, threads, atomic == AtomicRwOrd::SeqCst);
-            buffer.buffered_write(new_val, global, threads, atomic == AtomicRwOrd::SeqCst)?;
+            buffer.buffered_write(
+                new_val,
+                global,
+                threads,
+                atomic == AtomicRwOrd::SeqCst,
+                sync_clock,
+            )?;
         }
         interp_ok(())
     }

+    /// The argument to `validate` is the synchronization clock of the memory that is being read,
+    /// if we are reading from a store buffer element.
     fn buffered_atomic_read(
         &self,
         place: &MPlaceTy<'tcx>,
         atomic: AtomicReadOrd,
         latest_in_mo: Scalar,
-        validate: impl FnOnce() -> InterpResult<'tcx>,
+        validate: impl FnOnce(Option<&VClock>) -> InterpResult<'tcx>,
     ) -> InterpResult<'tcx, Option<Scalar>> {
         let this = self.eval_context_ref();
         'fallback: {
@@ -525,14 +549,16 @@ pub(super) trait EvalContextExt<'tcx>: crate::MiriInterpCxExt<'tcx> {
         }

         // Race detector or weak memory disabled, simply read the latest value
-        validate()?;
+        validate(None)?;
         interp_ok(Some(latest_in_mo))
     }

     /// Add the given write to the store buffer. (Does not change machine memory.)
     ///
     /// `init` says with which value to initialize the store buffer in case there wasn't a store
     /// buffer for this memory range before.
+    ///
+    /// Must be called *after* `validate_atomic_store` to ensure that `sync_clock` is up-to-date.
     fn buffered_atomic_write(
         &mut self,
         val: Scalar,
@@ -544,7 +570,7 @@ pub(super) trait EvalContextExt<'tcx>: crate::MiriInterpCxExt<'tcx> {
         let (alloc_id, base_offset, ..) = this.ptr_get_alloc_id(dest.ptr(), 0)?;
         if let (
             crate::AllocExtra {
-                data_race: AllocDataRaceHandler::Vclocks(_, Some(alloc_buffers)),
+                data_race: AllocDataRaceHandler::Vclocks(data_race_clocks, Some(alloc_buffers)),
                 ..
             },
             crate::MiriMachine {
crate::MiriMachine {
@@ -556,9 +582,18 @@ pub(super) trait EvalContextExt<'tcx>: crate::MiriInterpCxExt<'tcx> {
556582
global.sc_write(threads);
557583
}
558584

559-
let buffer = alloc_buffers
560-
.get_or_create_store_buffer_mut(alloc_range(base_offset, dest.layout.size), init)?;
561-
buffer.buffered_write(val, global, threads, atomic == AtomicWriteOrd::SeqCst)?;
585+
let range = alloc_range(base_offset, dest.layout.size);
586+
// It's a bit annoying that we have to go back to the data race part to get the clock...
587+
// but it does make things a lot simpler.
588+
let sync_clock = data_race_clocks.sync_clock(range);
589+
let buffer = alloc_buffers.get_or_create_store_buffer_mut(range, init)?;
590+
buffer.buffered_write(
591+
val,
592+
global,
593+
threads,
594+
atomic == AtomicWriteOrd::SeqCst,
595+
sync_clock,
596+
)?;
562597
}
563598

564599
// Caller should've written to dest with the vanilla scalar write, we do nothing here
