Skip to content

Commit a8baf33

Browse files
committed
epoll: do proper edge detection inside the epoll system
1 parent 047ee0a commit a8baf33

File tree

5 files changed

+157
-52
lines changed

5 files changed

+157
-52
lines changed

src/tools/miri/src/shims/unix/linux_like/epoll.rs

Lines changed: 49 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
use std::cell::RefCell;
2-
use std::collections::BTreeMap;
2+
use std::collections::{BTreeMap, btree_map};
33
use std::io;
44
use std::time::Duration;
55

@@ -55,8 +55,8 @@ pub struct EpollEventInstance {
5555
}
5656

5757
impl EpollEventInstance {
58-
pub fn new(events: u32, data: u64) -> EpollEventInstance {
59-
EpollEventInstance { events, data, clock: Default::default() }
58+
pub fn new(data: u64) -> EpollEventInstance {
59+
EpollEventInstance { events: 0, data, clock: Default::default() }
6060
}
6161
}
6262

@@ -69,10 +69,12 @@ impl EpollEventInstance {
6969
/// see the man page:
7070
///
7171
/// <https://man7.org/linux/man-pages/man2/epoll_ctl.2.html>
72-
#[derive(Debug, Copy, Clone)]
72+
#[derive(Debug)]
7373
pub struct EpollEventInterest {
7474
/// The events bitmask retrieved from `epoll_event`.
7575
events: u32,
76+
/// The way the events looked last time we checked (for edge trigger / ET detection).
77+
prev_events: u32,
7678
/// The data retrieved from `epoll_event`.
7779
/// libc's data field in epoll_event can store integer or pointer,
7880
/// but only u64 is supported for now.
@@ -345,31 +347,37 @@ pub trait EvalContextExt<'tcx>: crate::MiriInterpCxExt<'tcx> {
345347

346348
// Add new interest to list.
347349
let epoll_key = (id, fd);
348-
let new_interest = EpollEventInterest { events, data };
349-
if op == epoll_ctl_add {
350+
let new_interest = if op == epoll_ctl_add {
350351
if interest_list.range(range_for_id(id)).next().is_none() {
351352
// This is the first time this FD got added to this epoll.
352353
// Remember that in the global list so we get notified about FD events.
353354
this.machine.epoll_interests.insert(id, FileDescriptionRef::downgrade(&epfd));
354355
}
355-
if interest_list.insert(epoll_key, new_interest).is_some() {
356-
// We already had interest in this.
357-
return this.set_last_error_and_return_i32(LibcError("EEXIST"));
356+
match interest_list.entry(epoll_key) {
357+
btree_map::Entry::Occupied(_) => {
358+
// We already had interest in this.
359+
return this.set_last_error_and_return_i32(LibcError("EEXIST"));
360+
}
361+
btree_map::Entry::Vacant(e) =>
362+
e.insert(EpollEventInterest { events, data, prev_events: 0 }),
358363
}
359364
} else {
360365
// Modify the existing interest.
361366
let Some(interest) = interest_list.get_mut(&epoll_key) else {
362367
return this.set_last_error_and_return_i32(LibcError("ENOENT"));
363368
};
364-
*interest = new_interest;
369+
interest.events = events;
370+
// FIXME what about the data?
371+
interest
365372
};
366373

367374
// Deliver events for the new interest.
368375
send_ready_events_to_interests(
369376
this,
370377
&epfd,
371378
fd_ref.as_unix(this).get_epoll_ready_events()?.get_event_bitmask(this),
372-
std::iter::once((&epoll_key, &new_interest)),
379+
/* force_edge */ false,
380+
std::iter::once((&epoll_key, new_interest)),
373381
)?;
374382

375383
interp_ok(Scalar::from_i32(0))
@@ -387,7 +395,7 @@ pub trait EvalContextExt<'tcx>: crate::MiriInterpCxExt<'tcx> {
387395
this.machine.epoll_interests.remove(id, epfd.addr());
388396
}
389397

390-
// Remove related epoll_interest from ready list.
398+
// Remove related event instance from ready list.
391399
epfd.ready_list.borrow_mut().remove(&epoll_key);
392400

393401
interp_ok(Scalar::from_i32(0))
@@ -519,13 +527,16 @@ pub trait EvalContextExt<'tcx>: crate::MiriInterpCxExt<'tcx> {
519527
}
520528

521529
/// For a specific file description, get its ready events and send it to everyone who registered
522-
/// interest in this FD. This function should be called whenever an event causes more bytes or
523-
/// an EOF to become newly readable from an FD, and whenever more bytes can be written to an FD
524-
/// or no more future writes are possible.
530+
/// interest in this FD. This function should be called whenever the result of
531+
/// `get_epoll_ready_events` would change.
525532
///
526-
/// This *will* report an event if anyone is subscribed to it, without any further filtering, so
527-
/// do not call this function when an FD didn't have anything happen to it!
528-
fn epoll_send_fd_ready_events(&mut self, fd_ref: DynFileDescriptionRef) -> InterpResult<'tcx> {
533+
/// If `force_edge` is set, edge-triggered interests will be triggered even if the set of
534+
/// ready events did not change. This can lead to spurious wakeups. Use with caution!
535+
fn epoll_send_fd_ready_events(
536+
&mut self,
537+
fd_ref: DynFileDescriptionRef,
538+
force_edge: bool,
539+
) -> InterpResult<'tcx> {
529540
let this = self.eval_context_mut();
530541
let id = fd_ref.id();
531542
// Figure out who is interested in this. We need to clone this list since we can't prove
@@ -546,7 +557,8 @@ pub trait EvalContextExt<'tcx>: crate::MiriInterpCxExt<'tcx> {
546557
this,
547558
&epoll,
548559
event_bitmask,
549-
epoll.interest_list.borrow().range(range_for_id(id)),
560+
force_edge,
561+
epoll.interest_list.borrow_mut().range_mut(range_for_id(id)),
550562
)?;
551563
}
552564

@@ -560,28 +572,39 @@ fn send_ready_events_to_interests<'tcx, 'a>(
560572
ecx: &mut MiriInterpCx<'tcx>,
561573
epoll: &Epoll,
562574
event_bitmask: u32,
563-
interests: impl Iterator<Item = (&'a EpollEventKey, &'a EpollEventInterest)>,
575+
force_edge: bool,
576+
interests: impl Iterator<Item = (&'a EpollEventKey, &'a mut EpollEventInterest)>,
564577
) -> InterpResult<'tcx> {
565578
let mut wakeup = false;
566579
for (&event_key, interest) in interests {
567580
let mut ready_list = epoll.ready_list.borrow_mut();
568581
// This checks if any of the events specified in epoll_event_interest.events
569582
// match those in ready_events.
570583
let flags = interest.events & event_bitmask;
584+
let prev = std::mem::replace(&mut interest.prev_events, flags);
571585
if flags == 0 {
572586
// Make sure we *remove* any previous item from the ready list, since this
573587
// is not ready any more.
574588
ready_list.remove(&event_key);
575589
continue;
576590
}
577-
// Geenrate a new event instance, with the flags that this one is interested in.
578-
let mut new_instance = EpollEventInstance::new(flags, interest.data);
591+
// Generate new instance, or update existing one.
592+
let instance = match ready_list.entry(event_key) {
593+
btree_map::Entry::Occupied(e) => e.into_mut(),
594+
btree_map::Entry::Vacant(e) => {
595+
if !force_edge && flags == prev & flags {
596+
// Every bit in `flags` was already set in `prev`, and there's currently
597+
// no entry in the ready list for this. So there is nothing new and no
598+
// prior entry to update; just skip it.
599+
continue;
600+
}
601+
e.insert(EpollEventInstance::new(interest.data))
602+
}
603+
};
604+
instance.events = flags;
579605
ecx.release_clock(|clock| {
580-
new_instance.clock.clone_from(clock);
606+
instance.clock.join(clock);
581607
})?;
582-
// Add event to ready list for this epoll instance.
583-
// Tests confirm that we have to *overwrite* the old instance for the same key.
584-
ready_list.insert(event_key, new_instance);
585608
wakeup = true;
586609
}
587610
if wakeup {

src/tools/miri/src/shims/unix/linux_like/eventfd.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -217,7 +217,7 @@ fn eventfd_write<'tcx>(
217217

218218
// The state changed; we check and update the status of all supported event
219219
// types for current file description.
220-
ecx.epoll_send_fd_ready_events(eventfd)?;
220+
ecx.epoll_send_fd_ready_events(eventfd, /* force_edge */ false)?;
221221

222222
// Return how many bytes we consumed from the user-provided buffer.
223223
return finish.call(ecx, Ok(buf_place.layout.size.bytes_usize()));
@@ -312,7 +312,7 @@ fn eventfd_read<'tcx>(
312312

313313
// The state changed; we check and update the status of all supported event
314314
// types for current file description.
315-
ecx.epoll_send_fd_ready_events(eventfd)?;
315+
ecx.epoll_send_fd_ready_events(eventfd, /* force_edge */ false)?;
316316

317317
// Tell userspace how many bytes we put into the buffer.
318318
return finish.call(ecx, Ok(buf_place.layout.size.bytes_usize()));

src/tools/miri/src/shims/unix/unnamed_socket.rs

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ use crate::*;
1818
/// The maximum capacity of the socketpair buffer in bytes.
1919
/// This number is arbitrary as the value can always
2020
/// be configured in the real system.
21-
const MAX_SOCKETPAIR_BUFFER_CAPACITY: usize = 212992;
21+
const MAX_SOCKETPAIR_BUFFER_CAPACITY: usize = 0x34000;
2222

2323
#[derive(Debug, PartialEq)]
2424
enum AnonSocketType {
@@ -97,7 +97,7 @@ impl FileDescription for AnonSocket {
9797
}
9898
}
9999
// Notify peer fd that close has happened, since that can unblock reads and writes.
100-
ecx.epoll_send_fd_ready_events(peer_fd)?;
100+
ecx.epoll_send_fd_ready_events(peer_fd, /* force_edge */ false)?;
101101
}
102102
interp_ok(Ok(()))
103103
}
@@ -275,9 +275,11 @@ fn anonsocket_write<'tcx>(
275275
for thread_id in waiting_threads {
276276
ecx.unblock_thread(thread_id, BlockReason::UnnamedSocket)?;
277277
}
278-
// Notification should be provided for peer fd as it became readable.
279-
// The kernel does this even if the fd was already readable before, so we follow suit.
280-
ecx.epoll_send_fd_ready_events(peer_fd)?;
278+
// Notify epoll waiters: we might be no longer writable, peer might now be readable.
279+
// The notification to the peer seems to be always sent on Linux, even if the
280+
// FD was readable before.
281+
ecx.epoll_send_fd_ready_events(self_ref, /* force_edge */ false)?;
282+
ecx.epoll_send_fd_ready_events(peer_fd, /* force_edge */ true)?;
281283

282284
return finish.call(ecx, Ok(write_size));
283285
}
@@ -351,6 +353,7 @@ fn anonsocket_read<'tcx>(
351353
// Do full read / partial read based on the space available.
352354
// Conveniently, `read` exists on `VecDeque` and has exactly the desired behavior.
353355
let read_size = ecx.read_from_host(&mut readbuf.buf, len, ptr)?.unwrap();
356+
let readbuf_now_empty = readbuf.buf.is_empty();
354357

355358
// Need to drop before others can access the readbuf again.
356359
drop(readbuf);
@@ -369,9 +372,14 @@ fn anonsocket_read<'tcx>(
369372
for thread_id in waiting_threads {
370373
ecx.unblock_thread(thread_id, BlockReason::UnnamedSocket)?;
371374
}
372-
// Notify epoll waiters.
373-
ecx.epoll_send_fd_ready_events(peer_fd)?;
375+
// Notify epoll waiters: peer is now writable.
376+
// Linux seems to always notify the peer if the read buffer is now empty.
377+
// (Linux also does that if this was a "big" read, but to avoid some arbitrary
378+
// threshold, we do not match that.)
379+
ecx.epoll_send_fd_ready_events(peer_fd, /* force_edge */ readbuf_now_empty)?;
374380
};
381+
// Notify epoll waiters: we might be no longer readable.
382+
ecx.epoll_send_fd_ready_events(self_ref, /* force_edge */ false)?;
375383

376384
return finish.call(ecx, Ok(read_size));
377385
}

0 commit comments

Comments
 (0)