Draft
26 changes: 23 additions & 3 deletions lib/propolis/src/block/file.rs
@@ -59,6 +59,24 @@ impl SharedState {
}

fn processing_loop(&self, wctx: SyncWorkerCtx) {
// Conspire with the gross hack in `preadv()` and `pwritev()`: there, we
// want to give the OS a buffer backed by the Propolis heap, rather than a
// VMM segment, to avoid a slow fallback that locks pages in memory during
// the I/O. We could either allocate a buffer on demand, sized for each
// I/O, or allocate one up front here in the worker thread. On-demand
// allocation works, but when I/O sizes are mixed the high malloc/free
// rates (X00k/sec/process) cause lock contention in libumem. So instead,
// retain one buffer across I/Os and pass it into `process_request()` and
// onward.
//
// The buffer is grown on demand while handling an I/O because some guests'
// maximum I/O size appears to be smaller than the maximum we advertise
// (for example, Linux seems to send NVMe I/Os of 256 KiB or smaller, even
// though we report we'll tolerate up to 1 MiB). In those cases, sizing
// against the advertised maximum would grossly oversize the buffer and
// increase memory pressure for no good reason.
let mut scratch = Vec::new();

while let Some(dreq) = wctx.block_for_req() {
let req = dreq.req();
if self.info.read_only && req.op.is_write() {
@@ -74,7 +92,8 @@ impl SharedState
dreq.complete(block::Result::Failure);
continue;
};
let res = match self.process_request(&req, &mem) {
let res = match self.process_request(&req, &mem, Some(&mut scratch))
{
Ok(_) => block::Result::Success,
Err(_) => block::Result::Failure,
};
@@ -86,13 +105,14 @@ impl SharedState {
&self,
req: &block::Request,
mem: &MemCtx,
scratch: Option<&mut Vec<u8>>,
) -> std::result::Result<(), &'static str> {
match req.op {
block::Operation::Read(off, len) => {
let maps = req.mappings(mem).ok_or("mapping unavailable")?;

let nbytes = maps
.preadv(self.fp.as_raw_fd(), off as i64)
.preadv(self.fp.as_raw_fd(), off as i64, scratch)
.map_err(|_| "io error")?;
if nbytes != len {
return Err("bad read length");
@@ -102,7 +122,7 @@ impl SharedState {
let maps = req.mappings(mem).ok_or("bad guest region")?;

let nbytes = maps
.pwritev(self.fp.as_raw_fd(), off as i64)
.pwritev(self.fp.as_raw_fd(), off as i64, scratch)
.map_err(|_| "io error")?;
if nbytes != len {
return Err("bad write length");
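The comment at the top of `processing_loop` above describes the allocation strategy this change settles on. A minimal sketch of that pattern, using placeholder names rather than the actual Propolis types, might look like this:

// Sketch only: one scratch buffer per worker thread, grown lazily and
// reused across requests. `Request` and `next_request` are placeholders
// standing in for block::Request and wctx.block_for_req().
struct Request {
    len: usize,
}

fn next_request() -> Option<Request> {
    None // stand-in for the real queue; returns None on shutdown
}

fn worker_loop() {
    // Allocated once per worker, not once per I/O, so the malloc/free
    // rate stays low even under a heavy stream of small requests.
    let mut scratch: Vec<u8> = Vec::new();
    while let Some(req) = next_request() {
        // Grow only when a request is larger than anything seen so far;
        // a guest that never exceeds 256 KiB never pays for 1 MiB.
        if scratch.len() < req.len {
            scratch.resize(req.len, 0);
        }
        // Only the first `req.len` bytes belong to this request; bytes
        // beyond that may be stale data from earlier I/Os.
        let _buf = &mut scratch[..req.len];
        // ... issue the preadv/pwritev through `_buf` here ...
    }
}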
84 changes: 63 additions & 21 deletions lib/propolis/src/vmm/mem.rs
@@ -748,10 +748,20 @@ unsafe impl Sync for SubMapping<'_> {}

pub trait MappingExt {
/// preadv from `file` into multiple mappings
fn preadv(&self, fd: RawFd, offset: i64) -> Result<usize>;
fn preadv(
&self,
fd: RawFd,
offset: i64,
scratch: Option<&mut Vec<u8>>,
) -> Result<usize>;

/// pwritev from multiple mappings to `file`
fn pwritev(&self, fd: RawFd, offset: i64) -> Result<usize>;
fn pwritev(
&self,
fd: RawFd,
offset: i64,
scratch: Option<&mut Vec<u8>>,
) -> Result<usize>;
}

// Gross hack alert: since the mappings below are memory regions backed by
@@ -792,8 +802,45 @@ fn total_mapping_size(mappings: &[SubMapping<'_>]) -> Result<usize> {
})
}

/// Ensure the maybe-provided buffer is ready for an I/O of `desired_size`.
///
/// Returns `Some` with a slice at least `desired_size` bytes long if the
/// desired size is within the range for which we're willing to use the I/O
/// buffer proxying workaround.
///
/// If a scratch buffer is provided but is too small, and the desired size is
/// acceptable, this function grows the buffer before returning it.
fn prepare_scratch_buffer(
scratch: Option<&mut Vec<u8>>,
desired_size: usize,
) -> Option<&mut [u8]> {
scratch.and_then(|scratch| {
if desired_size > MAPPING_IO_LIMIT_BYTES {
// Even though the caller provided a buffer, we're not willing to
// grow it to handle this I/O.
return None;
}

if scratch.len() < desired_size {
// The provided buffer is smaller than necessary, but the needed
// size is acceptable, so grow the buffer.
//
// Buffer growth is done lazily because experience shows that guests
// may submit I/Os much smaller than the largest permitted by MDTS.
scratch.resize(desired_size, 0);
}

Some(scratch.as_mut_slice())
})
}
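A hypothetical caller's view of this helper (a sketch, not code from this change) makes the contract in the doc comment above concrete:

// Hypothetical usage sketch: how a caller is expected to treat the two
// possible outcomes of prepare_scratch_buffer().
fn example_caller(scratch: &mut Vec<u8>, desired_size: usize) {
    match prepare_scratch_buffer(Some(scratch), desired_size) {
        // Within the limit: the slice is at least `desired_size` bytes, but
        // possibly longer if the buffer was grown by an earlier, bigger I/O,
        // so the caller must bound its I/O by `desired_size`.
        Some(buf) => {
            let _io_region = &mut buf[..desired_size];
            // ... build the iovec over `_io_region` ...
        }
        // Over the limit (or no buffer supplied): fall back to issuing the
        // I/O directly against the guest mappings.
        None => {
            // ... non-bounce-buffer path ...
        }
    }
}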

impl<'a, T: AsRef<[SubMapping<'a>]>> MappingExt for T {
fn preadv(&self, fd: RawFd, offset: i64) -> Result<usize> {
fn preadv(
&self,
fd: RawFd,
offset: i64,
scratch: Option<&mut Vec<u8>>,
) -> Result<usize> {
if !self
.as_ref()
.iter()
@@ -808,16 +855,11 @@ impl<'a, T: AsRef<[SubMapping<'a>]>> MappingExt for T {
let total_capacity = total_mapping_size(self.as_ref())?;

// Gross hack: see the comment on `MAPPING_IO_LIMIT_BYTES`.
if total_capacity <= MAPPING_IO_LIMIT_BYTES {
// If we're motivated to avoid the zero-fill via
// `Layout::with_size_align` + `GlobalAlloc::alloc`, we should
// probably avoid this gross hack entirely (see comment on
// MAPPING_IO_LIMIT_BYTES).
let mut buf = vec![0; total_capacity];

let scratch = prepare_scratch_buffer(scratch, total_capacity);
if let Some(buf) = scratch {
let iov = [iovec {
iov_base: buf.as_mut_ptr() as *mut libc::c_void,
iov_len: buf.len(),
iov_len: total_capacity,
}];

let res = unsafe {
@@ -883,7 +925,12 @@ impl<'a, T: AsRef<[SubMapping<'a>]>> MappingExt for T {
}
}
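The copy-out step that follows the `preadv` into the scratch buffer is collapsed in this view; presumably it walks the mappings and hands each its slice of the contiguous buffer. A sketch of that scatter step, with a placeholder `copy_into_mapping` standing in for the real `SubMapping` method:

// Sketch of the scatter step after a bounce-buffer read: the kernel filled
// `buf` contiguously, so each guest mapping takes the next `len` bytes.
fn scatter_read(buf: &[u8], mapping_lens: &[usize]) {
    let mut remaining = buf;
    for &len in mapping_lens {
        let (chunk, rest) = remaining.split_at(len);
        copy_into_mapping(chunk);
        remaining = rest;
    }
}

fn copy_into_mapping(_chunk: &[u8]) {
    // Placeholder: the real code writes into guest physical memory here.
}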

fn pwritev(&self, fd: RawFd, offset: i64) -> Result<usize> {
fn pwritev(
&self,
fd: RawFd,
offset: i64,
scratch: Option<&mut Vec<u8>>,
) -> Result<usize> {
if !self
.as_ref()
.iter()
@@ -898,14 +945,9 @@ impl<'a, T: AsRef<[SubMapping<'a>]>> MappingExt for T {
let total_capacity = total_mapping_size(self.as_ref())?;

// Gross hack: see the comment on `MAPPING_IO_LIMIT_BYTES`.
let written = if total_capacity <= MAPPING_IO_LIMIT_BYTES {
// If we're motivated to avoid the zero-fill via
// `Layout::with_size_align` + `GlobalAlloc::alloc`, we should
// probably avoid this gross hack entirely (see comment on
// MAPPING_IO_LIMIT_BYTES).
let mut buf = vec![0; total_capacity];

let mut remaining = buf.as_mut_slice();
let scratch = prepare_scratch_buffer(scratch, total_capacity);
let written = if let Some(buf) = scratch {
let mut remaining = &mut buf[..];
for mapping in self.as_ref().iter() {
// Safety: guest physical memory is READ|WRITE, and won't become
// unmapped as long as we hold a Mapping, which `self` implies.
@@ -928,7 +970,7 @@ impl<'a, T: AsRef<[SubMapping<'a>]>> MappingExt for T {

let iovs = [iovec {
iov_base: buf.as_mut_ptr() as *mut libc::c_void,
iov_len: buf.len(),
iov_len: total_capacity,
Member Author:

On my first go at this change I'd missed this line, and a small write from the guest wrote the whole, much larger buffer, garbage and all, into my guest disk. I'd mentioned this morning that I was hesitant about misusing the longer-lifetime scratch buffer; really that was me still processing the shock of having bricked a test image without realizing I'd been playing with fire.

In retrospect, looking at this change, I'm much less anxious about it being risky, but I'm still not chomping at the bit to get this in.

}];

unsafe {
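The hazard described in the review comment above is worth spelling out: once the scratch buffer outlives a single request, its length no longer matches the I/O size, so the iovec must be bounded by the request, not by the buffer. A hedged sketch (names illustrative, not the exact Propolis code):

// Sketch only: building the iovec for a bounce-buffered write. `buf` is the
// retained scratch buffer, which may be much larger than this request.
fn build_iovec(buf: &mut [u8], total_capacity: usize) -> libc::iovec {
    assert!(buf.len() >= total_capacity);
    libc::iovec {
        iov_base: buf.as_mut_ptr() as *mut libc::c_void,
        // Using buf.len() here would hand pwritev() every byte of the
        // retained buffer, including stale data from earlier I/Os, and
        // write it to disk; total_capacity limits the write to the bytes
        // this request actually gathered.
        iov_len: total_capacity,
    }
}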