diff --git a/lib/propolis/src/block/file.rs b/lib/propolis/src/block/file.rs
index 97d801534..1d8f8c6da 100644
--- a/lib/propolis/src/block/file.rs
+++ b/lib/propolis/src/block/file.rs
@@ -59,6 +59,24 @@ impl SharedState {
     }
 
     fn processing_loop(&self, wctx: SyncWorkerCtx) {
+        // Conspire with the gross hack in `preadv()` and `pwritev()`: there,
+        // we want to give the OS a buffer backed by Propolis heap, rather
+        // than a VMM segment, to avoid a slow fallback to lock pages in
+        // memory during the I/O. We could either allocate buffers on-demand,
+        // sized for the I/O, or up front here in the worker thread. On-demand
+        // allocation works, but when I/O sizes are mixed the high malloc/free
+        // rates (X00k/sec/process) get a bit busy on locks in libumem. So
+        // instead, we retain a buffer across I/Os, which is passed into
+        // `process_request()` and onward.
+        //
+        // This buffer is grown on-demand when handling an I/O because some
+        // guests' maximum I/O size seems to be smaller than the maximums we
+        // indicate (for example, Linux seems to send NVMe I/Os 256 KiB or
+        // smaller, even though we report we'll tolerate up to 1 MiB). In
+        // those cases, sizing against an expected maximum would grossly
+        // oversize buffers and increase memory pressure for no good reason.
+        let mut scratch = Vec::new();
+
         while let Some(dreq) = wctx.block_for_req() {
             let req = dreq.req();
             if self.info.read_only && req.op.is_write() {
@@ -74,7 +92,8 @@ impl SharedState {
                dreq.complete(block::Result::Failure);
                continue;
            };
-            let res = match self.process_request(&req, &mem) {
+            let res = match self.process_request(&req, &mem, Some(&mut scratch))
+            {
                Ok(_) => block::Result::Success,
                Err(_) => block::Result::Failure,
            };
@@ -86,13 +105,14 @@
         &self,
         req: &block::Request,
         mem: &MemCtx,
+        scratch: Option<&mut Vec<u8>>,
     ) -> std::result::Result<(), &'static str> {
         match req.op {
             block::Operation::Read(off, len) => {
                 let maps = req.mappings(mem).ok_or("mapping unavailable")?;
 
                 let nbytes = maps
-                    .preadv(self.fp.as_raw_fd(), off as i64)
+                    .preadv(self.fp.as_raw_fd(), off as i64, scratch)
                     .map_err(|_| "io error")?;
                 if nbytes != len {
                     return Err("bad read length");
@@ -102,7 +122,7 @@
                 let maps = req.mappings(mem).ok_or("bad guest region")?;
 
                 let nbytes = maps
-                    .pwritev(self.fp.as_raw_fd(), off as i64)
+                    .pwritev(self.fp.as_raw_fd(), off as i64, scratch)
                     .map_err(|_| "io error")?;
                 if nbytes != len {
                     return Err("bad write length");
diff --git a/lib/propolis/src/vmm/mem.rs b/lib/propolis/src/vmm/mem.rs
index 0b79b0ffc..e52e8f5a2 100644
--- a/lib/propolis/src/vmm/mem.rs
+++ b/lib/propolis/src/vmm/mem.rs
@@ -748,10 +748,20 @@ unsafe impl Sync for SubMapping<'_> {}
 
 pub trait MappingExt {
     /// preadv from `file` into multiple mappings
-    fn preadv(&self, fd: RawFd, offset: i64) -> Result<usize>;
+    fn preadv(
+        &self,
+        fd: RawFd,
+        offset: i64,
+        scratch: Option<&mut Vec<u8>>,
+    ) -> Result<usize>;
 
     /// pwritev from multiple mappings to `file`
-    fn pwritev(&self, fd: RawFd, offset: i64) -> Result<usize>;
+    fn pwritev(
+        &self,
+        fd: RawFd,
+        offset: i64,
+        scratch: Option<&mut Vec<u8>>,
+    ) -> Result<usize>;
 }
 
 // Gross hack alert: since the mappings below are memory regions backed by
@@ -792,8 +802,45 @@ fn total_mapping_size(mappings: &[SubMapping<'_>]) -> Result<usize> {
     })
 }
 
+/// Ensure the maybe-provided buffer is ready for I/O of `desired_size`.
+///
+/// Returns `Some` with a slice at least `desired_size` bytes long, if the
+/// desired size is within the range for which we're willing to use the I/O
+/// buffer proxying workaround.
+///
+/// If the scratch buffer is provided but too small, and the desired size is
+/// acceptable, this function will grow the scratch buffer.
+fn prepare_scratch_buffer(
+    scratch: Option<&mut Vec<u8>>,
+    desired_size: usize,
+) -> Option<&mut [u8]> {
+    scratch.and_then(|scratch| {
+        if desired_size > MAPPING_IO_LIMIT_BYTES {
+            // Even though the caller provided a buffer, we're not willing to
+            // grow it to handle this I/O.
+            return None;
+        }
+
+        if scratch.len() < desired_size {
+            // The provided buffer is smaller than necessary, but the needed
+            // size is acceptable, so grow the buffer.
+            //
+            // Buffer growth is done lazily because experience shows that guests
+            // may submit I/Os much smaller than the largest permitted by MDTS.
+            scratch.resize(desired_size, 0);
+        }
+
+        Some(scratch.as_mut_slice())
+    })
+}
+
 impl<'a, T: AsRef<[SubMapping<'a>]>> MappingExt for T {
-    fn preadv(&self, fd: RawFd, offset: i64) -> Result<usize> {
+    fn preadv(
+        &self,
+        fd: RawFd,
+        offset: i64,
+        scratch: Option<&mut Vec<u8>>,
+    ) -> Result<usize> {
         if !self
             .as_ref()
             .iter()
@@ -808,16 +855,11 @@ impl<'a, T: AsRef<[SubMapping<'a>]>> MappingExt for T {
         let total_capacity = total_mapping_size(self.as_ref())?;
 
         // Gross hack: see the comment on `MAPPING_IO_LIMIT_BYTES`.
-        if total_capacity <= MAPPING_IO_LIMIT_BYTES {
-            // If we're motivated to avoid the zero-fill via
-            // `Layout::with_size_align` + `GlobalAlloc::alloc`, we should
-            // probably avoid this gross hack entirely (see comment on
-            // MAPPING_IO_LIMIT_BYTES).
-            let mut buf = vec![0; total_capacity];
-
+        let scratch = prepare_scratch_buffer(scratch, total_capacity);
+        if let Some(buf) = scratch {
             let iov = [iovec {
                 iov_base: buf.as_mut_ptr() as *mut libc::c_void,
-                iov_len: buf.len(),
+                iov_len: total_capacity,
             }];
 
             let res = unsafe {
@@ -883,7 +925,12 @@
         }
     }
 
-    fn pwritev(&self, fd: RawFd, offset: i64) -> Result<usize> {
+    fn pwritev(
+        &self,
+        fd: RawFd,
+        offset: i64,
+        scratch: Option<&mut Vec<u8>>,
+    ) -> Result<usize> {
         if !self
             .as_ref()
             .iter()
@@ -898,14 +945,9 @@ impl<'a, T: AsRef<[SubMapping<'a>]>> MappingExt for T {
         let total_capacity = total_mapping_size(self.as_ref())?;
 
         // Gross hack: see the comment on `MAPPING_IO_LIMIT_BYTES`.
-        let written = if total_capacity <= MAPPING_IO_LIMIT_BYTES {
-            // If we're motivated to avoid the zero-fill via
-            // `Layout::with_size_align` + `GlobalAlloc::alloc`, we should
-            // probably avoid this gross hack entirely (see comment on
-            // MAPPING_IO_LIMIT_BYTES).
-            let mut buf = vec![0; total_capacity];
-
-            let mut remaining = buf.as_mut_slice();
+        let scratch = prepare_scratch_buffer(scratch, total_capacity);
+        let written = if let Some(buf) = scratch {
+            let mut remaining = &mut buf[..];
             for mapping in self.as_ref().iter() {
                 // Safety: guest physical memory is READ|WRITE, and won't become
                 // unmapped as long as we hold a Mapping, which `self` implies.
@@ -928,7 +970,7 @@
 
             let iovs = [iovec {
                 iov_base: buf.as_mut_ptr() as *mut libc::c_void,
-                iov_len: buf.len(),
+                iov_len: total_capacity,
             }];
 
             unsafe {
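
A minimal, self-contained sketch of the retained-scratch-buffer pattern this diff introduces, for readers who want to see it outside the backend plumbing. It is illustrative only: `IO_LIMIT_BYTES`, `prepare`, and the sample I/O sizes are stand-ins for `MAPPING_IO_LIMIT_BYTES`, `prepare_scratch_buffer`, and real guest traffic, and the returned slice is trimmed to the desired length here, whereas the real code returns the whole (possibly longer) buffer and uses `total_capacity` for the iovec length.

```rust
// Sketch of the grow-on-demand scratch buffer; names and sizes are
// hypothetical stand-ins, not Propolis' actual constants or functions.
const IO_LIMIT_BYTES: usize = 1024 * 1024;

/// Analogous to `prepare_scratch_buffer`: refuse I/Os too large for the
/// proxying workaround, grow the buffer lazily, and hand back a slice
/// big enough for this I/O.
fn prepare(scratch: &mut Vec<u8>, desired: usize) -> Option<&mut [u8]> {
    if desired > IO_LIMIT_BYTES {
        // Oversized: the caller falls back to the non-proxied path.
        return None;
    }
    if scratch.len() < desired {
        // Lazy growth: pay only for the sizes guests actually send.
        scratch.resize(desired, 0);
    }
    Some(&mut scratch[..desired])
}

fn main() {
    // One allocation retained across requests, as in processing_loop().
    let mut scratch = Vec::new();
    for io_size in [4096usize, 65536, 4096, 262144] {
        let buf = prepare(&mut scratch, io_size).expect("within limit");
        // A real worker would preadv()/pwritev() through `buf` here.
        buf.fill(0xaa);
        println!("handled {io_size} B I/O; scratch is {} B", scratch.len());
    }
    // The buffer grew only to the largest I/O actually seen, not to the
    // 1 MiB maximum the limit would allow.
    assert_eq!(scratch.len(), 262144);
}
```

The final assertion shows the point of the pattern: the retained buffer grows to the largest I/O actually observed rather than the advertised maximum, and malloc/free traffic drops to one allocation per worker instead of one per I/O.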