Skip to content

Commit 17d02e1

Browse files
authored
block/file: do pread/pwrite from Propolis heap instead of VM memory (#985)
As described in the comment on `MAPPING_IO_LIMIT_BYTES`, this is a gross hack: if VM memory is used as the iovec for `p{read,write}v` to a block device or zvol, we'll end up contending on `svmd->svmd_lock` pretty heavily in higher IOPS situations. This might arise when doing fewer, larger I/Os to zvols or block devices as well, when we'll be forwarding each of the 4k entries in guest PRP lists as a distinct iovec, but we haven't tested that. Either way, operating from Propolis heap avoids the issue for now. One could imagine this "should" be a configurable behavior on file block backends, since many files (like disk images in propolis-standalone) would not need this. The hope is that we can fix this in the kernel so the hack is no longer needed for zvols or block devices, rather than leaning further into properly supporting this copy hack.
1 parent dacb53d commit 17d02e1

File tree

1 file changed

+174
-25
lines changed

1 file changed

+174
-25
lines changed

lib/propolis/src/vmm/mem.rs

Lines changed: 174 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -754,6 +754,44 @@ pub trait MappingExt {
754754
fn pwritev(&self, fd: RawFd, offset: i64) -> Result<usize>;
755755
}
756756

757+
// Gross hack alert: since the mappings below are memory regions backed by
758+
// segvmm_ops, `zvol_{read,write}` and similar will end up contending on
759+
// `svmd->svmd_lock`. Instead, as long as the I/O is small enough we'll tolerate
760+
// it, copy from guest memory to Propolis heap. The segment backing Propolis'
761+
// heap has an `as_page{,un}lock` impl that avoids the more
762+
// expensive/contentious `as_fault()` fallback.
763+
//
764+
// This is an optimization until stlouis#871 can get things sorted, at
765+
// which point it should be strictly worse than directly using the
766+
// requested mappings.
767+
//
768+
// 1 MiB is an arbitrary-ish choice: `propolis-server` and `propolis-standalone`
769+
// set NVMe MDTS to "8", so the largest I/Os from NVMe will be
770+
// `2**8 * 4096 == 1048576 bytes == 1 MiB`. Beyond this, fall back to using
771+
// iovecs directly, potentially at increased OS overhead.
772+
//
773+
// The amount of memory used for temporary buffers is given by the number of
774+
// worker threads for all file-backed disks, times this threshold. It works out
775+
// to up to 8 MiB (8 worker threads) of buffers per disk by default as of
776+
// writing.
777+
const MAPPING_IO_LIMIT_BYTES: usize = crate::common::MB;
778+
779+
/// Compute the number of bytes that would be required to hold these mappings
780+
/// sequentially.
781+
///
782+
/// Ranges covered by multiple mappings are counted repeatedly.
783+
fn total_mapping_size(mappings: &[SubMapping<'_>]) -> Result<usize> {
784+
mappings
785+
.iter()
786+
.try_fold(0usize, |total, mapping| total.checked_add(mapping.len))
787+
.ok_or_else(|| {
788+
Error::new(
789+
ErrorKind::InvalidInput,
790+
"Total mapping larger than a `usize`",
791+
)
792+
})
793+
}
794+
757795
impl<'a, T: AsRef<[SubMapping<'a>]>> MappingExt for T {
758796
fn preadv(&self, fd: RawFd, offset: i64) -> Result<usize> {
759797
if !self
@@ -767,23 +805,82 @@ impl<'a, T: AsRef<[SubMapping<'a>]>> MappingExt for T {
767805
));
768806
}
769807

770-
let iov = self
771-
.as_ref()
772-
.iter()
773-
.map(|mapping| iovec {
774-
iov_base: mapping.ptr.as_ptr() as *mut libc::c_void,
775-
iov_len: mapping.len,
776-
})
777-
.collect::<Vec<_>>();
808+
let total_capacity = total_mapping_size(self.as_ref())?;
809+
810+
// Gross hack: see the comment on `MAPPING_IO_LIMIT_BYTES`.
811+
if total_capacity <= MAPPING_IO_LIMIT_BYTES {
812+
// If we're motivated to avoid the zero-fill via
813+
// `Layout::with_size_align` + `GlobalAlloc::alloc`, we should
814+
// probably avoid this gross hack entirely (see comment on
815+
// MAPPING_IO_LIMIT_BYTES).
816+
let mut buf = vec![0; total_capacity];
817+
818+
let iov = [iovec {
819+
iov_base: buf.as_mut_ptr() as *mut libc::c_void,
820+
iov_len: buf.len(),
821+
}];
822+
823+
let res = unsafe {
824+
libc::preadv(fd, iov.as_ptr(), iov.len() as libc::c_int, offset)
825+
};
826+
if res == -1 {
827+
return Err(Error::last_os_error());
828+
}
829+
let read = res as usize;
830+
831+
// copy `read` bytes back into the iovecs and return
832+
let mut remaining = &mut buf[..read];
833+
for mapping in self.as_ref().iter() {
834+
let to_copy = std::cmp::min(remaining.len(), mapping.len);
835+
836+
// Safety: guest physical memory is READ|WRITE, and won't become
837+
// unmapped as long as we hold a Mapping, which `self` implies.
838+
//
839+
// The guest may be concurrently modifying this memory, we may
840+
// be concurrently reading or writing this memory (if it is
841+
// submitted for a read or write on another thread, for
842+
// example), but `copy_nonoverlapping` has no compunctions about
843+
// concurrent mutation.
844+
unsafe {
845+
copy_nonoverlapping::<u8>(
846+
remaining.as_ptr(),
847+
mapping.ptr.as_ptr(),
848+
mapping.len,
849+
);
850+
}
851+
852+
remaining = remaining.split_at_mut(to_copy).1;
853+
854+
if remaining.len() == 0 {
855+
// Either we're at the last iov and we're finished copying
856+
// back into the guest, or `preadv` did a short read.
857+
break;
858+
}
859+
}
778860

779-
let read = unsafe {
780-
libc::preadv(fd, iov.as_ptr(), iov.len() as libc::c_int, offset)
781-
};
782-
if read == -1 {
783-
return Err(Error::last_os_error());
784-
}
861+
// We should never read more than the guest mappings could hold.
862+
assert_eq!(remaining.len(), 0);
785863

786-
Ok(read as usize)
864+
Ok(read)
865+
} else {
866+
let iov = self
867+
.as_ref()
868+
.iter()
869+
.map(|mapping| iovec {
870+
iov_base: mapping.ptr.as_ptr() as *mut libc::c_void,
871+
iov_len: mapping.len,
872+
})
873+
.collect::<Vec<_>>();
874+
875+
let read = unsafe {
876+
libc::preadv(fd, iov.as_ptr(), iov.len() as libc::c_int, offset)
877+
};
878+
if read == -1 {
879+
return Err(Error::last_os_error());
880+
}
881+
let read: usize = read.try_into().expect("read is positive");
882+
Ok(read)
883+
}
787884
}
788885

789886
fn pwritev(&self, fd: RawFd, offset: i64) -> Result<usize> {
@@ -798,18 +895,70 @@ impl<'a, T: AsRef<[SubMapping<'a>]>> MappingExt for T {
798895
));
799896
}
800897

801-
let iov = self
802-
.as_ref()
803-
.iter()
804-
.map(|mapping| iovec {
805-
iov_base: mapping.ptr.as_ptr() as *mut libc::c_void,
806-
iov_len: mapping.len,
807-
})
808-
.collect::<Vec<_>>();
898+
let total_capacity = total_mapping_size(self.as_ref())?;
899+
900+
// Gross hack: see the comment on `MAPPING_IO_LIMIT_BYTES`.
901+
let written = if total_capacity <= MAPPING_IO_LIMIT_BYTES {
902+
// If we're motivated to avoid the zero-fill via
903+
// `Layout::with_size_align` + `GlobalAlloc::alloc`, we should
904+
// probably avoid this gross hack entirely (see comment on
905+
// MAPPING_IO_LIMIT_BYTES).
906+
let mut buf = vec![0; total_capacity];
907+
908+
let mut remaining = buf.as_mut_slice();
909+
for mapping in self.as_ref().iter() {
910+
// Safety: guest physical memory is READ|WRITE, and won't become
911+
// unmapped as long as we hold a Mapping, which `self` implies.
912+
//
913+
// The guest may be concurrently modifying this memory, we may
914+
// be concurrently reading or writing this memory (if it is
915+
// submitted for a read or write on another thread, for
916+
// example), but `copy_nonoverlapping` has no compunctions about
917+
// concurrent mutation.
918+
unsafe {
919+
copy_nonoverlapping::<u8>(
920+
mapping.ptr.as_ptr() as *const u8,
921+
remaining.as_mut_ptr(),
922+
mapping.len,
923+
);
924+
}
925+
926+
remaining = remaining.split_at_mut(mapping.len).1;
927+
}
809928

810-
let written = unsafe {
811-
libc::pwritev(fd, iov.as_ptr(), iov.len() as libc::c_int, offset)
929+
let iovs = [iovec {
930+
iov_base: buf.as_mut_ptr() as *mut libc::c_void,
931+
iov_len: buf.len(),
932+
}];
933+
934+
unsafe {
935+
libc::pwritev(
936+
fd,
937+
iovs.as_ptr(),
938+
iovs.len() as libc::c_int,
939+
offset,
940+
)
941+
}
942+
} else {
943+
let iovs = self
944+
.as_ref()
945+
.iter()
946+
.map(|mapping| iovec {
947+
iov_base: mapping.ptr.as_ptr() as *mut libc::c_void,
948+
iov_len: mapping.len,
949+
})
950+
.collect::<Vec<_>>();
951+
952+
unsafe {
953+
libc::pwritev(
954+
fd,
955+
iovs.as_ptr(),
956+
iovs.len() as libc::c_int,
957+
offset,
958+
)
959+
}
812960
};
961+
813962
if written == -1 {
814963
return Err(Error::last_os_error());
815964
}

0 commit comments

Comments
 (0)