@@ -754,6 +754,44 @@ pub trait MappingExt {
     fn pwritev(&self, fd: RawFd, offset: i64) -> Result<usize>;
 }
 
+// Gross hack alert: since the mappings below are memory regions backed by
+// segvmm_ops, `zvol_{read,write}` and similar will end up contending on
+// `svmd->svmd_lock`. Instead, as long as the I/O is small enough that we can
+// tolerate an extra copy, bounce it between guest memory and the Propolis
+// heap. The segment backing Propolis' heap has an `as_page{,un}lock` impl
+// that avoids the more expensive/contentious `as_fault()` fallback.
+//
+// This is an optimization only until stlouis#871 gets things sorted, at which
+// point the extra copy should be strictly worse than directly using the
+// requested mappings.
+//
+// 1 MiB is an arbitrary-ish choice: `propolis-server` and `propolis-standalone`
+// set NVMe MDTS to "8", so the largest I/Os from NVMe will be
+// `2**8 * 4096 == 1048576` bytes, i.e. 1 MiB. Beyond this, fall back to using
+// the iovecs directly, potentially at increased OS overhead.
+//
+// The amount of memory used for temporary buffers is bounded by the number of
+// worker threads across all file-backed disks, times this threshold. That
+// works out to at most 8 MiB of buffers per disk (8 worker threads) by
+// default as of this writing.
+const MAPPING_IO_LIMIT_BYTES: usize = crate::common::MB;
+
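+// Editorial sketch, not part of the original change: a quick check of the
+// MDTS arithmetic above. NVMe's MDTS field is a power-of-two multiple of the
+// controller's minimum page size (4 KiB here), so MDTS == 8 caps a transfer
+// at exactly this limit. Assumes `crate::common::MB` is 1 MiB (1_048_576).
+#[cfg(test)]
+#[test]
+fn mapping_io_limit_matches_nvme_mdts() {
+    const MDTS: u32 = 8;
+    const PAGE_SIZE: usize = 4096;
+    assert_eq!((1usize << MDTS) * PAGE_SIZE, MAPPING_IO_LIMIT_BYTES);
+}
+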
+/// Compute the number of bytes that would be required to hold these mappings
+/// sequentially.
+///
+/// Ranges covered by multiple mappings are counted repeatedly.
+fn total_mapping_size(mappings: &[SubMapping<'_>]) -> Result<usize> {
+    mappings
+        .iter()
+        .try_fold(0usize, |total, mapping| total.checked_add(mapping.len))
+        .ok_or_else(|| {
+            Error::new(
+                ErrorKind::InvalidInput,
+                "Total mapping larger than a `usize`",
+            )
+        })
+}
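+
+// Editorial sketch, not part of the original change: `try_fold` with
+// `checked_add` folds to `None` as soon as the running total would wrap,
+// which `total_mapping_size` then surfaces as an `InvalidInput` error.
+#[cfg(test)]
+#[test]
+fn checked_add_fold_detects_overflow() {
+    let lens = [usize::MAX, 1usize];
+    assert_eq!(lens.iter().try_fold(0usize, |t, l| t.checked_add(*l)), None);
+}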
+
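+// Editorial sketch, not part of the original change: the copy-back
+// bookkeeping used by `preadv` below, modeled on plain slices. A short read
+// from one contiguous bounce buffer is scattered into multiple destination
+// "mappings", advancing with `min` and stopping once the data is exhausted.
+#[cfg(test)]
+#[test]
+fn scatter_short_read_into_mappings() {
+    // 5 bytes arrived for 8 bytes worth of "mappings".
+    let buf = [1u8, 2, 3, 4, 5];
+    let (mut dst_a, mut dst_b) = ([0u8; 4], [0u8; 4]);
+
+    let mut remaining = &buf[..];
+    for dst in [&mut dst_a[..], &mut dst_b[..]] {
+        let to_copy = std::cmp::min(remaining.len(), dst.len());
+        dst[..to_copy].copy_from_slice(&remaining[..to_copy]);
+        remaining = &remaining[to_copy..];
+        if remaining.is_empty() {
+            break;
+        }
+    }
+
+    assert_eq!(dst_a, [1, 2, 3, 4]);
+    assert_eq!(dst_b, [5, 0, 0, 0]);
+}
+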
 impl<'a, T: AsRef<[SubMapping<'a>]>> MappingExt for T {
     fn preadv(&self, fd: RawFd, offset: i64) -> Result<usize> {
         if !self
@@ -767,23 +805,82 @@ impl<'a, T: AsRef<[SubMapping<'a>]>> MappingExt for T {
             ));
         }
 
-        let iov = self
-            .as_ref()
-            .iter()
-            .map(|mapping| iovec {
-                iov_base: mapping.ptr.as_ptr() as *mut libc::c_void,
-                iov_len: mapping.len,
-            })
-            .collect::<Vec<_>>();
+        let total_capacity = total_mapping_size(self.as_ref())?;
+
+        // Gross hack: see the comment on `MAPPING_IO_LIMIT_BYTES`.
+        if total_capacity <= MAPPING_IO_LIMIT_BYTES {
+            // If we're motivated to avoid the zero-fill, e.g. via
+            // `Layout::from_size_align` + `GlobalAlloc::alloc`, we should
+            // probably avoid this gross hack entirely (see the comment on
+            // `MAPPING_IO_LIMIT_BYTES`).
+            let mut buf = vec![0; total_capacity];
+
+            let iov = [iovec {
+                iov_base: buf.as_mut_ptr() as *mut libc::c_void,
+                iov_len: buf.len(),
+            }];
+
+            let res = unsafe {
+                libc::preadv(fd, iov.as_ptr(), iov.len() as libc::c_int, offset)
+            };
+            if res == -1 {
+                return Err(Error::last_os_error());
+            }
+            let read = res as usize;
+
+            // Copy `read` bytes back into the mappings and return.
+            let mut remaining = &mut buf[..read];
+            for mapping in self.as_ref().iter() {
+                let to_copy = std::cmp::min(remaining.len(), mapping.len);
+
+                // Safety: guest physical memory is READ|WRITE, and won't
+                // become unmapped as long as we hold a Mapping, which `self`
+                // implies.
+                //
+                // The guest may be concurrently modifying this memory, and we
+                // may be concurrently reading or writing it (if it is
+                // submitted for a read or write on another thread, for
+                // example), but `copy_nonoverlapping` has no compunctions
+                // about concurrent mutation.
+                unsafe {
+                    copy_nonoverlapping::<u8>(
+                        remaining.as_ptr(),
+                        mapping.ptr.as_ptr(),
+                        to_copy,
+                    );
+                }
+
+                remaining = remaining.split_at_mut(to_copy).1;
+
+                if remaining.is_empty() {
+                    // Either we're at the last mapping and we're finished
+                    // copying back into the guest, or `preadv` did a short
+                    // read.
+                    break;
+                }
+            }
 
-        let read = unsafe {
-            libc::preadv(fd, iov.as_ptr(), iov.len() as libc::c_int, offset)
-        };
-        if read == -1 {
-            return Err(Error::last_os_error());
-        }
+            // We should never read more than the guest mappings could hold.
+            assert_eq!(remaining.len(), 0);
 
-        Ok(read as usize)
+            Ok(read)
+        } else {
+            let iov = self
+                .as_ref()
+                .iter()
+                .map(|mapping| iovec {
+                    iov_base: mapping.ptr.as_ptr() as *mut libc::c_void,
+                    iov_len: mapping.len,
+                })
+                .collect::<Vec<_>>();
+
+            let read = unsafe {
+                libc::preadv(fd, iov.as_ptr(), iov.len() as libc::c_int, offset)
+            };
+            if read == -1 {
+                return Err(Error::last_os_error());
+            }
+            let read: usize = read.try_into().expect("read is non-negative");
+            Ok(read)
+        }
     }
 
     fn pwritev(&self, fd: RawFd, offset: i64) -> Result<usize> {
@@ -798,18 +895,70 @@ impl<'a, T: AsRef<[SubMapping<'a>]>> MappingExt for T {
             ));
         }
 
-        let iov = self
-            .as_ref()
-            .iter()
-            .map(|mapping| iovec {
-                iov_base: mapping.ptr.as_ptr() as *mut libc::c_void,
-                iov_len: mapping.len,
-            })
-            .collect::<Vec<_>>();
+        let total_capacity = total_mapping_size(self.as_ref())?;
+
+        // Gross hack: see the comment on `MAPPING_IO_LIMIT_BYTES`.
+        let written = if total_capacity <= MAPPING_IO_LIMIT_BYTES {
+            // If we're motivated to avoid the zero-fill, e.g. via
+            // `Layout::from_size_align` + `GlobalAlloc::alloc`, we should
+            // probably avoid this gross hack entirely (see the comment on
+            // `MAPPING_IO_LIMIT_BYTES`).
+            let mut buf = vec![0; total_capacity];
+
+            let mut remaining = buf.as_mut_slice();
+            for mapping in self.as_ref().iter() {
+                // Safety: guest physical memory is READ|WRITE, and won't
+                // become unmapped as long as we hold a Mapping, which `self`
+                // implies.
+                //
+                // The guest may be concurrently modifying this memory, and we
+                // may be concurrently reading or writing it (if it is
+                // submitted for a read or write on another thread, for
+                // example), but `copy_nonoverlapping` has no compunctions
+                // about concurrent mutation.
+                unsafe {
+                    copy_nonoverlapping::<u8>(
+                        mapping.ptr.as_ptr() as *const u8,
+                        remaining.as_mut_ptr(),
+                        mapping.len,
+                    );
+                }
+
+                remaining = remaining.split_at_mut(mapping.len).1;
+            }
 
-        let written = unsafe {
-            libc::pwritev(fd, iov.as_ptr(), iov.len() as libc::c_int, offset)
+            let iovs = [iovec {
+                iov_base: buf.as_mut_ptr() as *mut libc::c_void,
+                iov_len: buf.len(),
+            }];
+
+            unsafe {
+                libc::pwritev(
+                    fd,
+                    iovs.as_ptr(),
+                    iovs.len() as libc::c_int,
+                    offset,
+                )
+            }
+        } else {
+            let iovs = self
+                .as_ref()
+                .iter()
+                .map(|mapping| iovec {
+                    iov_base: mapping.ptr.as_ptr() as *mut libc::c_void,
+                    iov_len: mapping.len,
+                })
+                .collect::<Vec<_>>();
+
+            unsafe {
+                libc::pwritev(
+                    fd,
+                    iovs.as_ptr(),
+                    iovs.len() as libc::c_int,
+                    offset,
+                )
+            }
         };
+
         if written == -1 {
             return Err(Error::last_os_error());
         }