From 050ca03d13eacfe59661e71eea71924b3f67e8df Mon Sep 17 00:00:00 2001
From: Babis Chalios
Date: Thu, 12 Sep 2024 12:45:33 +0200
Subject: [PATCH 1/5] virtio: add ring buffer type for describing guest memory

Add a ring buffer type that is tailored for holding `struct iovec`
objects that point to guest memory for IO. The `struct iovec` objects
represent the memory that the guest passed to us as `Descriptors` in a
VirtIO queue for performing some I/O operation.

We plan to use this type to describe the guest memory we have available
for doing network RX. This should facilitate us in optimizing the
reception of data from the TAP device using `readv`, thus avoiding a
memory copy.

Co-authored-by: Egor Lazarchuk
Signed-off-by: Babis Chalios
---
 src/vmm/src/devices/virtio/iov_deque.rs | 449 ++++++++++++++++++++++++
 src/vmm/src/devices/virtio/mod.rs       |   1 +
 2 files changed, 450 insertions(+)
 create mode 100644 src/vmm/src/devices/virtio/iov_deque.rs

diff --git a/src/vmm/src/devices/virtio/iov_deque.rs b/src/vmm/src/devices/virtio/iov_deque.rs
new file mode 100644
index 00000000000..201f1d9c92f
--- /dev/null
+++ b/src/vmm/src/devices/virtio/iov_deque.rs
@@ -0,0 +1,449 @@
+// Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+use std::os::fd::AsRawFd;
+
+use libc::{c_int, c_void, iovec, off_t, size_t};
+use memfd;
+
+use super::queue::FIRECRACKER_MAX_QUEUE_SIZE;
+use crate::arch::PAGE_SIZE;
+
+#[derive(Debug, thiserror::Error, displaydoc::Display)]
+pub enum IovDequeError {
+    /// Error with memfd: {0}
+    Memfd(#[from] memfd::Error),
+    /// Error while resizing memfd: {0}
+    MemfdResize(std::io::Error),
+    /// Error calling mmap: {0}
+    Mmap(std::io::Error),
+}
+
+/// [`IovDeque`] is a ring buffer tailored for `struct iovec` objects.
+///
+/// From an API point of view, [`IovDeque`] is a typical ring buffer that allows us to push
+/// `struct iovec` objects at the end of the buffer and pop them from its beginning.
+///
+/// It is tailored to store `struct iovec` objects that describe memory that was passed to us from
+/// the guest via a VirtIO queue. This allows us to assume the maximum size of the ring buffer (the
+/// negotiated size of the queue).
+// An important feature of the data structure is that it can give us a slice of all `struct iovec`
+// objects in the queue, so that we can use this `&mut [iovec]` to perform operations such as
+// `readv`. A typical implementation of a ring buffer allows for entries to wrap around the end of
+// the underlying buffer. For example, a ring buffer with a capacity of 10 elements which
+// currently holds 4 elements can look like this:
+//
+//                      tail                        head
+//                       |                           |
+//                       v                           v
+//                 +---+---+---+---+---+---+---+---+---+---+
+//    ring buffer: | C | D |   |   |   |   |   |   | A | B |
+//                 +---+---+---+---+---+---+---+---+---+---+
+//
+// When getting a slice for this data we should get something like this: &[A, B, C, D], which
+// would require copies in order to make the elements contiguous in memory.
+//
+// In order to avoid that and make the operation of getting a slice more efficient, we implement
+// the optimization described in the "Optimization" section of the "Circular buffer" Wikipedia
+// entry: https://en.wikipedia.org/wiki/Circular_buffer. The optimization consists of allocating
+// double the size of the virtual memory required for the buffer and mapping both parts to the
+// same physical pages.
+// Looking at the same example as before, we should get this picture:
+//
+//                                  head    |       tail
+//                                   |      |        |
+//                                   v      |        v
+//  +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
+//  | C | D |   |   |   |   |   |   | A | B | C | D |   |   |   |   |   |   | A | B |
+//  +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
+//            First virtual page            |            Second virtual page
+//                                          |
+//                                          |
+//
+//                                   Virtual memory
+// ---------------------------------------------------------------------------------------
+//                                   Physical memory
+//
+//                 +---+---+---+---+---+---+---+---+---+---+
+//                 | C | D |   |   |   |   |   |   | A | B |
+//                 +---+---+---+---+---+---+---+---+---+---+
+//
+// Like that, the elements stored in the buffer are always laid out in contiguous virtual memory,
+// so making a slice out of them does not require any copies.
+#[derive(Debug)]
+pub struct IovDeque {
+    iov: *mut libc::iovec,
+    start: u16,
+    len: u16,
+}
+
+// SAFETY: This is `Send`. We hold sole ownership of the underlying buffer.
+unsafe impl Send for IovDeque {}
+
+impl IovDeque {
+    /// Create a [`memfd`] object that represents a single physical page
+    fn create_memfd() -> Result<memfd::Memfd, IovDequeError> {
+        // Create a sealable memfd.
+        let opts = memfd::MemfdOptions::default().allow_sealing(true);
+        let mfd = opts.create("sized-1K")?;
+
+        // Resize to system page size.
+        mfd.as_file()
+            .set_len(PAGE_SIZE.try_into().unwrap())
+            .map_err(IovDequeError::MemfdResize)?;
+
+        // Add seals to prevent further resizing.
+        mfd.add_seals(&[memfd::FileSeal::SealShrink, memfd::FileSeal::SealGrow])?;
+
+        // Prevent further sealing changes.
+        mfd.add_seal(memfd::FileSeal::SealSeal)?;
+
+        Ok(mfd)
+    }
+
+    /// A thin wrapper on top of libc's `mmap` system call
+    ///
+    /// # Safety: Callers need to make sure that the arguments to `mmap` are valid
+    unsafe fn mmap(
+        addr: *mut c_void,
+        len: size_t,
+        prot: c_int,
+        flags: c_int,
+        fd: c_int,
+        offset: off_t,
+    ) -> Result<*mut c_void, IovDequeError> {
+        let ptr = libc::mmap(addr, len, prot, flags, fd, offset);
+        if ptr == libc::MAP_FAILED {
+            return Err(IovDequeError::Mmap(std::io::Error::last_os_error()));
+        }
+
+        Ok(ptr)
+    }
+
+    /// Allocate memory for our ring buffer
+    ///
+    /// This will allocate exactly two pages of virtual memory. In order to implement the
+    /// optimization that allows us to always have elements in contiguous memory we need
+    /// allocations at the granularity of `PAGE_SIZE`. Now, our queues are at maximum 256
+    /// descriptors long and `struct iovec` looks like this:
+    ///
+    /// ```rust
+    /// pub struct iovec {
+    ///     pub iov_base: *mut ::c_void,
+    ///     pub iov_len: ::size_t,
+    /// }
+    /// ```
+    ///
+    /// so it's 16 bytes long. As a result, we need a single page for holding the actual data of
+    /// our buffer.
+    fn allocate_ring_buffer_memory() -> Result<*mut c_void, IovDequeError> {
+        // We allocate two pages because the size of `struct iovec` multiplied by our maximum
+        // queue size equals the page size. Add here a compile-time assertion to reflect that and
+        // ensure that we will adapt our logic if the assumption changes in the future.
+        const {
+            assert!(
+                std::mem::size_of::<iovec>() * FIRECRACKER_MAX_QUEUE_SIZE as usize == PAGE_SIZE
+            );
+        }
+
+        // SAFETY: We are calling the system call with valid arguments
+        unsafe {
+            Self::mmap(
+                std::ptr::null_mut(),
+                PAGE_SIZE * 2,
+                libc::PROT_NONE,
+                libc::MAP_PRIVATE | libc::MAP_ANONYMOUS,
+                -1,
+                0,
+            )
+        }
+    }
+
+    /// Create a new [`IovDeque`] that can hold memory described by a single VirtIO queue.
+    pub fn new() -> Result<Self, IovDequeError> {
+        let memfd = Self::create_memfd()?;
+        let raw_memfd = memfd.as_file().as_raw_fd();
+        let buffer = Self::allocate_ring_buffer_memory()?;
+
+        // Map the first page of virtual memory to the physical page described by the memfd object
+        // SAFETY: We are calling the system call with valid arguments
+        let _ = unsafe {
+            Self::mmap(
+                buffer,
+                PAGE_SIZE,
+                libc::PROT_READ | libc::PROT_WRITE,
+                libc::MAP_SHARED | libc::MAP_FIXED,
+                raw_memfd,
+                0,
+            )
+        }?;
+
+        // Map the second page of virtual memory to the physical page described by the memfd
+        // object
+        //
+        // SAFETY: This is safe because:
+        // * Both `buffer` and the result of `buffer.add(PAGE_SIZE)` are within bounds of the
+        //   allocation we got from `Self::allocate_ring_buffer_memory`.
+        // * The computed offset is `PAGE_SIZE * size_of::<u8>() == PAGE_SIZE` bytes which fits
+        //   in `isize`
+        // * The resulting pointer is the beginning of the second page of our allocation, so it
+        //   doesn't wrap around the address space.
+        let next_page = unsafe { buffer.add(PAGE_SIZE) };
+
+        // SAFETY: We are calling the system call with valid arguments
+        let _ = unsafe {
+            Self::mmap(
+                next_page,
+                PAGE_SIZE,
+                libc::PROT_READ | libc::PROT_WRITE,
+                libc::MAP_SHARED | libc::MAP_FIXED,
+                raw_memfd,
+                0,
+            )
+        }?;
+
+        Ok(Self {
+            iov: buffer.cast(),
+            start: 0,
+            len: 0,
+        })
+    }
+
+    /// Returns the number of `iovec` objects currently in the [`IovDeque`]
+    #[inline(always)]
+    pub fn len(&self) -> u16 {
+        self.len
+    }
+
+    /// Returns `true` if the [`IovDeque`] is full, `false` otherwise
+    #[inline(always)]
+    fn is_full(&self) -> bool {
+        self.len() == FIRECRACKER_MAX_QUEUE_SIZE
+    }
+
+    /// Resets the queue, dropping all its elements.
+    #[inline(always)]
+    pub fn clear(&mut self) {
+        self.start = 0;
+        self.len = 0;
+    }
+
+    /// Adds an `iovec` at the end of the ring buffer.
+    ///
+    /// Panics if the buffer is already full.
+    pub fn push_back(&mut self, iov: iovec) {
+        // This should NEVER happen, since our ring buffer is as big as the maximum queue size.
+        // We also check for the sanity of the VirtIO queues, in queue.rs, which means that if we
+        // ever try to add something in a full ring buffer, there is an internal bug in the device
+        // emulation logic. Panic here because the device is hopelessly broken.
+        assert!(
+            !self.is_full(),
+            "The number of `iovec` objects is bigger than the available space"
+        );
+
+        // SAFETY: self.iov is a valid pointer and `self.start + self.len` is within range (we
+        // asserted before that the buffer is not full).
+        unsafe {
+            self.iov
+                .add((self.start + self.len) as usize)
+                .write_volatile(iov)
+        };
+        self.len += 1;
+    }
+
+    /// Pops the first `nr_iovecs` iovecs from the buffer.
+    ///
+    /// Panics if we are asked to pop more iovecs than what is currently available in the buffer.
+    pub fn pop_front(&mut self, nr_iovecs: u16) {
+        assert!(
+            self.len() >= nr_iovecs,
+            "Internal bug! Trying to drop more iovec objects than what is available"
+        );
+
+        self.start += nr_iovecs;
+        self.len -= nr_iovecs;
+        if self.start >= FIRECRACKER_MAX_QUEUE_SIZE {
+            self.start -= FIRECRACKER_MAX_QUEUE_SIZE;
+        }
+    }
+
+    /// Get a slice of the iovec objects currently in the buffer.
+    pub fn as_slice(&self) -> &[iovec] {
+        // SAFETY: Here we create a slice out of the existing elements in the buffer (not the
+        // whole allocated memory).
+        // That means that:
+        // * We can read `self.len * mem::size_of::<iovec>()` bytes out of the memory range we
+        //   are returning.
+        // * `self.iov.add(self.start.into())` is a non-null pointer and aligned.
+        // * The underlying memory comes from a single allocation.
+        // * The returned pointer points to `self.len` consecutive initialized `iovec` objects.
+        // * We are only accessing the underlying memory through the returned slice. Since we are
+        //   returning a slice of only the existing pushed elements the slice does not contain any
+        //   aliasing references.
+        // * The slice can be up to 1 page long which is smaller than `isize::MAX`.
+        unsafe {
+            let slice_start = self.iov.add(self.start.into());
+            std::slice::from_raw_parts(slice_start, self.len.into())
+        }
+    }
+
+    /// Get a mutable slice of the iovec objects currently in the buffer.
+    pub fn as_mut_slice(&mut self) -> &mut [iovec] {
+        // SAFETY: Here we create a slice out of the existing elements in the buffer (not the
+        // whole allocated memory). That means that:
+        // * We can read/write `self.len * mem::size_of::<iovec>()` bytes out of the memory range
+        //   we are returning.
+        // * The underlying memory comes from a single allocation.
+        // * `self.iov.add(self.start.into())` is a non-null pointer and aligned.
+        // * The returned pointer points to `self.len` consecutive initialized `iovec` objects.
+        // * We are only accessing the underlying memory through the returned slice. Since we are
+        //   returning a slice of only the existing pushed elements the slice does not contain any
+        //   aliasing references.
+        // * The slice can be up to 1 page long which is smaller than `isize::MAX`.
+        unsafe {
+            let slice_start = self.iov.add(self.start.into());
+            std::slice::from_raw_parts_mut(slice_start, self.len.into())
+        }
+    }
+}
+
+impl Drop for IovDeque {
+    fn drop(&mut self) {
+        // SAFETY: We are passing an address that we got from a previous allocation of `2 *
+        // PAGE_SIZE` bytes by calling mmap
+        let _ = unsafe { libc::munmap(self.iov.cast(), PAGE_SIZE * 2) };
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use libc::iovec;
+
+    use super::IovDeque;
+
+    #[test]
+    fn test_new() {
+        let deque = IovDeque::new().unwrap();
+        assert_eq!(deque.len(), 0);
+    }
+
+    fn make_iovec(id: u16, len: u16) -> iovec {
+        iovec {
+            iov_base: id as *mut libc::c_void,
+            iov_len: len as usize,
+        }
+    }
+
+    #[test]
+    #[should_panic]
+    fn test_push_back_too_many() {
+        let mut deque = IovDeque::new().unwrap();
+        assert_eq!(deque.len(), 0);
+
+        for i in 0u16..256 {
+            deque.push_back(make_iovec(i, i));
+            assert_eq!(deque.len(), i + 1);
+        }
+
+        deque.push_back(make_iovec(0, 0));
+    }
+
+    #[test]
+    #[should_panic]
+    fn test_pop_front_from_empty() {
+        let mut deque = IovDeque::new().unwrap();
+        deque.pop_front(1);
+    }
+
+    #[test]
+    #[should_panic]
+    fn test_pop_front_too_many() {
+        let mut deque = IovDeque::new().unwrap();
+        deque.push_back(make_iovec(42, 42));
+        deque.pop_front(2);
+    }
+
+    #[test]
+    fn test_pop() {
+        let mut deque = IovDeque::new().unwrap();
+        assert_eq!(deque.len(), 0);
+        assert!(!deque.is_full());
+        deque.pop_front(0);
+
+        for i in 0u16..256 {
+            deque.push_back(make_iovec(i, i));
+            assert_eq!(deque.len(), i + 1);
+        }
+
+        assert!(deque.is_full());
+        assert!(deque.len() != 0);
+
+        for i in 0u16..256 {
+            deque.pop_front(1);
+            assert_eq!(deque.len(), 256 - i - 1);
+        }
+    }
+
+    #[test]
+    fn test_pop_many() {
+        let mut deque = IovDeque::new().unwrap();
+
+        for i in 0u16..256 {
+            deque.push_back(make_iovec(i, i));
+        }
+
+        deque.pop_front(1);
+        assert_eq!(deque.len(), 255);
+        deque.pop_front(2);
+        assert_eq!(deque.len(), 253);
+        deque.pop_front(4);
+        assert_eq!(deque.len(), 249);
+        deque.pop_front(8);
+        assert_eq!(deque.len(), 241);
+        deque.pop_front(16);
+        assert_eq!(deque.len(), 225);
+        deque.pop_front(32);
+        assert_eq!(deque.len(), 193);
+        deque.pop_front(64);
+        assert_eq!(deque.len(), 129);
+        deque.pop_front(128);
+        assert_eq!(deque.len(), 1);
+    }
+
+    #[test]
+    fn test_as_slice() {
+        let mut deque = IovDeque::new().unwrap();
+        assert!(deque.as_slice().is_empty());
+
+        for i in 0..256 {
+            deque.push_back(make_iovec(i, 100));
+            assert_eq!(deque.as_slice().len(), (i + 1) as usize);
+        }
+        let copy: Vec<iovec> = deque.as_slice().to_vec();
+
+        assert_eq!(copy.len(), deque.len() as usize);
+        for (i, iov) in deque.as_slice().iter().enumerate() {
+            assert_eq!(iov.iov_len, copy[i].iov_len);
+        }
+    }
+
+    #[test]
+    fn test_as_mut_slice() {
+        let mut deque = IovDeque::new().unwrap();
+        assert!(deque.as_mut_slice().is_empty());
+
+        for i in 0..256 {
+            deque.push_back(make_iovec(i, 100));
+            assert_eq!(deque.as_mut_slice().len(), (i + 1) as usize);
+        }
+
+        let copy: Vec<iovec> = deque.as_mut_slice().to_vec();
+        deque
+            .as_mut_slice()
+            .iter_mut()
+            .for_each(|iov| iov.iov_len *= 2);
+
+        assert_eq!(copy.len(), deque.len() as usize);
+        for (i, iov) in deque.as_slice().iter().enumerate() {
+            assert_eq!(iov.iov_len, 2 * copy[i].iov_len);
+        }
+    }
+}
diff --git a/src/vmm/src/devices/virtio/mod.rs b/src/vmm/src/devices/virtio/mod.rs
index f68c2a123c9..9931e1211d1 100644
--- a/src/vmm/src/devices/virtio/mod.rs
+++ b/src/vmm/src/devices/virtio/mod.rs
@@ -16,6 +16,7 @@ pub mod balloon;
 pub mod block;
 pub mod device;
 pub mod gen;
+mod iov_deque;
 pub mod iovec;
 pub mod mmio;
 pub mod net;

From ba139b66585f674678f55607a2cc4d0951d6f8fc Mon Sep 17 00:00:00 2001
From: Babis Chalios
Date: Thu, 12 Sep 2024 14:27:21 +0200
Subject: [PATCH 2/5] virtio: allow IoVecBufferMut to hold multiple
 DescriptorChain objects

Allow IoVecBufferMut objects to store multiple DescriptorChain objects,
so that we can describe guest memory meant to be used for receiving
data (for example memory used for network RX) as a single (sparse)
memory region.

This will allow us to always keep track of all the available memory we
have for performing RX and use `readv` for copying memory from the TAP
device directly into guest memory, avoiding the extra copy. In the
future, it will also facilitate the implementation of mergeable buffers
for the RX path of the network device.
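To illustrate the intended flow, here is a minimal sketch (not code from
this patch: it assumes the `IoVecBufferMut`, `Queue`, `GuestMemoryMmap`
and `IoVecError` types from this series and elides imports of them) of
how RX descriptor chains could be accumulated and then exposed as a
single iovec slice:

```rust
use libc::iovec;

// Hypothetical helper, assuming the APIs introduced in this patch: every
// available RX descriptor chain is appended to one IoVecBufferMut, whose
// iovecs can then be handed to a single readv() call.
fn collect_rx_buffers(
    buffer: &mut IoVecBufferMut,
    mem: &GuestMemoryMmap,
    rx_queue: &mut Queue,
) -> Result<(), IoVecError> {
    while let Some(head) = rx_queue.pop_or_enable_notification() {
        // SAFETY: each chain is popped (and thus parsed) exactly once, so no
        // two chains stored in `buffer` reference the same guest memory.
        let parsed = unsafe { buffer.append_descriptor_chain(mem, head)? };
        // `parsed` records head_index/length/nr_iovecs, so the chain can be
        // returned to the guest later via `drop_descriptor_chain()`.
        let _ = parsed;
    }
    // All appended chains are now visible as one contiguous `&mut [iovec]`.
    let _iovs: &mut [iovec] = buffer.as_iovec_mut_slice();
    Ok(())
}
```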
Co-authored-by: Egor Lazarchuk
Signed-off-by: Babis Chalios
---
 src/vmm/src/devices/virtio/iovec.rs        | 205 +++++++++++++++++----
 src/vmm/src/devices/virtio/rng/device.rs   |   8 +-
 src/vmm/src/devices/virtio/vsock/mod.rs    |   4 +
 src/vmm/src/devices/virtio/vsock/packet.rs |  12 +-
 4 files changed, 186 insertions(+), 43 deletions(-)

diff --git a/src/vmm/src/devices/virtio/iovec.rs b/src/vmm/src/devices/virtio/iovec.rs
index 3acde02fc05..eb8e46e7de6 100644
--- a/src/vmm/src/devices/virtio/iovec.rs
+++ b/src/vmm/src/devices/virtio/iovec.rs
@@ -4,12 +4,14 @@
 use std::io::ErrorKind;
 
 use libc::{c_void, iovec, size_t};
+#[cfg(not(kani))]
 use smallvec::SmallVec;
 use vm_memory::bitmap::Bitmap;
 use vm_memory::{
     GuestMemory, GuestMemoryError, ReadVolatile, VolatileMemoryError, VolatileSlice, WriteVolatile,
 };
 
+use super::iov_deque::{IovDeque, IovDequeError};
 use crate::devices::virtio::queue::DescriptorChain;
 use crate::vstate::memory::GuestMemoryMmap;
 
@@ -23,6 +25,8 @@ pub enum IoVecError {
     OverflowedDescriptor,
     /// Guest memory error: {0}
     GuestMemory(#[from] GuestMemoryError),
+    /// Error with underlying `IovDeque`: {0}
+    IovDeque(#[from] IovDequeError),
 }
 
 // Using SmallVec in the kani proofs causes kani to use unbounded amounts of memory
@@ -214,42 +218,55 @@ impl IoVecBuffer {
     }
 }
 
+#[derive(Debug)]
+pub struct ParsedDescriptorChain {
+    pub head_index: u16,
+    pub length: u32,
+    pub nr_iovecs: u16,
+}
+
 /// This is essentially a wrapper of a `Vec<iovec>` which can be passed to `libc::readv`.
 ///
 /// It describes a write-only buffer passed to us by the guest that is scattered across multiple
 /// memory regions. Additionally, this wrapper provides methods that allow reading arbitrary ranges
 /// of data from that buffer.
-#[derive(Debug, Default, Clone)]
+#[derive(Debug)]
 pub struct IoVecBufferMut {
     // container of the memory regions included in this IO vector
-    vecs: IoVecVec,
+    vecs: IovDeque,
    // Total length of the IoVecBufferMut
-    len: u32,
+    len: usize,
 }
 
 impl IoVecBufferMut {
-    /// Create an `IoVecBuffer` from a `DescriptorChain`
+    /// Parse a `DescriptorChain` object and append the memory regions it describes in the
+    /// underlying ring buffer.
     ///
     /// # Safety
     ///
     /// The descriptor chain cannot be referencing the same memory location as another chain
-    pub unsafe fn load_descriptor_chain(
+    unsafe fn parse_descriptor(
         &mut self,
         mem: &GuestMemoryMmap,
         head: DescriptorChain,
-    ) -> Result<(), IoVecError> {
-        self.clear();
-
+    ) -> Result<ParsedDescriptorChain, IoVecError> {
+        let head_index = head.index;
         let mut next_descriptor = Some(head);
+        let mut length = 0u32;
+        let mut nr_iovecs = 0u16;
         while let Some(desc) = next_descriptor {
             if !desc.is_write_only() {
+                self.vecs.pop_front(nr_iovecs);
                 return Err(IoVecError::ReadOnlyDescriptor);
             }
 
             // We use get_slice instead of `get_host_address` here in order to have the whole
             // range of the descriptor chain checked, i.e. [addr, addr + len) is a valid memory
             // region in the GuestMemoryMmap.
-            let slice = mem.get_slice(desc.addr, desc.len as usize)?;
+            let slice = mem.get_slice(desc.addr, desc.len as usize).map_err(|err| {
+                self.vecs.pop_front(nr_iovecs);
+                err
+            })?;
 
             // We need to mark the area of guest memory that will be mutated through this
             // IoVecBufferMut as dirty ahead of time, as we lose access to all
@@ -257,21 +274,79 @@ impl IoVecBufferMut {
             slice.bitmap().mark_dirty(0, desc.len as usize);
 
             let iov_base = slice.ptr_guard_mut().as_ptr().cast::<c_void>();
-            self.vecs.push(iovec {
+            self.vecs.push_back(iovec {
                 iov_base,
                 iov_len: desc.len as size_t,
             });
-            self.len = self
-                .len
+            nr_iovecs += 1;
+            length = length
                 .checked_add(desc.len)
-                .ok_or(IoVecError::OverflowedDescriptor)?;
+                .ok_or(IoVecError::OverflowedDescriptor)
+                .map_err(|err| {
+                    self.vecs.pop_front(nr_iovecs);
+                    err
+                })?;
 
             next_descriptor = desc.next_descriptor();
         }
 
+        self.len = self.len.checked_add(length as usize).ok_or_else(|| {
+            self.vecs.pop_front(nr_iovecs);
+            IoVecError::OverflowedDescriptor
+        })?;
+
+        Ok(ParsedDescriptorChain {
+            head_index,
+            length,
+            nr_iovecs,
+        })
+    }
+
+    /// Create an empty `IoVecBufferMut`.
+    pub fn new() -> Result<Self, IoVecError> {
+        let vecs = IovDeque::new()?;
+        Ok(Self { vecs, len: 0 })
+    }
+
+    /// Create an `IoVecBufferMut` from a `DescriptorChain`
+    ///
+    /// This will clear any previous `iovec` objects in the buffer and load the new
+    /// [`DescriptorChain`].
+    ///
+    /// # Safety
+    ///
+    /// The descriptor chain cannot be referencing the same memory location as another chain
+    pub unsafe fn load_descriptor_chain(
+        &mut self,
+        mem: &GuestMemoryMmap,
+        head: DescriptorChain,
+    ) -> Result<(), IoVecError> {
+        self.clear();
+        let _ = self.parse_descriptor(mem, head)?;
         Ok(())
     }
 
+    /// Append a `DescriptorChain` in this `IoVecBufferMut`
+    ///
+    /// # Safety
+    ///
+    /// The descriptor chain cannot be referencing the same memory location as another chain
+    pub unsafe fn append_descriptor_chain(
+        &mut self,
+        mem: &GuestMemoryMmap,
+        head: DescriptorChain,
+    ) -> Result<ParsedDescriptorChain, IoVecError> {
+        self.parse_descriptor(mem, head)
+    }
+
+    /// Drop memory from the `IoVecBufferMut`
+    ///
+    /// This will drop the memory described by `parse_descriptor` from the beginning of the
+    /// buffer.
+    pub fn drop_descriptor_chain(&mut self, parse_descriptor: &ParsedDescriptorChain) {
+        self.vecs.pop_front(parse_descriptor.nr_iovecs);
+        self.len -= parse_descriptor.length as usize;
+    }
+
     /// Create an `IoVecBuffer` from a `DescriptorChain`
     ///
     /// # Safety
@@ -281,20 +356,29 @@ impl IoVecBufferMut {
         mem: &GuestMemoryMmap,
         head: DescriptorChain,
     ) -> Result<Self, IoVecError> {
-        let mut new_buffer = Self::default();
+        let mut new_buffer = Self::new()?;
         new_buffer.load_descriptor_chain(mem, head)?;
         Ok(new_buffer)
     }
 
     /// Get the total length of the memory regions covered by this `IoVecBuffer`
-    pub(crate) fn len(&self) -> u32 {
+    ///
+    /// In contrast to the equivalent [`IoVecBuffer::len()`] which returns `u32`, this one returns
+    /// `usize`, since the buffer can contain multiple `DescriptorChain` objects, so its total
+    /// length is not limited by `u32`.
+    pub(crate) fn len(&self) -> usize {
         self.len
     }
 
+    /// Returns a pointer to the memory keeping the `iovec` structs
+    pub fn as_iovec_mut_slice(&mut self) -> &mut [iovec] {
+        self.vecs.as_mut_slice()
+    }
+
     /// Clears the `iovec` array
     pub fn clear(&mut self) {
         self.vecs.clear();
-        self.len = 0u32;
+        self.len = 0;
     }
 
     /// Writes a number of bytes into the `IoVecBufferMut` starting at a given offset.
@@ -313,7 +397,7 @@ impl IoVecBufferMut {
         mut buf: &[u8],
         offset: usize,
     ) -> Result<(), VolatileMemoryError> {
-        if offset < self.len() as usize {
+        if offset < self.len() {
             let expected = buf.len();
             let bytes_written = self.write_volatile_at(&mut buf, offset, expected)?;
 
@@ -342,7 +426,7 @@ impl IoVecBufferMut {
     ) -> Result<usize, VolatileMemoryError> {
         let mut total_bytes_read = 0;
 
-        for iov in &self.vecs {
+        for iov in self.vecs.as_slice() {
             if len == 0 {
                 break;
             }
@@ -391,6 +475,7 @@ mod tests {
     use vm_memory::VolatileMemoryError;
 
     use super::{IoVecBuffer, IoVecBufferMut};
+    use crate::devices::virtio::iov_deque::IovDeque;
     use crate::devices::virtio::queue::{Queue, VIRTQ_DESC_F_NEXT, VIRTQ_DESC_F_WRITE};
     use crate::devices::virtio::test_utils::VirtQueue;
     use crate::test_utils::multi_region_mem;
@@ -429,17 +514,36 @@ mod tests {
 
     impl From<&mut [u8]> for IoVecBufferMut {
         fn from(buf: &mut [u8]) -> Self {
+            let mut vecs = IovDeque::new().unwrap();
+            vecs.push_back(iovec {
+                iov_base: buf.as_mut_ptr().cast::<c_void>(),
+                iov_len: buf.len(),
+            });
+
             Self {
-                vecs: vec![iovec {
-                    iov_base: buf.as_mut_ptr().cast::<c_void>(),
-                    iov_len: buf.len(),
-                }]
-                .into(),
-                len: buf.len().try_into().unwrap(),
+                vecs,
+                len: buf.len(),
             }
         }
     }
 
+    impl From<Vec<&mut [u8]>> for IoVecBufferMut {
+        fn from(buffer: Vec<&mut [u8]>) -> Self {
+            let mut len = 0usize;
+            let mut vecs = IovDeque::new().unwrap();
+            for slice in buffer {
+                len += slice.len();
+
+                vecs.push_back(iovec {
+                    iov_base: slice.as_ptr() as *mut c_void,
+                    iov_len: slice.len(),
+                });
+            }
+
+            Self { vecs, len }
+        }
+    }
+
     fn default_mem() -> GuestMemoryMmap {
         multi_region_mem(&[
             (GuestAddress(0), 0x10000),
@@ -528,8 +632,19 @@ mod tests {
         let head = q.pop().unwrap();
 
         // SAFETY: This descriptor chain is only loaded once in this test
-        let iovec = unsafe { IoVecBufferMut::from_descriptor_chain(&mem, head).unwrap() };
+        let mut iovec = unsafe { IoVecBufferMut::from_descriptor_chain(&mem, head).unwrap() };
         assert_eq!(iovec.len(), 4 * 64);
+
+        // We are creating a new queue where we can get descriptors from. Probably, this is not
+        // something that we will ever want to do, as `IoVecBufferMut`s are typically
+        // (conceptually) associated with a single `Queue`. We just do this here to be able to
+        // test the appending logic.
+        let (mut q, _) = write_only_chain(&mem);
+        let head = q.pop().unwrap();
+        // SAFETY: it is actually unsafe, but we just want to check the length of the
+        // `IoVecBufferMut` after appending.
+        let _ = unsafe { iovec.append_descriptor_chain(&mem, head).unwrap() };
+        assert_eq!(iovec.len(), 8 * 64);
     }
 
     #[test]
@@ -687,6 +802,7 @@ mod verification {
     use vm_memory::VolatileSlice;
 
     use super::{IoVecBuffer, IoVecBufferMut, IoVecVec};
+    use crate::devices::virtio::iov_deque::IovDeque;
 
     // Maximum memory size to use for our buffers. For the time being 1KB.
     const GUEST_MEMORY_SIZE: usize = 1 << 10;
@@ -702,10 +818,10 @@
         let mut vecs: Vec<iovec> = Vec::with_capacity(nr_descs);
         let mut len = 0u32;
         for _ in 0..nr_descs {
-            // The `IoVecBuffer(Mut)` constructors ensure that the memory region described by every
+            // The `IoVecBuffer` constructors ensure that the memory region described by every
            // `Descriptor` in the chain is valid, i.e. it is memory within the guest's memory
            // mmap. The assumption, here, that the last address is within the memory object's
-            // bounds substitutes the checks that `IoVecBuffer(Mut)::new()` performs.
+            // bounds substitutes the checks that `IoVecBuffer::new()` performs.
             let addr: usize = kani::any();
             let iov_len: usize =
                 kani::any_where(|&len| matches!(addr.checked_add(len), Some(x) if x <= size));
@@ -728,6 +844,26 @@
         }
     }
 
+    fn create_iovecs_mut(mem: *mut u8, size: usize, nr_descs: usize) -> (IovDeque, u32) {
+        let mut vecs = IovDeque::new().unwrap();
+        let mut len = 0u32;
+        for _ in 0..nr_descs {
+            // The `IoVecBufferMut` constructors ensure that the memory region described by every
+            // `Descriptor` in the chain is valid, i.e. it is memory within the guest's memory
+            // mmap. The assumption, here, that the last address is within the memory object's
+            // bounds substitutes the checks that `IoVecBufferMut::new()` performs.
+            let addr: usize = kani::any();
+            let iov_len: usize =
+                kani::any_where(|&len| matches!(addr.checked_add(len), Some(x) if x <= size));
+            let iov_base = unsafe { mem.offset(addr.try_into().unwrap()) } as *mut c_void;
+
+            vecs.push_back(iovec { iov_base, iov_len });
+            len += u32::try_from(iov_len).unwrap();
+        }
+
+        (vecs, len)
+    }
+
     impl IoVecBufferMut {
         fn any_of_length(nr_descs: usize) -> Self {
             // We only write into `IoVecBufferMut` objects, so we can simply create a guest memory
@@ -739,8 +875,11 @@
             ))
         };
 
-        let (vecs, len) = create_iovecs(mem, GUEST_MEMORY_SIZE, nr_descs);
-        Self { vecs, len }
+        let (vecs, len) = create_iovecs_mut(mem, GUEST_MEMORY_SIZE, nr_descs);
+        Self {
+            vecs,
+            len: len.try_into().unwrap(),
+        }
     }
 }
 
@@ -820,7 +959,7 @@
         let mut iov_mut = IoVecBufferMut::any_of_length(nr_descs);
 
         let mut buf = kani::vec::any_vec::<u8, GUEST_MEMORY_SIZE>();
-        let offset: u32 = kani::any();
+        let offset: usize = kani::any();
 
         // We can't really check the contents that the operation here writes into
         // `IoVecBufferMut`, because our `IoVecBufferMut` being completely arbitrary
@@ -835,11 +974,7 @@
         // Ok(...)
         assert_eq!(
             iov_mut
-                .write_volatile_at(
-                    &mut KaniBuffer(&mut buf),
-                    offset as usize,
-                    GUEST_MEMORY_SIZE
-                )
+                .write_volatile_at(&mut KaniBuffer(&mut buf), offset, GUEST_MEMORY_SIZE)
                 .unwrap(),
             buf.len().min(iov_mut.len().saturating_sub(offset) as usize)
         );
diff --git a/src/vmm/src/devices/virtio/rng/device.rs b/src/vmm/src/devices/virtio/rng/device.rs
index 4bdea1c40f6..6a73816947a 100644
--- a/src/vmm/src/devices/virtio/rng/device.rs
+++ b/src/vmm/src/devices/virtio/rng/device.rs
@@ -112,7 +112,7 @@ impl Entropy {
             return Ok(0);
         }
 
-        let mut rand_bytes = vec![0; iovec.len() as usize];
+        let mut rand_bytes = vec![0; iovec.len()];
         rand::fill(&mut rand_bytes).map_err(|err| {
             METRICS.host_rng_fails.inc();
             err
@@ -120,7 +120,9 @@ impl Entropy {
 
         // It is ok to unwrap here. We are writing `iovec.len()` bytes at offset 0.
         iovec.write_all_volatile_at(&rand_bytes, 0).unwrap();
-        Ok(iovec.len())
+        // It is ok to unwrap here. `iovec` contains only a single `DescriptorChain`, which means
+        // that its length fits in a u32.
+        Ok(u32::try_from(iovec.len()).unwrap())
     }
 
     fn process_entropy_queue(&mut self) {
@@ -145,7 +147,7 @@ impl Entropy {
             // Check for available rate limiting budget.
             // If not enough budget is available, leave the request descriptor in the queue
             // to handle once we do have budget.
-            if !Self::rate_limit_request(&mut self.rate_limiter, u64::from(iovec.len())) {
+            if !Self::rate_limit_request(&mut self.rate_limiter, iovec.len() as u64) {
                 debug!("entropy: throttling entropy queue");
                 METRICS.entropy_rate_limiter_throttled.inc();
                 self.queues[RNG_QUEUE].undo_pop();
diff --git a/src/vmm/src/devices/virtio/vsock/mod.rs b/src/vmm/src/devices/virtio/vsock/mod.rs
index 7fdc86aed2e..879097fef45 100644
--- a/src/vmm/src/devices/virtio/vsock/mod.rs
+++ b/src/vmm/src/devices/virtio/vsock/mod.rs
@@ -30,6 +30,7 @@ pub use self::defs::uapi::VIRTIO_ID_VSOCK as TYPE_VSOCK;
 pub use self::defs::VSOCK_DEV_ID;
 pub use self::device::Vsock;
 pub use self::unix::{VsockUnixBackend, VsockUnixBackendError};
+use super::iov_deque::IovDequeError;
 use crate::devices::virtio::iovec::IoVecError;
 use crate::devices::virtio::persist::PersistError as VirtioStateError;
 
@@ -138,6 +139,8 @@ pub enum VsockError {
     VirtioState(VirtioStateError),
     /// Vsock uds backend error: {0}
     VsockUdsBackend(VsockUnixBackendError),
+    /// Underlying IovDeque error: {0}
+    IovDeque(IovDequeError),
 }
 
 impl From<IoVecError> for VsockError {
@@ -147,6 +150,7 @@ impl From<IoVecError> for VsockError {
             IoVecError::ReadOnlyDescriptor => VsockError::UnwritableDescriptor,
             IoVecError::GuestMemory(err) => VsockError::GuestMemoryMmap(err),
             IoVecError::OverflowedDescriptor => VsockError::DescChainOverflow,
+            IoVecError::IovDeque(err) => VsockError::IovDeque(err),
         }
     }
 }
diff --git a/src/vmm/src/devices/virtio/vsock/packet.rs b/src/vmm/src/devices/virtio/vsock/packet.rs
index d63dd41386e..f43c586330c 100644
--- a/src/vmm/src/devices/virtio/vsock/packet.rs
+++ b/src/vmm/src/devices/virtio/vsock/packet.rs
@@ -172,8 +172,10 @@ impl VsockPacket {
         // are live at the same time, meaning this has exclusive ownership over the memory
         let buffer = unsafe { IoVecBufferMut::from_descriptor_chain(mem, chain)? };
 
-        if buffer.len() < VSOCK_PKT_HDR_SIZE {
-            return Err(VsockError::DescChainTooShortForHeader(buffer.len() as usize));
+        // It is ok to unwrap the conversion from usize to u32, because the `buffer` only contains
+        // a single `DescriptorChain`, so its length fits in a u32.
+        if (u32::try_from(buffer.len()).unwrap()) < VSOCK_PKT_HDR_SIZE {
+            return Err(VsockError::DescChainTooShortForHeader(buffer.len()));
         }
 
         Ok(Self {
@@ -222,7 +224,7 @@ impl VsockPacket {
     pub fn buf_size(&self) -> u32 {
         let chain_length = match self.buffer {
             VsockPacketBuffer::Tx(ref iovec_buf) => iovec_buf.len(),
-            VsockPacketBuffer::Rx(ref iovec_buf) => iovec_buf.len(),
+            VsockPacketBuffer::Rx(ref iovec_buf) => u32::try_from(iovec_buf.len()).unwrap(),
         };
         chain_length - VSOCK_PKT_HDR_SIZE
     }
@@ -237,8 +239,8 @@ impl VsockPacket {
             VsockPacketBuffer::Tx(_) => Err(VsockError::UnwritableDescriptor),
             VsockPacketBuffer::Rx(ref mut buffer) => {
                 if count
-                    > buffer
-                        .len()
+                    > u32::try_from(buffer.len())
+                        .unwrap()
                         .saturating_sub(VSOCK_PKT_HDR_SIZE)
                         .saturating_sub(offset)
                 {

From a0028749d7b18f7b73a044568b6aaba2018f273a Mon Sep 17 00:00:00 2001
From: Babis Chalios
Date: Thu, 12 Sep 2024 21:05:53 +0200
Subject: [PATCH 3/5] net: use readv for reading frames from TAP device

Right now, we are performing two copies for writing a frame from the
TAP device into guest memory. We first read the frame into an array
held by the Net device and then copy that array into a DescriptorChain.

In order to avoid the double copy, use the readv system call to read
directly from the TAP device into the buffers described by
DescriptorChain.
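For reference, the `readv` pattern this builds on can be shown with a
self-contained sketch (the buffer sizes and the plain `File` standing in
for the TAP device are assumptions for illustration):

```rust
use std::os::fd::AsRawFd;

// One readv() call fills several non-contiguous buffers, letting a frame be
// read straight into the scattered memory a DescriptorChain describes.
fn read_scattered(tap: &std::fs::File) -> std::io::Result<usize> {
    let mut hdr = [0u8; 12]; // stand-in for the virtio-net header
    let mut body = [0u8; 1514]; // stand-in for the Ethernet frame
    let mut iovs = [
        libc::iovec {
            iov_base: hdr.as_mut_ptr().cast(),
            iov_len: hdr.len(),
        },
        libc::iovec {
            iov_base: body.as_mut_ptr().cast(),
            iov_len: body.len(),
        },
    ];
    // SAFETY: both iovecs point to live, writable buffers of the stated lengths.
    let ret = unsafe { libc::readv(tap.as_raw_fd(), iovs.as_mut_ptr(), iovs.len() as i32) };
    if ret < 0 {
        return Err(std::io::Error::last_os_error());
    }
    Ok(ret as usize)
}
```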
The main challenge with this is that DescriptorChain objects describe
memory that is at least 65562 bytes long, when guest TSO4, TSO6 or UFO
are enabled, or 1526 bytes otherwise. Parsing the chain includes
overhead which we pay even if the frame we are receiving is much
smaller than these sizes.

PR https://github.com/firecracker-microvm/firecracker/pull/4748 reduced
the overheads involved with parsing DescriptorChain objects. To further
avoid this overhead, move the parsing of DescriptorChain objects out of
the hot path of process_rx(), where we are actually receiving a frame,
and into process_rx_queue_event(), where we get the notification that
the guest added new buffers for network RX.

Signed-off-by: Babis Chalios
---
 .../seccomp/aarch64-unknown-linux-musl.json   | 102 ++++
 .../seccomp/x86_64-unknown-linux-musl.json    | 102 ++++
 src/vmm/src/devices/virtio/net/device.rs      | 565 ++++++++++--------
 src/vmm/src/devices/virtio/net/mod.rs         |   6 +
 src/vmm/src/devices/virtio/net/tap.rs         |  59 +-
 src/vmm/src/devices/virtio/net/test_utils.rs  |  20 +-
 6 files changed, 557 insertions(+), 297 deletions(-)

diff --git a/resources/seccomp/aarch64-unknown-linux-musl.json b/resources/seccomp/aarch64-unknown-linux-musl.json
index 868e7ce0e99..f0a27b6e40d 100644
--- a/resources/seccomp/aarch64-unknown-linux-musl.json
+++ b/resources/seccomp/aarch64-unknown-linux-musl.json
@@ -32,6 +32,108 @@
             "syscall": "writev",
             "comment": "Used by the VirtIO net device to write to tap"
           },
+          {
+            "syscall": "readv",
+            "comment": "Used by the VirtIO net device to read from tap"
+          },
+          {
+            "syscall": "memfd_create",
+            "comment": "Used by the IovDeque implementation"
+          },
+          {
+            "syscall": "fcntl",
+            "comment": "Used by the IovDeque implementation",
+            "args": [
+              {
+                "index": 1,
+                "type": "dword",
+                "op": "eq",
+                "val": 1033,
+                "comment": "FCNTL_F_ADD_SEALS"
+              },
+              {
+                "index": 2,
+                "type": "dword",
+                "op": "eq",
+                "val": 6,
+                "comment": "F_SEAL_SHRINK|F_SEAL_GROW"
+              }
+            ]
+          },
+          {
+            "syscall": "fcntl",
+            "comment": "Used by the IovDeque implementation",
+            "args": [
+              {
+                "index": 1,
+                "type": "dword",
+                "op": "eq",
+                "val": 1033,
+                "comment": "FCNTL_F_ADD_SEALS"
+              },
+              {
+                "index": 2,
+                "type": "dword",
+                "op": "eq",
+                "val": 1,
+                "comment": "F_SEAL_SEAL"
+              }
+            ]
+          },
+          {
+            "syscall": "mmap",
+            "comment": "Used by the IovDeque implementation",
+            "args": [
+              {
+                "index": 1,
+                "type": "dword",
+                "op": "eq",
+                "val": 4096,
+                "comment": "Page size allocation"
+              },
+              {
+                "index": 2,
+                "type": "dword",
+                "op": "eq",
+                "val": 3,
+                "comment": "PROT_READ|PROT_WRITE"
+              },
+              {
+                "index": 3,
+                "type": "dword",
+                "op": "eq",
+                "val": 17,
+                "comment": "MAP_SHARED|MAP_FIXED"
+              }
+            ]
+          },
+          {
+            "syscall": "mmap",
+            "comment": "Used by the IovDeque implementation",
+            "args": [
+              {
+                "index": 1,
+                "type": "dword",
+                "op": "eq",
+                "val": 8192,
+                "comment": "2 pages allocation"
+              },
+              {
+                "index": 2,
+                "type": "dword",
+                "op": "eq",
+                "val": 0,
+                "comment": "PROT_NONE"
+              },
+              {
+                "index": 3,
+                "type": "dword",
+                "op": "eq",
+                "val": 34,
+                "comment": "MAP_PRIVATE|MAP_ANONYMOUS"
+              }
+            ]
+          },
           {
             "syscall": "fsync"
           },
diff --git a/resources/seccomp/x86_64-unknown-linux-musl.json b/resources/seccomp/x86_64-unknown-linux-musl.json
index e5b4b690196..630211f47d1 100644
--- a/resources/seccomp/x86_64-unknown-linux-musl.json
+++ b/resources/seccomp/x86_64-unknown-linux-musl.json
@@ -32,6 +32,108 @@
             "syscall": "writev",
             "comment": "Used by the VirtIO net device to write to tap"
           },
+          {
+            "syscall": "readv",
+            "comment": "Used by the VirtIO net device to read from tap"
+          },
+          {
"syscall": "memfd_create", + "comment": "Used by the IovDeque implementation" + }, + { + "syscall": "fcntl", + "comment": "Used by the IovDeque implementation", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 1033, + "comment": "FCNTL_F_SETFD" + }, + { + "index": 2, + "type": "dword", + "op": "eq", + "val": 6, + "comment": "F_SEAL_SHRINK|F_SEAL_GROW" + } + ] + }, + { + "syscall": "fcntl", + "comment": "Used by the IovDeque implementation", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 1033, + "comment": "FCNTL_F_SETFD" + }, + { + "index": 2, + "type": "dword", + "op": "eq", + "val": 1, + "comment": "F_SEAL_SEAL" + } + ] + }, + { + "syscall": "mmap", + "comment": "Used by the IovDeque implementation", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 4096, + "comment": "Page size allocation" + }, + { + "index": 2, + "type": "dword", + "op": "eq", + "val": 3, + "comment": "PROT_READ|PROT_WRITE" + }, + { + "index": 3, + "type": "dword", + "op": "eq", + "val": 17, + "comment": "MAP_SHARED|MAP_FIXED" + } + ] + }, + { + "syscall": "mmap", + "comment": "Used by the IovDeque implementation", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 8192, + "comment": "2 pages allocation" + }, + { + "index": 2, + "type": "dword", + "op": "eq", + "val": 0, + "comment": "PROT_NONE" + }, + { + "index": 3, + "type": "dword", + "op": "eq", + "val": 34, + "comment": "MAP_PRIVATE|MAP_ANONYMOUS" + } + ] + }, { "syscall": "fsync" }, diff --git a/src/vmm/src/devices/virtio/net/device.rs b/src/vmm/src/devices/virtio/net/device.rs index f8c29f95175..ed6f36dcb0b 100755 --- a/src/vmm/src/devices/virtio/net/device.rs +++ b/src/vmm/src/devices/virtio/net/device.rs @@ -5,14 +5,13 @@ // Use of this source code is governed by a BSD-style license that can be // found in the THIRD-PARTY file. 
-use std::io::Read;
+use std::collections::VecDeque;
 use std::mem;
 use std::net::Ipv4Addr;
 use std::sync::{Arc, Mutex};
 
 use libc::EAGAIN;
-use log::{error, warn};
-use vm_memory::GuestMemoryError;
+use log::error;
 use vmm_sys_util::eventfd::EventFd;
 
 use crate::devices::virtio::device::{DeviceState, IrqTrigger, IrqType, VirtioDevice};
@@ -23,13 +22,15 @@ use crate::devices::virtio::gen::virtio_net::{
     VIRTIO_NET_F_HOST_TSO6, VIRTIO_NET_F_HOST_UFO, VIRTIO_NET_F_MAC,
 };
 use crate::devices::virtio::gen::virtio_ring::VIRTIO_RING_F_EVENT_IDX;
-use crate::devices::virtio::iovec::IoVecBuffer;
+use crate::devices::virtio::iovec::{
+    IoVecBuffer, IoVecBufferMut, IoVecError, ParsedDescriptorChain,
+};
 use crate::devices::virtio::net::metrics::{NetDeviceMetrics, NetMetricsPerDevice};
 use crate::devices::virtio::net::tap::Tap;
 use crate::devices::virtio::net::{
     gen, NetError, NetQueue, MAX_BUFFER_SIZE, NET_QUEUE_SIZES, RX_INDEX, TX_INDEX,
 };
-use crate::devices::virtio::queue::{DescriptorChain, Queue};
+use crate::devices::virtio::queue::{DescriptorChain, Queue, FIRECRACKER_MAX_QUEUE_SIZE};
 use crate::devices::virtio::{ActivateError, TYPE_NET};
 use crate::devices::{report_net_event_fail, DeviceError};
 use crate::dumbo::pdu::arp::ETH_IPV4_FRAME_LEN;
@@ -40,24 +41,10 @@ use crate::mmds::ns::MmdsNetworkStack;
 use crate::rate_limiter::{BucketUpdate, RateLimiter, TokenType};
 use crate::utils::net::mac::MacAddr;
 use crate::utils::u64_to_usize;
-use crate::vstate::memory::{ByteValued, Bytes, GuestMemoryMmap};
+use crate::vstate::memory::{ByteValued, GuestMemoryMmap};
 
 const FRAME_HEADER_MAX_LEN: usize = PAYLOAD_OFFSET + ETH_IPV4_FRAME_LEN;
 
-#[derive(Debug, thiserror::Error, displaydoc::Display)]
-enum FrontendError {
-    /// Add user.
-    AddUsed,
-    /// Descriptor chain too mall.
-    DescriptorChainTooSmall,
-    /// Empty queue.
-    EmptyQueue,
-    /// Guest memory error: {0}
-    GuestMemory(GuestMemoryError),
-    /// Read only descriptor.
-    ReadOnlyDescriptor,
-}
-
 pub(crate) const fn vnet_hdr_len() -> usize {
     mem::size_of::<virtio_net_hdr_v1>()
 }
@@ -102,6 +89,118 @@ pub struct ConfigSpace {
 // SAFETY: `ConfigSpace` contains only PODs in `repr(C)` or `repr(transparent)`, without padding.
 unsafe impl ByteValued for ConfigSpace {}
 
+#[derive(Debug, thiserror::Error, displaydoc::Display)]
+enum AddRxBufferError {
+    /// Error while parsing new buffer: {0}
+    Parsing(#[from] IoVecError),
+    /// RX buffer is too small
+    BufferTooSmall,
+}
+
+/// A map of all the memory the guest has provided us with for performing RX
+#[derive(Debug)]
+pub struct RxBuffers {
+    // minimum size of a usable buffer for doing RX
+    pub min_buffer_size: u32,
+    // An [`IoVecBufferMut`] covering all the memory we have available for receiving network
+    // frames.
+    pub iovec: IoVecBufferMut,
+    // A map of which part of the memory belongs to which `DescriptorChain` object
+    pub parsed_descriptors: VecDeque<ParsedDescriptorChain>,
+    // Buffers that we have used and are ready to be given back to the guest.
+    pub deferred_descriptor: Option<ParsedDescriptorChain>,
+}
+
+impl RxBuffers {
+    /// Create a new [`RxBuffers`] object for storing guest memory for performing RX
+    fn new() -> Result<Self, IoVecError> {
+        Ok(Self {
+            min_buffer_size: 0,
+            iovec: IoVecBufferMut::new()?,
+            parsed_descriptors: VecDeque::with_capacity(FIRECRACKER_MAX_QUEUE_SIZE.into()),
+            deferred_descriptor: None,
+        })
+    }
+
+    /// Add a new `DescriptorChain` that we received from the RX queue in the buffer.
+    ///
+    /// SAFETY: The `DescriptorChain` cannot be referencing the same memory location as any other
+    /// `DescriptorChain`.
+    /// (See also related comment in [`IoVecBufferMut::append_descriptor_chain`]).
+    unsafe fn add_buffer(
+        &mut self,
+        mem: &GuestMemoryMmap,
+        head: DescriptorChain,
+    ) -> Result<(), AddRxBufferError> {
+        let parsed_dc = self.iovec.append_descriptor_chain(mem, head)?;
+        if parsed_dc.length < self.min_buffer_size {
+            self.iovec.drop_descriptor_chain(&parsed_dc);
+            return Err(AddRxBufferError::BufferTooSmall);
+        }
+        self.parsed_descriptors.push_back(parsed_dc);
+        Ok(())
+    }
+
+    /// Returns the number of available `iovec` objects.
+    #[inline(always)]
+    fn len(&self) -> usize {
+        self.iovec.len()
+    }
+
+    /// Returns `true` if there aren't any available `iovec` objects.
+    #[inline(always)]
+    fn is_empty(&self) -> bool {
+        self.len() == 0
+    }
+
+    /// Mark the first `size` bytes of available memory as used.
+    ///
+    /// # Safety:
+    ///
+    /// * The `RxBuffers` should include at least one parsed `DescriptorChain`.
+    /// * `size` needs to be smaller than or equal to the total length of the first
+    ///   `DescriptorChain` stored in the `RxBuffers`.
+    unsafe fn mark_used(&mut self, size: u32) -> ParsedDescriptorChain {
+        // Since we were able to write a frame in guest memory, we should have at least one
+        // descriptor chain here. If not, we have a bug, so fail fast, since the device is
+        // fundamentally broken.
+        let mut parsed_dc = self.parsed_descriptors.pop_front().expect(
+            "net: internal bug. Mismatch between written frame size and available descriptors",
+        );
+
+        self.header_set_num_buffers(1);
+        self.iovec.drop_descriptor_chain(&parsed_dc);
+        parsed_dc.length = size;
+        parsed_dc
+    }
+
+    /// Write the number of descriptors used in the VirtIO header
+    fn header_set_num_buffers(&mut self, nr_descs: u16) {
+        // We can unwrap here, because we have checked before that the `IoVecBufferMut` holds at
+        // least one buffer with the proper size, depending on the feature negotiation. In any
+        // case, the buffer holds memory of at least `std::mem::size_of::<virtio_net_hdr_v1>()`
+        // bytes.
+        self.iovec
+            .write_all_volatile_at(
+                &nr_descs.to_le_bytes(),
+                std::mem::offset_of!(virtio_net_hdr_v1, num_buffers),
+            )
+            .unwrap()
+    }
+
+    /// Let the guest know that a `DescriptorChain` has been used to receive a frame from the
+    /// TAP.
+    fn finish_frame(&mut self, dc: &ParsedDescriptorChain, rx_queue: &mut Queue) {
+        // It is fine to `.unwrap()` here. The only reason why `add_used` can fail is if the
+        // `head_index` is not a valid descriptor id. `head_index` here is a valid
+        // `DescriptorChain` index. We got it from `queue.pop_or_enable_notification()` which
+        // checks for its validity. In other words, if this unwrap() fails there's a bug in our
+        // emulation logic which, most likely, we can't recover from. So, let's crash here
+        // instead of logging an error and continuing.
+        rx_queue.add_used(dc.head_index, dc.length).unwrap();
+    }
+}
+
 /// VirtIO network device.
 ///
 /// It emulates a network device able to exchange L2 frames between the guest
@@ -122,9 +221,6 @@ pub struct Net {
     pub(crate) rx_rate_limiter: RateLimiter,
     pub(crate) tx_rate_limiter: RateLimiter,
 
-    pub(crate) rx_deferred_frame: bool,
-
-    rx_bytes_read: usize,
     rx_frame_buf: [u8; MAX_BUFFER_SIZE],
 
     tx_frame_headers: [u8; frame_hdr_len()],
@@ -143,6 +239,7 @@ pub struct Net {
     pub(crate) metrics: Arc<NetDeviceMetrics>,
 
     tx_buffer: IoVecBuffer,
+    pub(crate) rx_buffer: RxBuffers,
 }
 
 impl Net {
@@ -189,8 +286,6 @@ impl Net {
             queue_evts,
             rx_rate_limiter,
             tx_rate_limiter,
-            rx_deferred_frame: false,
-            rx_bytes_read: 0,
             rx_frame_buf: [0u8; MAX_BUFFER_SIZE],
             tx_frame_headers: [0u8; frame_hdr_len()],
             irq_trigger: IrqTrigger::new().map_err(NetError::EventFd)?,
@@ -201,6 +296,7 @@ impl Net {
             mmds_ns: None,
             metrics: NetMetricsPerDevice::alloc(id),
             tx_buffer: Default::default(),
+            rx_buffer: RxBuffers::new()?,
         })
     }
 
@@ -311,126 +407,50 @@ impl Net {
     // Attempts to copy a single frame into the guest if there is enough
     // rate limiting budget.
     // Returns true on successful frame delivery.
-    fn rate_limited_rx_single_frame(&mut self) -> bool {
-        if !Self::rate_limiter_consume_op(&mut self.rx_rate_limiter, self.rx_bytes_read as u64) {
+    fn rate_limited_rx_single_frame(&mut self, dc: &ParsedDescriptorChain) -> bool {
+        let rx_queue = &mut self.queues[RX_INDEX];
+        if !Self::rate_limiter_consume_op(&mut self.rx_rate_limiter, dc.length as u64) {
             self.metrics.rx_rate_limiter_throttled.inc();
             return false;
         }
 
-        // Attempt frame delivery.
-        let success = self.write_frame_to_guest();
-
-        // Undo the tokens consumption if guest delivery failed.
-        if !success {
-            // revert the rate limiting budget consumption
-            Self::rate_limiter_replenish_op(&mut self.rx_rate_limiter, self.rx_bytes_read as u64);
-        }
-
-        success
+        self.rx_buffer.finish_frame(dc, rx_queue);
+        true
     }
 
-    /// Write a slice in a descriptor chain
-    ///
-    /// # Errors
-    ///
-    /// Returns an error if the descriptor chain is too short or
-    /// an inappropriate (read only) descriptor is found in the chain
-    fn write_to_descriptor_chain(
-        mem: &GuestMemoryMmap,
-        data: &[u8],
-        head: DescriptorChain,
-        net_metrics: &NetDeviceMetrics,
-    ) -> Result<(), FrontendError> {
-        let mut chunk = data;
-        let mut next_descriptor = Some(head);
-
-        while let Some(descriptor) = &next_descriptor {
-            if !descriptor.is_write_only() {
-                return Err(FrontendError::ReadOnlyDescriptor);
-            }
-
-            let len = std::cmp::min(chunk.len(), descriptor.len as usize);
-            match mem.write_slice(&chunk[..len], descriptor.addr) {
-                Ok(()) => {
-                    net_metrics.rx_count.inc();
-                    chunk = &chunk[len..];
-                }
-                Err(err) => {
-                    error!("Failed to write slice: {:?}", err);
-                    if let GuestMemoryError::PartialBuffer { .. } = err {
-                        net_metrics.rx_partial_writes.inc();
-                    }
-                    return Err(FrontendError::GuestMemory(err));
-                }
-            }
-
-            // If chunk is empty we are done here.
-            if chunk.is_empty() {
-                let len = data.len() as u64;
-                net_metrics.rx_bytes_count.add(len);
-                net_metrics.rx_packets_count.inc();
-                return Ok(());
-            }
-
-            next_descriptor = descriptor.next_descriptor();
+    /// Returns the minimum size of buffer we expect the guest to provide us depending on the
+    /// features we have negotiated with it
+    fn minimum_rx_buffer_size(&self) -> u32 {
+        if self.has_feature(VIRTIO_NET_F_GUEST_TSO4 as u64)
+            || self.has_feature(VIRTIO_NET_F_GUEST_TSO6 as u64)
+            || self.has_feature(VIRTIO_NET_F_GUEST_UFO as u64)
+        {
+            65562
+        } else {
+            1526
         }
-
-        warn!("Receiving buffer is too small to hold frame of current size");
-        Err(FrontendError::DescriptorChainTooSmall)
     }
 
-    // Copies a single frame from `self.rx_frame_buf` into the guest.
-    fn do_write_frame_to_guest(&mut self) -> Result<(), FrontendError> {
+    /// Parse available RX `DescriptorChains` from the queue
+    pub fn parse_rx_descriptors(&mut self) {
         // This is safe since we checked in the event handler that the device is activated.
         let mem = self.device_state.mem().unwrap();
-
         let queue = &mut self.queues[RX_INDEX];
-        let head_descriptor = queue.pop_or_enable_notification().ok_or_else(|| {
-            self.metrics.no_rx_avail_buffer.inc();
-            FrontendError::EmptyQueue
-        })?;
-        let head_index = head_descriptor.index;
-
-        let result = Self::write_to_descriptor_chain(
-            mem,
-            &self.rx_frame_buf[..self.rx_bytes_read],
-            head_descriptor,
-            &self.metrics,
-        );
-        // Mark the descriptor chain as used. If an error occurred, skip the descriptor chain.
-        let used_len = if result.is_err() {
-            self.metrics.rx_fails.inc();
-            0
-        } else {
-            // Safe to unwrap because a frame must be smaller than 2^16 bytes.
-            u32::try_from(self.rx_bytes_read).unwrap()
-        };
-        queue.add_used(head_index, used_len).map_err(|err| {
-            error!("Failed to add available descriptor {}: {}", head_index, err);
-            FrontendError::AddUsed
-        })?;
-
-        result
-    }
-
-    // Copies a single frame from `self.rx_frame_buf` into the guest. In case of an error retries
-    // the operation if possible. Returns true if the operation was successfull.
-    fn write_frame_to_guest(&mut self) -> bool {
-        let max_iterations = self.queues[RX_INDEX].actual_size();
-        for _ in 0..max_iterations {
-            match self.do_write_frame_to_guest() {
-                Ok(()) => return true,
-                Err(FrontendError::EmptyQueue) | Err(FrontendError::AddUsed) => {
-                    return false;
-                }
-                Err(_) => {
-                    // retry
-                    continue;
+        while let Some(head) = queue.pop_or_enable_notification() {
+            let index = head.index;
+            // SAFETY: we are only using this `DescriptorChain` here.
+            if let Err(err) = unsafe { self.rx_buffer.add_buffer(mem, head) } {
+                self.metrics.rx_fails.inc();
+                error!("net: Could not parse an RX descriptor: {err}");
+                // Try to add the bad descriptor to the used ring.
+                if let Err(err) = queue.add_used(index, 0) {
+                    error!(
+                        "net: Failed to add available RX descriptor {index} while handling a \
+                         parsing error: {err}",
+                    );
                 }
             }
         }
-
-        false
     }
 
     // Tries to detour the frame to MMDS and if MMDS doesn't accept it, sends it on the host TAP.
@@ -508,7 +528,20 @@ impl Net {
     }
 
     // We currently prioritize packets from the MMDS over regular network packets.
-    fn read_from_mmds_or_tap(&mut self) -> Result<usize, NetError> {
+    fn read_from_mmds_or_tap(&mut self) -> Result<Option<ParsedDescriptorChain>, NetError> {
+        // If we don't have any buffers available, try to parse more from the RX queue. There
+        // might be some buffers we didn't get the chance to process, because we got to handle
+        // the TAP event before the RX queue event.
+        if self.rx_buffer.is_empty() {
+            self.parse_rx_descriptors();
+
+            // If after parsing the RX queue we still don't have any buffers, stop processing RX
+            // frames.
+            if self.rx_buffer.is_empty() {
+                return Ok(None);
+            }
+        }
+
         if let Some(ns) = self.mmds_ns.as_mut() {
             if let Some(len) =
                 ns.write_next_frame(frame_bytes_from_buf_mut(&mut self.rx_frame_buf)?)
             {
                 let len = len.get();
                 METRICS.mmds.tx_frames.inc();
                 METRICS.mmds.tx_bytes.add(len as u64);
                 init_vnet_hdr(&mut self.rx_frame_buf);
-                return Ok(vnet_hdr_len() + len);
+                self.rx_buffer
+                    .iovec
+                    .write_all_volatile_at(&self.rx_frame_buf[..vnet_hdr_len() + len], 0)?;
+                // SAFETY: This is safe:
+                // * We checked that `rx_buffer` includes at least one `DescriptorChain`
+                // * `rx_frame_buf` has size of `MAX_BUFFER_SIZE` and all `DescriptorChain` objects
+                //   are at least that big.
+                let dc = unsafe {
+                    self.rx_buffer
+                        .mark_used((vnet_hdr_len() + len).try_into().unwrap())
+                };
+
+                return Ok(Some(dc));
             }
         }
 
-        self.read_tap().map_err(NetError::IO)
+        // SAFETY: this is safe because we ensured that `self.rx_buffer` has at least one
+        // DescriptorChain parsed in it.
+        let len = unsafe { self.read_tap().map_err(NetError::IO) }?;
+
+        // SAFETY: This is safe:
+        // * `rx_buffer` has at least one `DescriptorChain`
+        // * `read_tap` passes the first `DescriptorChain` to `readv` so we can't have read more
+        //   bytes than its capacity.
+        let dc = unsafe { self.rx_buffer.mark_used(len.try_into().unwrap()) };
+        Ok(Some(dc))
     }
 
+    /// Read as many frames as possible.
     fn process_rx(&mut self) -> Result<(), DeviceError> {
-        // Read as many frames as possible.
         loop {
             match self.read_from_mmds_or_tap() {
-                Ok(count) => {
-                    self.rx_bytes_read = count;
+                Ok(None) => {
+                    self.metrics.no_rx_avail_buffer.inc();
+                    break;
+                }
+                Ok(Some(dc)) => {
                     self.metrics.rx_count.inc();
-                    if !self.rate_limited_rx_single_frame() {
-                        self.rx_deferred_frame = true;
+                    self.metrics.rx_bytes_count.add(dc.length as u64);
+                    self.metrics.rx_packets_count.inc();
+                    if !self.rate_limited_rx_single_frame(&dc) {
+                        self.rx_buffer.deferred_descriptor = Some(dc);
                         break;
                     }
                 }
@@ -558,24 +617,18 @@ impl Net {
         self.try_signal_queue(NetQueue::Rx)
     }
 
-    // Process the deferred frame first, then continue reading from tap.
-    fn handle_deferred_frame(&mut self) -> Result<(), DeviceError> {
-        if self.rate_limited_rx_single_frame() {
-            self.rx_deferred_frame = false;
-            // process_rx() was interrupted possibly before consuming all
-            // packets in the tap; try continuing now.
-            return self.process_rx();
-        }
-
-        self.try_signal_queue(NetQueue::Rx)
-    }
-
     fn resume_rx(&mut self) -> Result<(), DeviceError> {
-        if self.rx_deferred_frame {
-            self.handle_deferred_frame()
-        } else {
-            Ok(())
+        // First try to handle any deferred frame
+        if let Some(deferred_descriptor) = self.rx_buffer.deferred_descriptor.take() {
+            // If we can't finish sending this frame, re-set it as deferred and return; we can't
+            // process any more frames from the TAP.
+            if !self.rate_limited_rx_single_frame(&deferred_descriptor) {
+                self.rx_buffer.deferred_descriptor = Some(deferred_descriptor);
+                return Ok(());
+            }
         }
+
+        self.process_rx()
     }
 
     fn process_tx(&mut self) -> Result<(), DeviceError> {
@@ -636,7 +689,7 @@ impl Net {
                     &self.metrics,
                 )
                 .unwrap_or(false);
-                if frame_consumed_by_mmds && !self.rx_deferred_frame {
+                if frame_consumed_by_mmds && self.rx_buffer.deferred_descriptor.is_none() {
                     // MMDS consumed this frame/request, let's also try to process the response.
                    process_rx_for_mmds = true;
                }
            }
        }
 
@@ -715,8 +768,15 @@ impl Net {
         self.tx_rate_limiter.update_buckets(tx_bytes, tx_ops);
     }
 
-    fn read_tap(&mut self) -> std::io::Result<usize> {
-        self.tap.read(&mut self.rx_frame_buf)
+    /// Reads a frame from the TAP device inside the first descriptor held by `self.rx_buffer`.
+    ///
+    /// # Safety
+    ///
+    /// `self.rx_buffer` needs to have at least one descriptor chain parsed
+    pub unsafe fn read_tap(&mut self) -> std::io::Result<usize> {
+        let nr_iovecs = self.rx_buffer.parsed_descriptors[0].nr_iovecs as usize;
+        self.tap
+            .read_iovec(&mut self.rx_buffer.iovec.as_iovec_mut_slice()[..nr_iovecs])
     }
 
     fn write_tap(tap: &mut Tap, buf: &IoVecBuffer) -> std::io::Result<usize> {
@@ -734,7 +794,12 @@ impl Net {
             // rate limiters present but with _very high_ allowed rate
             error!("Failed to get rx queue event: {:?}", err);
             self.metrics.event_fails.inc();
+            return;
-        } else if self.rx_rate_limiter.is_blocked() {
+        } else {
+            self.parse_rx_descriptors();
+        }
+
+        if self.rx_rate_limiter.is_blocked() {
             self.metrics.rx_rate_limiter_throttled.inc();
         } else {
             // If the limiter is not blocked, resume the receiving of bytes.
@@ -747,31 +812,14 @@ impl Net {
         // This is safe since we checked in the event handler that the device is activated.
         self.metrics.rx_tap_event_count.inc();
 
-        // While there are no available RX queue buffers and there's a deferred_frame
-        // don't process any more incoming. Otherwise start processing a frame. In the
-        // process the deferred_frame flag will be set in order to avoid freezing the
-        // RX queue.
-        if self.queues[RX_INDEX].is_empty() && self.rx_deferred_frame {
-            self.metrics.no_rx_avail_buffer.inc();
-            return;
-        }
-
         // While limiter is blocked, don't process any more incoming.
         if self.rx_rate_limiter.is_blocked() {
            self.metrics.rx_rate_limiter_throttled.inc();
            return;
        }
 
-        if self.rx_deferred_frame
-        // Process a deferred frame first if available. Don't read from tap again
-        // until we manage to receive this deferred frame.
-        {
-            self.handle_deferred_frame()
-                .unwrap_or_else(|err| report_net_event_fail(&self.metrics, err));
-        } else {
-            self.process_rx()
-                .unwrap_or_else(|err| report_net_event_fail(&self.metrics, err));
-        }
+        self.resume_rx()
+            .unwrap_or_else(|err| report_net_event_fail(&self.metrics, err));
     }
 
     /// Process a single TX queue event.
@@ -913,6 +961,8 @@ impl VirtioDevice for Net {
             .set_offload(supported_flags)
             .map_err(super::super::ActivateError::TapSetOffload)?;

+        self.rx_buffer.min_buffer_size = self.minimum_rx_buffer_size();
+
         if self.activate_evt.write(1).is_err() {
             self.metrics.activate_fails.inc();
             return Err(ActivateError::EventFd);
@@ -958,6 +1008,14 @@ pub mod tests {
     use crate::utils::net::mac::{MacAddr, MAC_ADDR_LEN};
     use crate::vstate::memory::{Address, GuestMemory};

+    /// Write the number of descriptors used in the VirtIO header
+    fn header_set_num_buffers(frame: &mut [u8], nr_descs: u16) {
+        let bytes = nr_descs.to_le_bytes();
+        let offset = std::mem::offset_of!(virtio_net_hdr_v1, num_buffers);
+        frame[offset] = bytes[0];
+        frame[offset + 1] = bytes[1];
+    }
+
     #[test]
     fn test_vnet_helpers() {
         let mut frame_buf = vec![42u8; vnet_hdr_len() - 1];
@@ -1144,9 +1202,14 @@ pub mod tests {
                 (2, 1000, VIRTQ_DESC_F_WRITE),
             ],
         );
-        let frame = th.check_rx_deferred_frame(1000);
+        let mut frame = inject_tap_tx_frame(&th.net(), 1000);
+        check_metric_after_block!(
+            th.net().metrics.rx_fails,
+            1,
+            th.event_manager.run_with_timeout(100).unwrap()
+        );
         th.rxq.check_used_elem(0, 0, 0);
-
+        header_set_num_buffers(frame.as_mut_slice(), 1);
         th.check_rx_queue_resume(&frame);
     }

@@ -1157,9 +1220,10 @@ pub mod tests {
         th.activate_net();

         th.add_desc_chain(NetQueue::Rx, 0, &[(0, 100, VIRTQ_DESC_F_WRITE)]);
-        let frame = th.check_rx_deferred_frame(1000);
+        let mut frame = th.check_rx_discarded_buffer(1000);
         th.rxq.check_used_elem(0, 0, 0);

+        header_set_num_buffers(frame.as_mut_slice(), 1);
         th.check_rx_queue_resume(&frame);
     }

@@ -1181,9 +1245,10 @@ pub mod tests {
                 (2, 4096, VIRTQ_DESC_F_WRITE),
             ],
         );
-        let frame = th.check_rx_deferred_frame(1000);
+        let mut frame = th.check_rx_discarded_buffer(1000);
         th.rxq.check_used_elem(0, 0, 0);

+        header_set_num_buffers(frame.as_mut_slice(), 1);
         th.check_rx_queue_resume(&frame);
     }

@@ -1212,11 +1277,12 @@ pub mod tests {
             &[(4, 1000, VIRTQ_DESC_F_WRITE)],
         );

-        // Add valid descriptor chain.
-        th.add_desc_chain(NetQueue::Rx, 1300, &[(5, 1000, VIRTQ_DESC_F_WRITE)]);
+        // Add valid descriptor chain. TestHelper does not negotiate any feature offloading, so
+        // the buffers need to be at least 1526 bytes long.
+        th.add_desc_chain(NetQueue::Rx, 1300, &[(5, 1526, VIRTQ_DESC_F_WRITE)]);

         // Inject frame to tap and run epoll.
-        let frame = inject_tap_tx_frame(&th.net(), 1000);
+        let mut frame = inject_tap_tx_frame(&th.net(), 1000);
         check_metric_after_block!(
             th.net().metrics.rx_packets_count,
             1,
@@ -1231,10 +1297,11 @@ pub mod tests {
         th.rxq.check_used_elem(1, 3, 0);
         th.rxq.check_used_elem(2, 4, 0);
         // Check that the frame wasn't deferred.
-        assert!(!th.net().rx_deferred_frame);
+        assert!(th.net().rx_buffer.deferred_descriptor.is_none());
         // Check that the frame has been written successfully to the valid Rx descriptor chain.
         th.rxq
             .check_used_elem(3, 5, frame.len().try_into().unwrap());
+        header_set_num_buffers(frame.as_mut_slice(), 1);
         th.rxq.dtable[5].check_data(&frame);
     }

@@ -1257,7 +1324,7 @@ pub mod tests {
             ],
         );
         // Inject frame to tap and run epoll.
-        let frame = inject_tap_tx_frame(&th.net(), 1000);
+        let mut frame = inject_tap_tx_frame(&th.net(), 1000);
         check_metric_after_block!(
             th.net().metrics.rx_packets_count,
             1,
@@ -1265,11 +1332,12 @@ pub mod tests {
         );

         // Check that the frame wasn't deferred.
-        assert!(!th.net().rx_deferred_frame);
+        assert!(th.net().rx_buffer.deferred_descriptor.is_none());
         // Check that the used queue has advanced.
        assert_eq!(th.rxq.used.idx.get(), 1);
         assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring));
         // Check that the frame has been written successfully to the Rx descriptor chain.
+        header_set_num_buffers(frame.as_mut_slice(), 1);
         th.rxq
             .check_used_elem(0, 3, frame.len().try_into().unwrap());
         th.rxq.dtable[3].check_data(&frame[..100]);
@@ -1288,16 +1356,24 @@ pub mod tests {
         th.add_desc_chain(
             NetQueue::Rx,
             0,
-            &[(0, 500, VIRTQ_DESC_F_WRITE), (1, 500, VIRTQ_DESC_F_WRITE)],
+            &[
+                (0, 500, VIRTQ_DESC_F_WRITE),
+                (1, 500, VIRTQ_DESC_F_WRITE),
+                (2, 526, VIRTQ_DESC_F_WRITE),
+            ],
         );
         th.add_desc_chain(
             NetQueue::Rx,
-            1000,
-            &[(2, 500, VIRTQ_DESC_F_WRITE), (3, 500, VIRTQ_DESC_F_WRITE)],
+            2000,
+            &[
+                (3, 500, VIRTQ_DESC_F_WRITE),
+                (4, 500, VIRTQ_DESC_F_WRITE),
+                (5, 526, VIRTQ_DESC_F_WRITE),
+            ],
         );
         // Inject 2 frames to tap and run epoll.
-        let frame_1 = inject_tap_tx_frame(&th.net(), 200);
-        let frame_2 = inject_tap_tx_frame(&th.net(), 300);
+        let mut frame_1 = inject_tap_tx_frame(&th.net(), 200);
+        let mut frame_2 = inject_tap_tx_frame(&th.net(), 300);
         check_metric_after_block!(
             th.net().metrics.rx_packets_count,
             2,
@@ -1305,20 +1381,24 @@ pub mod tests {
         );

         // Check that the frames weren't deferred.
-        assert!(!th.net().rx_deferred_frame);
+        assert!(th.net().rx_buffer.deferred_descriptor.is_none());
         // Check that the used queue has advanced.
         assert_eq!(th.rxq.used.idx.get(), 2);
         assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring));
         // Check that the 1st frame was written successfully to the 1st Rx descriptor chain.
+        header_set_num_buffers(frame_1.as_mut_slice(), 1);
         th.rxq
             .check_used_elem(0, 0, frame_1.len().try_into().unwrap());
         th.rxq.dtable[0].check_data(&frame_1);
         th.rxq.dtable[1].check_data(&[0; 500]);
+        th.rxq.dtable[2].check_data(&[0; 526]);
         // Check that the 2nd frame was written successfully to the 2nd Rx descriptor chain.
+        header_set_num_buffers(frame_2.as_mut_slice(), 1);
         th.rxq
-            .check_used_elem(1, 2, frame_2.len().try_into().unwrap());
-        th.rxq.dtable[2].check_data(&frame_2);
-        th.rxq.dtable[3].check_data(&[0; 500]);
+            .check_used_elem(1, 3, frame_2.len().try_into().unwrap());
+        th.rxq.dtable[3].check_data(&frame_2);
+        th.rxq.dtable[4].check_data(&[0; 500]);
+        th.rxq.dtable[2].check_data(&[0; 526]);
     }

     #[test]
@@ -1605,6 +1685,19 @@ pub mod tests {
     fn test_mmds_detour_and_injection() {
         let mut net = default_net();

+        // Inject a fake buffer in the device's buffers, otherwise we won't be able to receive
+        // the MMDS frame. One iovec will be just fine.
+        let mut fake_buffer = vec![0u8; 1024];
+        let iov_buffer = IoVecBufferMut::from(fake_buffer.as_mut_slice());
+        net.rx_buffer.iovec = iov_buffer;
+        net.rx_buffer
+            .parsed_descriptors
+            .push_back(ParsedDescriptorChain {
+                head_index: 1,
+                length: 1024,
+                nr_iovecs: 1,
+            });
+
         let src_mac = MacAddr::from_str("11:11:11:11:11:11").unwrap();
         let src_ip = Ipv4Addr::new(10, 1, 2, 3);
         let dst_mac = MacAddr::from_str("22:22:22:22:22:22").unwrap();
@@ -1721,8 +1814,12 @@ pub mod tests {
         // SAFETY: its a valid fd
         unsafe { libc::close(th.net.lock().unwrap().tap.as_raw_fd()) };

-        // The RX queue is empty and rx_deffered_frame is set.
-        th.net().rx_deferred_frame = true;
+        // The RX queue is empty and there is a deferred frame.
+ th.net().rx_buffer.deferred_descriptor = Some(ParsedDescriptorChain { + head_index: 1, + length: 100, + nr_iovecs: 1, + }); check_metric_after_block!( th.net().metrics.no_rx_avail_buffer, 1, @@ -1732,68 +1829,14 @@ pub mod tests { // We need to set this here to false, otherwise the device will try to // handle a deferred frame, it will fail and will never try to read from // the tap. - th.net().rx_deferred_frame = false; + th.net().rx_buffer.deferred_descriptor = None; - // Fake an avail buffer; this time, tap reading should error out. - th.rxq.avail.idx.set(1); - check_metric_after_block!( - th.net().metrics.tap_read_fails, - 1, - th.simulate_event(NetEvent::Tap) - ); - } - - #[test] - fn test_deferred_frame() { - let mem = single_region_mem(2 * MAX_BUFFER_SIZE); - let mut th = TestHelper::get_default(&mem); - th.activate_net(); - - let rx_packets_count = th.net().metrics.rx_packets_count.count(); - let _ = inject_tap_tx_frame(&th.net(), 1000); - // Trigger a Tap event that. This should fail since there - // are not any available descriptors in the queue - check_metric_after_block!( - th.net().metrics.no_rx_avail_buffer, - 1, - th.simulate_event(NetEvent::Tap) - ); - // The frame we read from the tap should be deferred now and - // no frames should have been transmitted - assert!(th.net().rx_deferred_frame); - assert_eq!(th.net().metrics.rx_packets_count.count(), rx_packets_count); - - // Let's add a second frame, which should really have the same - // fate. - let _ = inject_tap_tx_frame(&th.net(), 1000); - - // Adding a descriptor in the queue. This should handle the first deferred - // frame. However, this should try to handle the second tap as well and fail - // since there's only one Descriptor Chain in the queue. th.add_desc_chain(NetQueue::Rx, 0, &[(0, 4096, VIRTQ_DESC_F_WRITE)]); check_metric_after_block!( - th.net().metrics.no_rx_avail_buffer, + th.net().metrics.tap_read_fails, 1, th.simulate_event(NetEvent::Tap) ); - // We should still have a deferred frame - assert!(th.net().rx_deferred_frame); - // However, we should have delivered the first frame - assert_eq!( - th.net().metrics.rx_packets_count.count(), - rx_packets_count + 1 - ); - - // Let's add one more descriptor and try to handle the last frame as well. - th.add_desc_chain(NetQueue::Rx, 0, &[(0, 4096, VIRTQ_DESC_F_WRITE)]); - check_metric_after_block!( - th.net().metrics.rx_packets_count, - 1, - th.simulate_event(NetEvent::RxQueue) - ); - - // We should be done with any deferred frame - assert!(!th.net().rx_deferred_frame); } #[test] @@ -1908,10 +1951,10 @@ pub mod tests { let mut rl = RateLimiter::new(1000, 0, 500, 0, 0, 0).unwrap(); // set up RX - assert!(!th.net().rx_deferred_frame); + assert!(th.net().rx_buffer.deferred_descriptor.is_none()); th.add_desc_chain(NetQueue::Rx, 0, &[(0, 4096, VIRTQ_DESC_F_WRITE)]); - let frame = inject_tap_tx_frame(&th.net(), 1000); + let mut frame = inject_tap_tx_frame(&th.net(), 1000); // use up the budget (do it after injecting the tx frame, as socket communication is // slow enough that the ratelimiter could replenish in the meantime). 
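
The rate-limiter tests above all exercise the same invariant: a frame that has already been written into guest memory is never dropped when the limiter throttles it; its descriptor chain is parked in `deferred_descriptor` and retried before any new TAP read. A toy model of that control flow (illustrative only; `Budget`, `Desc`, and `Rx` are made-up stand-ins, not Firecracker types):

```rust
// Models the deferred-descriptor flow: deliver if the budget allows,
// otherwise stash the descriptor; on resume, retry the stashed one first.
struct Budget {
    bytes: u64,
}

impl Budget {
    fn consume(&mut self, n: u64) -> bool {
        if self.bytes >= n { self.bytes -= n; true } else { false }
    }
}

struct Desc {
    length: u32,
}

struct Rx {
    budget: Budget,
    deferred: Option<Desc>,
}

impl Rx {
    /// Try to deliver one frame; on failure it becomes the deferred descriptor.
    fn deliver(&mut self, dc: Desc) -> bool {
        if self.budget.consume(dc.length as u64) {
            true
        } else {
            self.deferred = Some(dc);
            false
        }
    }

    /// Mirrors `resume_rx`: finish the deferred frame first, then take new ones.
    fn resume(&mut self, incoming: &mut Vec<Desc>) {
        if let Some(dc) = self.deferred.take() {
            if !self.deliver(dc) {
                return; // still throttled; don't touch the TAP
            }
        }
        while let Some(dc) = incoming.pop() {
            if !self.deliver(dc) {
                return;
            }
        }
    }
}

fn main() {
    let mut rx = Rx { budget: Budget { bytes: 1000 }, deferred: None };
    let mut incoming = vec![Desc { length: 800 }, Desc { length: 800 }];
    rx.resume(&mut incoming);
    assert!(rx.deferred.is_some()); // second frame was deferred, not dropped
}
```
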
@@ -1928,7 +1971,7 @@ pub mod tests {
             // assert that limiter is blocked
             assert!(th.net().rx_rate_limiter.is_blocked());
             assert_eq!(th.net().metrics.rx_rate_limiter_throttled.count(), 1);
-            assert!(th.net().rx_deferred_frame);
+            assert!(th.net().rx_buffer.deferred_descriptor.is_some());
             // assert that no operation actually completed (limiter blocked it)
             assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring));
             // make sure the data is still queued for processing
@@ -1963,6 +2006,7 @@ pub mod tests {
             assert_eq!(th.rxq.used.idx.get(), 1);
             th.rxq
                 .check_used_elem(0, 0, frame.len().try_into().unwrap());
+            header_set_num_buffers(frame.as_mut_slice(), 1);
             th.rxq.dtable[0].check_data(&frame);
         }
     }
@@ -2026,9 +2070,9 @@ pub mod tests {
         let mut rl = RateLimiter::new(0, 0, 0, 1, 0, 500).unwrap();

         // set up RX
-        assert!(!th.net().rx_deferred_frame);
+        assert!(th.net().rx_buffer.deferred_descriptor.is_none());
         th.add_desc_chain(NetQueue::Rx, 0, &[(0, 4096, VIRTQ_DESC_F_WRITE)]);
-        let frame = inject_tap_tx_frame(&th.net(), 1234);
+        let mut frame = inject_tap_tx_frame(&th.net(), 1234);

         // use up the initial budget
         assert!(rl.consume(1, TokenType::Ops));
@@ -2048,7 +2092,7 @@ pub mod tests {
             // assert that limiter is blocked
             assert!(th.net().rx_rate_limiter.is_blocked());
             assert!(th.net().metrics.rx_rate_limiter_throttled.count() >= 1);
-            assert!(th.net().rx_deferred_frame);
+            assert!(th.net().rx_buffer.deferred_descriptor.is_some());
             // assert that no operation actually completed (limiter blocked it)
             assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring));
             // make sure the data is still queued for processing
@@ -2075,6 +2119,7 @@ pub mod tests {
             assert_eq!(th.rxq.used.idx.get(), 1);
             th.rxq
                 .check_used_elem(0, 0, frame.len().try_into().unwrap());
+            header_set_num_buffers(frame.as_mut_slice(), 1);
             th.rxq.dtable[0].check_data(&frame);
         }
     }
diff --git a/src/vmm/src/devices/virtio/net/mod.rs b/src/vmm/src/devices/virtio/net/mod.rs
index 1a7972595ad..e8a3f86ac72 100644
--- a/src/vmm/src/devices/virtio/net/mod.rs
+++ b/src/vmm/src/devices/virtio/net/mod.rs
@@ -27,8 +27,10 @@ pub mod test_utils;
 mod gen;

 pub use tap::{Tap, TapError};
+use vm_memory::VolatileMemoryError;

 pub use self::device::Net;
+use super::iovec::IoVecError;

 /// Enum representing the Net device queue types
 #[derive(Debug)]
@@ -50,6 +52,10 @@ pub enum NetError {
     EventFd(io::Error),
     /// IO error: {0}
     IO(io::Error),
+    /// Error writing in guest memory: {0}
+    GuestMemoryError(#[from] VolatileMemoryError),
     /// The VNET header is missing from the frame
     VnetHeaderMissing,
+    /// IoVecBuffer(Mut) error: {0}
+    IoVecError(#[from] IoVecError),
 }
diff --git a/src/vmm/src/devices/virtio/net/tap.rs b/src/vmm/src/devices/virtio/net/tap.rs
index 20024a1ae8e..4d1757edc8e 100644
--- a/src/vmm/src/devices/virtio/net/tap.rs
+++ b/src/vmm/src/devices/virtio/net/tap.rs
@@ -7,7 +7,7 @@

 use std::fmt::{self, Debug};
 use std::fs::File;
-use std::io::{Error as IoError, Read};
+use std::io::Error as IoError;
 use std::os::raw::*;
 use std::os::unix::io::{AsRawFd, FromRawFd, RawFd};

@@ -190,11 +190,19 @@ impl Tap {
         }
         Ok(usize::try_from(ret).unwrap())
     }
-}

-impl Read for Tap {
-    fn read(&mut self, buf: &mut [u8]) -> Result<usize, IoError> {
-        self.tap_file.read(buf)
+    /// Read from the tap device into the memory described by a slice of `iovec`s
+    pub(crate) fn read_iovec(&mut self, buffer: &mut [libc::iovec]) -> Result<usize, IoError> {
+        let iov = buffer.as_mut_ptr();
+        let iovcnt = buffer.len().try_into().unwrap();
+
+        // SAFETY: `readv` is safe. Called with a valid tap fd; the iovec pointer and length
+        // are provided by the `IoVecBufferMut` implementation, and we check the return value.
+        let ret = unsafe { libc::readv(self.tap_file.as_raw_fd(), iov, iovcnt) };
+        if ret == -1 {
+            return Err(IoError::last_os_error());
+        }
+        Ok(usize::try_from(ret).unwrap())
     }
 }

@@ -211,6 +219,7 @@ pub mod tests {
     use std::os::unix::ffi::OsStrExt;

     use super::*;
+    use crate::devices::virtio::iovec::IoVecBufferMut;
     use crate::devices::virtio::net::gen;
     use crate::devices::virtio::net::test_utils::{enable, if_index, TapTrafficSimulator};

@@ -218,7 +227,6 @@ pub mod tests {

     const VNET_HDR_SIZE: usize = 10;
     const PAYLOAD_SIZE: usize = 512;
-    const PACKET_SIZE: usize = 1024;

     #[test]
     fn test_tap_name() {
@@ -287,23 +295,6 @@ pub mod tests {
         assert_eq!(tap.as_raw_fd(), tap.tap_file.as_raw_fd());
     }

-    #[test]
-    fn test_read() {
-        let mut tap = Tap::open_named("").unwrap();
-        enable(&tap);
-        let tap_traffic_simulator = TapTrafficSimulator::new(if_index(&tap));
-
-        let packet = vmm_sys_util::rand::rand_alphanumerics(PAYLOAD_SIZE);
-        tap_traffic_simulator.push_tx_packet(packet.as_bytes());
-
-        let mut buf = [0u8; PACKET_SIZE];
-        assert_eq!(tap.read(&mut buf).unwrap(), PAYLOAD_SIZE + VNET_HDR_SIZE);
-        assert_eq!(
-            &buf[VNET_HDR_SIZE..packet.len() + VNET_HDR_SIZE],
-            packet.as_bytes()
-        );
-    }
-
     #[test]
     fn test_write_iovec() {
         let mut tap = Tap::open_named("").unwrap();
@@ -339,4 +330,26 @@ pub mod tests {
             fragment3
         );
     }
+
+    #[test]
+    fn test_read_iovec() {
+        let mut tap = Tap::open_named("").unwrap();
+        enable(&tap);
+        let tap_traffic_simulator = TapTrafficSimulator::new(if_index(&tap));
+
+        let mut buff1 = vec![0; PAYLOAD_SIZE + VNET_HDR_SIZE];
+        let mut buff2 = vec![0; 2 * PAYLOAD_SIZE];
+
+        let mut rx_buffers = IoVecBufferMut::from(vec![buff1.as_mut_slice(), buff2.as_mut_slice()]);
+
+        let packet = vmm_sys_util::rand::rand_alphanumerics(2 * PAYLOAD_SIZE);
+        tap_traffic_simulator.push_tx_packet(packet.as_bytes());
+        assert_eq!(
+            tap.read_iovec(rx_buffers.as_iovec_mut_slice()).unwrap(),
+            2 * PAYLOAD_SIZE + VNET_HDR_SIZE
+        );
+        assert_eq!(&buff1[VNET_HDR_SIZE..], &packet.as_bytes()[..PAYLOAD_SIZE]);
+        assert_eq!(&buff2[..PAYLOAD_SIZE], &packet.as_bytes()[PAYLOAD_SIZE..]);
+        assert_eq!(&buff2[PAYLOAD_SIZE..], &vec![0; PAYLOAD_SIZE])
+    }
 }
diff --git a/src/vmm/src/devices/virtio/net/test_utils.rs b/src/vmm/src/devices/virtio/net/test_utils.rs
index 07808bbb44b..eb1c6f6e883 100644
--- a/src/vmm/src/devices/virtio/net/test_utils.rs
+++ b/src/vmm/src/devices/virtio/net/test_utils.rs
@@ -430,8 +430,9 @@ pub mod test {
             event_fd.write(1).unwrap();
         }

-        /// Generate a tap frame of `frame_len` and check that it is deferred
-        pub fn check_rx_deferred_frame(&mut self, frame_len: usize) -> Vec<u8> {
+        /// Generate a tap frame of `frame_len` and check that it is not read and
+        /// the descriptor chain has been discarded
+        pub fn check_rx_discarded_buffer(&mut self, frame_len: usize) -> Vec<u8> {
             let used_idx = self.rxq.used.idx.get();

             // Inject frame to tap and run epoll.
@@ -441,8 +442,6 @@ pub mod test {
                 0,
                 self.event_manager.run_with_timeout(100).unwrap()
             );
-            // Check that the frame has been deferred.
-            assert!(self.net().rx_deferred_frame);
             // Check that the descriptor chain has been discarded.
            assert_eq!(self.rxq.used.idx.get(), used_idx + 1);
             assert!(&self.net().irq_trigger.has_pending_irq(IrqType::Vring));

@@ -454,16 +453,9 @@ pub mod test {
         /// is eventually received by the guest
         pub fn check_rx_queue_resume(&mut self, expected_frame: &[u8]) {
             let used_idx = self.rxq.used.idx.get();
-            // Add a valid Rx avail descriptor chain and run epoll.
-            self.add_desc_chain(
-                NetQueue::Rx,
-                0,
-                &[(
-                    0,
-                    u32::try_from(expected_frame.len()).unwrap(),
-                    VIRTQ_DESC_F_WRITE,
-                )],
-            );
+            // Add a valid Rx avail descriptor chain and run epoll. We do not negotiate any
+            // feature offloading, so the buffers need to be at least 1526 bytes long.
+            self.add_desc_chain(NetQueue::Rx, 0, &[(0, 1526, VIRTQ_DESC_F_WRITE)]);
             check_metric_after_block!(
                 self.net().metrics.rx_packets_count,
                 1,

From 91238d61067843b2f3403f5fda7bcd27240c5d98 Mon Sep 17 00:00:00 2001
From: Babis Chalios
Date: Fri, 27 Sep 2024 14:00:03 +0300
Subject: [PATCH 4/5] net: support serializing RxBuffer

Now that we pre-process the buffers that the guest provides for
performing RX, we need to save them in the VM state snapshot file for
networking to work correctly post-snapshot resume. Implement Persist
for RxBuffers and plug it into the (de)serialization logic of the
network device.

Co-authored-by: Egor Lazarchuk
Signed-off-by: Babis Chalios
---
 src/vmm/src/devices/virtio/iovec.rs       |  7 ++---
 src/vmm/src/devices/virtio/net/persist.rs | 32 +++++++++++++++++++++--
 2 files changed, 34 insertions(+), 5 deletions(-)

diff --git a/src/vmm/src/devices/virtio/iovec.rs b/src/vmm/src/devices/virtio/iovec.rs
index eb8e46e7de6..a1a6b0d068c 100644
--- a/src/vmm/src/devices/virtio/iovec.rs
+++ b/src/vmm/src/devices/virtio/iovec.rs
@@ -4,6 +4,7 @@
 use std::io::ErrorKind;

 use libc::{c_void, iovec, size_t};
+use serde::{Deserialize, Serialize};
 #[cfg(not(kani))]
 use smallvec::SmallVec;
 use vm_memory::bitmap::Bitmap;
@@ -218,7 +219,7 @@ impl IoVecBuffer {
     }
 }

-#[derive(Debug)]
+#[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct ParsedDescriptorChain {
     pub head_index: u16,
     pub length: u32,
@@ -233,9 +234,9 @@ pub struct IoVecBufferMut {
     // container of the memory regions included in this IO vector
-    vecs: IovDeque,
+    pub vecs: IovDeque,
     // Total length of the IoVecBufferMut
-    len: usize,
+    pub len: usize,
 }

 impl IoVecBufferMut {
diff --git a/src/vmm/src/devices/virtio/net/persist.rs b/src/vmm/src/devices/virtio/net/persist.rs
index 4f0ae35d966..c7918f07ab3 100644
--- a/src/vmm/src/devices/virtio/net/persist.rs
+++ b/src/vmm/src/devices/virtio/net/persist.rs
@@ -9,9 +9,10 @@ use std::sync::{Arc, Mutex};

 use serde::{Deserialize, Serialize};

-use super::device::Net;
-use super::{TapError, NET_NUM_QUEUES};
+use super::device::{Net, RxBuffers};
+use super::{TapError, NET_NUM_QUEUES, RX_INDEX};
 use crate::devices::virtio::device::DeviceState;
+use crate::devices::virtio::iovec::ParsedDescriptorChain;
 use crate::devices::virtio::persist::{PersistError as VirtioStateError, VirtioDeviceState};
 use crate::devices::virtio::queue::FIRECRACKER_MAX_QUEUE_SIZE;
 use crate::devices::virtio::TYPE_NET;
@@ -31,6 +32,23 @@ pub struct NetConfigSpaceState {
     guest_mac: Option<MacAddr>,
 }

+/// Information about the parsed RX buffers
+#[derive(Debug, Default, Clone, Serialize, Deserialize)]
+pub struct RxBufferState {
+    // Number of descriptor chains we have parsed from the guest
+    parsed_descriptor_chains_nr: u16,
+    deferred_descriptor: Option<ParsedDescriptorChain>,
+}
+
+impl RxBufferState {
+    fn from_rx_buffers(rx_buffer: &RxBuffers) -> Self {
+        RxBufferState {
+            parsed_descriptor_chains_nr: rx_buffer.parsed_descriptors.len().try_into().unwrap(),
+            deferred_descriptor: rx_buffer.deferred_descriptor.clone(),
+        }
+    }
+}
+
 /// Information about the network device that are saved
 /// at snapshot.
 #[derive(Debug, Clone, Serialize, Deserialize)]
@@ -43,6 +61,7 @@ pub struct NetState {
     pub mmds_ns: Option<MmdsNetworkStackState>,
     config_space: NetConfigSpaceState,
     virtio_state: VirtioDeviceState,
+    rx_buffers_state: RxBufferState,
 }

 /// Auxiliary structure for creating a device when resuming from a snapshot.
@@ -85,6 +104,7 @@ impl Persist<'_> for Net {
                 guest_mac: self.guest_mac,
             },
             virtio_state: VirtioDeviceState::from_device(self),
+            rx_buffers_state: RxBufferState::from_rx_buffers(&self.rx_buffer),
         }
     }

@@ -137,6 +157,14 @@ impl Persist<'_> for Net {
             .map_err(NetPersistError::TapSetOffload)?;

         net.device_state = DeviceState::Activated(constructor_args.mem);
+
+        // Recreate `Net::rx_buffer`. We do it by re-parsing the RX queue: we temporarily
+        // roll back `next_avail` in the RX queue and call `parse_rx_descriptors`.
+        net.queues[RX_INDEX].next_avail -= state.rx_buffers_state.parsed_descriptor_chains_nr;
+        net.parse_rx_descriptors();
+        net.rx_buffer
+            .deferred_descriptor
+            .clone_from(&state.rx_buffers_state.deferred_descriptor);
         }

         Ok(net)

From 7ce0211f9cc0c0dfb7cddfd297df4f0df7687ab1 Mon Sep 17 00:00:00 2001
From: Babis Chalios
Date: Tue, 1 Oct 2024 14:52:12 +0200
Subject: [PATCH 5/5] iovec: fix kani proof for write_volatile_at

The IoVecBufferMut type now uses IovDeque as its backing memory.
IovDeque performs a custom memory allocation, using memfd_create() and
a combination of mmap() calls, in order to provide a memory layout
where the iovec objects stored in the IovDeque will always be in
consecutive memory.

kani doesn't really get along with these system calls, which breaks our
proof for IoVecBufferMut::write_volatile_at. Substitute memory
allocation and deallocation with plain calls to std::alloc::(de)alloc
when we run kani proofs. Also provide a stub for IovDeque::push_back
that preserves the same memory layout invariants.

Signed-off-by: Babis Chalios
---
 src/vmm/src/devices/virtio/iov_deque.rs |  8 +--
 src/vmm/src/devices/virtio/iovec.rs     | 66 ++++++++++++++++++++++++-
 2 files changed, 69 insertions(+), 5 deletions(-)

diff --git a/src/vmm/src/devices/virtio/iov_deque.rs b/src/vmm/src/devices/virtio/iov_deque.rs
index 201f1d9c92f..0d801d10f3d 100644
--- a/src/vmm/src/devices/virtio/iov_deque.rs
+++ b/src/vmm/src/devices/virtio/iov_deque.rs
@@ -71,9 +71,9 @@ pub enum IovDequeError {
 // so making a slice out of them does not require any copies.
 #[derive(Debug)]
 pub struct IovDeque {
-    iov: *mut libc::iovec,
-    start: u16,
-    len: u16,
+    pub iov: *mut libc::iovec,
+    pub start: u16,
+    pub len: u16,
 }

 // SAFETY: This is `Send`. We hold sole ownership of the underlying buffer.
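
The restore logic in the `Persist` hunk above deserves a second look: rather than serializing the parsed iovecs themselves (they point into guest memory and would be meaningless in a new process), the snapshot only records how many descriptor chains had been parsed; on restore, the queue's next-available index is rewound by that count and parsing is simply re-run. A toy model of the idea (illustrative only; `Queue`, `RxState`, and `parse_chains` are made-up stand-ins):

```rust
// Models the rewind-and-reparse restore trick: persist a count, not buffers.
struct Queue {
    next_avail: u16,
}

struct RxState {
    parsed_chains: u16,
}

fn parse_chains(q: &mut Queue, avail: &[u32]) -> Vec<u32> {
    // Consume everything the guest has made available so far.
    let parsed: Vec<u32> = avail[q.next_avail as usize..].to_vec();
    q.next_avail += parsed.len() as u16;
    parsed
}

fn main() {
    let avail = vec![10, 20, 30, 40]; // stand-ins for descriptor chains
    let mut q = Queue { next_avail: 1 };
    let parsed = parse_chains(&mut q, &avail); // parses [20, 30, 40]
    let state = RxState { parsed_chains: parsed.len() as u16 };

    // --- snapshot taken here; only `state` and the queue index survive ---

    // On restore, rewind and re-parse to rebuild the same buffer contents.
    q.next_avail -= state.parsed_chains;
    let reparsed = parse_chains(&mut q, &avail);
    assert_eq!(parsed, reparsed);
}
```
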
@@ -215,7 +215,7 @@ impl IovDeque {

     /// Returns `true` if the [`IovDeque`] is full, `false` otherwise
     #[inline(always)]
-    fn is_full(&self) -> bool {
+    pub fn is_full(&self) -> bool {
         self.len() == FIRECRACKER_MAX_QUEUE_SIZE
     }

diff --git a/src/vmm/src/devices/virtio/iovec.rs b/src/vmm/src/devices/virtio/iovec.rs
index a1a6b0d068c..5a015975df0 100644
--- a/src/vmm/src/devices/virtio/iovec.rs
+++ b/src/vmm/src/devices/virtio/iovec.rs
@@ -795,6 +795,7 @@ mod tests {
 }

 #[cfg(kani)]
+#[allow(dead_code)] // Avoid warning when using stubs
 mod verification {
     use std::mem::ManuallyDrop;

@@ -803,7 +804,9 @@ mod verification {
     use vm_memory::VolatileSlice;

     use super::{IoVecBuffer, IoVecBufferMut, IoVecVec};
+    use crate::arch::PAGE_SIZE;
     use crate::devices::virtio::iov_deque::IovDeque;
+    use crate::devices::virtio::queue::FIRECRACKER_MAX_QUEUE_SIZE;

     // Maximum memory size to use for our buffers. For the time being 1KB.
     const GUEST_MEMORY_SIZE: usize = 1 << 10;
@@ -815,6 +818,50 @@ mod verification {
     // >= 1.
     const MAX_DESC_LENGTH: usize = 4;

+    mod stubs {
+        use super::*;
+
+        /// This is a stub for the `IovDeque::push_back` method.
+        ///
+        /// `IovDeque` relies on a special allocation of two pages of virtual memory, where both of
+        /// these point to the same underlying physical page. This way, the contents of the first
+        /// page of virtual memory are automatically mirrored in the second virtual page. We do
+        /// that in order to always have the elements that are currently in the ring buffer in
+        /// consecutive (virtual) memory.
+        ///
+        /// To build this particular memory layout we create a new `memfd` object, allocate memory
+        /// with `mmap` and call `mmap` again to make sure both pages point to the page allocated
+        /// via the `memfd` object. These FFI calls make kani complain, so here we mock the
+        /// `IovDeque` object memory with a normal memory allocation of two pages' worth of data.
+        ///
+        /// This stub helps imitate the effect of mirroring without the elaborate memory
+        /// allocation trick.
+        pub fn push_back(deque: &mut IovDeque, iov: iovec) {
+            // This should NEVER happen, since our ring buffer is as big as the maximum queue size.
+            // We also check for the sanity of the VirtIO queues, in queue.rs, which means that if
+            // we ever try to add something in a full ring buffer, there is an internal
+            // bug in the device emulation logic. Panic here because the device is
+            // hopelessly broken.
+            assert!(
+                !deque.is_full(),
+                "The number of `iovec` objects is bigger than the available space"
+            );
+
+            let offset = (deque.start + deque.len) as usize;
+            let mirror = if offset >= FIRECRACKER_MAX_QUEUE_SIZE as usize {
+                offset - FIRECRACKER_MAX_QUEUE_SIZE as usize
+            } else {
+                offset + FIRECRACKER_MAX_QUEUE_SIZE as usize
+            };
+
+            // SAFETY: `deque.iov` is a valid pointer and `deque.start + deque.len` is within
+            // range (we asserted before that the buffer is not full).
+            unsafe { deque.iov.add(offset).write_volatile(iov) };
+            unsafe { deque.iov.add(mirror).write_volatile(iov) };
+            deque.len += 1;
+        }
+    }
+
     fn create_iovecs(mem: *mut u8, size: usize, nr_descs: usize) -> (IoVecVec, u32) {
         let mut vecs: Vec<iovec> = Vec::with_capacity(nr_descs);
         let mut len = 0u32;
@@ -845,8 +892,23 @@ mod verification {
         }
     }

+    fn create_iov_deque() -> IovDeque {
+        // SAFETY: safe because the layout has non-zero size
+        let mem = unsafe {
+            std::alloc::alloc(std::alloc::Layout::from_size_align_unchecked(
+                2 * PAGE_SIZE,
+                PAGE_SIZE,
+            ))
+        };
+        IovDeque {
+            iov: mem.cast(),
+            start: kani::any_where(|&start| start < FIRECRACKER_MAX_QUEUE_SIZE),
+            len: 0,
+        }
+    }
+
     fn create_iovecs_mut(mem: *mut u8, size: usize, nr_descs: usize) -> (IovDeque, u32) {
-        let mut vecs = IovDeque::new().unwrap();
+        let mut vecs = create_iov_deque();
         let mut len = 0u32;
         for _ in 0..nr_descs {
             // The `IoVecBufferMut` constructors ensure that the memory region described by every
@@ -955,6 +1017,7 @@ mod verification {
     #[kani::proof]
     #[kani::unwind(5)]
     #[kani::solver(cadical)]
+    #[kani::stub(IovDeque::push_back, stubs::push_back)]
     fn verify_write_to_iovec() {
         for nr_descs in 0..MAX_DESC_LENGTH {
             let mut iov_mut = IoVecBufferMut::any_of_length(nr_descs);
@@ -979,6 +1042,7 @@ mod verification {
                 .unwrap(),
                 buf.len().min(iov_mut.len().saturating_sub(offset) as usize)
             );
+            std::mem::forget(iov_mut.vecs);
         }
     }
 }
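
Finally, the index arithmetic the `push_back` stub relies on can be checked in a few lines. The sketch below (illustrative only; it assumes `N = 256` to match `FIRECRACKER_MAX_QUEUE_SIZE` and uses a plain doubled array in place of the two mmap'd pages) shows that writing each element at both `offset` and its mirror keeps the live window contiguous even when it wraps past the end of the first "page":

```rust
// Checks the mirroring invariant: after writing at `offset` and `mirror(offset)`,
// the window `start..start + len` is always contiguous in the doubled buffer.
const N: usize = 256; // stands in for FIRECRACKER_MAX_QUEUE_SIZE

fn mirror(offset: usize) -> usize {
    // Same arithmetic as the kani stub above.
    if offset >= N { offset - N } else { offset + N }
}

fn main() {
    let mut buf = [0u32; 2 * N]; // stands in for the two mirrored pages
    let start = 250usize;

    // Push 10 elements, wrapping past the end of the first "page".
    for i in 0..10 {
        let offset = start + i; // like `deque.start + deque.len`; always < 2 * N
        buf[offset] = i as u32 + 1;
        buf[mirror(offset)] = i as u32 + 1;
    }

    // Despite the wrap, the live window is contiguous, so it can be handed
    // to `readv` as one slice without any copying.
    assert_eq!(&buf[start..start + 10], &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
}
```
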