
Commit 11e71f6

bchalios and ShadowCurse committed
virtio: add ring buffer type for describing guest memory
Add a ring buffer type that is tailored for holding `struct iovec` objects that point to guest
memory for IO. The `struct iovec` objects represent the memory that the guest passed to us as
`Descriptors` in a VirtIO queue for performing some I/O operation. We plan to use this type to
describe the guest memory we have available for doing network RX. This should facilitate us in
optimizing the reception of data from the TAP device using `readv`, thus avoiding a memory copy.

Co-authored-by: Egor Lazarchuk <[email protected]>
Signed-off-by: Babis Chalios <[email protected]>
1 parent 7803c42 commit 11e71f6
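The optimization the commit message refers to works roughly as follows: because the queued `struct iovec` entries end up contiguous in virtual memory, the whole batch of guest RX buffers can be handed to a single `readv` call. The sketch below is not part of this commit; it only illustrates that intent, and the `rx_from_tap` helper, its `tap_fd` parameter, and the pop-after-read bookkeeping are assumptions made for the example.

```rust
use std::os::fd::RawFd;

// Hypothetical helper (not in this commit): drain a TAP device into the guest
// buffers currently queued in an `IovDeque` with one `readv` call.
fn rx_from_tap(tap_fd: RawFd, deque: &mut IovDeque) -> std::io::Result<usize> {
    // `as_mut_ptr()` points at `len()` contiguous `iovec` entries thanks to the
    // double mapping, so no copy into a temporary buffer is needed.
    let nr_iovecs = deque.len() as libc::c_int;
    // SAFETY: the pointer and count describe valid, contiguous `iovec` objects.
    let bytes_read = unsafe { libc::readv(tap_fd, deque.as_mut_ptr(), nr_iovecs) };
    if bytes_read < 0 {
        return Err(std::io::Error::last_os_error());
    }
    // The caller would then determine how many descriptors the read filled and
    // `pop_front` exactly that many before handing them back to the guest.
    Ok(bytes_read as usize)
}
```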

File tree

2 files changed: +394 −0 lines changed

Lines changed: 393 additions & 0 deletions
@@ -0,0 +1,393 @@
// Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0

use std::os::fd::AsRawFd;

use libc::{c_int, c_void, iovec, off_t, size_t};
use memfd;

use super::queue::FIRECRACKER_MAX_QUEUE_SIZE;
use crate::arch::PAGE_SIZE;

#[derive(Debug, thiserror::Error, displaydoc::Display)]
pub enum IovDequeError {
    /// Error with memfd: {0}
    Memfd(#[from] memfd::Error),
    /// Error while resizing memfd: {0}
    MemfdResize(std::io::Error),
    /// Error calling mmap: {0}
    Mmap(std::io::Error),
}

/// [`IovDeque`] is a ring buffer tailored for `struct iovec` objects.
///
/// From the point of view of its API, [`IovDeque`] is a typical ring buffer that allows us to push
/// `struct iovec` objects at the end of the buffer and pop them from its beginning.
///
/// It is tailored to store `struct iovec` objects that describe memory that was passed to us from
/// the guest via a VirtIO queue. This allows us to assume a fixed maximum size for the ring buffer
/// (the negotiated size of the queue).
// An important feature of the data structure is that it can give us a slice of all `struct iovec`
// objects in the queue, so that we can use this `&mut [iovec]` to perform operations such as
// `readv`. A typical implementation of a ring buffer allows for entries to wrap around the end of
// the underlying buffer. For example, a ring buffer with a capacity of 10 elements which
// currently holds 4 elements can look like this:
//
//                              tail                    head
//                               |                       |
//                               v                       v
//                 +---+---+---+---+---+---+---+---+---+---+
//    ring buffer: | C | D |   |   |   |   |   |   | A | B |
//                 +---+---+---+---+---+---+---+---+---+---+
//
// When getting a slice for this data we should get something like that: &[A, B, C, D], which
// would require copies in order to make the elements contiguous in memory.
//
// In order to avoid that and make the operation of getting a slice more efficient, we implement
// the optimization described in the "Optimization" section of the "Circular buffer" wikipedia
// entry: https://en.wikipedia.org/wiki/Circular_buffer. The optimization consists of allocating
// double the size of the virtual memory required for the buffer and mapping both halves of it to
// the same physical memory. Looking at the same example as before, we should get this picture:
//
//                                  head    |            tail
//                                    |     |              |
//                                    v     |              v
//  +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//  | C | D |   |   |   |   |   |   | A | B | C | D |   |   |   |   |   |   | A | B |
//  +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//              First virtual page          |          Second virtual page
//                                          |
//                                          |
//
//                                    Virtual memory
// ---------------------------------------------------------------------------------------
//                                    Physical memory
//
//                      +---+---+---+---+---+---+---+---+---+---+
//                      | C | D |   |   |   |   |   |   | A | B |
//                      +---+---+---+---+---+---+---+---+---+---+
//
// Like that, the elements stored in the buffer are always laid out in contiguous virtual memory,
// so making a slice out of them does not require any copies.
#[derive(Debug)]
pub struct IovDeque {
    iov: *mut libc::iovec,
    start: u16,
    len: u16,
}

// SAFETY: This is `Send`. We hold sole ownership of the underlying buffer.
unsafe impl Send for IovDeque {}

impl IovDeque {
    /// Create a [`memfd`] object that represents a single physical page
    fn create_memfd() -> Result<memfd::Memfd, IovDequeError> {
        // Create a sealable memfd.
        let opts = memfd::MemfdOptions::default().allow_sealing(true);
        let mfd = opts.create("sized-1K")?;

        // Resize to system page size.
        mfd.as_file()
            .set_len(PAGE_SIZE.try_into().unwrap())
            .map_err(IovDequeError::MemfdResize)?;

        // Add seals to prevent further resizing.
        mfd.add_seals(&[memfd::FileSeal::SealShrink, memfd::FileSeal::SealGrow])?;

        // Prevent further sealing changes.
        mfd.add_seal(memfd::FileSeal::SealSeal)?;

        Ok(mfd)
    }

    /// A safe wrapper on top of libc's `mmap` system call
    fn mmap(
        addr: *mut c_void,
        len: size_t,
        prot: c_int,
        flags: c_int,
        fd: c_int,
        offset: off_t,
    ) -> Result<*mut c_void, IovDequeError> {
        // SAFETY: We are calling the system call with valid arguments and properly checking its
        // return value
        let ptr = unsafe { libc::mmap(addr, len, prot, flags, fd, offset) };
        if ptr == libc::MAP_FAILED {
            return Err(IovDequeError::Mmap(std::io::Error::last_os_error()));
        }

        Ok(ptr)
    }

    /// Allocate memory for our ring buffer
    ///
    /// This will allocate exactly two pages of virtual memory. In order to implement the
    /// optimization that allows us to always have elements in contiguous memory we need
    /// allocations at the granularity of `PAGE_SIZE`. Now, our queues are at maximum 256
    /// descriptors long and `struct iovec` looks like this:
    ///
    /// ```Rust
    /// pub struct iovec {
    ///     pub iov_base: *mut ::c_void,
    ///     pub iov_len: ::size_t,
    /// }
    /// ```
    ///
    /// so, it's 16 bytes long. As a result, we need a single page for holding the actual data of
    /// our buffer.
    fn allocate_ring_buffer_memory() -> Result<*mut c_void, IovDequeError> {
        // We allocate two pages because the size of `struct iovec` times our queue size equals
        // exactly one page. Add a debug assertion to reflect that and ensure that we will adapt
        // our logic if the assumption changes in the future.
        debug_assert_eq!(
            std::mem::size_of::<iovec>() * usize::from(FIRECRACKER_MAX_QUEUE_SIZE),
            PAGE_SIZE
        );

        Self::mmap(
            std::ptr::null_mut(),
            PAGE_SIZE * 2,
            libc::PROT_NONE,
            libc::MAP_PRIVATE | libc::MAP_ANONYMOUS,
            -1,
            0,
        )
    }

    /// Create a new [`IovDeque`] that can hold memory described by a single VirtIO queue.
    pub fn new() -> Result<Self, IovDequeError> {
        let memfd = Self::create_memfd()?;
        let raw_memfd = memfd.as_file().as_raw_fd();
        let buffer = Self::allocate_ring_buffer_memory()?;

        // Map the first page of virtual memory to the physical page described by the memfd object
        let _ = Self::mmap(
            buffer,
            PAGE_SIZE,
            libc::PROT_READ | libc::PROT_WRITE,
            libc::MAP_SHARED | libc::MAP_FIXED,
            raw_memfd,
            0,
        )?;

        // Map the second page of virtual memory to the physical page described by the memfd object
        // SAFETY: safe because `Self::allocate_ring_buffer_memory` allocates exactly two pages for
        // us
        let next_page = unsafe { buffer.add(PAGE_SIZE) };
        let _ = Self::mmap(
            next_page,
            PAGE_SIZE,
            libc::PROT_READ | libc::PROT_WRITE,
            libc::MAP_SHARED | libc::MAP_FIXED,
            raw_memfd,
            0,
        )?;

        Ok(Self {
            iov: buffer.cast(),
            start: 0,
            len: 0,
        })
    }

    /// Returns the number of `iovec` objects currently in the [`IovDeque`]
    #[inline(always)]
    pub fn len(&self) -> u16 {
        self.len
    }

    /// Returns `true` if the [`IovDeque`] is full, `false` otherwise
    #[inline(always)]
    fn is_full(&self) -> bool {
        self.len() == FIRECRACKER_MAX_QUEUE_SIZE
    }

    /// Resets the queue, dropping all its elements.
    #[inline(always)]
    pub fn clear(&mut self) {
        self.start = 0;
        self.len = 0;
    }

    /// Adds an `iovec` at the back of the ring buffer.
    ///
    /// Panics if the buffer is already full.
    pub fn push_back(&mut self, iov: iovec) {
        // This should NEVER happen, since our ring buffer is as big as the maximum queue size.
        // We also check for the sanity of the VirtIO queues, in queue.rs, which means that if we
        // ever try to add something in a full ring buffer, there is an internal bug in the device
        // emulation logic. Panic here because the device is hopelessly broken.
        assert!(
            !self.is_full(),
            "The number of `iovec` objects is bigger than the available space"
        );

        // SAFETY: self.iov is a valid pointer and `self.start + self.len` is within range (we
        // asserted before that the buffer is not full).
        unsafe {
            self.iov
                .add((self.start + self.len) as usize)
                .write_volatile(iov)
        };
        self.len += 1;
    }

    /// Returns the `iovec` at position `pos` if `pos < self.len()`, otherwise it will return
    /// `None`.
    pub fn peek(&self, pos: u16) -> Option<iovec> {
        if pos < self.len() {
            // SAFETY: Safe, because `self.iov` is a valid pointer and we checked that `pos` points
            // to an existing item.
            Some(unsafe { self.iov.add((self.start + pos) as usize).read_volatile() })
        } else {
            None
        }
    }

    /// Pops the first `nr_iovecs` iovecs from the buffer.
    ///
    /// Returns the total number of bytes of all the popped iovecs. This will panic if we are asked
    /// to pop more iovecs than what is currently available in the buffer.
    pub fn pop_front(&mut self, nr_iovecs: u16) -> usize {
        if self.len() < nr_iovecs {
            panic!("Internal bug! Trying to drop more iovec objects than what is available");
        }

        let mut bytes = 0;

        for i in 0..nr_iovecs {
            let iov = self.peek(i).unwrap();
            bytes += iov.iov_len;
        }

        self.start += nr_iovecs;
        self.len -= nr_iovecs;
        if self.start >= FIRECRACKER_MAX_QUEUE_SIZE {
            self.start -= FIRECRACKER_MAX_QUEUE_SIZE;
        }

        bytes
    }

    /// Gets a mutable pointer to the underlying `iovec` objects currently in the buffer.
    pub fn as_mut_ptr(&self) -> *mut iovec {
        // SAFETY: This is safe because `self.iov` is a valid `*mut iovec` and `self.start` is
        // always within range
        unsafe { self.iov.add(self.start as usize) }
    }
}

#[cfg(test)]
mod tests {
    use libc::iovec;

    use super::IovDeque;

    #[test]
    fn test_new() {
        let iov = IovDeque::new().unwrap();
        assert_eq!(iov.len(), 0);
    }

    fn make_iovec(id: u16, len: u16) -> iovec {
        iovec {
            iov_base: id as *mut libc::c_void,
            iov_len: len as usize,
        }
    }

    #[test]
    #[should_panic]
    fn test_push_back_too_many() {
        let mut iov = IovDeque::new().unwrap();
        assert_eq!(iov.len(), 0);

        for i in 0u16..256 {
            iov.push_back(make_iovec(i, i));
            assert_eq!(iov.len(), i + 1);
        }

        iov.push_back(make_iovec(0, 0));
    }

    #[test]
    fn test_pop() {
        let mut deque = IovDeque::new().unwrap();
        assert_eq!(deque.len(), 0);
        assert!(!deque.is_full());
        assert_eq!(deque.pop_front(0), 0);

        for i in 0u16..256 {
            deque.push_back(make_iovec(i, i));
            assert_eq!(deque.len(), i + 1);
        }

        assert!(deque.is_full());
        assert!(deque.len() != 0);

        for i in 0u16..256 {
            let bytes = deque.pop_front(1);
            assert_eq!(bytes, i as usize);
        }
    }

    #[test]
    fn test_pop_many() {
        let mut deque = IovDeque::new().unwrap();

        for i in 0u16..256 {
            deque.push_back(make_iovec(i, i));
        }

        assert_eq!(deque.pop_front(1), 0);
        assert_eq!(deque.len(), 255);
        assert_eq!(deque.pop_front(2), 3);
        assert_eq!(deque.len(), 253);
        assert_eq!(deque.pop_front(4), (3..7).sum::<usize>());
        assert_eq!(deque.len(), 249);
        assert_eq!(deque.pop_front(8), (7..15).sum::<usize>());
        assert_eq!(deque.len(), 241);
        assert_eq!(deque.pop_front(16), (15..31).sum::<usize>());
        assert_eq!(deque.len(), 225);
        assert_eq!(deque.pop_front(32), (31..63).sum::<usize>());
        assert_eq!(deque.len(), 193);
        assert_eq!(deque.pop_front(64), (63..127).sum::<usize>());
        assert_eq!(deque.len(), 129);
        assert_eq!(deque.pop_front(128), (127..255).sum::<usize>());
        assert_eq!(deque.len(), 1);
    }

    #[test]
    fn test_peek() {
        let mut deque = IovDeque::new().unwrap();

        for i in 0u16..256 {
            deque.push_back(make_iovec(i, i));
        }

        for i in 0u16..256 {
            assert_eq!(make_iovec(i, i), deque.peek(i).unwrap());
        }

        assert!(deque.peek(256).is_none());
    }

    #[test]
    fn test_as_mut_ptr() {
        let mut deque = IovDeque::new().unwrap();
        let mut copy: Vec<libc::iovec> = vec![];
        let ptr = deque.as_mut_ptr();
        assert!(!ptr.is_null());
        assert!(ptr.is_aligned());

        for i in 0..256 {
            deque.push_back(make_iovec(i, 100));
            copy.push(deque.peek(i).unwrap());
        }

        assert_eq!(copy.len(), deque.len() as usize);
        for copied_iov in copy {
            assert_eq!(copied_iov.iov_len, deque.pop_front(1));
        }
    }
}
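As a companion to the doc comment on the double-mapping optimization above, here is a test-style sketch. It is not part of the commit; it is written as if it lived next to the existing tests, reusing their `make_iovec` helper, and only illustrates what the two mirrored pages buy us: after the deque wraps past the end of the first virtual page, `peek` (and, by extension, the window starting at `as_mut_ptr`) still sees one contiguous run of elements.

```rust
// Illustrative sketch only (not in this commit): elements stay contiguous in
// virtual memory even after the deque wraps around the first page.
#[test]
fn test_wrap_around_stays_contiguous() {
    let mut deque = IovDeque::new().unwrap();

    // Fill the deque completely, then drop the first 200 elements so that
    // `start` moves close to the end of the first virtual page.
    for i in 0u16..256 {
        deque.push_back(make_iovec(i, i));
    }
    deque.pop_front(200);

    // Push 200 fresh elements. Their slots now live past index 256, i.e. in
    // the second virtual page, which aliases the same physical page.
    for i in 0u16..200 {
        deque.push_back(make_iovec(i, 1000 + i));
    }
    assert_eq!(deque.len(), 256);

    // Despite the wrap-around, every element is reachable through a single
    // contiguous window: first the 56 survivors, then the 200 new entries.
    for i in 0u16..56 {
        assert_eq!(deque.peek(i).unwrap().iov_len, (200 + i) as usize);
    }
    for i in 0u16..200 {
        assert_eq!(deque.peek(56 + i).unwrap().iov_len, (1000 + i) as usize);
    }
}
```

The pushes that land past index 256 simply write through the second virtual page, which aliases the same physical page, so the write path needs no index wrapping at all.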
