// Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0

use std::os::fd::AsRawFd;

use libc::{c_int, c_void, iovec, off_t, size_t};
use memfd;

use super::queue::FIRECRACKER_MAX_QUEUE_SIZE;
use crate::arch::PAGE_SIZE;

#[derive(Debug, thiserror::Error, displaydoc::Display)]
pub enum IovDequeError {
    /// Error with memfd: {0}
    Memfd(#[from] memfd::Error),
    /// Error while resizing memfd: {0}
    MemfdResize(std::io::Error),
    /// Error calling mmap: {0}
    Mmap(std::io::Error),
}

/// [`IovDeque`] is a ring buffer tailored for `struct iovec` objects.
///
/// From the point of view of API, [`IovDeque`] is a typical ring buffer that allows us to push
/// `struct iovec` objects at the end of the buffer and pop them from its beginning.
///
/// It is tailored to store `struct iovec` objects that describe memory that was passed to us from
/// the guest via a VirtIO queue. This allows us to assume a maximum size for the ring buffer (the
/// negotiated size of the queue).
// An important feature of the data structure is that it can give us a slice of all `struct iovec`
// objects in the queue, so that we can use this `&mut [iovec]` to perform operations such as
// `readv`. A typical implementation of a ring buffer allows for entries to wrap around the end of
// the underlying buffer. For example, a ring buffer with a capacity of 10 elements which
// currently holds 4 elements can look like this:
//
//                        tail                    head
//                          |                       |
//                          v                       v
//                +---+---+---+---+---+---+---+---+---+---+
//   ring buffer: | C | D |   |   |   |   |   |   | A | B |
//                +---+---+---+---+---+---+---+---+---+---+
//
// When getting a slice for this data we should get something like this: &[A, B, C, D], which
// would require copies in order to make the elements contiguous in memory.
//
// In order to avoid that and make the operation of getting a slice more efficient, we implement
// the optimization described in the "Optimization" section of the "Circular buffer" wikipedia
// entry: https://en.wikipedia.org/wiki/Circular_buffer. The optimization consists of allocating
// twice the virtual memory required for the buffer and mapping both halves to the same physical
// memory. Looking at the same example as before, we get this picture:
//
//                                  head             |          tail
//                                   |               |           |
//                                   v               |           v
//  +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//  | C | D |   |   |   |   |   |   | A | B | C | D |   |   |   |   |   |   | A | B |
//  +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//             First virtual page                    |          Second virtual page
//                                                   |
//                                                   |
//
//                                  Virtual memory
// ---------------------------------------------------------------------------------------
//                                  Physical memory
//
//                      +---+---+---+---+---+---+---+---+---+---+
//                      | C | D |   |   |   |   |   |   | A | B |
//                      +---+---+---+---+---+---+---+---+---+---+
//
// Like that, the elements stored in the buffer are always laid out in contiguous virtual memory,
// so making a slice out of them does not require any copies.
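//
// As a concrete sketch of the index arithmetic (the numbers refer to the illustrative 10-slot
// buffer above, not to the real 256-slot queue): with `start = 8` and `len = 4`, the live
// elements occupy virtual slots `start..start + len`, i.e. slots 8, 9, 10 and 11. Slots 10 and
// 11 fall into the second virtual page and alias physical slots 0 and 1, so reading slots 8..12
// in order yields [A, B, C, D] with no wrap-around handling and no copying.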
#[derive(Debug)]
pub struct IovDeque {
    iov: *mut libc::iovec,
    start: u16,
    len: u16,
}

// SAFETY: This is `Send`. We hold sole ownership of the underlying buffer.
unsafe impl Send for IovDeque {}

impl IovDeque {
    /// Create a [`memfd`] object that represents a single physical page
    fn create_memfd() -> Result<memfd::Memfd, IovDequeError> {
        // Create a sealable memfd.
        let opts = memfd::MemfdOptions::default().allow_sealing(true);
        let mfd = opts.create("sized-1K")?;

        // Resize to system page size.
        mfd.as_file()
            .set_len(PAGE_SIZE.try_into().unwrap())
            .map_err(IovDequeError::MemfdResize)?;

        // Add seals to prevent further resizing.
        mfd.add_seals(&[memfd::FileSeal::SealShrink, memfd::FileSeal::SealGrow])?;

        // Prevent further sealing changes.
        mfd.add_seal(memfd::FileSeal::SealSeal)?;

        Ok(mfd)
    }

    /// A safe wrapper on top of libc's `mmap` system call
    fn mmap(
        addr: *mut c_void,
        len: size_t,
        prot: c_int,
        flags: c_int,
        fd: c_int,
        offset: off_t,
    ) -> Result<*mut c_void, IovDequeError> {
        // SAFETY: We are calling the system call with valid arguments and properly checking its
        // return value
        let ptr = unsafe { libc::mmap(addr, len, prot, flags, fd, offset) };
        if ptr == libc::MAP_FAILED {
            return Err(IovDequeError::Mmap(std::io::Error::last_os_error()));
        }

        Ok(ptr)
    }

    /// Allocate memory for our ring buffer
    ///
    /// This will allocate exactly two pages of virtual memory. In order to implement the
    /// optimization that allows us to always have elements in contiguous memory, we need
    /// allocations at the granularity of `PAGE_SIZE`. Now, our queues are at maximum 256
    /// descriptors long and `struct iovec` looks like this:
    ///
    /// ```Rust
    /// pub struct iovec {
    ///     pub iov_base: *mut ::c_void,
    ///     pub iov_len: ::size_t,
    /// }
    /// ```
    ///
    /// so it's 16 bytes long. As a result, we need a single page for holding the actual data of
    /// our buffer.
    fn allocate_ring_buffer_memory() -> Result<*mut c_void, IovDequeError> {
        // We allocate two pages because the size of `struct iovec` times our maximum queue size
        // equals the page size. Add here a debug assertion to reflect that and ensure that we
        // will adapt our logic if the assumption changes in the future.
        debug_assert_eq!(
            std::mem::size_of::<iovec>() * usize::from(FIRECRACKER_MAX_QUEUE_SIZE),
            PAGE_SIZE
        );

        Self::mmap(
            std::ptr::null_mut(),
            PAGE_SIZE * 2,
            libc::PROT_NONE,
            libc::MAP_PRIVATE | libc::MAP_ANONYMOUS,
            -1,
            0,
        )
    }

    /// Create a new [`IovDeque`] that can hold memory described by a single VirtIO queue.
    pub fn new() -> Result<Self, IovDequeError> {
        let memfd = Self::create_memfd()?;
        let raw_memfd = memfd.as_file().as_raw_fd();
        let buffer = Self::allocate_ring_buffer_memory()?;

        // Map the first page of virtual memory to the physical page described by the memfd object
        let _ = Self::mmap(
            buffer,
            PAGE_SIZE,
            libc::PROT_READ | libc::PROT_WRITE,
            libc::MAP_SHARED | libc::MAP_FIXED,
            raw_memfd,
            0,
        )?;

        // Map the second page of virtual memory to the physical page described by the memfd object
        // SAFETY: safe because `Self::allocate_ring_buffer_memory` allocates exactly two pages for
        // us
        let next_page = unsafe { buffer.add(PAGE_SIZE) };
        let _ = Self::mmap(
            next_page,
            PAGE_SIZE,
            libc::PROT_READ | libc::PROT_WRITE,
            libc::MAP_SHARED | libc::MAP_FIXED,
            raw_memfd,
            0,
        )?;

        Ok(Self {
            iov: buffer.cast(),
            start: 0,
            len: 0,
        })
    }

    /// Returns the number of `iovec` objects currently in the [`IovDeque`]
    #[inline(always)]
    pub fn len(&self) -> u16 {
        self.len
    }

    /// Returns `true` if the [`IovDeque`] is full, `false` otherwise
    #[inline(always)]
    fn is_full(&self) -> bool {
        self.len() == FIRECRACKER_MAX_QUEUE_SIZE
    }

    /// Resets the queue, dropping all its elements.
    #[inline(always)]
    pub fn clear(&mut self) {
        self.start = 0;
        self.len = 0;
    }

    /// Adds an `iovec` at the back of the ring buffer.
    ///
    /// Panics if the buffer is full.
    pub fn push_back(&mut self, iov: iovec) {
        // This should NEVER happen, since our ring buffer is as big as the maximum queue size.
        // We also sanity-check the VirtIO queues in queue.rs, which means that if we ever try to
        // add something to a full ring buffer, there is an internal bug in the device emulation
        // logic. Panic here because the device is hopelessly broken.
        assert!(
            !self.is_full(),
            "The number of `iovec` objects is bigger than the available space"
        );

        // SAFETY: self.iov is a valid pointer and `self.start + self.len` is within range (we
        // asserted before that the buffer is not full).
        unsafe {
            self.iov
                .add((self.start + self.len) as usize)
                .write_volatile(iov)
        };
        self.len += 1;
    }

    /// Returns the `iovec` at position `pos` if `pos < self.len()`, otherwise returns `None`.
    pub fn peek(&self, pos: u16) -> Option<iovec> {
        if pos < self.len() {
            // SAFETY: Safe, because `self.iov` is a valid pointer and we checked that `pos` points
            // to an existing item.
            Some(unsafe { self.iov.add((self.start + pos) as usize).read_volatile() })
        } else {
            None
        }
    }

    /// Pops the first `nr_iovecs` iovecs from the buffer.
    ///
    /// Returns the total number of bytes of all the popped iovecs. Panics if asked to pop more
    /// iovecs than are currently available in the buffer.
    pub fn pop_front(&mut self, nr_iovecs: u16) -> usize {
        if self.len() < nr_iovecs {
            panic!("Internal bug! Trying to drop more iovec objects than what is available");
        }

        let mut bytes = 0;

        for i in 0..nr_iovecs {
            let iov = self.peek(i).unwrap();
            bytes += iov.iov_len;
        }

        self.start += nr_iovecs;
        self.len -= nr_iovecs;
        if self.start >= FIRECRACKER_MAX_QUEUE_SIZE {
            self.start -= FIRECRACKER_MAX_QUEUE_SIZE;
        }

        bytes
    }

    /// Gets a mutable pointer to the underlying `iovec` objects currently in the buffer.
    pub fn as_mut_ptr(&self) -> *mut iovec {
        // SAFETY: This is safe because `self.iov` is a valid `*mut iovec` and `self.start` is
        // always within range
        unsafe { self.iov.add(self.start as usize) }
    }
}

#[cfg(test)]
mod tests {
    use libc::iovec;

    use super::IovDeque;

    #[test]
    fn test_new() {
        let iov = IovDeque::new().unwrap();
        assert_eq!(iov.len(), 0);
    }

    fn make_iovec(id: u16, len: u16) -> iovec {
        iovec {
            iov_base: id as *mut libc::c_void,
            iov_len: len as usize,
        }
    }

    #[test]
    #[should_panic]
    fn test_push_back_too_many() {
        let mut iov = IovDeque::new().unwrap();
        assert_eq!(iov.len(), 0);

        for i in 0u16..256 {
            iov.push_back(make_iovec(i, i));
            assert_eq!(iov.len(), i + 1);
        }

        iov.push_back(make_iovec(0, 0));
    }

    #[test]
    fn test_pop() {
        let mut deque = IovDeque::new().unwrap();
        assert_eq!(deque.len(), 0);
        assert!(!deque.is_full());
        assert_eq!(deque.pop_front(0), 0);

        for i in 0u16..256 {
            deque.push_back(make_iovec(i, i));
            assert_eq!(deque.len(), i + 1);
        }

        assert!(deque.is_full());
        assert!(deque.len() != 0);

        for i in 0u16..256 {
            let bytes = deque.pop_front(1);
            assert_eq!(bytes, i as usize);
        }
    }

    #[test]
    fn test_pop_many() {
        let mut deque = IovDeque::new().unwrap();

        for i in 0u16..256 {
            deque.push_back(make_iovec(i, i));
        }

        assert_eq!(deque.pop_front(1), 0);
        assert_eq!(deque.len(), 255);
        assert_eq!(deque.pop_front(2), 3);
        assert_eq!(deque.len(), 253);
        assert_eq!(deque.pop_front(4), (3..7).sum::<usize>());
        assert_eq!(deque.len(), 249);
        assert_eq!(deque.pop_front(8), (7..15).sum::<usize>());
        assert_eq!(deque.len(), 241);
        assert_eq!(deque.pop_front(16), (15..31).sum::<usize>());
        assert_eq!(deque.len(), 225);
        assert_eq!(deque.pop_front(32), (31..63).sum::<usize>());
        assert_eq!(deque.len(), 193);
        assert_eq!(deque.pop_front(64), (63..127).sum::<usize>());
        assert_eq!(deque.len(), 129);
        assert_eq!(deque.pop_front(128), (127..255).sum::<usize>());
        assert_eq!(deque.len(), 1);
    }
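
    // A sketch of a wrap-around scenario not covered by the tests above (the concrete values
    // are arbitrary): after draining half of a full deque and refilling it, the live elements
    // cross the end of the first virtual page, yet `peek` still sees them in logical order
    // thanks to the double mapping.
    #[test]
    fn test_wrap_around_stays_contiguous() {
        let mut deque = IovDeque::new().unwrap();

        // Fill the deque completely.
        for i in 0u16..256 {
            deque.push_back(make_iovec(i, i));
        }

        // Drop the first half and refill, forcing the live elements to wrap past the end of
        // the first virtual page.
        assert_eq!(deque.pop_front(128), (0..128).sum::<usize>());
        for i in 256u16..384 {
            deque.push_back(make_iovec(i, i));
        }
        assert_eq!(deque.len(), 256);

        // Logical order is preserved: position `pos` still holds the (128 + pos)-th element.
        for pos in 0u16..256 {
            assert_eq!(deque.peek(pos).unwrap(), make_iovec(128 + pos, 128 + pos));
        }
    }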

    #[test]
    fn test_peek() {
        let mut deque = IovDeque::new().unwrap();

        for i in 0u16..256 {
            deque.push_back(make_iovec(i, i));
        }

        for i in 0u16..256 {
            assert_eq!(make_iovec(i, i), deque.peek(i).unwrap());
        }

        assert!(deque.peek(256).is_none());
    }

    #[test]
    fn test_as_mut_ptr() {
        let mut deque = IovDeque::new().unwrap();
        let mut copy: Vec<libc::iovec> = vec![];
        let ptr = deque.as_mut_ptr();
        assert!(!ptr.is_null());
        assert!(ptr.is_aligned());

        for i in 0..256 {
            deque.push_back(make_iovec(i, 100));
            copy.push(deque.peek(i).unwrap());
        }

        assert_eq!(copy.len(), deque.len() as usize);
        for copied_iov in copy {
            assert_eq!(copied_iov.iov_len, deque.pop_front(1));
        }
    }
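
    // A hedged sketch of the intended consumption pattern (not part of the original test suite;
    // the pipe, buffer sizes and payload below are arbitrary): the deque hands out a contiguous
    // `iovec` array through `as_mut_ptr()` and `len()`, which can be passed straight to
    // `libc::readv` to scatter data into the described buffers.
    #[test]
    fn test_readv_with_deque_slice() {
        let mut deque = IovDeque::new().unwrap();

        // Two small destination buffers, described by iovecs pushed into the deque.
        let mut buf_a = [0u8; 4];
        let mut buf_b = [0u8; 4];
        deque.push_back(iovec {
            iov_base: buf_a.as_mut_ptr().cast(),
            iov_len: buf_a.len(),
        });
        deque.push_back(iovec {
            iov_base: buf_b.as_mut_ptr().cast(),
            iov_len: buf_b.len(),
        });

        // A pipe holding 8 bytes of data to scatter across the two buffers.
        let mut fds = [0i32; 2];
        // SAFETY: `fds` is a valid array of two file descriptors.
        assert_eq!(unsafe { libc::pipe(fds.as_mut_ptr()) }, 0);
        // SAFETY: we write a valid 8-byte buffer to the write end of the pipe.
        assert_eq!(
            unsafe { libc::write(fds[1], b"abcdefgh".as_ptr().cast(), 8) },
            8
        );

        // SAFETY: the iovecs in the deque describe live, writable buffers.
        let nread = unsafe { libc::readv(fds[0], deque.as_mut_ptr(), i32::from(deque.len())) };
        assert_eq!(nread, 8);
        assert_eq!(&buf_a, b"abcd");
        assert_eq!(&buf_b, b"efgh");

        // SAFETY: we close only the descriptors we created above.
        unsafe {
            libc::close(fds[0]);
            libc::close(fds[1]);
        }
    }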
}