Skip to content

Commit 91cd104

Browse files
committed
feat(net): add MRG_RXBUF support to virtio-net device
Now the virtio-net device supports the VIRTIO_NET_F_MRG_RXBUF feature, which allows it to write a single packet into multiple descriptor chains. The number of descriptor chains (also known as heads) is written into the `virtio_net_hdr_v1` structure, which is located at the very beginning of the packet. Signed-off-by: Egor Lazarchuk <[email protected]>
1 parent 74fbe4e commit 91cd104

File tree

3 files changed

+141
-79
lines changed

3 files changed

+141
-79
lines changed

src/vmm/src/devices/virtio/iovec.rs

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -367,10 +367,15 @@ impl IoVecBufferMut {
367367
/// In contrast to the equivalent [`IoVecBuffer::len()`] which returns `u32`, this one returns
368368
/// `usize` since the buffer can contain multiple `DescriptorChain` objects, so we don't have
369369
/// the limit that the length of a buffer is limited by `u32`.
370-
pub(crate) fn len(&self) -> usize {
370+
pub fn len(&self) -> usize {
371371
self.len
372372
}
373373

374+
/// Returns true if the buffer is empty.
375+
pub fn is_empty(&self) -> bool {
376+
self.len == 0
377+
}
378+
374379
/// Returns a pointer to the memory keeping the `iovec` structs
375380
pub fn as_iovec_mut_slice(&mut self) -> &mut [iovec] {
376381
self.vecs.as_mut_slice()

src/vmm/src/devices/virtio/net/device.rs

Lines changed: 131 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,12 @@
66
// found in the THIRD-PARTY file.
77

88
use std::collections::VecDeque;
9-
use std::mem;
9+
use std::mem::{self, offset_of};
1010
use std::net::Ipv4Addr;
11+
use std::num::Wrapping;
1112
use std::sync::{Arc, Mutex};
1213

13-
use libc::EAGAIN;
14+
use libc::{iovec, EAGAIN};
1415
use log::error;
1516
use vmm_sys_util::eventfd::EventFd;
1617

@@ -19,7 +20,7 @@ use crate::devices::virtio::gen::virtio_blk::VIRTIO_F_VERSION_1;
1920
use crate::devices::virtio::gen::virtio_net::{
2021
virtio_net_hdr_v1, VIRTIO_NET_F_CSUM, VIRTIO_NET_F_GUEST_CSUM, VIRTIO_NET_F_GUEST_TSO4,
2122
VIRTIO_NET_F_GUEST_TSO6, VIRTIO_NET_F_GUEST_UFO, VIRTIO_NET_F_HOST_TSO4,
22-
VIRTIO_NET_F_HOST_TSO6, VIRTIO_NET_F_HOST_UFO, VIRTIO_NET_F_MAC,
23+
VIRTIO_NET_F_HOST_TSO6, VIRTIO_NET_F_HOST_UFO, VIRTIO_NET_F_MAC, VIRTIO_NET_F_MRG_RXBUF,
2324
};
2425
use crate::devices::virtio::gen::virtio_ring::VIRTIO_RING_F_EVENT_IDX;
2526
use crate::devices::virtio::iovec::{
@@ -108,7 +109,8 @@ pub struct RxBuffers {
108109
// A map of which part of the memory belongs to which `DescriptorChain` object
109110
pub parsed_descriptors: VecDeque<ParsedDescriptorChain>,
110111
// Buffers that we have used and they are ready to be given back to the guest.
111-
pub used_descriptor: Option<ParsedDescriptorChain>,
112+
pub used_descriptors: u16,
113+
pub used_bytes: u32,
112114
}
113115

114116
impl RxBuffers {
@@ -118,7 +120,8 @@ impl RxBuffers {
118120
min_buffer_size: 0,
119121
iovec: IoVecBufferMut::new()?,
120122
parsed_descriptors: VecDeque::with_capacity(FIRECRACKER_MAX_QUEUE_SIZE.into()),
121-
used_descriptor: None,
123+
used_descriptors: 0,
124+
used_bytes: 0,
122125
})
123126
}
124127

@@ -141,75 +144,114 @@ impl RxBuffers {
141144
Ok(())
142145
}
143146

144-
/// Returns the number of available `iovec` objects.
147+
/// Returns the total size of available space in the buffer.
145148
#[inline(always)]
146-
fn len(&self) -> usize {
149+
fn capacity(&self) -> usize {
147150
self.iovec.len()
148151
}
149152

150-
/// Returns `true` if there aren't any available `iovec` objects.
151-
#[inline(always)]
152-
fn is_empty(&self) -> bool {
153-
self.len() == 0
154-
}
155-
156153
/// Mark the first `size` bytes of available memory as used.
157154
///
158155
/// # Safety:
159156
///
160157
/// * The `RxBuffers` should include at least one parsed `DescriptorChain`.
161158
/// * `size` needs to be smaller or equal to total length of the first `DescriptorChain` stored
162159
/// in the `RxBuffers`.
163-
unsafe fn mark_used(&mut self, size: u32) {
160+
unsafe fn mark_used(&mut self, mut bytes_written: u32, rx_queue: &mut Queue) {
164161
// Since we were able to write a frame in guest memory, we should have at least one
165162
// descriptor chain here. If not, we have a bug, so fail fast, since the device is
166163
// fundamentally broken.
167-
let mut parsed_dc = self.parsed_descriptors.pop_front().expect(
168-
"net: internal bug. Mismatch between written frame size and available descriptors",
164+
debug_assert!(!self.iovec.is_empty());
165+
let header_ptr: *mut virtio_net_hdr_v1 = self.iovec.vecs.as_mut_slice()[0].iov_base.cast();
166+
let header_buff_len = self.iovec.vecs.as_mut_slice()[0].iov_len;
167+
assert!(
168+
vnet_hdr_len() <= header_buff_len,
169+
"Network buffer should be big enough for virtio_net_hdr_v1 object"
169170
);
170171

171-
self.header_set_num_buffers(1);
172-
self.iovec.drop_descriptor_chain(&parsed_dc);
173-
parsed_dc.length = size;
174-
self.used_descriptor = Some(parsed_dc);
172+
self.used_bytes = bytes_written;
173+
174+
let mut used_heads: u16 = 0;
175+
let mut write_used = |head_index: u16, bytes_written: u32, rx_queue: &mut Queue| {
176+
if let Err(err) = rx_queue.write_used_element(
177+
(rx_queue.next_used + Wrapping(self.used_descriptors)).0,
178+
head_index,
179+
bytes_written,
180+
) {
181+
error!(
182+
"net: Failed to add used descriptor {} of length {} to RX queue: {err}",
183+
head_index, bytes_written
184+
);
185+
}
186+
used_heads += 1;
187+
self.used_descriptors += 1;
188+
};
189+
190+
loop {
191+
let parsed_dc = self
192+
.parsed_descriptors
193+
.pop_front()
194+
.expect("This should never happen if write to the buffer succeded.");
195+
self.iovec.drop_descriptor_chain(&parsed_dc);
196+
197+
if bytes_written <= parsed_dc.length {
198+
write_used(parsed_dc.head_index, bytes_written, rx_queue);
199+
break;
200+
} else {
201+
write_used(parsed_dc.head_index, parsed_dc.length, rx_queue);
202+
bytes_written -= parsed_dc.length;
203+
}
204+
}
205+
// SAFETY: The user space pointer was verified at the point of creation and
206+
// we verified the alignment and header buffer size.
207+
unsafe {
208+
Self::header_set_num_buffers(header_ptr, used_heads);
209+
}
175210
}
176211

177-
/// Write the number of descriptors used in VirtIO header
178-
fn header_set_num_buffers(&mut self, nr_descs: u16) {
179-
// We can unwrap here, because we have checked before that the `IoVecBufferMut` holds at
180-
// least one buffer with the proper size, depending on the feature negotiation. In any
181-
// case, the buffer holds memory of at least `std::mem::size_of::<virtio_net_hdr_v1>()`
182-
// bytes.
183-
self.iovec
184-
.write_all_volatile_at(
185-
&nr_descs.to_le_bytes(),
186-
std::mem::offset_of!(virtio_net_hdr_v1, num_buffers),
187-
)
188-
.unwrap()
212+
/// Writes number of buffers to the [`num_buffers`] field of a virtio_net_hdr_v1 struct
213+
/// pointed by the [`ptr`].
214+
///
215+
/// # Safety
216+
/// Memory area needs to be big enough for virtio_net_hdr_v1 to fit.
217+
unsafe fn header_set_num_buffers(ptr: *mut virtio_net_hdr_v1, num_buffers: u16) {
218+
debug_assert!(
219+
ptr.is_aligned(),
220+
"Pointer should have at least 0x2 aligment"
221+
);
222+
223+
let ptr: *mut u8 = ptr.cast();
224+
let ptr = ptr.add(offset_of!(virtio_net_hdr_v1, num_buffers));
225+
let bytes = num_buffers.to_le_bytes();
226+
let ptr: *mut [u8; 2] = ptr.cast();
227+
ptr.write_volatile(bytes);
189228
}
190229

191230
/// This will let the guest know about all the `DescriptorChain` objects that have been
192231
/// used to receive a frame from the TAP.
193232
fn finish_frame(&mut self, rx_queue: &mut Queue) {
194-
if let Some(used_dc) = self.used_descriptor.take() {
195-
// It is fine to `.unrap()` here. The only reason why `add_used` can fail is if the
196-
// `head_index` is not a valid descriptor id. `head_index` here is a valid
197-
// `DescriptorChain` index. We got it from `queue.pop_or_enable_notification()` which
198-
// checks for its validity. In other words, if this unwrap() fails there's a bug in our
199-
// emulation logic which, most likely, we can't recover from. So, let's crash here
200-
// instead of logging an error and continuing.
201-
rx_queue
202-
.add_used(used_dc.head_index, used_dc.length)
203-
.unwrap();
204-
}
233+
rx_queue.advance_used_ring(self.used_descriptors);
234+
self.used_descriptors = 0;
235+
self.used_bytes = 0;
205236
}
206237

207238
/// Returns the number of bytes that have been used from the buffer
208239
fn used_bytes(&self) -> u32 {
209-
match self.used_descriptor {
210-
Some(ref dc) => dc.length,
211-
None => 0,
212-
}
240+
self.used_bytes
241+
}
242+
243+
/// Return a slice of iovecs for the first descriptor chain in the buffer.
244+
///
245+
/// # Safety
246+
/// Buffer needs to have at least one descriptor chain parsed.
247+
unsafe fn single_chain_slice_mut(&mut self) -> &mut [iovec] {
248+
let nr_iovecs = self.parsed_descriptors[0].nr_iovecs as usize;
249+
&mut self.iovec.as_iovec_mut_slice()[..nr_iovecs]
250+
}
251+
252+
/// Return a slice of iovecs for all descriptor chains in the buffer.
253+
fn all_chains_slice_mut(&mut self) -> &mut [iovec] {
254+
self.iovec.as_iovec_mut_slice()
213255
}
214256
}
215257

@@ -272,6 +314,7 @@ impl Net {
272314
| 1 << VIRTIO_NET_F_HOST_TSO6
273315
| 1 << VIRTIO_NET_F_HOST_UFO
274316
| 1 << VIRTIO_F_VERSION_1
317+
| 1 << VIRTIO_NET_F_MRG_RXBUF
275318
| 1 << VIRTIO_RING_F_EVENT_IDX;
276319

277320
let mut config_space = ConfigSpace::default();
@@ -433,13 +476,21 @@ impl Net {
433476
/// Returns the minimum size of buffer we expect the guest to provide us depending on the
434477
/// features we have negotiated with it
435478
fn minimum_rx_buffer_size(&self) -> u32 {
436-
if self.has_feature(VIRTIO_NET_F_GUEST_TSO4 as u64)
437-
|| self.has_feature(VIRTIO_NET_F_GUEST_TSO6 as u64)
438-
|| self.has_feature(VIRTIO_NET_F_GUEST_UFO as u64)
439-
{
440-
65562
479+
if !self.has_feature(VIRTIO_NET_F_MRG_RXBUF as u64) {
480+
if self.has_feature(VIRTIO_NET_F_GUEST_TSO4 as u64)
481+
|| self.has_feature(VIRTIO_NET_F_GUEST_TSO6 as u64)
482+
|| self.has_feature(VIRTIO_NET_F_GUEST_UFO as u64)
483+
{
484+
65562
485+
} else {
486+
1526
487+
}
441488
} else {
442-
1526
489+
// header is 12 bytes long
490+
#[allow(clippy::cast_possible_truncation)]
491+
{
492+
vnet_hdr_len() as u32
493+
}
443494
}
444495
}
445496

@@ -454,6 +505,9 @@ impl Net {
454505
if let Err(err) = unsafe { self.rx_buffer.add_buffer(mem, head) } {
455506
self.metrics.rx_fails.inc();
456507
error!("net: Could not parse an RX descriptor: {err}");
508+
// Notify queue about ready frames. We need this
509+
// to bring queue into up to date state.
510+
self.rx_buffer.finish_frame(queue);
457511
// Try to add the bad descriptor to the used ring.
458512
if let Err(err) = queue.add_used(index, 0) {
459513
error!(
@@ -541,15 +595,14 @@ impl Net {
541595

542596
// We currently prioritize packets from the MMDS over regular network packets.
543597
fn read_from_mmds_or_tap(&mut self) -> Result<usize, NetError> {
544-
// If we don't have any buffers available try to parse more from the RX queue. There might
545-
// be some buffers we didn't get the chance to process, because we got to handle the TAP
546-
// event before the RX queue event.
547-
if self.rx_buffer.is_empty() {
598+
// We only want to read from TAP (or mmds) if we have at least 64K of available capacity as
599+
// this is the max size of 1 packet.
600+
if self.rx_buffer.capacity() < u16::MAX as usize {
548601
self.parse_rx_descriptors();
549602

550-
// If after parsing the RX queue we still don't have any buffers stop processing RX
603+
// If after parsing the RX queue we still don't have enough capacity, stop processing RX
551604
// frames.
552-
if self.rx_buffer.is_empty() {
605+
if self.rx_buffer.capacity() < u16::MAX as usize {
553606
return Ok(0);
554607
}
555608
}
@@ -570,8 +623,10 @@ impl Net {
570623
// * `rx_frame_buf` has size of `MAX_BUFFER_SIZE` and all `DescriptorChain` objects
571624
// are at least that big.
572625
unsafe {
573-
self.rx_buffer
574-
.mark_used((vnet_hdr_len() + len).try_into().unwrap());
626+
self.rx_buffer.mark_used(
627+
(vnet_hdr_len() + len).try_into().unwrap(),
628+
&mut self.queues[RX_INDEX],
629+
);
575630
}
576631
return Ok(vnet_hdr_len() + len);
577632
}
@@ -586,7 +641,8 @@ impl Net {
586641
// * `read_tap` passes the first `DescriptorChain` to `readv` so we can't have read more
587642
// bytes than its capacity.
588643
unsafe {
589-
self.rx_buffer.mark_used(len.try_into().unwrap());
644+
self.rx_buffer
645+
.mark_used(len.try_into().unwrap(), &mut self.queues[RX_INDEX]);
590646
}
591647
Ok(len)
592648
}
@@ -630,13 +686,15 @@ impl Net {
630686
}
631687

632688
fn has_deferred_frame(&self) -> bool {
633-
self.rx_buffer.used_descriptor.is_some()
689+
self.rx_buffer.used_descriptors != 0
634690
}
635691

636692
// Process the deferred frame first, then continue reading from tap.
637693
fn handle_deferred_frame(&mut self) -> Result<(), DeviceError> {
638694
let used_bytes = self.rx_buffer.used_bytes();
639695
if self.rate_limited_rx_single_frame(used_bytes as usize) {
696+
// Finish with the rate-limited packet.
697+
self.rx_buffer.finish_frame(&mut self.queues[RX_INDEX]);
640698
// process_rx() was interrupted possibly before consuming all
641699
// packets in the tap; try continuing now.
642700
return self.process_rx();
@@ -797,9 +855,12 @@ impl Net {
797855
///
798856
/// `self.rx_buffer` needs to have at least one descriptor chain parsed
799857
pub unsafe fn read_tap(&mut self) -> std::io::Result<usize> {
800-
let nr_iovecs = self.rx_buffer.parsed_descriptors[0].nr_iovecs as usize;
801-
self.tap
802-
.read_iovec(&mut self.rx_buffer.iovec.as_iovec_mut_slice()[..nr_iovecs])
858+
if self.has_feature(VIRTIO_NET_F_MRG_RXBUF as u64) {
859+
self.tap.read_iovec(self.rx_buffer.all_chains_slice_mut())
860+
} else {
861+
// SAFETY: we only call this if `rx_buffer` is not empty.
862+
unsafe { self.tap.read_iovec(self.rx_buffer.single_chain_slice_mut()) }
863+
}
803864
}
804865

805866
fn write_tap(tap: &mut Tap, buf: &IoVecBuffer) -> std::io::Result<usize> {
@@ -1846,11 +1907,8 @@ pub mod tests {
18461907
unsafe { libc::close(th.net.lock().unwrap().tap.as_raw_fd()) };
18471908

18481909
// The RX queue is empty and there is a deferred frame.
1849-
th.net().rx_buffer.used_descriptor = Some(ParsedDescriptorChain {
1850-
head_index: 1,
1851-
length: 100,
1852-
nr_iovecs: 1,
1853-
});
1910+
th.net().rx_buffer.used_descriptors = 1;
1911+
th.net().rx_buffer.used_bytes = 100;
18541912
check_metric_after_block!(
18551913
th.net().metrics.no_rx_avail_buffer,
18561914
1,
@@ -1860,7 +1918,8 @@ pub mod tests {
18601918
// We need to set this here to false, otherwise the device will try to
18611919
// handle a deferred frame, it will fail and will never try to read from
18621920
// the tap.
1863-
th.net().rx_buffer.used_descriptor = None;
1921+
th.net().rx_buffer.used_descriptors = 0;
1922+
th.net().rx_buffer.used_bytes = 0;
18641923

18651924
th.add_desc_chain(NetQueue::Rx, 0, &[(0, 4096, VIRTQ_DESC_F_WRITE)]);
18661925
check_metric_after_block!(

0 commit comments

Comments
 (0)