diff --git a/resources/seccomp/aarch64-unknown-linux-musl.json b/resources/seccomp/aarch64-unknown-linux-musl.json index d81a1012599..ece2fbdbf9b 100644 --- a/resources/seccomp/aarch64-unknown-linux-musl.json +++ b/resources/seccomp/aarch64-unknown-linux-musl.json @@ -85,6 +85,10 @@ "syscall": "lseek", "comment": "Used by the block device" }, + { + "syscall": "fallocate", + "comment": "Used by the block device for discard (hole punching)" + }, { "syscall": "mremap", "comment": "Used for re-allocating large memory regions, for example vectors" diff --git a/resources/seccomp/x86_64-unknown-linux-musl.json b/resources/seccomp/x86_64-unknown-linux-musl.json index 66c986495fb..6417d54c8ce 100644 --- a/resources/seccomp/x86_64-unknown-linux-musl.json +++ b/resources/seccomp/x86_64-unknown-linux-musl.json @@ -85,6 +85,10 @@ "syscall": "lseek", "comment": "Used by the block device" }, + { + "syscall": "fallocate", + "comment": "Used by the block device for discard (hole punching)" + }, { "syscall": "mremap", "comment": "Used for re-allocating large memory regions, for example vectors" diff --git a/src/vmm/src/devices/virtio/block/virtio/device.rs b/src/vmm/src/devices/virtio/block/virtio/device.rs index ecdd8ee4f6d..74fc1a2c7ed 100644 --- a/src/vmm/src/devices/virtio/block/virtio/device.rs +++ b/src/vmm/src/devices/virtio/block/virtio/device.rs @@ -27,7 +27,7 @@ use crate::devices::virtio::block::CacheType; use crate::devices::virtio::block::virtio::metrics::{BlockDeviceMetrics, BlockMetricsPerDevice}; use crate::devices::virtio::device::{ActiveState, DeviceState, VirtioDevice}; use crate::devices::virtio::generated::virtio_blk::{ - VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_RO, VIRTIO_BLK_ID_BYTES, + VIRTIO_BLK_F_DISCARD, VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_RO, VIRTIO_BLK_ID_BYTES, }; use crate::devices::virtio::generated::virtio_config::VIRTIO_F_VERSION_1; use crate::devices::virtio::generated::virtio_ids::VIRTIO_ID_BLOCK; @@ -298,7 +298,9 @@ impl VirtioBlock { 
.map_err(VirtioBlockError::RateLimiter)? .unwrap_or_default(); - let mut avail_features = (1u64 << VIRTIO_F_VERSION_1) | (1u64 << VIRTIO_RING_F_EVENT_IDX); + let mut avail_features = (1u64 << VIRTIO_F_VERSION_1) + | (1u64 << VIRTIO_RING_F_EVENT_IDX) + | (1u64 << VIRTIO_BLK_F_DISCARD); if config.cache_type == CacheType::Writeback { avail_features |= 1u64 << VIRTIO_BLK_F_FLUSH; diff --git a/src/vmm/src/devices/virtio/block/virtio/io/async_io.rs b/src/vmm/src/devices/virtio/block/virtio/io/async_io.rs index f83d9dea1df..fa6c752b41c 100644 --- a/src/vmm/src/devices/virtio/block/virtio/io/async_io.rs +++ b/src/vmm/src/devices/virtio/block/virtio/io/async_io.rs @@ -110,7 +110,6 @@ impl AsyncFileEngine { Ok(()) } - #[cfg(test)] pub fn file(&self) -> &File { &self.file } @@ -230,6 +229,22 @@ impl AsyncFileEngine { Ok(()) } + pub fn push_discard( + &mut self, + offset: u64, + len: u32, + req: PendingRequest, + ) -> Result<(), RequestError> { + let wrapped_user_data = WrappedRequest::new(req); + + self.ring + .push(Operation::fallocate(0, len, offset, wrapped_user_data)) + .map_err(|(io_uring_error, data)| RequestError { + req: data.req, + error: AsyncIoError::IoUring(io_uring_error), + }) + } + fn do_pop(&mut self) -> Result>, AsyncIoError> { self.ring.pop().map_err(AsyncIoError::IoUring) } diff --git a/src/vmm/src/devices/virtio/block/virtio/io/mod.rs b/src/vmm/src/devices/virtio/block/virtio/io/mod.rs index b7aa8061d76..1e59c236fe4 100644 --- a/src/vmm/src/devices/virtio/block/virtio/io/mod.rs +++ b/src/vmm/src/devices/virtio/block/virtio/io/mod.rs @@ -6,9 +6,13 @@ pub mod sync_io; use std::fmt::Debug; use std::fs::File; +use std::os::unix::io::AsRawFd; + +use libc::{FALLOC_FL_KEEP_SIZE, FALLOC_FL_PUNCH_HOLE, c_int, off64_t}; pub use self::async_io::{AsyncFileEngine, AsyncIoError}; pub use self::sync_io::{SyncFileEngine, SyncIoError}; +use crate::device_manager::mmio::MMIO_LEN; use crate::devices::virtio::block::virtio::PendingRequest; use 
crate::devices::virtio::block::virtio::device::FileEngineType; use crate::vstate::memory::{GuestAddress, GuestMemoryMmap}; @@ -75,7 +79,6 @@ impl FileEngine { Ok(()) } - #[cfg(test)] pub fn file(&self) -> &File { match self { FileEngine::Async(engine) => engine.file(), @@ -172,6 +175,30 @@ impl FileEngine { FileEngine::Sync(engine) => engine.flush().map_err(BlockIoError::Sync), } } + + pub fn discard( + &mut self, + offset: u64, + count: u32, + req: PendingRequest, + ) -> Result<FileEngineOk, RequestError<BlockIoError>> { + match self { + FileEngine::Async(engine) => match engine.push_discard(offset, count, req) { + Ok(_) => Ok(FileEngineOk::Submitted), + Err(err) => Err(RequestError { + req: err.req, + error: BlockIoError::Async(err.error), + }), + }, + FileEngine::Sync(engine) => match engine.discard(offset, count) { + Ok(count) => Ok(FileEngineOk::Executed(RequestOk { req, count })), + Err(err) => Err(RequestError { + req, + error: BlockIoError::Sync(err), + }), + }, + } + } } #[cfg(test)] diff --git a/src/vmm/src/devices/virtio/block/virtio/io/sync_io.rs b/src/vmm/src/devices/virtio/block/virtio/io/sync_io.rs index eec3b3d8b8d..167f3c331f1 100644 --- a/src/vmm/src/devices/virtio/block/virtio/io/sync_io.rs +++ b/src/vmm/src/devices/virtio/block/virtio/io/sync_io.rs @@ -3,7 +3,9 @@ use std::fs::File; use std::io::{Seek, SeekFrom, Write}; +use std::os::unix::io::AsRawFd; +use libc::{FALLOC_FL_KEEP_SIZE, FALLOC_FL_PUNCH_HOLE, c_int, off64_t}; use vm_memory::{GuestMemoryError, ReadVolatile, WriteVolatile}; use crate::vstate::memory::{GuestAddress, GuestMemory, GuestMemoryMmap}; @@ -18,6 +20,8 @@ pub enum SyncIoError { SyncAll(std::io::Error), /// Transfer: {0} Transfer(GuestMemoryError), + /// Discard: {0} + Discard(std::io::Error), } #[derive(Debug)] @@ -33,7 +37,6 @@ impl SyncFileEngine { SyncFileEngine { file } } - #[cfg(test)] pub fn file(&self) -> &File { &self.file } @@ -81,4 +84,50 @@ impl SyncFileEngine { // Sync data out to physical media on host. 
self.file.sync_all().map_err(SyncIoError::SyncAll) } + + pub fn discard(&mut self, offset: u64, len: u32) -> Result<u32, SyncIoError> { + // Do checked conversion to avoid possible wrap/cast issues on 64-bit systems. + let off_i64 = i64::try_from(offset).map_err(|_| { + SyncIoError::Discard(std::io::Error::new( + std::io::ErrorKind::InvalidInput, + "offset overflow", + )) + })?; + + let len_i64: i64 = len.into(); + + // SAFETY: calling libc::fallocate is safe here because: + // - `self.file.as_raw_fd()` is a valid file descriptor owned by this struct, + // - `off_i64` and `len_i64` are validated copies of the incoming unsigned values converted + // to the C `off64_t` type, and + // - the syscall is properly checked for an error return value. + unsafe { + let ret = libc::fallocate( + self.file.as_raw_fd(), + libc::FALLOC_FL_PUNCH_HOLE | libc::FALLOC_FL_KEEP_SIZE, + off_i64, + len_i64, + ); + if ret != 0 { + return Err(SyncIoError::Discard(std::io::Error::last_os_error())); + } + } + Ok(len) + } + + pub fn fallocate( + fd: c_int, + mode: i32, + offset: off64_t, + len: off64_t, + ) -> Result<(), std::io::Error> { + // SAFETY: calling libc::fallocate is safe because we're passing plain C-compatible + // integer types (fd, mode, offset, len) and we check the integer return value. + let ret: i32 = unsafe { libc::fallocate(fd, mode, offset, len) }; + if ret == 0 { + Ok(()) + } else { + Err(std::io::Error::last_os_error()) + } + } } diff --git a/src/vmm/src/devices/virtio/block/virtio/metrics.rs b/src/vmm/src/devices/virtio/block/virtio/metrics.rs index 0abe2a58963..44491c56b2d 100644 --- a/src/vmm/src/devices/virtio/block/virtio/metrics.rs +++ b/src/vmm/src/devices/virtio/block/virtio/metrics.rs @@ -157,6 +157,8 @@ pub struct BlockDeviceMetrics { pub invalid_reqs_count: SharedIncMetric, /// Number of flushes operation triggered on this block device. pub flush_count: SharedIncMetric, + /// Number of discard operations triggered on this block device. 
+ pub discard_count: SharedIncMetric, /// Number of events triggered on the queue of this block device. pub queue_event_count: SharedIncMetric, /// Number of events ratelimiter-related. @@ -210,6 +212,7 @@ impl BlockDeviceMetrics { self.invalid_reqs_count .add(other.invalid_reqs_count.fetch_diff()); self.flush_count.add(other.flush_count.fetch_diff()); + self.discard_count.add(other.discard_count.fetch_diff()); self.queue_event_count .add(other.queue_event_count.fetch_diff()); self.rate_limiter_event_count diff --git a/src/vmm/src/devices/virtio/block/virtio/mod.rs b/src/vmm/src/devices/virtio/block/virtio/mod.rs index 9e97d6d3897..d2bc3f6dbea 100644 --- a/src/vmm/src/devices/virtio/block/virtio/mod.rs +++ b/src/vmm/src/devices/virtio/block/virtio/mod.rs @@ -63,4 +63,12 @@ pub enum VirtioBlockError { RateLimiter(std::io::Error), /// Persistence error: {0} Persist(crate::devices::virtio::persist::PersistError), + /// Sector overflow in discard segment + SectorOverflow, + /// Discard segment exceeds disk size + BeyondDiskSize, + /// Invalid flags in discard segment + InvalidDiscardFlags, + /// Invalid discard request (e.g., empty segments) + InvalidDiscardRequest, } diff --git a/src/vmm/src/devices/virtio/block/virtio/request.rs b/src/vmm/src/devices/virtio/block/virtio/request.rs index 8fc83cf43da..1131ecdb0f6 100644 --- a/src/vmm/src/devices/virtio/block/virtio/request.rs +++ b/src/vmm/src/devices/virtio/block/virtio/request.rs @@ -9,17 +9,19 @@ use std::convert::From; use vm_memory::GuestMemoryError; +use super::io::{BlockIoError, SyncIoError}; use super::{SECTOR_SHIFT, SECTOR_SIZE, VirtioBlockError, io as block_io}; use crate::devices::virtio::block::virtio::device::DiskProperties; use crate::devices::virtio::block::virtio::metrics::BlockDeviceMetrics; pub use crate::devices::virtio::generated::virtio_blk::{ VIRTIO_BLK_ID_BYTES, VIRTIO_BLK_S_IOERR, VIRTIO_BLK_S_OK, VIRTIO_BLK_S_UNSUPP, - VIRTIO_BLK_T_FLUSH, VIRTIO_BLK_T_GET_ID, VIRTIO_BLK_T_IN, VIRTIO_BLK_T_OUT, 
+ VIRTIO_BLK_T_DISCARD, VIRTIO_BLK_T_FLUSH, VIRTIO_BLK_T_GET_ID, VIRTIO_BLK_T_IN, + VIRTIO_BLK_T_OUT, }; use crate::devices::virtio::queue::DescriptorChain; use crate::logger::{IncMetric, error}; use crate::rate_limiter::{RateLimiter, TokenType}; -use crate::vstate::memory::{ByteValued, Bytes, GuestAddress, GuestMemoryMmap}; +use crate::vstate::memory::{Address, ByteValued, Bytes, GuestAddress, GuestMemoryMmap}; #[derive(Debug, derive_more::From)] pub enum IoErr { @@ -34,6 +36,7 @@ pub enum RequestType { Out, Flush, GetDeviceID, + Discard, Unsupported(u32), } @@ -44,6 +47,7 @@ impl From for RequestType { VIRTIO_BLK_T_OUT => RequestType::Out, VIRTIO_BLK_T_FLUSH => RequestType::Flush, VIRTIO_BLK_T_GET_ID => RequestType::GetDeviceID, + VIRTIO_BLK_T_DISCARD => RequestType::Discard, t => RequestType::Unsupported(t), } } @@ -176,6 +180,12 @@ impl PendingRequest { (Ok(transferred_data_len), RequestType::GetDeviceID) => { Status::from_data(self.data_len, transferred_data_len, true) } + (Ok(_), RequestType::Discard) => { + block_metrics.discard_count.inc(); + Status::Ok { + num_bytes_to_mem: 0, + } + } (_, RequestType::Unsupported(op)) => Status::Unsupported { op }, (Err(err), _) => Status::IoErr { num_bytes_to_mem: 0, @@ -231,6 +241,18 @@ impl RequestHeader { } } +// For this, please see VirtIO v1.2. This struct mimics that of the implementation in Virtio. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +#[repr(C)] +pub struct DiscardSegment { + sector: u64, + num_sectors: u32, + flags: u32, +} +// SAFETY: Safe because `DiscardSegment` only contains plain data (POD) with no padding-dependent +// invariants. 
+unsafe impl ByteValued for DiscardSegment {} + #[derive(Debug, PartialEq, Eq)] pub struct Request { pub r#type: RequestType, pub status_addr: GuestAddress, sector: u64, data_addr: GuestAddress, + discard_segments: Option<Vec<DiscardSegment>>, } impl Request { @@ -258,10 +281,11 @@ impl Request { data_addr: GuestAddress(0), data_len: 0, status_addr: GuestAddress(0), + discard_segments: None, }; let data_desc; - let status_desc; + let mut status_desc; let desc = avail_desc .next_descriptor() .ok_or(VirtioBlockError::DescriptorChainTooShort)?; @@ -272,6 +296,7 @@ impl Request { if req.r#type != RequestType::Flush { return Err(VirtioBlockError::DescriptorChainTooShort); } + data_desc = desc; } else { data_desc = desc; status_desc = data_desc @@ -287,6 +312,9 @@ impl Request { if !data_desc.is_write_only() && req.r#type == RequestType::GetDeviceID { return Err(VirtioBlockError::UnexpectedReadOnlyDescriptor); } + if data_desc.is_write_only() && req.r#type == RequestType::Discard { + return Err(VirtioBlockError::UnexpectedWriteOnlyDescriptor); + } req.data_addr = data_desc.addr; req.data_len = data_desc.len; @@ -313,6 +341,49 @@ impl Request { return Err(VirtioBlockError::InvalidDataLength); } } + RequestType::Discard => { + // Validate data length + let segment_size = std::mem::size_of::<DiscardSegment>(); + if (data_desc.len as usize) % segment_size != 0 { + return Err(VirtioBlockError::InvalidDataLength); + } + + // Calculate number of segments + let num_segments = (data_desc.len as usize) / segment_size; + if num_segments == 0 { + return Err(VirtioBlockError::InvalidDiscardRequest); + } + let mut segments = Vec::with_capacity(num_segments); + + // Populate DiscardSegments vector + for i in 0..num_segments { + let offset = i * segment_size; + let segment: DiscardSegment = mem + .read_obj(data_desc.addr.unchecked_add(offset as u64)) + .map_err(VirtioBlockError::GuestMemory)?; + if segment.flags & !0x1 != 0 { + return Err(VirtioBlockError::InvalidDiscardFlags); + } 
+ let end_sector = segment + .sector + .checked_add(segment.num_sectors as u64) + .ok_or(VirtioBlockError::SectorOverflow)?; + if end_sector > num_disk_sectors { + return Err(VirtioBlockError::BeyondDiskSize); + } + segments.push(segment); + } + + // Assign to req.discard_segments + req.discard_segments = Some(segments); + req.data_len = data_desc.len; + status_desc = data_desc + .next_descriptor() + .ok_or(VirtioBlockError::DescriptorChainTooShort)?; + if !status_desc.is_write_only() { + return Err(VirtioBlockError::UnexpectedReadOnlyDescriptor); + } + } _ => {} } @@ -390,6 +461,11 @@ impl Request { .map_err(IoErr::GetId); return ProcessingResult::Executed(pending.finish(mem, res, block_metrics)); } + RequestType::Discard => { + let _metric = block_metrics.write_agg.record_latency_metrics(); + disk.file_engine + .discard(self.offset(), self.data_len, pending) + } RequestType::Unsupported(_) => { return ProcessingResult::Executed(pending.finish(mem, Ok(0), block_metrics)); } @@ -731,6 +807,7 @@ mod tests { RequestType::Out => VIRTIO_BLK_T_OUT, RequestType::Flush => VIRTIO_BLK_T_FLUSH, RequestType::GetDeviceID => VIRTIO_BLK_T_GET_ID, + RequestType::Discard => VIRTIO_BLK_T_DISCARD, RequestType::Unsupported(id) => id, } } @@ -743,6 +820,7 @@ mod tests { RequestType::Out => VIRTQ_DESC_F_NEXT, RequestType::Flush => VIRTQ_DESC_F_NEXT, RequestType::GetDeviceID => VIRTQ_DESC_F_NEXT | VIRTQ_DESC_F_WRITE, + RequestType::Discard => VIRTQ_DESC_F_NEXT, RequestType::Unsupported(_) => VIRTQ_DESC_F_NEXT, } } @@ -833,6 +911,7 @@ mod tests { data_len: valid_data_len, status_addr, sector: sector & (NUM_DISK_SECTORS - sectors_len), + discard_segments: None, data_addr, }; let mut request_header = RequestHeader::new(virtio_request_id, request.sector); diff --git a/src/vmm/src/io_uring/operation/mod.rs b/src/vmm/src/io_uring/operation/mod.rs index ead96df5ba5..fabdbcdc47e 100644 --- a/src/vmm/src/io_uring/operation/mod.rs +++ b/src/vmm/src/io_uring/operation/mod.rs @@ -29,6 +29,8 @@ pub 
enum OpCode { Write = io_uring_op::IORING_OP_WRITE as u8, /// Fsync operation. Fsync = io_uring_op::IORING_OP_FSYNC as u8, + /// Fallocate operation. + Fallocate = io_uring_op::IORING_OP_FALLOCATE as u8, } // Useful for outputting errors. @@ -38,6 +40,7 @@ impl From<OpCode> for &'static str { OpCode::Read => "read", OpCode::Write => "write", OpCode::Fsync => "fsync", + OpCode::Fallocate => "fallocate", } } } @@ -113,6 +116,19 @@ impl Operation { } } + /// Construct a fallocate operation. + pub fn fallocate(fd: FixedFd, len: u32, offset: u64, user_data: T) -> Self { + Self { + fd, + opcode: OpCode::Fallocate, + addr: None, + len: Some(len), + flags: 0, + offset: Some(offset), + user_data, + } + } + pub(crate) fn fd(&self) -> FixedFd { self.fd } diff --git a/tests/integration_tests/functional/test_discard.py b/tests/integration_tests/functional/test_discard.py new file mode 100644 index 00000000000..96667aa8b14 --- /dev/null +++ b/tests/integration_tests/functional/test_discard.py @@ -0,0 +1,44 @@ +# Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 +"""Integration test for discard (VIRTIO_BLK_T_DISCARD).""" + +import os + +import host_tools.drive as drive_tools + + +def test_discard_support(uvm_plain_rw): + """ + Test the VIRTIO_BLK_T_DISCARD feature for valid and invalid requests. 
+ """ + + vm = uvm_plain_rw + vm.spawn() + vm.basic_config() + vm.add_net_iface() + + fs1 = drive_tools.FilesystemFile(os.path.join(vm.fsfiles, "test_disk"), size=128) + vm.add_drive( + drive_id="test_disk", + path_on_host=fs1.path, + is_root_device=False, + is_read_only=False, + ) + + vm.start() + + exit_code, stdout, _ = vm.ssh.run("cat /sys/block/vda/queue/discard_max_bytes") + + assert exit_code == 0, "Failed to read discard_max_bytes" + assert int(stdout.strip()) > 0, f"discard_max_bytes is 0: {stdout}" + + vm.ssh.run("mount -o remount,discard /") + exit_code, stdout, _ = vm.ssh.run("mount | grep /dev/vda") + assert ( + exit_code == 0 + ), f"Failed to remount root filesystem with discard option: {stdout}" + + exit_code, stdout, _ = vm.ssh.run("fstrim -v /") + assert exit_code == 0, f"fstrim -v failed: {stdout}" + assert "trimmed" in stdout, f"Unexpected fstrim output: {stdout}" + vm.kill()