Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 58 additions & 9 deletions block/src/fcntl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -101,13 +101,52 @@ impl LockState {
}
}

/// The granularity of the advisory lock.
///
/// The granularity has significant implications in typical cloud deployments
/// with network storage. The Linux kernel will sync advisory locks to network
/// file systems, but these backends may have different policies and handle
/// locks differently. For example, Netapp speaks a NFS API but will treat
/// advisory OFD locks for the whole file as mandatory locks, whereas byte-range
/// locks for the whole file will remain advisory [0].
///
/// As it is a valid use case to prevent multiple CHV instances from accessing
/// the same disk but disk management software (e.g., Cinder in OpenStack)
/// should be able to snapshot disks while VMs are running, we need special
/// control over the lock granularity. Therefore, it is a valid use case to lock
/// the whole byte range of a disk image without technically locking the whole
/// file - to get the best of both worlds.
///
/// [0]: https://kb.netapp.com/on-prem/ontap/da/NAS/NAS-KBs/How_is_Mandatory_Locking_supported_for_NFSv4_on_ONTAP_9
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum LockGranularity {
    /// Lock the whole file, i.e. from offset `0` with a length of `0`, which
    /// the lock API interprets as "until EOF".
    WholeFile,
    /// Lock an explicit byte range.
    ///
    /// The first field is the start offset (inclusive), the second field is
    /// the length of the range in bytes.
    ByteRange(u64 /* from, inclusive */, u64 /* len */),
}

impl LockGranularity {
const fn l_start(self) -> u64 {
match self {
LockGranularity::WholeFile => 0,
LockGranularity::ByteRange(start, _) => start,
}
}

const fn l_len(self) -> u64 {
match self {
LockGranularity::WholeFile => 0, /* EOF */
LockGranularity::ByteRange(_, len) => len,
}
}
}

/// Returns a [`struct@libc::flock`] structure for the given lock type and
/// lock granularity.
///
/// The returned structure is always relative to the start of the file
/// (`SEEK_SET`); `l_start` and `l_len` are taken from the granularity.
///
/// # Parameters
/// - `lock_type`: The [`LockType`] to encode in `l_type`.
/// - `granularity`: The [`LockGranularity`] that determines the locked range.
const fn get_flock(lock_type: LockType) -> libc::flock {
const fn get_flock(lock_type: LockType, granularity: LockGranularity) -> libc::flock {
libc::flock {
l_type: lock_type.to_libc_val() as libc::c_short,
l_whence: libc::SEEK_SET as libc::c_short,
l_start: 0,
l_len: 0, /* EOF */
// NOTE(review): `as` performs no range check here; a `u64` that does not
// fit into `c_long` is silently wrapped/truncated.
l_start: granularity.l_start() as libc::c_long,
l_len: granularity.l_len() as libc::c_long,
l_pid: 0, /* filled by callee */
}
}
Expand All @@ -122,8 +161,13 @@ const fn get_flock(lock_type: LockType) -> libc::flock {
/// - `file`: The file to acquire a lock for [`LockType`]. The file's state will
/// be logically mutated, but not technically.
/// - `lock_type`: The [`LockType`]
pub fn try_acquire_lock<Fd: AsRawFd>(file: Fd, lock_type: LockType) -> Result<(), LockError> {
let flock = get_flock(lock_type);
/// - `granularity`: The [`LockGranularity`].
pub fn try_acquire_lock<Fd: AsRawFd>(
file: Fd,
lock_type: LockType,
granularity: LockGranularity,
) -> Result<(), LockError> {
let flock = get_flock(lock_type, granularity);

let res = fcntl(file.as_raw_fd(), FcntlArg::F_OFD_SETLK(&flock));
match res {
Expand All @@ -146,17 +190,22 @@ pub fn try_acquire_lock<Fd: AsRawFd>(file: Fd, lock_type: LockType) -> Result<()
///
/// # Parameters
/// - `file`: The file to clear all locks for [`LockType`].
pub fn clear_lock<Fd: AsRawFd>(file: Fd) -> Result<(), LockError> {
try_acquire_lock(file, LockType::Unlock)
/// - `granularity`: The [`LockGranularity`].
pub fn clear_lock<Fd: AsRawFd>(file: Fd, granularity: LockGranularity) -> Result<(), LockError> {
try_acquire_lock(file, LockType::Unlock, granularity)
}

/// Returns the current lock state using [`fcntl`] with respect to the given
/// parameters.
///
/// # Parameters
/// - `file`: The file for which to get the lock state.
pub fn get_lock_state<Fd: AsRawFd>(file: Fd) -> Result<LockState, LockError> {
let mut flock = get_flock(LockType::Write);
/// - `granularity`: The [`LockGranularity`].
pub fn get_lock_state<Fd: AsRawFd>(
file: Fd,
granularity: LockGranularity,
) -> Result<LockState, LockError> {
let mut flock = get_flock(LockType::Write, granularity);
let res = fcntl(file.as_raw_fd(), FcntlArg::F_OFD_GETLK(&mut flock));
match res {
0 => {
Expand Down
34 changes: 29 additions & 5 deletions virtio-devices/src/block.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ use std::{io, result};

use anyhow::anyhow;
use block::async_io::{AsyncIo, AsyncIoError, DiskFile};
use block::fcntl::{LockError, LockType, get_lock_state};
use block::fcntl::{LockError, LockGranularity, LockType, get_lock_state};
use block::{
ExecuteAsync, ExecuteError, Request, RequestType, VirtioBlockConfig, build_serial, fcntl,
};
Expand Down Expand Up @@ -778,20 +778,42 @@ impl Block {
has_feature(self.features(), VIRTIO_BLK_F_RO.into())
}

/// Determines which [`LockGranularity`] is used for the advisory lock of
/// this disk.
///
/// If the size of the disk image can be queried, the byte range
/// `[0, size)` is locked. Otherwise a warning is logged and the whole file
/// is locked instead.
// TODO In future, we could add a `lock_granularity=` configuration to the CLI.
// For now, we stick to QEMU behavior.
fn lock_granularity(&mut self) -> LockGranularity {
    match self.disk_image.size() {
        Ok(size) => LockGranularity::ByteRange(0, size),
        Err(e) => {
            // use a safe fallback
            let fallback = LockGranularity::WholeFile;
            log::warn!(
                "Can't get disk size for id={},path={}, falling back to {:?}: error: {e}",
                self.id,
                self.disk_path.display(),
                fallback
            );
            fallback
        }
    }
}

/// Tries to set an advisory lock for the corresponding disk image.
pub fn try_lock_image(&mut self) -> Result<()> {
let lock_type = match self.read_only() {
true => LockType::Read,
false => LockType::Write,
};
let granularity = self.lock_granularity();
log::debug!(
"Attempting to acquire {lock_type:?} lock for disk image id={},path={}",
"Attempting to acquire {lock_type:?} lock for disk image: id={},path={},granularity={granularity:?}",
self.id,
self.disk_path.display()
);
let fd = self.disk_image.fd();
fcntl::try_acquire_lock(fd, lock_type).map_err(|error| {
let current_lock = get_lock_state(fd);
fcntl::try_acquire_lock(fd, lock_type, granularity).map_err(|error| {
let current_lock = get_lock_state(fd, granularity);
// Don't propagate the error to the outside, as it is not useful at all. Instead,
// we try to log additional help to the user.
if let Ok(current_lock) = current_lock {
Expand All @@ -815,10 +837,12 @@ impl Block {

/// Releases the advisory lock held for the corresponding disk image.
pub fn unlock_image(&mut self) -> Result<()> {
let granularity = self.lock_granularity();

// It is very unlikely that this fails;
// Should we remove the Result to simplify the error propagation on
// higher levels?
fcntl::clear_lock(self.disk_image.fd()).map_err(|error| Error::LockDiskImage {
fcntl::clear_lock(self.disk_image.fd(), granularity).map_err(|error| Error::LockDiskImage {
path: self.disk_path.clone(),
error,
lock_type: LockType::Unlock,
Expand Down
Loading