Skip to content

Commit ddc56ed

Browse files
committed
block: advisory locks: use byte-range locks to match QEMU behavior
The granularity has significant implications in typical cloud deployments with network storage. The Linux kernel will sync advisory locks to network file systems, but these backends may have different policies and handle locks differently. For example, Netapp speaks a NFS API but will treat advisory OFD locks for the whole file as mandatory locks, whereas byte-range locks for the whole file will remain advisory [0]. As it is a valid use case to prevent multiple CHV instances from accessing the same disk but disk management software (e.g., Cinder in OpenStack) should be able to snapshot disks while VMs are running, we need special control over the lock granularity. Therefore, it is a valid use case to lock the whole byte range of a disk image without technically locking the whole file - to get the best of both worlds. This also brings CHVs behavior in line with QEMU [1]. Whole-file locks remain a valid use case and could be supported later. This patch only provides the necessary groundwork; making it configurable is out of scope for now. [0] https://kb.netapp.com/on-prem/ontap/da/NAS/NAS-KBs/How_is_Mandatory_Locking_supported_for_NFSv4_on_ONTAP_9 [1] <qemu>/util/osdep.c::qemu_lock_fcntl() Signed-off-by: Philipp Schuster <philipp.schuster@cyberus-technology.de> On-behalf-of: SAP philipp.schuster@sap.com
1 parent 1498fee commit ddc56ed

File tree

2 files changed

+87
-14
lines changed

2 files changed

+87
-14
lines changed

block/src/fcntl.rs

Lines changed: 58 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -101,13 +101,52 @@ impl LockState {
101101
}
102102
}
103103

/// The granularity of the advisory lock.
///
/// The granularity has significant implications in typical cloud deployments
/// with network storage. The Linux kernel will sync advisory locks to network
/// file systems, but these backends may have different policies and handle
/// locks differently. For example, Netapp speaks an NFS API but will treat
/// advisory OFD locks for the whole file as mandatory locks, whereas byte-range
/// locks for the whole file will remain advisory [0].
///
/// As it is a valid use case to prevent multiple CHV instances from accessing
/// the same disk but disk management software (e.g., Cinder in OpenStack)
/// should be able to snapshot disks while VMs are running, we need special
/// control over the lock granularity. Therefore, it is a valid use case to lock
/// the whole byte range of a disk image without technically locking the whole
/// file - to get the best of both worlds.
///
/// [0] <https://kb.netapp.com/on-prem/ontap/da/NAS/NAS-KBs/How_is_Mandatory_Locking_supported_for_NFSv4_on_ONTAP_9>
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum LockGranularity {
    /// Lock the whole file (start `0`, length `0`, i.e., until EOF).
    WholeFile,
    /// Lock the given byte range: start offset (inclusive) and length.
    ByteRange(u64 /* from, inclusive */, u64 /* len */),
}

impl LockGranularity {
    /// The value for the `l_start` field of a `flock` structure:
    /// the offset of the first byte covered by the lock.
    const fn l_start(self) -> u64 {
        match self {
            LockGranularity::WholeFile => 0,
            LockGranularity::ByteRange(start, _) => start,
        }
    }

    /// The value for the `l_len` field of a `flock` structure:
    /// the number of bytes covered by the lock, where `0` means
    /// "until EOF" per POSIX `fcntl` semantics.
    const fn l_len(self) -> u64 {
        match self {
            LockGranularity::WholeFile => 0, /* EOF */
            LockGranularity::ByteRange(_, len) => len,
        }
    }
}
142+
104143
/// Returns a [`struct@libc::flock`] structure for the whole file.
105-
const fn get_flock(lock_type: LockType) -> libc::flock {
144+
const fn get_flock(lock_type: LockType, granularity: LockGranularity) -> libc::flock {
106145
libc::flock {
107146
l_type: lock_type.to_libc_val() as libc::c_short,
108147
l_whence: libc::SEEK_SET as libc::c_short,
109-
l_start: 0,
110-
l_len: 0, /* EOF */
148+
l_start: granularity.l_start() as libc::c_long,
149+
l_len: granularity.l_len() as libc::c_long,
111150
l_pid: 0, /* filled by callee */
112151
}
113152
}
@@ -122,8 +161,13 @@ const fn get_flock(lock_type: LockType) -> libc::flock {
122161
/// - `file`: The file to acquire a lock for [`LockType`]. The file's state will
123162
/// be logically mutated, but not technically.
124163
/// - `lock_type`: The [`LockType`]
125-
pub fn try_acquire_lock<Fd: AsRawFd>(file: Fd, lock_type: LockType) -> Result<(), LockError> {
126-
let flock = get_flock(lock_type);
164+
/// - `granularity`: The [`LockGranularity`].
165+
pub fn try_acquire_lock<Fd: AsRawFd>(
166+
file: Fd,
167+
lock_type: LockType,
168+
granularity: LockGranularity,
169+
) -> Result<(), LockError> {
170+
let flock = get_flock(lock_type, granularity);
127171

128172
let res = fcntl(file.as_raw_fd(), FcntlArg::F_OFD_SETLK(&flock));
129173
match res {
@@ -146,17 +190,22 @@ pub fn try_acquire_lock<Fd: AsRawFd>(file: Fd, lock_type: LockType) -> Result<()
146190
///
147191
/// # Parameters
148192
/// - `file`: The file to clear all locks for [`LockType`].
149-
pub fn clear_lock<Fd: AsRawFd>(file: Fd) -> Result<(), LockError> {
150-
try_acquire_lock(file, LockType::Unlock)
193+
/// - `granularity`: The [`LockGranularity`].
194+
pub fn clear_lock<Fd: AsRawFd>(file: Fd, granularity: LockGranularity) -> Result<(), LockError> {
195+
try_acquire_lock(file, LockType::Unlock, granularity)
151196
}
152197

153198
/// Returns the current lock state using [`fcntl`] with respect to the given
154199
/// parameters.
155200
///
156201
/// # Parameters
157202
/// - `file`: The file for which to get the lock state.
158-
pub fn get_lock_state<Fd: AsRawFd>(file: Fd) -> Result<LockState, LockError> {
159-
let mut flock = get_flock(LockType::Write);
203+
/// - `granularity`: The [`LockGranularity`].
204+
pub fn get_lock_state<Fd: AsRawFd>(
205+
file: Fd,
206+
granularity: LockGranularity,
207+
) -> Result<LockState, LockError> {
208+
let mut flock = get_flock(LockType::Write, granularity);
160209
let res = fcntl(file.as_raw_fd(), FcntlArg::F_OFD_GETLK(&mut flock));
161210
match res {
162211
0 => {

virtio-devices/src/block.rs

Lines changed: 29 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ use std::{io, result};
1919

2020
use anyhow::anyhow;
2121
use block::async_io::{AsyncIo, AsyncIoError, DiskFile};
22-
use block::fcntl::{LockError, LockType, get_lock_state};
22+
use block::fcntl::{LockError, LockGranularity, LockType, get_lock_state};
2323
use block::{
2424
ExecuteAsync, ExecuteError, Request, RequestType, VirtioBlockConfig, build_serial, fcntl,
2525
};
@@ -778,20 +778,42 @@ impl Block {
778778
has_feature(self.features(), VIRTIO_BLK_F_RO.into())
779779
}
780780

781+
/// Returns the granularity for the advisory lock for this disk.
782+
// TODO In future, we could add a `lock_granularity=` configuration to the CLI.
783+
// For now, we stick to QEMU behavior.
784+
fn lock_granularity(&mut self) -> LockGranularity {
785+
let fallback = LockGranularity::WholeFile;
786+
787+
self.disk_image
788+
.size()
789+
.map(|size| LockGranularity::ByteRange(0, size))
790+
// use a safe fallback
791+
.unwrap_or_else(|e| {
792+
log::warn!(
793+
"Can't get disk size for id={},path={}, falling back to {:?}: error: {e}",
794+
self.id,
795+
self.disk_path.display(),
796+
fallback
797+
);
798+
fallback
799+
})
800+
}
801+
781802
/// Tries to set an advisory lock for the corresponding disk image.
782803
pub fn try_lock_image(&mut self) -> Result<()> {
783804
let lock_type = match self.read_only() {
784805
true => LockType::Read,
785806
false => LockType::Write,
786807
};
808+
let granularity = self.lock_granularity();
787809
log::debug!(
788-
"Attempting to acquire {lock_type:?} lock for disk image id={},path={}",
810+
"Attempting to acquire {lock_type:?} lock for disk image: id={},path={},granularity={granularity:?}",
789811
self.id,
790812
self.disk_path.display()
791813
);
792814
let fd = self.disk_image.fd();
793-
fcntl::try_acquire_lock(fd, lock_type).map_err(|error| {
794-
let current_lock = get_lock_state(fd);
815+
fcntl::try_acquire_lock(fd, lock_type, granularity).map_err(|error| {
816+
let current_lock = get_lock_state(fd, granularity);
795817
// Don't propagate the error to the outside, as it is not useful at all. Instead,
796818
// we try to log additional help to the user.
797819
if let Ok(current_lock) = current_lock {
@@ -815,10 +837,12 @@ impl Block {
815837

816838
/// Releases the advisory lock held for the corresponding disk image.
817839
pub fn unlock_image(&mut self) -> Result<()> {
840+
let granularity = self.lock_granularity();
841+
818842
// It is very unlikely that this fails;
819843
// Should we remove the Result to simplify the error propagation on
820844
// higher levels?
821-
fcntl::clear_lock(self.disk_image.fd()).map_err(|error| Error::LockDiskImage {
845+
fcntl::clear_lock(self.disk_image.fd(), granularity).map_err(|error| Error::LockDiskImage {
822846
path: self.disk_path.clone(),
823847
error,
824848
lock_type: LockType::Unlock,

0 commit comments

Comments
 (0)