From 66a6cec5d6d00911d79afc8440e467244b57249f Mon Sep 17 00:00:00 2001
From: iximeow
Date: Sun, 28 Dec 2025 02:24:47 +0000
Subject: [PATCH] Add non-Arc-clone'ing variant of access()

`access()` typically implies an `Arc::clone` of the guarded resource.
When that resource is access()'d frequently, especially across many
cores, contention on incrementing and decrementing the shared refcount
can become the primary bottleneck for Propolis. This is particularly
visible in `nvme`, where queues live in guest memory, are mediated by
`MemCtx`, are managed across dozens (hundreds in some cases!) of
threads, and see millions of items read from and written to them per
second. On large processors a contended increment can take hundreds of
nanoseconds, or thousands of cycles!

----

This patch is a less comprehensive improvement to the problem than I'd
hoped for, but it *is* a substantial enough improvement to include for
now.

The first try at this change set out to add an alternative to
accessors::Guard, call it accessors::View, which would have a
refresh() analogous to Accessor::access(). View was expected to hold a
reference on a Node, and View::refresh() was expected to go through
the same "check res_leaf or else re-derive the leaf resource" steps as
guard().

After getting through the lock_tree() changes to keep a guard for a
node when refreshing the leaf, it became clear that in most cases
where we'd have an accessors::View, we would *not* have that view via
a mutable reference. accessors::View's cached reference would need to
be behind a Mutex for interior mutability, at which point a "View" is
not much different from any other "Node" in the accessor tree.

So, instead, this implements the less aspirational change of adding an
"access(), but return the guard instead of cloning the reference".
This is strictly less work than before, but still implies going out to
`mutex_lock` and such for something that, in the happy case, should be
nothing more than a load. Even so, the mutexes in question are
per-Node and generally uncontended, so this is hard to spot even under
load.
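For illustration, a caller-side sketch of the intended difference; the
caller shape and `work` closure are hypothetical, while `Accessor`,
`AccessedResource`, `access()`, `access_borrow()`, and `view()` are
the items touched by this patch:

    fn hot_path<T: AccessedResource>(
        acc: &Accessor<T>,
        work: impl Fn(&T::Target),
    ) {
        // Before: every access() clones the Arc'd resource, so the
        // shared refcount cacheline bounces between cores under load.
        if let Some(guard) = acc.access() {
            work(&guard);
        }

        // After: access_borrow() lends out the node's cached reference
        // while holding the per-Node mutex. Keep the LockedView
        // short-lived; the node stays locked until it is dropped.
        if let Some(view) = acc.access_borrow() {
            work(view.view());
        }
    }

The point is that the second form replaces refcount traffic on a
shared cacheline with a briefly-held, rarely-contended per-Node lock.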
----

In the limit, an accessor tree that supports disabling access to
subtrees is already up against the issue that `access()`'d `Guard`s
don't synchronize with operations on the resource. A PCI bus can
"deny" access to memory, but a device or its queues may still hold a
Guard permitting access to memory after the fact, and use it.

It seems theoretically possible to lean into accessed resources being
an `Arc` and have nodes hold an `AtomicUsize` of that pointer,
updating it as appropriate while managing the tree. Then nodes, and
the tree generally, could have generation counters, where on access
the caller need only compare generations to check that the cached
access is still valid. To deny access to a (sub)tree, the relevant
`Node`s could be modified, the tree's generation bumped, and the
access-denier could wait for the reference count on the old tree to
drop to zero. Critically, comparing generation numbers requires only
loads, so cores don't fight over writes, and in the typical case the
generation number won't change, so a view's reference can remain
counted until a generation change is finally witnessed.

I *think* the main difficulty with this approach is that it requires
components holding a long-lived reference on the resource to regularly
"check in" that the reference is still valid, and it would be very
easy to accidentally sleep while holding a reference. This is the same
general problem as blocking a writer because you've slept holding a
read lock.
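To make the shape of that idea concrete, a rough sketch; none of these
names exist in this patch (NodeState, rederive(), and CachedView are
invented here), and this is not what the change below implements:

    use std::sync::atomic::{AtomicU64, Ordering};
    use std::sync::{Arc, Mutex};

    // Invented stand-in for a node that publishes a generation number
    // alongside the currently-permitted resource.
    struct NodeState<T> {
        generation: AtomicU64,
        current: Mutex<Option<Arc<T>>>,
    }

    impl<T> NodeState<T> {
        // Slow path: take the lock and re-derive the reference.
        fn rederive(&self) -> Option<Arc<T>> {
            self.current.lock().unwrap().clone()
        }
    }

    // Long-lived, caller-owned cache of the derived resource.
    struct CachedView<T> {
        cached: Option<Arc<T>>,
        seen_gen: u64,
    }

    impl<T> CachedView<T> {
        // Fast path is one atomic load plus a compare; the shared Arc
        // refcount only moves when the generation actually changes.
        fn get(&mut self, node: &NodeState<T>) -> Option<&T> {
            let now = node.generation.load(Ordering::Acquire);
            if now != self.seen_gen || self.cached.is_none() {
                self.cached = node.rederive();
                self.seen_gen = now;
            }
            self.cached.as_deref()
        }
    }

An access-denier would clear `current`, bump `generation`, and then
wait for outstanding clones of the old `Arc` to drain; the difficulty
described above is that every CachedView holder must call get() often
and must not sleep while the returned borrow is live.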
---
 lib/propolis/src/accessors.rs        | 120 +++++++++++++++++++++------
 lib/propolis/src/hw/nvme/queue.rs    |  27 +++---
 lib/propolis/src/hw/nvme/requests.rs |   3 +-
 3 files changed, 112 insertions(+), 38 deletions(-)

diff --git a/lib/propolis/src/accessors.rs b/lib/propolis/src/accessors.rs
index c99452b17..e6a8db675 100644
--- a/lib/propolis/src/accessors.rs
+++ b/lib/propolis/src/accessors.rs
@@ -362,12 +362,12 @@ impl<T: AccessedResource> Tree<T> {
 
     /// Create a [Tree] returning the root node
     fn new(res_root: Option<T::Resource>) -> Arc<Node<T>> {
-        let res_leaf = res_root.as_ref().map(T::derive);
         let tree = Self::new_empty(res_root);
-        let node = Node::new_root(tree.clone());
-        node.0.lock().unwrap().res_leaf = res_leaf;
 
+        let node = Node::new_root(tree.clone());
         let mut guard = tree.lock().unwrap();
+        let res_leaf = guard.res_root.as_ref().map(T::derive);
+        node.0.lock().unwrap().res_leaf = res_leaf;
         let root_key = NodeKey::from(&node);
         guard.root_key = Some(root_key);
         guard.nodes.insert(root_key, TreeNode::new_root(Arc::downgrade(&node)));
@@ -423,14 +423,17 @@ impl<T: AccessedResource> Node<T> {
     /// This is purely a helper function to make lifetimes clearer for
     /// [`Self::lock_tree()`]
     #[allow(clippy::type_complexity)]
-    fn try_lock_tree<'a>(
-        &'a self,
-        tree_ref: &'a TreeBackref<T>,
-    ) -> Result<MutexGuard<'a, Tree<T>>, TreeBackref<T>> {
+    fn try_lock_tree<'node, 'guard>(
+        &'node self,
+        tree_ref: &'guard TreeBackref<T>,
+    ) -> Result<
+        (MutexGuard<'guard, Tree<T>>, MutexGuard<'node, NodeEntry<T>>),
+        TreeBackref<T>,
+    > {
         let guard = tree_ref.lock().unwrap();
         let node_guard = self.0.lock().unwrap();
         if Arc::ptr_eq(tree_ref, &node_guard.tree) {
-            Ok(guard)
+            Ok((guard, node_guard))
         } else {
             Err(node_guard.tree.clone())
         }
@@ -438,39 +441,64 @@ impl<T: AccessedResource> Node<T> {
 
     /// Safely acquire the lock to this entry, as well as the containing tree,
     /// respecting the ordering requirements.
-    fn lock_tree<R>(&self, f: impl FnOnce(MutexGuard<'_, Tree<T>>) -> R) -> R {
+    fn lock_tree<'node, F, R>(&'node self, f: F) -> R
+    where
+        F: for<'guard> FnOnce(
+            MutexGuard<'guard, Tree<T>>,
+            MutexGuard<'node, NodeEntry<T>>,
+        ) -> R,
+    {
         let mut tree = self.0.lock().unwrap().tree.clone();
-        let guard = loop {
+        let (guard, self_guard) = loop {
             let new_tree = match self.try_lock_tree(&tree) {
-                Ok(tg) => break tg,
+                Ok(guards) => break guards,
                 Err(nt) => nt,
             };
             let _ = std::mem::replace(&mut tree, new_tree);
         };
-        f(guard)
+        f(guard, self_guard)
     }
 
     fn new_root(tree: Arc<Mutex<Tree<T>>>) -> Arc<Node<T>> {
         Arc::new(Node(Mutex::new(NodeEntry { tree, res_leaf: None })))
     }
 
     fn new_child(self: &Arc<Node<T>>, name: Option<String>) -> Arc<Node<T>> {
-        self.lock_tree(|mut guard| guard.add_child(self.into(), name))
+        self.lock_tree(|mut guard, _| guard.add_child(self.into(), name))
     }
 
+    /// Acquire a reference to the accessed resource, if permitted.
+    ///
+    /// TODO: The guarded resource can be replaced while this guard is held.
+    /// There is no synchronization between a potential disabling of access and
+    /// outstanding guards that would be forbidden by that disablement.
     fn guard(&self) -> Option<Guard<'_, T>> {
+        self.guard_borrow().map(|guard| {
+            let leaf_ref = guard
+                .res_leaf
+                .clone()
+                .expect("guard_borrow() only returns Some if res_leaf is Some");
+            Guard { inner: leaf_ref, _pd: PhantomData }
+        })
+    }
+
+    /// Lock this node's reference to the resource, if permitted.
+    ///
+    /// Take care: this returns the mutex guard, keeping this node locked.
+    /// Concurrent accesses to this node will block, and attempts to update the
+    /// resource in this tree will be blocked.
+    fn guard_borrow(&self) -> Option<MutexGuard<'_, NodeEntry<T>>> {
         let local = self.0.lock().unwrap();
-        if let Some(leaf) = local.res_leaf.as_ref() {
-            Some(Guard { inner: leaf.clone(), _pd: PhantomData })
+        if let Some(_) = local.res_leaf.as_ref() {
+            Some(local)
        } else {
             drop(local);
             // Attempt to (re)derive leaf resource from root
-            self.lock_tree(|tree| {
+            self.lock_tree(|tree, mut local| {
                 if let Some(root) = tree.res_root.as_ref() {
-                    let mut local = self.0.lock().unwrap();
                     let leaf = T::derive(root);
                     local.res_leaf = Some(leaf.clone());
-                    Some(Guard { inner: leaf, _pd: PhantomData })
+                    Some(local)
                 } else {
                     None
                 }
@@ -480,9 +508,9 @@ impl<T: AccessedResource> Node<T> {
 
     fn drop_from_tree(self: &mut Arc<Node<T>>) {
         let key = NodeKey::from(&*self);
-        self.lock_tree(|mut guard| {
+        self.lock_tree(|mut guard, mut local| {
             // drop any lingering access to the resource immediately
-            let _ = self.0.lock().unwrap().res_leaf.take();
+            let _ = local.res_leaf.take();
 
             // Since we hold the Tree lock (thus eliminating the chance of any
             // racing adopt/orphan activity to be manipulating the refcount on
@@ -508,6 +536,21 @@ impl<T: AccessedResource> std::ops::Deref for Guard<'_, T> {
     }
 }
 
+pub struct LockedView<'node, T: AccessedResource> {
+    guard: MutexGuard<'node, NodeEntry<T>>,
+}
+
+impl<'node, T: AccessedResource> LockedView<'node, T> {
+    pub fn view(&self) -> &T::Target {
+        let leaf = self
+            .guard
+            .res_leaf
+            .as_ref()
+            .expect("LockedView is returned only when res_leaf is Some()");
+        T::deref(&leaf)
+    }
+}
+
 pub struct Accessor<T: AccessedResource>(Arc<Node<T>>);
 impl<T: AccessedResource> Accessor<T> {
     /// Create a new accessor hierarchy, mediating access to `resource`.
@@ -538,8 +581,10 @@ impl<T: AccessedResource> Accessor<T> {
 
         assert_ne!(parent_key, child_key, "cannot adopt self");
 
-        self.0.lock_tree(|mut parent_guard| {
-            child.0.lock_tree(|mut child_guard| {
+        self.0.lock_tree(|mut parent_guard, node_guard| {
+            drop(node_guard);
+            child.0.lock_tree(|mut child_guard, node_guard| {
+                drop(node_guard);
                 if !child_guard.node_is_root(&child.0) {
                     // Drop all mutex guards prior to panic in order to allow
                     // unwinder to do its job, rather than getting tripped up by
@@ -564,7 +609,8 @@ impl<T: AccessedResource> Accessor<T> {
     ///
     /// If this is called on a non-root node.
     pub fn remove_resource(&self) -> Option<T::Resource> {
-        self.0.lock_tree(|mut guard| {
+        self.0.lock_tree(|mut guard, node_guard| {
+            drop(node_guard);
             if !guard.node_is_root(&self.0) {
                 drop(guard);
                 panic!("removal of root resource only allowed at root node");
@@ -578,18 +624,42 @@ impl<T: AccessedResource> Accessor<T> {
     ///
     /// Will return [None] if any ancestor node disables access, or if the node
     /// is not attached to a hierarchy containing a valid resource.
+    ///
+    /// TODO: an outstanding `Guard` does not synchronize with changes to the
+    /// underlying resource; the resource could be changed, the tree adopted,
+    /// access disallowed, etc, but an existing `Guard` will still reference the
+    /// resource as it was at the point the access was allowed.
     pub fn access(&self) -> Option<Guard<'_, T>> {
         self.0.guard()
     }
 
+    /// Attempt to get a reference to the underlying resource.
+    ///
+    /// Will return [None] if any ancestor node disables access, or if the node
+    /// is not attached to a hierarchy containing a valid resource.
+    ///
+    /// Unlike [`Accessor::access()`], this returns a guard on the node's Mutex.
+    /// This will block other calls to `access()` or `access_borrow()`, as well
+    /// as attempts to replace or modify the tree's guarded resource. Prefer
+    /// frequently refreshing borrows with calls to `access_borrow()` to holding
+    /// a `LockedView` for substantial durations.
+    pub fn access_borrow(&self) -> Option<LockedView<'_, T>> {
+        let guard = self.0.guard_borrow()?;
+        if guard.res_leaf.is_some() {
+            Some(LockedView { guard })
+        } else {
+            None
+        }
+    }
+
     /// How many nodes exist in this Accessor hierarchy
     pub fn node_count(&self) -> usize {
-        self.0.lock_tree(|guard| guard.node_count())
+        self.0.lock_tree(|guard, _| guard.node_count())
     }
 
     /// Print the hierarchy that this node is a member of
     pub fn print(&self, highlight_self: bool) {
-        self.0.lock_tree(|tree| {
+        self.0.lock_tree(|tree, _| {
             tree.print(print_basic(
                 highlight_self.then_some(NodeKey::from(&self.0)),
             ));
diff --git a/lib/propolis/src/hw/nvme/queue.rs b/lib/propolis/src/hw/nvme/queue.rs
index dd955c3d8..d24a6dbad 100644
--- a/lib/propolis/src/hw/nvme/queue.rs
+++ b/lib/propolis/src/hw/nvme/queue.rs
@@ -631,7 +631,8 @@ impl SubQueue {
     pub fn pop(
         self: &Arc<Self>,
     ) -> Option<(GuestData, Permit, u16)> {
-        let Some(mem) = self.state.acc_mem.access() else { return None };
+        let Some(mem) = self.state.acc_mem.access_borrow() else { return None };
+        let mem = mem.view();
 
         // Attempt to reserve an entry on the Completion Queue
         let permit = self.cq.reserve_entry(&self, &mem)?;
@@ -868,18 +869,20 @@ impl CompQueue {
         //
         // XXX: handle a guest addr that becomes unmapped later
         let addr = self.base.offset::<RawCompletion>(idx as usize);
-        if let Some(mem) = self.state.acc_mem.access() {
-            cqe.set_phase(!phase);
-            mem.write(addr, &cqe);
-            cqe.set_phase(phase);
-            mem.write(addr, &cqe);
-
-            let devq_id = self.devq_id();
-            state.db_buf_read(devq_id, &mem);
-            state.db_buf_write(devq_id, &mem);
-        } else {
+        // TODO: access disallowed?
+        let Some(mem) = self.state.acc_mem.access_borrow() else {
             // TODO: mark the queue/controller in error state?
-        }
+            return;
+        };
+        let mem = mem.view();
+        cqe.set_phase(!phase);
+        mem.write(addr, &cqe);
+        cqe.set_phase(phase);
+        mem.write(addr, &cqe);
+
+        let devq_id = self.devq_id();
+        state.db_buf_read(devq_id, &mem);
+        state.db_buf_write(devq_id, &mem);
     }
 
     pub(super) fn set_db_buf(
diff --git a/lib/propolis/src/hw/nvme/requests.rs b/lib/propolis/src/hw/nvme/requests.rs
index 97ee8f4f6..5f16c5b8e 100644
--- a/lib/propolis/src/hw/nvme/requests.rs
+++ b/lib/propolis/src/hw/nvme/requests.rs
@@ -73,7 +73,8 @@ impl block::DeviceQueue for NvmeBlockQueue {
     /// the underlying block backend
     fn next_req(&self) -> Option<(Request, Self::Token, Option)> {
         let sq = &self.sq;
-        let mem = self.acc_mem.access()?;
+        let mem = self.acc_mem.access_borrow()?;
+        let mem = mem.view();
         let params = self.sq.params();
 
         while let Some((sub, permit, idx)) = sq.pop() {