diff --git a/Cargo.lock b/Cargo.lock index 88b9029aa93..f1d63ad1d12 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4992,6 +4992,7 @@ dependencies = [ "slog-error-chain", "smf 0.2.3", "thiserror 2.0.17", + "tofino", "tokio", "toml 0.8.23", "uuid", @@ -13921,7 +13922,7 @@ dependencies = [ [[package]] name = "tofino" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/tofino?branch=main#1b66b89c3727d2191082df057b068ec52560e334" +source = "git+https://github.com/oxidecomputer/tofino?branch=presence#361e334f35387f6669f6f03bb2a421b0e9c25b8f" dependencies = [ "anyhow", "cc", diff --git a/Cargo.toml b/Cargo.toml index 5b74dfcd037..ea37e18a8e1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -748,7 +748,7 @@ termtree = "0.5.1" textwrap = { version = "0.16.2", features = [ "terminal_size" ] } test-strategy = "0.4.3" thiserror = "2.0" -tofino = { git = "https://github.com/oxidecomputer/tofino", branch = "main" } +tofino = { git = "https://github.com/oxidecomputer/tofino", branch = "presence" } tokio = "1.47.0" tokio-postgres = { version = "0.7", features = [ "with-chrono-0_4", "with-uuid-1" ] } tokio-stream = "0.1.17" diff --git a/illumos-utils/Cargo.toml b/illumos-utils/Cargo.toml index e5654014023..dee88f505e5 100644 --- a/illumos-utils/Cargo.toml +++ b/illumos-utils/Cargo.toml @@ -40,6 +40,7 @@ tokio.workspace = true uuid.workspace = true whoami.workspace = true zone.workspace = true +tofino.workspace = true omicron-workspace-hack.workspace = true diff --git a/illumos-utils/src/contract.rs b/illumos-utils/src/contract.rs new file mode 100644 index 00000000000..9cb554c04e9 --- /dev/null +++ b/illumos-utils/src/contract.rs @@ -0,0 +1,375 @@ +#[cfg(target_os = "illumos")] +use libc::ctid_t; +use slog::{Logger, debug, error}; +use std::ffi::CString; +use std::ffi::c_char; +use std::ffi::c_int; +use std::ffi::c_uint; +use std::ffi::c_void; +use std::thread; +use std::time::Duration; + +#[allow(non_camel_case_types)] +type ct_evthdl_t = *mut c_void; 
+#[allow(non_camel_case_types)]
+type ct_evid_t = u64;
+
+// Bindings for the subset of libcontract(3LIB) used by this module.  The
+// callers in this file treat a zero return from the `c_int`-returning
+// functions as success and anything else as failure.
+#[link(name = "contract")]
+unsafe extern "C" {
+    // Template configuration.
+    fn ct_tmpl_set_critical(fd: c_int, events: c_uint) -> c_int;
+    fn ct_tmpl_set_informative(fd: c_int, events: c_uint) -> c_int;
+    fn ct_pr_tmpl_set_fatal(fd: c_int, events: c_uint) -> c_int;
+    fn ct_pr_tmpl_set_param(fd: c_int, params: c_uint) -> c_int;
+    fn ct_dev_tmpl_set_minor(fd: c_int, minor: *const c_char) -> c_int;
+    fn ct_tmpl_activate(fd: c_int) -> c_int;
+    fn ct_tmpl_create(fd: c_int, ctid: *mut ctid_t) -> c_int;
+    fn ct_tmpl_clear(fd: c_int) -> c_int;
+
+    // Contract control.
+    fn ct_ctl_abandon(fd: c_int) -> c_int;
+    fn ct_ctl_ack(fd: c_int, evid: ct_evid_t) -> c_int;
+
+    // Event retrieval and inspection.
+    fn ct_event_read_critical(fd: c_int, ev: *mut ct_evthdl_t) -> c_int;
+    fn ct_event_get_type(ev: ct_evthdl_t) -> c_uint;
+    fn ct_event_get_ctid(ev: ct_evthdl_t) -> ctid_t;
+    fn ct_event_get_evid(ev: ct_evthdl_t) -> ct_evid_t;
+    fn ct_event_free(ev: ct_evthdl_t);
+}
+
+// Convert an error message into an ExecutionError::ContractFailure.
+//
+// The OS error is read from `errno` at the time of the call, so this should be
+// invoked before any other call that might overwrite it.
+fn err(msg: impl ToString) -> crate::ExecutionError {
+    crate::ExecutionError::ContractFailure {
+        msg: msg.to_string(),
+        err: std::io::Error::last_os_error(),
+    }
+}
+
+// Construct a path to a file in the contract filesystem.
+//
+// With an `id` the result is "<type directory>/<id>/<file>"; without one the
+// file is looked up directly in the per-type directory.
+fn path(typ: ContractType, id: Option<ctid_t>, file: &str) -> CString {
+    let prefix = match typ {
+        ContractType::Process => "/system/contract/process",
+        ContractType::Device => "/system/contract/device",
+    };
+    let id = id.map(|i| format!("/{i}")).unwrap_or_default();
+    // Only `file` could introduce a NUL byte; a caller doing so is a bug.
+    CString::new(format!("{prefix}{id}/{file}"))
+        .expect("contract path contains an interior NUL byte")
+}
+
+/// The kinds of contract this module knows how to create, watch and control.
+#[derive(Clone, Copy, Debug)]
+pub enum ContractType {
+    Process,
+    Device,
+}
+
+// Constants related to process contracts
+// Only kill process group on fatal errors.
+pub const CT_PR_PGRPONLY: c_uint = 0x04;
+// Automatically detach inherited contracts.
+pub const CT_PR_REGENT: c_uint = 0x08;
+
+// Event types reported on a contract
+
+// Common events:
+// Contract negotiation ended
+pub const CT_EV_NEGEND: c_uint = 0x0;
+
+// Process events:
+// Contract has become empty.
+pub const CT_PR_EV_EMPTY: c_uint = 0x1;
+// Process had an uncorrectable error.
+pub const CT_PR_EV_HWERR: c_uint = 0x20;
+
+// Device events:
+// Device has gone offline
+pub const CT_DEV_EV_OFFLINE: c_uint = 0x4;
+
+/// A critical event read from a contract event endpoint.
+///
+/// `event` is the raw libcontract handle; it is released with
+/// `ct_event_free` when this value is dropped.  The other fields are read
+/// from the handle once, at the time the event is received.
+#[derive(Debug)]
+pub struct ContractEvent {
+    pub event: ct_evthdl_t,
+    pub event_id: ct_evid_t,
+    pub typ: c_uint,
+    pub ctid: ctid_t,
+}
+// NOTE(review): these impls assert that the raw event handle may be moved to
+// and shared between threads -- confirm against libcontract.
+unsafe impl Send for ContractEvent {}
+unsafe impl Sync for ContractEvent {}
+
+impl Drop for ContractEvent {
+    fn drop(&mut self) {
+        // Hand the handle back to libcontract.
+        unsafe { ct_event_free(self.event) };
+    }
+}
+
+/// A Watcher is used to wait for events related to contracts
+pub struct Watcher {
+    // Descriptor for the "pbundle" event endpoint; closed on drop.
+    fd: c_int,
+}
+
+impl Drop for Watcher {
+    fn drop(&mut self) {
+        unsafe { libc::close(self.fd) };
+    }
+}
+
+impl Watcher {
+    /// Return a Watcher for a specific type of contract event.  The watcher
+    /// will return all events of the requested type, and it is the caller's
+    /// responsibility to filter for the events relevant to them.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the event endpoint cannot be opened.
+    pub fn new(typ: ContractType) -> Self {
+        let path = path(typ, None, "pbundle");
+        let fd = unsafe { libc::open(path.as_ptr(), libc::O_RDONLY) };
+        assert!(
+            fd >= 0,
+            "Could not open {:?}: {}",
+            path,
+            std::io::Error::last_os_error()
+        );
+        Watcher { fd }
+    }
+
+    /// Block until a contract event occurs.
+    pub fn watch(&self, log: &slog::Logger) -> ContractEvent {
+        loop {
+            let mut handle: ct_evthdl_t = std::ptr::null_mut();
+            // The event endpoint was not opened as non-blocking, so
+            // ct_event_read_critical(3CONTRACT) will block until a new
+            // critical event is available on the channel.
+            let rc = unsafe { ct_event_read_critical(self.fd, &mut handle) };
+            if rc == 0 {
+                return ContractEvent {
+                    typ: unsafe { ct_event_get_type(handle) },
+                    event_id: unsafe { ct_event_get_evid(handle) },
+                    ctid: unsafe { ct_event_get_ctid(handle) },
+                    event: handle,
+                };
+            }
+
+            // ct_event_read_critical(3CONTRACT) does not state any
+            // error values for this function if the file descriptor
+            // was not opened non-blocking, but inspection of the
+            // library code shows that various errnos could be returned
+            // in situations such as failure to allocate memory.  In
+            // those cases, log a message and pause to avoid entering a
+            // tight loop if the problem persists.
+            error!(
+                log,
+                "Unexpected response from contract event channel: {}",
+                std::io::Error::from_raw_os_error(rc)
+            );
+            thread::sleep(Duration::from_secs(1));
+        }
+    }
+}
+
+/// A Control is used to communicate a response to the contract system in
+/// response to an event.  In practice this is limited to acknowledging an event
+/// and cancelling a contract.
+pub struct Control {
+    // Id of the contract this handle controls; used in error messages.
+    ctid: ctid_t,
+    // Descriptor for the contract's "ctl" file; closed on drop.
+    fd: c_int,
+}
+
+impl Drop for Control {
+    fn drop(&mut self) {
+        unsafe { libc::close(self.fd) };
+    }
+}
+
+impl Control {
+    /// Construct a new Control for the specified contract
+    ///
+    /// # Errors
+    ///
+    /// Returns `ExecutionError::ContractFailure` if the contract's control
+    /// file cannot be opened for writing.
+    pub fn new(
+        typ: ContractType,
+        ctid: ctid_t,
+    ) -> Result<Self, crate::ExecutionError> {
+        let path = path(typ, Some(ctid), "ctl");
+        let fd = unsafe { libc::open(path.as_ptr(), libc::O_WRONLY) };
+        if fd < 0 {
+            // `to_string_lossy` keeps the error path free of panics.
+            return Err(err(format!(
+                "opening control path {}",
+                path.to_string_lossy()
+            )));
+        }
+        Ok(Control { ctid, fd })
+    }
+
+    /// Acknowledge an event on the contract
+    pub fn ack(
+        &self,
+        event_id: ct_evid_t,
+    ) -> Result<(), crate::ExecutionError> {
+        match unsafe { ct_ctl_ack(self.fd, event_id) } {
+            0 => Ok(()),
+            _ => Err(err(format!("failed to acknowledge event {event_id}"))),
+        }
+    }
+
+    /// Abandon the contract
+    ///
+    /// Consumes the `Control`; its descriptor is closed when `self` is
+    /// dropped at the end of this call, whether or not the abandon succeeds.
+    pub fn abandon(self) -> Result<(), crate::ExecutionError> {
+        match unsafe { ct_ctl_abandon(self.fd) } {
+            0 => Ok(()),
+            _ => Err(err(format!("failed to abandon contract {}", self.ctid))),
+        }
+    }
+}
+
+// This thread watches for critical events coming from all process
+// contracts held by sled-agent, and reaps (abandons) contracts which
+// become empty.  Process contracts are used in conjunction with
+// zone_enter() in order to run commands within non-global zones, and
+// the contracts used for this come from templates that define becoming
+// empty as a critical event.
+//
+// This function never returns.
+pub fn process_contract_reaper(log: Logger) {
+    let watcher = Watcher::new(ContractType::Process);
+    loop {
+        let event = watcher.watch(&log);
+        // Only "contract is empty" events call for reaping.
+        if event.typ != CT_PR_EV_EMPTY {
+            continue;
+        }
+
+        let ctl = match Control::new(ContractType::Process, event.ctid) {
+            Ok(c) => c,
+            Err(e) => {
+                error!(&log, "Failed to open contract control: {e:?}");
+                continue;
+            }
+        };
+
+        match ctl.abandon() {
+            Ok(()) => debug!(&log, "Abandoned contract {}", event.ctid),
+            Err(e) => error!(&log, "{e:?}"),
+        }
+    }
+}
+
+// A Rust wrapper around a contract template (process or device).
+#[derive(Debug)]
+pub struct Template {
+    // Descriptor for the open template file; cleared and closed on drop.
+    fd: c_int,
+}
+
+impl Drop for Template {
+    fn drop(&mut self) {
+        self.clear();
+        // Ignore any error, since printing may interfere with `slog`'s
+        // structured output.
+        unsafe { libc::close(self.fd) };
+    }
+}
+
+// Return the NUL-terminated minor-node path of the tfpkt device in the form
+// expected by the contract filesystem, or `None` if it cannot be determined.
+//
+// The path is derived from the target of the "/dev/tfpkt0" symlink.  The
+// contract filesystem only wants to know about the "/pseudo/..." part, so
+// everything up to and including the "/devices" path element is dropped.
+// NOTE(review): the previous fixed 10-byte offset implies a relative target
+// such as "../devices/pseudo/..." -- confirm on a real system.
+fn get_tfpkt_device_path() -> Option<Vec<c_char>> {
+    const DEVICES_DIR: &[u8] = b"/devices/";
+
+    let dev_path = CString::new("/dev/tfpkt0").unwrap();
+    let mut path_buf = [0 as c_char; 1024];
+    let sz = unsafe {
+        libc::readlink(
+            dev_path.as_ptr(),
+            path_buf.as_mut_ptr(),
+            path_buf.len(),
+        )
+    };
+    // A negative return means readlink(2) failed.
+    let len = usize::try_from(sz).ok()?;
+    // readlink(2) does not NUL-terminate and truncates silently, so a
+    // completely full buffer cannot be trusted to hold the whole target.
+    if len >= path_buf.len() {
+        return None;
+    }
+
+    // `c_char` and `u8` have the same width; the casts only reinterpret the
+    // bytes so that they can be compared against `DEVICES_DIR`.
+    let target: Vec<u8> = path_buf[..len].iter().map(|c| *c as u8).collect();
+    let devices_at = target
+        .windows(DEVICES_DIR.len())
+        .position(|window| window == DEVICES_DIR)?;
+    // Keep the '/' that terminates "/devices/" as the leading '/' of the
+    // returned path.
+    let start = devices_at + DEVICES_DIR.len() - 1;
+
+    let mut minor: Vec<c_char> =
+        target[start..].iter().map(|b| *b as c_char).collect();
+    minor.push(0);
+    Some(minor)
+}
+
+impl Template {
+    /// Open a contract template of the given type, configure it and activate
+    /// it.
+    ///
+    /// The two different contract types are initialized with different
+    /// settings.  These settings are hardcoded and specific to the manner in
+    /// which the contracts are used within omicron.
+    ///
+    /// # Errors
+    ///
+    /// Returns `ExecutionError::ContractFailure` if the template cannot be
+    /// opened or any configuration step fails.  The template descriptor is
+    /// closed before an error is returned.
+    pub fn new(typ: ContractType) -> Result<Self, crate::ExecutionError> {
+        let path = path(typ, None, "template");
+        let fd = unsafe { libc::open(path.as_ptr(), libc::O_RDWR) };
+        if fd < 0 {
+            return Err(err(format!(
+                "opening template {}",
+                path.to_string_lossy()
+            )));
+        }
+
+        let configured = match typ {
+            ContractType::Process => Self::configure_process(fd),
+            ContractType::Device => Self::configure_device(fd),
+        };
+        match configured {
+            Ok(()) => Ok(Self { fd }),
+            Err(e) => {
+                // The error (including errno) has already been captured, so
+                // closing the descriptor here cannot disturb it.
+                unsafe { libc::close(fd) };
+                Err(e)
+            }
+        }
+    }
+
+    // Configure and activate a process contract template on `fd`.
+    //
+    // Nothing is inherited, we do not allow the contract to be
+    // orphaned, and the only event which is delivered is EV_EMPTY,
+    // indicating that the contract has become empty.  These events are
+    // consumed by process_contract_reaper() above.
+    //
+    // See illumos sources in `usr/src/cmd/zlogin/zlogin.c` in the
+    // implementation of `init_template()` for details.
+    fn configure_process(fd: c_int) -> Result<(), crate::ExecutionError> {
+        if unsafe { ct_tmpl_set_critical(fd, CT_PR_EV_EMPTY) } != 0 {
+            return Err(err("set_critical in process template"));
+        }
+        if unsafe { ct_tmpl_set_informative(fd, 0) } != 0 {
+            return Err(err("set_informative in process template"));
+        }
+        if unsafe { ct_pr_tmpl_set_fatal(fd, CT_PR_EV_HWERR) } != 0 {
+            return Err(err("set_fatal in process template"));
+        }
+        if unsafe { ct_pr_tmpl_set_param(fd, CT_PR_PGRPONLY | CT_PR_REGENT) }
+            != 0
+        {
+            return Err(err("set_param in process template"));
+        }
+        if unsafe { ct_tmpl_activate(fd) } != 0 {
+            return Err(err("activating in process template"));
+        }
+        Ok(())
+    }
+
+    // Configure and activate a device contract template on `fd`.
+    //
+    // The only device contract we currently support is for the
+    // tfpkt device.  If we ever want to support something else then
+    // we will need to include a device type, path, and/or instance
+    // number as an additional argument to this template creation
+    // function.
+    //
+    // Note: what we are actually interested in is the removal of
+    // the "tofino" device.  However, illumos won't report the
+    // removal of that device while it is still assigned to a zone.
+    // Since our goal is to shut down the zone to allow the removal
+    // of the device, we have a chicken/egg problem.  Instead, we
+    // watch here for the removal of "tfpkt", which is a child of
+    // the "tofino", and whose removal is a reliable indicator that
+    // the tofino is gone.
+    fn configure_device(fd: c_int) -> Result<(), crate::ExecutionError> {
+        let Some(cpath) = get_tfpkt_device_path() else {
+            return Err(err("unable to find tfpkt in device tree"));
+        };
+
+        // We want to be notified when the device goes offline
+        if unsafe { ct_tmpl_set_critical(fd, CT_DEV_EV_OFFLINE) } != 0 {
+            return Err(err("set_critical in device template"));
+        }
+        if unsafe { ct_dev_tmpl_set_minor(fd, cpath.as_ptr()) } != 0 {
+            // Render the path without its NUL terminator, and without
+            // panicking should it not be valid UTF-8.
+            let shown: Vec<u8> = cpath
+                .iter()
+                .map(|c| *c as u8)
+                .take_while(|b| *b != 0)
+                .collect();
+            return Err(err(format!(
+                "set_minor to {} in device template",
+                String::from_utf8_lossy(&shown)
+            )));
+        }
+        if unsafe { ct_tmpl_activate(fd) } != 0 {
+            return Err(err("activating device template"));
+        }
+        Ok(())
+    }
+
+    /// Create a contract from this template and return its id.
+    pub fn create(&self) -> Result<ctid_t, crate::ExecutionError> {
+        let mut ctid: ctid_t = 0;
+        match unsafe { ct_tmpl_create(self.fd, &mut ctid) } {
+            0 => Ok(ctid),
+            _ => Err(err("constructing contract")),
+        }
+    }
+
+    /// Clear the template via `ct_tmpl_clear`; any error is ignored.
+    pub fn clear(&self) {
+        unsafe { ct_tmpl_clear(self.fd) };
+    }
+}
diff --git a/illumos-utils/src/lib.rs b/illumos-utils/src/lib.rs index ce1404f09a3..a7789ead888 100644 --- a/illumos-utils/src/lib.rs +++ b/illumos-utils/src/lib.rs @@ -10,6 +10,8 @@ use slog_error_chain::InlineErrorChain; use std::sync::atomic::{AtomicBool, Ordering}; pub mod addrobj; +#[cfg(target_os = "illumos")] +pub mod contract; pub mod coreadm; pub mod destructor; pub mod dkio; @@ -64,8 +66,8 @@ pub enum ExecutionError { #[error("{0}")] CommandFailure(Box), - #[error("Failed to manipulate process contract: {err}")] - ContractFailure { err: std::io::Error }, + #[error("contract error: {msg}: {err}")] + ContractFailure { msg: String, err: std::io::Error }, #[error("Failed to parse command output")] ParseFailure(String), diff --git a/illumos-utils/src/running_zone.rs b/illumos-utils/src/running_zone.rs index 4cb80c24495..89a921a18b6 100644 --- a/illumos-utils/src/running_zone.rs +++ b/illumos-utils/src/running_zone.rs @@ -8,6 +8,8 @@ use crate::addrobj::{
AddrObject, DHCP_ADDROBJ_NAME, IPV4_STATIC_ADDROBJ_NAME, IPV6_STATIC_ADDROBJ_NAME, }; +#[cfg(target_os = "illumos")] +use crate::contract; use crate::dladm::Etherstub; use crate::link::{Link, VnicAllocator}; use crate::opte::{Port, PortTicket}; @@ -112,7 +114,7 @@ pub fn ensure_contract_reaper(log: &Logger) { info!(log, "Ensuring contract reaper thread"); REAPER_THREAD.get_or_init(|| { let log = log.new(o!("component" => "ContractReaper")); - std::thread::spawn(move || zenter::contract_reaper(log)) + std::thread::spawn(move || contract::process_contract_reaper(log)) }); } @@ -125,215 +127,13 @@ pub fn ensure_contract_reaper(log: &Logger) { // inside a non-global zone. #[cfg(target_os = "illumos")] mod zenter { - use libc::ctid_t; use libc::zoneid_t; - use slog::{Logger, debug, error}; use std::ffi::c_int; - use std::ffi::c_uint; - use std::ffi::c_void; - use std::ffi::{CStr, CString}; - use std::process; - use std::thread; - use std::time::Duration; - - #[allow(non_camel_case_types)] - type ct_evthdl_t = *mut c_void; - - #[link(name = "contract")] - extern "C" { - fn ct_tmpl_set_critical(fd: c_int, events: c_uint) -> c_int; - fn ct_tmpl_set_informative(fd: c_int, events: c_uint) -> c_int; - fn ct_pr_tmpl_set_fatal(fd: c_int, events: c_uint) -> c_int; - fn ct_pr_tmpl_set_param(fd: c_int, params: c_uint) -> c_int; - fn ct_tmpl_activate(fd: c_int) -> c_int; - fn ct_tmpl_clear(fd: c_int) -> c_int; - fn ct_ctl_abandon(fd: c_int) -> c_int; - fn ct_event_read_critical(fd: c_int, ev: *mut ct_evthdl_t) -> c_int; - fn ct_event_get_type(ev: ct_evthdl_t) -> u64; - fn ct_event_get_ctid(ev: ct_evthdl_t) -> ctid_t; - fn ct_event_free(ev: ct_evthdl_t); - } #[link(name = "c")] extern "C" { pub fn zone_enter(zid: zoneid_t) -> c_int; } - - // This thread watches for critical events coming from all process - // contracts held by sled-agent, and reaps (abandons) contracts which - // become empty. 
Process contracts are used in conjunction with - // zone_enter() in order to run commands within non-global zones, and - // the contracts used for this come from templates that define becoming - // empty as a critical event. - pub fn contract_reaper(log: Logger) { - const EVENT_PATH: &'static [u8] = b"/system/contract/process/pbundle"; - const CT_PR_EV_EMPTY: u64 = 1; - - let cpath = CString::new(EVENT_PATH).unwrap(); - let fd = unsafe { libc::open(cpath.as_ptr(), libc::O_RDONLY) }; - - if fd < 0 { - panic!( - "Could not open {:?}: {}", - cpath, - std::io::Error::last_os_error() - ); - } - - loop { - let mut ev: ct_evthdl_t = std::ptr::null_mut(); - let evp: *mut ct_evthdl_t = &mut ev; - // The event endpoint was not opened as non-blocking, so - // ct_event_read_critical(3CONTRACT) will block until a new - // critical event is available on the channel. - match unsafe { ct_event_read_critical(fd, evp) } { - 0 => { - let typ = unsafe { ct_event_get_type(ev) }; - if typ == CT_PR_EV_EMPTY { - let ctid = unsafe { ct_event_get_ctid(ev) }; - match abandon_contract(ctid) { - Err(e) => error!( - &log, - "Failed to abandon contract {}: {}", ctid, e - ), - Ok(_) => { - debug!(&log, "Abandoned contract {}", ctid) - } - } - } - unsafe { ct_event_free(ev) }; - } - err => { - // ct_event_read_critical(3CONTRACT) does not state any - // error values for this function if the file descriptor - // was not opened non-blocking, but inspection of the - // library code shows that various errnos could be returned - // in situations such as failure to allocate memory. In - // those cases, log a message and pause to avoid entering a - // tight loop if the problem persists. 
- error!( - &log, - "Unexpected response from contract event channel: {}", - std::io::Error::from_raw_os_error(err) - ); - thread::sleep(Duration::from_secs(1)); - } - } - } - } - - #[derive(thiserror::Error, Debug)] - pub enum AbandonContractError { - #[error("Error opening file {file}: {error}")] - Open { file: String, error: std::io::Error }, - - #[error("Error abandoning contract {ctid}: {error}")] - Abandon { ctid: ctid_t, error: std::io::Error }, - - #[error("Error closing file {file}: {error}")] - Close { file: String, error: std::io::Error }, - } - - pub fn abandon_contract(ctid: ctid_t) -> Result<(), AbandonContractError> { - let path = format!("/proc/{}/contracts/{}/ctl", process::id(), ctid); - - let cpath = CString::new(path.clone()).unwrap(); - let fd = unsafe { libc::open(cpath.as_ptr(), libc::O_WRONLY) }; - if fd < 0 { - return Err(AbandonContractError::Open { - file: path, - error: std::io::Error::last_os_error(), - }); - } - let ret = unsafe { ct_ctl_abandon(fd) }; - if ret != 0 { - unsafe { libc::close(fd) }; - return Err(AbandonContractError::Abandon { - ctid, - error: std::io::Error::from_raw_os_error(ret), - }); - } - if unsafe { libc::close(fd) } != 0 { - return Err(AbandonContractError::Close { - file: path, - error: std::io::Error::last_os_error(), - }); - } - - Ok(()) - } - - // A Rust wrapper around the process contract template. - #[derive(Debug)] - pub struct Template { - fd: c_int, - } - - impl Drop for Template { - fn drop(&mut self) { - self.clear(); - // Ignore any error, since printing may interfere with `slog`'s - // structured output. - unsafe { libc::close(self.fd) }; - } - } - - impl Template { - const TEMPLATE_PATH: &'static [u8] = - b"/system/contract/process/template\0"; - - // Constants related to how the contract below is managed. See - // `usr/src/uts/common/sys/contract/process.h` in the illumos sources - // for details. - - // Contract has become empty. 
- const CT_PR_EV_EMPTY: c_uint = 0x1; - // Process experienced an uncorrectable error. - const CT_PR_EV_HWERR: c_uint = 0x20; - // Only kill process group on fatal errors. - const CT_PR_PGRPONLY: c_uint = 0x04; - // Automatically detach inherited contracts. - const CT_PR_REGENT: c_uint = 0x08; - - pub fn new() -> Result { - let path = CStr::from_bytes_with_nul(Self::TEMPLATE_PATH).unwrap(); - let fd = unsafe { libc::open(path.as_ptr(), libc::O_RDWR) }; - if fd < 0 { - let err = std::io::Error::last_os_error(); - return Err(crate::ExecutionError::ContractFailure { err }); - } - - // Initialize the contract template. - // - // Nothing is inherited, we do not allow the contract to be - // orphaned, and the only event which is delivered is EV_EMPTY, - // indicating that the contract has become empty. These events are - // consumed by contract_reaper() above. - // - // See illumos sources in `usr/src/cmd/zlogin/zlogin.c` in the - // implementation of `init_template()` for details. - if unsafe { ct_tmpl_set_critical(fd, Self::CT_PR_EV_EMPTY) } != 0 - || unsafe { ct_tmpl_set_informative(fd, 0) } != 0 - || unsafe { ct_pr_tmpl_set_fatal(fd, Self::CT_PR_EV_HWERR) } - != 0 - || unsafe { - ct_pr_tmpl_set_param( - fd, - Self::CT_PR_PGRPONLY | Self::CT_PR_REGENT, - ) - } != 0 - || unsafe { ct_tmpl_activate(fd) } != 0 - { - let err = std::io::Error::last_os_error(); - return Err(crate::ExecutionError::ContractFailure { err }); - } - Ok(Self { fd }) - } - - pub fn clear(&self) { - unsafe { ct_tmpl_clear(self.fd) }; - } - } } /// Represents a running zone. 
@@ -421,10 +221,11 @@ impl RunningZone { err: crate::ExecutionError::NotRunning, }); }; - let template = - std::sync::Arc::new(zenter::Template::new().map_err(|err| { - RunCommandError { zone: self.name().to_string(), err } - })?); + let template = std::sync::Arc::new( + contract::Template::new(contract::ContractType::Process).map_err( + |err| RunCommandError { zone: self.name().to_string(), err }, + )?, + ); let tmpl = std::sync::Arc::clone(&template); let mut command = std::process::Command::new(crate::PFEXEC); let logger = self.inner.log.clone(); diff --git a/sled-agent/src/hardware_monitor.rs b/sled-agent/src/hardware_monitor.rs index c180970266a..56ba86510dc 100644 --- a/sled-agent/src/hardware_monitor.rs +++ b/sled-agent/src/hardware_monitor.rs @@ -77,8 +77,10 @@ pub struct HardwareMonitor { /// or policy changes. service_manager: Option, - /// Whether or not the tofino is loaded. - is_tofino_loaded: bool, + /// Whether or not the tofino is available. This implies that the ASIC is + /// present, the driver has been loaded, and that we are able to use the + /// driver to interact with the ASIC. + is_tofino_available: bool, } impl HardwareMonitor { @@ -110,7 +112,7 @@ impl HardwareMonitor { raw_disks_tx, sled_agent: None, service_manager: None, - is_tofino_loaded: false, + is_tofino_available: false, }; tokio::spawn(monitor.run()); let handle = HardwareMonitorHandle { switch_zone_policy_tx }; @@ -144,7 +146,7 @@ impl HardwareMonitor { // the service manager to start the switch zone; do so now. 
let policy = self.current_switch_zone_policy(); self.ensure_switch_zone_activated_or_deactivated( - self.is_tofino_loaded, + self.is_tofino_available, policy, ).await; } @@ -154,8 +156,8 @@ impl HardwareMonitor { "Received hardware update message"; "update" => ?update, ); - self.handle_hardware_update(update).await; - } + self.handle_hardware_update(update.clone()).await + }, Ok(()) = self.switch_zone_policy_rx.changed() => { let policy = self.current_switch_zone_policy(); info!( @@ -163,7 +165,7 @@ impl HardwareMonitor { "policy" => ?policy, ); self.ensure_switch_zone_activated_or_deactivated( - self.is_tofino_loaded, + self.is_tofino_available, policy, ).await; } @@ -184,14 +186,22 @@ impl HardwareMonitor { ) { match update { Ok(update) => match update { - HardwareUpdate::TofinoLoaded => { + HardwareUpdate::TofinoAvailable => { + info!( + self.log, + "Hardware monitor got TofinoAvailable message" + ); let policy = self.current_switch_zone_policy(); self.ensure_switch_zone_activated_or_deactivated( true, policy, ) .await } - HardwareUpdate::TofinoUnloaded => { + HardwareUpdate::TofinoUnavailable => { + info!( + self.log, + "Hardware monitor got TofinoUnavailable message" + ); let policy = self.current_switch_zone_policy(); self.ensure_switch_zone_activated_or_deactivated( false, policy, @@ -199,6 +209,10 @@ impl HardwareMonitor { .await } HardwareUpdate::TofinoDeviceChange => { + info!( + self.log, + "Hardware monitor got TofinoDeviceChange message" + ); if let Some(sled_agent) = &mut self.sled_agent { sled_agent.notify_nexus_about_self(&self.log).await; } @@ -231,12 +245,12 @@ impl HardwareMonitor { async fn ensure_switch_zone_activated_or_deactivated( &mut self, - is_tofino_loaded: bool, + is_tofino_available: bool, policy: OperatorSwitchZonePolicy, ) { // Remember whether the tofino is loaded regardless of the action we // take (or don't take) below. 
- self.is_tofino_loaded = is_tofino_loaded; + self.is_tofino_available = is_tofino_available; // If we don't have the service manager yet, we can't do anything. let Some(service_manager) = &self.service_manager else { @@ -244,8 +258,8 @@ impl HardwareMonitor { }; // Decide whether to activate or deactivate based on the combination of - // `tofino_loaded` and the operator policy. - let should_activate = match (is_tofino_loaded, policy) { + // `tofino_available` and the operator policy. + let should_activate = match (is_tofino_available, policy) { // We have a tofino and policy says to start the switch zone (true, OperatorSwitchZonePolicy::StartIfSwitchPresent) => { info!( @@ -308,7 +322,7 @@ impl HardwareMonitor { let policy = self.current_switch_zone_policy(); self.ensure_switch_zone_activated_or_deactivated( - self.hardware_manager.is_scrimlet_driver_loaded(), + self.hardware_manager.is_scrimlet_asic_available(), policy, ) .await; diff --git a/sled-hardware/src/illumos/mod.rs b/sled-hardware/src/illumos/mod.rs index 3670423bfb7..7356bd28b06 100644 --- a/sled-hardware/src/illumos/mod.rs +++ b/sled-hardware/src/illumos/mod.rs @@ -7,6 +7,10 @@ use crate::{DendriteAsic, HardwareUpdate, SledMode, UnparsedDisk}; use camino::Utf8PathBuf; use gethostname::gethostname; use illumos_devinfo::{DevInfo, DevLinkType, DevLinks, Node, Property}; +use illumos_utils::contract; +use illumos_utils::contract::{ContractType, Control, Template, Watcher}; +use illumos_utils::zone; +use illumos_utils::zone::Api; use libnvme::{Nvme, controller::Controller}; use omicron_common::disk::{DiskIdentity, DiskVariant}; use sled_hardware_types::{Baseboard, OxideSled, SledCpuFamily}; @@ -86,16 +90,29 @@ pub fn is_oxide_sled() -> anyhow::Result { Ok(OxideSled::try_from_root_node_name(&root.node_name()).is_some()) } -// A snapshot of information about the underlying Tofino device +// A snapshot of information about the underlying Tofino device. 
+// +// This snapshot tells us whether there is a tofino visible to Illumos and +// accessible to us in userspace. We would expect both to be true or both to be +// false. +// +// Note: this doesn't specifically tell us whether there is a sidecar connected +// to the system, but it tells us if there is a sidecar we can use. The +// distinction largely comes down to whether the driver has successfully +// recognized the device and initialized itself. The three-way relationship +// between PCI hotplug, device driver management, and zones is fragile enough +// that we can get stuck in a state that requires a gimlet reboot to fix. #[derive(Copy, Clone)] struct TofinoSnapshot { + // Is there a Tofino ASIC visible in the device tree exists: bool, - driver_loaded: bool, + // Are we able to access the ASIC through the device driver + available: bool, } impl TofinoSnapshot { fn new() -> Self { - Self { exists: false, driver_loaded: false } + Self { exists: false, available: false } } } @@ -251,17 +268,18 @@ impl HardwareView { updates: &mut Vec, ) { match self.tofino { - TofinoView::Real(TofinoSnapshot { driver_loaded, exists }) => { + TofinoView::Real(TofinoSnapshot { available, exists }) => { use HardwareUpdate::*; // Identify if the Tofino device changed power states. if exists != polled_hw.tofino.exists { updates.push(TofinoDeviceChange); } - // Identify if the Tofino driver was recently loaded/unloaded. - match (driver_loaded, polled_hw.tofino.driver_loaded) { - (false, true) => updates.push(TofinoLoaded), - (true, false) => updates.push(TofinoUnloaded), + // Identify if the Tofino asic recently became available or + // unavailable. + match (available, polled_hw.tofino.available) { + (false, true) => updates.push(TofinoAvailable), + (true, false) => updates.push(TofinoUnavailable), _ => (), }; @@ -272,6 +290,13 @@ impl HardwareView { } } + fn tofino_exists(&self) -> bool { + match self.tofino { + TofinoView::Real(TofinoSnapshot { exists, .. 
}) => exists, + TofinoView::Stub { active } => active, + } + } + // Updates our view of block devices against a snapshot. fn update_blkdev( &mut self, @@ -341,10 +366,9 @@ fn slot_is_boot_disk( } fn get_tofino_snapshot(log: &Logger, devinfo: &mut DevInfo) -> TofinoSnapshot { - let (exists, driver_loaded) = match tofino::get_tofino_from_devinfo(devinfo) - { + let (exists, available) = match tofino::get_tofino_from_devinfo(devinfo) { Ok(None) => (false, false), - Ok(Some(node)) => (node.has_asic(), node.has_driver()), + Ok(Some(node)) => (node.has_asic(), node.is_available()), Err(e) => { error!(log, "failed to get tofino state: {e:?}"); (false, false) @@ -353,11 +377,11 @@ fn get_tofino_snapshot(log: &Logger, devinfo: &mut DevInfo) -> TofinoSnapshot { if exists { debug!( log, - "Found tofino node, with driver {}loaded", - if driver_loaded { "" } else { "not " } + "Found tofino node, with asic {}available", + if available { "" } else { "un" } ); } - TofinoSnapshot { exists, driver_loaded } + TofinoSnapshot { exists, available } } fn get_dev_path_of_whole_disk( @@ -686,6 +710,96 @@ fn poll_device_tree( Ok(()) }
+// Block until the switch zone is gone.
+//
+// Polls once per second for a zone named "oxz_switch".  Note that a failed
+// lookup is treated the same as "zone not found": the function returns.
+async fn block_on_switch_zone() {
+    let zone_api = zone::Zones::real_api();
+
+    while let Ok(Some(_zone)) = zone_api.find("oxz_switch").await {
+        tokio::time::sleep(tokio::time::Duration::from_secs(1)).await;
+    }
+}
+
+// Watch for the tofino going away, using a device contract.
+//
+// This function never returns and blocks its thread.  Once per second it
+// checks whether the hardware view reports a tofino; when it does, a device
+// contract is created and its events are processed until the contract's
+// negotiation ends, at which point the contract is abandoned and the outer
+// loop starts over.
+fn monitor_tofino(
+    log: slog::Logger,
+    inner: Arc<Mutex<HardwareView>>,
+    tx: broadcast::Sender<HardwareUpdate>,
+) {
+    let log = log.new(o!("component" => "SledAgent-tofino_contractor"));
+    info!(&log, "tofino monitoring thread started");
+    loop {
+        std::thread::sleep(std::time::Duration::from_secs(1));
+        // The lock guard is a temporary, so it is released as soon as the
+        // condition has been evaluated.
+        if !inner.lock().unwrap().tofino_exists() {
+            continue;
+        }
+
+        // The template is only needed to create the contract; it is dropped
+        // (cleared and closed) as soon as the contract id is known.
+        let ctid = match Template::new(ContractType::Device) {
+            Ok(template) => match template.create() {
+                Ok(c) => c,
+                Err(e) => {
+                    error!(log, "unable to create tofino contract: {e:?}");
+                    continue;
+                }
+            },
+            Err(e) => {
+                error!(log, "unable to open tofino contract template: {e:?}");
+                continue;
+            }
+        };
+        let ctl = match Control::new(ContractType::Device, ctid) {
+            Ok(c) => c,
+            Err(e) => {
+                error!(log, "unable to open tofino contract control: {e:?}");
+                continue;
+            }
+        };
+
+        let watcher = Watcher::new(ContractType::Device);
+        loop {
+            let ev = watcher.watch(&log);
+            // The watcher reports every device-contract event it sees.  Only
+            // events for the contract created above may be acted upon:
+            // `ctl` acknowledges and abandons that contract and no other.
+            if ev.ctid != ctid {
+                debug!(&log, "event for wrong contract");
+                continue;
+            }
+            match ev.typ {
+                contract::CT_DEV_EV_OFFLINE => {
+                    info!(&log, "Got tofino removed notification");
+                    let _ = tx.send(HardwareUpdate::TofinoUnavailable);
+
+                    // The device detach mechanism in the kernel will block for
+                    // up to a minute, waiting for us to acknowledge this event.
+                    // Illumos will not detach the device driver while the
+                    // device is still in use.  If a device is assigned to a
+                    // zone, that counts as "in use".  By halting the zone and
+                    // deferring that "ack" until the zone is gone, we enable
+                    // the device to be detached cleanly, which will
+                    // subsequently allow the device to be re-attached cleanly
+                    // if/when the sidecar is powered back on.
+                    info!(&log, "Waiting for switch zone to halt");
+                    let rt = tokio::runtime::Runtime::new().unwrap();
+                    rt.block_on(block_on_switch_zone());
+                    info!(log, "Switch zone halted.");
+                    if let Err(e) = ctl.ack(ev.event_id) {
+                        error!(&log, "{e:?}");
+                    }
+                }
+                contract::CT_EV_NEGEND => {
+                    // `abandon` consumes `ctl`, so this arm must leave the
+                    // event loop; the outer loop sets up a new contract.
+                    if let Err(e) = ctl.abandon() {
+                        error!(&log, "{e:?}");
+                    }
+                    break;
+                }
+                x => warn!(&log, "unexpected device event: {x}"),
+            }
+        }
+    }
+}
+
async fn hardware_tracking_task( log: Logger, inner: Arc>, @@ -789,6 +903,16 @@ impl HardwareManager { } }; + // We poll the device tree to detect new tofinos and disks. This + // polling also detects devices that have gone away, but we need to + // respond to a tofino disappearance more quickly than a regular polling + // interval will allow. To that end, we fire off a task that maintains a + // device contract with the kernel to handle those disappearances.
+ let log2 = log.clone(); + let inner2 = inner.clone(); + let tx2 = tx.clone(); + tokio::task::spawn_blocking(move || monitor_tofino(log2, inner2, tx2)); + let log2 = log.clone(); let inner2 = inner.clone(); let tx2 = tx.clone(); @@ -839,12 +963,10 @@ impl HardwareManager { } } - pub fn is_scrimlet_driver_loaded(&self) -> bool { + pub fn is_scrimlet_asic_available(&self) -> bool { let inner = self.inner.lock().unwrap(); match inner.tofino { - TofinoView::Real(TofinoSnapshot { driver_loaded, .. }) => { - driver_loaded - } + TofinoView::Real(TofinoSnapshot { available, .. }) => available, TofinoView::Stub { active } => active, } } diff --git a/sled-hardware/src/lib.rs b/sled-hardware/src/lib.rs index cff03ee869f..b1553400721 100644 --- a/sled-hardware/src/lib.rs +++ b/sled-hardware/src/lib.rs @@ -31,8 +31,8 @@ pub mod underlay; #[allow(dead_code)] pub enum HardwareUpdate { TofinoDeviceChange, - TofinoLoaded, - TofinoUnloaded, + TofinoAvailable, + TofinoUnavailable, DiskAdded(UnparsedDisk), DiskRemoved(UnparsedDisk), DiskUpdated(UnparsedDisk), diff --git a/sled-hardware/src/non_illumos/mod.rs b/sled-hardware/src/non_illumos/mod.rs index e4385019843..f4a5280cba8 100644 --- a/sled-hardware/src/non_illumos/mod.rs +++ b/sled-hardware/src/non_illumos/mod.rs @@ -65,7 +65,7 @@ impl HardwareManager { unimplemented!("Accessing hardware unsupported on non-illumos"); } - pub fn is_scrimlet_driver_loaded(&self) -> bool { + pub fn is_scrimlet_asic_available(&self) -> bool { unimplemented!("Accessing hardware unsupported on non-illumos"); }