diff --git a/Cargo.lock b/Cargo.lock index 04cc4e47b5..154c557136 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1269,6 +1269,7 @@ dependencies = [ "mqttea", "nras", "num_cpus", + "nv-redfish", "oauth2", "oid-registry", "once_cell", @@ -1893,6 +1894,7 @@ dependencies = [ name = "carbide-health" version = "0.0.1" dependencies = [ + "arc-swap", "async-trait", "base64", "carbide-health-report", @@ -2520,6 +2522,7 @@ dependencies = [ "mac_address", "nv-redfish", "reqwest 0.13.4", + "serde", "serde_json", "sqlx", "state-controller", @@ -7347,15 +7350,16 @@ dependencies = [ [[package]] name = "nv-redfish" -version = "0.8.1" +version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17a770625401680ec7d8539281484e5fa88caa772b21544d3717d6038ff84c6f" +checksum = "06c0a56718c0605b550a9957d90bcf824edd52093f09e32e6c6b5087dd86de7f" dependencies = [ "futures-core", "futures-util", "nv-redfish-bmc-http", "nv-redfish-core", "nv-redfish-csdl-compiler", + "nv-redfish-schema", "serde", "serde_json", "tagged-types", @@ -7363,31 +7367,35 @@ dependencies = [ [[package]] name = "nv-redfish-bmc-http" -version = "0.8.1" +version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b517fe5bf29664951329bde6f2f1fcd497addcb8a846f26d25bd0e2cbdda40af" +checksum = "d6dd788740d6111263b57d44dd3b4f59f5201f81e5383cd6784a977cdefe2196" dependencies = [ "futures-core", "futures-util", "http", "nv-redfish-core", + "nv-redfish-csdl-compiler", + "nv-redfish-schema", "reqwest 0.12.28", "serde", "serde_json", "serde_path_to_error", "sse-stream", "time", + "tokio-util", "url", "uuid", ] [[package]] name = "nv-redfish-core" -version = "0.8.1" +version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c019526454812cf5edbda4471cd74b7b0ac61b0b43bed3c7cd82b21a8fa7894" +checksum = "f897d4e715aac6857673a344d05aea380d8d87cbf5eba876c9768664ecc8cba1" dependencies = [ "futures-core", + "futures-io", "rust_decimal", "serde", "serde_json", @@ -7397,9 +7405,9 @@ dependencies = [ [[package]] name = "nv-redfish-csdl-compiler" -version = "0.8.1" +version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "00e376c01ed74f97332e6ebfbed56fa8563a978ffd8637b24a42d2bf70a8c25a" +checksum = "264eb8b256664d1d2b953d2bfc26648f8c3986d0540dc4eaa2e666f2f16de567" dependencies = [ "clap", "clap_derive", @@ -7413,6 +7421,15 @@ dependencies = [ "toml 0.9.12+spec-1.1.0", ] +[[package]] +name = "nv-redfish-schema" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7049c7e09b59afe558b46d6c35323a3b29e9cc32daeede4eb5a7a2ca47ff4a3" +dependencies = [ + "glob", +] + [[package]] name = "nvue-client" version = "0.0.0" @@ -11224,6 +11241,7 @@ checksum = "9ae9cec805b01e8fc3fd2fe289f89149a9b66dd16786abd8b19cfa7b48cb0098" dependencies = [ "bytes", "futures-core", + "futures-io", "futures-sink", "pin-project-lite", "slab", diff --git a/Cargo.toml b/Cargo.toml index 10f933605a..887a87d4c3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -59,7 +59,7 @@ tracing-appender = "0.2.4" tracing-opentelemetry = "0.32.1" # NV-Redfish -nv-redfish = { version = "0.8.1" } +nv-redfish = { version = "0.10.0" } ######## # MARK: - Pinned packages that we can't upgrade due to conflicts or just bugs diff --git a/crates/api-db/migrations/20260521231500_bmc_redfish_session.sql b/crates/api-db/migrations/20260521231500_bmc_redfish_session.sql new file mode 100644 index 0000000000..447e74502e --- /dev/null +++ b/crates/api-db/migrations/20260521231500_bmc_redfish_session.sql @@ -0,0 +1,11 @@ +-- Tracks the outstanding Redfish sessions +CREATE TABLE bmc_redfish_sessions ( + spiffe_service_id TEXT NOT NULL, + bmc_mac_address macaddr NOT NULL, + session_odata_id TEXT NOT NULL, + issued_at TIMESTAMPTZ NOT NULL DEFAULT now(), + PRIMARY KEY (spiffe_service_id, bmc_mac_address) +); + +CREATE INDEX bmc_redfish_sessions_by_mac + ON bmc_redfish_sessions (bmc_mac_address); diff --git a/crates/api-db/src/bmc_redfish_session.rs b/crates/api-db/src/bmc_redfish_session.rs new file mode 100644 index 0000000000..a6d5eab880 --- /dev/null +++ b/crates/api-db/src/bmc_redfish_session.rs @@ -0,0 +1,113 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +//! Persistence layer for the `BmcSessionManager`. +//! +//! Each row records the outstanding Redfish session that the manager has +//! issued for a given `(spiffe_service_id, bmc_mac_address)` pair, so that +//! the next rotate (or a `flush_mac`) can `DELETE` the prior session on the +//! BMC. The `X-Auth-Token` itself is not persisted; it is returned to the +//! caller once and the only durable artifact is the session's `@odata.id`. + +use mac_address::MacAddress; +use model::bmc_redfish_session::StoredSession; +use sqlx::PgConnection; + +use crate::db_read::DbReader; +use crate::{DatabaseError, DatabaseResult}; + +/// Returns the outstanding session row for `(spiffe_service_id, bmc_mac)` +/// if any has been recorded. +pub async fn get( + txn: impl DbReader<'_>, + spiffe_service_id: &str, + bmc_mac: MacAddress, +) -> DatabaseResult> { + let query = "SELECT spiffe_service_id, bmc_mac_address, session_odata_id, issued_at + FROM bmc_redfish_sessions + WHERE spiffe_service_id = $1 AND bmc_mac_address = $2"; + + sqlx::query_as(query) + .bind(spiffe_service_id) + .bind(bmc_mac) + .fetch_optional(txn) + .await + .map_err(|e| DatabaseError::query(query, e)) +} + +/// Inserts (or overwrites) the outstanding session for +/// `(spiffe_service_id, bmc_mac)`. `issued_at` is set to `now()` server-side +/// so timestamps are consistent across replicas. +pub async fn upsert( + txn: &mut PgConnection, + spiffe_service_id: &str, + bmc_mac: MacAddress, + session_odata_id: &str, +) -> DatabaseResult<()> { + let query = "INSERT INTO bmc_redfish_sessions + (spiffe_service_id, bmc_mac_address, session_odata_id, issued_at) + VALUES ($1, $2, $3, now()) + ON CONFLICT (spiffe_service_id, bmc_mac_address) DO UPDATE + SET session_odata_id = EXCLUDED.session_odata_id, + issued_at = EXCLUDED.issued_at"; + + sqlx::query(query) + .bind(spiffe_service_id) + .bind(bmc_mac) + .bind(session_odata_id) + .execute(txn) + .await + .map(|_| ()) + .map_err(|e| DatabaseError::query(query, e)) +} + +/// Deletes the outstanding session row for `(spiffe_service_id, bmc_mac)`. +/// No-op if the row does not exist. +pub async fn delete( + txn: &mut PgConnection, + spiffe_service_id: &str, + bmc_mac: MacAddress, +) -> DatabaseResult<()> { + let query = "DELETE FROM bmc_redfish_sessions + WHERE spiffe_service_id = $1 AND bmc_mac_address = $2"; + + sqlx::query(query) + .bind(spiffe_service_id) + .bind(bmc_mac) + .execute(txn) + .await + .map(|_| ()) + .map_err(|e| DatabaseError::query(query, e)) +} + +/// Deletes every row whose `bmc_mac_address` matches `bmc_mac` and returns +/// the rows that were removed. The returned vector can be used by callers +/// that want to best-effort `DELETE` the corresponding sessions on the BMC. +pub async fn delete_by_mac( + txn: &mut PgConnection, + bmc_mac: MacAddress, +) -> DatabaseResult> { + let query = "DELETE FROM bmc_redfish_sessions + WHERE bmc_mac_address = $1 + RETURNING spiffe_service_id, bmc_mac_address, session_odata_id, issued_at"; + + sqlx::query_as(query) + .bind(bmc_mac) + .fetch_all(txn) + .await + .map_err(|e| DatabaseError::query(query, e)) +} diff --git a/crates/api-db/src/lib.rs b/crates/api-db/src/lib.rs index 26a2d5fa68..fbc15e8850 100644 --- a/crates/api-db/src/lib.rs +++ b/crates/api-db/src/lib.rs @@ -21,6 +21,7 @@ pub mod attestation; pub mod bmc_metadata; +pub mod bmc_redfish_session; pub mod carbide_version; pub mod compute_allocation; pub mod db_read; diff --git a/crates/api-model/src/bmc_redfish_session.rs b/crates/api-model/src/bmc_redfish_session.rs new file mode 100644 index 0000000000..899f8b7134 --- /dev/null +++ b/crates/api-model/src/bmc_redfish_session.rs @@ -0,0 +1,28 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +use mac_address::MacAddress; +use sqlx::types::chrono::{DateTime, Utc}; + +/// A row in the `bmc_redfish_sessions` table. +#[derive(Debug, Clone, sqlx::FromRow)] +pub struct StoredSession { + pub spiffe_service_id: String, + pub bmc_mac_address: MacAddress, + pub session_odata_id: String, + pub issued_at: DateTime, +} diff --git a/crates/api-model/src/lib.rs b/crates/api-model/src/lib.rs index d21208fbfb..3de0ce7df9 100644 --- a/crates/api-model/src/lib.rs +++ b/crates/api-model/src/lib.rs @@ -37,6 +37,7 @@ pub mod address_selection_strategy; pub mod allocation_type; pub mod attestation; pub mod bmc_info; +pub mod bmc_redfish_session; pub mod component_manager; pub mod compute_allocation; pub mod controller_outcome; diff --git a/crates/api/Cargo.toml b/crates/api/Cargo.toml index 98cb44550e..18135dfccf 100644 --- a/crates/api/Cargo.toml +++ b/crates/api/Cargo.toml @@ -131,6 +131,7 @@ lazy_static = { workspace = true } libredfish = { workspace = true } librms = { workspace = true } mac_address = { workspace = true } +nv-redfish = { workspace = true, features = ["session-service"] } num_cpus = { workspace = true } oauth2 = { default-features = false, workspace = true } oid-registry = { workspace = true } diff --git a/crates/api/src/api.rs b/crates/api/src/api.rs index 9c151828c8..3f620d2948 100644 --- a/crates/api/src/api.rs +++ b/crates/api/src/api.rs @@ -66,6 +66,7 @@ pub struct Api { pub(crate) credential_manager: Arc, pub(crate) certificate_provider: Arc, pub(crate) redfish_pool: Arc, + pub(crate) bmc_session_manager: Arc, pub(crate) eth_data: EthVirtData, pub(crate) common_pools: Arc, pub(crate) ib_fabric_manager: Arc, diff --git a/crates/api/src/cfg/README.md b/crates/api/src/cfg/README.md index defc651db8..6cbfaa296f 100644 --- a/crates/api/src/cfg/README.md +++ b/crates/api/src/cfg/README.md @@ -35,6 +35,7 @@ applicable. | `networks` | `Option>` | — | Networks created at startup. Alternative: `CreateNetworkSegment` gRPC. | | `dpu_ipmi_tool_impl` | `Option` | — | IPMI tool implementation for DPU power control (`"prod"` or `"fake"`). | | `dpu_ipmi_reboot_attempts` | `Option` | — | Retry count when IPMI errors during DPU reboot. | +| `bmc_session_lockout_threshold` | `u32` | `3` | Consecutive BMC HTTP 401/403 responses before session-token login attempts stop for that BMC. | | `ib_fabrics` | `HashMap` | `{}` | InfiniBand fabrics managed by the site. Currently only one fabric is supported. | | `initial_domain_name` | `Option` | — | Domain to create if none exist. Most sites use a single domain. | | `initial_dpu_agent_upgrade_policy` | `Option` | — | Policy for nico-dpu-agent upgrades. Also settable via `nico-admin-cli`. | diff --git a/crates/api/src/cfg/file.rs b/crates/api/src/cfg/file.rs index c41626dd1e..abbb7b9bfe 100644 --- a/crates/api/src/cfg/file.rs +++ b/crates/api/src/cfg/file.rs @@ -201,6 +201,13 @@ pub struct CarbideConfig { /// DPU reboot. pub dpu_ipmi_reboot_attempts: Option, + /// Number of consecutive HTTP 401/403 responses from a BMC before the + /// session-token path stops attempting to log in to that BMC, to avoid + /// exhausting the BMC root account's retry budget. + /// Default is 3. + #[serde(default = "default_bmc_session_lockout_threshold")] + pub bmc_session_lockout_threshold: u32, + /// Infiniband fabrics managed by the site /// Note: At the moment, only a single fabric is supported #[serde(default)] @@ -1487,6 +1494,10 @@ fn default_max_database_connections() -> u32 { 1000 } +pub const fn default_bmc_session_lockout_threshold() -> u32 { + 3 +} + /// DpuConfig related internal configuration #[derive(Clone, Debug, Serialize)] pub struct DpuConfig { @@ -2569,6 +2580,10 @@ mod tests { assert!(config.pools.is_none()); assert!(config.ib_config.is_none()); assert!(config.ib_fabrics.is_empty()); + assert_eq!( + config.bmc_session_lockout_threshold, + default_bmc_session_lockout_threshold() + ); assert!(config.vpc_peering_policy.is_none()); assert!(config.site_explorer.enabled.load(AtomicOrdering::Relaxed)); assert!(config.initial_objects_file.is_none()); @@ -2617,6 +2632,7 @@ mod tests { assert_eq!(config.asn, 777); assert_eq!(config.dhcp_servers, vec!["99.101.102.103".to_string()]); assert!(config.route_servers.is_empty()); + assert_eq!(config.bmc_session_lockout_threshold, 5); assert_eq!(config.vpc_peering_policy, Some(VpcPeeringPolicy::Exclusive)); assert_eq!(config.vpc_peering_policy_on_existing, None); assert_eq!( @@ -2756,6 +2772,7 @@ mod tests { assert_eq!(config.database_url, "postgres://a:b@postgresql".to_string()); assert_eq!(config.max_database_connections, 1222); assert_eq!(config.asn, 123); + assert_eq!(config.bmc_session_lockout_threshold, 4); assert_eq!( config.dhcp_servers, vec!["1.2.3.4".to_string(), "5.6.7.8".to_string()] @@ -3070,6 +3087,7 @@ mod tests { assert_eq!(config.database_url, "postgres://a:b@postgresql".to_string()); assert_eq!(config.max_database_connections, 1333); assert_eq!(config.asn, 777); + assert_eq!(config.bmc_session_lockout_threshold, 5); assert_eq!(config.dhcp_servers, vec!["99.101.102.103".to_string()]); assert_eq!(config.route_servers, vec!["9.10.11.12".to_string()]); assert_eq!( diff --git a/crates/api/src/cfg/test_data/full_config.toml b/crates/api/src/cfg/test_data/full_config.toml index 3d7c2ba6a9..0452026af5 100644 --- a/crates/api/src/cfg/test_data/full_config.toml +++ b/crates/api/src/cfg/test_data/full_config.toml @@ -11,6 +11,7 @@ initial_dpu_agent_upgrade_policy = "off" machine_update_run_interval = 60 dpu_ipmi_tool_impl = "fake" dpu_ipmi_reboot_attempts = 1 +bmc_session_lockout_threshold = 4 max_find_by_ids = 75 vpc_peering_policy = "exclusive" vpc_peering_policy_on_existing = "mixed" diff --git a/crates/api/src/cfg/test_data/site_config.toml b/crates/api/src/cfg/test_data/site_config.toml index 4b3d454700..5ff757bf45 100644 --- a/crates/api/src/cfg/test_data/site_config.toml +++ b/crates/api/src/cfg/test_data/site_config.toml @@ -4,6 +4,7 @@ rapid_iterations = true max_database_connections = 1333 max_find_by_ids = 50 dpu_network_monitor_pinger_type = "OobNetBind" +bmc_session_lockout_threshold = 5 vpc_peering_policy = "exclusive" diff --git a/crates/api/src/credentials/bmc_session_manager.rs b/crates/api/src/credentials/bmc_session_manager.rs new file mode 100644 index 0000000000..abeadffbb5 --- /dev/null +++ b/crates/api/src/credentials/bmc_session_manager.rs @@ -0,0 +1,892 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +//! Per-SPIFFE-caller BMC Redfish session token manager. +//! +//! Issues one live `X-Auth-Token` per `(SPIFFE service id, BMC MAC)` pair by +//! calling `nv-redfish` directly. Every call to [`BmcSessionManager::rotate`] +//! revokes the prior session (if any) and creates a new one. +//! +//! ## Persistence model +//! +//! The outstanding session `@odata.id` for each pair is persisted in the +//! `bmc_redfish_sessions` Postgres table behind the [`BmcSessionStore`] +//! trait. The `X-Auth-Token` itself is returned to the caller once and is +//! never stored anywhere by this manager. The DB row exists purely so the +//! next rotate (and [`BmcSessionManager::flush_mac`]) knows which session +//! resource to `DELETE` on the BMC before issuing a new one. +//! +//! Multiple API replicas may concurrently rotate the same pair. We do not +//! serialize across replicas: in the worst case a race produces one orphan +//! session on the BMC that expires via the BMC's idle-timeout. Within a +//! single replica, a per-key `tokio::sync::Mutex` serializes rotates for the +//! same `(spiffe, mac)` pair. +//! +//! ## Lifecycle hooks +//! +//! * [`BmcSessionManager::flush_mac`] -- intended for use when the BMC root +//! credentials are deleted. Drops all rows for that MAC; does not contact +//! the BMC (the credentials needed to authenticate the DELETE were just +//! wiped). Orphans expire via the BMC idle timer. +//! * [`BmcSessionManager::note_credentials_updated`] -- intended for use +//! when the BMC root credentials are set or rotated. Rows are +//! intentionally retained so the next rotate revokes the now-stale +//! sessions with the new credentials before issuing a fresh one. +//! +//! ## Lockout-avoidance circuit breaker +//! +//! Each [`BmcSessionManager`] tracks an in-memory per-BMC counter of +//! consecutive HTTP 401/403 responses returned during session creation. +//! Once that counter reaches the configured threshold the breaker trips and +//! any subsequent [`BmcSessionManager::rotate`] call for the same BMC +//! short-circuits with [`BmcSessionError::AvoidLockout`] rather than +//! attempting another login (which could exhaust the BMC root account's +//! retry budget). The breaker is cleared by: +//! * a successful [`BmcSessionManager::rotate`] (online recovery), +//! * [`BmcSessionManager::flush_mac`] (credentials deleted), or +//! * [`BmcSessionManager::note_credentials_updated`] (credentials set or +//! rotated). +//! +//! Breaker state is intentionally not persisted: after a process restart a +//! single login attempt per BMC may be burned before the breaker re-trips, +//! and other replicas track lockouts independently. Network errors, +//! timeouts, 5xx responses, and deserialization failures do **not** count +//! against the threshold. + +use std::collections::HashMap; +use std::fmt; +use std::net::SocketAddr; +use std::sync::Arc; +use std::time::Instant; + +use async_trait::async_trait; +use carbide_redfish::nv_redfish::{BmcError, NvRedfishClientPool, RedfishBmc}; +use db::bmc_redfish_session; +use forge_secrets::credentials::{ + BmcCredentialType, CredentialKey, CredentialManager, Credentials, +}; +use mac_address::MacAddress; +use model::bmc_redfish_session::StoredSession; +use nv_redfish::Error as NvError; +use nv_redfish::core::{EntityTypeRef as _, ODataId}; +use nv_redfish::session_service::SessionCreate; +use sqlx::PgPool; +use tokio::sync::Mutex; + +/// Errors surfaced by [`BmcSessionManager`]. +#[derive(thiserror::Error, Debug)] +pub enum BmcSessionError { + /// No BMC root credentials are stored for this MAC; cannot create a + /// session. + #[error("BMC root credentials are not configured for MAC {0}")] + MissingRootCredentials(MacAddress), + + /// Failure interacting with the BMC via nv-redfish (connect, create, + /// or delete failed for a reason other than auth). + #[error("Redfish error talking to BMC at {bmc_addr}: {detail}")] + Redfish { + bmc_addr: SocketAddr, + detail: String, + }, + + /// Failure reading the BMC root credentials from the credential store. + #[error("Credential store error: {0}")] + CredentialStore(String), + + /// Failure persisting or reading session metadata from the + /// [`BmcSessionStore`]. + #[error("BMC session store error: {0}")] + Store(String), + + /// The lockout-avoidance circuit breaker is tripped for this BMC and + /// we refuse to attempt another session creation until the BMC root + /// credentials are deleted or updated. + #[error( + "BMC {bmc_mac} is locked out after {consecutive_unauthorized} consecutive \ + unauthorized responses (last HTTP status {last_status}); update BMC root \ + credentials to recover" + )] + AvoidLockout { + bmc_mac: MacAddress, + consecutive_unauthorized: u32, + last_status: u16, + }, +} + +/// A live Redfish session that we issued to a caller. The `token` is +/// transient: it is returned exactly once and never persisted by us. +#[derive(Clone)] +pub struct SessionEntry { + /// `X-Auth-Token` value returned by the BMC on session creation. + pub token: String, + /// `@odata.id` of the session resource on the BMC; used to revoke the + /// session via `DELETE` on the next rotate. + pub session_odata_id: ODataId, +} + +impl fmt::Debug for SessionEntry { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("SessionEntry") + .field("token", &"[REDACTED]") + .field("session_odata_id", &self.session_odata_id) + .finish() + } +} + +/// Per-BMC lockout-avoidance state. +#[derive(Debug, Clone)] +struct LockoutState { + consecutive_unauthorized: u32, + last_status: u16, + /// `Some(when)` once the breaker has tripped; subsequent rotate calls + /// short-circuit until the state is cleared. + tripped_at: Option, +} + +/// Persistence layer for outstanding Redfish sessions. Wraps DB errors as +/// [`BmcSessionError::Store`] so the manager's surface stays uniform. +#[async_trait] +pub trait BmcSessionStore: Send + Sync { + async fn get( + &self, + spiffe_service_id: &str, + bmc_mac: MacAddress, + ) -> Result, BmcSessionError>; + + async fn upsert( + &self, + spiffe_service_id: &str, + bmc_mac: MacAddress, + session_odata_id: &str, + ) -> Result<(), BmcSessionError>; + + async fn delete_by_mac(&self, bmc_mac: MacAddress) -> Result<(), BmcSessionError>; +} + +/// Postgres-backed [`BmcSessionStore`] used in production. +pub struct PgBmcSessionStore { + pool: PgPool, +} + +impl PgBmcSessionStore { + pub fn new(pool: PgPool) -> Self { + Self { pool } + } +} + +#[async_trait] +impl BmcSessionStore for PgBmcSessionStore { + async fn get( + &self, + spiffe_service_id: &str, + bmc_mac: MacAddress, + ) -> Result, BmcSessionError> { + let mut conn = self + .pool + .acquire() + .await + .map_err(|err| BmcSessionError::Store(err.to_string()))?; + bmc_redfish_session::get(conn.as_mut(), spiffe_service_id, bmc_mac) + .await + .map_err(|err| BmcSessionError::Store(err.to_string())) + } + + async fn upsert( + &self, + spiffe_service_id: &str, + bmc_mac: MacAddress, + session_odata_id: &str, + ) -> Result<(), BmcSessionError> { + let mut conn = self + .pool + .acquire() + .await + .map_err(|err| BmcSessionError::Store(err.to_string()))?; + bmc_redfish_session::upsert(conn.as_mut(), spiffe_service_id, bmc_mac, session_odata_id) + .await + .map_err(|err| BmcSessionError::Store(err.to_string())) + } + + async fn delete_by_mac(&self, bmc_mac: MacAddress) -> Result<(), BmcSessionError> { + let mut conn = self + .pool + .acquire() + .await + .map_err(|err| BmcSessionError::Store(err.to_string()))?; + bmc_redfish_session::delete_by_mac(conn.as_mut(), bmc_mac) + .await + .map(|_| ()) + .map_err(|err| BmcSessionError::Store(err.to_string())) + } +} + +type InFlightKey = (String, MacAddress); + +pub struct BmcSessionManager { + redfish_pool: Arc, + credential_manager: Arc, + store: Arc, + in_flight: Mutex>>>, + lockouts: Mutex>, + lockout_threshold: u32, +} + +impl BmcSessionManager { + pub fn new( + redfish_pool: Arc, + credential_manager: Arc, + store: Arc, + lockout_threshold: u32, + ) -> Self { + Self { + redfish_pool, + credential_manager, + store, + in_flight: Mutex::new(HashMap::new()), + lockouts: Mutex::new(HashMap::new()), + lockout_threshold: lockout_threshold.max(1), + } + } + + /// Revoke the prior session (if any) for the given `(spiffe_service_id, + /// bmc_mac)` pair, then create a brand new session against the BMC at + /// `bmc_addr` and return its token. + pub async fn rotate( + &self, + spiffe_service_id: &str, + bmc_mac: MacAddress, + bmc_addr: SocketAddr, + ) -> Result { + // Ensure what no single combination of spiffe/mac can rotate key concurrently + let key: InFlightKey = (spiffe_service_id.to_owned(), bmc_mac); + let key_lock = self.acquire_key_lock(key).await; + let _guard = key_lock.lock().await; + + if let Some(err) = self.check_not_locked_out(bmc_mac).await { + return Err(err); + } + + let creds = self.bmc_root_credentials(bmc_mac).await?; + let (username, password) = match &creds { + Credentials::UsernamePassword { username, password } => { + (username.clone(), password.clone()) + } + }; + + let service_root = match self.redfish_pool.service_root(bmc_addr, creds).await { + Ok(root) => root, + Err(err) => return Err(self.classify_and_map(err, bmc_mac, bmc_addr).await), + }; + + let session_service = match service_root.session_service().await { + Ok(Some(svc)) => svc, + Ok(None) => { + return Err(BmcSessionError::Redfish { + bmc_addr, + detail: "BMC does not expose Redfish SessionService".to_string(), + }); + } + Err(err) => return Err(self.classify_and_map(err, bmc_mac, bmc_addr).await), + }; + + let sessions = match session_service.sessions().await { + Ok(Some(coll)) => coll, + Ok(None) => { + return Err(BmcSessionError::Redfish { + bmc_addr, + detail: "BMC SessionService does not expose a Sessions collection".to_string(), + }); + } + Err(err) => return Err(self.classify_and_map(err, bmc_mac, bmc_addr).await), + }; + + // We try to revoke previous session, best effort, if we fail we still try to + // create a new session + if let Some(prior) = self.store.get(spiffe_service_id, bmc_mac).await? { + let prior_id = ODataId::from(prior.session_odata_id); + match sessions.members().await { + Ok(members) => { + if let Some(prior_session) = members + .into_iter() + .find(|m| m.raw().odata_id() == &prior_id) + { + if let Err(err) = prior_session.delete().await { + tracing::warn!( + error = ?err, + %bmc_mac, + spiffe_service_id, + session = %prior_id, + "failed to revoke prior BMC session; \ + continuing with new session creation" + ); + } + } else { + tracing::info!( + %bmc_mac, + spiffe_service_id, + session = %prior_id, + "prior BMC session no longer present in Sessions collection; \ + skipping revoke" + ); + } + } + Err(err) => { + tracing::warn!( + error = ?err, + %bmc_mac, + spiffe_service_id, + "failed to list BMC sessions for prior-session revoke; continuing" + ); + } + } + } + + let created = match sessions + .create_session(&SessionCreate::builder(username, password).build()) + .await + { + Ok(s) => s, + Err(err) => return Err(self.classify_and_map(err, bmc_mac, bmc_addr).await), + }; + + let token = created + .auth_token() + .ok_or_else(|| BmcSessionError::Redfish { + bmc_addr, + detail: "BMC did not return an X-Auth-Token on session creation".to_string(), + })? + .to_string(); + let location = created + .location() + .cloned() + .ok_or_else(|| BmcSessionError::Redfish { + bmc_addr, + detail: "BMC did not return a session @odata.id on session creation".to_string(), + })?; + + // If persist fails we revoke token to avoid exhaust of session limit + if let Err(store_err) = self + .store + .upsert(spiffe_service_id, bmc_mac, &location.to_string()) + .await + { + if let Err(revoke_err) = created.delete().await { + tracing::warn!( + error = ?revoke_err, + %bmc_mac, + spiffe_service_id, + session = %location, + "failed to revoke just-created session after store upsert failed; \ + it will leak until BMC idle timeout" + ); + } + return Err(store_err); + } + + self.clear_lockout(bmc_mac).await; + + Ok(SessionEntry { + token, + session_odata_id: location, + }) + } + + async fn classify_and_map( + &self, + err: NvError, + bmc_mac: MacAddress, + bmc_addr: SocketAddr, + ) -> BmcSessionError { + if let Some(status) = classify_unauthorized(&err) + && let Some(lockout_err) = self.record_unauthorized(bmc_mac, status).await + { + return lockout_err; + } + BmcSessionError::Redfish { + bmc_addr, + detail: err.to_string(), + } + } + + /// Drop all session rows for `bmc_mac` and clear any lockout state. + pub async fn flush_mac(&self, bmc_mac: MacAddress) { + if let Err(err) = self.store.delete_by_mac(bmc_mac).await { + tracing::warn!( + error = %err, + %bmc_mac, + "failed to delete BMC session rows during flush_mac; continuing" + ); + } + self.clear_lockout(bmc_mac).await; + } + + /// Reset Circtuit Breaker + pub async fn note_credentials_updated(&self, bmc_mac: MacAddress) { + self.clear_lockout(bmc_mac).await; + } + + pub async fn check_not_locked_out(&self, bmc_mac: MacAddress) -> Option { + let lockouts = self.lockouts.lock().await; + let state = lockouts.get(&bmc_mac)?; + if state.tripped_at.is_some() { + Some(BmcSessionError::AvoidLockout { + bmc_mac, + consecutive_unauthorized: state.consecutive_unauthorized, + last_status: state.last_status, + }) + } else { + None + } + } + + /// Checks and return any authorization/authentication related error, as well as update lockouts + async fn record_unauthorized( + &self, + bmc_mac: MacAddress, + status: u16, + ) -> Option { + let mut lockouts = self.lockouts.lock().await; + let entry = lockouts.entry(bmc_mac).or_insert(LockoutState { + consecutive_unauthorized: 0, + last_status: status, + tripped_at: None, + }); + entry.consecutive_unauthorized = entry.consecutive_unauthorized.saturating_add(1); + entry.last_status = status; + if entry.consecutive_unauthorized >= self.lockout_threshold && entry.tripped_at.is_none() { + entry.tripped_at = Some(Instant::now()); + tracing::warn!( + %bmc_mac, + status, + consecutive_unauthorized = entry.consecutive_unauthorized, + threshold = self.lockout_threshold, + "BmcSessionManager: lockout-avoidance breaker tripped" + ); + return Some(BmcSessionError::AvoidLockout { + bmc_mac, + consecutive_unauthorized: entry.consecutive_unauthorized, + last_status: status, + }); + } + None + } + + async fn clear_lockout(&self, bmc_mac: MacAddress) { + if self.lockouts.lock().await.remove(&bmc_mac).is_some() { + tracing::info!(%bmc_mac, "BmcSessionManager: lockout-avoidance breaker cleared"); + } + } + + #[cfg(test)] + pub(crate) async fn force_trip_for_test( + &self, + bmc_mac: MacAddress, + consecutive_unauthorized: u32, + last_status: u16, + ) { + self.lockouts.lock().await.insert( + bmc_mac, + LockoutState { + consecutive_unauthorized: consecutive_unauthorized.max(1), + last_status, + tripped_at: Some(Instant::now()), + }, + ); + } + + async fn acquire_key_lock(&self, key: InFlightKey) -> Arc> { + let mut in_flight = self.in_flight.lock().await; + in_flight + .entry(key) + .or_insert_with(|| Arc::new(Mutex::new(()))) + .clone() + } + + async fn bmc_root_credentials( + &self, + bmc_mac: MacAddress, + ) -> Result { + self.credential_manager + .get_credentials(&CredentialKey::BmcCredentials { + credential_type: BmcCredentialType::BmcRoot { + bmc_mac_address: bmc_mac, + }, + }) + .await + .map_err(|err| BmcSessionError::CredentialStore(err.to_string()))? + .ok_or(BmcSessionError::MissingRootCredentials(bmc_mac)) + } +} + +pub fn classify_unauthorized(err: &NvError) -> Option { + let NvError::Bmc(BmcError::InvalidResponse { status, .. }) = err else { + return None; + }; + if *status == reqwest::StatusCode::UNAUTHORIZED || *status == reqwest::StatusCode::FORBIDDEN { + Some(status.as_u16()) + } else { + None + } +} + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + use std::sync::Arc; + + use arc_swap::ArcSwap; + use async_trait::async_trait; + use forge_secrets::credentials::{Credentials, TestCredentialManager}; + use mac_address::MacAddress; + use sqlx::types::chrono::Utc; + use tokio::sync::Mutex; + + use super::{BmcSessionError, BmcSessionManager, BmcSessionStore, InFlightKey, StoredSession}; + + fn mac(byte: u8) -> MacAddress { + MacAddress::from([byte, 0, 0, 0, 0, 1]) + } + + const TEST_LOCKOUT_THRESHOLD: u32 = 3; + + #[derive(Default)] + struct InMemoryBmcSessionStore { + rows: Mutex>, + } + + impl InMemoryBmcSessionStore { + fn new() -> Arc { + Arc::new(Self::default()) + } + } + + #[async_trait] + impl BmcSessionStore for InMemoryBmcSessionStore { + async fn get( + &self, + spiffe_service_id: &str, + bmc_mac: MacAddress, + ) -> Result, BmcSessionError> { + Ok(self + .rows + .lock() + .await + .get(&(spiffe_service_id.to_owned(), bmc_mac)) + .cloned()) + } + + async fn upsert( + &self, + spiffe_service_id: &str, + bmc_mac: MacAddress, + session_odata_id: &str, + ) -> Result<(), BmcSessionError> { + self.rows.lock().await.insert( + (spiffe_service_id.to_owned(), bmc_mac), + StoredSession { + spiffe_service_id: spiffe_service_id.to_owned(), + bmc_mac_address: bmc_mac, + session_odata_id: session_odata_id.to_owned(), + issued_at: Utc::now(), + }, + ); + Ok(()) + } + + async fn delete_by_mac(&self, bmc_mac: MacAddress) -> Result<(), BmcSessionError> { + self.rows.lock().await.retain(|(_, m), _| *m != bmc_mac); + Ok(()) + } + } + + fn manager_with_creds() -> (Arc, Arc) { + manager_with_creds_and_threshold(TEST_LOCKOUT_THRESHOLD) + } + + fn manager_with_creds_and_threshold( + threshold: u32, + ) -> (Arc, Arc) { + let bmc_proxy = Arc::new(ArcSwap::new(Arc::new(None))); + let redfish_pool = carbide_redfish::nv_redfish::new_pool(bmc_proxy); + let credential_manager = + Arc::new(TestCredentialManager::new(Credentials::UsernamePassword { + username: "root".to_string(), + password: "password".to_string(), + })); + let store = InMemoryBmcSessionStore::new(); + let manager = Arc::new(BmcSessionManager::new( + redfish_pool, + credential_manager, + store.clone(), + threshold, + )); + (manager, store) + } + + #[test] + fn odata_id_last_segment_returns_session_id() { + let id = nv_redfish::core::ODataId::from( + "/redfish/v1/SessionService/Sessions/abc123".to_string(), + ); + assert_eq!(id.last_segment(), Some("abc123")); + } + + async fn seed_row( + store: &InMemoryBmcSessionStore, + spiffe_service_id: &str, + bmc_mac: MacAddress, + session_odata_id: &str, + ) { + store + .upsert(spiffe_service_id, bmc_mac, session_odata_id) + .await + .expect("in-memory upsert never fails"); + } + + #[tokio::test] + async fn flush_mac_deletes_store_rows_and_clears_lockout() { + let (manager, store) = manager_with_creds(); + let mac_a = mac(0xAA); + let mac_b = mac(0xBB); + + seed_row(&store, "svc-1", mac_a, "/sessions/1").await; + seed_row(&store, "svc-2", mac_a, "/sessions/2").await; + seed_row(&store, "svc-1", mac_b, "/sessions/3").await; + manager.force_trip_for_test(mac_a, 3, 401).await; + + manager.flush_mac(mac_a).await; + + // mac_a rows are gone, mac_b survives. + let rows = store.rows.lock().await; + assert_eq!(rows.len(), 1); + assert!(rows.keys().all(|(_, m)| *m == mac_b)); + drop(rows); + // lockout was cleared along with the rows. + assert!(manager.check_not_locked_out(mac_a).await.is_none()); + } + + #[tokio::test] + async fn note_credentials_updated_retains_store_rows() { + let (manager, store) = manager_with_creds(); + let bmc_mac = mac(0xCC); + seed_row(&store, "svc-1", bmc_mac, "/sessions/keep-me").await; + manager.force_trip_for_test(bmc_mac, 5, 403).await; + + manager.note_credentials_updated(bmc_mac).await; + + // Row is still present so the next rotate can revoke it with the + // new creds; the breaker has been cleared. + let rows = store.rows.lock().await; + assert!(rows.contains_key(&("svc-1".to_string(), bmc_mac))); + drop(rows); + assert!(manager.check_not_locked_out(bmc_mac).await.is_none()); + } + + #[tokio::test] + async fn in_memory_store_upsert_replaces_existing_row() { + let store = InMemoryBmcSessionStore::new(); + let bmc_mac = mac(0xDD); + store.upsert("svc", bmc_mac, "/sessions/v1").await.unwrap(); + store.upsert("svc", bmc_mac, "/sessions/v2").await.unwrap(); + let row = store + .get("svc", bmc_mac) + .await + .expect("ok") + .expect("row present"); + assert_eq!(row.session_odata_id, "/sessions/v2"); + } + + #[tokio::test] + async fn rotate_returns_missing_credentials_when_unset() { + let bmc_proxy = Arc::new(ArcSwap::new(Arc::new(None))); + let redfish_pool = carbide_redfish::nv_redfish::new_pool(bmc_proxy); + let credential_manager = Arc::new(TestCredentialManager::default()); + let store = InMemoryBmcSessionStore::new(); + let manager = BmcSessionManager::new( + redfish_pool, + credential_manager, + store, + TEST_LOCKOUT_THRESHOLD, + ); + + let bmc_mac = mac(0xCE); + let bmc_addr = "127.0.0.1:9999".parse().unwrap(); + let err = manager + .rotate("svc-x", bmc_mac, bmc_addr) + .await + .expect_err("should fail with missing root credentials"); + match err { + super::BmcSessionError::MissingRootCredentials(got) => assert_eq!(got, bmc_mac), + other => panic!("unexpected error: {other:?}"), + } + } + + #[tokio::test] + async fn in_flight_lock_is_per_key() { + let (manager, _store) = manager_with_creds(); + let key_a: InFlightKey = ("svc-1".to_string(), mac(0x01)); + let key_b: InFlightKey = ("svc-2".to_string(), mac(0x01)); + let lock_a = manager.acquire_key_lock(key_a.clone()).await; + let lock_b = manager.acquire_key_lock(key_b.clone()).await; + assert!(!Arc::ptr_eq(&lock_a, &lock_b)); + let lock_a_again = manager.acquire_key_lock(key_a).await; + assert!(Arc::ptr_eq(&lock_a, &lock_a_again)); + let _g = lock_b.lock().await; + } + + #[tokio::test] + async fn record_unauthorized_returns_none_below_threshold() { + let (manager, _store) = manager_with_creds_and_threshold(3); + let bmc_mac = mac(0xDE); + for _ in 0..2 { + assert!(manager.record_unauthorized(bmc_mac, 401).await.is_none()); + assert!( + manager.check_not_locked_out(bmc_mac).await.is_none(), + "breaker should not trip below threshold" + ); + } + let state = manager + .lockouts + .lock() + .await + .get(&bmc_mac) + .cloned() + .expect("state should exist after recording"); + assert_eq!(state.consecutive_unauthorized, 2); + assert!(state.tripped_at.is_none()); + } + + #[tokio::test] + async fn record_unauthorized_trips_at_threshold() { + let (manager, _store) = manager_with_creds_and_threshold(3); + let bmc_mac = mac(0xDE); + assert!(manager.record_unauthorized(bmc_mac, 401).await.is_none()); + assert!(manager.record_unauthorized(bmc_mac, 401).await.is_none()); + let trip = manager + .record_unauthorized(bmc_mac, 403) + .await + .expect("third unauthorized should trip the breaker"); + match trip { + super::BmcSessionError::AvoidLockout { + bmc_mac: got_mac, + consecutive_unauthorized, + last_status, + } => { + assert_eq!(got_mac, bmc_mac); + assert_eq!(consecutive_unauthorized, 3); + assert_eq!(last_status, 403); + } + other => panic!("unexpected error variant: {other:?}"), + } + let still = manager + .check_not_locked_out(bmc_mac) + .await + .expect("breaker should remain tripped"); + assert!(matches!(still, super::BmcSessionError::AvoidLockout { .. })); + } + + #[tokio::test] + async fn record_unauthorized_only_emits_avoid_lockout_on_the_tripping_request() { + let (manager, _store) = manager_with_creds_and_threshold(2); + let bmc_mac = mac(0xDE); + assert!(manager.record_unauthorized(bmc_mac, 401).await.is_none()); + let trip = manager.record_unauthorized(bmc_mac, 401).await; + assert!(matches!( + trip, + Some(super::BmcSessionError::AvoidLockout { .. }) + )); + let follow_up = manager.record_unauthorized(bmc_mac, 401).await; + assert!( + follow_up.is_none(), + "second AvoidLockout should not be emitted from record_unauthorized" + ); + } + + #[tokio::test] + async fn clear_lockout_removes_tripped_state() { + let (manager, _store) = manager_with_creds_and_threshold(1); + let bmc_mac = mac(0xEE); + manager.force_trip_for_test(bmc_mac, 1, 401).await; + assert!(manager.check_not_locked_out(bmc_mac).await.is_some()); + manager.clear_lockout(bmc_mac).await; + assert!(manager.check_not_locked_out(bmc_mac).await.is_none()); + assert!(!manager.lockouts.lock().await.contains_key(&bmc_mac)); + } + + #[tokio::test] + async fn rotate_short_circuits_when_breaker_tripped() { + let (manager, _store) = manager_with_creds_and_threshold(1); + let bmc_mac = mac(0xF1); + manager.force_trip_for_test(bmc_mac, 7, 401).await; + + let bmc_addr = "127.0.0.1:9999".parse().unwrap(); + let err = manager + .rotate("svc-locked", bmc_mac, bmc_addr) + .await + .expect_err("rotate must refuse to contact a locked-out BMC"); + match err { + super::BmcSessionError::AvoidLockout { + bmc_mac: got, + consecutive_unauthorized, + last_status, + } => { + assert_eq!(got, bmc_mac); + assert_eq!(consecutive_unauthorized, 7); + assert_eq!(last_status, 401); + } + other => panic!("unexpected error: {other:?}"), + } + } + + #[tokio::test] + async fn concurrent_unauthorized_records_trip_exactly_once() { + let (manager, _store) = manager_with_creds_and_threshold(3); + let bmc_mac = mac(0xF2); + + let mut handles = Vec::new(); + for _ in 0..16 { + let manager = manager.clone(); + handles.push(tokio::spawn(async move { + manager.record_unauthorized(bmc_mac, 401).await + })); + } + + let mut trips = 0; + for h in handles { + if matches!( + h.await.expect("task panicked"), + Some(super::BmcSessionError::AvoidLockout { .. }) + ) { + trips += 1; + } + } + assert_eq!( + trips, 1, + "exactly one record_unauthorized should report a trip" + ); + + let state = manager + .lockouts + .lock() + .await + .get(&bmc_mac) + .cloned() + .expect("state present after concurrent records"); + assert!(state.tripped_at.is_some()); + assert!(state.consecutive_unauthorized >= 3); + } +} diff --git a/crates/api/src/credentials/mod.rs b/crates/api/src/credentials/mod.rs index a464c23099..d1a7efc001 100644 --- a/crates/api/src/credentials/mod.rs +++ b/crates/api/src/credentials/mod.rs @@ -21,6 +21,12 @@ use carbide_uuid::machine::MachineId; use forge_secrets::credentials::{BmcCredentialType, CredentialKey, CredentialWriter}; use mac_address::MacAddress; +pub mod bmc_session_manager; + +pub use bmc_session_manager::{ + BmcSessionError, BmcSessionManager, BmcSessionStore, PgBmcSessionStore, +}; + use crate::{CarbideError, CarbideResult}; pub struct UpdateCredentials { diff --git a/crates/api/src/handlers/credential.rs b/crates/api/src/handlers/credential.rs index 79c7085686..ff9f90774e 100644 --- a/crates/api/src/handlers/credential.rs +++ b/crates/api/src/handlers/credential.rs @@ -17,9 +17,10 @@ use std::fs::File; use std::io::Write; +use std::net::{IpAddr, SocketAddr}; use ::rpc::errors::RpcDataConversionError; -use ::rpc::forge as rpc; +use ::rpc::forge::{self as rpc}; use carbide_nvlink_manager::DEFAULT_NMX_M_NAME; use forge_secrets::credentials::{ BgpCredentialType, BmcCredentialType, CredentialKey, CredentialType, Credentials, @@ -34,6 +35,9 @@ use crate::api::Api; use crate::credentials::UpdateCredentials; use crate::handlers::utils::convert_and_log_machine_id; +/// We assume that all BMCs speak Redfish on the standard port +const BMC_REDFISH_PORT: u16 = 443; + /// Default Username for the admin BMC account. const DEFAULT_FORGE_ADMIN_BMC_USERNAME: &str = "root"; @@ -322,6 +326,7 @@ pub(crate) async fn delete_credential( .map_err(CarbideError::from)?; delete_bmc_root_credentials_by_mac(api, parsed_mac).await?; + api.bmc_session_manager.flush_mac(parsed_mac).await; } None => { return Err(CarbideError::InvalidArgument( @@ -387,14 +392,29 @@ pub(crate) async fn update_machine_credentials( .map(Response::new)?) } -/// As for now we only support UsernamePassword credentials type, -/// in future this function should support SessionToken if available +/// Issue (or rotate) a BMC Redfish session token for the SPIFFE service +/// identity making this call. +/// +/// Every call from the same SPIFFE service against the same BMC MAC revokes +/// the prior token and creates a new one. Callers without a SPIFFE service +/// identity are rejected with `PermissionDenied` pub(crate) async fn get_bmc_credentals( api: &Api, request: tonic::Request, ) -> Result, tonic::Status> { crate::api::log_request_data(&request); + let spiffe_service_id = request + .extensions() + .get::() + .and_then(|ctx| ctx.get_spiffe_service_id()) + .ok_or_else(|| { + Status::permission_denied( + "BMC session tokens are only issued to SPIFFE service identities", + ) + })? + .to_owned(); + let req = request.into_inner(); let bmc_mac_address: mac_address::MacAddress = req @@ -402,26 +422,41 @@ pub(crate) async fn get_bmc_credentals( .parse() .map_err(CarbideError::MacAddressParseError)?; - let credentials = api - .credential_manager - .get_credentials(&CredentialKey::BmcCredentials { - credential_type: BmcCredentialType::BmcRoot { bmc_mac_address }, - }) - .await - .map_err(|e| CarbideError::internal(e.to_string()))? - .ok_or_else(|| CarbideError::NotFoundError { - kind: "bmc_root_credentials", - id: req.mac_addr.clone(), + let bmc_ips = db::machine_interface::lookup_bmc_ip_by_mac_address( + &api.database_connection, + bmc_mac_address, + ) + .await?; + + let bmc_ip = bmc_ips + .iter() + .copied() + .find(IpAddr::is_ipv4) + .or_else(|| bmc_ips.first().copied()) + .ok_or_else(|| { + Status::not_found(format!( + "no BMC IP addresses recorded for MAC {bmc_mac_address}" + )) })?; - let (username, password) = match credentials { - Credentials::UsernamePassword { username, password } => (username, password), - }; + let bmc_addr = SocketAddr::new(bmc_ip, BMC_REDFISH_PORT); + + let entry = api + .bmc_session_manager + .rotate(&spiffe_service_id, bmc_mac_address, bmc_addr) + .await + .map_err(|err| match err { + crate::credentials::BmcSessionError::AvoidLockout { .. } => { + Status::failed_precondition(err.to_string()) + } + crate::credentials::BmcSessionError::Store(_) => Status::internal(err.to_string()), + other => CarbideError::internal(other.to_string()).into(), + })?; Ok(Response::new(rpc::GetBmcCredentialsResponse { credentials: Some(rpc::BmcCredentials { - r#type: Some(rpc::bmc_credentials::Type::UsernamePassword( - rpc::UsernamePassword { username, password }, + r#type: Some(rpc::bmc_credentials::Type::SessionToken( + rpc::SessionToken { token: entry.token }, )), }), })) @@ -511,7 +546,13 @@ pub(crate) async fn delete_bmc_root_credentials_by_mac( api.credential_manager .delete_credentials(&credential_key) .await - .map_err(|e| CarbideError::internal(format!("Error deleting credential for BMC: {e:?} "))) + .map_err(|e| { + CarbideError::internal(format!("Error deleting credential for BMC: {e:?} ")) + })?; + + api.bmc_session_manager.flush_mac(bmc_mac_address).await; + + Ok(()) } async fn set_bmc_root_credentials_by_mac( @@ -529,7 +570,14 @@ async fn set_bmc_root_credentials_by_mac( password: password.clone(), }; - set_bmc_credentials(api, &credential_key, &credentials).await + set_bmc_credentials(api, &credential_key, &credentials).await?; + + // Reset breaker + api.bmc_session_manager + .note_credentials_updated(bmc_mac_address) + .await; + + Ok(()) } async fn set_bmc_credentials( diff --git a/crates/api/src/setup.rs b/crates/api/src/setup.rs index eae6e56d40..33134d472b 100644 --- a/crates/api/src/setup.rs +++ b/crates/api/src/setup.rs @@ -619,6 +619,15 @@ pub async fn start_api( ListenMode::PlaintextHttp2 => ApiListenMode::PlaintextHttp2, }; + let bmc_session_store: Arc = + Arc::new(crate::credentials::PgBmcSessionStore::new(db_pool.clone())); + let bmc_session_manager = Arc::new(crate::credentials::BmcSessionManager::new( + shared_nv_redfish_pool.clone(), + credential_manager.clone(), + bmc_session_store, + carbide_config.bmc_session_lockout_threshold, + )); + let bmc_explorer = carbide_site_explorer::new_bmc_explorer( shared_redfish_pool.clone(), shared_nv_redfish_pool, @@ -733,6 +742,7 @@ pub async fn start_api( eth_data, ib_fabric_manager, redfish_pool: shared_redfish_pool, + bmc_session_manager, runtime_config: carbide_config.clone(), scout_stream_registry: ConnectionRegistry::new(), rms_client: rms_client.clone(), diff --git a/crates/api/src/tests/common/api_fixtures/mod.rs b/crates/api/src/tests/common/api_fixtures/mod.rs index 9bad166633..2c37c43d81 100644 --- a/crates/api/src/tests/common/api_fixtures/mod.rs +++ b/crates/api/src/tests/common/api_fixtures/mod.rs @@ -124,7 +124,7 @@ use crate::cfg::file::{ NetworkSecurityGroupConfig, NetworkSegmentStateControllerConfig, PowerShelfStateControllerConfig, RackStateControllerConfig, SpdmConfig, SpdmStateControllerConfig, SwitchStateControllerConfig, VmaasConfig, VpcPeeringPolicy, - default_max_find_by_ids, + default_bmc_session_lockout_threshold, default_max_find_by_ids, }; use crate::dpf::DpfOperations; use crate::ethernet_virtualization::{EthVirtData, SiteFabricPrefixList}; @@ -1200,6 +1200,7 @@ pub fn get_config() -> CarbideConfig { networks: None, dpu_ipmi_tool_impl: None, dpu_ipmi_reboot_attempts: Some(0), + bmc_session_lockout_threshold: default_bmc_session_lockout_threshold(), initial_domain_name: Some("test.com".to_string()), sitename: Some("testsite".to_string()), initial_dpu_agent_upgrade_policy: None, @@ -1621,9 +1622,18 @@ pub async fn create_test_env_with_overrides( }; let bmc_proxy = Arc::new(ArcSwap::new(None.into())); + let nv_redfish_pool = carbide_redfish::nv_redfish::new_pool(bmc_proxy); + let bmc_session_store: Arc = + Arc::new(crate::credentials::PgBmcSessionStore::new(db_pool.clone())); + let bmc_session_manager = Arc::new(crate::credentials::BmcSessionManager::new( + nv_redfish_pool.clone(), + composite_manager.clone(), + bmc_session_store, + config.bmc_session_lockout_threshold, + )); let bmc_explorer = carbide_site_explorer::new_bmc_explorer( redfish_sim.clone(), - carbide_redfish::nv_redfish::new_pool(bmc_proxy), + nv_redfish_pool, carbide_ipmi::test_support(), composite_manager.clone(), Arc::new(std::sync::atomic::AtomicBool::new(false)), @@ -1651,6 +1661,7 @@ pub async fn create_test_env_with_overrides( certificate_provider: certificate_provider.clone(), database_connection: db_pool.clone(), redfish_pool: redfish_sim.clone(), + bmc_session_manager, eth_data: eth_virt_data.clone(), common_pools: common_pools.clone(), ib_fabric_manager: ib_fabric_manager.clone(), diff --git a/crates/api/src/tests/credential.rs b/crates/api/src/tests/credential.rs index ad45c83168..401a983d1b 100644 --- a/crates/api/src/tests/credential.rs +++ b/crates/api/src/tests/credential.rs @@ -22,6 +22,7 @@ use forge_secrets::credentials::{ use rpc::forge::forge_server::Forge; use rpc::forge::{ CredentialCreationRequest, CredentialDeletionRequest, CredentialType as RpcCredentialType, + GetBmcCredentialsRequest, }; use tonic::Code; @@ -176,6 +177,29 @@ async fn test_create_and_delete_bgp_credential(pool: sqlx::PgPool) { assert_eq!(deleted, None); } +#[crate::sqlx_test] +async fn test_get_bmc_credentials_rejects_caller_without_spiffe_service_id(pool: sqlx::PgPool) { + let env = create_test_env(pool).await; + + // No `AuthContext` extension attached -> no SPIFFE service identity -> + // PermissionDenied. We do not need a real BMC, machine record, or + // populated credentials for this assertion because the SPIFFE check + // happens before any of those lookups. + let mut request = tonic::Request::new(GetBmcCredentialsRequest { + mac_addr: "11:22:33:44:55:66".to_string(), + }); + request + .extensions_mut() + .insert(crate::auth::AuthContext::default()); + + let err = env + .api + .get_bmc_credentials(request) + .await + .expect_err("caller without SPIFFE service id should be rejected"); + assert_eq!(err.code(), Code::PermissionDenied); +} + #[crate::sqlx_test] async fn test_create_bgp_credential_validates_max_password_length(pool: sqlx::PgPool) { let env = create_test_env(pool).await; diff --git a/crates/authn/src/middleware.rs b/crates/authn/src/middleware.rs index 91b7f85a62..e775ca7de3 100644 --- a/crates/authn/src/middleware.rs +++ b/crates/authn/src/middleware.rs @@ -398,6 +398,13 @@ impl AuthContext { }) } + pub fn get_spiffe_service_id(&self) -> Option<&str> { + self.principals.iter().find_map(|p| match p { + Principal::SpiffeServiceIdentifier(identifier) => Some(identifier.as_str()), + _ => None, + }) + } + pub fn get_external_user_info(&self) -> Option<&ExternalUserInfo> { self.principals.iter().find_map(|p| match p { Principal::ExternalUser(external_user_info) diff --git a/crates/bmc-mock/src/auth_router.rs b/crates/bmc-mock/src/auth_router.rs index 1277f2cf1d..b3152f43f9 100644 --- a/crates/bmc-mock/src/auth_router.rs +++ b/crates/bmc-mock/src/auth_router.rs @@ -21,7 +21,7 @@ use axum::Router; use axum::body::Body; use axum::extract::{Request, State}; use axum::http::header::WWW_AUTHENTICATE; -use axum::http::{HeaderValue, StatusCode}; +use axum::http::{HeaderValue, Method, StatusCode}; use axum::response::{IntoResponse, Response}; use axum::routing::any; use axum_extra::TypedHeader; @@ -31,15 +31,18 @@ use tracing::instrument; use crate::http::call_router_with_new_request; use crate::redfish::account_service::AccountServiceState; -use crate::redfish::{account_service, service_root}; +use crate::redfish::session_service::SessionServiceState; +use crate::redfish::{account_service, service_root, session_service}; const WWW_AUTHENTICATE_VALUE: HeaderValue = HeaderValue::from_static("Basic realm=\"bmc-mock\""); +const X_AUTH_TOKEN_HEADER: &str = "x-auth-token"; pub fn append(router: Router, authorizer: Authorizer) -> Router { let service_root_path = service_root::resource().odata_id.to_string(); let service_root_path_with_trailing_slash = format!("{service_root_path}/"); let account_service_path = account_service::resource().odata_id.to_string(); let account_service_subtree_path = format!("{account_service_path}/{{*all}}"); + let sessions_collection_path = session_service::sessions_collection().odata_id.to_string(); Router::new() .route(&service_root_path, any(process_without_auth)) @@ -47,6 +50,7 @@ pub fn append(router: Router, authorizer: Authorizer) -> Router { &service_root_path_with_trailing_slash, any(process_without_auth), ) + .route(&sessions_collection_path, any(process_sessions_collection)) .route(&account_service_path, any(process_account_service)) .route(&account_service_subtree_path, any(process_account_service)) .route("/{*all}", any(process)) @@ -65,21 +69,40 @@ async fn process_without_auth( } #[instrument(skip_all)] -async fn process_account_service( +async fn process_sessions_collection( State(mut state): State, authorization: Option>>, request: Request, ) -> Response { - match state.authorizer.authorize(authorization.as_ref(), true) { + if request.method() == Method::POST { + return state.call_inner_router(request).await; + } + match state + .authorizer + .authorize(&request, authorization.as_ref(), true) + { AuthorizationResult::Authorized => state.call_inner_router(request).await, - AuthorizationResult::Unauthorized => { - tracing::warn!( - method = request.method().as_str(), - path = request.uri().path(), - "Unauthorized request", - ); - unauthorized_response() + AuthorizationResult::Unauthorized => unauthorized_log_and_response(request), + AuthorizationResult::FactoryDefaultPasswordForbidden => { + unreachable!( + "session collection authorization does not forbid factory-default passwords" + ) } + } +} + +#[instrument(skip_all)] +async fn process_account_service( + State(mut state): State, + authorization: Option>>, + request: Request, +) -> Response { + match state + .authorizer + .authorize(&request, authorization.as_ref(), true) + { + AuthorizationResult::Authorized => state.call_inner_router(request).await, + AuthorizationResult::Unauthorized => unauthorized_log_and_response(request), AuthorizationResult::FactoryDefaultPasswordForbidden => { unreachable!("account service authorization does not forbid factory-default passwords") } @@ -92,16 +115,12 @@ async fn process( authorization: Option>>, request: Request, ) -> Response { - match state.authorizer.authorize(authorization.as_ref(), false) { + match state + .authorizer + .authorize(&request, authorization.as_ref(), false) + { AuthorizationResult::Authorized => state.call_inner_router(request).await, - AuthorizationResult::Unauthorized => { - tracing::warn!( - method = request.method().as_str(), - path = request.uri().path(), - "Unauthorized request", - ); - unauthorized_response() - } + AuthorizationResult::Unauthorized => unauthorized_log_and_response(request), AuthorizationResult::FactoryDefaultPasswordForbidden => { tracing::warn!( method = request.method().as_str(), @@ -113,6 +132,15 @@ async fn process( } } +fn unauthorized_log_and_response(request: Request) -> Response { + tracing::warn!( + method = request.method().as_str(), + path = request.uri().path(), + "Unauthorized request", + ); + unauthorized_response() +} + fn unauthorized_response() -> Response { ( StatusCode::UNAUTHORIZED, @@ -136,21 +164,36 @@ impl AuthMiddleware { #[derive(Clone)] pub struct Authorizer { account_service_state: Arc, + session_service_state: Arc, } impl Authorizer { /// Builds the factory-default authorizer for a mock BMC state. - pub fn new(account_service_state: Arc) -> Self { + pub fn new( + account_service_state: Arc, + session_service_state: Arc, + ) -> Self { Self { account_service_state, + session_service_state, } } fn authorize( &self, + request: &Request, authorization: Option<&TypedHeader>>, permit_factory_default: bool, ) -> AuthorizationResult { + if let Some(token) = request + .headers() + .get(X_AUTH_TOKEN_HEADER) + .and_then(|v| v.to_str().ok()) + && self.session_service_state.is_token_valid(token) + { + return AuthorizationResult::Authorized; + } + let Some(authorization) = authorization else { return AuthorizationResult::Unauthorized; }; diff --git a/crates/bmc-mock/src/bmc_state.rs b/crates/bmc-mock/src/bmc_state.rs index a08302511c..e35c7d86d4 100644 --- a/crates/bmc-mock/src/bmc_state.rs +++ b/crates/bmc-mock/src/bmc_state.rs @@ -22,6 +22,7 @@ use crate::redfish::account_service::AccountServiceState; use crate::redfish::chassis::ChassisState; use crate::redfish::computer_system::SystemState; use crate::redfish::manager::ManagerState; +use crate::redfish::session_service::SessionServiceState; use crate::redfish::update_service::UpdateServiceState; #[derive(Clone)] @@ -35,6 +36,7 @@ pub struct BmcState { pub chassis_state: Arc, pub update_service_state: Arc, pub account_service_state: Arc, + pub session_service_state: Arc, pub injection: Arc, pub callbacks: Option>, } diff --git a/crates/bmc-mock/src/mock_machine_router.rs b/crates/bmc-mock/src/mock_machine_router.rs index 2f3cab548b..caa485ab0f 100644 --- a/crates/bmc-mock/src/mock_machine_router.rs +++ b/crates/bmc-mock/src/mock_machine_router.rs @@ -80,6 +80,7 @@ pub fn machine_router( .add_routes(crate::redfish::update_service::add_routes) .add_routes(crate::redfish::task_service::add_routes) .add_routes(crate::redfish::account_service::add_routes) + .add_routes(crate::redfish::session_service::add_routes) .add_routes(|routes| crate::redfish::computer_system::add_routes(routes, bmc_vendor)) .add_routes(crate::ipmi::add_routes); let router = match &machine_info { @@ -101,6 +102,8 @@ pub fn machine_router( let account_service_state = Arc::new( crate::redfish::account_service::AccountServiceState::new(factory_default_account), ); + let session_service_state = + Arc::new(crate::redfish::session_service::SessionServiceState::new()); let injection = Arc::new(InjectionStore::new()); let state = BmcState { bmc_vendor, @@ -112,15 +115,20 @@ pub fn machine_router( chassis_state, update_service_state, account_service_state, + session_service_state, injection: injection.clone(), callbacks: Some(callbacks.clone()), }; let account_service_state = state.account_service_state.clone(); + let session_service_state = state.session_service_state.clone(); let router = ([ Box::new(redfish::expander_router::append), Box::new(move |router| { if redfish_auth { - auth_router::append(router, Authorizer::new(account_service_state)) + auth_router::append( + router, + Authorizer::new(account_service_state, session_service_state), + ) } else { router } diff --git a/crates/bmc-mock/src/redfish/mod.rs b/crates/bmc-mock/src/redfish/mod.rs index 42b81ba41c..70c19df62d 100644 --- a/crates/bmc-mock/src/redfish/mod.rs +++ b/crates/bmc-mock/src/redfish/mod.rs @@ -38,6 +38,7 @@ pub mod resource; pub mod secure_boot; pub mod sensor; pub mod service_root; +pub mod session_service; pub mod software_inventory; pub mod storage; pub mod task_service; diff --git a/crates/bmc-mock/src/redfish/service_root.rs b/crates/bmc-mock/src/redfish/service_root.rs index 7224187d5c..a2541de247 100644 --- a/crates/bmc-mock/src/redfish/service_root.rs +++ b/crates/bmc-mock/src/redfish/service_root.rs @@ -62,6 +62,7 @@ async fn get_service_root(State(state): State) -> Response { ) .maybe_with(ServiceRootBuilder::product, &state.bmc_product) .account_service(&redfish::account_service::resource()) + .session_service(&redfish::session_service::service_resource()) .chassis_collection(&redfish::chassis::collection()) .system_collection(&redfish::computer_system::collection()) .manager_collection(&redfish::manager::collection()) @@ -103,6 +104,10 @@ impl ServiceRootBuilder { self.apply_patch(v.nav_property("AccountService")) } + pub fn session_service(self, v: &redfish::Resource<'_>) -> Self { + self.apply_patch(v.nav_property("SessionService")) + } + pub fn chassis_collection(self, v: &redfish::Collection<'_>) -> Self { self.apply_patch(v.nav_property("Chassis")) } diff --git a/crates/bmc-mock/src/redfish/session_service.rs b/crates/bmc-mock/src/redfish/session_service.rs new file mode 100644 index 0000000000..9950af8e48 --- /dev/null +++ b/crates/bmc-mock/src/redfish/session_service.rs @@ -0,0 +1,290 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +use std::borrow::Cow; +use std::collections::HashMap; +use std::fmt::Display; +use std::sync::Mutex; + +use axum::extract::{Path, State}; +use axum::http::{HeaderName, HeaderValue, StatusCode}; +use axum::response::{IntoResponse, Response}; +use axum::routing::get; +use axum::{Json, Router}; +use axum_extra::TypedHeader; +use axum_extra::headers::Authorization; +use axum_extra::headers::authorization::Basic; +use rand::RngExt; +use serde_json::json; + +use crate::bmc_state::BmcState; +use crate::json::JsonExt; +use crate::{http, redfish}; + +const X_AUTH_TOKEN: HeaderName = HeaderName::from_static("x-auth-token"); + +pub fn service_resource() -> redfish::Resource<'static> { + redfish::Resource { + odata_id: Cow::Borrowed("/redfish/v1/SessionService"), + odata_type: Cow::Borrowed("#SessionService.v1_1_9.SessionService"), + id: Cow::Borrowed("SessionService"), + name: Cow::Borrowed("Session Service"), + } +} + +pub fn sessions_collection() -> redfish::Collection<'static> { + redfish::Collection { + odata_id: Cow::Borrowed("/redfish/v1/SessionService/Sessions"), + odata_type: Cow::Borrowed("#SessionCollection.SessionCollection"), + name: Cow::Borrowed("Session Collection"), + } +} + +pub fn session_resource(id: impl Display) -> redfish::Resource<'static> { + redfish::Resource { + odata_id: Cow::Owned(format!("/redfish/v1/SessionService/Sessions/{id}")), + odata_type: Cow::Borrowed("#Session.v1_7_0.Session"), + id: Cow::Owned(id.to_string()), + name: Cow::Borrowed("User Session"), + } +} + +pub fn add_routes(r: Router) -> Router { + r.route(&service_resource().odata_id, get(get_service)) + .route( + &sessions_collection().odata_id, + get(get_sessions).post(post_session), + ) + .route( + format!("{}/{{session_id}}", sessions_collection().odata_id).as_str(), + get(get_session).delete(delete_session), + ) +} + +#[derive(Clone, Debug)] +pub struct SessionRecord { + pub id: String, + pub username: String, + pub token: String, +} + +impl SessionRecord { + fn to_json(&self) -> serde_json::Value { + json!({ + "UserName": self.username, + "SessionType": "Redfish", + }) + .patch(session_resource(&self.id)) + } +} + +#[derive(Debug, Default)] +pub struct SessionServiceState { + next_id: Mutex, + sessions: Mutex>, +} + +impl SessionServiceState { + pub fn new() -> Self { + Self::default() + } + + pub fn is_token_valid(&self, token: &str) -> bool { + self.sessions + .lock() + .expect("mutex poisoned") + .contains_key(token) + } + + pub fn create(&self, username: impl Into) -> SessionRecord { + let id = { + let mut next_id = self.next_id.lock().expect("mutex poisoned"); + *next_id += 1; + next_id.to_string() + }; + let token = generate_token(); + let record = SessionRecord { + id, + username: username.into(), + token: token.clone(), + }; + self.sessions + .lock() + .expect("mutex poisoned") + .insert(token, record.clone()); + record + } + + pub fn list(&self) -> Vec { + self.sessions + .lock() + .expect("mutex poisoned") + .values() + .cloned() + .collect() + } + + pub fn find_by_id(&self, id: &str) -> Option { + self.sessions + .lock() + .expect("mutex poisoned") + .values() + .find(|rec| rec.id == id) + .cloned() + } + + pub fn delete_by_id(&self, id: &str) -> bool { + let mut sessions = self.sessions.lock().expect("mutex poisoned"); + let Some(token) = sessions + .iter() + .find_map(|(tok, rec)| (rec.id == id).then(|| tok.clone())) + else { + return false; + }; + sessions.remove(&token); + true + } +} + +fn generate_token() -> String { + let mut rng = rand::rng(); + let bytes: [u8; 16] = rng.random(); + bytes.iter().fold(String::with_capacity(32), |mut acc, b| { + use std::fmt::Write as _; + let _ = write!(acc, "{b:02x}"); + acc + }) +} + +async fn get_service() -> Response { + json!({ + "ServiceEnabled": true, + "SessionTimeout": 3600, + "Sessions": { + "@odata.id": sessions_collection().odata_id, + }, + }) + .patch(service_resource()) + .into_ok_response() +} + +async fn get_sessions(State(state): State) -> Response { + let members = state + .session_service_state + .list() + .iter() + .map(|rec| session_resource(&rec.id).entity_ref()) + .collect::>(); + sessions_collection() + .with_members(&members) + .into_ok_response() +} + +async fn get_session(State(state): State, Path(session_id): Path) -> Response { + state + .session_service_state + .find_by_id(&session_id) + .map(|rec| rec.to_json().into_ok_response()) + .unwrap_or_else(http::not_found) +} + +/// `POST /redfish/v1/SessionService/Sessions`. +async fn post_session( + State(state): State, + authorization: Option>>, + body: Option>, +) -> Response { + let body_creds = body.as_ref().and_then(|Json(value)| { + let username = value.get("UserName").and_then(serde_json::Value::as_str)?; + let password = value.get("Password").and_then(serde_json::Value::as_str)?; + Some((username.to_string(), password.to_string())) + }); + let basic_creds = authorization + .as_ref() + .map(|TypedHeader(Authorization(basic))| { + (basic.username().to_string(), basic.password().to_string()) + }); + + let Some((username, password)) = body_creds.or(basic_creds) else { + tracing::warn!("Session creation request without credentials"); + return StatusCode::BAD_REQUEST.into_response(); + }; + + if !state + .account_service_state + .is_authorized(&username, &password) + { + tracing::warn!(username, "Session creation rejected: bad credentials"); + return StatusCode::UNAUTHORIZED.into_response(); + } + + let record = state.session_service_state.create(username); + let location = session_resource(&record.id).odata_id.into_owned(); + let mut response = record + .to_json() + .into_response(StatusCode::CREATED) + .into_response(); + let headers = response.headers_mut(); + if let Ok(value) = HeaderValue::from_str(&location) { + headers.insert(axum::http::header::LOCATION, value); + } + if let Ok(value) = HeaderValue::from_str(&record.token) { + headers.insert(X_AUTH_TOKEN, value); + } + response +} + +async fn delete_session(State(state): State, Path(session_id): Path) -> Response { + if state.session_service_state.delete_by_id(&session_id) { + http::ok_no_content() + } else { + http::not_found() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn create_assigns_unique_ids_and_tokens() { + let state = SessionServiceState::new(); + let a = state.create("root"); + let b = state.create("root"); + assert_ne!(a.id, b.id); + assert_ne!(a.token, b.token); + assert!(state.is_token_valid(&a.token)); + assert!(state.is_token_valid(&b.token)); + } + + #[test] + fn delete_by_id_removes_token() { + let state = SessionServiceState::new(); + let rec = state.create("root"); + assert!(state.delete_by_id(&rec.id)); + assert!(!state.is_token_valid(&rec.token)); + assert!(!state.delete_by_id(&rec.id)); + } + + #[test] + fn list_returns_outstanding_sessions() { + let state = SessionServiceState::new(); + state.create("a"); + state.create("b"); + assert_eq!(state.list().len(), 2); + } +} diff --git a/crates/bmc-mock/src/test_support/axum_http_client.rs b/crates/bmc-mock/src/test_support/axum_http_client.rs index d21b1e28c3..79bec0bbc3 100644 --- a/crates/bmc-mock/src/test_support/axum_http_client.rs +++ b/crates/bmc-mock/src/test_support/axum_http_client.rs @@ -22,6 +22,7 @@ use axum::body::Body; use axum::http::{HeaderMap, Method, Request, StatusCode}; use http_body_util::BodyExt; use nv_redfish::bmc_http::{BmcCredentials, CacheableError, HttpClient}; +use nv_redfish::core::upload::{MultipartUpdateRequest, UploadReader}; use nv_redfish::core::{BoxTryStream, ModificationResponse, ODataETag, SessionCreateResponse}; use serde::Serialize; use serde::de::DeserializeOwned; @@ -206,7 +207,6 @@ impl HttpClient for AxumRouterHttpClient { &self, _: Url, _: &B, - _: &BmcCredentials, _: &HeaderMap, ) -> Result, Self::Error> where @@ -215,4 +215,19 @@ impl HttpClient for AxumRouterHttpClient { { Err(Error::NotSupported("POST for Session is not supported yet")) } + + async fn post_multipart_update( + &self, + _: Url, + _: MultipartUpdateRequest<'_, U, V>, + _: &BmcCredentials, + _: &HeaderMap, + ) -> Result, Self::Error> + where + U: UploadReader, + T: DeserializeOwned + Send + Sync, + V: Serialize + Send + Sync, + { + Err(Error::NotSupported("Multipart update is not supported yet")) + } } diff --git a/crates/health/Cargo.toml b/crates/health/Cargo.toml index f645e06221..583905cd73 100644 --- a/crates/health/Cargo.toml +++ b/crates/health/Cargo.toml @@ -27,6 +27,7 @@ name = "forge-hw-health" path = "src/main.rs" [dependencies] +arc-swap = { workspace = true } async-trait = { workspace = true } base64 = { workspace = true } chrono = { workspace = true } diff --git a/crates/health/src/api_client.rs b/crates/health/src/api_client.rs index 4ea2fe5607..5ffe158532 100644 --- a/crates/health/src/api_client.rs +++ b/crates/health/src/api_client.rs @@ -15,43 +15,165 @@ * limitations under the License. */ +use std::collections::{HashMap, HashSet}; use std::convert::TryFrom; use std::net::IpAddr; use std::str::FromStr; -use std::sync::{Arc, RwLock}; +use std::sync::{Arc, Mutex}; use carbide_uuid::rack::RackId; use carbide_uuid::switch::SwitchId; use forge_tls::client_config::ClientCert; use mac_address::MacAddress; +use nv_redfish::bmc_http::reqwest::Client as ReqwestClient; use rpc::forge::MachineSearchConfig; use rpc::forge_api_client::ForgeApiClient; use rpc::forge_tls_client::{ApiConfig, ForgeClientConfig}; use url::Url; use crate::HealthError; +use crate::bmc::{BmcClient, BoxFuture, CredentialProvider}; use crate::endpoint::{ - BmcAddr, BmcCredentials, BmcEndpoint, BoxFuture, CredentialProvider, EndpointMetadata, - EndpointSource, MachineData, PowerShelfData, SwitchData, SwitchEndpointRole, + BmcAddr, BmcCredentials, BmcEndpoint, EndpointMetadata, EndpointSource, MachineData, + PowerShelfData, SwitchData, SwitchEndpointRole, }; +/// [`ApiEndpointSource`]. #[derive(Clone)] pub struct ApiClientWrapper { client: ForgeApiClient, } -#[derive(Clone)] -struct ApiCredentialProvider { - client: ForgeApiClient, - kind: ApiCredentialKind, +impl ApiClientWrapper { + pub fn new(root_ca: String, client_cert: String, client_key: String, api_url: &Url) -> Self { + let client_config = ForgeClientConfig::new( + root_ca, + Some(ClientCert { + cert_path: client_cert, + key_path: client_key, + }), + ); + let api_config = ApiConfig::new(api_url.as_str(), &client_config); + let client = ForgeApiClient::new(&api_config); + + Self { client } + } + + pub async fn submit_health_report( + &self, + machine_id: &carbide_uuid::machine::MachineId, + report: health_report::HealthReport, + ) -> Result<(), HealthError> { + let ovrd = rpc::forge::HealthReportEntry { + report: Some(report.into()), + mode: rpc::forge::HealthReportApplyMode::Merge.into(), + }; + + let request = rpc::forge::InsertMachineHealthReportRequest { + machine_id: Some(*machine_id), + health_report_entry: Some(ovrd), + }; + + self.client + .insert_machine_health_report(request) + .await + .map_err(HealthError::ApiInvocationError)?; + + Ok(()) + } + + pub async fn submit_rack_health_report( + &self, + rack_id: &carbide_uuid::rack::RackId, + report: health_report::HealthReport, + ) -> Result<(), HealthError> { + let ovrd = rpc::forge::HealthReportEntry { + report: Some(report.into()), + mode: rpc::forge::HealthReportApplyMode::Merge.into(), + }; + + let request = rpc::forge::InsertRackHealthReportRequest { + rack_id: Some(rack_id.clone()), + health_report_entry: Some(ovrd), + }; + + self.client + .insert_rack_health_report(request) + .await + .map_err(HealthError::ApiInvocationError)?; + + Ok(()) + } + + pub async fn submit_switch_health_report( + &self, + switch_id: &carbide_uuid::switch::SwitchId, + report: health_report::HealthReport, + ) -> Result<(), HealthError> { + let ovrd = rpc::forge::HealthReportEntry { + report: Some(report.into()), + mode: rpc::forge::HealthReportApplyMode::Merge.into(), + }; + + let request = rpc::forge::InsertSwitchHealthReportRequest { + switch_id: Some(*switch_id), + health_report_entry: Some(ovrd), + }; + + self.client + .insert_switch_health_report(request) + .await + .map_err(HealthError::ApiInvocationError)?; + + Ok(()) + } + + pub async fn submit_power_shelf_health_report( + &self, + power_shelf_id: &carbide_uuid::power_shelf::PowerShelfId, + report: health_report::HealthReport, + ) -> Result<(), HealthError> { + let ovrd = rpc::forge::HealthReportEntry { + report: Some(report.into()), + mode: rpc::forge::HealthReportApplyMode::Merge.into(), + }; + + let request = rpc::forge::InsertPowerShelfHealthReportRequest { + power_shelf_id: Some(*power_shelf_id), + health_report_entry: Some(ovrd), + }; + + self.client + .insert_power_shelf_health_report(request) + .await + .map_err(HealthError::ApiInvocationError)?; + + Ok(()) + } } -#[derive(Clone)] +#[derive(Clone, Debug, PartialEq, Eq)] enum ApiCredentialKind { Bmc, SwitchNvosAdmin { switch_id: SwitchId }, } +impl ApiCredentialKind { + /// Short tag used in tracing fields. + fn tag(&self) -> &'static str { + match self { + ApiCredentialKind::Bmc => "bmc", + ApiCredentialKind::SwitchNvosAdmin { .. } => "switch-nvos-admin", + } + } +} + +#[derive(Clone)] +struct ApiCredentialProvider { + client: ForgeApiClient, + kind: ApiCredentialKind, +} + impl CredentialProvider for ApiCredentialProvider { fn fetch_credentials<'a>( &'a self, @@ -63,7 +185,6 @@ impl CredentialProvider for ApiCredentialProvider { let request = rpc::forge::GetBmcCredentialsRequest { mac_addr: endpoint.mac.to_string(), }; - self.client .get_bmc_credentials(request) .await @@ -73,7 +194,6 @@ impl CredentialProvider for ApiCredentialProvider { let request = rpc::forge::GetSwitchNvosCredentialsRequest { switch_id: Some(*switch_id), }; - self.client .get_switch_nvos_credentials(request) .await @@ -120,20 +240,33 @@ fn switch_endpoint_metadata( })) } -impl ApiClientWrapper { - pub fn new(root_ca: String, client_cert: String, client_key: String, api_url: &Url) -> Self { - let client_config = ForgeClientConfig::new( - root_ca, - Some(ClientCert { - cert_path: client_cert, - key_path: client_key, - }), - ); - let api_config = ApiConfig::new(api_url.as_str(), &client_config); +pub struct ApiEndpointSource { + api: Arc, + reqwest: ReqwestClient, + proxy_url: Option, + cache_size: usize, + bmc_client_cache: Mutex>, +} - let client = ForgeApiClient::new(&api_config); +struct CachedBmcClient { + client: Arc, + kind: ApiCredentialKind, +} - Self { client } +impl ApiEndpointSource { + pub fn new( + api: Arc, + reqwest: ReqwestClient, + proxy_url: Option, + cache_size: usize, + ) -> Self { + Self { + api, + reqwest, + proxy_url, + cache_size, + bmc_client_cache: Mutex::new(HashMap::new()), + } } pub async fn fetch_bmc_hosts(&self) -> Result>, HealthError> { @@ -141,13 +274,31 @@ impl ApiClientWrapper { endpoints.extend(self.fetch_power_shelf_endpoints().await); endpoints.extend(self.fetch_switch_endpoints().await); + self.prune_bmc_client_cache(&endpoints); + tracing::info!("Prepared total {} endpoints", endpoints.len()); Ok(endpoints) } + fn prune_bmc_client_cache(&self, live_endpoints: &[Arc]) { + let live_macs: HashSet = live_endpoints.iter().map(|ep| ep.addr.mac).collect(); + let mut cache = self.bmc_client_cache.lock().expect("cache mutex poisoned"); + let before = cache.len(); + cache.retain(|mac, _| live_macs.contains(mac)); + let removed = before - cache.len(); + if removed > 0 { + tracing::info!( + removed, + remaining = cache.len(), + "Pruned stale BmcClient cache entries" + ); + } + } + async fn fetch_machine_endpoints(&self) -> Result>, HealthError> { let machine_ids = self + .api .client .find_machine_ids(MachineSearchConfig { include_dpus: true, @@ -166,6 +317,7 @@ impl ApiClientWrapper { ..Default::default() }; let machines = self + .api .client .find_machines_by_ids(request) .await @@ -176,8 +328,8 @@ impl ApiClientWrapper { ); for machine in machines.machines { - match self.extract_machine_endpoint(&machine).await { - Ok(endpoint) => endpoints.push(Arc::new(endpoint)), + match self.extract_machine_endpoint(&machine) { + Ok(endpoint) => endpoints.push(endpoint), Err(error) => tracing::warn!( ?machine, ?error, @@ -196,13 +348,13 @@ impl ApiClientWrapper { switch_id: None, }; - match self.client.find_switches(switch_request).await { + match self.api.client.find_switches(switch_request).await { Ok(response) => { let mut endpoints = Vec::new(); for switch in response.switches { - match self.extract_switch_endpoint(&switch).await { - Ok(endpoint) => endpoints.push(Arc::new(endpoint)), + match self.extract_switch_endpoint(&switch) { + Ok(endpoint) => endpoints.push(endpoint), Err(error) => tracing::warn!( ?switch, ?error, @@ -210,16 +362,14 @@ impl ApiClientWrapper { ), } - match self.extract_switch_host_endpoint(&switch).await { - Ok(Some(endpoint)) => endpoints.push(Arc::new(endpoint)), + match self.extract_switch_host_endpoint(&switch) { + Ok(Some(endpoint)) => endpoints.push(endpoint), Ok(None) => {} - Err(error) => { - tracing::warn!( - ?switch, - ?error, - "Could not add switch host endpoint due to error" - ); - } + Err(error) => tracing::warn!( + ?switch, + ?error, + "Could not add switch host endpoint due to error" + ), } } @@ -239,13 +389,13 @@ impl ApiClientWrapper { power_shelf_id: None, }; - match self.client.find_power_shelves(request).await { + match self.api.client.find_power_shelves(request).await { Ok(response) => { let mut endpoints = Vec::new(); for power_shelf in response.power_shelves { - match self.extract_power_shelf_endpoint(&power_shelf).await { - Ok(endpoint) => endpoints.push(Arc::new(endpoint)), + match self.extract_power_shelf_endpoint(&power_shelf) { + Ok(endpoint) => endpoints.push(endpoint), Err(error) => tracing::warn!( ?power_shelf, ?error, @@ -264,10 +414,10 @@ impl ApiClientWrapper { } } - async fn extract_machine_endpoint( + fn extract_machine_endpoint( &self, machine: &rpc::forge::Machine, - ) -> Result { + ) -> Result, HealthError> { let Some(bmc_info) = &machine.bmc_info else { return Err(HealthError::GenericError( "Could not extract machine endpoint without BMC Info".to_string(), @@ -297,43 +447,38 @@ impl ApiClientWrapper { }) }); - self.endpoint_with_auth( + self.endpoint_for( addr, metadata, machine.rack_id.clone(), ApiCredentialKind::Bmc, ) - .await } - async fn extract_switch_endpoint( + fn extract_switch_endpoint( &self, switch: &rpc::forge::Switch, - ) -> Result { + ) -> Result, HealthError> { let Some(bmc_info) = &switch.bmc_info else { return Err(HealthError::GenericError( "Could not extract switch endpoint without BMC Info".to_string(), )); }; let addr = BmcAddr::try_from(bmc_info)?; + let metadata = switch_endpoint_metadata(switch, SwitchEndpointRole::Bmc, false)?; - self.endpoint_with_auth( + self.endpoint_for( addr, - Some(switch_endpoint_metadata( - switch, - SwitchEndpointRole::Bmc, - false, - )?), + Some(metadata), switch.rack_id.clone(), ApiCredentialKind::Bmc, ) - .await } - async fn extract_switch_host_endpoint( + fn extract_switch_host_endpoint( &self, switch: &rpc::forge::Switch, - ) -> Result, HealthError> { + ) -> Result>, HealthError> { let Some(nvos_info) = switch.nvos_info.as_ref() else { return Ok(None); }; @@ -341,25 +486,23 @@ impl ApiClientWrapper { HealthError::GenericError("switch host endpoint missing switch ID".to_string()) })?; let addr = BmcAddr::try_from(nvos_info)?; + let metadata = + switch_endpoint_metadata(switch, SwitchEndpointRole::Host, switch.is_primary)?; - self.endpoint_with_auth( + let endpoint = self.endpoint_for( addr, - Some(switch_endpoint_metadata( - switch, - SwitchEndpointRole::Host, - switch.is_primary, - )?), + Some(metadata), switch.rack_id.clone(), ApiCredentialKind::SwitchNvosAdmin { switch_id }, - ) - .await - .map(Some) + )?; + + Ok(Some(endpoint)) } - async fn extract_power_shelf_endpoint( + fn extract_power_shelf_endpoint( &self, power_shelf: &rpc::forge::PowerShelf, - ) -> Result { + ) -> Result, HealthError> { let Some(bmc_info) = &power_shelf.bmc_info else { return Err(HealthError::GenericError( "Could not extract power shelf endpoint without BMC Info".to_string(), @@ -374,7 +517,7 @@ impl ApiClientWrapper { "Power shelf endpoint does not have serial".to_string(), ))?; - self.endpoint_with_auth( + self.endpoint_for( addr, Some(EndpointMetadata::PowerShelf(PowerShelfData { id: power_shelf.id, @@ -383,126 +526,71 @@ impl ApiClientWrapper { None, ApiCredentialKind::Bmc, ) - .await } - async fn endpoint_with_auth( + fn endpoint_for( &self, addr: BmcAddr, metadata: Option, rack_id: Option, credential_kind: ApiCredentialKind, - ) -> Result { - let provider = ApiCredentialProvider { - client: self.client.clone(), - kind: credential_kind, + ) -> Result, HealthError> { + let bmc = { + let mut cache = self.bmc_client_cache.lock().expect("cache mutex poisoned"); + cache_or_create_bmc_client(&mut cache, addr.mac, credential_kind, |kind| { + let provider: Arc = Arc::new(ApiCredentialProvider { + client: self.api.client.clone(), + kind, + }); + Ok(Arc::new(BmcClient::new( + self.reqwest.clone(), + addr.clone(), + provider, + self.proxy_url.clone(), + self.cache_size, + )?)) + })? }; - - let credentials = provider.fetch_credentials(&addr).await?; - - Ok(BmcEndpoint { + Ok(Arc::new(BmcEndpoint { addr, - provider: Arc::new(provider), metadata, rack_id, - credentials: Arc::new(RwLock::new(credentials)), - }) + bmc, + })) } +} - pub async fn submit_health_report( - &self, - machine_id: &carbide_uuid::machine::MachineId, - report: health_report::HealthReport, - ) -> Result<(), HealthError> { - let ovrd = rpc::forge::HealthReportEntry { - report: Some(report.into()), - mode: rpc::forge::HealthReportApplyMode::Merge.into(), - }; - - let request = rpc::forge::InsertMachineHealthReportRequest { - machine_id: Some(*machine_id), - health_report_entry: Some(ovrd), - }; - - self.client - .insert_machine_health_report(request) - .await - .map_err(HealthError::ApiInvocationError)?; - - Ok(()) - } - - pub async fn submit_rack_health_report( - &self, - rack_id: &carbide_uuid::rack::RackId, - report: health_report::HealthReport, - ) -> Result<(), HealthError> { - let ovrd = rpc::forge::HealthReportEntry { - report: Some(report.into()), - mode: rpc::forge::HealthReportApplyMode::Merge.into(), - }; - - let request = rpc::forge::InsertRackHealthReportRequest { - rack_id: Some(rack_id.clone()), - health_report_entry: Some(ovrd), - }; - - self.client - .insert_rack_health_report(request) - .await - .map_err(HealthError::ApiInvocationError)?; - - Ok(()) - } - - pub async fn submit_switch_health_report( - &self, - switch_id: &carbide_uuid::switch::SwitchId, - report: health_report::HealthReport, - ) -> Result<(), HealthError> { - let ovrd = rpc::forge::HealthReportEntry { - report: Some(report.into()), - mode: rpc::forge::HealthReportApplyMode::Merge.into(), - }; - - let request = rpc::forge::InsertSwitchHealthReportRequest { - switch_id: Some(*switch_id), - health_report_entry: Some(ovrd), - }; - - self.client - .insert_switch_health_report(request) - .await - .map_err(HealthError::ApiInvocationError)?; - - Ok(()) +fn cache_or_create_bmc_client( + cache: &mut HashMap, + mac: MacAddress, + credential_kind: ApiCredentialKind, + make_client: impl FnOnce(ApiCredentialKind) -> Result, HealthError>, +) -> Result, HealthError> { + if let Some(existing) = cache.get(&mac) { + if existing.kind != credential_kind { + return Err(HealthError::GenericError(format!( + "MAC {mac} reported with conflicting credential kinds \ + (cached={}, requested={}); refusing to construct endpoint \ + so we don't rotate credentials with the wrong RPC", + existing.kind.tag(), + credential_kind.tag(), + ))); + } + return Ok(existing.client.clone()); } - pub async fn submit_power_shelf_health_report( - &self, - power_shelf_id: &carbide_uuid::power_shelf::PowerShelfId, - report: health_report::HealthReport, - ) -> Result<(), HealthError> { - let ovrd = rpc::forge::HealthReportEntry { - report: Some(report.into()), - mode: rpc::forge::HealthReportApplyMode::Merge.into(), - }; - - let request = rpc::forge::InsertPowerShelfHealthReportRequest { - power_shelf_id: Some(*power_shelf_id), - health_report_entry: Some(ovrd), - }; - - self.client - .insert_power_shelf_health_report(request) - .await - .map_err(HealthError::ApiInvocationError)?; - - Ok(()) - } + let client = make_client(credential_kind.clone())?; + cache.insert( + mac, + CachedBmcClient { + client: client.clone(), + kind: credential_kind, + }, + ); + Ok(client) } -impl EndpointSource for ApiClientWrapper { +impl EndpointSource for ApiEndpointSource { fn fetch_bmc_hosts<'a>(&'a self) -> BoxFuture<'a, Result>, HealthError>> { Box::pin(self.fetch_bmc_hosts()) } @@ -579,3 +667,127 @@ impl From for BmcCredentials { } } } + +#[cfg(test)] +mod tests { + use std::sync::atomic::{AtomicUsize, Ordering}; + + use carbide_uuid::switch::{SwitchId, SwitchIdSource, SwitchType}; + use nv_redfish::bmc_http::reqwest::ClientParams as ReqwestClientParams; + + use super::*; + use crate::bmc::FixedCredentialProvider; + + fn test_mac() -> MacAddress { + MacAddress::from_str("00:11:22:33:44:55").expect("valid mac") + } + + fn test_addr() -> BmcAddr { + BmcAddr { + ip: "10.0.0.1".parse().expect("valid ip"), + port: Some(443), + mac: test_mac(), + } + } + + fn reqwest() -> ReqwestClient { + ReqwestClient::with_params(ReqwestClientParams::new().accept_invalid_certs(true)) + .expect("reqwest client builds") + } + + fn test_switch_id() -> SwitchId { + SwitchId::new(SwitchIdSource::Tpm, [7u8; 32], SwitchType::NvLink) + } + + fn make_test_client(_kind: ApiCredentialKind) -> Result, HealthError> { + let provider = Arc::new(FixedCredentialProvider::new(BmcCredentials::SessionToken { + token: "t".to_string(), + })); + Ok(Arc::new(BmcClient::new( + reqwest(), + test_addr(), + provider, + None, + 10, + )?)) + } + + #[test] + fn cache_returns_existing_client_on_matching_kind() { + let mut cache: HashMap = HashMap::new(); + let factory_calls = AtomicUsize::new(0); + + let first = + cache_or_create_bmc_client(&mut cache, test_mac(), ApiCredentialKind::Bmc, |kind| { + factory_calls.fetch_add(1, Ordering::SeqCst); + make_test_client(kind) + }) + .expect("first insert"); + + let second = + cache_or_create_bmc_client(&mut cache, test_mac(), ApiCredentialKind::Bmc, |kind| { + factory_calls.fetch_add(1, Ordering::SeqCst); + make_test_client(kind) + }) + .expect("cache hit"); + + assert!( + Arc::ptr_eq(&first, &second), + "cache hit must reuse the same BmcClient Arc — otherwise every \ + iteration of discovery rebuilds the session and re-fetches creds" + ); + assert_eq!( + factory_calls.load(Ordering::SeqCst), + 1, + "factory must only be called on cache miss" + ); + } + + #[test] + fn cache_rejects_conflicting_credential_kinds() { + let mut cache: HashMap = HashMap::new(); + + cache_or_create_bmc_client( + &mut cache, + test_mac(), + ApiCredentialKind::Bmc, + make_test_client, + ) + .expect("first insert ok"); + + let err = cache_or_create_bmc_client( + &mut cache, + test_mac(), + ApiCredentialKind::SwitchNvosAdmin { + switch_id: test_switch_id(), + }, + |_| panic!("factory must not be invoked when the cache mismatch is detected"), + ) + .map(|_| ()) + .expect_err("mismatched credential kind must error out"); + + match err { + HealthError::GenericError(msg) => { + assert!( + msg.contains("conflicting credential kinds"), + "expected mismatch message, got: {msg}" + ); + assert!(msg.contains(ApiCredentialKind::Bmc.tag())); + assert!( + msg.contains( + ApiCredentialKind::SwitchNvosAdmin { + switch_id: test_switch_id() + } + .tag() + ) + ); + } + other => panic!("unexpected error variant: {other:?}"), + } + + let still_cached = cache + .get(&test_mac()) + .expect("cached entry not removed by failed registration"); + assert_eq!(still_cached.kind, ApiCredentialKind::Bmc); + } +} diff --git a/crates/health/src/bmc.rs b/crates/health/src/bmc.rs index bf19e4e75d..9cef572a76 100644 --- a/crates/health/src/bmc.rs +++ b/crates/health/src/bmc.rs @@ -15,39 +15,126 @@ * limitations under the License. */ +use std::pin::Pin; use std::sync::Arc; use std::sync::atomic::{AtomicU64, Ordering}; +use std::time::Duration; use futures::TryStreamExt; -use nv_redfish::bmc_http::HttpBmc; +use http::HeaderMap; +use http::header::{self, InvalidHeaderValue}; use nv_redfish::bmc_http::reqwest::{BmcError, Client as ReqwestClient}; +use nv_redfish::bmc_http::{CacheSettings, HttpBmc}; use nv_redfish::core::query::{ExpandQuery, FilterQuery}; +use nv_redfish::core::upload::{MultipartUpdateRequest, UploadReader}; use nv_redfish::core::{ Action, Bmc, BoxTryStream, EntityTypeRef, Expandable, ModificationResponse, ODataETag, ODataId, SessionCreateResponse, }; use serde::{Deserialize, Serialize}; -use tokio::sync::Mutex; +use tokio::sync::{Mutex, OnceCell}; +use url::Url; use crate::HealthError; -use crate::endpoint::BmcEndpoint; +use crate::endpoint::{BmcAddr, BmcCredentials}; -/// BMC client wrapper that refreshes endpoint credentials after auth failures. -pub struct AuthRefreshingBmc { +pub(crate) const CREDENTIAL_REFRESH_TIMEOUT: Duration = Duration::from_secs(30); + +pub type BoxFuture<'a, T> = Pin + Send + 'a>>; + +pub trait CredentialProvider: Send + Sync { + fn fetch_credentials<'a>( + &'a self, + endpoint: &'a BmcAddr, + ) -> BoxFuture<'a, Result>; +} + +#[derive(Clone)] +pub struct FixedCredentialProvider { + credentials: BmcCredentials, +} + +impl FixedCredentialProvider { + pub fn new(credentials: BmcCredentials) -> Self { + Self { credentials } + } +} + +impl CredentialProvider for FixedCredentialProvider { + fn fetch_credentials<'a>( + &'a self, + _endpoint: &'a BmcAddr, + ) -> BoxFuture<'a, Result> { + let credentials = self.credentials.clone(); + Box::pin(async move { Ok(credentials) }) + } +} + +pub struct BmcClient { inner: HttpBmc, - endpoint: Arc, + addr: BmcAddr, + provider: Arc, credential_generation: AtomicU64, + init: OnceCell<()>, refresh_lock: Mutex<()>, } -impl AuthRefreshingBmc { - pub(crate) fn new(inner: HttpBmc, endpoint: Arc) -> Self { - Self { +impl BmcClient { + pub fn new( + reqwest: ReqwestClient, + addr: BmcAddr, + provider: Arc, + proxy_url: Option, + cache_size: usize, + ) -> Result { + let bmc_url = bmc_url(&addr, proxy_url.as_ref())?; + let headers = bmc_headers(&addr, proxy_url.as_ref())?; + + // Currently nv-redfish BMC, requires credentials, so this placeholder is sued + // they will be replaced as soon as we call ensure_credentials + let placeholder = + nv_redfish::bmc_http::BmcCredentials::username_password(String::new(), None::); + let inner = HttpBmc::with_custom_headers( + reqwest, + bmc_url, + placeholder, + CacheSettings::with_capacity(cache_size), + headers, + ); + Ok(Self { inner, - endpoint, + addr, + provider, credential_generation: AtomicU64::new(0), + init: OnceCell::new(), refresh_lock: Mutex::new(()), - } + }) + } + + pub async fn ensure_credentials(&self) -> Result<(), HealthError> { + self.init + .get_or_try_init(|| async { + let credentials = tokio::time::timeout( + CREDENTIAL_REFRESH_TIMEOUT, + self.provider.fetch_credentials(&self.addr), + ) + .await + .map_err(|_elapsed| { + HealthError::GenericError(format!( + "Timed out after {}s fetching initial BMC credentials", + CREDENTIAL_REFRESH_TIMEOUT.as_secs(), + )) + })??; + self.inner.set_credentials(credentials.into()); + self.credential_generation.fetch_add(1, Ordering::AcqRel); + Ok::<_, HealthError>(()) + }) + .await?; + Ok(()) + } + + pub fn credential_provider(&self) -> Arc { + self.provider.clone() } async fn refresh_credentials( @@ -64,11 +151,22 @@ impl AuthRefreshingBmc { tracing::warn!( error = ?error, - endpoint = ?self.endpoint.addr, + endpoint = ?self.addr, "Authentication failed, refreshing BMC credentials" ); - let credentials = self.endpoint.refresh().await.map_err(|refresh_error| { + let credentials = tokio::time::timeout( + CREDENTIAL_REFRESH_TIMEOUT, + self.provider.fetch_credentials(&self.addr), + ) + .await + .map_err(|_elapsed| { + HealthError::GenericError(format!( + "Timed out after {}s refreshing BMC credentials following auth error {error}", + CREDENTIAL_REFRESH_TIMEOUT.as_secs(), + )) + })? + .map_err(|refresh_error| { HealthError::GenericError(format!( "Failed to refresh credentials after auth error {error}: {refresh_error}" )) @@ -91,7 +189,7 @@ impl AuthRefreshingBmc { tracing::error!( error = ?refresh_error, original_error = ?error, - endpoint = ?self.endpoint.addr, + endpoint = ?self.addr, "Failed to refresh BMC credentials after authentication error" ); } @@ -100,7 +198,29 @@ impl AuthRefreshingBmc { } } -impl Bmc for AuthRefreshingBmc { +fn bmc_url(addr: &BmcAddr, proxy_url: Option<&Url>) -> Result { + match proxy_url { + Some(url) => Ok(url.clone()), + None => addr + .to_url() + .map_err(|e| HealthError::GenericError(e.to_string())), + } +} + +fn bmc_headers(addr: &BmcAddr, proxy_url: Option<&Url>) -> Result { + let mut headers = HeaderMap::new(); + if proxy_url.is_some() { + headers.insert( + header::FORWARDED, + format!("host={}", addr.ip) + .parse() + .map_err(|e: InvalidHeaderValue| HealthError::GenericError(e.to_string()))?, + ); + } + Ok(headers) +} + +impl Bmc for BmcClient { type Error = HealthError; async fn expand( @@ -108,6 +228,7 @@ impl Bmc for AuthRefreshingBmc { id: &ODataId, query: ExpandQuery, ) -> Result, Self::Error> { + self.ensure_credentials().await?; let credential_generation = self.credential_generation.load(Ordering::Acquire); match self .inner @@ -126,6 +247,7 @@ impl Bmc for AuthRefreshingBmc { &self, id: &ODataId, ) -> Result, Self::Error> { + self.ensure_credentials().await?; let credential_generation = self.credential_generation.load(Ordering::Acquire); match self.inner.get(id).await.map_err(HealthError::from) { Ok(value) => Ok(value), @@ -140,6 +262,7 @@ impl Bmc for AuthRefreshingBmc { id: &ODataId, query: FilterQuery, ) -> Result, Self::Error> { + self.ensure_credentials().await?; let credential_generation = self.credential_generation.load(Ordering::Acquire); match self .inner @@ -159,6 +282,7 @@ impl Bmc for AuthRefreshingBmc { id: &ODataId, query: &V, ) -> Result, Self::Error> { + self.ensure_credentials().await?; self.inner .create(id, query) .await @@ -174,16 +298,35 @@ impl Bmc for AuthRefreshingBmc { etag: Option<&ODataETag>, update: &V, ) -> Result, Self::Error> { + self.ensure_credentials().await?; self.inner .update(id, etag, update) .await .map_err(HealthError::from) } + async fn multipart_update( + &self, + uri: &str, + request: MultipartUpdateRequest<'_, U, V>, + ) -> Result, Self::Error> + where + U: UploadReader, + R: Send + Sync + for<'de> Deserialize<'de>, + V: Send + Sync + Serialize, + { + self.ensure_credentials().await?; + self.inner + .multipart_update(uri, request) + .await + .map_err(HealthError::from) + } + async fn delete Deserialize<'de>>( &self, id: &ODataId, ) -> Result, Self::Error> { + self.ensure_credentials().await?; self.inner.delete(id).await.map_err(HealthError::from) } @@ -195,6 +338,7 @@ impl Bmc for AuthRefreshingBmc { action: &Action, params: &T, ) -> Result, Self::Error> { + self.ensure_credentials().await?; self.inner .action(action, params) .await @@ -205,6 +349,7 @@ impl Bmc for AuthRefreshingBmc { &self, uri: &str, ) -> Result, Self::Error> { + self.ensure_credentials().await?; let credential_generation = self.credential_generation.load(Ordering::Acquire); match self.inner.stream(uri).await.map_err(HealthError::from) { Ok(stream) => Ok(Box::pin(stream.map_err(HealthError::from))), @@ -222,6 +367,7 @@ impl Bmc for AuthRefreshingBmc { id: &ODataId, query: &V, ) -> Result, Self::Error> { + self.ensure_credentials().await?; self.inner .create_session(id, query) .await @@ -258,9 +404,67 @@ fn is_auth_bmc_error(error: &BmcError) -> bool { #[cfg(test)] mod tests { - use url::Url; + use std::str::FromStr; + use std::sync::atomic::{AtomicUsize, Ordering as AtomicOrdering}; + use std::sync::{Arc, Mutex as StdMutex}; + use std::time::Duration; + + use mac_address::MacAddress; + use nv_redfish::bmc_http::reqwest::ClientParams as ReqwestClientParams; use super::*; + use crate::endpoint::BmcAddr; + + struct CountingProvider { + calls: Arc, + delay: Option, + credentials: BmcCredentials, + } + + impl CountingProvider { + fn new( + credentials: BmcCredentials, + delay: Option, + ) -> (Arc, Arc) { + let calls = Arc::new(AtomicUsize::new(0)); + let provider = Arc::new(Self { + calls: calls.clone(), + delay, + credentials, + }); + (provider, calls) + } + } + + impl CredentialProvider for CountingProvider { + fn fetch_credentials<'a>( + &'a self, + _endpoint: &'a BmcAddr, + ) -> BoxFuture<'a, Result> { + let delay = self.delay; + let credentials = self.credentials.clone(); + self.calls.fetch_add(1, AtomicOrdering::SeqCst); + Box::pin(async move { + if let Some(d) = delay { + tokio::time::sleep(d).await; + } + Ok(credentials) + }) + } + } + + fn test_addr() -> BmcAddr { + BmcAddr { + ip: "10.0.0.1".parse().unwrap(), + port: Some(443), + mac: MacAddress::from_str("00:11:22:33:44:55").unwrap(), + } + } + + fn reqwest() -> ReqwestClient { + ReqwestClient::with_params(ReqwestClientParams::new().accept_invalid_certs(true)) + .expect("reqwest client builds") + } fn bmc_status_error(status: http::StatusCode) -> BmcError { BmcError::InvalidResponse { @@ -295,4 +499,257 @@ mod tests { "request failed with HTTP 404".to_string(), ))); } + + #[tokio::test] + async fn new_does_not_fetch_credentials_eagerly() { + let (provider, calls) = CountingProvider::new( + BmcCredentials::UsernamePassword { + username: "u".to_string(), + password: Some("p".to_string()), + }, + None, + ); + let client = BmcClient::new(reqwest(), test_addr(), provider, None, 10) + .expect("constructor succeeds"); + + assert_eq!( + calls.load(AtomicOrdering::SeqCst), + 0, + "construction must not call the credential provider" + ); + assert_eq!( + client.credential_generation.load(Ordering::Acquire), + 0, + "generation stays 0 until first successful fetch" + ); + } + + #[tokio::test] + async fn ensure_credentials_calls_provider_exactly_once_under_concurrency() { + let (provider, calls) = CountingProvider::new( + BmcCredentials::SessionToken { + token: "t".to_string(), + }, + Some(Duration::from_millis(50)), + ); + let client = + Arc::new(BmcClient::new(reqwest(), test_addr(), provider, None, 10).expect("ok")); + + let mut handles = Vec::new(); + for _ in 0..16 { + let client = client.clone(); + handles.push(tokio::spawn( + async move { client.ensure_credentials().await }, + )); + } + for h in handles { + h.await.expect("task").expect("ensure ok"); + } + + assert_eq!(calls.load(AtomicOrdering::SeqCst), 1); + assert_eq!(client.credential_generation.load(Ordering::Acquire), 1); + } + + #[tokio::test] + async fn ensure_credentials_retries_after_failed_fetch() { + struct FlakyProvider { + attempts: AtomicUsize, + } + + impl CredentialProvider for FlakyProvider { + fn fetch_credentials<'a>( + &'a self, + _endpoint: &'a BmcAddr, + ) -> BoxFuture<'a, Result> { + let attempt = self.attempts.fetch_add(1, AtomicOrdering::SeqCst); + Box::pin(async move { + if attempt == 0 { + Err(HealthError::GenericError("transient".to_string())) + } else { + Ok(BmcCredentials::SessionToken { + token: "t".to_string(), + }) + } + }) + } + } + + let provider = Arc::new(FlakyProvider { + attempts: AtomicUsize::new(0), + }); + let client = BmcClient::new(reqwest(), test_addr(), provider.clone(), None, 10) + .expect("constructor succeeds"); + + assert!(client.ensure_credentials().await.is_err()); + assert_eq!(client.credential_generation.load(Ordering::Acquire), 0); + assert!(client.ensure_credentials().await.is_ok()); + assert_eq!(client.credential_generation.load(Ordering::Acquire), 1); + assert_eq!(provider.attempts.load(AtomicOrdering::SeqCst), 2); + } + + #[tokio::test] + async fn concurrent_refresh_collapses_to_a_single_provider_call() { + let (provider, calls) = CountingProvider::new( + BmcCredentials::SessionToken { + token: "t".to_string(), + }, + Some(Duration::from_millis(50)), + ); + let client = + Arc::new(BmcClient::new(reqwest(), test_addr(), provider, None, 10).expect("ok")); + client.ensure_credentials().await.expect("init ok"); + assert_eq!(calls.load(AtomicOrdering::SeqCst), 1); + + let observed = client.credential_generation.load(Ordering::Acquire); + let mut handles = Vec::new(); + for _ in 0..8 { + let client = client.clone(); + handles.push(tokio::spawn(async move { + client + .refresh_credentials( + &HealthError::HttpError("HTTP 401".to_string()), + Some(observed), + ) + .await + })); + } + for h in handles { + h.await.expect("task").expect("refresh ok"); + } + + // One init fetch + exactly one refresh fetch. + assert_eq!(calls.load(AtomicOrdering::SeqCst), 2); + assert_eq!(client.credential_generation.load(Ordering::Acquire), 2); + } + + #[tokio::test] + async fn refresh_consumes_provider_and_bumps_generation() { + struct SequenceProvider { + tokens: StdMutex>, + handed_out: StdMutex>, + calls: Arc, + } + + impl CredentialProvider for SequenceProvider { + fn fetch_credentials<'a>( + &'a self, + _endpoint: &'a BmcAddr, + ) -> BoxFuture<'a, Result> { + self.calls.fetch_add(1, AtomicOrdering::SeqCst); + let token = self + .tokens + .lock() + .unwrap() + .pop() + .expect("token sequence exhausted"); + self.handed_out.lock().unwrap().push(token); + Box::pin(async move { + Ok(BmcCredentials::SessionToken { + token: token.to_string(), + }) + }) + } + } + + let calls = Arc::new(AtomicUsize::new(0)); + let provider = Arc::new(SequenceProvider { + tokens: StdMutex::new(vec!["second", "first"]), + handed_out: StdMutex::new(Vec::new()), + calls: calls.clone(), + }); + let client = BmcClient::new(reqwest(), test_addr(), provider.clone(), None, 10) + .expect("constructor ok"); + + client.ensure_credentials().await.expect("init ok"); + assert_eq!(client.credential_generation.load(Ordering::Acquire), 1); + + client + .refresh_credentials(&HealthError::HttpError("HTTP 401".to_string()), None) + .await + .expect("refresh ok"); + + assert_eq!(client.credential_generation.load(Ordering::Acquire), 2); + assert_eq!(calls.load(AtomicOrdering::SeqCst), 2); + assert_eq!( + provider.handed_out.lock().unwrap().as_slice(), + &["first", "second"], + "init must consume the first token, refresh the second" + ); + } + + #[tokio::test(start_paused = true)] + async fn refresh_credentials_respects_timeout() { + struct HangingProvider; + + impl CredentialProvider for HangingProvider { + fn fetch_credentials<'a>( + &'a self, + _endpoint: &'a BmcAddr, + ) -> BoxFuture<'a, Result> { + Box::pin(std::future::pending()) + } + } + + let client = Arc::new( + BmcClient::new(reqwest(), test_addr(), Arc::new(HangingProvider), None, 10) + .expect("constructor ok"), + ); + let refresh_client = client.clone(); + let refresh = tokio::spawn(async move { + refresh_client + .refresh_credentials(&HealthError::HttpError("HTTP 401".to_string()), None) + .await + }); + + // Sleep just past the timeout so the timer fires; tokio's paused + // clock auto-advances via tokio::time::advance. + tokio::time::advance(CREDENTIAL_REFRESH_TIMEOUT + Duration::from_secs(1)).await; + let result = refresh.await.expect("task joined"); + assert!(result.is_err(), "hanging provider must surface as timeout"); + } + + #[tokio::test(start_paused = true)] + async fn ensure_credentials_respects_timeout() { + struct HangingProvider; + + impl CredentialProvider for HangingProvider { + fn fetch_credentials<'a>( + &'a self, + _endpoint: &'a BmcAddr, + ) -> BoxFuture<'a, Result> { + Box::pin(std::future::pending()) + } + } + + let client = Arc::new( + BmcClient::new(reqwest(), test_addr(), Arc::new(HangingProvider), None, 10) + .expect("constructor ok"), + ); + let ensure_client = client.clone(); + let ensure = tokio::spawn(async move { ensure_client.ensure_credentials().await }); + + tokio::time::advance(CREDENTIAL_REFRESH_TIMEOUT + Duration::from_secs(1)).await; + let result = ensure.await.expect("task joined"); + let error = result.expect_err("hanging provider must surface as timeout"); + match error { + HealthError::GenericError(msg) => assert!( + msg.contains("Timed out") && msg.contains("initial BMC credentials"), + "expected timeout message, got: {msg}" + ), + other => panic!("unexpected error variant: {other:?}"), + } + + // OnceCell must not have latched the failure — a subsequent call + // with a working provider has to be able to succeed. + let (recovery_provider, recovery_calls) = CountingProvider::new( + BmcCredentials::SessionToken { + token: "t".to_string(), + }, + None, + ); + let recovered = BmcClient::new(reqwest(), test_addr(), recovery_provider, None, 10) + .expect("constructor ok"); + recovered.ensure_credentials().await.expect("recovery ok"); + assert_eq!(recovery_calls.load(AtomicOrdering::SeqCst), 1); + } } diff --git a/crates/health/src/collectors/nvue/rest/client.rs b/crates/health/src/collectors/nvue/rest/client.rs index 5076f5abe0..08427b4ccf 100644 --- a/crates/health/src/collectors/nvue/rest/client.rs +++ b/crates/health/src/collectors/nvue/rest/client.rs @@ -16,8 +16,10 @@ */ use std::collections::HashMap; +use std::sync::Arc; use std::time::Duration; +use arc_swap::ArcSwapOption; use reqwest::Client; use reqwest::header::ACCEPT; use serde::Deserialize; @@ -31,12 +33,25 @@ const NVUE_CLUSTER_APPS: &str = "/nvue_v1/cluster/apps"; const NVUE_SDN_PARTITIONS: &str = "/nvue_v1/sdn/partition"; const NVUE_INTERFACES: &str = "/nvue_v1/interface"; -/// Client for NVUE REST API on NVUE-managed switches. +#[derive(Clone)] +pub struct UsernamePassword { + pub username: String, + pub password: Option, +} + +impl std::fmt::Debug for UsernamePassword { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("UsernamePassword") + .field("username", &self.username) + .field("password", &self.password.as_ref().map(|_| "")) + .finish() + } +} + pub struct RestClient { pub(crate) switch_id: String, base_url: Url, - username: Option, - password: Option, + credentials: ArcSwapOption, paths: NvueRestPaths, client: Client, } @@ -45,8 +60,6 @@ impl RestClient { pub fn new( switch_id: String, host: &str, - username: Option, - password: Option, request_timeout: Duration, self_signed_tls: bool, paths: NvueRestPaths, @@ -69,13 +82,24 @@ impl RestClient { Ok(Self { switch_id, base_url, - username, - password, + credentials: ArcSwapOption::empty(), paths, client, }) } + pub fn set_credentials(&self, creds: UsernamePassword) { + self.credentials.store(Some(Arc::new(creds))); + } + + pub fn clear_credentials(&self) { + self.credentials.store(None); + } + + pub fn has_credentials(&self) -> bool { + self.credentials.load().is_some() + } + pub async fn get_system_health(&self) -> Result, HealthError> { if !self.paths.system_health_enabled { return Ok(None); @@ -161,8 +185,8 @@ impl RestClient { request = request.query(extra_query); } - if let Some(user) = &self.username { - request = request.basic_auth(user, self.password.as_ref()); + if let Some(creds) = self.credentials.load_full() { + request = request.basic_auth(&creds.username, creds.password.as_ref()); } request = request.header(ACCEPT, "application/json"); diff --git a/crates/health/src/collectors/nvue/rest/collector.rs b/crates/health/src/collectors/nvue/rest/collector.rs index 3f76808179..efb4e2a243 100644 --- a/crates/health/src/collectors/nvue/rest/collector.rs +++ b/crates/health/src/collectors/nvue/rest/collector.rs @@ -18,13 +18,12 @@ use std::borrow::Cow; use std::sync::Arc; -use nv_redfish::Bmc; - -use super::client::RestClient; +use super::client::{RestClient, UsernamePassword}; use crate::HealthError; +use crate::bmc::{CREDENTIAL_REFRESH_TIMEOUT, CredentialProvider, is_auth_error}; use crate::collectors::{IterationResult, PeriodicCollector}; use crate::config::NvueRestConfig; -use crate::endpoint::{BmcCredentials, BmcEndpoint, EndpointMetadata}; +use crate::endpoint::{BmcAddr, BmcCredentials, BmcEndpoint, EndpointMetadata}; use crate::sink::{CollectorEvent, DataSink, EventContext, SensorHealthData}; const COLLECTOR_NAME: &str = "nvue_rest"; @@ -66,6 +65,7 @@ fn diagnostic_opcode_to_f64(code: &str) -> f64 { pub struct NvueRestCollectorConfig { pub rest_config: NvueRestConfig, pub data_sink: Option>, + pub credential_provider: Arc, } pub struct NvueRestCollector { @@ -73,22 +73,18 @@ pub struct NvueRestCollector { switch_id: String, event_context: EventContext, data_sink: Option>, + addr: BmcAddr, + provider: Arc, } -impl PeriodicCollector for NvueRestCollector { +impl PeriodicCollector for NvueRestCollector { type Config = NvueRestCollectorConfig; fn new_runner( - _bmc: Arc, + _bmc: Arc, endpoint: Arc, config: Self::Config, ) -> Result { - let BmcCredentials::UsernamePassword { username, password } = endpoint.credentials() else { - return Err(HealthError::GenericError( - "NVUE REST collector requires cached credentials at startup".to_string(), - )); - }; - let switch_id = match &endpoint.metadata { Some(EndpointMetadata::Switch(s)) => s.serial.clone(), _ => endpoint.addr.mac.to_string(), @@ -101,8 +97,6 @@ impl PeriodicCollector for NvueRestCollector { let client = RestClient::new( switch_id.clone(), &switch_ip, - Some(username), - password, rest_cfg.request_timeout, true, rest_cfg.paths.clone(), @@ -113,13 +107,31 @@ impl PeriodicCollector for NvueRestCollector { switch_id, event_context, data_sink: config.data_sink, + addr: endpoint.addr.clone(), + provider: config.credential_provider, }) } async fn run_iteration(&mut self) -> Result { + if !self.client.has_credentials() + && let Err(error) = self.refresh_rest_credentials().await + { + tracing::warn!( + ?error, + switch_id = %self.switch_id, + "nvue_rest: skipping iteration — credential fetch failed" + ); + return Ok(IterationResult { + refresh_triggered: false, + entity_count: Some(0), + fetch_failures: 1, + }); + } + self.emit_event(CollectorEvent::MetricCollectionStart); let mut entity_count = 0usize; let mut fetch_failures = 0usize; + let mut saw_auth_failure = false; match self.client.get_system_health().await { Ok(Some(health)) => { @@ -130,6 +142,7 @@ impl PeriodicCollector for NvueRestCollector { Ok(None) => {} Err(e) => { fetch_failures += 1; + saw_auth_failure |= is_auth_error(&e); tracing::warn!( error = ?e, switch_id = %self.switch_id, @@ -155,6 +168,7 @@ impl PeriodicCollector for NvueRestCollector { Ok(None) => {} Err(e) => { fetch_failures += 1; + saw_auth_failure |= is_auth_error(&e); tracing::warn!( error = ?e, switch_id = %self.switch_id, @@ -194,6 +208,7 @@ impl PeriodicCollector for NvueRestCollector { Ok(None) => {} Err(e) => { fetch_failures += 1; + saw_auth_failure |= is_auth_error(&e); tracing::warn!( error = ?e, switch_id = %self.switch_id, @@ -222,6 +237,7 @@ impl PeriodicCollector for NvueRestCollector { } Err(e) => { fetch_failures += 1; + saw_auth_failure |= is_auth_error(&e); tracing::warn!( error = ?e, switch_id = %self.switch_id, @@ -230,6 +246,14 @@ impl PeriodicCollector for NvueRestCollector { } } + if saw_auth_failure { + tracing::warn!( + switch_id = %self.switch_id, + "nvue_rest: auth failure observed, clearing cached credentials" + ); + self.client.clear_credentials(); + } + self.emit_event(CollectorEvent::MetricCollectionEnd); tracing::debug!( @@ -255,6 +279,30 @@ impl PeriodicCollector for NvueRestCollector { } impl NvueRestCollector { + async fn refresh_rest_credentials(&self) -> Result<(), HealthError> { + let creds = tokio::time::timeout( + CREDENTIAL_REFRESH_TIMEOUT, + self.provider.fetch_credentials(&self.addr), + ) + .await + .map_err(|_elapsed| { + HealthError::GenericError(format!( + "Timed out after {}s fetching NVUE REST credentials", + CREDENTIAL_REFRESH_TIMEOUT.as_secs(), + )) + })??; + match creds { + BmcCredentials::UsernamePassword { username, password } => { + self.client + .set_credentials(UsernamePassword { username, password }); + Ok(()) + } + _ => Err(HealthError::GenericError( + "NVUE REST collector requires username/password credentials".to_string(), + )), + } + } + fn emit_event(&self, event: CollectorEvent) { if let Some(data_sink) = &self.data_sink { data_sink.handle_event(&self.event_context, &event); @@ -297,7 +345,17 @@ impl NvueRestCollector { #[cfg(test)] mod tests { + use std::net::{IpAddr, Ipv4Addr}; + use std::str::FromStr; + use std::sync::Mutex as StdMutex; + use std::sync::atomic::{AtomicUsize, Ordering}; + use std::time::Duration; + + use mac_address::MacAddress; + use super::*; + use crate::bmc::BoxFuture; + use crate::config::NvueRestPaths; #[test] fn test_system_health_mapping() { @@ -332,4 +390,232 @@ mod tests { assert_eq!(diagnostic_opcode_to_f64("1024"), 1.0); assert_eq!(diagnostic_opcode_to_f64("57"), 1.0); } + + struct ScriptedProvider { + calls: AtomicUsize, + // Each call pops the front of this queue; an empty queue yields an + // error. `HealthError` is not `Clone`, so we store and consume by + // value rather than indexing + `.cloned()`. + responses: StdMutex>>, + } + + impl ScriptedProvider { + fn new(responses: Vec>) -> Arc { + Arc::new(Self { + calls: AtomicUsize::new(0), + responses: StdMutex::new(responses.into_iter().collect()), + }) + } + } + + impl CredentialProvider for ScriptedProvider { + fn fetch_credentials<'a>( + &'a self, + _endpoint: &'a BmcAddr, + ) -> BoxFuture<'a, Result> { + self.calls.fetch_add(1, Ordering::SeqCst); + let response = self + .responses + .lock() + .unwrap() + .pop_front() + .unwrap_or_else(|| { + Err(HealthError::GenericError( + "scripted provider exhausted".to_string(), + )) + }); + Box::pin(async move { response }) + } + } + + fn test_addr() -> BmcAddr { + BmcAddr { + ip: IpAddr::V4(Ipv4Addr::new(127, 0, 0, 1)), + port: Some(443), + mac: MacAddress::from_str("aa:bb:cc:dd:ee:ff").unwrap(), + } + } + + fn paths_all_disabled() -> NvueRestPaths { + NvueRestPaths { + system_health_enabled: false, + cluster_apps_enabled: false, + sdn_partitions_enabled: false, + interfaces_enabled: false, + } + } + + fn collector_with_provider(provider: Arc) -> NvueRestCollector { + let addr = test_addr(); + let client = RestClient::new( + "test-switch".to_string(), + &addr.ip.to_string(), + Duration::from_millis(10), + true, + paths_all_disabled(), + ) + .expect("rest client builds"); + + let event_context = EventContext { + endpoint_key: "test-switch".to_string(), + addr: addr.clone(), + collector_type: COLLECTOR_NAME, + metadata: None, + rack_id: None, + }; + + NvueRestCollector { + client, + switch_id: "test-switch".to_string(), + event_context, + data_sink: None, + addr, + provider, + } + } + + #[tokio::test] + async fn first_iteration_lazy_fetches_credentials_then_runs() { + let provider = ScriptedProvider::new(vec![Ok(BmcCredentials::UsernamePassword { + username: "admin".to_string(), + password: Some("hunter2".to_string()), + })]); + let mut collector = collector_with_provider(provider.clone()); + + assert!( + !collector.client.has_credentials(), + "client must start credential-less so sharded-out endpoints never trigger a fetch" + ); + + let result = collector + .run_iteration() + .await + .expect("iteration returns Ok even when all paths are disabled"); + + assert_eq!(provider.calls.load(Ordering::SeqCst), 1); + assert!(collector.client.has_credentials()); + assert_eq!( + result.fetch_failures, 0, + "all four paths disabled → no HTTP, no failures" + ); + // Subsequent iterations reuse the already-installed credentials. + collector + .run_iteration() + .await + .expect("second iteration ok"); + assert_eq!( + provider.calls.load(Ordering::SeqCst), + 1, + "credential provider must not be re-hit while creds are still valid" + ); + } + + #[tokio::test] + async fn iteration_is_skipped_when_credential_fetch_fails_and_recovers_next_time() { + let provider = ScriptedProvider::new(vec![ + Err(HealthError::GenericError("forge unavailable".to_string())), + Ok(BmcCredentials::UsernamePassword { + username: "admin".to_string(), + password: None, + }), + ]); + let mut collector = collector_with_provider(provider.clone()); + + let first = collector.run_iteration().await.expect("first iteration ok"); + assert_eq!(first.fetch_failures, 1, "credential fetch failure surfaces"); + assert!(!first.refresh_triggered); + assert!( + !collector.client.has_credentials(), + "failed fetch must NOT install bogus credentials" + ); + + let second = collector + .run_iteration() + .await + .expect("second iteration ok"); + assert_eq!(provider.calls.load(Ordering::SeqCst), 2); + assert!(collector.client.has_credentials()); + assert_eq!( + second.fetch_failures, 0, + "second iteration recovers — credentials now present, no GETs to fail" + ); + } + + #[tokio::test] + async fn refresh_rejects_session_token_credentials() { + let provider = ScriptedProvider::new(vec![Ok(BmcCredentials::SessionToken { + token: "irrelevant".to_string(), + })]); + let collector = collector_with_provider(provider); + + let error = collector + .refresh_rest_credentials() + .await + .expect_err("session-token credentials are not usable for NVUE basic auth"); + match error { + HealthError::GenericError(msg) => assert!( + msg.contains("requires username/password"), + "expected explicit message, got: {msg}" + ), + other => panic!("unexpected error variant: {other:?}"), + } + } + + #[tokio::test(start_paused = true)] + async fn refresh_rest_credentials_respects_timeout() { + // Mirrors the `BmcClient::refresh_credentials_respects_timeout` + // contract on the NVUE REST side: a hung Forge call must not block + // the collector's iteration loop past `CREDENTIAL_REFRESH_TIMEOUT`. + struct HangingProvider; + impl CredentialProvider for HangingProvider { + fn fetch_credentials<'a>( + &'a self, + _endpoint: &'a BmcAddr, + ) -> BoxFuture<'a, Result> { + Box::pin(std::future::pending()) + } + } + + let collector = Arc::new(collector_with_provider(Arc::new(HangingProvider))); + let refresh_collector = collector.clone(); + let refresh = + tokio::spawn(async move { refresh_collector.refresh_rest_credentials().await }); + + // Sleep just past the timeout so the tokio timer fires. + tokio::time::advance(CREDENTIAL_REFRESH_TIMEOUT + Duration::from_secs(1)).await; + let result = refresh.await.expect("task joined"); + let error = result.expect_err("hanging provider must surface as timeout"); + match error { + HealthError::GenericError(msg) => assert!( + msg.contains("Timed out"), + "expected timeout message, got: {msg}" + ), + other => panic!("unexpected error variant: {other:?}"), + } + } + + #[test] + fn debug_redacts_password() { + let creds = UsernamePassword { + username: "admin".to_string(), + password: Some("hunter2".to_string()), + }; + let rendered = format!("{creds:?}"); + assert!( + !rendered.contains("hunter2"), + "Debug must not leak the password; got: {rendered}" + ); + assert!(rendered.contains("admin")); + assert!(rendered.contains("")); + + let no_password = UsernamePassword { + username: "admin".to_string(), + password: None, + }; + let rendered = format!("{no_password:?}"); + assert!( + !rendered.contains(""), + "missing password must not show as redacted; got: {rendered}" + ); + } } diff --git a/crates/health/src/collectors/runtime.rs b/crates/health/src/collectors/runtime.rs index 94203d659a..4f459ee12d 100644 --- a/crates/health/src/collectors/runtime.rs +++ b/crates/health/src/collectors/runtime.rs @@ -23,11 +23,7 @@ use std::time::{Duration, Instant}; use async_trait::async_trait; use futures::stream::BoxStream; use futures::{StreamExt, TryStreamExt}; -use http::header::InvalidHeaderValue; -use http::{HeaderMap, header}; use nv_redfish::ServiceRoot; -use nv_redfish::bmc_http::reqwest::Client as ReqwestClient; -use nv_redfish::bmc_http::{CacheSettings, HttpBmc}; use nv_redfish::core::Bmc; use nv_redfish::event_service::EventStreamPayload; use prometheus::{Counter, Gauge, Histogram, HistogramOpts, IntCounter, IntGauge, Opts}; @@ -36,9 +32,7 @@ use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use crate::HealthError; -use crate::bmc::AuthRefreshingBmc; -use crate::config::Config as AppConfig; -use crate::discovery::BmcClient; +use crate::bmc::BmcClient; use crate::endpoint::BmcEndpoint; use crate::limiter::RateLimiter; use crate::metrics::{ @@ -257,63 +251,22 @@ pub struct Collector { cancel_token: CancellationToken, } -fn create_bmc( - client: ReqwestClient, - endpoint: Arc, - health_options: &AppConfig, -) -> Result, HealthError> { - let bmc_url = match &health_options.bmc_proxy_url { - Some(url) => url.clone(), - None => endpoint - .addr - .to_url() - .map_err(|e| HealthError::GenericError(e.to_string()))?, - }; - - let headers = if health_options.bmc_proxy_url.is_some() { - let mut headers = HeaderMap::new(); - headers.insert( - header::FORWARDED, - format!("host={}", endpoint.addr.ip) - .parse() - .map_err(|e: InvalidHeaderValue| HealthError::GenericError(e.to_string()))?, - ); - headers - } else { - HeaderMap::new() - }; - - let initial_credentials = endpoint.credentials(); - let inner = HttpBmc::with_custom_headers( - client, - bmc_url, - initial_credentials.into(), - CacheSettings::with_capacity(health_options.cache_size), - headers, - ); - - Ok(Arc::new(AuthRefreshingBmc::new(inner, endpoint))) -} - pub struct CollectorStartContext { pub limiter: Arc, pub iteration_interval: Duration, pub collector_registry: Arc, pub metrics_manager: Arc, - pub client: ReqwestClient, - pub health_options: Arc, } pub struct StreamingCollectorStartContext { pub backoff_config: BackoffConfig, pub collector_registry: Arc, - pub client: ReqwestClient, - pub health_options: Arc, } impl Collector { pub fn start>( endpoint: Arc, + bmc: Arc, config: C::Config, start_context: CollectorStartContext, ) -> Result { @@ -322,15 +275,11 @@ impl Collector { iteration_interval, collector_registry, metrics_manager, - client, - health_options, } = start_context; let cancel_token = CancellationToken::new(); let cancel_token_clone = cancel_token.clone(); - let bmc = create_bmc(client, Arc::clone(&endpoint), &health_options)?; - let mut runner = C::new_runner(bmc, endpoint.clone(), config)?; let endpoint_key = endpoint.key(); @@ -454,6 +403,7 @@ impl Collector { pub fn start_streaming>( endpoint: Arc, + bmc: Arc, config: S::Config, data_sink: Arc, start_context: StreamingCollectorStartContext, @@ -461,15 +411,11 @@ impl Collector { let StreamingCollectorStartContext { backoff_config, collector_registry, - client, - health_options, } = start_context; let cancel_token = CancellationToken::new(); let cancel_clone = cancel_token.clone(); - let bmc = create_bmc(client, Arc::clone(&endpoint), &health_options)?; - let mut collector = S::new_runner(Arc::clone(&bmc), endpoint.clone(), config)?; let event_context = EventContext::from_endpoint(&endpoint, collector.collector_type()); diff --git a/crates/health/src/discovery/context.rs b/crates/health/src/discovery/context.rs index 0c38c32d10..4611fa403f 100644 --- a/crates/health/src/discovery/context.rs +++ b/crates/health/src/discovery/context.rs @@ -19,13 +19,9 @@ use std::borrow::Cow; use std::collections::{HashMap, HashSet}; use std::sync::Arc; -use nv_redfish::bmc_http::reqwest::{ - BmcError, Client as ReqwestClient, ClientParams as ReqwestClientParams, -}; use prometheus::{Histogram, HistogramOpts}; use crate::HealthError; -use crate::bmc::AuthRefreshingBmc; use crate::collectors::Collector; use crate::config::{ Config, Configurable, FirmwareCollectorConfig as FirmwareCollectorOptions, @@ -36,8 +32,6 @@ use crate::config::{ use crate::limiter::RateLimiter; use crate::metrics::{MetricsManager, operation_duration_buckets_seconds}; -pub(crate) type BmcClient = AuthRefreshingBmc; - #[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)] pub(super) enum CollectorKind { Sensor, @@ -156,10 +150,8 @@ pub struct DiscoveryLoopContext { pub(super) collectors: CollectorState, pub(crate) discovery_iteration_histogram: Histogram, pub(crate) discovery_endpoint_fetch_histogram: Histogram, - pub(crate) client: ReqwestClient, pub(crate) limiter: Arc, pub(crate) metrics_manager: Arc, - pub(crate) config: Arc, pub(crate) sensors_config: Configurable, pub(crate) logs_config: Configurable, pub(crate) firmware_config: Configurable, @@ -196,31 +188,18 @@ impl DiscoveryLoopContext { )?; registry.register(Box::new(discovery_endpoint_fetch_histogram.clone()))?; - let client = - ReqwestClient::with_params(ReqwestClientParams::new().accept_invalid_certs(true)) - .map_err(BmcError::ReqwestError)?; - - let sensors_config = config.collectors.sensors.clone(); - let logs_config = config.collectors.logs.clone(); - let firmware_config = config.collectors.firmware.clone(); - let leak_detector_config = config.collectors.leak_detector.clone(); - let nmxt_config = config.collectors.nmxt.clone(); - let nvue_config = config.collectors.nvue.clone(); - Ok(Self { collectors: CollectorState::new(), discovery_iteration_histogram, discovery_endpoint_fetch_histogram, - client, limiter, metrics_manager, - config, - sensors_config, - logs_config, - firmware_config, - leak_detector_config, - nmxt_config, - nvue_config, + sensors_config: config.collectors.sensors.clone(), + logs_config: config.collectors.logs.clone(), + firmware_config: config.collectors.firmware.clone(), + leak_detector_config: config.collectors.leak_detector.clone(), + nmxt_config: config.collectors.nmxt.clone(), + nvue_config: config.collectors.nvue.clone(), }) } } diff --git a/crates/health/src/discovery/iteration.rs b/crates/health/src/discovery/iteration.rs index 4bb1407f5c..9a86fbec9a 100644 --- a/crates/health/src/discovery/iteration.rs +++ b/crates/health/src/discovery/iteration.rs @@ -74,7 +74,7 @@ pub async fn run_discovery_iteration( } for endpoint in &sharded_endpoints { - spawn_collectors_for_endpoint(ctx, endpoint, data_sink.clone(), metrics_prefix).await?; + spawn_collectors_for_endpoint(ctx, endpoint, data_sink.clone(), metrics_prefix)?; } let active_endpoints = active_keys(&sharded_endpoints); @@ -100,12 +100,24 @@ mod tests { use mac_address::MacAddress; use super::*; + use crate::endpoint::test_support::endpoint_with_creds; use crate::endpoint::{ BmcAddr, BmcCredentials, EndpointMetadata, SwitchData, SwitchEndpointRole, }; - fn endpoint(mac: MacAddress, switch: bool) -> Arc { - Arc::new(BmcEndpoint::with_fixed_credentials( + fn endpoint(mac: MacAddress, switch: bool, rack_id: Option) -> Arc { + let metadata = switch.then(|| { + EndpointMetadata::Switch(SwitchData { + id: None, + serial: format!("serial-{mac}"), + slot_number: None, + tray_index: None, + endpoint_role: SwitchEndpointRole::Host, + is_primary: false, + nmxt_enabled: false, + }) + }); + Arc::new(endpoint_with_creds( BmcAddr { ip: IpAddr::V4(Ipv4Addr::LOCALHOST), port: Some(443), @@ -115,28 +127,23 @@ mod tests { username: "user".to_string(), password: Some("pass".to_string()), }, - if switch { - Some(EndpointMetadata::Switch(SwitchData { - id: None, - serial: format!("serial-{mac}"), - slot_number: None, - tray_index: None, - endpoint_role: SwitchEndpointRole::Host, - is_primary: false, - nmxt_enabled: false, - })) - } else { - None - }, - None, + metadata, + rack_id, )) } - #[test] - fn test_active_keys_includes_all_endpoints() { - let mut ep1 = endpoint(MacAddress::from_str("42:9e:b1:bd:9d:dd").unwrap(), false); - Arc::get_mut(&mut ep1).unwrap().rack_id = Some(RackId::new("rack-a")); - let ep2 = endpoint(MacAddress::from_str("11:22:33:44:55:66").unwrap(), true); + #[tokio::test] + async fn test_active_keys_includes_all_endpoints() { + let ep1 = endpoint( + MacAddress::from_str("42:9e:b1:bd:9d:dd").unwrap(), + false, + Some(RackId::new("rack-a")), + ); + let ep2 = endpoint( + MacAddress::from_str("11:22:33:44:55:66").unwrap(), + true, + None, + ); let keys = active_keys(&[ep1.clone(), ep2.clone()]); diff --git a/crates/health/src/discovery/mod.rs b/crates/health/src/discovery/mod.rs index a9243bdb56..17fe3b6b95 100644 --- a/crates/health/src/discovery/mod.rs +++ b/crates/health/src/discovery/mod.rs @@ -20,7 +20,6 @@ mod context; mod iteration; mod spawn; -pub(crate) use context::BmcClient; pub use context::DiscoveryLoopContext; pub use iteration::run_discovery_iteration; diff --git a/crates/health/src/discovery/spawn.rs b/crates/health/src/discovery/spawn.rs index 2cdcbf3a9a..61490c930f 100644 --- a/crates/health/src/discovery/spawn.rs +++ b/crates/health/src/discovery/spawn.rs @@ -18,8 +18,9 @@ use std::path::PathBuf; use std::sync::Arc; -use super::context::{BmcClient, CollectorKind, DiscoveryLoopContext}; +use super::context::{CollectorKind, DiscoveryLoopContext}; use crate::HealthError; +use crate::bmc::BmcClient; use crate::collectors::{ BackoffConfig, Collector, CollectorStartContext, FirmwareCollector, FirmwareCollectorConfig, LeakDetectorCollector, LeakDetectorCollectorConfig, LogsCollector, LogsCollectorConfig, @@ -35,7 +36,7 @@ fn logs_state_file_path(template: &str, endpoint_id: &str) -> PathBuf { PathBuf::from(template.replace("{machine_id}", endpoint_id)) } -pub(super) async fn spawn_collectors_for_endpoint( +pub(super) fn spawn_collectors_for_endpoint( ctx: &mut DiscoveryLoopContext, endpoint: &Arc, data_sink: Option>, @@ -58,6 +59,7 @@ fn spawn_generic_redfish_collectors( ) -> Result<(), HealthError> { let key = endpoint.key(); let endpoint_arc = endpoint.clone(); + let bmc = endpoint.bmc().clone(); if let Configurable::Enabled(sensor_cfg) = &ctx.sensors_config && !ctx.collectors.contains(CollectorKind::Sensor, &key) @@ -68,6 +70,7 @@ fn spawn_generic_redfish_collectors( ); match Collector::start::>( endpoint_arc.clone(), + bmc.clone(), SensorCollectorConfig { data_sink: data_sink.clone(), state_refresh_interval: sensor_cfg.state_refresh_interval, @@ -79,8 +82,6 @@ fn spawn_generic_redfish_collectors( iteration_interval: sensor_cfg.sensor_fetch_interval, collector_registry, metrics_manager: ctx.metrics_manager.clone(), - client: ctx.client.clone(), - health_options: ctx.config.clone(), }, ) { Ok(monitor) => { @@ -115,13 +116,12 @@ fn spawn_generic_redfish_collectors( if let Some(data_sink) = data_sink.clone() { Some(Collector::start_streaming::>( endpoint_arc.clone(), + bmc.clone(), SseLogCollectorConfig, data_sink, StreamingCollectorStartContext { backoff_config: BackoffConfig::default(), collector_registry, - client: ctx.client.clone(), - health_options: ctx.config.clone(), }, )) } else { @@ -136,6 +136,7 @@ fn spawn_generic_redfish_collectors( Some(Collector::start::>( endpoint_arc.clone(), + bmc.clone(), LogsCollectorConfig { state_file_path, service_refresh_interval: pcfg.state_refresh_interval, @@ -146,8 +147,6 @@ fn spawn_generic_redfish_collectors( iteration_interval: pcfg.logs_collection_interval, collector_registry, metrics_manager: ctx.metrics_manager.clone(), - client: ctx.client.clone(), - health_options: ctx.config.clone(), }, )) } else { @@ -192,6 +191,7 @@ fn spawn_generic_redfish_collectors( ); match Collector::start::>( endpoint_arc.clone(), + bmc.clone(), FirmwareCollectorConfig { data_sink: data_sink.clone(), }, @@ -200,8 +200,6 @@ fn spawn_generic_redfish_collectors( iteration_interval: firmware_cfg.firmware_refresh_interval, collector_registry, metrics_manager: ctx.metrics_manager.clone(), - client: ctx.client.clone(), - health_options: ctx.config.clone(), }, ) { Ok(collector) => { @@ -233,6 +231,7 @@ fn spawn_generic_redfish_collectors( )?); match Collector::start::>( endpoint_arc, + bmc, LeakDetectorCollectorConfig { data_sink: data_sink.clone(), state_refresh_interval: leak_detector_cfg.state_refresh_interval, @@ -242,8 +241,6 @@ fn spawn_generic_redfish_collectors( iteration_interval: leak_detector_cfg.poll_interval, collector_registry, metrics_manager: ctx.metrics_manager.clone(), - client: ctx.client.clone(), - health_options: ctx.config.clone(), }, ) { Ok(collector) => { @@ -277,6 +274,7 @@ fn spawn_switch_host_collectors( ) -> Result<(), HealthError> { let key = endpoint.key(); let endpoint_arc = endpoint.clone(); + let bmc = endpoint.bmc().clone(); if endpoint .switch_data() @@ -290,6 +288,7 @@ fn spawn_switch_host_collectors( ); match Collector::start::( endpoint_arc.clone(), + bmc.clone(), NmxtCollectorConfig { nmxt_config: nmxt_cfg.clone(), data_sink: data_sink.clone(), @@ -299,8 +298,6 @@ fn spawn_switch_host_collectors( iteration_interval: nmxt_cfg.scrape_interval, collector_registry, metrics_manager: ctx.metrics_manager.clone(), - client: ctx.client.clone(), - health_options: ctx.config.clone(), }, ) { Ok(handle) => { @@ -326,23 +323,24 @@ fn spawn_switch_host_collectors( && let Configurable::Enabled(rest_cfg) = &nvue_cfg.rest && !ctx.collectors.contains(CollectorKind::NvueRest, &key) { + let credential_provider = bmc.credential_provider(); let collector_registry = Arc::new( ctx.metrics_manager .create_collector_registry(format!("nvue_rest_collector_{key}"), metrics_prefix)?, ); match Collector::start::( endpoint_arc, + bmc, NvueRestCollectorConfig { rest_config: rest_cfg.clone(), data_sink: data_sink.clone(), + credential_provider, }, CollectorStartContext { limiter: ctx.limiter.clone(), iteration_interval: rest_cfg.poll_interval, collector_registry, metrics_manager: ctx.metrics_manager.clone(), - client: ctx.client.clone(), - health_options: ctx.config.clone(), }, ) { Ok(handle) => { @@ -376,6 +374,7 @@ mod tests { use super::*; use crate::config::{Config, Configurable}; + use crate::endpoint::test_support::endpoint_with_creds; use crate::endpoint::{ BmcAddr, BmcCredentials, EndpointMetadata, MachineData, SwitchData, SwitchEndpointRole, }; @@ -406,7 +405,7 @@ mod tests { mac: &str, metadata: Option, ) -> Arc { - Arc::new(BmcEndpoint::with_fixed_credentials( + Arc::new(endpoint_with_creds( BmcAddr { ip: IpAddr::V4(ip), port: Some(443), @@ -460,15 +459,15 @@ mod tests { assert_eq!(path, PathBuf::from("/tmp/logs_endpoint-42.json")); } - #[test] - fn test_endpoint_log_identity_falls_back_to_mac_without_metadata() { + #[tokio::test] + async fn test_endpoint_log_identity_falls_back_to_mac_without_metadata() { let endpoint = test_endpoint(Ipv4Addr::new(10, 0, 0, 1), "aa:bb:cc:dd:ee:ff", None); assert_eq!(endpoint.log_identity().as_ref(), "AA:BB:CC:DD:EE:FF"); } - #[test] - fn test_endpoint_log_identity_uses_switch_serial_when_available() { + #[tokio::test] + async fn test_endpoint_log_identity_uses_switch_serial_when_available() { let endpoint = test_endpoint( Ipv4Addr::new(10, 0, 0, 2), "11:22:33:44:55:66", @@ -501,7 +500,6 @@ mod tests { Some(Arc::new(NoopSink)), "test_switch_generic_redfish_gate", ) - .await .expect("spawn should succeed"); assert_eq!(ctx.collectors.len(CollectorKind::Sensor), 0); @@ -533,7 +531,6 @@ mod tests { ); spawn_collectors_for_endpoint(&mut ctx, &endpoint, None, "test_switch_bmc_redfish_only") - .await .expect("spawn should succeed"); assert_eq!(ctx.collectors.len(CollectorKind::Sensor), 1); @@ -564,7 +561,6 @@ mod tests { ); spawn_collectors_for_endpoint(&mut ctx, &endpoint, None, "test") - .await .expect("spawn should succeed"); assert_eq!(ctx.collectors.len(CollectorKind::Sensor), 0); @@ -595,7 +591,6 @@ mod tests { ); spawn_collectors_for_endpoint(&mut ctx, &endpoint, None, "test") - .await .expect("spawn should succeed"); assert_eq!(ctx.collectors.len(CollectorKind::Nmxt), 0); @@ -625,7 +620,6 @@ mod tests { ); spawn_collectors_for_endpoint(&mut ctx, &endpoint, None, "test") - .await .expect("spawn should succeed"); assert_eq!(ctx.collectors.len(CollectorKind::Nmxt), 0); @@ -655,12 +649,82 @@ mod tests { Some(Arc::new(NoopSink)), "test_machine_sse_logs_collector", ) - .await .expect("spawn should succeed"); assert_eq!(ctx.collectors.len(CollectorKind::Logs), 1); } + #[tokio::test] + async fn test_nvue_rest_still_spawned_when_credentials_currently_unavailable() { + use crate::bmc::{BmcClient, BoxFuture, CredentialProvider}; + use crate::endpoint::test_support::reqwest; + + struct FailingProvider; + impl CredentialProvider for FailingProvider { + fn fetch_credentials<'a>( + &'a self, + _endpoint: &'a BmcAddr, + ) -> BoxFuture<'a, Result> { + Box::pin(async move { + Err(HealthError::GenericError( + "simulated credential provider failure".to_string(), + )) + }) + } + } + + let addr = BmcAddr { + ip: IpAddr::V4(Ipv4Addr::new(10, 0, 0, 99)), + port: Some(443), + mac: MacAddress::from_str("99:88:77:66:55:44").expect("valid mac"), + }; + let bmc = Arc::new( + BmcClient::new(reqwest(), addr.clone(), Arc::new(FailingProvider), None, 10) + .expect("constructor succeeds"), + ); + let endpoint = Arc::new(BmcEndpoint { + addr, + metadata: Some(switch_metadata_with_role( + SwitchEndpointRole::Host, + true, + true, + "failing-switch-host", + )), + rack_id: None, + bmc, + }); + + let mut config = Config::default(); + config.collectors.sensors = Configurable::Disabled; + config.collectors.logs = Configurable::Disabled; + config.collectors.firmware = Configurable::Disabled; + config.collectors.leak_detector = Configurable::Disabled; + config.collectors.nmxt = Configurable::Enabled(Default::default()); + config.collectors.nvue = Configurable::Enabled(Default::default()); + + let mut ctx = context_with_config(config, "test_nvue_rest_spawn_despite_cred_failure"); + + spawn_collectors_for_endpoint( + &mut ctx, + &endpoint, + None, + "test_nvue_rest_spawn_despite_cred_failure", + ) + .expect("spawn returns Ok even when the credential provider is failing"); + + assert_eq!( + ctx.collectors.len(CollectorKind::NvueRest), + 1, + "NVUE REST must still spawn — credential fetch is now per-iteration, \ + not part of the spawn contract" + ); + assert_eq!( + ctx.collectors.len(CollectorKind::Nmxt), + 1, + "NMX-T must still start — it doesn't depend on BMC credentials" + ); + } + #[tokio::test] async fn test_spawn_is_idempotent_when_collectors_are_disabled() { let mut config = Config::default(); @@ -675,10 +739,8 @@ mod tests { let endpoint = test_endpoint(Ipv4Addr::new(10, 0, 0, 1), "aa:bb:cc:dd:ee:ff", None); spawn_collectors_for_endpoint(&mut ctx, &endpoint, None, "test") - .await .expect("first spawn should succeed"); spawn_collectors_for_endpoint(&mut ctx, &endpoint, None, "test") - .await .expect("second spawn should also succeed without duplicate registry errors"); assert_eq!(ctx.collectors.len(CollectorKind::Sensor), 0); diff --git a/crates/health/src/endpoint/mod.rs b/crates/health/src/endpoint/mod.rs index 081caef8d8..e5e4a9e33b 100644 --- a/crates/health/src/endpoint/mod.rs +++ b/crates/health/src/endpoint/mod.rs @@ -19,28 +19,52 @@ mod model; mod sources; pub use model::{ - BmcAddr, BmcCredentials, BmcEndpoint, BoxFuture, CredentialProvider, EndpointMetadata, - EndpointSource, MachineData, PowerShelfData, SwitchData, SwitchEndpointRole, + BmcAddr, BmcCredentials, BmcEndpoint, EndpointMetadata, EndpointSource, MachineData, + PowerShelfData, SwitchData, SwitchEndpointRole, }; pub use sources::{CompositeEndpointSource, StaticEndpointSource}; +pub use crate::bmc::{BoxFuture, CredentialProvider}; + #[cfg(test)] -mod tests { +pub(crate) mod test_support { use std::str::FromStr; use std::sync::Arc; - use carbide_uuid::power_shelf::{PowerShelfIdSource, PowerShelfType}; - use carbide_uuid::switch::{SwitchIdSource, SwitchType}; use mac_address::MacAddress; + use nv_redfish::bmc_http::reqwest::{ + Client as ReqwestClient, ClientParams as ReqwestClientParams, + }; use super::*; - use crate::HealthError; - use crate::config::{ - StaticBmcEndpoint, StaticMachineEndpoint, StaticPowerShelfEndpoint, StaticSwitchEndpoint, - }; + use crate::bmc::{BmcClient, FixedCredentialProvider}; - fn make_test_endpoint(mac: MacAddress) -> BmcEndpoint { - BmcEndpoint::with_fixed_credentials( + pub fn reqwest() -> ReqwestClient { + ReqwestClient::with_params(ReqwestClientParams::new().accept_invalid_certs(true)) + .expect("reqwest client builds") + } + + pub fn endpoint_with_creds( + addr: BmcAddr, + creds: BmcCredentials, + metadata: Option, + rack_id: Option, + ) -> BmcEndpoint { + let provider = Arc::new(FixedCredentialProvider::new(creds)); + let bmc = Arc::new( + BmcClient::new(reqwest(), addr.clone(), provider, None, 10) + .expect("fixed-credential BmcClient construction is infallible"), + ); + BmcEndpoint { + addr, + metadata, + rack_id, + bmc, + } + } + + pub fn test_endpoint(mac: MacAddress) -> BmcEndpoint { + endpoint_with_creds( BmcAddr { ip: "10.0.0.1".parse().unwrap(), port: Some(443), @@ -55,29 +79,23 @@ mod tests { ) } - fn test_switch_id(label: &str) -> carbide_uuid::switch::SwitchId { - let mut hash = [0u8; 32]; - let bytes = label.as_bytes(); - hash[..bytes.len().min(32)].copy_from_slice(&bytes[..bytes.len().min(32)]); - carbide_uuid::switch::SwitchId::new(SwitchIdSource::Tpm, hash, SwitchType::NvLink) + pub fn mac(s: &str) -> MacAddress { + MacAddress::from_str(s).unwrap() } +} - fn test_power_shelf_id(label: &str) -> carbide_uuid::power_shelf::PowerShelfId { - let mut hash = [0u8; 32]; - let bytes = label.as_bytes(); - hash[..bytes.len().min(32)].copy_from_slice(&bytes[..bytes.len().min(32)]); - carbide_uuid::power_shelf::PowerShelfId::new( - PowerShelfIdSource::ProductBoardChassisSerial, - hash, - PowerShelfType::Rack, - ) - } +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use super::test_support::{mac, test_endpoint}; + use super::*; #[tokio::test] async fn test_static_endpoint_source_shares_arc_data() { let endpoints = vec![ - make_test_endpoint(MacAddress::from_str("00:11:22:33:44:55").unwrap()), - make_test_endpoint(MacAddress::from_str("aa:bb:cc:dd:ee:ff").unwrap()), + test_endpoint(mac("00:11:22:33:44:55")), + test_endpoint(mac("aa:bb:cc:dd:ee:ff")), ]; let source = StaticEndpointSource::new(endpoints); @@ -92,12 +110,8 @@ mod tests { #[tokio::test] async fn test_composite_endpoint_source_preserves_arc_sharing() { - let endpoints1 = vec![make_test_endpoint( - MacAddress::from_str("00:11:22:33:44:55").unwrap(), - )]; - let endpoints2 = vec![make_test_endpoint( - MacAddress::from_str("aa:bb:cc:dd:ee:ff").unwrap(), - )]; + let endpoints1 = vec![test_endpoint(mac("00:11:22:33:44:55"))]; + let endpoints2 = vec![test_endpoint(mac("aa:bb:cc:dd:ee:ff"))]; let source1 = Arc::new(StaticEndpointSource::new(endpoints1)); let source2 = Arc::new(StaticEndpointSource::new(endpoints2)); @@ -118,12 +132,12 @@ mod tests { let addr_http = BmcAddr { ip: "10.0.0.1".parse().expect("valid ip"), port: Some(80), - mac: MacAddress::from_str("00:11:22:33:44:55").unwrap(), + mac: mac("00:11:22:33:44:55"), }; let addr_https = BmcAddr { ip: "10.0.0.2".parse().expect("valid ip"), port: Some(443), - mac: MacAddress::from_str("aa:bb:cc:dd:ee:ff").unwrap(), + mac: mac("aa:bb:cc:dd:ee:ff"), }; let url_http = addr_http.to_url().expect("url should build"); @@ -132,210 +146,4 @@ mod tests { assert_eq!(url_http.scheme(), "http"); assert_eq!(url_https.scheme(), "https"); } - - #[tokio::test] - async fn test_static_endpoint_source_filters_invalid_ip() { - let configs = vec![ - StaticBmcEndpoint { - ip: "10.0.0.1".to_string(), - port: Some(443), - mac: "00:11:22:33:44:55".to_string(), - username: "admin".to_string(), - password: Some("pass".to_string()), - machine: None, - power_shelf: None, - switch: None, - rack_id: None, - }, - StaticBmcEndpoint { - ip: "not-an-ip".to_string(), - port: Some(443), - mac: "aa:bb:cc:dd:ee:ff".to_string(), - username: "admin".to_string(), - password: Some("pass".to_string()), - machine: None, - power_shelf: None, - switch: None, - rack_id: None, - }, - ]; - - let source = StaticEndpointSource::from_config(&configs); - let endpoints = source.fetch_bmc_hosts().await.expect("fetch should work"); - - assert_eq!(endpoints.len(), 1); - assert_eq!( - endpoints[0].addr.mac, - MacAddress::from_str("00:11:22:33:44:55").unwrap() - ); - } - - #[tokio::test] - async fn test_static_endpoint_with_switch_serial_sets_metadata() { - let switch_id = test_switch_id("switch-a"); - let configs = vec![StaticBmcEndpoint { - ip: "10.0.1.1".to_string(), - port: Some(443), - mac: "11:22:33:44:55:66".to_string(), - username: "cumulus".to_string(), - password: Some("pass".to_string()), - machine: None, - power_shelf: None, - switch: Some(StaticSwitchEndpoint { - id: Some(switch_id.to_string()), - serial: Some("SN-001".to_string()), - slot_number: Some(7), - tray_index: Some(3), - endpoint_role: crate::config::StaticSwitchEndpointRole::Host, - is_primary: true, - nmxt_enabled: None, - }), - rack_id: None, - }]; - - let source = StaticEndpointSource::from_config(&configs); - let endpoints = source.fetch_bmc_hosts().await.unwrap(); - - assert_eq!(endpoints.len(), 1); - match &endpoints[0].metadata { - Some(EndpointMetadata::Switch(s)) => { - assert_eq!(s.id, Some(switch_id)); - assert_eq!(s.serial, "SN-001"); - assert_eq!(s.slot_number, Some(7)); - assert_eq!(s.tray_index, Some(3)); - assert_eq!(s.endpoint_role, SwitchEndpointRole::Host); - assert!(s.is_primary); - assert!(s.nmxt_enabled); - } - other => panic!("expected Switch metadata, got {other:?}"), - } - } - - #[tokio::test] - async fn test_static_endpoint_with_power_shelf_metadata() { - let power_shelf_id = test_power_shelf_id("power-shelf-a"); - let configs = vec![StaticBmcEndpoint { - ip: "10.0.2.1".to_string(), - port: Some(443), - mac: "22:33:44:55:66:77".to_string(), - username: "admin".to_string(), - password: Some("pass".to_string()), - machine: None, - power_shelf: Some(StaticPowerShelfEndpoint { - id: Some(power_shelf_id.to_string()), - serial: Some("PS-001".to_string()), - }), - switch: None, - rack_id: None, - }]; - - let source = StaticEndpointSource::from_config(&configs); - let endpoints = source.fetch_bmc_hosts().await.unwrap(); - - assert_eq!(endpoints.len(), 1); - match &endpoints[0].metadata { - Some(EndpointMetadata::PowerShelf(power_shelf)) => { - assert_eq!(power_shelf.id, Some(power_shelf_id)); - assert_eq!(power_shelf.serial, "PS-001"); - } - other => panic!("expected PowerShelf metadata, got {other:?}"), - } - } - - #[tokio::test] - async fn test_static_machine_endpoint_sets_placement_and_nvlink_metadata() { - let machine_id = "fm100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0" - .parse() - .expect("valid machine id"); - let domain_uuid = "00000000-0000-0000-0000-000000000000" - .parse() - .expect("valid NVLink domain UUID"); - let configs = vec![StaticBmcEndpoint { - ip: "10.0.1.2".to_string(), - port: Some(443), - mac: "11:22:33:44:55:11".to_string(), - username: "admin".to_string(), - password: Some("pass".to_string()), - machine: Some(StaticMachineEndpoint { - id: "fm100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0".to_string(), - serial: Some("MN-001".to_string()), - slot_number: Some(15), - tray_index: Some(5), - nvlink_domain_uuid: Some("00000000-0000-0000-0000-000000000000".to_string()), - }), - power_shelf: None, - switch: None, - rack_id: Some("RACK_1".to_string()), - }]; - - let source = StaticEndpointSource::from_config(&configs); - let endpoints = source.fetch_bmc_hosts().await.unwrap(); - - assert_eq!(endpoints.len(), 1); - assert_eq!( - endpoints[0] - .rack_id - .as_ref() - .map(|rack_id| rack_id.as_str()), - Some("RACK_1") - ); - match &endpoints[0].metadata { - Some(EndpointMetadata::Machine(machine)) => { - assert_eq!(machine.machine_id, machine_id); - assert_eq!(machine.machine_serial.as_deref(), Some("MN-001")); - assert_eq!(machine.slot_number, Some(15)); - assert_eq!(machine.tray_index, Some(5)); - assert_eq!(machine.nvlink_domain_uuid, Some(domain_uuid)); - } - other => panic!("expected Machine metadata, got {other:?}"), - } - } - - #[tokio::test] - async fn test_static_endpoint_without_switch_serial_has_no_metadata() { - let configs = vec![StaticBmcEndpoint { - ip: "10.0.0.1".to_string(), - port: Some(443), - mac: "aa:bb:cc:dd:ee:ff".to_string(), - username: "admin".to_string(), - password: Some("pass".to_string()), - machine: None, - power_shelf: None, - switch: None, - rack_id: None, - }]; - - let source = StaticEndpointSource::from_config(&configs); - let endpoints = source.fetch_bmc_hosts().await.unwrap(); - - assert_eq!(endpoints.len(), 1); - assert!(endpoints[0].metadata.is_none()); - } - - struct FailingSource; - - impl EndpointSource for FailingSource { - fn fetch_bmc_hosts<'a>( - &'a self, - ) -> BoxFuture<'a, Result>, HealthError>> { - Box::pin(async { - Err(HealthError::GenericError( - "simulated endpoint source failure".to_string(), - )) - }) - } - } - - #[tokio::test] - async fn test_composite_endpoint_source_propagates_errors() { - let source_ok = Arc::new(StaticEndpointSource::new(vec![make_test_endpoint( - MacAddress::from_str("00:11:22:33:44:55").unwrap(), - )])); - let source_fail = Arc::new(FailingSource); - let composite = CompositeEndpointSource::new(vec![source_ok, source_fail]); - - let result = composite.fetch_bmc_hosts().await; - - assert!(result.is_err()); - } } diff --git a/crates/health/src/endpoint/model.rs b/crates/health/src/endpoint/model.rs index bebfba1d10..629bfcc21c 100644 --- a/crates/health/src/endpoint/model.rs +++ b/crates/health/src/endpoint/model.rs @@ -16,10 +16,8 @@ */ use std::borrow::Cow; -use std::future::Future; use std::net::IpAddr; -use std::pin::Pin; -use std::sync::{Arc, RwLock}; +use std::sync::Arc; use carbide_uuid::machine::MachineId; use carbide_uuid::nvlink::NvLinkDomainId; @@ -30,38 +28,14 @@ use mac_address::MacAddress; use url::Url; use crate::HealthError; - -pub type BoxFuture<'a, T> = Pin + Send + 'a>>; - -pub trait CredentialProvider: Send + Sync { - fn fetch_credentials<'a>( - &'a self, - endpoint: &'a BmcAddr, - ) -> BoxFuture<'a, Result>; -} - -#[derive(Clone)] -pub struct FixedCredentialProvider { - credentials: BmcCredentials, -} - -impl CredentialProvider for FixedCredentialProvider { - fn fetch_credentials<'a>( - &'a self, - _endpoint: &'a BmcAddr, - ) -> BoxFuture<'a, Result> { - let credentials = self.credentials.clone(); - Box::pin(async move { Ok(credentials) }) - } -} +use crate::bmc::{BmcClient, BoxFuture}; #[derive(Clone)] pub struct BmcEndpoint { pub addr: BmcAddr, pub metadata: Option, pub rack_id: Option, - pub(crate) credentials: Arc>, - pub(crate) provider: Arc, + pub bmc: Arc, } impl BmcEndpoint { @@ -78,25 +52,6 @@ impl BmcEndpoint { ) } - pub fn with_fixed_credentials( - addr: BmcAddr, - credentials: BmcCredentials, - metadata: Option, - rack_id: Option, - ) -> Self { - let provider = Arc::new(FixedCredentialProvider { - credentials: credentials.clone(), - }); - - Self { - addr, - metadata, - rack_id, - credentials: Arc::new(RwLock::new(credentials)), - provider, - } - } - pub fn log_identity(&self) -> Cow<'_, str> { match &self.metadata { Some(EndpointMetadata::Machine(machine)) => Cow::Owned(machine.machine_id.to_string()), @@ -106,21 +61,12 @@ impl BmcEndpoint { } } - pub fn switch_data(&self) -> Option<&SwitchData> { - self.metadata.as_ref().and_then(EndpointMetadata::as_switch) + pub fn bmc(&self) -> &Arc { + &self.bmc } - pub fn credentials(&self) -> BmcCredentials { - self.credentials.read().expect("lock poisoned").to_owned() - } - - pub async fn refresh(&self) -> Result { - let credentials = self.provider.fetch_credentials(&self.addr).await?; - self.credentials - .write() - .map(|mut current| *current = credentials.clone()) - .expect("lock poisoned"); - Ok(credentials) + pub fn switch_data(&self) -> Option<&SwitchData> { + self.metadata.as_ref().and_then(EndpointMetadata::as_switch) } } diff --git a/crates/health/src/endpoint/sources.rs b/crates/health/src/endpoint/sources.rs index 6023f20c2b..cd46f852ea 100644 --- a/crates/health/src/endpoint/sources.rs +++ b/crates/health/src/endpoint/sources.rs @@ -21,8 +21,11 @@ use std::sync::Arc; use carbide_uuid::nvlink::NvLinkDomainId; use carbide_uuid::rack::RackId; use mac_address::MacAddress; +use nv_redfish::bmc_http::reqwest::Client as ReqwestClient; +use url::Url; use crate::HealthError; +use crate::bmc::{BmcClient, FixedCredentialProvider}; use crate::config::{StaticBmcEndpoint, StaticSwitchEndpointRole}; use crate::endpoint::{ BmcAddr, BmcCredentials, BmcEndpoint, BoxFuture, EndpointMetadata, EndpointSource, MachineData, @@ -40,78 +43,87 @@ impl StaticEndpointSource { } } - pub fn from_config(configs: &[StaticBmcEndpoint]) -> Self { - let endpoints = configs - .iter() - .filter_map(|cfg| { - let ip = match cfg.ip.parse() { - Ok(ip) => ip, + pub fn from_config( + configs: &[StaticBmcEndpoint], + reqwest: &ReqwestClient, + proxy_url: Option<&Url>, + cache_size: usize, + ) -> Self { + let mut endpoints = Vec::with_capacity(configs.len()); + + for cfg in configs { + let ip = match cfg.ip.parse() { + Ok(ip) => ip, + Err(error) => { + tracing::warn!(?error, ip = ?cfg.ip, "Invalid IP in static endpoint config"); + continue; + } + }; + + let mac = match MacAddress::from_str(&cfg.mac) { + Ok(mac) => mac, + Err(error) => { + tracing::warn!(?error, mac = ?cfg.mac, "Invalid MAC in static endpoint config"); + continue; + } + }; + + let metadata = if let Some(power_shelf) = &cfg.power_shelf { + let id = power_shelf.id.as_ref().and_then(|id| match id.parse() { + Ok(id) => Some(id), Err(error) => { - tracing::warn!(?error, ip = ?cfg.ip, "Invalid IP in static endpoint config"); - return None; + tracing::warn!( + ?error, + power_shelf_id = ?id, + "Invalid power_shelf.id in static endpoint config" + ); + None } + }); + let serial = power_shelf + .serial + .clone() + .or_else(|| power_shelf.id.clone()) + .unwrap_or_else(|| cfg.mac.clone()); + + Some(EndpointMetadata::PowerShelf(PowerShelfData { id, serial })) + } else if let Some(switch) = &cfg.switch { + let id = switch.id.as_ref().and_then(|id| match id.parse() { + Ok(id) => Some(id), + Err(error) => { + tracing::warn!( + ?error, + switch_id = ?id, + "Invalid switch.id in static endpoint config" + ); + None + } + }); + let serial = switch + .serial + .clone() + .or_else(|| switch.id.clone()) + .unwrap_or_else(|| cfg.mac.clone()); + let endpoint_role = match switch.endpoint_role { + StaticSwitchEndpointRole::Bmc => SwitchEndpointRole::Bmc, + StaticSwitchEndpointRole::Host => SwitchEndpointRole::Host, }; + let nmxt_enabled = switch.nmxt_enabled.unwrap_or(switch.is_primary); - let mac = MacAddress::from_str(&cfg.mac).ok()?; - - let metadata = if let Some(power_shelf) = &cfg.power_shelf { - let id = power_shelf.id.as_ref().and_then(|id| match id.parse() { - Ok(id) => Some(id), - Err(error) => { - tracing::warn!( - ?error, - power_shelf_id = ?id, - "Invalid power_shelf.id in static endpoint config" - ); - None - } - }); - let serial = power_shelf - .serial - .clone() - .or_else(|| power_shelf.id.clone()) - .unwrap_or_else(|| cfg.mac.clone()); - - Some(EndpointMetadata::PowerShelf(PowerShelfData { - id, - serial, - })) - } else if let Some(switch) = &cfg.switch { - let id = switch.id.as_ref().and_then(|id| match id.parse() { - Ok(id) => Some(id), - Err(error) => { - tracing::warn!( - ?error, - switch_id = ?id, - "Invalid switch.id in static endpoint config" - ); - None - } - }); - let serial = switch - .serial - .clone() - .or_else(|| switch.id.clone()) - .unwrap_or_else(|| cfg.mac.clone()); - let endpoint_role = match switch.endpoint_role { - StaticSwitchEndpointRole::Bmc => SwitchEndpointRole::Bmc, - StaticSwitchEndpointRole::Host => SwitchEndpointRole::Host, - }; - let nmxt_enabled = switch.nmxt_enabled.unwrap_or(switch.is_primary); - - Some(EndpointMetadata::Switch(SwitchData { - id, - serial, - slot_number: switch.slot_number, - tray_index: switch.tray_index, - endpoint_role, - is_primary: switch.is_primary, - nmxt_enabled, - })) - } else if let Some(machine) = &cfg.machine { - let machine_id = &machine.id; - let nvlink_domain_uuid = machine.nvlink_domain_uuid.as_ref().and_then(|id| { - match NvLinkDomainId::from_str(id) { + Some(EndpointMetadata::Switch(SwitchData { + id, + serial, + slot_number: switch.slot_number, + tray_index: switch.tray_index, + endpoint_role, + is_primary: switch.is_primary, + nmxt_enabled, + })) + } else if let Some(machine) = &cfg.machine { + let machine_id = &machine.id; + let nvlink_domain_uuid = + machine.nvlink_domain_uuid.as_ref().and_then( + |id| match NvLinkDomainId::from_str(id) { Ok(id) => Some(id), Err(error) => { tracing::warn!( @@ -121,47 +133,65 @@ impl StaticEndpointSource { ); None } - } - }); - - match machine_id.parse() { - Ok(machine_id) => Some(EndpointMetadata::Machine(MachineData { - machine_id, - machine_serial: machine.serial.clone(), - slot_number: machine.slot_number, - tray_index: machine.tray_index, - nvlink_domain_uuid, - })), - Err(error) => { - tracing::warn!( - ?error, - ?machine_id, - "Invalid machine.id in static endpoint config" - ); - None - } + }, + ); + + match machine_id.parse() { + Ok(machine_id) => Some(EndpointMetadata::Machine(MachineData { + machine_id, + machine_serial: machine.serial.clone(), + slot_number: machine.slot_number, + tray_index: machine.tray_index, + nvlink_domain_uuid, + })), + Err(error) => { + tracing::warn!( + ?error, + ?machine_id, + "Invalid machine.id in static endpoint config" + ); + None } - } else { - None - }; + } + } else { + None + }; - let endpoint = BmcEndpoint::with_fixed_credentials( - BmcAddr { - ip, - port: cfg.port, - mac, - }, - BmcCredentials::UsernamePassword { - username: cfg.username.clone(), - password: cfg.password.clone(), - }, - metadata, - cfg.rack_id.as_ref().map(|id| RackId::new(id.as_str())), - ); - - Some(Arc::new(endpoint)) - }) - .collect(); + let addr = BmcAddr { + ip, + port: cfg.port, + mac, + }; + let credentials = BmcCredentials::UsernamePassword { + username: cfg.username.clone(), + password: cfg.password.clone(), + }; + let provider = Arc::new(FixedCredentialProvider::new(credentials)); + let bmc = match BmcClient::new( + reqwest.clone(), + addr.clone(), + provider, + proxy_url.cloned(), + cache_size, + ) { + Ok(client) => Arc::new(client), + Err(error) => { + tracing::warn!( + ?error, + ?addr, + "Failed to construct BmcClient for static endpoint" + ); + continue; + } + }; + let endpoint = BmcEndpoint { + addr, + metadata, + rack_id: cfg.rack_id.as_ref().map(|id| RackId::new(id.as_str())), + bmc, + }; + endpoints.push(Arc::new(endpoint)); + } Self { endpoints } } @@ -201,3 +231,246 @@ impl EndpointSource for CompositeEndpointSource { }) } } + +#[cfg(test)] +mod tests { + use carbide_uuid::power_shelf::{PowerShelfIdSource, PowerShelfType}; + use carbide_uuid::switch::{SwitchIdSource, SwitchType}; + use nv_redfish::bmc_http::reqwest::ClientParams as ReqwestClientParams; + + use super::*; + use crate::config::{ + StaticBmcEndpoint, StaticMachineEndpoint, StaticPowerShelfEndpoint, StaticSwitchEndpoint, + StaticSwitchEndpointRole, + }; + + fn reqwest() -> ReqwestClient { + ReqwestClient::with_params(ReqwestClientParams::new().accept_invalid_certs(true)) + .expect("reqwest client builds") + } + + fn test_switch_id(label: &str) -> carbide_uuid::switch::SwitchId { + let mut hash = [0u8; 32]; + let bytes = label.as_bytes(); + hash[..bytes.len().min(32)].copy_from_slice(&bytes[..bytes.len().min(32)]); + carbide_uuid::switch::SwitchId::new(SwitchIdSource::Tpm, hash, SwitchType::NvLink) + } + + fn test_power_shelf_id(label: &str) -> carbide_uuid::power_shelf::PowerShelfId { + let mut hash = [0u8; 32]; + let bytes = label.as_bytes(); + hash[..bytes.len().min(32)].copy_from_slice(&bytes[..bytes.len().min(32)]); + carbide_uuid::power_shelf::PowerShelfId::new( + PowerShelfIdSource::ProductBoardChassisSerial, + hash, + PowerShelfType::Rack, + ) + } + + #[tokio::test] + async fn test_static_endpoint_source_filters_invalid_ip() { + let configs = vec![ + StaticBmcEndpoint { + ip: "10.0.0.1".to_string(), + port: Some(443), + mac: "00:11:22:33:44:55".to_string(), + username: "admin".to_string(), + password: Some("pass".to_string()), + machine: None, + power_shelf: None, + switch: None, + rack_id: None, + }, + StaticBmcEndpoint { + ip: "not-an-ip".to_string(), + port: Some(443), + mac: "aa:bb:cc:dd:ee:ff".to_string(), + username: "admin".to_string(), + password: Some("pass".to_string()), + machine: None, + power_shelf: None, + switch: None, + rack_id: None, + }, + ]; + + let source = StaticEndpointSource::from_config(&configs, &reqwest(), None, 10); + let endpoints = source.fetch_bmc_hosts().await.expect("fetch should work"); + + assert_eq!(endpoints.len(), 1); + assert_eq!( + endpoints[0].addr.mac, + MacAddress::from_str("00:11:22:33:44:55").unwrap() + ); + } + + #[tokio::test] + async fn test_static_endpoint_with_switch_serial_sets_metadata() { + let switch_id = test_switch_id("switch-a"); + let configs = vec![StaticBmcEndpoint { + ip: "10.0.1.1".to_string(), + port: Some(443), + mac: "11:22:33:44:55:66".to_string(), + username: "cumulus".to_string(), + password: Some("pass".to_string()), + machine: None, + power_shelf: None, + switch: Some(StaticSwitchEndpoint { + id: Some(switch_id.to_string()), + serial: Some("SN-001".to_string()), + slot_number: Some(7), + tray_index: Some(3), + endpoint_role: StaticSwitchEndpointRole::Host, + is_primary: true, + nmxt_enabled: None, + }), + rack_id: None, + }]; + + let source = StaticEndpointSource::from_config(&configs, &reqwest(), None, 10); + let endpoints = source.fetch_bmc_hosts().await.unwrap(); + + assert_eq!(endpoints.len(), 1); + match &endpoints[0].metadata { + Some(EndpointMetadata::Switch(s)) => { + assert_eq!(s.id, Some(switch_id)); + assert_eq!(s.serial, "SN-001"); + assert_eq!(s.slot_number, Some(7)); + assert_eq!(s.tray_index, Some(3)); + assert_eq!(s.endpoint_role, SwitchEndpointRole::Host); + assert!(s.is_primary); + assert!(s.nmxt_enabled); + } + other => panic!("expected Switch metadata, got {other:?}"), + } + } + + #[tokio::test] + async fn test_static_endpoint_with_power_shelf_metadata() { + let power_shelf_id = test_power_shelf_id("power-shelf-a"); + let configs = vec![StaticBmcEndpoint { + ip: "10.0.2.1".to_string(), + port: Some(443), + mac: "22:33:44:55:66:77".to_string(), + username: "admin".to_string(), + password: Some("pass".to_string()), + machine: None, + power_shelf: Some(StaticPowerShelfEndpoint { + id: Some(power_shelf_id.to_string()), + serial: Some("PS-001".to_string()), + }), + switch: None, + rack_id: None, + }]; + + let source = StaticEndpointSource::from_config(&configs, &reqwest(), None, 10); + let endpoints = source.fetch_bmc_hosts().await.unwrap(); + + assert_eq!(endpoints.len(), 1); + match &endpoints[0].metadata { + Some(EndpointMetadata::PowerShelf(power_shelf)) => { + assert_eq!(power_shelf.id, Some(power_shelf_id)); + assert_eq!(power_shelf.serial, "PS-001"); + } + other => panic!("expected PowerShelf metadata, got {other:?}"), + } + } + + #[tokio::test] + async fn test_static_machine_endpoint_sets_placement_and_nvlink_metadata() { + let machine_id = "fm100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0" + .parse() + .expect("valid machine id"); + let domain_uuid = "00000000-0000-0000-0000-000000000000" + .parse() + .expect("valid NVLink domain UUID"); + let configs = vec![StaticBmcEndpoint { + ip: "10.0.1.2".to_string(), + port: Some(443), + mac: "11:22:33:44:55:11".to_string(), + username: "admin".to_string(), + password: Some("pass".to_string()), + machine: Some(StaticMachineEndpoint { + id: "fm100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0".to_string(), + serial: Some("MN-001".to_string()), + slot_number: Some(15), + tray_index: Some(5), + nvlink_domain_uuid: Some("00000000-0000-0000-0000-000000000000".to_string()), + }), + power_shelf: None, + switch: None, + rack_id: Some("RACK_1".to_string()), + }]; + + let source = StaticEndpointSource::from_config(&configs, &reqwest(), None, 10); + let endpoints = source.fetch_bmc_hosts().await.unwrap(); + + assert_eq!(endpoints.len(), 1); + assert_eq!( + endpoints[0] + .rack_id + .as_ref() + .map(|rack_id| rack_id.as_str()), + Some("RACK_1") + ); + match &endpoints[0].metadata { + Some(EndpointMetadata::Machine(machine)) => { + assert_eq!(machine.machine_id, machine_id); + assert_eq!(machine.machine_serial.as_deref(), Some("MN-001")); + assert_eq!(machine.slot_number, Some(15)); + assert_eq!(machine.tray_index, Some(5)); + assert_eq!(machine.nvlink_domain_uuid, Some(domain_uuid)); + } + other => panic!("expected Machine metadata, got {other:?}"), + } + } + + #[tokio::test] + async fn test_static_endpoint_without_switch_serial_has_no_metadata() { + let configs = vec![StaticBmcEndpoint { + ip: "10.0.0.1".to_string(), + port: Some(443), + mac: "aa:bb:cc:dd:ee:ff".to_string(), + username: "admin".to_string(), + password: Some("pass".to_string()), + machine: None, + power_shelf: None, + switch: None, + rack_id: None, + }]; + + let source = StaticEndpointSource::from_config(&configs, &reqwest(), None, 10); + let endpoints = source.fetch_bmc_hosts().await.unwrap(); + + assert_eq!(endpoints.len(), 1); + assert!(endpoints[0].metadata.is_none()); + } + + struct FailingSource; + + impl EndpointSource for FailingSource { + fn fetch_bmc_hosts<'a>( + &'a self, + ) -> BoxFuture<'a, Result>, HealthError>> { + Box::pin(async { + Err(HealthError::GenericError( + "simulated endpoint source failure".to_string(), + )) + }) + } + } + + #[tokio::test] + async fn test_composite_endpoint_source_propagates_errors() { + let endpoints = vec![super::super::test_support::test_endpoint( + MacAddress::from_str("00:11:22:33:44:55").unwrap(), + )]; + let source_ok = Arc::new(StaticEndpointSource::new(endpoints)); + let source_fail = Arc::new(FailingSource); + let composite = CompositeEndpointSource::new(vec![source_ok, source_fail]); + + let result = composite.fetch_bmc_hosts().await; + + assert!(result.is_err()); + } +} diff --git a/crates/health/src/lib.rs b/crates/health/src/lib.rs index 12fbd024e3..9386859f09 100644 --- a/crates/health/src/lib.rs +++ b/crates/health/src/lib.rs @@ -18,7 +18,9 @@ use std::sync::Arc; use std::time::Duration; -use nv_redfish::bmc_http::reqwest::BmcError; +use nv_redfish::bmc_http::reqwest::{ + BmcError, Client as ReqwestClient, ClientParams as ReqwestClientParams, +}; use prometheus::{Gauge, GaugeVec, Opts}; pub mod api_client; @@ -37,7 +39,7 @@ pub mod sink; pub use config::Config; pub use discovery::{DiscoveryIterationStats, DiscoveryLoopContext}; -use crate::api_client::ApiClientWrapper; +use crate::api_client::{ApiClientWrapper, ApiEndpointSource}; use crate::config::Configurable; use crate::endpoint::{CompositeEndpointSource, EndpointSource, StaticEndpointSource}; use crate::limiter::{BucketLimiter, NoopLimiter, RateLimiter}; @@ -110,11 +112,16 @@ struct EndpointWiring { } fn build_endpoint_wiring(config: &Config) -> Result { + let reqwest = ReqwestClient::with_params(ReqwestClientParams::new().accept_invalid_certs(true)) + .map_err(BmcError::ReqwestError)?; let mut sources: Vec> = Vec::new(); if !config.endpoint_sources.static_bmc_endpoints.is_empty() { let static_source = StaticEndpointSource::from_config( config.endpoint_sources.static_bmc_endpoints.as_slice(), + &reqwest, + config.bmc_proxy_url.as_ref(), + config.cache_size, ); sources.push(Arc::new(static_source)); } @@ -126,7 +133,13 @@ fn build_endpoint_wiring(config: &Config) -> Result source_cfg.client_key.clone(), &source_cfg.api_url, )); - sources.push(api_client as Arc); + let endpoint_source = Arc::new(ApiEndpointSource::new( + api_client, + reqwest, + config.bmc_proxy_url.clone(), + config.cache_size, + )); + sources.push(endpoint_source as Arc); } let composite_source = CompositeEndpointSource::new(sources); diff --git a/crates/health/src/sharding.rs b/crates/health/src/sharding.rs index a64e5b65a7..e8fba8ae32 100644 --- a/crates/health/src/sharding.rs +++ b/crates/health/src/sharding.rs @@ -61,10 +61,11 @@ mod tests { use mac_address::MacAddress; use super::*; + use crate::endpoint::test_support::endpoint_with_creds; use crate::endpoint::{BmcAddr, BmcCredentials}; - fn endpoint(mac: &str) -> BmcEndpoint { - BmcEndpoint::with_fixed_credentials( + fn endpoint(mac: &str, rack: Option<&str>) -> BmcEndpoint { + endpoint_with_creds( BmcAddr { ip: "10.0.0.1".parse().unwrap(), port: Some(443), @@ -75,23 +76,7 @@ mod tests { password: None, }, None, - None, - ) - } - - fn endpoint_with_rack(mac: &str, rack: &str) -> BmcEndpoint { - BmcEndpoint::with_fixed_credentials( - BmcAddr { - ip: "10.0.0.1".parse().unwrap(), - port: Some(443), - mac: MacAddress::from_str(mac).unwrap(), - }, - BmcCredentials::UsernamePassword { - username: "admin".into(), - password: None, - }, - None, - Some(RackId::new(rack)), + rack.map(RackId::new), ) } @@ -101,13 +86,13 @@ mod tests { shard: 0, shards_count: 1, }; - assert!(manager.should_monitor(&endpoint("42:9e:b1:bd:9d:dd"))); + assert!(manager.should_monitor(&endpoint("42:9e:b1:bd:9d:dd", None))); } #[test] fn test_consistent_hashing() { - let ep1 = endpoint("42:9e:b1:bd:9d:dd"); - let ep2 = endpoint("42:9e:b2:bd:9d:dd"); + let ep1 = endpoint("42:9e:b1:bd:9d:dd", None); + let ep2 = endpoint("42:9e:b2:bd:9d:dd", None); let managers: Vec<_> = (0..3) .map(|shard| ShardManager { @@ -154,8 +139,8 @@ mod tests { #[test] fn test_same_rack_id_same_shard() { - let ep_a = endpoint_with_rack("42:9e:b1:bd:9d:dd", "rack-7"); - let ep_b = endpoint_with_rack("42:9e:b2:bd:9d:dd", "rack-7"); + let ep_a = endpoint("42:9e:b1:bd:9d:dd", Some("rack-7")); + let ep_b = endpoint("42:9e:b2:bd:9d:dd", Some("rack-7")); let managers: Vec<_> = (0..3) .map(|shard| ShardManager { diff --git a/crates/redfish/Cargo.toml b/crates/redfish/Cargo.toml index 303c637cff..0773769cd0 100644 --- a/crates/redfish/Cargo.toml +++ b/crates/redfish/Cargo.toml @@ -31,8 +31,9 @@ chrono = { workspace = true, optional = true } http = { workspace = true } libredfish = { workspace = true } mac_address = { workspace = true } -nv-redfish = { workspace = true, features = ["bmc-http", "oem-hpe"] } +nv-redfish = { workspace = true, features = ["bmc-http", "oem-hpe", "session-service"] } reqwest = { workspace = true, default-features = false } +serde = { workspace = true, features = ["derive"] } serde_json = { workspace = true, optional = true } sqlx = { workspace = true, features = [ "postgres" ] } thiserror = { workspace = true } diff --git a/crates/redfish/src/nv_redfish/mod.rs b/crates/redfish/src/nv_redfish/mod.rs index eaec319157..bec454c68e 100644 --- a/crates/redfish/src/nv_redfish/mod.rs +++ b/crates/redfish/src/nv_redfish/mod.rs @@ -48,7 +48,7 @@ pub struct NvRedfishClientPool { struct PoolKey { proxy_address: Arc>, bmc_address: SocketAddr, - credentials: Credentials, + credentials: BmcCredentials, } impl NvRedfishClientPool { @@ -64,10 +64,13 @@ impl NvRedfishClientPool { bmc_address: SocketAddr, credentials: Credentials, ) -> Result, Error> { - if let Some(sevice_root) = self.cached_root(bmc_address, credentials.clone()) { + let Credentials::UsernamePassword { username, password } = credentials; + let bmc_credentials = BmcCredentials::new(username, password); + + if let Some(sevice_root) = self.cached_root(bmc_address, bmc_credentials.clone()) { Ok(sevice_root) } else { - let bmc = self.create_bmc(bmc_address, credentials.clone(), false)?; + let bmc = self.create_bmc(bmc_address, bmc_credentials.clone(), false)?; let service_root = ServiceRoot::new(bmc).await?; let service_root = if service_root.vendor() == Some(nv_redfish::service_root::Vendor::new("HPE")) @@ -86,13 +89,13 @@ impl NvRedfishClientPool { // when reqwest thinks that connection is alive but it // is about to close by server. Reusing such // connections causes errors. - let bmc = self.create_bmc(bmc_address, credentials.clone(), true)?; + let bmc = self.create_bmc(bmc_address, bmc_credentials.clone(), true)?; service_root.replace_bmc(bmc.clone()) } else { service_root }; let service_root = Arc::new(service_root); - self.update_cache(bmc_address, credentials, service_root.clone()); + self.update_cache(bmc_address, bmc_credentials, service_root.clone()); Ok(service_root) } } @@ -100,7 +103,7 @@ impl NvRedfishClientPool { fn cached_root( &self, bmc_address: SocketAddr, - credentials: Credentials, + credentials: BmcCredentials, ) -> Option> { let proxy_address = self.proxy_address.load(); let key = PoolKey { @@ -118,7 +121,7 @@ impl NvRedfishClientPool { fn update_cache( &self, bmc_address: SocketAddr, - credentials: Credentials, + credentials: BmcCredentials, root: Arc, ) { let proxy_address = self.proxy_address.load(); @@ -134,10 +137,10 @@ impl NvRedfishClientPool { cache.insert(key, root); } - fn create_bmc( + pub fn create_bmc( &self, bmc_address: SocketAddr, - Credentials::UsernamePassword { username, password }: Credentials, + credentials: BmcCredentials, connection_close: bool, ) -> Result, Error> { let proxy_address = self.proxy_address.load(); @@ -174,7 +177,7 @@ impl NvRedfishClientPool { Ok(Arc::new(RedfishBmc::with_custom_headers( client, bmc_url, - BmcCredentials::new(username, password), + credentials, CacheSettings::with_capacity(10), headers, )))